| @@ -494,6 +494,8 @@ type Config struct { | |||
| UserCommand string `json:"user_command"` | |||
| CreateVersion bool `json:"create_version"` | |||
| Volumes []Volumes `json:"volumes"` | |||
| Flavor Flavor `json:"flavor"` | |||
| PoolID string `json:"pool_id"` | |||
| } | |||
| type Parameter struct { | |||
| @@ -509,7 +511,7 @@ type DataSource struct { | |||
| } | |||
| type Volumes struct { | |||
| Nfs Nfs `json:"nfs"` | |||
| Nfs Nfs `json:"nfs"` | |||
| HostPath HostPath `json:"host_path"` | |||
| } | |||
| @@ -526,6 +528,10 @@ type HostPath struct { | |||
| ReadOnly bool `json:"read_only"` | |||
| } | |||
// Flavor carries a single flavor code. Config embeds it under the "flavor"
// JSON key.
type Flavor struct {
	// Code is serialized as "code".
	Code string `json:"code"`
}
| type CreateTrainJobResult struct { | |||
| ErrorCode string `json:"error_code"` | |||
| ErrorMsg string `json:"error_msg"` | |||
| @@ -539,6 +545,28 @@ type CreateTrainJobResult struct { | |||
| VersionName string `json:"version_name"` | |||
| } | |||
| type GetResourceSpecsResult struct { | |||
| ErrorCode string `json:"error_code"` | |||
| ErrorMsg string `json:"error_msg"` | |||
| IsSuccess bool `json:"is_success"` | |||
| SpecTotalCount int `json:"spec_total_count"` | |||
| Specs []Specs `json:"specs"` | |||
| } | |||
// Specs is one element of GetResourceSpecsResult.Specs.
//
// NOTE(review): the first eight field names do not describe the JSON keys
// they are bound to. They look copied from CreateTrainJobResult; e.g.
// ErrorCode actually receives "core" and JobID receives "spec_id". The names
// are left untouched here because renaming exported fields would break any
// caller; the comment on each field states the key it really holds. Consider
// a coordinated rename (Core, Cpu, NoResource, GpuType, SpecID, GpuNum,
// SpecCode, Storage).
type Specs struct {
	ErrorCode   string `json:"core"`        // holds "core"
	ErrorMsg    string `json:"cpu"`         // holds "cpu"
	IsSuccess   bool   `json:"no_resource"` // holds "no_resource"
	JobName     string `json:"gpu_type"`    // holds "gpu_type"
	JobID       int64  `json:"spec_id"`     // holds "spec_id"
	Status      int    `json:"gpu_num"`     // holds "gpu_num"
	ResourceID  string `json:"spec_code"`   // holds "spec_code"
	VersionName string `json:"storage"`     // holds "storage"

	MaxNum        int `json:"max_num"`
	UnitNum       int `json:"unit_num"`
	InterfaceType int `json:"interface_type"`
}
| func Cloudbrains(opts *CloudbrainsOptions) ([]*Cloudbrain, int64, error) { | |||
| sess := x.NewSession() | |||
| defer sess.Close() | |||
| @@ -21,7 +21,9 @@ type CreateModelArtsTrainJobForm struct { | |||
| BootFile string `form:"boot_file" binding:"Required"` | |||
| WorkServerNumber int `form:"work_server_number" binding:"Required"` | |||
| EngineID int `form:"engine_id" binding:"Required"` | |||
| SpecID int `form:"spec_id" binding:"Required"` | |||
| Flavor string `form:"flavor" binding:"Required"` | |||
| PoolID string `form:"pool_id" binding:"Required"` | |||
| Description string `form:"description"` | |||
| } | |||
| @@ -23,17 +23,17 @@ const ( | |||
| FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)" | |||
| //train-job | |||
| ResourcePools = "{\"resource_pool\":[{\"id\":1, \"value\":\"专属资源池\"}]}" | |||
| ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}" | |||
| Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}" | |||
| EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," + | |||
| "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," + | |||
| "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," + | |||
| "{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" + | |||
| "]}" | |||
| FlavorInfos = "{\"flavor\":[{\"id\":1,\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," + | |||
| "{\"id\":2,\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," + | |||
| "{\"id\":3,\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," + | |||
| "{\"id\":4,\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" + | |||
| FlavorInfos = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," + | |||
| "{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," + | |||
| "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," + | |||
| "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" + | |||
| "]}" | |||
| CodePath = "/code/" | |||
| OutputPath = "/output/" | |||
| @@ -48,6 +48,9 @@ type GenerateTrainJobReq struct { | |||
| BootFile string | |||
| DataUrl string | |||
| TrainUrl string | |||
| FlavorCode string | |||
| PoolID string | |||
| SpecID int64 | |||
| WorkServerNumber int | |||
| EngineID int64 | |||
| } | |||
| @@ -61,8 +64,8 @@ type VersionInfo struct { | |||
// Flavor mirrors the shape of the FlavorInfos JSON document: a "flavor" array
// whose entries pair a flavor code with its display text.
//
// Entries are keyed by the string "code" (e.g. "modelarts.bm.910.arm.public.1")
// rather than a numeric id, and each field is declared exactly once; a
// repeated Value field does not compile.
type Flavor struct {
	Info []struct {
		Code  string `json:"code"`
		Value string `json:"value"`
	} `json:"flavor"`
}
| @@ -75,8 +78,8 @@ type Engine struct { | |||
// ResourcePool mirrors the shape of the ResourcePools JSON document: a
// "resource_pool" array whose entries pair a pool id with its display text.
//
// ID is a string because pool ids are opaque identifiers such as
// "pool1328035d"; decoding such a value into an int field fails. Each field is
// declared exactly once, since a repeated ID or Value field does not compile.
type ResourcePool struct {
	Info []struct {
		ID    string `json:"id"`
		Value string `json:"value"`
	} `json:"resource_pool"`
}
| @@ -130,10 +133,15 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error { | |||
| Config: models.Config{ | |||
| WorkServerNum: req.WorkServerNumber, | |||
| AppUrl: req.CodeObsPath, | |||
| BootFileUrl: req.CodeObsPath + req.BootFile, | |||
| BootFileUrl: req.BootFile, | |||
| DataUrl: req.DataUrl, | |||
| EngineID: req.EngineID, | |||
| TrainUrl: req.TrainUrl, | |||
| PoolID: req.PoolID, | |||
| SpecID: req.SpecID, | |||
| Flavor: models.Flavor{ | |||
| Code: req.FlavorCode, | |||
| }, | |||
| }, | |||
| }) | |||
| @@ -24,6 +24,8 @@ const ( | |||
| urlGetToken = "/v3/auth/tokens" | |||
| urlNotebook = "/demanager/instances" | |||
| urlTrainJob = "/training-jobs" | |||
| urlResourceSpecs = "/job/resource-specs" | |||
| errorCodeExceedLimit = "ModelArts.0118" | |||
| ) | |||
| func getRestyClient() *resty.Client { | |||
| @@ -293,6 +295,8 @@ func createTrainJob(createJobParams models.CreateTrainJobParams) (*models.Create | |||
| client := getRestyClient() | |||
| var result models.CreateTrainJobResult | |||
| log.Info("%+v",createJobParams) | |||
| retry := 0 | |||
| sendjob: | |||
| @@ -307,6 +311,8 @@ sendjob: | |||
| return nil, fmt.Errorf("resty create train-job: %s", err) | |||
| } | |||
| log.Info("", res.StatusCode(), res.Request.Body) | |||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||
| retry++ | |||
| _ = getToken() | |||
| @@ -320,3 +326,40 @@ sendjob: | |||
| return &result, nil | |||
| } | |||
| func GetResourceSpecs() (*models.GetResourceSpecsResult, error) { | |||
| checkSetting() | |||
| client := getRestyClient() | |||
| var result models.GetResourceSpecsResult | |||
| retry := 0 | |||
| sendjob: | |||
| res, err := client.R(). | |||
| SetHeader("Content-Type", "application/json"). | |||
| SetAuthToken(TOKEN). | |||
| SetResult(&result). | |||
| Get(HOST + "/v1/" + setting.ProjectID + urlResourceSpecs) | |||
| if err != nil { | |||
| return nil, fmt.Errorf("resty GetJob: %v", err) | |||
| } | |||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||
| retry++ | |||
| _ = getToken() | |||
| goto sendjob | |||
| } | |||
| if res.StatusCode() != http.StatusOK { | |||
| log.Error("GetResourceSpecs failed(%d)", res.StatusCode()) | |||
| return &result, fmt.Errorf("GetResourceSpecs failed(%d)", res.StatusCode()) | |||
| } | |||
| if !result.IsSuccess { | |||
| log.Error("GetResourceSpecs failed(%s): %s", result.ErrorCode, result.ErrorMsg) | |||
| return &result, fmt.Errorf("GetResourceSpecs failed(%s): %s", result.ErrorCode, result.ErrorMsg) | |||
| } | |||
| return &result, nil | |||
| } | |||
| @@ -336,6 +336,9 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) | |||
| workServerNumber := form.WorkServerNumber | |||
| engineID := form.EngineID | |||
| bootFile := form.BootFile | |||
| flavorCode := form.Flavor | |||
| poolID := form.PoolID | |||
| specID := form.SpecID | |||
| repo := ctx.Repo.Repository | |||
| codeLocalPath := setting.JobPath + jobName + modelarts.CodePath | |||
| codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath | |||
| @@ -349,6 +352,12 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) | |||
| } | |||
| //todo: upload code (send to file_server to do this work?) | |||
| if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil { | |||
| log.Error("Failed to obsMkdir: %s (%v)", repo.FullName(), err) | |||
| ctx.RenderWithErr("Failed to obsMkdir", tplModelArtsTrainJobNew, &form) | |||
| return | |||
| } | |||
| if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil { | |||
| log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err) | |||
| ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobNew, &form) | |||
| @@ -360,14 +369,18 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) | |||
| DataUrl: dataPath, | |||
| Description: description, | |||
| CodeObsPath: codeObsPath, | |||
| BootFile: codeObsPath + "/" + bootFile, | |||
| BootFile: codeObsPath + bootFile, | |||
| TrainUrl: outputObsPath, | |||
| FlavorCode: flavorCode, | |||
| PoolID: poolID, | |||
| WorkServerNumber: workServerNumber, | |||
| EngineID: int64(engineID), | |||
| SpecID: int64(specID), | |||
| } | |||
| err := modelarts.GenerateTrainJob(ctx, req) | |||
| if err != nil { | |||
| log.Error("GenerateTrainJob failed:%v", err.Error()) | |||
| ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) | |||
| return | |||
| } | |||
| @@ -408,7 +421,7 @@ func uploadCodeToObs(codePath, jobName, parentDir string) error { | |||
| if file.IsDir() { | |||
| input := &obs.PutObjectInput{} | |||
| input.Bucket = setting.Bucket | |||
| input.Key = codePath + file.Name() + "/" | |||
| input.Key = parentDir + file.Name() + "/" | |||
| _, err = storage.ObsCli.PutObject(input) | |||
| if err != nil { | |||
| log.Error("PutObject(%s) failed: %s", input.Key, err.Error()) | |||
| @@ -422,7 +435,7 @@ func uploadCodeToObs(codePath, jobName, parentDir string) error { | |||
| } else { | |||
| input := &obs.PutFileInput{} | |||
| input.Bucket = setting.Bucket | |||
| input.Key = setting.CodePathPrefix + jobName + "/" + parentDir + file.Name() | |||
| input.Key = setting.CodePathPrefix + jobName + "/code/" + parentDir + file.Name() | |||
| input.SourceFile = codePath + file.Name() | |||
| _, err = storage.ObsCli.PutFile(input) | |||
| if err != nil { | |||
| @@ -434,3 +447,16 @@ func uploadCodeToObs(codePath, jobName, parentDir string) error { | |||
| return nil | |||
| } | |||
| func obsMkdir(dir string) error { | |||
| input := &obs.PutObjectInput{} | |||
| input.Bucket = setting.Bucket | |||
| input.Key = dir | |||
| _, err := storage.ObsCli.PutObject(input) | |||
| if err != nil { | |||
| log.Error("PutObject(%s) failed: %s", input.Key, err.Error()) | |||
| return err | |||
| } | |||
| return nil | |||
| } | |||
| @@ -128,7 +128,7 @@ | |||
| </div> | |||
| <div class="inline required field"> | |||
| <label>{{.i18n.Tr "repo.modelarts.train_job.dataset"}}</label> | |||
| <select class="ui search dropdown" id="trainjob_datasets" style='width:385px'> | |||
| <select class="ui search dropdown" id="trainjob_datasets" style='width:385px' name="attachment"> | |||
| {{range .attachments}} | |||
| <option name="attachment" value="{{.UUID}}">{{.Attachment.Name}}</option> | |||
| {{end}} | |||
| @@ -151,7 +151,7 @@ | |||
| </select> | |||
| </div> | |||
| <div class="field"> | |||
| <select class="ui search dropdown" id="trainjob_engine_versions" style='width:385px'> | |||
| <select class="ui search dropdown" id="trainjob_engine_versions" style='width:385px' name="engine_id"> | |||
| {{range .engine_versions}} | |||
| <option name="engine_id" value="{{.ID}}">{{.Value}}</option> | |||
| {{end}} | |||
| @@ -161,13 +161,13 @@ | |||
| </div> | |||
| <div class="inline required field"> | |||
| <label>{{.i18n.Tr "repo.modelarts.train_job.start_file"}}</label> | |||
| <input name="boot_file" id="trainjob_boot_file" value="{{.dataset_path}}" tabindex="3" autofocus required maxlength="255" readonly="readonly"> | |||
| <input name="boot_file" id="trainjob_boot_file" value="{{.dataset_path}}" tabindex="3" autofocus required maxlength="255"> | |||
| </div> | |||
| </div> | |||
| </div> | |||
| <div class="required field"> | |||
| <label>{{.i18n.Tr "repo.modelarts.train_job.dataset"}}</label> | |||
| <select class="ui search dropdown" id="trainjob_datasets" style='width:385px'> | |||
| <select class="ui search dropdown" id="trainjob_datasets" style='width:385px' name="attachment"> | |||
| {{range .attachments}} | |||
| <option name="attachment" value="{{.UUID}}">{{.Attachment.Name}}</option> | |||
| {{end}} | |||
| @@ -184,9 +184,9 @@ | |||
| <h4 class="ui dividing header">{{.i18n.Tr "repo.modelarts.train_job.resource_setting"}}</h4> | |||
| <div class="required field"> | |||
| <label>{{.i18n.Tr "repo.modelarts.train_job.resource_pool"}}</label> | |||
| <select class="ui search dropdown" id="trainjob_resource_pool" style='width:385px'> | |||
| <select class="ui search dropdown" id="trainjob_resource_pool" style='width:385px' name="pool_id"> | |||
| {{range .resource_pools}} | |||
| <option value="{{.Value}}">{{.Value}}</option> | |||
| <option value="{{.ID}}">{{.Value}}</option> | |||
| {{end}} | |||
| </select> | |||
| </div> | |||
| @@ -211,9 +211,9 @@ | |||
| <div class="required field"> | |||
| <label>{{.i18n.Tr "repo.modelarts.train_job.standard"}}</label> | |||
| <select class="ui search dropdown" id="trainjob-flavor" style='width:385px'> | |||
| <select class="ui search dropdown" id="trainjob-flavor" style='width:385px' name="flavor"> | |||
| {{range .flavor_infos}} | |||
| <option name="flavor" value="{{.Value}}">{{.Value}}</option> | |||
| <option name="flavor" value="{{.Code}}">{{.Value}}</option> | |||
| {{end}} | |||
| </select> | |||
| </div> | |||
| @@ -232,9 +232,9 @@ | |||
| </div> | |||
| </div> | |||
| <div class="disabled field" id="save_para"> | |||
| <div class="required field"> | |||
| <div class="field"> | |||
| <label>{{.i18n.Tr "repo.modelarts.train_job.job_parameter_name"}}</label> | |||
| <input name="job_type" id="cloudbrain_job_type" value="{{.notebook_type}}" tabindex="3" autofocus required maxlength="255"> | |||
| <input name="job_type" id="cloudbrain_job_type" value="{{.notebook_type}}" tabindex="3" autofocus maxlength="255"> | |||
| </div> | |||
| <div class="field"> | |||
| <label for="parameter_description">{{.i18n.Tr "repo.modelarts.train_job.parameter_description"}}</label> | |||