| @@ -494,6 +494,8 @@ type Config struct { | |||||
| UserCommand string `json:"user_command"` | UserCommand string `json:"user_command"` | ||||
| CreateVersion bool `json:"create_version"` | CreateVersion bool `json:"create_version"` | ||||
| Volumes []Volumes `json:"volumes"` | Volumes []Volumes `json:"volumes"` | ||||
| Flavor Flavor `json:"flavor"` | |||||
| PoolID string `json:"pool_id"` | |||||
| } | } | ||||
| type Parameter struct { | type Parameter struct { | ||||
| @@ -509,7 +511,7 @@ type DataSource struct { | |||||
| } | } | ||||
| type Volumes struct { | type Volumes struct { | ||||
| Nfs Nfs `json:"nfs"` | |||||
| Nfs Nfs `json:"nfs"` | |||||
| HostPath HostPath `json:"host_path"` | HostPath HostPath `json:"host_path"` | ||||
| } | } | ||||
| @@ -526,6 +528,10 @@ type HostPath struct { | |||||
| ReadOnly bool `json:"read_only"` | ReadOnly bool `json:"read_only"` | ||||
| } | } | ||||
| type Flavor struct { | |||||
| Code string `json:"code"` | |||||
| } | |||||
| type CreateTrainJobResult struct { | type CreateTrainJobResult struct { | ||||
| ErrorCode string `json:"error_code"` | ErrorCode string `json:"error_code"` | ||||
| ErrorMsg string `json:"error_msg"` | ErrorMsg string `json:"error_msg"` | ||||
| @@ -539,6 +545,28 @@ type CreateTrainJobResult struct { | |||||
| VersionName string `json:"version_name"` | VersionName string `json:"version_name"` | ||||
| } | } | ||||
| type GetResourceSpecsResult struct { | |||||
| ErrorCode string `json:"error_code"` | |||||
| ErrorMsg string `json:"error_msg"` | |||||
| IsSuccess bool `json:"is_success"` | |||||
| SpecTotalCount int `json:"spec_total_count"` | |||||
| Specs []Specs `json:"specs"` | |||||
| } | |||||
| type Specs struct { | |||||
| ErrorCode string `json:"core"` | |||||
| ErrorMsg string `json:"cpu"` | |||||
| IsSuccess bool `json:"no_resource"` | |||||
| JobName string `json:"gpu_type"` | |||||
| JobID int64 `json:"spec_id"` | |||||
| Status int `json:"gpu_num"` | |||||
| ResourceID string `json:"spec_code"` | |||||
| VersionName string `json:"storage"` | |||||
| MaxNum int `json:"max_num"` | |||||
| UnitNum int `json:"unit_num"` | |||||
| InterfaceType int `json:"interface_type"` | |||||
| } | |||||
| func Cloudbrains(opts *CloudbrainsOptions) ([]*Cloudbrain, int64, error) { | func Cloudbrains(opts *CloudbrainsOptions) ([]*Cloudbrain, int64, error) { | ||||
| sess := x.NewSession() | sess := x.NewSession() | ||||
| defer sess.Close() | defer sess.Close() | ||||
| @@ -21,7 +21,9 @@ type CreateModelArtsTrainJobForm struct { | |||||
| BootFile string `form:"boot_file" binding:"Required"` | BootFile string `form:"boot_file" binding:"Required"` | ||||
| WorkServerNumber int `form:"work_server_number" binding:"Required"` | WorkServerNumber int `form:"work_server_number" binding:"Required"` | ||||
| EngineID int `form:"engine_id" binding:"Required"` | EngineID int `form:"engine_id" binding:"Required"` | ||||
| SpecID int `form:"spec_id" binding:"Required"` | |||||
| Flavor string `form:"flavor" binding:"Required"` | Flavor string `form:"flavor" binding:"Required"` | ||||
| PoolID string `form:"pool_id" binding:"Required"` | |||||
| Description string `form:"description"` | Description string `form:"description"` | ||||
| } | } | ||||
| @@ -23,17 +23,17 @@ const ( | |||||
| FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)" | FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)" | ||||
| //train-job | //train-job | ||||
| ResourcePools = "{\"resource_pool\":[{\"id\":1, \"value\":\"专属资源池\"}]}" | |||||
| ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}" | |||||
| Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}" | Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}" | ||||
| EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," + | EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," + | ||||
| "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," + | "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," + | ||||
| "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," + | "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," + | ||||
| "{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" + | "{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" + | ||||
| "]}" | "]}" | ||||
| FlavorInfos = "{\"flavor\":[{\"id\":1,\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," + | |||||
| "{\"id\":2,\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," + | |||||
| "{\"id\":3,\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," + | |||||
| "{\"id\":4,\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" + | |||||
| FlavorInfos = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," + | |||||
| "{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," + | |||||
| "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," + | |||||
| "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" + | |||||
| "]}" | "]}" | ||||
| CodePath = "/code/" | CodePath = "/code/" | ||||
| OutputPath = "/output/" | OutputPath = "/output/" | ||||
| @@ -48,6 +48,9 @@ type GenerateTrainJobReq struct { | |||||
| BootFile string | BootFile string | ||||
| DataUrl string | DataUrl string | ||||
| TrainUrl string | TrainUrl string | ||||
| FlavorCode string | |||||
| PoolID string | |||||
| SpecID int64 | |||||
| WorkServerNumber int | WorkServerNumber int | ||||
| EngineID int64 | EngineID int64 | ||||
| } | } | ||||
| @@ -61,8 +64,8 @@ type VersionInfo struct { | |||||
| type Flavor struct { | type Flavor struct { | ||||
| Info []struct { | Info []struct { | ||||
| ID int `json:"id"` | |||||
| Value string `json:"value"` | |||||
| Code string `json:"code"` | |||||
| Value string `json:"value"` | |||||
| } `json:"flavor"` | } `json:"flavor"` | ||||
| } | } | ||||
| @@ -75,8 +78,8 @@ type Engine struct { | |||||
| type ResourcePool struct { | type ResourcePool struct { | ||||
| Info []struct { | Info []struct { | ||||
| ID int `json:"id"` | |||||
| Value string `json:"value"` | |||||
| ID string `json:"id"` | |||||
| Value string `json:"value"` | |||||
| } `json:"resource_pool"` | } `json:"resource_pool"` | ||||
| } | } | ||||
| @@ -130,10 +133,15 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error { | |||||
| Config: models.Config{ | Config: models.Config{ | ||||
| WorkServerNum: req.WorkServerNumber, | WorkServerNum: req.WorkServerNumber, | ||||
| AppUrl: req.CodeObsPath, | AppUrl: req.CodeObsPath, | ||||
| BootFileUrl: req.CodeObsPath + req.BootFile, | |||||
| BootFileUrl: req.BootFile, | |||||
| DataUrl: req.DataUrl, | DataUrl: req.DataUrl, | ||||
| EngineID: req.EngineID, | EngineID: req.EngineID, | ||||
| TrainUrl: req.TrainUrl, | TrainUrl: req.TrainUrl, | ||||
| PoolID: req.PoolID, | |||||
| SpecID: req.SpecID, | |||||
| Flavor: models.Flavor{ | |||||
| Code: req.FlavorCode, | |||||
| }, | |||||
| }, | }, | ||||
| }) | }) | ||||
| @@ -24,6 +24,8 @@ const ( | |||||
| urlGetToken = "/v3/auth/tokens" | urlGetToken = "/v3/auth/tokens" | ||||
| urlNotebook = "/demanager/instances" | urlNotebook = "/demanager/instances" | ||||
| urlTrainJob = "/training-jobs" | urlTrainJob = "/training-jobs" | ||||
| urlResourceSpecs = "/job/resource-specs" | |||||
| errorCodeExceedLimit = "ModelArts.0118" | errorCodeExceedLimit = "ModelArts.0118" | ||||
| ) | ) | ||||
| func getRestyClient() *resty.Client { | func getRestyClient() *resty.Client { | ||||
| @@ -293,6 +295,8 @@ func createTrainJob(createJobParams models.CreateTrainJobParams) (*models.Create | |||||
| client := getRestyClient() | client := getRestyClient() | ||||
| var result models.CreateTrainJobResult | var result models.CreateTrainJobResult | ||||
| log.Info("%+v",createJobParams) | |||||
| retry := 0 | retry := 0 | ||||
| sendjob: | sendjob: | ||||
| @@ -307,6 +311,8 @@ sendjob: | |||||
| return nil, fmt.Errorf("resty create train-job: %s", err) | return nil, fmt.Errorf("resty create train-job: %s", err) | ||||
| } | } | ||||
| log.Info("", res.StatusCode(), res.Request.Body) | |||||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | ||||
| retry++ | retry++ | ||||
| _ = getToken() | _ = getToken() | ||||
| @@ -320,3 +326,40 @@ sendjob: | |||||
| return &result, nil | return &result, nil | ||||
| } | } | ||||
| func GetResourceSpecs() (*models.GetResourceSpecsResult, error) { | |||||
| checkSetting() | |||||
| client := getRestyClient() | |||||
| var result models.GetResourceSpecsResult | |||||
| retry := 0 | |||||
| sendjob: | |||||
| res, err := client.R(). | |||||
| SetHeader("Content-Type", "application/json"). | |||||
| SetAuthToken(TOKEN). | |||||
| SetResult(&result). | |||||
| Get(HOST + "/v1/" + setting.ProjectID + urlResourceSpecs) | |||||
| if err != nil { | |||||
| return nil, fmt.Errorf("resty GetJob: %v", err) | |||||
| } | |||||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||||
| retry++ | |||||
| _ = getToken() | |||||
| goto sendjob | |||||
| } | |||||
| if res.StatusCode() != http.StatusOK { | |||||
| log.Error("GetResourceSpecs failed(%d)", res.StatusCode()) | |||||
| return &result, fmt.Errorf("GetResourceSpecs failed(%d)", res.StatusCode()) | |||||
| } | |||||
| if !result.IsSuccess { | |||||
| log.Error("GetResourceSpecs failed(%s): %s", result.ErrorCode, result.ErrorMsg) | |||||
| return &result, fmt.Errorf("GetResourceSpecs failed(%s): %s", result.ErrorCode, result.ErrorMsg) | |||||
| } | |||||
| return &result, nil | |||||
| } | |||||
| @@ -336,6 +336,9 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) | |||||
| workServerNumber := form.WorkServerNumber | workServerNumber := form.WorkServerNumber | ||||
| engineID := form.EngineID | engineID := form.EngineID | ||||
| bootFile := form.BootFile | bootFile := form.BootFile | ||||
| flavorCode := form.Flavor | |||||
| poolID := form.PoolID | |||||
| specID := form.SpecID | |||||
| repo := ctx.Repo.Repository | repo := ctx.Repo.Repository | ||||
| codeLocalPath := setting.JobPath + jobName + modelarts.CodePath | codeLocalPath := setting.JobPath + jobName + modelarts.CodePath | ||||
| codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath | codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath | ||||
| @@ -349,6 +352,12 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) | |||||
| } | } | ||||
| //todo: upload code (send to file_server todo this work?) | //todo: upload code (send to file_server todo this work?) | ||||
| if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil { | |||||
| log.Error("Failed to obsMkdir: %s (%v)", repo.FullName(), err) | |||||
| ctx.RenderWithErr("Failed to obsMkdir", tplModelArtsTrainJobNew, &form) | |||||
| return | |||||
| } | |||||
| if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil { | if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil { | ||||
| log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err) | log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err) | ||||
| ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobNew, &form) | ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobNew, &form) | ||||
| @@ -360,14 +369,18 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) | |||||
| DataUrl: dataPath, | DataUrl: dataPath, | ||||
| Description: description, | Description: description, | ||||
| CodeObsPath: codeObsPath, | CodeObsPath: codeObsPath, | ||||
| BootFile: codeObsPath + "/" + bootFile, | |||||
| BootFile: codeObsPath + bootFile, | |||||
| TrainUrl: outputObsPath, | TrainUrl: outputObsPath, | ||||
| FlavorCode: flavorCode, | |||||
| PoolID: poolID, | |||||
| WorkServerNumber: workServerNumber, | WorkServerNumber: workServerNumber, | ||||
| EngineID: int64(engineID), | EngineID: int64(engineID), | ||||
| SpecID: int64(specID), | |||||
| } | } | ||||
| err := modelarts.GenerateTrainJob(ctx, req) | err := modelarts.GenerateTrainJob(ctx, req) | ||||
| if err != nil { | if err != nil { | ||||
| log.Error("GenerateTrainJob failed:%v", err.Error()) | |||||
| ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) | ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) | ||||
| return | return | ||||
| } | } | ||||
| @@ -408,7 +421,7 @@ func uploadCodeToObs(codePath, jobName, parentDir string) error { | |||||
| if file.IsDir() { | if file.IsDir() { | ||||
| input := &obs.PutObjectInput{} | input := &obs.PutObjectInput{} | ||||
| input.Bucket = setting.Bucket | input.Bucket = setting.Bucket | ||||
| input.Key = codePath + file.Name() + "/" | |||||
| input.Key = parentDir + file.Name() + "/" | |||||
| _, err = storage.ObsCli.PutObject(input) | _, err = storage.ObsCli.PutObject(input) | ||||
| if err != nil { | if err != nil { | ||||
| log.Error("PutObject(%s) failed: %s", input.Key, err.Error()) | log.Error("PutObject(%s) failed: %s", input.Key, err.Error()) | ||||
| @@ -422,7 +435,7 @@ func uploadCodeToObs(codePath, jobName, parentDir string) error { | |||||
| } else { | } else { | ||||
| input := &obs.PutFileInput{} | input := &obs.PutFileInput{} | ||||
| input.Bucket = setting.Bucket | input.Bucket = setting.Bucket | ||||
| input.Key = setting.CodePathPrefix + jobName + "/" + parentDir + file.Name() | |||||
| input.Key = setting.CodePathPrefix + jobName + "/code/" + parentDir + file.Name() | |||||
| input.SourceFile = codePath + file.Name() | input.SourceFile = codePath + file.Name() | ||||
| _, err = storage.ObsCli.PutFile(input) | _, err = storage.ObsCli.PutFile(input) | ||||
| if err != nil { | if err != nil { | ||||
| @@ -434,3 +447,16 @@ func uploadCodeToObs(codePath, jobName, parentDir string) error { | |||||
| return nil | return nil | ||||
| } | } | ||||
| func obsMkdir(dir string) error { | |||||
| input := &obs.PutObjectInput{} | |||||
| input.Bucket = setting.Bucket | |||||
| input.Key = dir | |||||
| _, err := storage.ObsCli.PutObject(input) | |||||
| if err != nil { | |||||
| log.Error("PutObject(%s) failed: %s", input.Key, err.Error()) | |||||
| return err | |||||
| } | |||||
| return nil | |||||
| } | |||||
| @@ -128,7 +128,7 @@ | |||||
| </div> | </div> | ||||
| <div class="inline required field"> | <div class="inline required field"> | ||||
| <label>{{.i18n.Tr "repo.modelarts.train_job.dataset"}}</label> | <label>{{.i18n.Tr "repo.modelarts.train_job.dataset"}}</label> | ||||
| <select class="ui search dropdown" id="trainjob_datasets" style='width:385px'> | |||||
| <select class="ui search dropdown" id="trainjob_datasets" style='width:385px' name="attachment"> | |||||
| {{range .attachments}} | {{range .attachments}} | ||||
| <option name="attachment" value="{{.UUID}}">{{.Attachment.Name}}</option> | <option name="attachment" value="{{.UUID}}">{{.Attachment.Name}}</option> | ||||
| {{end}} | {{end}} | ||||
| @@ -151,7 +151,7 @@ | |||||
| </select> | </select> | ||||
| </div> | </div> | ||||
| <div class="field"> | <div class="field"> | ||||
| <select class="ui search dropdown" id="trainjob_engine_versions" style='width:385px'> | |||||
| <select class="ui search dropdown" id="trainjob_engine_versions" style='width:385px' name="engine_id"> | |||||
| {{range .engine_versions}} | {{range .engine_versions}} | ||||
| <option name="engine_id" value="{{.ID}}">{{.Value}}</option> | <option name="engine_id" value="{{.ID}}">{{.Value}}</option> | ||||
| {{end}} | {{end}} | ||||
| @@ -161,13 +161,13 @@ | |||||
| </div> | </div> | ||||
| <div class="inline required field"> | <div class="inline required field"> | ||||
| <label>{{.i18n.Tr "repo.modelarts.train_job.start_file"}}</label> | <label>{{.i18n.Tr "repo.modelarts.train_job.start_file"}}</label> | ||||
| <input name="boot_file" id="trainjob_boot_file" value="{{.dataset_path}}" tabindex="3" autofocus required maxlength="255" readonly="readonly"> | |||||
| <input name="boot_file" id="trainjob_boot_file" value="{{.dataset_path}}" tabindex="3" autofocus required maxlength="255"> | |||||
| </div> | </div> | ||||
| </div> | </div> | ||||
| </div> | </div> | ||||
| <div class="required field"> | <div class="required field"> | ||||
| <label>{{.i18n.Tr "repo.modelarts.train_job.dataset"}}</label> | <label>{{.i18n.Tr "repo.modelarts.train_job.dataset"}}</label> | ||||
| <select class="ui search dropdown" id="trainjob_datasets" style='width:385px'> | |||||
| <select class="ui search dropdown" id="trainjob_datasets" style='width:385px' name="attachment"> | |||||
| {{range .attachments}} | {{range .attachments}} | ||||
| <option name="attachment" value="{{.UUID}}">{{.Attachment.Name}}</option> | <option name="attachment" value="{{.UUID}}">{{.Attachment.Name}}</option> | ||||
| {{end}} | {{end}} | ||||
| @@ -184,9 +184,9 @@ | |||||
| <h4 class="ui dividing header">{{.i18n.Tr "repo.modelarts.train_job.resource_setting"}}</h4> | <h4 class="ui dividing header">{{.i18n.Tr "repo.modelarts.train_job.resource_setting"}}</h4> | ||||
| <div class="required field"> | <div class="required field"> | ||||
| <label>{{.i18n.Tr "repo.modelarts.train_job.resource_pool"}}</label> | <label>{{.i18n.Tr "repo.modelarts.train_job.resource_pool"}}</label> | ||||
| <select class="ui search dropdown" id="trainjob_resource_pool" style='width:385px'> | |||||
| <select class="ui search dropdown" id="trainjob_resource_pool" style='width:385px' name="pool_id"> | |||||
| {{range .resource_pools}} | {{range .resource_pools}} | ||||
| <option value="{{.Value}}">{{.Value}}</option> | |||||
| <option value="{{.ID}}">{{.Value}}</option> | |||||
| {{end}} | {{end}} | ||||
| </select> | </select> | ||||
| </div> | </div> | ||||
| @@ -211,9 +211,9 @@ | |||||
| <div class="required field"> | <div class="required field"> | ||||
| <label>{{.i18n.Tr "repo.modelarts.train_job.standard"}}</label> | <label>{{.i18n.Tr "repo.modelarts.train_job.standard"}}</label> | ||||
| <select class="ui search dropdown" id="trainjob-flavor" style='width:385px'> | |||||
| <select class="ui search dropdown" id="trainjob-flavor" style='width:385px' name="flavor"> | |||||
| {{range .flavor_infos}} | {{range .flavor_infos}} | ||||
| <option name="flavor" value="{{.Value}}">{{.Value}}</option> | |||||
| <option name="flavor" value="{{.Code}}">{{.Value}}</option> | |||||
| {{end}} | {{end}} | ||||
| </select> | </select> | ||||
| </div> | </div> | ||||
| @@ -232,9 +232,9 @@ | |||||
| </div> | </div> | ||||
| </div> | </div> | ||||
| <div class="disabled field" id="save_para"> | <div class="disabled field" id="save_para"> | ||||
| <div class="required field"> | |||||
| <div class="field"> | |||||
| <label>{{.i18n.Tr "repo.modelarts.train_job.job_parameter_name"}}</label> | <label>{{.i18n.Tr "repo.modelarts.train_job.job_parameter_name"}}</label> | ||||
| <input name="job_type" id="cloudbrain_job_type" value="{{.notebook_type}}" tabindex="3" autofocus required maxlength="255"> | |||||
| <input name="job_type" id="cloudbrain_job_type" value="{{.notebook_type}}" tabindex="3" autofocus maxlength="255"> | |||||
| </div> | </div> | ||||
| <div class="field"> | <div class="field"> | ||||
| <label for="parameter_description">{{.i18n.Tr "repo.modelarts.train_job.parameter_description"}}</label> | <label for="parameter_description">{{.i18n.Tr "repo.modelarts.train_job.parameter_description"}}</label> | ||||