| @@ -21,14 +21,14 @@ const ( | |||||
| poolName = "train-private-1" | poolName = "train-private-1" | ||||
| poolType = "USER_DEFINED" | poolType = "USER_DEFINED" | ||||
| DataSetMountPath = "/home/ma-user/work" | |||||
| NotebookEnv = "Python3" | |||||
| NotebookType = "Ascend" | |||||
| FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)" | |||||
| DataSetMountPath = "/home/ma-user/work" | |||||
| NotebookEnv = "Python3" | |||||
| NotebookType = "Ascend" | |||||
| FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)" | |||||
| //train-job | //train-job | ||||
| ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}" | |||||
| Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}" | |||||
| ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}" | |||||
| Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}" | |||||
| EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," + | EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," + | ||||
| "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," + | "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," + | ||||
| "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," + | "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," + | ||||
| @@ -39,35 +39,35 @@ const ( | |||||
| "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," + | "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," + | ||||
| "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" + | "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" + | ||||
| "]}" | "]}" | ||||
| CodePath = "/code/" | |||||
| CodePath = "/code/" | |||||
| OutputPath = "/output/" | OutputPath = "/output/" | ||||
| LogPath = "/log/" | |||||
| JobPath = "/job/" | |||||
| OrderDesc = "desc" //向下查询 | |||||
| OrderAsc = "asc" //向上查询 | |||||
| Lines = 20 | |||||
| TrainUrl = "train_url" | |||||
| DataUrl = "data_url" | |||||
| PerPage = 10 | |||||
| LogPath = "/log/" | |||||
| JobPath = "/job/" | |||||
| OrderDesc = "desc" //向下查询 | |||||
| OrderAsc = "asc" //向上查询 | |||||
| Lines = 20 | |||||
| TrainUrl = "train_url" | |||||
| DataUrl = "data_url" | |||||
| PerPage = 10 | |||||
| SortByCreateTime = "create_time" | SortByCreateTime = "create_time" | ||||
| ConfigTypeCustom = "custom" | ConfigTypeCustom = "custom" | ||||
| ) | ) | ||||
| type GenerateTrainJobReq struct { | type GenerateTrainJobReq struct { | ||||
| JobName string | |||||
| Uuid string | |||||
| Description string | |||||
| CodeObsPath string | |||||
| BootFile string | |||||
| DataUrl string | |||||
| TrainUrl string | |||||
| FlavorCode string | |||||
| LogUrl string | |||||
| PoolID string | |||||
| WorkServerNumber int | |||||
| EngineID int64 | |||||
| Parameters []models.Parameter | |||||
| JobName string | |||||
| Uuid string | |||||
| Description string | |||||
| CodeObsPath string | |||||
| BootFile string | |||||
| DataUrl string | |||||
| TrainUrl string | |||||
| FlavorCode string | |||||
| LogUrl string | |||||
| PoolID string | |||||
| WorkServerNumber int | |||||
| EngineID int64 | |||||
| Parameters []models.Parameter | |||||
| } | } | ||||
| type VersionInfo struct { | type VersionInfo struct { | ||||
| @@ -79,8 +79,8 @@ type VersionInfo struct { | |||||
| type Flavor struct { | type Flavor struct { | ||||
| Info []struct { | Info []struct { | ||||
| Code string `json:"code"` | |||||
| Value string `json:"value"` | |||||
| Code string `json:"code"` | |||||
| Value string `json:"value"` | |||||
| } `json:"flavor"` | } `json:"flavor"` | ||||
| } | } | ||||
| @@ -93,8 +93,8 @@ type Engine struct { | |||||
| type ResourcePool struct { | type ResourcePool struct { | ||||
| Info []struct { | Info []struct { | ||||
| ID string `json:"id"` | |||||
| Value string `json:"value"` | |||||
| ID string `json:"id"` | |||||
| Value string `json:"value"` | |||||
| } `json:"resource_pool"` | } `json:"resource_pool"` | ||||
| } | } | ||||
| @@ -137,7 +137,7 @@ func GenerateTask(ctx *context.Context, jobName, uuid, description string) error | |||||
| JobName: jobName, | JobName: jobName, | ||||
| JobType: string(models.JobTypeDebug), | JobType: string(models.JobTypeDebug), | ||||
| Type: models.TypeCloudBrainNotebook, | Type: models.TypeCloudBrainNotebook, | ||||
| Uuid: uuid, | |||||
| Uuid: uuid, | |||||
| }) | }) | ||||
| if err != nil { | if err != nil { | ||||
| @@ -149,24 +149,23 @@ func GenerateTask(ctx *context.Context, jobName, uuid, description string) error | |||||
| func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error { | func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error { | ||||
| jobResult, err := createTrainJob(models.CreateTrainJobParams{ | jobResult, err := createTrainJob(models.CreateTrainJobParams{ | ||||
| JobName: req.JobName, | |||||
| Description: req.Description, | |||||
| JobName: req.JobName, | |||||
| Description: req.Description, | |||||
| Config: models.Config{ | Config: models.Config{ | ||||
| WorkServerNum: req.WorkServerNumber, | |||||
| AppUrl: req.CodeObsPath, | |||||
| BootFileUrl: req.BootFile, | |||||
| DataUrl: req.DataUrl, | |||||
| EngineID: req.EngineID, | |||||
| TrainUrl: req.TrainUrl, | |||||
| LogUrl: req.LogUrl, | |||||
| PoolID: req.PoolID, | |||||
| CreateVersion: true, | |||||
| Flavor: models.Flavor{ | |||||
| Code: req.FlavorCode, | |||||
| WorkServerNum: req.WorkServerNumber, | |||||
| AppUrl: req.CodeObsPath, | |||||
| BootFileUrl: req.BootFile, | |||||
| DataUrl: req.DataUrl, | |||||
| EngineID: req.EngineID, | |||||
| TrainUrl: req.TrainUrl, | |||||
| LogUrl: req.LogUrl, | |||||
| PoolID: req.PoolID, | |||||
| CreateVersion: true, | |||||
| Flavor: models.Flavor{ | |||||
| Code: req.FlavorCode, | |||||
| }, | }, | ||||
| Parameter: req.Parameters, | |||||
| Parameter: req.Parameters, | |||||
| }, | }, | ||||
| }) | }) | ||||
| if err != nil { | if err != nil { | ||||
| log.Error("CreateJob failed: %v", err.Error()) | log.Error("CreateJob failed: %v", err.Error()) | ||||
| @@ -180,10 +179,10 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error { | |||||
| JobID: strconv.FormatInt(jobResult.JobID, 10), | JobID: strconv.FormatInt(jobResult.JobID, 10), | ||||
| JobName: req.JobName, | JobName: req.JobName, | ||||
| JobType: string(models.JobTypeDebug), | JobType: string(models.JobTypeDebug), | ||||
| Type: models.TypeCloudBrainTrainJob, | |||||
| VersionID: jobResult.VersionID, | |||||
| Type: models.TypeCloudBrainTrainJob, | |||||
| VersionID: jobResult.VersionID, | |||||
| VersionName: jobResult.VersionName, | VersionName: jobResult.VersionName, | ||||
| Uuid: req.Uuid, | |||||
| Uuid: req.Uuid, | |||||
| }) | }) | ||||
| if err != nil { | if err != nil { | ||||
| @@ -194,7 +193,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error { | |||||
| return nil | return nil | ||||
| } | } | ||||
| func TransTrainJobStatus(status int) string{ | |||||
| func TransTrainJobStatus(status int) string { | |||||
| switch status { | switch status { | ||||
| case 0: | case 0: | ||||
| return "UNKNOWN" | return "UNKNOWN" | ||||
| @@ -23,9 +23,9 @@ const ( | |||||
| urlGetToken = "/v3/auth/tokens" | urlGetToken = "/v3/auth/tokens" | ||||
| urlNotebook = "/demanager/instances" | urlNotebook = "/demanager/instances" | ||||
| urlTrainJob = "/training-jobs" | |||||
| urlResourceSpecs = "/job/resource-specs" | |||||
| urlTrainJobConfig = "/training-job-configs" | |||||
| urlTrainJob = "/training-jobs" | |||||
| urlResourceSpecs = "/job/resource-specs" | |||||
| urlTrainJobConfig = "/training-job-configs" | |||||
| errorCodeExceedLimit = "ModelArts.0118" | errorCodeExceedLimit = "ModelArts.0118" | ||||
| ) | ) | ||||
| @@ -435,12 +435,12 @@ func GetConfigList(perPage, page int, sortBy, order, searchContent, configType s | |||||
| sendjob: | sendjob: | ||||
| res, err := client.R(). | res, err := client.R(). | ||||
| SetQueryParams(map[string]string{ | SetQueryParams(map[string]string{ | ||||
| "per_page": strconv.Itoa(perPage), | |||||
| "page": strconv.Itoa(page), | |||||
| "sortBy": sortBy, | |||||
| "order": order, | |||||
| "search_content": searchContent, | |||||
| "config_type": configType, | |||||
| "per_page": strconv.Itoa(perPage), | |||||
| "page": strconv.Itoa(page), | |||||
| "sortBy": sortBy, | |||||
| "order": order, | |||||
| "search_content": searchContent, | |||||
| "config_type": configType, | |||||
| }). | }). | ||||
| SetAuthToken(TOKEN). | SetAuthToken(TOKEN). | ||||
| SetResult(&result). | SetResult(&result). | ||||
| @@ -484,7 +484,7 @@ func GetParaConfig(configName, configType string) (models.GetConfigResult, error | |||||
| sendjob: | sendjob: | ||||
| res, err := client.R(). | res, err := client.R(). | ||||
| SetQueryParams(map[string]string{ | SetQueryParams(map[string]string{ | ||||
| "config_type": configType, | |||||
| "config_type": configType, | |||||
| }). | }). | ||||
| SetAuthToken(TOKEN). | SetAuthToken(TOKEN). | ||||
| SetResult(&result). | SetResult(&result). | ||||
| @@ -569,10 +569,10 @@ func GetTrainJobLog(jobID, versionID, baseLine, logFile, order string, lines int | |||||
| sendjob: | sendjob: | ||||
| res, err := client.R(). | res, err := client.R(). | ||||
| SetQueryParams(map[string]string{ | SetQueryParams(map[string]string{ | ||||
| "base_line": baseLine, | |||||
| "lines": strconv.Itoa(lines), | |||||
| "log_file": logFile, | |||||
| "order": order, | |||||
| "base_line": baseLine, | |||||
| "lines": strconv.Itoa(lines), | |||||
| "log_file": logFile, | |||||
| "order": order, | |||||
| }). | }). | ||||
| SetAuthToken(TOKEN). | SetAuthToken(TOKEN). | ||||
| SetResult(&result). | SetResult(&result). | ||||
| @@ -278,7 +278,7 @@ func TrainJobIndex(ctx *context.Context) { | |||||
| PageSize: setting.UI.IssuePagingNum, | PageSize: setting.UI.IssuePagingNum, | ||||
| }, | }, | ||||
| RepoID: repo.ID, | RepoID: repo.ID, | ||||
| Type: models.TypeCloudBrainTrainJob, | |||||
| Type: models.TypeCloudBrainTrainJob, | |||||
| }) | }) | ||||
| if err != nil { | if err != nil { | ||||
| ctx.ServerError("Cloudbrain", err) | ctx.ServerError("Cloudbrain", err) | ||||
| @@ -430,11 +430,11 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) | |||||
| var parameters models.Parameters | var parameters models.Parameters | ||||
| param := make([]models.Parameter, 0) | param := make([]models.Parameter, 0) | ||||
| param = append(param, models.Parameter{ | param = append(param, models.Parameter{ | ||||
| Label: modelarts.TrainUrl, | |||||
| Value: outputObsPath, | |||||
| Label: modelarts.TrainUrl, | |||||
| Value: outputObsPath, | |||||
| }, models.Parameter{ | }, models.Parameter{ | ||||
| Label: modelarts.DataUrl, | |||||
| Value: dataPath, | |||||
| Label: modelarts.DataUrl, | |||||
| Value: dataPath, | |||||
| }) | }) | ||||
| if len(params) != 0 { | if len(params) != 0 { | ||||
| err := json.Unmarshal([]byte(params), &parameters) | err := json.Unmarshal([]byte(params), &parameters) | ||||
| @@ -447,8 +447,8 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) | |||||
| for _, parameter := range parameters.Parameter { | for _, parameter := range parameters.Parameter { | ||||
| if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl { | if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl { | ||||
| param = append(param, models.Parameter{ | param = append(param, models.Parameter{ | ||||
| Label: parameter.Label, | |||||
| Value: parameter.Value, | |||||
| Label: parameter.Label, | |||||
| Value: parameter.Value, | |||||
| }) | }) | ||||
| } | } | ||||
| } | } | ||||
| @@ -463,43 +463,43 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) | |||||
| } | } | ||||
| _, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{ | _, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{ | ||||
| ConfigName: form.ParameterTemplateName, | |||||
| Description: form.PrameterDescription, | |||||
| DataUrl: dataPath, | |||||
| AppUrl: codeObsPath, | |||||
| BootFileUrl: codeObsPath + bootFile, | |||||
| TrainUrl: outputObsPath, | |||||
| Flavor: models.Flavor{ | |||||
| Code: flavorCode, | |||||
| ConfigName: form.ParameterTemplateName, | |||||
| Description: form.PrameterDescription, | |||||
| DataUrl: dataPath, | |||||
| AppUrl: codeObsPath, | |||||
| BootFileUrl: codeObsPath + bootFile, | |||||
| TrainUrl: outputObsPath, | |||||
| Flavor: models.Flavor{ | |||||
| Code: flavorCode, | |||||
| }, | }, | ||||
| WorkServerNum: workServerNumber, | |||||
| EngineID: int64(engineID), | |||||
| LogUrl: logObsPath, | |||||
| PoolID: poolID, | |||||
| Parameter: param, | |||||
| WorkServerNum: workServerNumber, | |||||
| EngineID: int64(engineID), | |||||
| LogUrl: logObsPath, | |||||
| PoolID: poolID, | |||||
| Parameter: param, | |||||
| }) | }) | ||||
| if err != nil { | if err != nil { | ||||
| log.Error("Failed to CreateTrainJobConfig: %v", err) | log.Error("Failed to CreateTrainJobConfig: %v", err) | ||||
| ctx.RenderWithErr("保存作业参数失败:" + err.Error(), tplModelArtsTrainJobNew, &form) | |||||
| ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobNew, &form) | |||||
| return | return | ||||
| } | } | ||||
| } | } | ||||
| req := &modelarts.GenerateTrainJobReq{ | req := &modelarts.GenerateTrainJobReq{ | ||||
| JobName: jobName, | |||||
| DataUrl: dataPath, | |||||
| Description: description, | |||||
| CodeObsPath: codeObsPath, | |||||
| BootFile: codeObsPath + bootFile, | |||||
| TrainUrl: outputObsPath, | |||||
| FlavorCode: flavorCode, | |||||
| WorkServerNumber: workServerNumber, | |||||
| EngineID: int64(engineID), | |||||
| LogUrl: logObsPath, | |||||
| PoolID: poolID, | |||||
| Uuid: uuid, | |||||
| Parameters: param, | |||||
| JobName: jobName, | |||||
| DataUrl: dataPath, | |||||
| Description: description, | |||||
| CodeObsPath: codeObsPath, | |||||
| BootFile: codeObsPath + bootFile, | |||||
| TrainUrl: outputObsPath, | |||||
| FlavorCode: flavorCode, | |||||
| WorkServerNumber: workServerNumber, | |||||
| EngineID: int64(engineID), | |||||
| LogUrl: logObsPath, | |||||
| PoolID: poolID, | |||||
| Uuid: uuid, | |||||
| Parameters: param, | |||||
| } | } | ||||
| err = modelarts.GenerateTrainJob(ctx, req) | err = modelarts.GenerateTrainJob(ctx, req) | ||||
| @@ -552,7 +552,7 @@ func uploadCodeToObs(codePath, jobName, parentDir string) error { | |||||
| return err | return err | ||||
| } | } | ||||
| if err = uploadCodeToObs(codePath + file.Name() + "/", jobName, parentDir + file.Name() + "/"); err != nil { | |||||
| if err = uploadCodeToObs(codePath+file.Name()+"/", jobName, parentDir+file.Name()+"/"); err != nil { | |||||
| log.Error("uploadCodeToObs(%s) failed: %s", file.Name(), err.Error()) | log.Error("uploadCodeToObs(%s) failed: %s", file.Name(), err.Error()) | ||||
| return err | return err | ||||
| } | } | ||||
| @@ -591,7 +591,7 @@ func paramCheckCreateTrainJob(form auth.CreateModelArtsTrainJobForm) error { | |||||
| return errors.New("启动文件必须是python文件") | return errors.New("启动文件必须是python文件") | ||||
| } | } | ||||
| if form.WorkServerNumber > 25 || form.WorkServerNumber < 1{ | |||||
| if form.WorkServerNumber > 25 || form.WorkServerNumber < 1 { | |||||
| log.Error("the WorkServerNumber(%d) must be in (1,25)", form.WorkServerNumber) | log.Error("the WorkServerNumber(%d) must be in (1,25)", form.WorkServerNumber) | ||||
| return errors.New("计算节点数必须在1-25之间") | return errors.New("计算节点数必须在1-25之间") | ||||
| } | } | ||||
| @@ -677,7 +677,7 @@ func TrainJobGetLog(ctx *context.Context) { | |||||
| //ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow) | //ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow) | ||||
| } | } | ||||
| func trainJobGetLog(jobID string) (*models.GetTrainJobLogFileNamesResult, *models.GetTrainJobLogResult, error){ | |||||
| func trainJobGetLog(jobID string) (*models.GetTrainJobLogFileNamesResult, *models.GetTrainJobLogResult, error) { | |||||
| task, err := models.GetCloudbrainByJobID(jobID) | task, err := models.GetCloudbrainByJobID(jobID) | ||||
| if err != nil { | if err != nil { | ||||
| log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error()) | log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error()) | ||||