| @@ -482,20 +482,20 @@ type Config struct { | |||||
| BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下 | BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下 | ||||
| Parameter []Parameter `json:"parameter"` | Parameter []Parameter `json:"parameter"` | ||||
| DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL | DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL | ||||
| DatasetID string `json:"dataset_id"` | |||||
| DataVersionID string `json:"dataset_version_id"` | |||||
| DataSource []DataSource `json:"data_source"` | |||||
| SpecID int64 `json:"spec_id"` | |||||
| //DatasetID string `json:"dataset_id"` | |||||
| //DataVersionID string `json:"dataset_version_id"` | |||||
| //DataSource []DataSource `json:"data_source"` | |||||
| //SpecID int64 `json:"spec_id"` | |||||
| EngineID int64 `json:"engine_id"` | EngineID int64 `json:"engine_id"` | ||||
| ModelID int64 `json:"model_id"` | |||||
| //ModelID int64 `json:"model_id"` | |||||
| TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL | TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL | ||||
| LogUrl string `json:"log_url"` | LogUrl string `json:"log_url"` | ||||
| UserImageUrl string `json:"user_image_url"` | |||||
| UserCommand string `json:"user_command"` | |||||
| //UserImageUrl string `json:"user_image_url"` | |||||
| //UserCommand string `json:"user_command"` | |||||
| CreateVersion bool `json:"create_version"` | CreateVersion bool `json:"create_version"` | ||||
| Volumes []Volumes `json:"volumes"` | Volumes []Volumes `json:"volumes"` | ||||
| Flavor Flavor `json:"flavor"` | Flavor Flavor `json:"flavor"` | ||||
| PoolID string `json:"pool_id"` | |||||
| PoolID string `json:"pool_id"` | |||||
| } | } | ||||
| type Parameter struct { | type Parameter struct { | ||||
| @@ -21,9 +21,8 @@ type CreateModelArtsTrainJobForm struct { | |||||
| BootFile string `form:"boot_file" binding:"Required"` | BootFile string `form:"boot_file" binding:"Required"` | ||||
| WorkServerNumber int `form:"work_server_number" binding:"Required"` | WorkServerNumber int `form:"work_server_number" binding:"Required"` | ||||
| EngineID int `form:"engine_id" binding:"Required"` | EngineID int `form:"engine_id" binding:"Required"` | ||||
| SpecID int `form:"spec_id" binding:"Required"` | |||||
| Flavor string `form:"flavor" binding:"Required"` | |||||
| PoolID string `form:"pool_id" binding:"Required"` | PoolID string `form:"pool_id" binding:"Required"` | ||||
| Flavor string `form:"flavor" binding:"Required"` | |||||
| Description string `form:"description"` | Description string `form:"description"` | ||||
| } | } | ||||
| @@ -37,6 +37,7 @@ const ( | |||||
| "]}" | "]}" | ||||
| CodePath = "/code/" | CodePath = "/code/" | ||||
| OutputPath = "/output/" | OutputPath = "/output/" | ||||
| LogPath = "/log/" | |||||
| JobPath = "/job/" | JobPath = "/job/" | ||||
| ) | ) | ||||
| @@ -49,8 +50,8 @@ type GenerateTrainJobReq struct { | |||||
| DataUrl string | DataUrl string | ||||
| TrainUrl string | TrainUrl string | ||||
| FlavorCode string | FlavorCode string | ||||
| LogUrl string | |||||
| PoolID string | PoolID string | ||||
| SpecID int64 | |||||
| WorkServerNumber int | WorkServerNumber int | ||||
| EngineID int64 | EngineID int64 | ||||
| } | } | ||||
| @@ -137,8 +138,8 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error { | |||||
| DataUrl: req.DataUrl, | DataUrl: req.DataUrl, | ||||
| EngineID: req.EngineID, | EngineID: req.EngineID, | ||||
| TrainUrl: req.TrainUrl, | TrainUrl: req.TrainUrl, | ||||
| LogUrl: req.LogUrl, | |||||
| PoolID: req.PoolID, | PoolID: req.PoolID, | ||||
| SpecID: req.SpecID, | |||||
| Flavor: models.Flavor{ | Flavor: models.Flavor{ | ||||
| Code: req.FlavorCode, | Code: req.FlavorCode, | ||||
| }, | }, | ||||
| @@ -86,6 +86,7 @@ func getToken() error { | |||||
| } | } | ||||
| TOKEN = res.Header().Get("X-Subject-Token") | TOKEN = res.Header().Get("X-Subject-Token") | ||||
| log.Info(TOKEN) | |||||
| return nil | return nil | ||||
| } | } | ||||
| @@ -311,7 +312,9 @@ sendjob: | |||||
| return nil, fmt.Errorf("resty create train-job: %s", err) | return nil, fmt.Errorf("resty create train-job: %s", err) | ||||
| } | } | ||||
| log.Info("", res.StatusCode(), res.Request.Body) | |||||
| //log.Info("%d", res.StatusCode()) | |||||
| //req, _ := json.Marshal(createJobParams) | |||||
| //log.Info("%s", req) | |||||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | ||||
| retry++ | retry++ | ||||
| @@ -320,7 +323,7 @@ sendjob: | |||||
| } | } | ||||
| if res.StatusCode() != http.StatusOK { | if res.StatusCode() != http.StatusOK { | ||||
| log.Error("createTrainJob failed(%d)", res.StatusCode()) | |||||
| log.Error("createTrainJob failed", res.StatusCode(), res.RawResponse.Body, result.ErrorCode, result.ErrorMsg) | |||||
| return &result, fmt.Errorf("createTrainJob failed(%d)", res.StatusCode()) | return &result, fmt.Errorf("createTrainJob failed(%d)", res.StatusCode()) | ||||
| } | } | ||||
| @@ -347,7 +350,7 @@ sendjob: | |||||
| Get(HOST + "/v1/" + setting.ProjectID + urlResourceSpecs) | Get(HOST + "/v1/" + setting.ProjectID + urlResourceSpecs) | ||||
| if err != nil { | if err != nil { | ||||
| return nil, fmt.Errorf("resty GetJob: %v", err) | |||||
| return nil, fmt.Errorf("resty GetResourceSpecs: %v", err) | |||||
| } | } | ||||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | ||||
| @@ -356,8 +359,6 @@ sendjob: | |||||
| goto sendjob | goto sendjob | ||||
| } | } | ||||
| log.Info("", res.StatusCode(), res.RawResponse.Body) | |||||
| if res.StatusCode() != http.StatusOK { | if res.StatusCode() != http.StatusOK { | ||||
| log.Error("GetResourceSpecs failed(%d)", res.StatusCode()) | log.Error("GetResourceSpecs failed(%d)", res.StatusCode()) | ||||
| return &result, fmt.Errorf("GetResourceSpecs failed(%d)", res.StatusCode()) | return &result, fmt.Errorf("GetResourceSpecs failed(%d)", res.StatusCode()) | ||||
| @@ -326,15 +326,6 @@ func TrainJobNew(ctx *context.Context) { | |||||
| } | } | ||||
| ctx.Data["flavor_infos"] = flavorInfos.Info | ctx.Data["flavor_infos"] = flavorInfos.Info | ||||
| res, err := modelarts.GetResourceSpecs() | |||||
| if err != nil { | |||||
| log.Error("GetResourceSpecs failed: %v", err) | |||||
| ctx.ServerError("GetResourceSpecs failed:", err) | |||||
| return | |||||
| } | |||||
| log.Info("", res.SpecTotalCount) | |||||
| ctx.HTML(200, tplModelArtsTrainJobNew) | ctx.HTML(200, tplModelArtsTrainJobNew) | ||||
| } | } | ||||
| @@ -348,11 +339,11 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) | |||||
| bootFile := form.BootFile | bootFile := form.BootFile | ||||
| flavorCode := form.Flavor | flavorCode := form.Flavor | ||||
| poolID := form.PoolID | poolID := form.PoolID | ||||
| specID := form.SpecID | |||||
| repo := ctx.Repo.Repository | repo := ctx.Repo.Repository | ||||
| codeLocalPath := setting.JobPath + jobName + modelarts.CodePath | codeLocalPath := setting.JobPath + jobName + modelarts.CodePath | ||||
| codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath | codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath | ||||
| outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath | outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath | ||||
| logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath | |||||
| dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/" | dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/" | ||||
| if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{}); err != nil { | if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{}); err != nil { | ||||
| @@ -363,8 +354,14 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) | |||||
| //todo: upload code (send to file_server todo this work?) | //todo: upload code (send to file_server todo this work?) | ||||
| if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil { | if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil { | ||||
| log.Error("Failed to obsMkdir: %s (%v)", repo.FullName(), err) | |||||
| ctx.RenderWithErr("Failed to obsMkdir", tplModelArtsTrainJobNew, &form) | |||||
| log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err) | |||||
| ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobNew, &form) | |||||
| return | |||||
| } | |||||
| if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath); err != nil { | |||||
| log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err) | |||||
| ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobNew, &form) | |||||
| return | return | ||||
| } | } | ||||
| @@ -382,10 +379,10 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) | |||||
| BootFile: codeObsPath + bootFile, | BootFile: codeObsPath + bootFile, | ||||
| TrainUrl: outputObsPath, | TrainUrl: outputObsPath, | ||||
| FlavorCode: flavorCode, | FlavorCode: flavorCode, | ||||
| PoolID: poolID, | |||||
| WorkServerNumber: workServerNumber, | WorkServerNumber: workServerNumber, | ||||
| EngineID: int64(engineID), | EngineID: int64(engineID), | ||||
| SpecID: int64(specID), | |||||
| LogUrl: logObsPath, | |||||
| PoolID: poolID, | |||||
| } | } | ||||
| err := modelarts.GenerateTrainJob(ctx, req) | err := modelarts.GenerateTrainJob(ctx, req) | ||||