From 08583f1234b552b8e9246b17c50f7301c86ab686 Mon Sep 17 00:00:00 2001 From: liuzx Date: Mon, 8 Nov 2021 18:18:50 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E8=AE=AD=E7=BB=83=E5=A4=9A?= =?UTF-8?q?=E7=89=88=E6=9C=AC=E8=B7=AF=E7=94=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- models/cloudbrain.go | 154 +++++++- modules/modelarts/modelarts.go | 94 +++++ modules/modelarts/resty.go | 46 +++ routers/repo/modelarts.go | 345 ++++++++++++++++-- routers/routes/routes.go | 3 + .../repo/modelarts/trainjob/version_new.tmpl | 0 6 files changed, 604 insertions(+), 38 deletions(-) create mode 100644 templates/repo/modelarts/trainjob/version_new.tmpl diff --git a/models/cloudbrain.go b/models/cloudbrain.go index 0da069580..0df6c2145 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -30,6 +30,7 @@ const ( JobTypeSnn4imagenet JobType = "SNN4IMAGENET" JobTypeBrainScore JobType = "BRAINSCORE" JobTypeTrain JobType = "TRAIN" + JobVersionName JobType = "V0001" ModelArtsCreateQueue ModelArtsJobStatus = "CREATE_QUEUING" //免费资源创建排队中 ModelArtsCreating ModelArtsJobStatus = "CREATING" //创建中 @@ -68,15 +69,33 @@ type Cloudbrain struct { CanDel bool `xorm:"-"` Type int `xorm:"INDEX DEFAULT 0"` - VersionID int64 `xorm:"INDEX DEFAULT 0"` - VersionName string - Uuid string - DatasetName string + VersionID int64 `xorm:"INDEX DEFAULT 0"` + VersionName string + Uuid string + DatasetName string + VersionCount int64 `xorm:"INDEX DEFAULT 1"` User *User `xorm:"-"` Repo *Repository `xorm:"-"` } +type TrainjobConfigDetail struct { + ID int64 `xorm:"pk autoincr"` + JobName string `xorm:"INDEX"` + ResourcePools string `xorm:"INDEX"` + EngineVersions int `xorm:"INDEX"` + FlavorInfos string `xorm:"INDEX"` + TrainUrl string `xorm:"INDEX"` + BootFile string `xorm:"INDEX"` + Uuid string `xorm:"INDEX"` + DatasetName string `xorm:"INDEX"` + Params string `xorm:"deleted"` + BranchName string `xorm:"INDEX"` + + // User *User `xorm:"-"` + // Repo *Repository `xorm:"-"` +} + type CloudbrainInfo struct { Cloudbrain `xorm:"extends"` User `xorm:"extends"` @@ -150,13 +169,15 @@ type CloudbrainsOptions struct { ListOptions RepoID int64 // include all repos if empty UserID int64 - JobID int64 + JobID string SortType string CloudbrainIDs []int64 // JobStatus CloudbrainStatus - Type int - JobType string + Type int + JobType string + VersionName string } + type TaskPod struct { TaskRoleStatus struct { Name string `json:"name"` @@ -594,6 +615,33 @@ type Config struct { PoolID string `json:"pool_id"` } +type CreateTrainJobVersionParams struct { + Description string `json:"job_desc"` + Config TrainJobVersionConfig `json:"config"` +} + +type TrainJobVersionConfig struct { + WorkServerNum int `json:"worker_server_num"` + AppUrl string `json:"app_url"` //训练作业的代码目录 + BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下 + Parameter []Parameter `json:"parameter"` + DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL + //DatasetID string `json:"dataset_id"` + //DataVersionID string `json:"dataset_version_id"` + //DataSource []DataSource `json:"data_source"` + //SpecID int64 `json:"spec_id"` + EngineID int64 `json:"engine_id"` + //ModelID int64 `json:"model_id"` + TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL + LogUrl string `json:"log_url"` + //UserImageUrl string `json:"user_image_url"` + //UserCommand string `json:"user_command"` + //Volumes []Volumes `json:"volumes"` + Flavor Flavor `json:"flavor"` + PoolID string `json:"pool_id"` + PreVersionId int64 `json:"pre_version_id"` +} + type CreateConfigParams struct { ConfigName string `json:"config_name"` Description string `json:"config_desc"` @@ -837,7 +885,7 @@ func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { ) } - if (opts.JobID) > 0 { + if (opts.JobID) != "" { cond = cond.And( builder.Eq{"cloudbrain.job_id": opts.JobID}, ) @@ -855,6 +903,12 @@ func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { ) } + if (opts.VersionName) != "" { + cond = cond.And( + builder.Eq{"cloudbrain.version_name": opts.VersionName}, + ) + } + // switch opts.JobStatus { // case JobWaiting: // cond.And(builder.Eq{"cloudbrain.status": int(JobWaiting)}) @@ -897,6 +951,72 @@ func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { return cloudbrains, count, nil } +func CloudbrainsVersionList(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { + sess := x.NewSession() + defer sess.Close() + + var cond = builder.NewCond() + if opts.RepoID > 0 { + cond = cond.And( + builder.Eq{"cloudbrain.repo_id": opts.RepoID}, + ) + } + + if opts.UserID > 0 { + cond = cond.And( + builder.Eq{"cloudbrain.user_id": opts.UserID}, + ) + } + + if (opts.Type) >= 0 { + cond = cond.And( + builder.Eq{"cloudbrain.type": opts.Type}, + ) + } + + if (opts.JobID) != "" { + cond = cond.And( + builder.Eq{"cloudbrain.job_id": opts.JobID}, + ) + } + + if (opts.JobType) != "" { + cond = cond.And( + builder.Eq{"cloudbrain.job_type": opts.JobType}, + ) + } + + if len(opts.CloudbrainIDs) > 0 { + cond = cond.And(builder.In("cloudbrain.id", opts.CloudbrainIDs)) + } + + count, err := sess.Where(cond).Count(new(Cloudbrain)) + if err != nil { + return nil, 0, fmt.Errorf("Count: %v", err) + } + + if opts.Page >= 0 && opts.PageSize > 0 { + var start int + if opts.Page == 0 { + start = 0 + } else { + start = (opts.Page - 1) * opts.PageSize + } + sess.Limit(opts.PageSize, start) + } + + sess.OrderBy("cloudbrain.created_unix DESC") + cloudbrains := make([]*CloudbrainInfo, 0, setting.UI.IssuePagingNum) + if err := sess.Table(&Cloudbrain{}).Where(cond). + Join("left", "`user`", "cloudbrain.user_id = `user`.id"). + Find(&cloudbrains); err != nil { + return nil, 0, fmt.Errorf("Find: %v", err) + } + sess.Close() + + return cloudbrains, count, nil +} + func CreateCloudbrain(cloudbrain *Cloudbrain) (err error) { if _, err = x.Insert(cloudbrain); err != nil { return err @@ -904,6 +1024,13 @@ func CreateCloudbrain(cloudbrain *Cloudbrain) (err error) { return nil } +func CreateTrainjobConfigDetail(trainjobConfigDetail *TrainjobConfigDetail) (err error) { + if _, err = x.Insert(trainjobConfigDetail); err != nil { + return err + } + return nil +} + func getRepoCloudBrain(cb *Cloudbrain) (*Cloudbrain, error) { has, err := x.Get(cb) if err != nil { @@ -924,6 +1051,11 @@ func GetCloudbrainByJobID(jobID string) (*Cloudbrain, error) { return getRepoCloudBrain(cb) } +func GetCloudbrainByJobIDAndVersionName(jobID string, versionName string) (*Cloudbrain, error) { + cb := &Cloudbrain{JobID: jobID, VersionName: versionName} + return getRepoCloudBrain(cb) +} + func GetCloudbrainsNeededStopByUserID(userID int64) ([]*Cloudbrain, error) { cloudBrains := make([]*Cloudbrain, 0) err := x.Cols("job_id", "status", "type").Where("user_id=? AND status !=?", userID, string(JobStopped)).Find(&cloudBrains) @@ -948,6 +1080,12 @@ func SetTrainJobStatusByJobID(jobID string, status string, duration int64, train return } +func SetVersionCountByJobID(jobID string, versionName string, versionCount int64) (err error) { + cb := &Cloudbrain{JobID: jobID, VersionName: versionName, VersionCount: versionCount} + _, err = x.Cols("version_Count").Where("cloudbrain.job_id=? AND cloudbrain.version_name=?", jobID, versionName).Update(cb) + return +} + func UpdateJob(job *Cloudbrain) error { return updateJob(x, job) } diff --git a/modules/modelarts/modelarts.go b/modules/modelarts/modelarts.go index e1dbe9f5a..4bff6a347 100755 --- a/modules/modelarts/modelarts.go +++ b/modules/modelarts/modelarts.go @@ -71,6 +71,23 @@ type GenerateTrainJobReq struct { Parameters []models.Parameter } +type GenerateTrainJobVersionReq struct { + JobName string + Uuid string + Description string + CodeObsPath string + BootFile string + DataUrl string + TrainUrl string + FlavorCode string + LogUrl string + PoolID string + WorkServerNumber int + EngineID int64 + Parameters []models.Parameter + PreVersionId int64 +} + type VersionInfo struct { Version []struct { ID int `json:"id"` @@ -223,6 +240,83 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error { return nil } +func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobVersionReq, jobId string) error { + jobResult, err := createTrainJobVersion(models.CreateTrainJobVersionParams{ + Description: req.Description, + Config: models.TrainJobVersionConfig{ + WorkServerNum: req.WorkServerNumber, + AppUrl: req.CodeObsPath, + BootFileUrl: req.BootFile, + DataUrl: req.DataUrl, + EngineID: req.EngineID, + TrainUrl: req.TrainUrl, + LogUrl: req.LogUrl, + PoolID: req.PoolID, + Flavor: models.Flavor{ + Code: req.FlavorCode, + }, + Parameter: req.Parameters, + PreVersionId: req.PreVersionId, + }, + }, jobId) + if err != nil { + log.Error("CreateJob failed: %v", err.Error()) + return err + } + + attach, err := models.GetAttachmentByUUID(req.Uuid) + if err != nil { + log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error()) + return nil + } + + err = models.CreateCloudbrain(&models.Cloudbrain{ + Status: TransTrainJobStatus(jobResult.Status), + UserID: ctx.User.ID, + RepoID: ctx.Repo.Repository.ID, + JobID: strconv.FormatInt(jobResult.JobID, 10), + JobName: req.JobName, + JobType: string(models.JobTypeTrain), + Type: models.TypeCloudBrainTwo, + VersionID: jobResult.VersionID, + VersionName: jobResult.VersionName, + Uuid: req.Uuid, + DatasetName: attach.Name, + }) + if err != nil { + log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error()) + return err + } + + repo := ctx.Repo.Repository + page := ctx.QueryInt("page") + if page <= 0 { + page = 1 + } + _, VersionListCount, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{ + ListOptions: models.ListOptions{ + Page: page, + PageSize: setting.UI.IssuePagingNum, + }, + RepoID: repo.ID, + Type: models.TypeCloudBrainTwo, + JobType: string(models.JobTypeTrain), + JobID: strconv.FormatInt(jobResult.JobID, 10), + }) + if err != nil { + ctx.ServerError("Cloudbrain", err) + return nil + } + versionName := "V0001" + err = models.SetVersionCountByJobID(strconv.FormatInt(jobResult.JobID, 10), versionName, VersionListCount) + if err != nil { + ctx.ServerError("UpdateJobVersionCount failed", err) + return nil + } + + return nil +} + func TransTrainJobStatus(status int) string { switch status { case 0: diff --git a/modules/modelarts/resty.go b/modules/modelarts/resty.go index d17478c94..c967c0eda 100755 --- a/modules/modelarts/resty.go +++ b/modules/modelarts/resty.go @@ -377,6 +377,52 @@ sendjob: return &result, nil } +func createTrainJobVersion(createJobVersionParams models.CreateTrainJobVersionParams, jobID string) (*models.CreateTrainJobResult, error) { + checkSetting() + client := getRestyClient() + var result models.CreateTrainJobResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetBody(createJobVersionParams). + SetResult(&result). + Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions") + + if err != nil { + return nil, fmt.Errorf("resty create train-job version: %s", err) + } + + req, _ := json.Marshal(createJobVersionParams) + log.Info("%s", req) + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("createTrainJobVersion failed(%s): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("createTrainJobVersion failed(%s): %s", result.ErrorCode, result.ErrorMsg) + } + + return &result, nil +} + func GetResourceSpecs() (*models.GetResourceSpecsResult, error) { checkSetting() client := getRestyClient() diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index cc2c75485..5f0b8c4f9 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -40,6 +40,7 @@ const ( tplModelArtsTrainJobNew base.TplName = "repo/modelarts/trainjob/new" tplModelArtsTrainJobShow base.TplName = "repo/modelarts/trainjob/show" tplModelArtsTrainJobShowModels base.TplName = "repo/modelarts/trainjob/models/index" + tplModelArtsTrainJobVersionNew base.TplName = "repo/modelarts/trainjob/version_new" ) // MustEnableDataset check if repository enable internal cb @@ -286,8 +287,8 @@ func NotebookIndex(ctx *context.Context) { Page: page, PageSize: setting.UI.IssuePagingNum, }, - RepoID: repo.ID, - Type: models.TypeCloudBrainTwo, + RepoID: repo.ID, + Type: models.TypeCloudBrainTwo, JobType: string(models.JobTypeDebug), }) if err != nil { @@ -493,14 +494,6 @@ func NotebookDel(ctx *context.Context) { func TrainJobIndex(ctx *context.Context) { MustEnableModelArts(ctx) - //can, err := canUserCreateTrainJob(ctx.User.ID) - //if err != nil { - // ctx.ServerError("canUserCreateTrainJob", err) - // return - //} - // - //ctx.Data["CanCreate"] = can - repo := ctx.Repo.Repository page := ctx.QueryInt("page") if page <= 0 { @@ -512,9 +505,10 @@ func TrainJobIndex(ctx *context.Context) { Page: page, PageSize: setting.UI.IssuePagingNum, }, - RepoID: repo.ID, - Type: models.TypeCloudBrainTwo, - JobType: string(models.JobTypeTrain), + RepoID: repo.ID, + Type: models.TypeCloudBrainTwo, + JobType: string(models.JobTypeTrain), + VersionName: string(models.JobVersionName), }) if err != nil { ctx.ServerError("Cloudbrain", err) @@ -614,6 +608,82 @@ func trainJobNewDataPrepare(ctx *context.Context) error { return nil } +func TrainJobNewVersion(ctx *context.Context) { + err := trainJobNewVersionDataPrepare(ctx) + if err != nil { + ctx.ServerError("get new train-job info failed", err) + return + } + ctx.HTML(200, tplModelArtsTrainJobVersionNew) +} + +func trainJobNewVersionDataPrepare(ctx *context.Context) error { + ctx.Data["PageIsCloudBrain"] = true + var jobID = ctx.Params(":jobid") + var versionName = ctx.Query("versionName") + + t := time.Now() + var jobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:] + ctx.Data["job_name"] = jobName + + attachs, err := models.GetModelArtsUserAttachments(ctx.User.ID) + if err != nil { + ctx.ServerError("GetAllUserAttachments failed:", err) + return err + } + ctx.Data["attachments"] = attachs + + var resourcePools modelarts.ResourcePool + if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil { + ctx.ServerError("json.Unmarshal failed:", err) + return err + } + ctx.Data["resource_pools"] = resourcePools.Info + + var engines modelarts.Engine + if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil { + ctx.ServerError("json.Unmarshal failed:", err) + return err + } + ctx.Data["engines"] = engines.Info + + var versionInfos modelarts.VersionInfo + if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil { + ctx.ServerError("json.Unmarshal failed:", err) + return err + } + ctx.Data["engine_versions"] = versionInfos.Version + + var flavorInfos modelarts.Flavor + if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil { + ctx.ServerError("json.Unmarshal failed:", err) + return err + } + ctx.Data["flavor_infos"] = flavorInfos.Info + + outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + ctx.Data["train_url"] = outputObsPath + + Branches, err := ctx.Repo.GitRepo.GetBranches() + if err != nil { + ctx.ServerError("GetBranches error:", err) + return err + } + ctx.Data["Branches"] = Branches + ctx.Data["BranchesCount"] = len(Branches) + ctx.Data["jobID"] = jobID + ctx.Data["versionName"] = versionName + + configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom) + if err != nil { + ctx.ServerError("getConfigList failed:", err) + return err + } + ctx.Data["config_list"] = configList.ParaConfigs + + return nil +} + func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) { ctx.Data["PageIsTrainJob"] = true jobName := form.JobName @@ -634,19 +704,6 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/" branch_name := form.BranchName - //can, err := canUserCreateTrainJob(ctx.User.ID) - //if err != nil { - // ctx.ServerError("canUserCreateTrainJob", err) - // return - //} - // - //if !can { - // log.Error("the user can not create train-job") - // ctx.RenderWithErr("the user can not create train-job", tplModelArtsTrainJobNew, &form) - // return - //} - - //param check if err := paramCheckCreateTrainJob(form); err != nil { log.Error("paramCheckCreateTrainJob failed:(%v)", err) trainJobNewDataPrepare(ctx) @@ -665,9 +722,6 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) if err == nil { os.RemoveAll(codeLocalPath) } - // branch_name := "testbranch" - // gitRepo, _ := git.OpenRepository(repo.RepoPath()) - // commitID, _ := gitRepo.GetBranchCommitID(branch_name) if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{ Branch: branch_name, @@ -786,7 +840,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) LogUrl: logObsPath, PoolID: poolID, Uuid: uuid, - Parameters: param, + Parameters: parameters.Parameter, } err = modelarts.GenerateTrainJob(ctx, req) @@ -797,12 +851,220 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) ctx.Data["uuid"] = form.Attachment ctx.Data["datasetName"] = attach.Name ctx.Data["params"] = form.Params + ctx.Data["branch_name"] = branch_name ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) return } ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") } +func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) { + ctx.Data["PageIsTrainJob"] = true + var jobID = ctx.Params(":jobid") + var versionName = ctx.Query("versionName") + jobName := form.JobName + uuid := form.Attachment + description := form.Description + workServerNumber := form.WorkServerNumber + engineID := form.EngineID + bootFile := form.BootFile + flavorCode := form.Flavor + params := form.Params + poolID := form.PoolID + isSaveParam := form.IsSaveParam + repo := ctx.Repo.Repository + codeLocalPath := setting.JobPath + jobName + modelarts.CodePath + codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath + outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/" + branch_name := form.BranchName + + if err := paramCheckCreateTrainJob(form); err != nil { + log.Error("paramCheckCreateTrainJob failed:(%v)", err) + trainJobNewVersionDataPrepare(ctx) + ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form) + return + } + + attach, err := models.GetAttachmentByUUID(uuid) + if err != nil { + log.Error("GetAttachmentByUUID(%s) failed:%v", uuid, err.Error()) + return + } + + //todo: del the codeLocalPath + _, err = ioutil.ReadDir(codeLocalPath) + if err == nil { + os.RemoveAll(codeLocalPath) + } + + if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{ + Branch: branch_name, + }); err != nil { + log.Error("创建任务失败,任务名称已存在!: %s (%v)", repo.FullName(), err) + trainJobNewVersionDataPrepare(ctx) + + ctx.Data["bootFile"] = form.BootFile + ctx.Data["uuid"] = form.Attachment + ctx.Data["datasetName"] = attach.Name + ctx.Data["params"] = form.Params + ctx.Data["branch_name"] = branch_name + // ctx.RenderWithErr("Failed to clone repository", tplModelArtsTrainJobNew, &form) + ctx.RenderWithErr("创建任务失败,任务名称已存在!", tplModelArtsTrainJobVersionNew, &form) + // ctx.RenderWithErr(err, tplModelArtsTrainJobNew, &form) + return + } + + //todo: upload code (send to file_server todo this work?) + if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil { + log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err) + trainJobNewVersionDataPrepare(ctx) + ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobVersionNew, &form) + return + } + + if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath); err != nil { + log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err) + trainJobNewVersionDataPrepare(ctx) + ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobVersionNew, &form) + return + } + + if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil { + log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err) + trainJobNewVersionDataPrepare(ctx) + ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobVersionNew, &form) + return + } + + //todo: del local code? + + var parameters models.Parameters + param := make([]models.Parameter, 0) + param = append(param, models.Parameter{ + Label: modelarts.TrainUrl, + Value: outputObsPath, + }, models.Parameter{ + Label: modelarts.DataUrl, + Value: dataPath, + }) + if len(params) != 0 { + err := json.Unmarshal([]byte(params), ¶meters) + if err != nil { + log.Error("Failed to Unmarshal params: %s (%v)", params, err) + trainJobNewVersionDataPrepare(ctx) + ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobVersionNew, &form) + return + } + + for _, parameter := range parameters.Parameter { + if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl { + param = append(param, models.Parameter{ + Label: parameter.Label, + Value: parameter.Value, + }) + } + } + } + + //save param config + if isSaveParam == "on" { + if form.ParameterTemplateName == "" { + log.Error("ParameterTemplateName is empty") + trainJobNewVersionDataPrepare(ctx) + ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobVersionNew, &form) + return + } + + _, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{ + ConfigName: form.ParameterTemplateName, + Description: form.PrameterDescription, + DataUrl: dataPath, + AppUrl: codeObsPath, + BootFileUrl: codeObsPath + bootFile, + TrainUrl: outputObsPath, + Flavor: models.Flavor{ + Code: flavorCode, + }, + WorkServerNum: workServerNumber, + EngineID: int64(engineID), + LogUrl: logObsPath, + PoolID: poolID, + Parameter: parameters.Parameter, + }) + + if err != nil { + log.Error("Failed to CreateTrainJobConfig: %v", err) + trainJobNewVersionDataPrepare(ctx) + ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobVersionNew, &form) + return + } + } + // JobVersionName := "V0001" + // PreVersionId := int64(67646) + task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName) + if err != nil { + log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error()) + ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form) + return + } + req := &modelarts.GenerateTrainJobVersionReq{ + JobName: task.JobName, + DataUrl: dataPath, + Description: description, + CodeObsPath: codeObsPath, + BootFile: codeObsPath + bootFile, + TrainUrl: outputObsPath, + FlavorCode: flavorCode, + WorkServerNumber: workServerNumber, + EngineID: int64(engineID), + LogUrl: logObsPath, + PoolID: poolID, + Uuid: uuid, + Parameters: parameters.Parameter, + PreVersionId: task.VersionID, + } + err = modelarts.GenerateTrainJobVersion(ctx, req, jobID) + if err != nil { + log.Error("GenerateTrainJob failed:%v", err.Error()) + trainJobNewVersionDataPrepare(ctx) + ctx.Data["bootFile"] = form.BootFile + ctx.Data["uuid"] = form.Attachment + ctx.Data["datasetName"] = attach.Name + ctx.Data["params"] = form.Params + ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form) + return + } + //保存openi创建训练任务界面的参数 + // err = models.CreateTrainjobConfigDetail(&models.TrainjobConfigDetail{ + + // JobName: req.JobName, + // ResourcePools: form.PoolID, + // EngineVersions: form.EngineID, + // FlavorInfos: form.Flavor, + // TrainUrl: outputObsPath, + // BootFile: form.BootFile, + // Uuid: form.Attachment, + // DatasetName: attach.Name, + // Params: form.Params, + // BranchName: branch_name, + // }) + + // if err != nil { + // log.Error("CreateTrainjobConfigDetail failed:%v", err.Error()) + // trainJobNewVersionDataPrepare(ctx) + // ctx.Data["bootFile"] = form.BootFile + // ctx.Data["uuid"] = form.Attachment + // ctx.Data["datasetName"] = attach.Name + // ctx.Data["params"] = form.Params + // ctx.Data["branch_name"] = branch_name + // ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) + // return + // } + ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") +} + // readDir reads the directory named by dirname and returns // a list of directory entries sorted by filename. func readDir(dirname string) ([]os.FileInfo, error) { @@ -895,6 +1157,27 @@ func TrainJobShow(ctx *context.Context) { var jobID = ctx.Params(":jobid") task, err := models.GetCloudbrainByJobID(jobID) + + repo := ctx.Repo.Repository + page := ctx.QueryInt("page") + if page <= 0 { + page = 1 + } + VersionListTasks, VersionListCount, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{ + ListOptions: models.ListOptions{ + Page: page, + PageSize: setting.UI.IssuePagingNum, + }, + RepoID: repo.ID, + Type: models.TypeCloudBrainTwo, + JobType: string(models.JobTypeTrain), + JobID: jobID, + }) + if err != nil { + ctx.ServerError("Cloudbrain", err) + return + } + if err != nil { log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error()) ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil) @@ -945,6 +1228,8 @@ func TrainJobShow(ctx *context.Context) { ctx.Data["task"] = task ctx.Data["jobID"] = jobID ctx.Data["result"] = result + ctx.Data["VersionListTasks"] = VersionListTasks + ctx.Data["VersionLisCount"] = VersionListCount ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow) } diff --git a/routers/routes/routes.go b/routers/routes/routes.go index 7e7d0642a..0a5464065 100755 --- a/routers/routes/routes.go +++ b/routers/routes/routes.go @@ -993,9 +993,12 @@ func RegisterRoutes(m *macaron.Macaron) { m.Get("/log", reqRepoCloudBrainReader, repo.TrainJobGetLog) m.Get("/models", reqRepoCloudBrainReader, repo.TrainJobShowModels) m.Get("/download_model", reqRepoCloudBrainReader, repo.TrainJobDownloadModel) + m.Get("/create_version", reqRepoCloudBrainReader, repo.TrainJobNewVersion) + m.Post("/create_version", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreateVersion) }) m.Get("/create", reqRepoCloudBrainReader, repo.TrainJobNew) m.Post("/create", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreate) + m.Get("/para-config-list", reqRepoCloudBrainReader, repo.TrainJobGetConfigList) }) }, context.RepoRef()) diff --git a/templates/repo/modelarts/trainjob/version_new.tmpl b/templates/repo/modelarts/trainjob/version_new.tmpl new file mode 100644 index 000000000..e69de29bb