From 2436bb351f66cf45a09ba94582ee368142c30b84 Mon Sep 17 00:00:00 2001 From: liuzx Date: Fri, 5 Nov 2021 11:22:29 +0800 Subject: [PATCH 01/66] trainjob add branches --- models/cloudbrain.go | 14 +++++++------- modules/auth/modelarts.go | 1 + routers/repo/modelarts.go | 19 +++++++++++++++++-- 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/models/cloudbrain.go b/models/cloudbrain.go index 563ab9d06..557a223a1 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -782,12 +782,13 @@ type GetTrainJobResult struct { //UserImageUrl string `json:"user_image_url"` //UserCommand string `json:"user_command"` //Volumes []Volumes `json:"volumes"` - Flavor Flavor `json:"flavor"` - PoolID string `json:"pool_id"` - PoolName string `json:"pool_name"` - NasMountPath string `json:"nas_mount_path"` - NasShareAddr string `json:"nas_share_addr"` - DatasetName string + Flavor Flavor `json:"flavor"` + PoolID string `json:"pool_id"` + PoolName string `json:"pool_name"` + NasMountPath string `json:"nas_mount_path"` + NasShareAddr string `json:"nas_share_addr"` + DatasetName string + ModelMetricList string `json:"model_metric_list"` //列表里包含f1_score,recall,precision,accuracy,若有的话 } type GetTrainJobLogResult struct { @@ -892,7 +893,6 @@ func CreateCloudbrain(cloudbrain *Cloudbrain) (err error) { if _, err = x.Insert(cloudbrain); err != nil { return err } - return nil } diff --git a/modules/auth/modelarts.go b/modules/auth/modelarts.go index f2e5aeed5..1eb214392 100755 --- a/modules/auth/modelarts.go +++ b/modules/auth/modelarts.go @@ -38,6 +38,7 @@ type CreateModelArtsTrainJobForm struct { IsSaveParam string `form:"is_save_para"` ParameterTemplateName string `form:"parameter_template_name"` PrameterDescription string `form:"parameter_description"` + BranchName string `form:"branch_name"` } func (f *CreateModelArtsTrainJobForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors { diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index ea8ab0c06..7b34ad470 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -594,12 +594,19 @@ func trainJobNewDataPrepare(ctx *context.Context) error { outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath ctx.Data["train_url"] = outputObsPath + Branches, err := ctx.Repo.GitRepo.GetBranches() + if err != nil { + ctx.ServerError("GetBranches error:", err) + return err + } + ctx.Data["Branches"] = Branches + ctx.Data["BranchesCount"] = len(Branches) + configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom) if err != nil { ctx.ServerError("getConfigList failed:", err) return err } - ctx.Data["config_list"] = configList.ParaConfigs return nil @@ -623,6 +630,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/" + branch_name := form.BranchName //can, err := canUserCreateTrainJob(ctx.User.ID) //if err != nil { @@ -655,7 +663,13 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) if err == nil { os.RemoveAll(codeLocalPath) } - if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{}); err != nil { + // branch_name := "testbranch" + // gitRepo, _ := git.OpenRepository(repo.RepoPath()) + // commitID, _ := gitRepo.GetBranchCommitID(branch_name) + + if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{ + Branch: branch_name, + }); err != nil { log.Error("创建任务失败,任务名称已存在!: %s (%v)", repo.FullName(), err) trainJobNewDataPrepare(ctx) @@ -663,6 +677,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) ctx.Data["uuid"] = form.Attachment ctx.Data["datasetName"] = attach.Name ctx.Data["params"] = form.Params + ctx.Data["branch_name"] = branch_name trainJobNewDataPrepare(ctx) // ctx.RenderWithErr("Failed to clone repository", tplModelArtsTrainJobNew, &form) ctx.RenderWithErr("创建任务失败,任务名称已存在!", tplModelArtsTrainJobNew, &form) From 08583f1234b552b8e9246b17c50f7301c86ab686 Mon Sep 17 00:00:00 2001 From: liuzx Date: Mon, 8 Nov 2021 18:18:50 +0800 Subject: [PATCH 02/66] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E8=AE=AD=E7=BB=83?= =?UTF-8?q?=E5=A4=9A=E7=89=88=E6=9C=AC=E8=B7=AF=E7=94=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- models/cloudbrain.go | 154 +++++++- modules/modelarts/modelarts.go | 94 +++++ modules/modelarts/resty.go | 46 +++ routers/repo/modelarts.go | 345 ++++++++++++++++-- routers/routes/routes.go | 3 + .../repo/modelarts/trainjob/version_new.tmpl | 0 6 files changed, 604 insertions(+), 38 deletions(-) create mode 100644 templates/repo/modelarts/trainjob/version_new.tmpl diff --git a/models/cloudbrain.go b/models/cloudbrain.go index 0da069580..0df6c2145 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -30,6 +30,7 @@ const ( JobTypeSnn4imagenet JobType = "SNN4IMAGENET" JobTypeBrainScore JobType = "BRAINSCORE" JobTypeTrain JobType = "TRAIN" + JobVersionName JobType = "V0001" ModelArtsCreateQueue ModelArtsJobStatus = "CREATE_QUEUING" //免费资源创建排队中 ModelArtsCreating ModelArtsJobStatus = "CREATING" //创建中 @@ -68,15 +69,33 @@ type Cloudbrain struct { CanDel bool `xorm:"-"` Type int `xorm:"INDEX DEFAULT 0"` - VersionID int64 `xorm:"INDEX DEFAULT 0"` - VersionName string - Uuid string - DatasetName string + VersionID int64 `xorm:"INDEX DEFAULT 0"` + VersionName string + Uuid string + DatasetName string + VersionCount int64 `xorm:"INDEX DEFAULT 1"` User *User `xorm:"-"` Repo *Repository `xorm:"-"` } +type TrainjobConfigDetail struct { + ID int64 `xorm:"pk autoincr"` + JobName string `xorm:"INDEX"` + ResourcePools string `xorm:"INDEX"` + EngineVersions int `xorm:"INDEX"` + FlavorInfos string `xorm:"INDEX"` + TrainUrl string `xorm:"INDEX"` + BootFile string `xorm:"INDEX"` + Uuid string `xorm:"INDEX"` + DatasetName string `xorm:"INDEX"` + Params string `xorm:"deleted"` + BranchName string `xorm:"INDEX"` + + // User *User `xorm:"-"` + // Repo *Repository `xorm:"-"` +} + type CloudbrainInfo struct { Cloudbrain `xorm:"extends"` User `xorm:"extends"` @@ -150,13 +169,15 @@ type CloudbrainsOptions struct { ListOptions RepoID int64 // include all repos if empty UserID int64 - JobID int64 + JobID string SortType string CloudbrainIDs []int64 // JobStatus CloudbrainStatus - Type int - JobType string + Type int + JobType string + VersionName string } + type TaskPod struct { TaskRoleStatus struct { Name string `json:"name"` @@ -594,6 +615,33 @@ type Config struct { PoolID string `json:"pool_id"` } +type CreateTrainJobVersionParams struct { + Description string `json:"job_desc"` + Config TrainJobVersionConfig `json:"config"` +} + +type TrainJobVersionConfig struct { + WorkServerNum int `json:"worker_server_num"` + AppUrl string `json:"app_url"` //训练作业的代码目录 + BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下 + Parameter []Parameter `json:"parameter"` + DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL + //DatasetID string `json:"dataset_id"` + //DataVersionID string `json:"dataset_version_id"` + //DataSource []DataSource `json:"data_source"` + //SpecID int64 `json:"spec_id"` + EngineID int64 `json:"engine_id"` + //ModelID int64 `json:"model_id"` + TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL + LogUrl string `json:"log_url"` + //UserImageUrl string `json:"user_image_url"` + //UserCommand string `json:"user_command"` + //Volumes []Volumes `json:"volumes"` + Flavor Flavor `json:"flavor"` + PoolID string `json:"pool_id"` + PreVersionId int64 `json:"pre_version_id"` +} + type CreateConfigParams struct { ConfigName string `json:"config_name"` Description string `json:"config_desc"` @@ -837,7 +885,7 @@ func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { ) } - if (opts.JobID) > 0 { + if (opts.JobID) != "" { cond = cond.And( builder.Eq{"cloudbrain.job_id": opts.JobID}, ) @@ -855,6 +903,12 @@ func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { ) } + if (opts.VersionName) != "" { + cond = cond.And( + builder.Eq{"cloudbrain.version_name": opts.VersionName}, + ) + } + // switch opts.JobStatus { // case JobWaiting: // cond.And(builder.Eq{"cloudbrain.status": int(JobWaiting)}) @@ -897,6 +951,72 @@ func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { return cloudbrains, count, nil } +func CloudbrainsVersionList(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { + sess := x.NewSession() + defer sess.Close() + + var cond = builder.NewCond() + if opts.RepoID > 0 { + cond = cond.And( + builder.Eq{"cloudbrain.repo_id": opts.RepoID}, + ) + } + + if opts.UserID > 0 { + cond = cond.And( + builder.Eq{"cloudbrain.user_id": opts.UserID}, + ) + } + + if (opts.Type) >= 0 { + cond = cond.And( + builder.Eq{"cloudbrain.type": opts.Type}, + ) + } + + if (opts.JobID) != "" { + cond = cond.And( + builder.Eq{"cloudbrain.job_id": opts.JobID}, + ) + } + + if (opts.JobType) != "" { + cond = cond.And( + builder.Eq{"cloudbrain.job_type": opts.JobType}, + ) + } + + if len(opts.CloudbrainIDs) > 0 { + cond = cond.And(builder.In("cloudbrain.id", opts.CloudbrainIDs)) + } + + count, err := sess.Where(cond).Count(new(Cloudbrain)) + if err != nil { + return nil, 0, fmt.Errorf("Count: %v", err) + } + + if opts.Page >= 0 && opts.PageSize > 0 { + var start int + if opts.Page == 0 { + start = 0 + } else { + start = (opts.Page - 1) * opts.PageSize + } + sess.Limit(opts.PageSize, start) + } + + sess.OrderBy("cloudbrain.created_unix DESC") + cloudbrains := make([]*CloudbrainInfo, 0, setting.UI.IssuePagingNum) + if err := sess.Table(&Cloudbrain{}).Where(cond). + Join("left", "`user`", "cloudbrain.user_id = `user`.id"). + Find(&cloudbrains); err != nil { + return nil, 0, fmt.Errorf("Find: %v", err) + } + sess.Close() + + return cloudbrains, count, nil +} + func CreateCloudbrain(cloudbrain *Cloudbrain) (err error) { if _, err = x.Insert(cloudbrain); err != nil { return err @@ -904,6 +1024,13 @@ func CreateCloudbrain(cloudbrain *Cloudbrain) (err error) { return nil } +func CreateTrainjobConfigDetail(trainjobConfigDetail *TrainjobConfigDetail) (err error) { + if _, err = x.Insert(trainjobConfigDetail); err != nil { + return err + } + return nil +} + func getRepoCloudBrain(cb *Cloudbrain) (*Cloudbrain, error) { has, err := x.Get(cb) if err != nil { @@ -924,6 +1051,11 @@ func GetCloudbrainByJobID(jobID string) (*Cloudbrain, error) { return getRepoCloudBrain(cb) } +func GetCloudbrainByJobIDAndVersionName(jobID string, versionName string) (*Cloudbrain, error) { + cb := &Cloudbrain{JobID: jobID, VersionName: versionName} + return getRepoCloudBrain(cb) +} + func GetCloudbrainsNeededStopByUserID(userID int64) ([]*Cloudbrain, error) { cloudBrains := make([]*Cloudbrain, 0) err := x.Cols("job_id", "status", "type").Where("user_id=? AND status !=?", userID, string(JobStopped)).Find(&cloudBrains) @@ -948,6 +1080,12 @@ func SetTrainJobStatusByJobID(jobID string, status string, duration int64, train return } +func SetVersionCountByJobID(jobID string, versionName string, versionCount int64) (err error) { + cb := &Cloudbrain{JobID: jobID, VersionName: versionName, VersionCount: versionCount} + _, err = x.Cols("version_Count").Where("cloudbrain.job_id=? AND cloudbrain.version_name=?", jobID, versionName).Update(cb) + return +} + func UpdateJob(job *Cloudbrain) error { return updateJob(x, job) } diff --git a/modules/modelarts/modelarts.go b/modules/modelarts/modelarts.go index e1dbe9f5a..4bff6a347 100755 --- a/modules/modelarts/modelarts.go +++ b/modules/modelarts/modelarts.go @@ -71,6 +71,23 @@ type GenerateTrainJobReq struct { Parameters []models.Parameter } +type GenerateTrainJobVersionReq struct { + JobName string + Uuid string + Description string + CodeObsPath string + BootFile string + DataUrl string + TrainUrl string + FlavorCode string + LogUrl string + PoolID string + WorkServerNumber int + EngineID int64 + Parameters []models.Parameter + PreVersionId int64 +} + type VersionInfo struct { Version []struct { ID int `json:"id"` @@ -223,6 +240,83 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error { return nil } +func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobVersionReq, jobId string) error { + jobResult, err := createTrainJobVersion(models.CreateTrainJobVersionParams{ + Description: req.Description, + Config: models.TrainJobVersionConfig{ + WorkServerNum: req.WorkServerNumber, + AppUrl: req.CodeObsPath, + BootFileUrl: req.BootFile, + DataUrl: req.DataUrl, + EngineID: req.EngineID, + TrainUrl: req.TrainUrl, + LogUrl: req.LogUrl, + PoolID: req.PoolID, + Flavor: models.Flavor{ + Code: req.FlavorCode, + }, + Parameter: req.Parameters, + PreVersionId: req.PreVersionId, + }, + }, jobId) + if err != nil { + log.Error("CreateJob failed: %v", err.Error()) + return err + } + + attach, err := models.GetAttachmentByUUID(req.Uuid) + if err != nil { + log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error()) + return nil + } + + err = models.CreateCloudbrain(&models.Cloudbrain{ + Status: TransTrainJobStatus(jobResult.Status), + UserID: ctx.User.ID, + RepoID: ctx.Repo.Repository.ID, + JobID: strconv.FormatInt(jobResult.JobID, 10), + JobName: req.JobName, + JobType: string(models.JobTypeTrain), + Type: models.TypeCloudBrainTwo, + VersionID: jobResult.VersionID, + VersionName: jobResult.VersionName, + Uuid: req.Uuid, + DatasetName: attach.Name, + }) + if err != nil { + log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error()) + return err + } + + repo := ctx.Repo.Repository + page := ctx.QueryInt("page") + if page <= 0 { + page = 1 + } + _, VersionListCount, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{ + ListOptions: models.ListOptions{ + Page: page, + PageSize: setting.UI.IssuePagingNum, + }, + RepoID: repo.ID, + Type: models.TypeCloudBrainTwo, + JobType: string(models.JobTypeTrain), + JobID: strconv.FormatInt(jobResult.JobID, 10), + }) + if err != nil { + ctx.ServerError("Cloudbrain", err) + return nil + } + versionName := "V0001" + err = models.SetVersionCountByJobID(strconv.FormatInt(jobResult.JobID, 10), versionName, VersionListCount) + if err != nil { + ctx.ServerError("UpdateJobVersionCount failed", err) + return nil + } + + return nil +} + func TransTrainJobStatus(status int) string { switch status { case 0: diff --git a/modules/modelarts/resty.go b/modules/modelarts/resty.go index d17478c94..c967c0eda 100755 --- a/modules/modelarts/resty.go +++ b/modules/modelarts/resty.go @@ -377,6 +377,52 @@ sendjob: return &result, nil } +func createTrainJobVersion(createJobVersionParams models.CreateTrainJobVersionParams, jobID string) (*models.CreateTrainJobResult, error) { + checkSetting() + client := getRestyClient() + var result models.CreateTrainJobResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetBody(createJobVersionParams). + SetResult(&result). + Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions") + + if err != nil { + return nil, fmt.Errorf("resty create train-job version: %s", err) + } + + req, _ := json.Marshal(createJobVersionParams) + log.Info("%s", req) + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("createTrainJobVersion failed(%s): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("createTrainJobVersion failed(%s): %s", result.ErrorCode, result.ErrorMsg) + } + + return &result, nil +} + func GetResourceSpecs() (*models.GetResourceSpecsResult, error) { checkSetting() client := getRestyClient() diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index cc2c75485..5f0b8c4f9 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -40,6 +40,7 @@ const ( tplModelArtsTrainJobNew base.TplName = "repo/modelarts/trainjob/new" tplModelArtsTrainJobShow base.TplName = "repo/modelarts/trainjob/show" tplModelArtsTrainJobShowModels base.TplName = "repo/modelarts/trainjob/models/index" + tplModelArtsTrainJobVersionNew base.TplName = "repo/modelarts/trainjob/version_new" ) // MustEnableDataset check if repository enable internal cb @@ -286,8 +287,8 @@ func NotebookIndex(ctx *context.Context) { Page: page, PageSize: setting.UI.IssuePagingNum, }, - RepoID: repo.ID, - Type: models.TypeCloudBrainTwo, + RepoID: repo.ID, + Type: models.TypeCloudBrainTwo, JobType: string(models.JobTypeDebug), }) if err != nil { @@ -493,14 +494,6 @@ func NotebookDel(ctx *context.Context) { func TrainJobIndex(ctx *context.Context) { MustEnableModelArts(ctx) - //can, err := canUserCreateTrainJob(ctx.User.ID) - //if err != nil { - // ctx.ServerError("canUserCreateTrainJob", err) - // return - //} - // - //ctx.Data["CanCreate"] = can - repo := ctx.Repo.Repository page := ctx.QueryInt("page") if page <= 0 { @@ -512,9 +505,10 @@ func TrainJobIndex(ctx *context.Context) { Page: page, PageSize: setting.UI.IssuePagingNum, }, - RepoID: repo.ID, - Type: models.TypeCloudBrainTwo, - JobType: string(models.JobTypeTrain), + RepoID: repo.ID, + Type: models.TypeCloudBrainTwo, + JobType: string(models.JobTypeTrain), + VersionName: string(models.JobVersionName), }) if err != nil { ctx.ServerError("Cloudbrain", err) @@ -614,6 +608,82 @@ func trainJobNewDataPrepare(ctx *context.Context) error { return nil } +func TrainJobNewVersion(ctx *context.Context) { + err := trainJobNewVersionDataPrepare(ctx) + if err != nil { + ctx.ServerError("get new train-job info failed", err) + return + } + ctx.HTML(200, tplModelArtsTrainJobVersionNew) +} + +func trainJobNewVersionDataPrepare(ctx *context.Context) error { + ctx.Data["PageIsCloudBrain"] = true + var jobID = ctx.Params(":jobid") + var versionName = ctx.Query("versionName") + + t := time.Now() + var jobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:] + ctx.Data["job_name"] = jobName + + attachs, err := models.GetModelArtsUserAttachments(ctx.User.ID) + if err != nil { + ctx.ServerError("GetAllUserAttachments failed:", err) + return err + } + ctx.Data["attachments"] = attachs + + var resourcePools modelarts.ResourcePool + if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil { + ctx.ServerError("json.Unmarshal failed:", err) + return err + } + ctx.Data["resource_pools"] = resourcePools.Info + + var engines modelarts.Engine + if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil { + ctx.ServerError("json.Unmarshal failed:", err) + return err + } + ctx.Data["engines"] = engines.Info + + var versionInfos modelarts.VersionInfo + if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil { + ctx.ServerError("json.Unmarshal failed:", err) + return err + } + ctx.Data["engine_versions"] = versionInfos.Version + + var flavorInfos modelarts.Flavor + if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil { + ctx.ServerError("json.Unmarshal failed:", err) + return err + } + ctx.Data["flavor_infos"] = flavorInfos.Info + + outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + ctx.Data["train_url"] = outputObsPath + + Branches, err := ctx.Repo.GitRepo.GetBranches() + if err != nil { + ctx.ServerError("GetBranches error:", err) + return err + } + ctx.Data["Branches"] = Branches + ctx.Data["BranchesCount"] = len(Branches) + ctx.Data["jobID"] = jobID + ctx.Data["versionName"] = versionName + + configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom) + if err != nil { + ctx.ServerError("getConfigList failed:", err) + return err + } + ctx.Data["config_list"] = configList.ParaConfigs + + return nil +} + func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) { ctx.Data["PageIsTrainJob"] = true jobName := form.JobName @@ -634,19 +704,6 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/" branch_name := form.BranchName - //can, err := canUserCreateTrainJob(ctx.User.ID) - //if err != nil { - // ctx.ServerError("canUserCreateTrainJob", err) - // return - //} - // - //if !can { - // log.Error("the user can not create train-job") - // ctx.RenderWithErr("the user can not create train-job", tplModelArtsTrainJobNew, &form) - // return - //} - - //param check if err := paramCheckCreateTrainJob(form); err != nil { log.Error("paramCheckCreateTrainJob failed:(%v)", err) trainJobNewDataPrepare(ctx) @@ -665,9 +722,6 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) if err == nil { os.RemoveAll(codeLocalPath) } - // branch_name := "testbranch" - // gitRepo, _ := git.OpenRepository(repo.RepoPath()) - // commitID, _ := gitRepo.GetBranchCommitID(branch_name) if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{ Branch: branch_name, @@ -786,7 +840,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) LogUrl: logObsPath, PoolID: poolID, Uuid: uuid, - Parameters: param, + Parameters: parameters.Parameter, } err = modelarts.GenerateTrainJob(ctx, req) @@ -797,12 +851,220 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) ctx.Data["uuid"] = form.Attachment ctx.Data["datasetName"] = attach.Name ctx.Data["params"] = form.Params + ctx.Data["branch_name"] = branch_name ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) return } ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") } +func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) { + ctx.Data["PageIsTrainJob"] = true + var jobID = ctx.Params(":jobid") + var versionName = ctx.Query("versionName") + jobName := form.JobName + uuid := form.Attachment + description := form.Description + workServerNumber := form.WorkServerNumber + engineID := form.EngineID + bootFile := form.BootFile + flavorCode := form.Flavor + params := form.Params + poolID := form.PoolID + isSaveParam := form.IsSaveParam + repo := ctx.Repo.Repository + codeLocalPath := setting.JobPath + jobName + modelarts.CodePath + codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath + outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/" + branch_name := form.BranchName + + if err := paramCheckCreateTrainJob(form); err != nil { + log.Error("paramCheckCreateTrainJob failed:(%v)", err) + trainJobNewVersionDataPrepare(ctx) + ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form) + return + } + + attach, err := models.GetAttachmentByUUID(uuid) + if err != nil { + log.Error("GetAttachmentByUUID(%s) failed:%v", uuid, err.Error()) + return + } + + //todo: del the codeLocalPath + _, err = ioutil.ReadDir(codeLocalPath) + if err == nil { + os.RemoveAll(codeLocalPath) + } + + if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{ + Branch: branch_name, + }); err != nil { + log.Error("创建任务失败,任务名称已存在!: %s (%v)", repo.FullName(), err) + trainJobNewVersionDataPrepare(ctx) + + ctx.Data["bootFile"] = form.BootFile + ctx.Data["uuid"] = form.Attachment + ctx.Data["datasetName"] = attach.Name + ctx.Data["params"] = form.Params + ctx.Data["branch_name"] = branch_name + // ctx.RenderWithErr("Failed to clone repository", tplModelArtsTrainJobNew, &form) + ctx.RenderWithErr("创建任务失败,任务名称已存在!", tplModelArtsTrainJobVersionNew, &form) + // ctx.RenderWithErr(err, tplModelArtsTrainJobNew, &form) + return + } + + //todo: upload code (send to file_server todo this work?) + if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil { + log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err) + trainJobNewVersionDataPrepare(ctx) + ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobVersionNew, &form) + return + } + + if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath); err != nil { + log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err) + trainJobNewVersionDataPrepare(ctx) + ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobVersionNew, &form) + return + } + + if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil { + log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err) + trainJobNewVersionDataPrepare(ctx) + ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobVersionNew, &form) + return + } + + //todo: del local code? + + var parameters models.Parameters + param := make([]models.Parameter, 0) + param = append(param, models.Parameter{ + Label: modelarts.TrainUrl, + Value: outputObsPath, + }, models.Parameter{ + Label: modelarts.DataUrl, + Value: dataPath, + }) + if len(params) != 0 { + err := json.Unmarshal([]byte(params), ¶meters) + if err != nil { + log.Error("Failed to Unmarshal params: %s (%v)", params, err) + trainJobNewVersionDataPrepare(ctx) + ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobVersionNew, &form) + return + } + + for _, parameter := range parameters.Parameter { + if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl { + param = append(param, models.Parameter{ + Label: parameter.Label, + Value: parameter.Value, + }) + } + } + } + + //save param config + if isSaveParam == "on" { + if form.ParameterTemplateName == "" { + log.Error("ParameterTemplateName is empty") + trainJobNewVersionDataPrepare(ctx) + ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobVersionNew, &form) + return + } + + _, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{ + ConfigName: form.ParameterTemplateName, + Description: form.PrameterDescription, + DataUrl: dataPath, + AppUrl: codeObsPath, + BootFileUrl: codeObsPath + bootFile, + TrainUrl: outputObsPath, + Flavor: models.Flavor{ + Code: flavorCode, + }, + WorkServerNum: workServerNumber, + EngineID: int64(engineID), + LogUrl: logObsPath, + PoolID: poolID, + Parameter: parameters.Parameter, + }) + + if err != nil { + log.Error("Failed to CreateTrainJobConfig: %v", err) + trainJobNewVersionDataPrepare(ctx) + ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobVersionNew, &form) + return + } + } + // JobVersionName := "V0001" + // PreVersionId := int64(67646) + task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName) + if err != nil { + log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error()) + ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form) + return + } + req := &modelarts.GenerateTrainJobVersionReq{ + JobName: task.JobName, + DataUrl: dataPath, + Description: description, + CodeObsPath: codeObsPath, + BootFile: codeObsPath + bootFile, + TrainUrl: outputObsPath, + FlavorCode: flavorCode, + WorkServerNumber: workServerNumber, + EngineID: int64(engineID), + LogUrl: logObsPath, + PoolID: poolID, + Uuid: uuid, + Parameters: parameters.Parameter, + PreVersionId: task.VersionID, + } + err = modelarts.GenerateTrainJobVersion(ctx, req, jobID) + if err != nil { + log.Error("GenerateTrainJob failed:%v", err.Error()) + trainJobNewVersionDataPrepare(ctx) + ctx.Data["bootFile"] = form.BootFile + ctx.Data["uuid"] = form.Attachment + ctx.Data["datasetName"] = attach.Name + ctx.Data["params"] = form.Params + ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form) + return + } + //保存openi创建训练任务界面的参数 + // err = models.CreateTrainjobConfigDetail(&models.TrainjobConfigDetail{ + + // JobName: req.JobName, + // ResourcePools: form.PoolID, + // EngineVersions: form.EngineID, + // FlavorInfos: form.Flavor, + // TrainUrl: outputObsPath, + // BootFile: form.BootFile, + // Uuid: form.Attachment, + // DatasetName: attach.Name, + // Params: form.Params, + // BranchName: branch_name, + // }) + + // if err != nil { + // log.Error("CreateTrainjobConfigDetail failed:%v", err.Error()) + // trainJobNewVersionDataPrepare(ctx) + // ctx.Data["bootFile"] = form.BootFile + // ctx.Data["uuid"] = form.Attachment + // ctx.Data["datasetName"] = attach.Name + // ctx.Data["params"] = form.Params + // ctx.Data["branch_name"] = branch_name + // ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) + // return + // } + ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") +} + // readDir reads the directory named by dirname and returns // a list of directory entries sorted by filename. func readDir(dirname string) ([]os.FileInfo, error) { @@ -895,6 +1157,27 @@ func TrainJobShow(ctx *context.Context) { var jobID = ctx.Params(":jobid") task, err := models.GetCloudbrainByJobID(jobID) + + repo := ctx.Repo.Repository + page := ctx.QueryInt("page") + if page <= 0 { + page = 1 + } + VersionListTasks, VersionListCount, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{ + ListOptions: models.ListOptions{ + Page: page, + PageSize: setting.UI.IssuePagingNum, + }, + RepoID: repo.ID, + Type: models.TypeCloudBrainTwo, + JobType: string(models.JobTypeTrain), + JobID: jobID, + }) + if err != nil { + ctx.ServerError("Cloudbrain", err) + return + } + if err != nil { log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error()) ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil) @@ -945,6 +1228,8 @@ func TrainJobShow(ctx *context.Context) { ctx.Data["task"] = task ctx.Data["jobID"] = jobID ctx.Data["result"] = result + ctx.Data["VersionListTasks"] = VersionListTasks + ctx.Data["VersionLisCount"] = VersionListCount ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow) } diff --git a/routers/routes/routes.go b/routers/routes/routes.go index 7e7d0642a..0a5464065 100755 --- a/routers/routes/routes.go +++ b/routers/routes/routes.go @@ -993,9 +993,12 @@ func RegisterRoutes(m *macaron.Macaron) { m.Get("/log", reqRepoCloudBrainReader, repo.TrainJobGetLog) m.Get("/models", reqRepoCloudBrainReader, repo.TrainJobShowModels) m.Get("/download_model", reqRepoCloudBrainReader, repo.TrainJobDownloadModel) + m.Get("/create_version", reqRepoCloudBrainReader, repo.TrainJobNewVersion) + m.Post("/create_version", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreateVersion) }) m.Get("/create", reqRepoCloudBrainReader, repo.TrainJobNew) m.Post("/create", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreate) + m.Get("/para-config-list", reqRepoCloudBrainReader, repo.TrainJobGetConfigList) }) }, context.RepoRef()) diff --git a/templates/repo/modelarts/trainjob/version_new.tmpl b/templates/repo/modelarts/trainjob/version_new.tmpl new file mode 100644 index 000000000..e69de29bb From 4e2ec3ebd9a25647f77553730fc251d918bc5e62 Mon Sep 17 00:00:00 2001 From: liuzx Date: Tue, 9 Nov 2021 19:07:53 +0800 Subject: [PATCH 03/66] =?UTF-8?q?=E8=AE=AD=E7=BB=83=E5=A4=9A=E7=89=88?= =?UTF-8?q?=E6=9C=AC=E4=BF=AE=E6=94=B9=E8=BF=94=E5=9B=9E=E7=9A=84=E8=AE=AD?= =?UTF-8?q?=E7=BB=83=E5=88=97=E8=A1=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- models/cloudbrain.go | 40 +++++++++------ models/models.go | 1 + modules/modelarts/modelarts.go | 75 +++++++++++++++++++--------- routers/repo/modelarts.go | 89 +++++++++++++++++++--------------- routers/routes/routes.go | 3 ++ 5 files changed, 131 insertions(+), 77 deletions(-) diff --git a/models/cloudbrain.go b/models/cloudbrain.go index 0df6c2145..81c3df2af 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -69,11 +69,13 @@ type Cloudbrain struct { CanDel bool `xorm:"-"` Type int `xorm:"INDEX DEFAULT 0"` - VersionID int64 `xorm:"INDEX DEFAULT 0"` - VersionName string - Uuid string - DatasetName string - VersionCount int64 `xorm:"INDEX DEFAULT 1"` + VersionID int64 `xorm:"INDEX DEFAULT 0"` + VersionName string + Uuid string + DatasetName string + VersionCount int64 `xorm:"INDEX DEFAULT 1"` + IsLatestVersion string + CommitID string User *User `xorm:"-"` Repo *Repository `xorm:"-"` @@ -89,11 +91,11 @@ type TrainjobConfigDetail struct { BootFile string `xorm:"INDEX"` Uuid string `xorm:"INDEX"` DatasetName string `xorm:"INDEX"` - Params string `xorm:"deleted"` + Params string `xorm:"INDEX"` BranchName string `xorm:"INDEX"` - // User *User `xorm:"-"` - // Repo *Repository `xorm:"-"` + User *User `xorm:"-"` + Repo *Repository `xorm:"-"` } type CloudbrainInfo struct { @@ -173,9 +175,10 @@ type CloudbrainsOptions struct { SortType string CloudbrainIDs []int64 // JobStatus CloudbrainStatus - Type int - JobType string - VersionName string + Type int + JobType string + VersionName string + IsLatestVersion string } type TaskPod struct { @@ -903,9 +906,9 @@ func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { ) } - if (opts.VersionName) != "" { + if (opts.IsLatestVersion) != "" { cond = cond.And( - builder.Eq{"cloudbrain.version_name": opts.VersionName}, + builder.Eq{"cloudbrain.is_latest_version": opts.IsLatestVersion}, ) } @@ -1056,6 +1059,11 @@ func GetCloudbrainByJobIDAndVersionName(jobID string, versionName string) (*Clou return getRepoCloudBrain(cb) } +func GetCloudbrainByJobIDAndIsLatestVersion(jobID string, isLatestVersion string) (*Cloudbrain, error) { + cb := &Cloudbrain{JobID: jobID, IsLatestVersion: isLatestVersion} + return getRepoCloudBrain(cb) +} + func GetCloudbrainsNeededStopByUserID(userID int64) ([]*Cloudbrain, error) { cloudBrains := make([]*Cloudbrain, 0) err := x.Cols("job_id", "status", "type").Where("user_id=? AND status !=?", userID, string(JobStopped)).Find(&cloudBrains) @@ -1080,9 +1088,9 @@ func SetTrainJobStatusByJobID(jobID string, status string, duration int64, train return } -func SetVersionCountByJobID(jobID string, versionName string, versionCount int64) (err error) { - cb := &Cloudbrain{JobID: jobID, VersionName: versionName, VersionCount: versionCount} - _, err = x.Cols("version_Count").Where("cloudbrain.job_id=? AND cloudbrain.version_name=?", jobID, versionName).Update(cb) +func SetVersionCountAndLatestVersionByJobIDAndVersionName(jobID string, versionName string, versionCount int64, isLatestVersion string) (err error) { + cb := &Cloudbrain{JobID: jobID, VersionName: versionName, VersionCount: versionCount, IsLatestVersion: isLatestVersion} + _, err = x.Cols("version_Count", "is_latest_version").Where("cloudbrain.job_id=? AND cloudbrain.version_name=?", jobID, versionName).Update(cb) return } diff --git a/models/models.go b/models/models.go index 696d0949b..7ec021223 100755 --- a/models/models.go +++ b/models/models.go @@ -133,6 +133,7 @@ func init() { new(FileChunk), new(BlockChain), new(RecommendOrg), + new(TrainjobConfigDetail), ) tablesStatistic = append(tablesStatistic, diff --git a/modules/modelarts/modelarts.go b/modules/modelarts/modelarts.go index 4bff6a347..fcf1e8829 100755 --- a/modules/modelarts/modelarts.go +++ b/modules/modelarts/modelarts.go @@ -35,16 +35,18 @@ const ( // "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," + // "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" + // "]}" - CodePath = "/code/" - OutputPath = "/output/" - LogPath = "/log/" - JobPath = "/job/" - OrderDesc = "desc" //向下查询 - OrderAsc = "asc" //向上查询 - Lines = 20 - TrainUrl = "train_url" - DataUrl = "data_url" - PerPage = 10 + CodePath = "/code/" + OutputPath = "/output/" + LogPath = "/log/" + JobPath = "/job/" + OrderDesc = "desc" //向下查询 + OrderAsc = "asc" //向上查询 + Lines = 20 + TrainUrl = "train_url" + DataUrl = "data_url" + PerPage = 10 + IsLatestVersion = "1" + NotLatestVersion = "0" SortByCreateTime = "create_time" ConfigTypeCustom = "custom" @@ -69,6 +71,8 @@ type GenerateTrainJobReq struct { WorkServerNumber int EngineID int64 Parameters []models.Parameter + CommitID string + IsLatestVersion string } type GenerateTrainJobVersionReq struct { @@ -86,6 +90,7 @@ type GenerateTrainJobVersionReq struct { EngineID int64 Parameters []models.Parameter PreVersionId int64 + CommitID string } type VersionInfo struct { @@ -219,17 +224,19 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error { } err = models.CreateCloudbrain(&models.Cloudbrain{ - Status: TransTrainJobStatus(jobResult.Status), - UserID: ctx.User.ID, - RepoID: ctx.Repo.Repository.ID, - JobID: strconv.FormatInt(jobResult.JobID, 10), - JobName: req.JobName, - JobType: string(models.JobTypeTrain), - Type: models.TypeCloudBrainTwo, - VersionID: jobResult.VersionID, - VersionName: jobResult.VersionName, - Uuid: req.Uuid, - DatasetName: attach.Name, + Status: TransTrainJobStatus(jobResult.Status), + UserID: ctx.User.ID, + RepoID: ctx.Repo.Repository.ID, + JobID: strconv.FormatInt(jobResult.JobID, 10), + JobName: req.JobName, + JobType: string(models.JobTypeTrain), + Type: models.TypeCloudBrainTwo, + VersionID: jobResult.VersionID, + VersionName: jobResult.VersionName, + Uuid: req.Uuid, + DatasetName: attach.Name, + CommitID: req.CommitID, + IsLatestVersion: req.IsLatestVersion, }) if err != nil { @@ -282,6 +289,7 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobVersionR VersionName: jobResult.VersionName, Uuid: req.Uuid, DatasetName: attach.Name, + CommitID: req.CommitID, }) if err != nil { log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error()) @@ -307,8 +315,29 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobVersionR ctx.ServerError("Cloudbrain", err) return nil } - versionName := "V0001" - err = models.SetVersionCountByJobID(strconv.FormatInt(jobResult.JobID, 10), versionName, VersionListCount) + + //将训练任务的上一版本的isLatestVersion设置为"0" + latestTask, err := models.GetCloudbrainByJobIDAndIsLatestVersion(strconv.FormatInt(jobResult.JobID, 10), IsLatestVersion) + if err != nil { + ctx.ServerError("GetCloudbrainByJobIDAndIsLatestVersion faild:", err) + return nil + } + + // lastVersionNum := jobResult.VersionName[1:] + // lastVersionNumToInt64, err := strconv.ParseInt(lastVersionNum, 10, 64) + // if err != nil { + // ctx.ServerError("lastVersionNumToInt64 faild:", err) + // return nil + // } + // lastVersionName := "V" + strconv.FormatInt(lastVersionNumToInt64-1, 10) + err = models.SetVersionCountAndLatestVersionByJobIDAndVersionName(strconv.FormatInt(jobResult.JobID, 10), latestTask.VersionName, VersionListCount, NotLatestVersion) + if err != nil { + ctx.ServerError("UpdateJobVersionCount failed", err) + return nil + } + + //将当前版本的isLatestVersion和任务数量更新 + err = models.SetVersionCountAndLatestVersionByJobIDAndVersionName(strconv.FormatInt(jobResult.JobID, 10), jobResult.VersionName, VersionListCount, IsLatestVersion) if err != nil { ctx.ServerError("UpdateJobVersionCount failed", err) return nil diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index 5f0b8c4f9..71efd233a 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -505,10 +505,10 @@ func TrainJobIndex(ctx *context.Context) { Page: page, PageSize: setting.UI.IssuePagingNum, }, - RepoID: repo.ID, - Type: models.TypeCloudBrainTwo, - JobType: string(models.JobTypeTrain), - VersionName: string(models.JobVersionName), + RepoID: repo.ID, + Type: models.TypeCloudBrainTwo, + JobType: string(models.JobTypeTrain), + IsLatestVersion: modelarts.IsLatestVersion, }) if err != nil { ctx.ServerError("Cloudbrain", err) @@ -614,13 +614,14 @@ func TrainJobNewVersion(ctx *context.Context) { ctx.ServerError("get new train-job info failed", err) return } - ctx.HTML(200, tplModelArtsTrainJobVersionNew) + ctx.HTML(200, tplModelArtsTrainJobNew) } func trainJobNewVersionDataPrepare(ctx *context.Context) error { ctx.Data["PageIsCloudBrain"] = true var jobID = ctx.Params(":jobid") var versionName = ctx.Query("versionName") + jobID = "19373" t := time.Now() var jobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:] @@ -703,6 +704,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/" branch_name := form.BranchName + isLatestVersion := modelarts.IsLatestVersion if err := paramCheckCreateTrainJob(form); err != nil { log.Error("paramCheckCreateTrainJob failed:(%v)", err) @@ -723,6 +725,9 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) os.RemoveAll(codeLocalPath) } + gitRepo, _ := git.OpenRepository(repo.RepoPath()) + commitID, _ := gitRepo.GetBranchCommitID(branch_name) + if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{ Branch: branch_name, }); err != nil { @@ -841,6 +846,8 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) PoolID: poolID, Uuid: uuid, Parameters: parameters.Parameter, + CommitID: commitID, + IsLatestVersion: isLatestVersion, } err = modelarts.GenerateTrainJob(ctx, req) @@ -862,6 +869,9 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ ctx.Data["PageIsTrainJob"] = true var jobID = ctx.Params(":jobid") var versionName = ctx.Query("versionName") + jobID = "19373" + versionName = "V0009" + jobName := form.JobName uuid := form.Attachment description := form.Description @@ -883,7 +893,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ if err := paramCheckCreateTrainJob(form); err != nil { log.Error("paramCheckCreateTrainJob failed:(%v)", err) trainJobNewVersionDataPrepare(ctx) - ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form) + ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) return } @@ -899,6 +909,8 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ os.RemoveAll(codeLocalPath) } + gitRepo, _ := git.OpenRepository(repo.RepoPath()) + commitID, _ := gitRepo.GetBranchCommitID(branch_name) if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{ Branch: branch_name, }); err != nil { @@ -911,7 +923,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ ctx.Data["params"] = form.Params ctx.Data["branch_name"] = branch_name // ctx.RenderWithErr("Failed to clone repository", tplModelArtsTrainJobNew, &form) - ctx.RenderWithErr("创建任务失败,任务名称已存在!", tplModelArtsTrainJobVersionNew, &form) + ctx.RenderWithErr("创建任务失败,任务名称已存在!", tplModelArtsTrainJobNew, &form) // ctx.RenderWithErr(err, tplModelArtsTrainJobNew, &form) return } @@ -920,21 +932,21 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil { log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err) trainJobNewVersionDataPrepare(ctx) - ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobVersionNew, &form) + ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobNew, &form) return } if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath); err != nil { log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err) trainJobNewVersionDataPrepare(ctx) - ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobVersionNew, &form) + ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobNew, &form) return } if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil { log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err) trainJobNewVersionDataPrepare(ctx) - ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobVersionNew, &form) + ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobNew, &form) return } @@ -954,7 +966,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ if err != nil { log.Error("Failed to Unmarshal params: %s (%v)", params, err) trainJobNewVersionDataPrepare(ctx) - ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobVersionNew, &form) + ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobNew, &form) return } @@ -973,7 +985,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ if form.ParameterTemplateName == "" { log.Error("ParameterTemplateName is empty") trainJobNewVersionDataPrepare(ctx) - ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobVersionNew, &form) + ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobNew, &form) return } @@ -997,7 +1009,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ if err != nil { log.Error("Failed to CreateTrainJobConfig: %v", err) trainJobNewVersionDataPrepare(ctx) - ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobVersionNew, &form) + ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobNew, &form) return } } @@ -1006,7 +1018,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName) if err != nil { log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error()) - ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form) + ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) return } req := &modelarts.GenerateTrainJobVersionReq{ @@ -1024,6 +1036,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ Uuid: uuid, Parameters: parameters.Parameter, PreVersionId: task.VersionID, + CommitID: commitID, } err = modelarts.GenerateTrainJobVersion(ctx, req, jobID) if err != nil { @@ -1036,32 +1049,32 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form) return } - //保存openi创建训练任务界面的参数 - // err = models.CreateTrainjobConfigDetail(&models.TrainjobConfigDetail{ + // 保存openi创建训练任务界面的参数 + err = models.CreateTrainjobConfigDetail(&models.TrainjobConfigDetail{ - // JobName: req.JobName, - // ResourcePools: form.PoolID, - // EngineVersions: form.EngineID, - // FlavorInfos: form.Flavor, - // TrainUrl: outputObsPath, - // BootFile: form.BootFile, - // Uuid: form.Attachment, - // DatasetName: attach.Name, - // Params: form.Params, - // BranchName: branch_name, - // }) + JobName: req.JobName, + ResourcePools: form.PoolID, + EngineVersions: form.EngineID, + FlavorInfos: form.Flavor, + TrainUrl: outputObsPath, + BootFile: form.BootFile, + Uuid: form.Attachment, + DatasetName: attach.Name, + Params: form.Params, + BranchName: branch_name, + }) - // if err != nil { - // log.Error("CreateTrainjobConfigDetail failed:%v", err.Error()) - // trainJobNewVersionDataPrepare(ctx) - // ctx.Data["bootFile"] = form.BootFile - // ctx.Data["uuid"] = form.Attachment - // ctx.Data["datasetName"] = attach.Name - // ctx.Data["params"] = form.Params - // ctx.Data["branch_name"] = branch_name - // ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) - // return - // } + if err != nil { + log.Error("CreateTrainjobConfigDetail failed:%v", err.Error()) + trainJobNewVersionDataPrepare(ctx) + ctx.Data["bootFile"] = form.BootFile + ctx.Data["uuid"] = form.Attachment + ctx.Data["datasetName"] = attach.Name + ctx.Data["params"] = form.Params + ctx.Data["branch_name"] = branch_name + ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) + return + } ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") } diff --git a/routers/routes/routes.go b/routers/routes/routes.go index 0a5464065..37807cf31 100755 --- a/routers/routes/routes.go +++ b/routers/routes/routes.go @@ -999,6 +999,9 @@ func RegisterRoutes(m *macaron.Macaron) { m.Get("/create", reqRepoCloudBrainReader, repo.TrainJobNew) m.Post("/create", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreate) + // m.Get("/create", reqRepoCloudBrainReader, repo.TrainJobNewVersion) + // m.Post("/create", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreateVersion) + m.Get("/para-config-list", reqRepoCloudBrainReader, repo.TrainJobGetConfigList) }) }, context.RepoRef()) From 2ba6457e82e3930e7804da34ad6baeb339aa114e Mon Sep 17 00:00:00 2001 From: liuzx Date: Tue, 9 Nov 2021 19:20:22 +0800 Subject: [PATCH 04/66] =?UTF-8?q?=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- routers/repo/modelarts.go | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index 71efd233a..be6b635bc 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -614,7 +614,7 @@ func TrainJobNewVersion(ctx *context.Context) { ctx.ServerError("get new train-job info failed", err) return } - ctx.HTML(200, tplModelArtsTrainJobNew) + ctx.HTML(200, tplModelArtsTrainJobVersionNew) } func trainJobNewVersionDataPrepare(ctx *context.Context) error { @@ -893,7 +893,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ if err := paramCheckCreateTrainJob(form); err != nil { log.Error("paramCheckCreateTrainJob failed:(%v)", err) trainJobNewVersionDataPrepare(ctx) - ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) + ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form) return } @@ -923,7 +923,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ ctx.Data["params"] = form.Params ctx.Data["branch_name"] = branch_name // ctx.RenderWithErr("Failed to clone repository", tplModelArtsTrainJobNew, &form) - ctx.RenderWithErr("创建任务失败,任务名称已存在!", tplModelArtsTrainJobNew, &form) + ctx.RenderWithErr("创建任务失败,任务名称已存在!", tplModelArtsTrainJobVersionNew, &form) // ctx.RenderWithErr(err, tplModelArtsTrainJobNew, &form) return } @@ -932,21 +932,21 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil { log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err) trainJobNewVersionDataPrepare(ctx) - ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobNew, &form) + ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobVersionNew, &form) return } if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath); err != nil { log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err) trainJobNewVersionDataPrepare(ctx) - ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobNew, &form) + ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobVersionNew, &form) return } if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil { log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err) trainJobNewVersionDataPrepare(ctx) - ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobNew, &form) + ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobVersionNew, &form) return } @@ -966,7 +966,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ if err != nil { log.Error("Failed to Unmarshal params: %s (%v)", params, err) trainJobNewVersionDataPrepare(ctx) - ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobNew, &form) + ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobVersionNew, &form) return } @@ -985,7 +985,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ if form.ParameterTemplateName == "" { log.Error("ParameterTemplateName is empty") trainJobNewVersionDataPrepare(ctx) - ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobNew, &form) + ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobVersionNew, &form) return } @@ -1009,7 +1009,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ if err != nil { log.Error("Failed to CreateTrainJobConfig: %v", err) trainJobNewVersionDataPrepare(ctx) - ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobNew, &form) + ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobVersionNew, &form) return } } @@ -1018,7 +1018,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName) if err != nil { log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error()) - ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) + ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form) return } req := &modelarts.GenerateTrainJobVersionReq{ @@ -1072,7 +1072,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ ctx.Data["datasetName"] = attach.Name ctx.Data["params"] = form.Params ctx.Data["branch_name"] = branch_name - ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) + ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form) return } ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") From 00cd20d53ca869d6436ab465d72246a5008bc01c Mon Sep 17 00:00:00 2001 From: liuzx Date: Wed, 10 Nov 2021 09:57:33 +0800 Subject: [PATCH 05/66] =?UTF-8?q?=E8=AE=AD=E7=BB=83=E5=90=8E=E5=B0=86jobid?= =?UTF-8?q?=E5=92=8CversionName=E5=A2=9E=E5=8A=A0=E5=88=B0=E9=85=8D?= =?UTF-8?q?=E7=BD=AE=E8=A1=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- models/cloudbrain.go | 2 ++ modules/modelarts/modelarts.go | 28 ++++++++++++------------- routers/repo/modelarts.go | 38 ++++++++++++++++++++++++++++++---- routers/routes/routes.go | 10 --------- 4 files changed, 50 insertions(+), 28 deletions(-) diff --git a/models/cloudbrain.go b/models/cloudbrain.go index 81c3df2af..90b2433ad 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -83,6 +83,7 @@ type Cloudbrain struct { type TrainjobConfigDetail struct { ID int64 `xorm:"pk autoincr"` + JobID string `xorm:"INDEX"` JobName string `xorm:"INDEX"` ResourcePools string `xorm:"INDEX"` EngineVersions int `xorm:"INDEX"` @@ -93,6 +94,7 @@ type TrainjobConfigDetail struct { DatasetName string `xorm:"INDEX"` Params string `xorm:"INDEX"` BranchName string `xorm:"INDEX"` + VersionName string `xorm:"INDEX"` User *User `xorm:"-"` Repo *Repository `xorm:"-"` diff --git a/modules/modelarts/modelarts.go b/modules/modelarts/modelarts.go index fcf1e8829..f75bf571b 100755 --- a/modules/modelarts/modelarts.go +++ b/modules/modelarts/modelarts.go @@ -192,7 +192,7 @@ func GenerateTask(ctx *context.Context, jobName, uuid, description string) error return nil } -func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error { +func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (jobresult *models.CreateTrainJobResult, err error) { jobResult, err := createTrainJob(models.CreateTrainJobParams{ JobName: req.JobName, Description: req.Description, @@ -214,13 +214,13 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error { }) if err != nil { log.Error("CreateJob failed: %v", err.Error()) - return err + return nil, err } attach, err := models.GetAttachmentByUUID(req.Uuid) if err != nil { log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error()) - return nil + return nil, err } err = models.CreateCloudbrain(&models.Cloudbrain{ @@ -241,13 +241,13 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error { if err != nil { log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error()) - return err + return nil, err } - return nil + return jobResult, nil } -func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobVersionReq, jobId string) error { +func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobVersionReq, jobId string) (jobresult *models.CreateTrainJobResult, err error) { jobResult, err := createTrainJobVersion(models.CreateTrainJobVersionParams{ Description: req.Description, Config: models.TrainJobVersionConfig{ @@ -268,13 +268,13 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobVersionR }, jobId) if err != nil { log.Error("CreateJob failed: %v", err.Error()) - return err + return nil, err } attach, err := models.GetAttachmentByUUID(req.Uuid) if err != nil { log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error()) - return nil + return nil, err } err = models.CreateCloudbrain(&models.Cloudbrain{ @@ -293,7 +293,7 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobVersionR }) if err != nil { log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error()) - return err + return nil, err } repo := ctx.Repo.Repository @@ -313,14 +313,14 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobVersionR }) if err != nil { ctx.ServerError("Cloudbrain", err) - return nil + return nil, err } //将训练任务的上一版本的isLatestVersion设置为"0" latestTask, err := models.GetCloudbrainByJobIDAndIsLatestVersion(strconv.FormatInt(jobResult.JobID, 10), IsLatestVersion) if err != nil { ctx.ServerError("GetCloudbrainByJobIDAndIsLatestVersion faild:", err) - return nil + return nil, err } // lastVersionNum := jobResult.VersionName[1:] @@ -333,17 +333,17 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobVersionR err = models.SetVersionCountAndLatestVersionByJobIDAndVersionName(strconv.FormatInt(jobResult.JobID, 10), latestTask.VersionName, VersionListCount, NotLatestVersion) if err != nil { ctx.ServerError("UpdateJobVersionCount failed", err) - return nil + return nil, err } //将当前版本的isLatestVersion和任务数量更新 err = models.SetVersionCountAndLatestVersionByJobIDAndVersionName(strconv.FormatInt(jobResult.JobID, 10), jobResult.VersionName, VersionListCount, IsLatestVersion) if err != nil { ctx.ServerError("UpdateJobVersionCount failed", err) - return nil + return nil, err } - return nil + return jobResult, err } func TransTrainJobStatus(status int) string { diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index be6b635bc..28b66e59e 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -850,7 +850,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) IsLatestVersion: isLatestVersion, } - err = modelarts.GenerateTrainJob(ctx, req) + jobResult, err := modelarts.GenerateTrainJob(ctx, req) if err != nil { log.Error("GenerateTrainJob failed:%v", err.Error()) trainJobNewDataPrepare(ctx) @@ -862,6 +862,34 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) return } + // 保存openi创建训练任务界面的参数 + err = models.CreateTrainjobConfigDetail(&models.TrainjobConfigDetail{ + + JobName: req.JobName, + JobID: strconv.FormatInt(jobResult.JobID, 10), + VersionName: jobResult.VersionName, + ResourcePools: form.PoolID, + EngineVersions: form.EngineID, + FlavorInfos: form.Flavor, + TrainUrl: outputObsPath, + BootFile: form.BootFile, + Uuid: form.Attachment, + DatasetName: attach.Name, + Params: form.Params, + BranchName: branch_name, + }) + + if err != nil { + log.Error("CreateTrainjobConfigDetail failed:%v", err.Error()) + trainJobNewVersionDataPrepare(ctx) + ctx.Data["bootFile"] = form.BootFile + ctx.Data["uuid"] = form.Attachment + ctx.Data["datasetName"] = attach.Name + ctx.Data["params"] = form.Params + ctx.Data["branch_name"] = branch_name + ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form) + return + } ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") } @@ -869,8 +897,8 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ ctx.Data["PageIsTrainJob"] = true var jobID = ctx.Params(":jobid") var versionName = ctx.Query("versionName") - jobID = "19373" - versionName = "V0009" + // jobID = "19373" + // versionName = "V0009" jobName := form.JobName uuid := form.Attachment @@ -1038,7 +1066,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ PreVersionId: task.VersionID, CommitID: commitID, } - err = modelarts.GenerateTrainJobVersion(ctx, req, jobID) + jobResult, err := modelarts.GenerateTrainJobVersion(ctx, req, jobID) if err != nil { log.Error("GenerateTrainJob failed:%v", err.Error()) trainJobNewVersionDataPrepare(ctx) @@ -1053,6 +1081,8 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ err = models.CreateTrainjobConfigDetail(&models.TrainjobConfigDetail{ JobName: req.JobName, + JobID: strconv.FormatInt(jobResult.JobID, 10), + VersionName: jobResult.VersionName, ResourcePools: form.PoolID, EngineVersions: form.EngineID, FlavorInfos: form.Flavor, diff --git a/routers/routes/routes.go b/routers/routes/routes.go index 37807cf31..c3dde2274 100755 --- a/routers/routes/routes.go +++ b/routers/routes/routes.go @@ -962,16 +962,6 @@ func RegisterRoutes(m *macaron.Macaron) { }, context.RepoRef()) m.Group("/modelarts", func() { - // m.Get("", reqRepoCloudBrainReader, repo.ModelArtsIndex) - // m.Group("/:jobid", func() { - // m.Get("", reqRepoCloudBrainReader, repo.ModelArtsShow) - // m.Get("/debug", reqRepoCloudBrainReader, repo.ModelArtsDebug) - // m.Post("/stop", reqRepoCloudBrainWriter, repo.ModelArtsStop) - // m.Post("/del", reqRepoCloudBrainWriter, repo.ModelArtsDel) - // }) - // m.Get("/create", reqRepoCloudBrainWriter, repo.ModelArtsNew) - // m.Post("/create", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsForm{}), repo.ModelArtsCreate) - m.Group("/notebook", func() { m.Get("", reqRepoCloudBrainReader, repo.NotebookIndex) m.Group("/:jobid", func() { From e9a5545fe86440d8a78b5ee1df239f668dd73155 Mon Sep 17 00:00:00 2001 From: liuzx Date: Wed, 10 Nov 2021 15:09:37 +0800 Subject: [PATCH 06/66] update --- models/cloudbrain.go | 17 ++++++----- modules/auth/modelarts.go | 3 +- modules/modelarts/modelarts.go | 43 +++++++++++++++----------- routers/repo/modelarts.go | 56 +++++++++++++++++++++++++++++++--- routers/routes/routes.go | 2 ++ 5 files changed, 91 insertions(+), 30 deletions(-) diff --git a/models/cloudbrain.go b/models/cloudbrain.go index 90b2433ad..04c4dbac3 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -69,13 +69,16 @@ type Cloudbrain struct { CanDel bool `xorm:"-"` Type int `xorm:"INDEX DEFAULT 0"` - VersionID int64 `xorm:"INDEX DEFAULT 0"` - VersionName string - Uuid string - DatasetName string - VersionCount int64 `xorm:"INDEX DEFAULT 1"` - IsLatestVersion string - CommitID string + VersionID int64 `xorm:"INDEX DEFAULT 0"` + VersionName string + Uuid string + DatasetName string + VersionCount int64 `xorm:"INDEX DEFAULT 1"` + IsLatestVersion string + CommitID string + FatherVersionName string + ComputeResource string + EngineID int64 User *User `xorm:"-"` Repo *Repository `xorm:"-"` diff --git a/modules/auth/modelarts.go b/modules/auth/modelarts.go index 1eb214392..a53661b74 100755 --- a/modules/auth/modelarts.go +++ b/modules/auth/modelarts.go @@ -38,7 +38,8 @@ type CreateModelArtsTrainJobForm struct { IsSaveParam string `form:"is_save_para"` ParameterTemplateName string `form:"parameter_template_name"` PrameterDescription string `form:"parameter_description"` - BranchName string `form:"branch_name"` + BranchName string `form:"branch_name" binding:"Required"` + VersionName string `form:"version_name" binding:"Required"` } func (f *CreateModelArtsTrainJobForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors { diff --git a/modules/modelarts/modelarts.go b/modules/modelarts/modelarts.go index f75bf571b..9f7b67c06 100755 --- a/modules/modelarts/modelarts.go +++ b/modules/modelarts/modelarts.go @@ -47,6 +47,7 @@ const ( PerPage = 10 IsLatestVersion = "1" NotLatestVersion = "0" + ComputeResource = "NPU" SortByCreateTime = "create_time" ConfigTypeCustom = "custom" @@ -237,6 +238,8 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (jobresult DatasetName: attach.Name, CommitID: req.CommitID, IsLatestVersion: req.IsLatestVersion, + ComputeResource: ComputeResource, + EngineID: req.EngineID, }) if err != nil { @@ -247,7 +250,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (jobresult return jobResult, nil } -func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobVersionReq, jobId string) (jobresult *models.CreateTrainJobResult, err error) { +func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobVersionReq, jobId string, fatherVersionName string) (jobresult *models.CreateTrainJobResult, err error) { jobResult, err := createTrainJobVersion(models.CreateTrainJobVersionParams{ Description: req.Description, Config: models.TrainJobVersionConfig{ @@ -278,18 +281,21 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobVersionR } err = models.CreateCloudbrain(&models.Cloudbrain{ - Status: TransTrainJobStatus(jobResult.Status), - UserID: ctx.User.ID, - RepoID: ctx.Repo.Repository.ID, - JobID: strconv.FormatInt(jobResult.JobID, 10), - JobName: req.JobName, - JobType: string(models.JobTypeTrain), - Type: models.TypeCloudBrainTwo, - VersionID: jobResult.VersionID, - VersionName: jobResult.VersionName, - Uuid: req.Uuid, - DatasetName: attach.Name, - CommitID: req.CommitID, + Status: TransTrainJobStatus(jobResult.Status), + UserID: ctx.User.ID, + RepoID: ctx.Repo.Repository.ID, + JobID: strconv.FormatInt(jobResult.JobID, 10), + JobName: req.JobName, + JobType: string(models.JobTypeTrain), + Type: models.TypeCloudBrainTwo, + VersionID: jobResult.VersionID, + VersionName: jobResult.VersionName, + Uuid: req.Uuid, + DatasetName: attach.Name, + CommitID: req.CommitID, + FatherVersionName: fatherVersionName, + ComputeResource: ComputeResource, + EngineID: req.EngineID, }) if err != nil { log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error()) @@ -322,6 +328,11 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobVersionR ctx.ServerError("GetCloudbrainByJobIDAndIsLatestVersion faild:", err) return nil, err } + err = models.SetVersionCountAndLatestVersionByJobIDAndVersionName(strconv.FormatInt(jobResult.JobID, 10), latestTask.VersionName, VersionListCount, NotLatestVersion) + if err != nil { + ctx.ServerError("UpdateJobVersionCount failed", err) + return nil, err + } // lastVersionNum := jobResult.VersionName[1:] // lastVersionNumToInt64, err := strconv.ParseInt(lastVersionNum, 10, 64) @@ -330,11 +341,7 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobVersionR // return nil // } // lastVersionName := "V" + strconv.FormatInt(lastVersionNumToInt64-1, 10) - err = models.SetVersionCountAndLatestVersionByJobIDAndVersionName(strconv.FormatInt(jobResult.JobID, 10), latestTask.VersionName, VersionListCount, NotLatestVersion) - if err != nil { - ctx.ServerError("UpdateJobVersionCount failed", err) - return nil, err - } + //将训练任务的本版本的isLatestVersion设置为"0" //将当前版本的isLatestVersion和任务数量更新 err = models.SetVersionCountAndLatestVersionByJobIDAndVersionName(strconv.FormatInt(jobResult.JobID, 10), jobResult.VersionName, VersionListCount, IsLatestVersion) diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index 28b66e59e..677c9530b 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -896,7 +896,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) { ctx.Data["PageIsTrainJob"] = true var jobID = ctx.Params(":jobid") - var versionName = ctx.Query("versionName") + // var fatherVersionName = ctx.Query("versionName") // jobID = "19373" // versionName = "V0009" @@ -917,6 +917,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/" branch_name := form.BranchName + fatherVersionName := form.VersionName if err := paramCheckCreateTrainJob(form); err != nil { log.Error("paramCheckCreateTrainJob failed:(%v)", err) @@ -1043,7 +1044,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ } // JobVersionName := "V0001" // PreVersionId := int64(67646) - task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName) + task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, fatherVersionName) if err != nil { log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error()) ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form) @@ -1066,7 +1067,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ PreVersionId: task.VersionID, CommitID: commitID, } - jobResult, err := modelarts.GenerateTrainJobVersion(ctx, req, jobID) + jobResult, err := modelarts.GenerateTrainJobVersion(ctx, req, jobID, fatherVersionName) if err != nil { log.Error("GenerateTrainJob failed:%v", err.Error()) trainJobNewVersionDataPrepare(ctx) @@ -1105,7 +1106,8 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form) return } - ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") + // ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") + ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow) } // readDir reads the directory named by dirname and returns @@ -1383,6 +1385,52 @@ func TrainJobStop(ctx *context.Context) { ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") } +func TrainJobVersionDel(ctx *context.Context) { + var jobID = ctx.Params(":jobid") + var versionName = ctx.Params(":versionName") + task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName) + if err != nil { + log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error()) + ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil) + return + } + + _, err = modelarts.DelTrainJob(jobID) + if err != nil { + log.Error("DelTrainJob(%s) failed:%v", task.JobName, err.Error()) + ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil) + return + } + + err = models.DeleteJob(task) + if err != nil { + ctx.ServerError("DeleteJob failed", err) + return + } + + ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") +} + +func TrainJobVersionStop(ctx *context.Context) { + var jobID = ctx.Params(":jobid") + var versionName = ctx.Params(":versionName") + task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName) + if err != nil { + log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error()) + ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil) + return + } + + _, err = modelarts.StopTrainJob(jobID, strconv.FormatInt(task.VersionID, 10)) + if err != nil { + log.Error("StopTrainJob(%s) failed:%v", task.JobName, err.Error()) + ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil) + return + } + + ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") +} + func canUserCreateTrainJob(uid int64) (bool, error) { org, err := models.GetOrgByName(setting.AllowedOrg) if err != nil { diff --git a/routers/routes/routes.go b/routers/routes/routes.go index c3dde2274..c1702ebe1 100755 --- a/routers/routes/routes.go +++ b/routers/routes/routes.go @@ -985,6 +985,8 @@ func RegisterRoutes(m *macaron.Macaron) { m.Get("/download_model", reqRepoCloudBrainReader, repo.TrainJobDownloadModel) m.Get("/create_version", reqRepoCloudBrainReader, repo.TrainJobNewVersion) m.Post("/create_version", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreateVersion) + m.Post("/stop_version", reqRepoCloudBrainWriter, repo.TrainJobVersionStop) + m.Post("/del_version", reqRepoCloudBrainWriter, repo.TrainJobVersionDel) }) m.Get("/create", reqRepoCloudBrainReader, repo.TrainJobNew) m.Post("/create", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreate) From 18e894850c8ab8a9efc13933876bc3792169e7db Mon Sep 17 00:00:00 2001 From: liuzx Date: Wed, 10 Nov 2021 17:45:58 +0800 Subject: [PATCH 07/66] =?UTF-8?q?=E6=95=B4=E5=90=88cloudbrain=E8=A1=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- models/cloudbrain.go | 15 ++- modules/modelarts/modelarts.go | 151 ++++++++++++++++-------------- routers/repo/modelarts.go | 164 +++++++++++++++++---------------- 3 files changed, 182 insertions(+), 148 deletions(-) diff --git a/models/cloudbrain.go b/models/cloudbrain.go index 04c4dbac3..e4d8461fb 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -69,8 +69,8 @@ type Cloudbrain struct { CanDel bool `xorm:"-"` Type int `xorm:"INDEX DEFAULT 0"` - VersionID int64 `xorm:"INDEX DEFAULT 0"` - VersionName string + VersionID int64 `xorm:"INDEX DEFAULT 0"` + VersionName string `xorm:"INDEX"` Uuid string DatasetName string VersionCount int64 `xorm:"INDEX DEFAULT 1"` @@ -80,6 +80,17 @@ type Cloudbrain struct { ComputeResource string EngineID int64 + TrainUrl string + BranchName string + Parameters string + BootFile string + DataUrl string + LogUrl string + PreVersionId int64 + FlavorCode string + Description string + WorkServerNumber int + User *User `xorm:"-"` Repo *Repository `xorm:"-"` } diff --git a/modules/modelarts/modelarts.go b/modules/modelarts/modelarts.go index 9f7b67c06..7d87ca1b2 100755 --- a/modules/modelarts/modelarts.go +++ b/modules/modelarts/modelarts.go @@ -35,19 +35,20 @@ const ( // "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," + // "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" + // "]}" - CodePath = "/code/" - OutputPath = "/output/" - LogPath = "/log/" - JobPath = "/job/" - OrderDesc = "desc" //向下查询 - OrderAsc = "asc" //向上查询 - Lines = 20 - TrainUrl = "train_url" - DataUrl = "data_url" - PerPage = 10 - IsLatestVersion = "1" - NotLatestVersion = "0" - ComputeResource = "NPU" + CodePath = "/code/" + OutputPath = "/output/" + LogPath = "/log/" + JobPath = "/job/" + OrderDesc = "desc" //向下查询 + OrderAsc = "asc" //向上查询 + Lines = 20 + TrainUrl = "train_url" + DataUrl = "data_url" + PerPage = 10 + IsLatestVersion = "1" + NotLatestVersion = "0" + ComputeResource = "NPU" + InitFatherVersionName = "V0001" SortByCreateTime = "create_time" ConfigTypeCustom = "custom" @@ -59,21 +60,24 @@ var ( ) type GenerateTrainJobReq struct { - JobName string - Uuid string - Description string - CodeObsPath string - BootFile string - DataUrl string - TrainUrl string - FlavorCode string - LogUrl string - PoolID string - WorkServerNumber int - EngineID int64 - Parameters []models.Parameter - CommitID string - IsLatestVersion string + JobName string + Uuid string + Description string + CodeObsPath string + BootFile string + DataUrl string + TrainUrl string + FlavorCode string + LogUrl string + PoolID string + WorkServerNumber int + EngineID int64 + Parameters []models.Parameter + CommitID string + IsLatestVersion string + Params string + BranchName string + FatherVersionName string } type GenerateTrainJobVersionReq struct { @@ -90,8 +94,10 @@ type GenerateTrainJobVersionReq struct { WorkServerNumber int EngineID int64 Parameters []models.Parameter + Params string PreVersionId int64 CommitID string + BranchName string } type VersionInfo struct { @@ -193,7 +199,7 @@ func GenerateTask(ctx *context.Context, jobName, uuid, description string) error return nil } -func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (jobresult *models.CreateTrainJobResult, err error) { +func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) { jobResult, err := createTrainJob(models.CreateTrainJobParams{ JobName: req.JobName, Description: req.Description, @@ -215,42 +221,52 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (jobresult }) if err != nil { log.Error("CreateJob failed: %v", err.Error()) - return nil, err + return err } attach, err := models.GetAttachmentByUUID(req.Uuid) if err != nil { log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error()) - return nil, err + return err } err = models.CreateCloudbrain(&models.Cloudbrain{ - Status: TransTrainJobStatus(jobResult.Status), - UserID: ctx.User.ID, - RepoID: ctx.Repo.Repository.ID, - JobID: strconv.FormatInt(jobResult.JobID, 10), - JobName: req.JobName, - JobType: string(models.JobTypeTrain), - Type: models.TypeCloudBrainTwo, - VersionID: jobResult.VersionID, - VersionName: jobResult.VersionName, - Uuid: req.Uuid, - DatasetName: attach.Name, - CommitID: req.CommitID, - IsLatestVersion: req.IsLatestVersion, - ComputeResource: ComputeResource, - EngineID: req.EngineID, + Status: TransTrainJobStatus(jobResult.Status), + UserID: ctx.User.ID, + RepoID: ctx.Repo.Repository.ID, + JobID: strconv.FormatInt(jobResult.JobID, 10), + JobName: req.JobName, + JobType: string(models.JobTypeTrain), + Type: models.TypeCloudBrainTwo, + VersionID: jobResult.VersionID, + VersionName: jobResult.VersionName, + Uuid: req.Uuid, + DatasetName: attach.Name, + CommitID: req.CommitID, + IsLatestVersion: req.IsLatestVersion, + ComputeResource: ComputeResource, + EngineID: req.EngineID, + FatherVersionName: req.FatherVersionName, + TrainUrl: req.TrainUrl, + BranchName: req.BranchName, + Parameters: req.Params, + BootFile: req.BootFile, + DataUrl: req.DataUrl, + LogUrl: req.LogUrl, + FlavorCode: req.FlavorCode, + Description: req.Description, + WorkServerNumber: req.WorkServerNumber, }) if err != nil { log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error()) - return nil, err + return err } - return jobResult, nil + return nil } -func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobVersionReq, jobId string, fatherVersionName string) (jobresult *models.CreateTrainJobResult, err error) { +func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobVersionReq, jobId string, fatherVersionName string) (err error) { jobResult, err := createTrainJobVersion(models.CreateTrainJobVersionParams{ Description: req.Description, Config: models.TrainJobVersionConfig{ @@ -271,13 +287,13 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobVersionR }, jobId) if err != nil { log.Error("CreateJob failed: %v", err.Error()) - return nil, err + return err } attach, err := models.GetAttachmentByUUID(req.Uuid) if err != nil { log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error()) - return nil, err + return err } err = models.CreateCloudbrain(&models.Cloudbrain{ @@ -296,10 +312,20 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobVersionR FatherVersionName: fatherVersionName, ComputeResource: ComputeResource, EngineID: req.EngineID, + TrainUrl: req.TrainUrl, + BranchName: req.BranchName, + Parameters: req.Params, + BootFile: req.BootFile, + DataUrl: req.DataUrl, + LogUrl: req.LogUrl, + PreVersionId: req.PreVersionId, + FlavorCode: req.FlavorCode, + Description: req.Description, + WorkServerNumber: req.WorkServerNumber, }) if err != nil { log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error()) - return nil, err + return err } repo := ctx.Repo.Repository @@ -319,38 +345,29 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobVersionR }) if err != nil { ctx.ServerError("Cloudbrain", err) - return nil, err + return err } //将训练任务的上一版本的isLatestVersion设置为"0" latestTask, err := models.GetCloudbrainByJobIDAndIsLatestVersion(strconv.FormatInt(jobResult.JobID, 10), IsLatestVersion) if err != nil { ctx.ServerError("GetCloudbrainByJobIDAndIsLatestVersion faild:", err) - return nil, err + return err } err = models.SetVersionCountAndLatestVersionByJobIDAndVersionName(strconv.FormatInt(jobResult.JobID, 10), latestTask.VersionName, VersionListCount, NotLatestVersion) if err != nil { ctx.ServerError("UpdateJobVersionCount failed", err) - return nil, err + return err } - // lastVersionNum := jobResult.VersionName[1:] - // lastVersionNumToInt64, err := strconv.ParseInt(lastVersionNum, 10, 64) - // if err != nil { - // ctx.ServerError("lastVersionNumToInt64 faild:", err) - // return nil - // } - // lastVersionName := "V" + strconv.FormatInt(lastVersionNumToInt64-1, 10) - //将训练任务的本版本的isLatestVersion设置为"0" - - //将当前版本的isLatestVersion和任务数量更新 + //将当前版本的isLatestVersion设置为"1"和任务数量更新 err = models.SetVersionCountAndLatestVersionByJobIDAndVersionName(strconv.FormatInt(jobResult.JobID, 10), jobResult.VersionName, VersionListCount, IsLatestVersion) if err != nil { ctx.ServerError("UpdateJobVersionCount failed", err) - return nil, err + return err } - return jobResult, err + return err } func TransTrainJobStatus(status int) string { diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index 677c9530b..8426f99d8 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -833,24 +833,27 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) } req := &modelarts.GenerateTrainJobReq{ - JobName: jobName, - DataUrl: dataPath, - Description: description, - CodeObsPath: codeObsPath, - BootFile: codeObsPath + bootFile, - TrainUrl: outputObsPath, - FlavorCode: flavorCode, - WorkServerNumber: workServerNumber, - EngineID: int64(engineID), - LogUrl: logObsPath, - PoolID: poolID, - Uuid: uuid, - Parameters: parameters.Parameter, - CommitID: commitID, - IsLatestVersion: isLatestVersion, - } - - jobResult, err := modelarts.GenerateTrainJob(ctx, req) + JobName: jobName, + DataUrl: dataPath, + Description: description, + CodeObsPath: codeObsPath, + BootFile: codeObsPath + bootFile, + TrainUrl: outputObsPath, + FlavorCode: flavorCode, + WorkServerNumber: workServerNumber, + EngineID: int64(engineID), + LogUrl: logObsPath, + PoolID: poolID, + Uuid: uuid, + Parameters: parameters.Parameter, + CommitID: commitID, + IsLatestVersion: isLatestVersion, + BranchName: branch_name, + Params: form.Params, + FatherVersionName: modelarts.InitFatherVersionName, + } + + err = modelarts.GenerateTrainJob(ctx, req) if err != nil { log.Error("GenerateTrainJob failed:%v", err.Error()) trainJobNewDataPrepare(ctx) @@ -862,34 +865,34 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) return } - // 保存openi创建训练任务界面的参数 - err = models.CreateTrainjobConfigDetail(&models.TrainjobConfigDetail{ - - JobName: req.JobName, - JobID: strconv.FormatInt(jobResult.JobID, 10), - VersionName: jobResult.VersionName, - ResourcePools: form.PoolID, - EngineVersions: form.EngineID, - FlavorInfos: form.Flavor, - TrainUrl: outputObsPath, - BootFile: form.BootFile, - Uuid: form.Attachment, - DatasetName: attach.Name, - Params: form.Params, - BranchName: branch_name, - }) + // // 保存openi创建训练任务界面的参数 + // err = models.CreateTrainjobConfigDetail(&models.TrainjobConfigDetail{ - if err != nil { - log.Error("CreateTrainjobConfigDetail failed:%v", err.Error()) - trainJobNewVersionDataPrepare(ctx) - ctx.Data["bootFile"] = form.BootFile - ctx.Data["uuid"] = form.Attachment - ctx.Data["datasetName"] = attach.Name - ctx.Data["params"] = form.Params - ctx.Data["branch_name"] = branch_name - ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form) - return - } + // JobName: req.JobName, + // JobID: strconv.FormatInt(jobResult.JobID, 10), + // VersionName: jobResult.VersionName, + // ResourcePools: form.PoolID, + // EngineVersions: form.EngineID, + // FlavorInfos: form.Flavor, + // TrainUrl: outputObsPath, + // BootFile: form.BootFile, + // Uuid: form.Attachment, + // DatasetName: attach.Name, + // Params: form.Params, + // BranchName: branch_name, + // }) + + // if err != nil { + // log.Error("CreateTrainjobConfigDetail failed:%v", err.Error()) + // trainJobNewVersionDataPrepare(ctx) + // ctx.Data["bootFile"] = form.BootFile + // ctx.Data["uuid"] = form.Attachment + // ctx.Data["datasetName"] = attach.Name + // ctx.Data["params"] = form.Params + // ctx.Data["branch_name"] = branch_name + // ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form) + // return + // } ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") } @@ -1063,11 +1066,12 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ LogUrl: logObsPath, PoolID: poolID, Uuid: uuid, - Parameters: parameters.Parameter, + Params: form.Params, PreVersionId: task.VersionID, CommitID: commitID, + BranchName: branch_name, } - jobResult, err := modelarts.GenerateTrainJobVersion(ctx, req, jobID, fatherVersionName) + err = modelarts.GenerateTrainJobVersion(ctx, req, jobID, fatherVersionName) if err != nil { log.Error("GenerateTrainJob failed:%v", err.Error()) trainJobNewVersionDataPrepare(ctx) @@ -1079,33 +1083,33 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ return } // 保存openi创建训练任务界面的参数 - err = models.CreateTrainjobConfigDetail(&models.TrainjobConfigDetail{ - - JobName: req.JobName, - JobID: strconv.FormatInt(jobResult.JobID, 10), - VersionName: jobResult.VersionName, - ResourcePools: form.PoolID, - EngineVersions: form.EngineID, - FlavorInfos: form.Flavor, - TrainUrl: outputObsPath, - BootFile: form.BootFile, - Uuid: form.Attachment, - DatasetName: attach.Name, - Params: form.Params, - BranchName: branch_name, - }) + // err = models.CreateTrainjobConfigDetail(&models.TrainjobConfigDetail{ + + // JobName: req.JobName, + // JobID: strconv.FormatInt(jobResult.JobID, 10), + // VersionName: jobResult.VersionName, + // ResourcePools: form.PoolID, + // EngineVersions: form.EngineID, + // FlavorInfos: form.Flavor, + // TrainUrl: outputObsPath, + // BootFile: form.BootFile, + // Uuid: form.Attachment, + // DatasetName: attach.Name, + // Params: form.Params, + // BranchName: branch_name, + // }) - if err != nil { - log.Error("CreateTrainjobConfigDetail failed:%v", err.Error()) - trainJobNewVersionDataPrepare(ctx) - ctx.Data["bootFile"] = form.BootFile - ctx.Data["uuid"] = form.Attachment - ctx.Data["datasetName"] = attach.Name - ctx.Data["params"] = form.Params - ctx.Data["branch_name"] = branch_name - ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form) - return - } + // if err != nil { + // log.Error("CreateTrainjobConfigDetail failed:%v", err.Error()) + // trainJobNewVersionDataPrepare(ctx) + // ctx.Data["bootFile"] = form.BootFile + // ctx.Data["uuid"] = form.Attachment + // ctx.Data["datasetName"] = attach.Name + // ctx.Data["params"] = form.Params + // ctx.Data["branch_name"] = branch_name + // ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form) + // return + // } // ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow) } @@ -1387,18 +1391,18 @@ func TrainJobStop(ctx *context.Context) { func TrainJobVersionDel(ctx *context.Context) { var jobID = ctx.Params(":jobid") - var versionName = ctx.Params(":versionName") + var versionName = ctx.Query(":versionName") task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName) if err != nil { log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error()) - ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil) + ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil) return } _, err = modelarts.DelTrainJob(jobID) if err != nil { log.Error("DelTrainJob(%s) failed:%v", task.JobName, err.Error()) - ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil) + ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil) return } @@ -1408,12 +1412,13 @@ func TrainJobVersionDel(ctx *context.Context) { return } - ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") + // ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") + ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow) } func TrainJobVersionStop(ctx *context.Context) { var jobID = ctx.Params(":jobid") - var versionName = ctx.Params(":versionName") + var versionName = ctx.Query(":versionName") task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName) if err != nil { log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error()) @@ -1428,7 +1433,8 @@ func TrainJobVersionStop(ctx *context.Context) { return } - ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") + // ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") + ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow) } func canUserCreateTrainJob(uid int64) (bool, error) { From 2823b028a82d77a91f658a1bc89b7e830df3d532 Mon Sep 17 00:00:00 2001 From: liuzx Date: Wed, 10 Nov 2021 18:58:25 +0800 Subject: [PATCH 08/66] =?UTF-8?q?=E5=BC=95=E6=93=8E=E7=89=88=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- models/cloudbrain.go | 1 + modules/auth/modelarts.go | 1 + modules/modelarts/modelarts.go | 4 ++++ routers/repo/modelarts.go | 27 ++++++++++++++++++++------- 4 files changed, 26 insertions(+), 7 deletions(-) diff --git a/models/cloudbrain.go b/models/cloudbrain.go index e4d8461fb..a061857c4 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -90,6 +90,7 @@ type Cloudbrain struct { FlavorCode string Description string WorkServerNumber int + FlavorName string User *User `xorm:"-"` Repo *Repository `xorm:"-"` diff --git a/modules/auth/modelarts.go b/modules/auth/modelarts.go index a53661b74..3cd8ac637 100755 --- a/modules/auth/modelarts.go +++ b/modules/auth/modelarts.go @@ -40,6 +40,7 @@ type CreateModelArtsTrainJobForm struct { PrameterDescription string `form:"parameter_description"` BranchName string `form:"branch_name" binding:"Required"` VersionName string `form:"version_name" binding:"Required"` + FlavorName string `form:"flavor_name" binding:"Required"` } func (f *CreateModelArtsTrainJobForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors { diff --git a/modules/modelarts/modelarts.go b/modules/modelarts/modelarts.go index 7d87ca1b2..88378ab10 100755 --- a/modules/modelarts/modelarts.go +++ b/modules/modelarts/modelarts.go @@ -78,6 +78,7 @@ type GenerateTrainJobReq struct { Params string BranchName string FatherVersionName string + FlavorName string } type GenerateTrainJobVersionReq struct { @@ -98,6 +99,7 @@ type GenerateTrainJobVersionReq struct { PreVersionId int64 CommitID string BranchName string + FlavorName string } type VersionInfo struct { @@ -256,6 +258,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error FlavorCode: req.FlavorCode, Description: req.Description, WorkServerNumber: req.WorkServerNumber, + FlavorName: req.FlavorName, }) if err != nil { @@ -322,6 +325,7 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobVersionR FlavorCode: req.FlavorCode, Description: req.Description, WorkServerNumber: req.WorkServerNumber, + FlavorName: req.FlavorName, }) if err != nil { log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error()) diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index 8426f99d8..b5b5b07a9 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -620,12 +620,17 @@ func TrainJobNewVersion(ctx *context.Context) { func trainJobNewVersionDataPrepare(ctx *context.Context) error { ctx.Data["PageIsCloudBrain"] = true var jobID = ctx.Params(":jobid") - var versionName = ctx.Query("versionName") - jobID = "19373" + var versionName = ctx.Query("version_name") + + task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName) + if err != nil { + log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error()) + return err + } t := time.Now() var jobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:] - ctx.Data["job_name"] = jobName + ctx.Data["job_name"] = task.JobName attachs, err := models.GetModelArtsUserAttachments(ctx.User.ID) if err != nil { @@ -670,10 +675,14 @@ func trainJobNewVersionDataPrepare(ctx *context.Context) error { ctx.ServerError("GetBranches error:", err) return err } - ctx.Data["Branches"] = Branches - ctx.Data["BranchesCount"] = len(Branches) - ctx.Data["jobID"] = jobID - ctx.Data["versionName"] = versionName + ctx.Data["branches"] = Branches + ctx.Data["branch_name"] = task.BranchName + ctx.Data["description"] = task.Description + ctx.Data["boot_file"] = task.BootFile + ctx.Data["dataset_name"] = task.DatasetName + ctx.Data["params"] = task.Parameters + ctx.Data["work_server_number"] = task.WorkServerNumber + ctx.Data["flavor_name"] = task.FlavorName configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom) if err != nil { @@ -705,6 +714,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/" branch_name := form.BranchName isLatestVersion := modelarts.IsLatestVersion + FlavorName := form.FlavorName if err := paramCheckCreateTrainJob(form); err != nil { log.Error("paramCheckCreateTrainJob failed:(%v)", err) @@ -851,6 +861,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) BranchName: branch_name, Params: form.Params, FatherVersionName: modelarts.InitFatherVersionName, + FlavorName: FlavorName, } err = modelarts.GenerateTrainJob(ctx, req) @@ -921,6 +932,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/" branch_name := form.BranchName fatherVersionName := form.VersionName + FlavorName := form.FlavorName if err := paramCheckCreateTrainJob(form); err != nil { log.Error("paramCheckCreateTrainJob failed:(%v)", err) @@ -1070,6 +1082,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ PreVersionId: task.VersionID, CommitID: commitID, BranchName: branch_name, + FlavorName: FlavorName, } err = modelarts.GenerateTrainJobVersion(ctx, req, jobID, fatherVersionName) if err != nil { From 800bd819bbcfd280f9da343a8bd9c151f77ff9e7 Mon Sep 17 00:00:00 2001 From: liuzx Date: Thu, 11 Nov 2021 10:26:22 +0800 Subject: [PATCH 09/66] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E8=AE=AD=E7=BB=83?= =?UTF-8?q?=E7=89=88=E6=9C=AC=E7=9A=84=E6=A8=A1=E5=9E=8B=E4=B8=8B=E8=BD=BD?= =?UTF-8?q?=E6=8E=A5=E5=8F=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- modules/storage/obs.go | 72 ++++++++++++++++++++++++++++++++++----- routers/repo/modelarts.go | 36 ++++++++++++++++++-- routers/routes/routes.go | 1 + 3 files changed, 99 insertions(+), 10 deletions(-) diff --git a/modules/storage/obs.go b/modules/storage/obs.go index 8e72c03c4..e8552e2b4 100755 --- a/modules/storage/obs.go +++ b/modules/storage/obs.go @@ -185,10 +185,10 @@ func GetObsListObject(jobName, parentDir string) ([]FileInfo, error) { for _, val := range output.Contents { str1 := strings.Split(val.Key, "/") var isDir bool - var fileName,nextParentDir string + var fileName, nextParentDir string if strings.HasSuffix(val.Key, "/") { //dirs in next level dir - if len(str1) - len(strPrefix) > 2 { + if len(str1)-len(strPrefix) > 2 { continue } fileName = str1[len(str1)-2] @@ -199,12 +199,12 @@ func GetObsListObject(jobName, parentDir string) ([]FileInfo, error) { nextParentDir = parentDir + "/" + fileName } - if fileName == strPrefix[len(strPrefix)-1] || (fileName + "/") == setting.OutPutPath { + if fileName == strPrefix[len(strPrefix)-1] || (fileName+"/") == setting.OutPutPath { continue } } else { //files in next level dir - if len(str1) - len(strPrefix) > 1 { + if len(str1)-len(strPrefix) > 1 { continue } fileName = str1[len(str1)-1] @@ -213,10 +213,66 @@ func GetObsListObject(jobName, parentDir string) ([]FileInfo, error) { } fileInfo := FileInfo{ - ModTime: val.LastModified.Format("2006-01-02 15:04:05"), + ModTime: val.LastModified.Format("2006-01-02 15:04:05"), FileName: fileName, - Size: val.Size, - IsDir:isDir, + Size: val.Size, + IsDir: isDir, + ParenDir: nextParentDir, + } + fileInfos = append(fileInfos, fileInfo) + } + return fileInfos, err + } else { + if obsError, ok := err.(obs.ObsError); ok { + log.Error("Code:%s, Message:%s", obsError.Code, obsError.Message) + } + return nil, err + } +} + +func GetVersionObsListObject(jobName, parentDir string) ([]FileInfo, error) { + input := &obs.ListObjectsInput{} + input.Bucket = setting.Bucket + input.Prefix = strings.TrimPrefix(path.Join(setting.TrainJobModelPath, jobName, setting.OutPutPath, parentDir), "/") + strPrefix := strings.Split(input.Prefix, "/") + output, err := ObsCli.ListObjects(input) + fileInfos := make([]FileInfo, 0) + if err == nil { + for _, val := range output.Contents { + str1 := strings.Split(val.Key, "/") + var isDir bool + var fileName, nextParentDir string + if strings.HasSuffix(val.Key, "/") { + //dirs in next level dir + if len(str1)-len(strPrefix) > 2 { + continue + } + fileName = str1[len(str1)-2] + isDir = true + if parentDir == "" { + nextParentDir = fileName + } else { + nextParentDir = parentDir + "/" + fileName + } + + if fileName == strPrefix[len(strPrefix)-1] || (fileName+"/") == setting.OutPutPath { + continue + } + } else { + //files in next level dir + if len(str1)-len(strPrefix) > 1 { + continue + } + fileName = str1[len(str1)-1] + isDir = false + nextParentDir = parentDir + } + + fileInfo := FileInfo{ + ModTime: val.LastModified.Format("2006-01-02 15:04:05"), + FileName: fileName, + Size: val.Size, + IsDir: isDir, ParenDir: nextParentDir, } fileInfos = append(fileInfos, fileInfo) @@ -257,7 +313,7 @@ func GetObsCreateSignedUrl(jobName, parentDir, fileName string) (string, error) input := &obs.CreateSignedUrlInput{} input.Bucket = setting.Bucket input.Key = strings.TrimPrefix(path.Join(setting.TrainJobModelPath, jobName, setting.OutPutPath, parentDir, fileName), "/") - + input.Expires = 60 * 60 input.Method = obs.HttpMethodGet diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index b5b5b07a9..8e75c58a9 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -1219,6 +1219,10 @@ func TrainJobShow(ctx *context.Context) { var jobID = ctx.Params(":jobid") task, err := models.GetCloudbrainByJobID(jobID) + if err != nil { + ctx.ServerError("GetCloudbrainByJobID faild", err) + return + } repo := ctx.Repo.Repository page := ctx.QueryInt("page") @@ -1290,8 +1294,8 @@ func TrainJobShow(ctx *context.Context) { ctx.Data["task"] = task ctx.Data["jobID"] = jobID ctx.Data["result"] = result - ctx.Data["VersionListTasks"] = VersionListTasks - ctx.Data["VersionLisCount"] = VersionListCount + ctx.Data["version_list_task"] = VersionListTasks + ctx.Data["version_list_count"] = VersionListCount ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow) } @@ -1541,6 +1545,34 @@ func TrainJobShowModels(ctx *context.Context) { ctx.HTML(200, tplModelArtsTrainJobShowModels) } +func TrainJobVersionShowModels(ctx *context.Context) { + ctx.Data["PageIsCloudBrain"] = true + + jobID := ctx.Params(":jobid") + parentDir := ctx.Query("parentDir") + versionName := ctx.Query("version_name") + dirArray := strings.Split(parentDir, "/") + task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName) + if err != nil { + log.Error("no such job!", ctx.Data["msgID"]) + ctx.ServerError("no such job:", err) + return + } + parentDir = versionName + models, err := storage.GetVersionObsListObject(task.JobName, parentDir) + if err != nil { + log.Info("get TrainJobListModel failed:", err) + ctx.ServerError("GetVersionObsListObject:", err) + return + } + + ctx.Data["Path"] = dirArray + ctx.Data["Dirs"] = models + ctx.Data["task"] = task + ctx.Data["JobID"] = jobID + ctx.HTML(200, tplModelArtsTrainJobShowModels) +} + func TrainJobDownloadModel(ctx *context.Context) { parentDir := ctx.Query("parentDir") fileName := ctx.Query("fileName") diff --git a/routers/routes/routes.go b/routers/routes/routes.go index 153a77e93..579137d1e 100755 --- a/routers/routes/routes.go +++ b/routers/routes/routes.go @@ -988,6 +988,7 @@ func RegisterRoutes(m *macaron.Macaron) { m.Get("/log", reqRepoCloudBrainReader, repo.TrainJobGetLog) m.Get("/models", reqRepoCloudBrainReader, repo.TrainJobShowModels) m.Get("/download_model", reqRepoCloudBrainReader, repo.TrainJobDownloadModel) + m.Get("/version_models", reqRepoCloudBrainReader, repo.TrainJobVersionShowModels) m.Get("/create_version", reqRepoCloudBrainReader, repo.TrainJobNewVersion) m.Post("/create_version", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreateVersion) m.Post("/stop_version", reqRepoCloudBrainWriter, repo.TrainJobVersionStop) From 23d4f5acd76653418151a5bf69f9a1974e66e19a Mon Sep 17 00:00:00 2001 From: zhoupzh Date: Thu, 11 Nov 2021 10:39:04 +0800 Subject: [PATCH 10/66] fix issue --- options/locale/locale_en-US.ini | 5 + options/locale/locale_zh-CN.ini | 6 + templates/repo/cloudbrain/index.tmpl | 4 +- templates/repo/modelarts/trainjob/index.tmpl | 116 ++++++++++--------- templates/repo/modelarts/trainjob/new.tmpl | 93 +++------------ 5 files changed, 92 insertions(+), 132 deletions(-) diff --git a/options/locale/locale_en-US.ini b/options/locale/locale_en-US.ini index a722ea671..e2a3008c9 100644 --- a/options/locale/locale_en-US.ini +++ b/options/locale/locale_en-US.ini @@ -792,6 +792,10 @@ total_count_get_error=Can not get the total page. last_update_time_error=Can not get the last updated time. get_repo_stat_error=Can not get the statistics of the repository. get_repo_info_error=Can not get the information of the repository. +modelarts.status=Status +modelarts.createtime=CreateTime +modelarts.version_nums = Version Nums +modelarts.computing_resources=compute Resources modelarts.notebook=Debug Task modelarts.train_job=Train Task modelarts.train_job.new_debug= New Debug Task @@ -820,6 +824,7 @@ modelarts.train_job.AI_driver=AI Engine modelarts.train_job.start_file=Start File modelarts.train_job.boot_file_helper=The startup file is the entry file that your program executes, and it must be a file ending in .py modelarts.train_job.dataset=Dataset +modelarts.code_version = Code Version modelarts.train_job.run_parameter=Run Parameter modelarts.train_job.add_run_parameter=Add Run Parameter modelarts.train_job.parameter_name=Parameter Name diff --git a/options/locale/locale_zh-CN.ini b/options/locale/locale_zh-CN.ini index e89c3dcde..bf4fd4bd5 100755 --- a/options/locale/locale_zh-CN.ini +++ b/options/locale/locale_zh-CN.ini @@ -794,6 +794,11 @@ total_count_get_error=查询总页数失败。 last_update_time_error=查询最新更新时间失败。 get_repo_stat_error=查询当前仓库的统计信息失败。 get_repo_info_error=查询当前仓库信息失败。 + +modelarts.status=状态 +modelarts.createtime=创建时间 +modelarts.version_nums=版本数 +modelarts.computing_resources=计算资源 modelarts.notebook=调试任务 modelarts.train_job=训练任务 modelarts.train_job.new_debug=新建调试任务 @@ -823,6 +828,7 @@ modelarts.train_job.start_file=启动文件 modelarts.train_job.boot_file_helper=启动文件是您程序执行的入口文件,必须是以.py结尾的文件。 modelarts.train_job.boot_file_place=填写启动文件路径,默认为train.py modelarts.train_job.dataset=数据集 +modelarts.code_version=代码版本 modelarts.train_job.run_parameter=运行参数 modelarts.train_job.add_run_parameter=增加运行参数 modelarts.train_job.parameter_name=参数名 diff --git a/templates/repo/cloudbrain/index.tmpl b/templates/repo/cloudbrain/index.tmpl index 9099cb17a..d72ffb0c4 100755 --- a/templates/repo/cloudbrain/index.tmpl +++ b/templates/repo/cloudbrain/index.tmpl @@ -239,8 +239,8 @@
diff --git a/templates/repo/modelarts/trainjob/index.tmpl b/templates/repo/modelarts/trainjob/index.tmpl index 1453da9e5..4abf934ea 100755 --- a/templates/repo/modelarts/trainjob/index.tmpl +++ b/templates/repo/modelarts/trainjob/index.tmpl @@ -180,6 +180,12 @@ cursor: pointer; pointer-events: none; } + .fontsize14{ + font-size: 14px; + } + .padding0{ + padding: 0 !important; + } @@ -232,13 +238,13 @@
- @@ -278,20 +284,29 @@
-
+
{{$.i18n.Tr "repo.cloudbrain_task"}}
-
- {{$.i18n.Tr "repo.cloudbrain_status_createtime"}} +
+ {{$.i18n.Tr "repo.modelarts.version_nums"}} +
+
+ {{$.i18n.Tr "repo.modelarts.status"}} +
+
+ {{$.i18n.Tr "repo.modelarts.createtime"}}
-
+
{{$.i18n.Tr "repo.cloudbrain_status_runtime"}}
-
+
+ {{$.i18n.Tr "repo.modelarts.computing_resources"}} +
+
{{$.i18n.Tr "repo.cloudbrain_creator"}}
-
- {{$.i18n.Tr "repo.cloudbrain_operate"}} +
+ {{$.i18n.Tr "repo.cloudbrain_operate"}}
@@ -305,38 +320,44 @@
-
- + - -
- - + +
+ {{.VersionCount}} +
+ +
+ + {{.Status}} + +
+ +
+ {{TimeSinceUnix .Cloudbrain.CreatedUnix $.Lang}} +
+ + {{TimeSinceUnix .Cloudbrain.CreatedUnix $.Lang}} +
--> + +
+
- -
- - - - - - - + +
+ {{.ComputeResource}}
- -
+ +
{{if .User.Name}} {{else}} @@ -344,56 +365,41 @@ {{end}}
-
+
- -
+ {{$.CsrfTokenHtml}} {{if $.Permission.CanWrite $.UnitTypeCloudBrain}} - + {{$.i18n.Tr "repo.stop"}} {{else}} - + {{$.i18n.Tr "repo.stop"}} {{end}}
-
+
{{$.CsrfTokenHtml}} {{if $.Permission.CanWrite $.UnitTypeCloudBrain}} - + {{$.i18n.Tr "repo.delete"}} {{else}} - + {{$.i18n.Tr "repo.delete"}} {{end}}
- - - -
{{end}} {{template "base/paginate" .}} diff --git a/templates/repo/modelarts/trainjob/new.tmpl b/templates/repo/modelarts/trainjob/new.tmpl index 97b7b50e0..78d23e482 100755 --- a/templates/repo/modelarts/trainjob/new.tmpl +++ b/templates/repo/modelarts/trainjob/new.tmpl @@ -163,18 +163,22 @@
-

{{.i18n.Tr "repo.modelarts.train_job.parameter_setting"}}:

+ +
+ + +
+ + +
@@ -183,52 +187,19 @@ {{end}} -
+
- +
- +
{{if .bootFile}} @@ -255,16 +226,12 @@
- - {{.i18n.Tr "repo.modelarts.train_job.add_run_parameter"}}
-
- + - -
- -
+ + + +
+
+ 2021/11/08 19:35:19 + 当前版本:V0073 + 父版本:V0070 + 状态 + 运行成功 + + 运行时间: + 01:09:50 + + +
+ +
- +
+
+ - +
+
+
+
+
+ + + + -
- -
-
- - {{$.i18n.Tr "repo.modelarts.train_job.version"}} -
-
+ 作业名称 +
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
{{.i18n.Tr "repo.modelarts.train_job.basic_info"}}
{{.i18n.Tr "repo.modelarts.train_job.job_name"}} {{.result.JobName}}
{{.i18n.Tr "repo.modelarts.train_job.job_status"}} {{.result.Status}}
{{.i18n.Tr "repo.modelarts.train_job.version"}} {{.result.VersionName}}
{{.i18n.Tr "repo.modelarts.train_job.start_time"}} {{.result.CreateTime}}
{{.i18n.Tr "repo.modelarts.train_job.dura_time"}} {{.result.TrainJobDuration}}
{{.i18n.Tr "repo.modelarts.train_job.description"}} {{.result.Description}}
-
-
- - - - - - - - - - - - - - - - - - - - - - -
{{.i18n.Tr "repo.modelarts.train_job.parameter_setting_info"}}
{{.i18n.Tr "repo.modelarts.train_job.AI_driver"}} {{.result.EngineName}} | {{.result.EngineVersion}}
{{.i18n.Tr "repo.modelarts.train_job.start_file"}}{{.result.BootFileUrl}}
{{.i18n.Tr "repo.modelarts.train_job.dataset"}} {{.result.DatasetName}}
{{.i18n.Tr "repo.modelarts.train_job.run_parameter"}} {{.result.Parameter}}
-
-
- - - - - - - - + + + + + + + + + + + + + +
{{.i18n.Tr "repo.modelarts.train_job.resource_setting_info"}}
{{.i18n.Tr "repo.modelarts.train_job.resource_pool"}} {{.result.PoolName}} +
+ trainjob-d672 | job15b681bc +
+
+ 作业名称 + +
+ trainjob-d672 | job15b681bc +
+
+ 作业名称 + +
+ trainjob-d672 | job15b681bc +
+
+
+
+ + + + + + - - - + + + + - - - + + + +
+ 作业名称 + +
+ trainjob-d672 | job15b681bc +
+
{{.i18n.Tr "repo.modelarts.train_job.amount_of_compute_node"}}{{.result.WorkServerNum}}
+ 作业名称 + +
+ trainjob-d672 | job15b681bc +
+
{{.i18n.Tr "repo.modelarts.train_job.NAS_mount_path"}} {{.result.NasMountPath}}
+ 作业名称 + +
+ trainjob-d672 | job15b681bc +
+
-
-
- -
-
- - {{.log_file_name}} - - - -
-
-
-
{{.log.Content}}
-
- -
+
+ +
+
+
+
+ +
+
+ + {{.log_file_name}} + + +
+
+
+
{{.log.Content}}
+
+
+
+
+
+ asdasd +
+
+
@@ -143,8 +273,15 @@ \ No newline at end of file From 51b18b1be5ef2775098d93f2d40e65f2af64b4b9 Mon Sep 17 00:00:00 2001 From: liuzx Date: Fri, 12 Nov 2021 11:20:59 +0800 Subject: [PATCH 19/66] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=B7=AF=E7=94=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- routers/repo/modelarts.go | 6 ++++-- routers/routes/routes.go | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index 4da5d1d1c..83f070d92 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -620,7 +620,8 @@ func TrainJobNewVersion(ctx *context.Context) { func trainJobNewVersionDataPrepare(ctx *context.Context) error { ctx.Data["PageIsCloudBrain"] = true var jobID = ctx.Params(":jobid") - var versionName = ctx.Query("version_name") + var versionName = ctx.Params(":version-name") + // var versionName = ctx.Query("version_name") task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName) if err != nil { @@ -923,6 +924,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) { ctx.Data["PageIsTrainJob"] = true var jobID = ctx.Params(":jobid") + var versionName = ctx.Params(":version-name") jobName := form.JobName uuid := form.Attachment @@ -941,7 +943,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/" branch_name := form.BranchName - fatherVersionName := form.VersionName + fatherVersionName := versionName FlavorName := form.FlavorName if err := paramCheckCreateTrainJob(form); err != nil { diff --git a/routers/routes/routes.go b/routers/routes/routes.go index 579137d1e..1b94c6dd6 100755 --- a/routers/routes/routes.go +++ b/routers/routes/routes.go @@ -989,8 +989,10 @@ func RegisterRoutes(m *macaron.Macaron) { m.Get("/models", reqRepoCloudBrainReader, repo.TrainJobShowModels) m.Get("/download_model", reqRepoCloudBrainReader, repo.TrainJobDownloadModel) m.Get("/version_models", reqRepoCloudBrainReader, repo.TrainJobVersionShowModels) - m.Get("/create_version", reqRepoCloudBrainReader, repo.TrainJobNewVersion) - m.Post("/create_version", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreateVersion) + m.Group("/:version-name", func() { + m.Get("/create_version", reqRepoCloudBrainReader, repo.TrainJobNewVersion) + m.Post("/create_version", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreateVersion) + }) m.Post("/stop_version", reqRepoCloudBrainWriter, repo.TrainJobVersionStop) m.Post("/del_version", reqRepoCloudBrainWriter, repo.TrainJobVersionDel) }) From ba817105d8f619215c10add2c38020193d6896f2 Mon Sep 17 00:00:00 2001 From: zhoupzh Date: Fri, 12 Nov 2021 11:30:12 +0800 Subject: [PATCH 20/66] update --- templates/repo/modelarts/trainjob/show.tmpl | 2 +- .../repo/modelarts/trainjob/version_new.tmpl | 17 ++++++++++++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/templates/repo/modelarts/trainjob/show.tmpl b/templates/repo/modelarts/trainjob/show.tmpl index dc290dec6..97788b9ff 100755 --- a/templates/repo/modelarts/trainjob/show.tmpl +++ b/templates/repo/modelarts/trainjob/show.tmpl @@ -123,7 +123,7 @@ td, th {
- 修改 + 修改
diff --git a/templates/repo/modelarts/trainjob/version_new.tmpl b/templates/repo/modelarts/trainjob/version_new.tmpl index 2e9b01f21..f5e748410 100644 --- a/templates/repo/modelarts/trainjob/version_new.tmpl +++ b/templates/repo/modelarts/trainjob/version_new.tmpl @@ -103,6 +103,9 @@ -webkit-animation-delay: -0.8s; animation-delay: -0.8s; } + .left2{ + margin-left: -2px; + } @-webkit-keyframes sk-stretchdelay { 0%, @@ -159,7 +162,7 @@
- +
@@ -171,12 +174,14 @@
- {{if .branch_name}} {{end}} {{range $k, $v :=.branches}} + {{if ne $.branch_name $v}} + {{end}} {{end}} @@ -197,10 +202,12 @@
@@ -227,7 +234,9 @@ {{end}} {{range .attachments}} + {{if ne $.uuid .UUID}} + {{end}} {{end}}
@@ -272,7 +281,9 @@ {{end}} {{range .flavor_infos}} + {{if ne $.flavor_code .Code}} + {{end}} {{end}}
From ed0c53e1945de7ced9fa8ba7bba606b940df6621 Mon Sep 17 00:00:00 2001 From: liuzx Date: Fri, 12 Nov 2021 11:51:11 +0800 Subject: [PATCH 21/66] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=B7=AF=E7=94=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- routers/repo/modelarts.go | 7 ++++--- routers/routes/routes.go | 8 ++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index 83f070d92..c0d36bba9 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -620,8 +620,8 @@ func TrainJobNewVersion(ctx *context.Context) { func trainJobNewVersionDataPrepare(ctx *context.Context) error { ctx.Data["PageIsCloudBrain"] = true var jobID = ctx.Params(":jobid") - var versionName = ctx.Params(":version-name") - // var versionName = ctx.Query("version_name") + // var versionName = ctx.Params(":version-name") + var versionName = ctx.Query("version_name") task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName) if err != nil { @@ -924,7 +924,8 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) { ctx.Data["PageIsTrainJob"] = true var jobID = ctx.Params(":jobid") - var versionName = ctx.Params(":version-name") + // var versionName = ctx.Params(":version-name") + var versionName = ctx.Query("version_name") jobName := form.JobName uuid := form.Attachment diff --git a/routers/routes/routes.go b/routers/routes/routes.go index 1b94c6dd6..c000a6a69 100755 --- a/routers/routes/routes.go +++ b/routers/routes/routes.go @@ -989,10 +989,10 @@ func RegisterRoutes(m *macaron.Macaron) { m.Get("/models", reqRepoCloudBrainReader, repo.TrainJobShowModels) m.Get("/download_model", reqRepoCloudBrainReader, repo.TrainJobDownloadModel) m.Get("/version_models", reqRepoCloudBrainReader, repo.TrainJobVersionShowModels) - m.Group("/:version-name", func() { - m.Get("/create_version", reqRepoCloudBrainReader, repo.TrainJobNewVersion) - m.Post("/create_version", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreateVersion) - }) + // m.Group("/:version-name", func() { + m.Get("/create_version", reqRepoCloudBrainReader, repo.TrainJobNewVersion) + m.Post("/create_version", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreateVersion) + // }) m.Post("/stop_version", reqRepoCloudBrainWriter, repo.TrainJobVersionStop) m.Post("/del_version", reqRepoCloudBrainWriter, repo.TrainJobVersionDel) }) From 7d5dfd1e0e8bc7b0de976351deecbf3764c3ba6e Mon Sep 17 00:00:00 2001 From: zhoupzh Date: Fri, 12 Nov 2021 12:05:10 +0800 Subject: [PATCH 22/66] fix issue --- templates/repo/modelarts/trainjob/show.tmpl | 2 +- templates/repo/modelarts/trainjob/version_new.tmpl | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/templates/repo/modelarts/trainjob/show.tmpl b/templates/repo/modelarts/trainjob/show.tmpl index 97788b9ff..8c63ba8c1 100755 --- a/templates/repo/modelarts/trainjob/show.tmpl +++ b/templates/repo/modelarts/trainjob/show.tmpl @@ -123,7 +123,7 @@ td, th {
- 修改 + 修改
diff --git a/templates/repo/modelarts/trainjob/version_new.tmpl b/templates/repo/modelarts/trainjob/version_new.tmpl index f5e748410..cdddd380f 100644 --- a/templates/repo/modelarts/trainjob/version_new.tmpl +++ b/templates/repo/modelarts/trainjob/version_new.tmpl @@ -316,8 +316,12 @@ @@ -920,4 +1102,8 @@ height: 350px; width: 100%; } + .item_content{ + color: #409eff; + margin-top: 10px; + } \ No newline at end of file diff --git a/web_src/js/components/UserAnalysis.vue b/web_src/js/components/UserAnalysis.vue index 547a077f4..3c07a8807 100755 --- a/web_src/js/components/UserAnalysis.vue +++ b/web_src/js/components/UserAnalysis.vue @@ -1,7 +1,7 @@