diff --git a/models/cloudbrain.go b/models/cloudbrain.go index a106d7433..44b669181 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -52,26 +52,46 @@ type Cloudbrain struct { ID int64 `xorm:"pk autoincr"` JobID string `xorm:"INDEX NOT NULL"` JobType string `xorm:"INDEX NOT NULL DEFAULT 'DEBUG'"` - JobName string `xorm:"INDEX"` - Status string `xorm:"INDEX"` - UserID int64 `xorm:"INDEX"` - RepoID int64 `xorm:"INDEX"` - SubTaskName string `xorm:"INDEX"` + JobName string + Status string + UserID int64 + RepoID int64 + SubTaskName string ContainerID string ContainerIp string CreatedUnix timeutil.TimeStamp `xorm:"INDEX created"` UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"` - Duration int64 `xorm:"INDEX duration"` + Duration int64 TrainJobDuration string DeletedAt time.Time `xorm:"deleted"` CanDebug bool `xorm:"-"` CanDel bool `xorm:"-"` - Type int `xorm:"INDEX DEFAULT 0"` - - VersionID int64 `xorm:"INDEX DEFAULT 0"` - VersionName string - Uuid string - DatasetName string + Type int + + VersionID int64 //版本id + VersionName string `xorm:"INDEX"` //当前版本 + Uuid string //数据集id + DatasetName string + VersionCount int //任务的当前版本数量,不包括删除的 + IsLatestVersion string //是否是最新版本,1是,0否 + CommitID string //提交的仓库代码id + PreVersionName string //父版本名称 + ComputeResource string //计算资源,例如npu + EngineID int64 //引擎id + + TrainUrl string //输出的obs路径 + BranchName string //分支名称 + Parameters string //传给modelarts的param参数 + BootFile string //启动文件 + DataUrl string //数据集的obs路径 + LogUrl string //日志输出的obs路径 + PreVersionId int64 //父版本的版本id + FlavorCode string //modelarts上的规格id + Description string //描述 + WorkServerNumber int //节点数 + FlavorName string //规格名称 + EngineName string //引擎名称 + TotalVersionCount int //任务的所有版本数量,包括删除的 User *User `xorm:"-"` Repo *Repository `xorm:"-"` @@ -150,13 +170,16 @@ type CloudbrainsOptions struct { ListOptions RepoID int64 // include all repos if empty UserID int64 - JobID int64 + JobID string SortType string CloudbrainIDs []int64 // JobStatus 
CloudbrainStatus - Type int - JobType string + Type int + JobType string + VersionName string + IsLatestVersion string } + type TaskPod struct { TaskRoleStatus struct { Name string `json:"name"` @@ -579,20 +602,33 @@ type Config struct { BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下 Parameter []Parameter `json:"parameter"` DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL - //DatasetID string `json:"dataset_id"` - //DataVersionID string `json:"dataset_version_id"` - //DataSource []DataSource `json:"data_source"` - //SpecID int64 `json:"spec_id"` - EngineID int64 `json:"engine_id"` - //ModelID int64 `json:"model_id"` - TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL - LogUrl string `json:"log_url"` + EngineID int64 `json:"engine_id"` + TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL + LogUrl string `json:"log_url"` //UserImageUrl string `json:"user_image_url"` //UserCommand string `json:"user_command"` - CreateVersion bool `json:"create_version"` - //Volumes []Volumes `json:"volumes"` - Flavor Flavor `json:"flavor"` - PoolID string `json:"pool_id"` + CreateVersion bool `json:"create_version"` + Flavor Flavor `json:"flavor"` + PoolID string `json:"pool_id"` +} + +type CreateTrainJobVersionParams struct { + Description string `json:"job_desc"` + Config TrainJobVersionConfig `json:"config"` +} + +type TrainJobVersionConfig struct { + WorkServerNum int `json:"worker_server_num"` + AppUrl string `json:"app_url"` //训练作业的代码目录 + BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下 + Parameter []Parameter `json:"parameter"` + DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL + EngineID int64 `json:"engine_id"` + TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL + LogUrl string `json:"log_url"` + Flavor Flavor `json:"flavor"` + PoolID string `json:"pool_id"` + PreVersionId int64 `json:"pre_version_id"` } type CreateConfigParams struct { @@ -603,20 +639,11 @@ type CreateConfigParams struct { BootFileUrl string 
`json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下 Parameter []Parameter `json:"parameter"` DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL - //DatasetID string `json:"dataset_id"` - //DataVersionID string `json:"dataset_version_id"` - //DataSource []DataSource `json:"data_source"` - //SpecID int64 `json:"spec_id"` - EngineID int64 `json:"engine_id"` - //ModelID int64 `json:"model_id"` - TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL - LogUrl string `json:"log_url"` - //UserImageUrl string `json:"user_image_url"` - //UserCommand string `json:"user_command"` - //CreateVersion bool `json:"create_version"` - //Volumes []Volumes `json:"volumes"` - Flavor Flavor `json:"flavor"` - PoolID string `json:"pool_id"` + EngineID int64 `json:"engine_id"` + TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL + LogUrl string `json:"log_url"` + Flavor Flavor `json:"flavor"` + PoolID string `json:"pool_id"` } type Parameter struct { @@ -730,18 +757,10 @@ type GetConfigResult struct { BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下 Parameter []Parameter `json:"parameter"` DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL - //DatasetID string `json:"dataset_id"` - //DataVersionID string `json:"dataset_version_id"` - //DataSource []DataSource `json:"data_source"` - //SpecID int64 `json:"spec_id"` - EngineID int64 `json:"engine_id"` - //ModelID int64 `json:"model_id"` - TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL - LogUrl string `json:"log_url"` - //UserImageUrl string `json:"user_image_url"` - //UserCommand string `json:"user_command"` - //CreateVersion bool `json:"create_version"` - //Volumes []Volumes `json:"volumes"` + EngineID int64 `json:"engine_id"` + TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL + LogUrl string `json:"log_url"` + Flavor Flavor `json:"flavor"` PoolID string `json:"pool_id"` } @@ -772,25 +791,18 @@ type GetTrainJobResult struct { BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下 Parameter 
[]Parameter `json:"parameter"` DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL - //DatasetID string `json:"dataset_id"` - //DataVersionID string `json:"dataset_version_id"` - //DataSource []DataSource `json:"data_source"` - //SpecID int64 `json:"spec_id"` - EngineID int64 `json:"engine_id"` - EngineName string `json:"engine_name"` - EngineVersion string `json:"engine_version"` - //ModelID int64 `json:"model_id"` - TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL - LogUrl string `json:"log_url"` - //UserImageUrl string `json:"user_image_url"` - //UserCommand string `json:"user_command"` - //Volumes []Volumes `json:"volumes"` - Flavor Flavor `json:"flavor"` - PoolID string `json:"pool_id"` - PoolName string `json:"pool_name"` - NasMountPath string `json:"nas_mount_path"` - NasShareAddr string `json:"nas_share_addr"` - DatasetName string + EngineID int64 `json:"engine_id"` + EngineName string `json:"engine_name"` + EngineVersion string `json:"engine_version"` + TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL + LogUrl string `json:"log_url"` + Flavor Flavor `json:"flavor"` + PoolID string `json:"pool_id"` + PoolName string `json:"pool_name"` + NasMountPath string `json:"nas_mount_path"` + NasShareAddr string `json:"nas_share_addr"` + DatasetName string + ModelMetricList string `json:"model_metric_list"` //列表里包含f1_score,recall,precision,accuracy,若有的话 } type GetTrainJobLogResult struct { @@ -837,7 +849,7 @@ func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { ) } - if (opts.JobID) > 0 { + if (opts.JobID) != "" { cond = cond.And( builder.Eq{"cloudbrain.job_id": opts.JobID}, ) @@ -855,16 +867,11 @@ func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { ) } - // switch opts.JobStatus { - // case JobWaiting: - // cond.And(builder.Eq{"cloudbrain.status": int(JobWaiting)}) - // case JobFailed: - // cond.And(builder.Eq{"cloudbrain.status": int(JobFailed)}) - // case JobStopped: - // 
cond.And(builder.Eq{"cloudbrain.status": int(JobStopped)}) - // case JobSucceeded: - // cond.And(builder.Eq{"cloudbrain.status": int(JobSucceeded)}) - // } + if (opts.IsLatestVersion) != "" { + cond = cond.And( + builder.Eq{"cloudbrain.is_latest_version": opts.IsLatestVersion}, + ) + } if len(opts.CloudbrainIDs) > 0 { cond = cond.And(builder.In("cloudbrain.id", opts.CloudbrainIDs)) @@ -892,16 +899,79 @@ func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) { Find(&cloudbrains); err != nil { return nil, 0, fmt.Errorf("Find: %v", err) } - sess.Close() return cloudbrains, count, nil } +func CloudbrainsVersionList(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int, error) { + sess := x.NewSession() + defer sess.Close() + + var cond = builder.NewCond() + if opts.RepoID > 0 { + cond = cond.And( + builder.Eq{"cloudbrain.repo_id": opts.RepoID}, + ) + } + + if opts.UserID > 0 { + cond = cond.And( + builder.Eq{"cloudbrain.user_id": opts.UserID}, + ) + } + + if (opts.Type) >= 0 { + cond = cond.And( + builder.Eq{"cloudbrain.type": opts.Type}, + ) + } + + if (opts.JobID) != "" { + cond = cond.And( + builder.Eq{"cloudbrain.job_id": opts.JobID}, + ) + } + + if (opts.JobType) != "" { + cond = cond.And( + builder.Eq{"cloudbrain.job_type": opts.JobType}, + ) + } + + if len(opts.CloudbrainIDs) > 0 { + cond = cond.And(builder.In("cloudbrain.id", opts.CloudbrainIDs)) + } + + count, err := sess.Where(cond).Count(new(Cloudbrain)) + if err != nil { + return nil, 0, fmt.Errorf("Count: %v", err) + } + + if opts.Page >= 0 && opts.PageSize > 0 { + var start int + if opts.Page == 0 { + start = 0 + } else { + start = (opts.Page - 1) * opts.PageSize + } + sess.Limit(opts.PageSize, start) + } + + sess.OrderBy("cloudbrain.created_unix DESC") + cloudbrains := make([]*CloudbrainInfo, 0, setting.UI.IssuePagingNum) + if err := sess.Table(&Cloudbrain{}).Where(cond). + Join("left", "`user`", "cloudbrain.user_id = `user`.id"). 
+ Find(&cloudbrains); err != nil { + return nil, 0, fmt.Errorf("Find: %v", err) + } + + return cloudbrains, int(count), nil +} + func CreateCloudbrain(cloudbrain *Cloudbrain) (err error) { if _, err = x.Insert(cloudbrain); err != nil { return err } - return nil } @@ -925,6 +995,16 @@ func GetCloudbrainByJobID(jobID string) (*Cloudbrain, error) { return getRepoCloudBrain(cb) } +func GetCloudbrainByJobIDAndVersionName(jobID string, versionName string) (*Cloudbrain, error) { + cb := &Cloudbrain{JobID: jobID, VersionName: versionName} + return getRepoCloudBrain(cb) +} + +func GetCloudbrainByJobIDAndIsLatestVersion(jobID string, isLatestVersion string) (*Cloudbrain, error) { + cb := &Cloudbrain{JobID: jobID, IsLatestVersion: isLatestVersion} + return getRepoCloudBrain(cb) +} + func GetCloudbrainsNeededStopByUserID(userID int64) ([]*Cloudbrain, error) { cloudBrains := make([]*Cloudbrain, 0) err := x.Cols("job_id", "status", "type").Where("user_id=? AND status !=?", userID, string(JobStopped)).Find(&cloudBrains) @@ -949,6 +1029,12 @@ func SetTrainJobStatusByJobID(jobID string, status string, duration int64, train return } +func SetVersionCountAndLatestVersion(jobID string, versionName string, versionCount int, isLatestVersion string, totalVersionCount int) (err error) { + cb := &Cloudbrain{JobID: jobID, VersionName: versionName, VersionCount: versionCount, IsLatestVersion: isLatestVersion, TotalVersionCount: totalVersionCount} + _, err = x.Cols("version_Count", "is_latest_version", "total_version_count").Where("cloudbrain.job_id=? 
AND cloudbrain.version_name=?", jobID, versionName).Update(cb) + return +} + func UpdateJob(job *Cloudbrain) error { return updateJob(x, job) } @@ -960,16 +1046,16 @@ func updateJob(e Engine, job *Cloudbrain) error { return err } -// func UpdateTrainJob(job *CloudbrainInfo) error { -// return updateTrainJob(x, job) -// } +func UpdateTrainJobVersion(job *Cloudbrain) error { + return updateJobTrainVersion(x, job) +} -// func updateTrainJob(e Engine, job *CloudbrainInfo) error { -// var sess *xorm.Session -// sess = e.Where("job_id = ?", job.Cloudbrain.JobID) -// _, err := sess.Cols("status", "container_id", "container_ip").Update(job) -// return err -// } +func updateJobTrainVersion(e Engine, job *Cloudbrain) error { + var sess *xorm.Session + sess = e.Where("job_id = ? AND version_name=?", job.JobID, job.VersionName) + _, err := sess.Cols("status", "train_job_duration").Update(job) + return err +} func DeleteJob(job *Cloudbrain) error { return deleteJob(x, job) diff --git a/modules/auth/modelarts.go b/modules/auth/modelarts.go index e9e101523..59f72696e 100755 --- a/modules/auth/modelarts.go +++ b/modules/auth/modelarts.go @@ -19,7 +19,7 @@ type CreateModelArtsNotebookForm struct { JobName string `form:"job_name" binding:"Required"` Attachment string `form:"attachment"` Description string `form:"description"` - Flavor string `form:"flavor"` + Flavor string `form:"flavor"` } func (f *CreateModelArtsNotebookForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors { @@ -39,6 +39,10 @@ type CreateModelArtsTrainJobForm struct { IsSaveParam string `form:"is_save_para"` ParameterTemplateName string `form:"parameter_template_name"` PrameterDescription string `form:"parameter_description"` + BranchName string `form:"branch_name" binding:"Required"` + VersionName string `form:"version_name" binding:"Required"` + FlavorName string `form:"flaver_names" binding:"Required"` + EngineName string `form:"engine_names" binding:"Required"` } func (f 
*CreateModelArtsTrainJobForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors { diff --git a/modules/modelarts/modelarts.go b/modules/modelarts/modelarts.go index 3153fbc52..a64c317a8 100755 --- a/modules/modelarts/modelarts.go +++ b/modules/modelarts/modelarts.go @@ -2,6 +2,7 @@ package modelarts import ( "encoding/json" + "fmt" "path" "strconv" @@ -35,19 +36,24 @@ const ( // "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," + // "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" + // "]}" - CodePath = "/code/" - OutputPath = "/output/" - LogPath = "/log/" - JobPath = "/job/" - OrderDesc = "desc" //向下查询 - OrderAsc = "asc" //向上查询 - Lines = 20 - TrainUrl = "train_url" - DataUrl = "data_url" - PerPage = 10 - - SortByCreateTime = "create_time" - ConfigTypeCustom = "custom" + CodePath = "/code/" + OutputPath = "/output/" + LogPath = "/log/" + JobPath = "/job/" + OrderDesc = "desc" //向下查询 + OrderAsc = "asc" //向上查询 + Lines = 500 + TrainUrl = "train_url" + DataUrl = "data_url" + PerPage = 10 + IsLatestVersion = "1" + NotLatestVersion = "0" + ComputeResource = "NPU" + VersionCount = 1 + + SortByCreateTime = "create_time" + ConfigTypeCustom = "custom" + TotalVersionCount = 1 ) var ( @@ -56,19 +62,55 @@ var ( ) type GenerateTrainJobReq struct { - JobName string - Uuid string - Description string - CodeObsPath string - BootFile string - DataUrl string - TrainUrl string - FlavorCode string - LogUrl string - PoolID string - WorkServerNumber int - EngineID int64 - Parameters []models.Parameter + JobName string + Uuid string + Description string + CodeObsPath string + BootFile string + BootFileUrl string + DataUrl string + TrainUrl string + FlavorCode string + LogUrl string + PoolID string + WorkServerNumber int + EngineID int64 + Parameters []models.Parameter + CommitID string + IsLatestVersion string + Params string + BranchName string + PreVersionId int64 + 
PreVersionName string + FlavorName string + VersionCount int + EngineName string + TotalVersionCount int +} + +type GenerateTrainJobVersionReq struct { + JobName string + Uuid string + Description string + CodeObsPath string + BootFile string + BootFileUrl string + DataUrl string + TrainUrl string + FlavorCode string + LogUrl string + PoolID string + WorkServerNumber int + EngineID int64 + Parameters []models.Parameter + Params string + PreVersionId int64 + CommitID string + BranchName string + FlavorName string + EngineName string + PreVersionName string + TotalVersionCount int } type VersionInfo struct { @@ -99,6 +141,22 @@ type ResourcePool struct { } `json:"resource_pool"` } +// type Parameter struct { +// Label string `json:"label"` +// Value string `json:"value"` +// } + +// type Parameters struct { +// Parameter []Parameter `json:"parameter"` +// } + +type Parameters struct { + Parameter []struct { + Label string `json:"label"` + Value string `json:"value"` + } `json:"parameter"` +} + func GenerateTask(ctx *context.Context, jobName, uuid, description, flavor string) error { var dataActualPath string if uuid != "" { @@ -170,14 +228,14 @@ func GenerateTask(ctx *context.Context, jobName, uuid, description, flavor strin return nil } -func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error { +func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) { jobResult, err := createTrainJob(models.CreateTrainJobParams{ JobName: req.JobName, Description: req.Description, Config: models.Config{ WorkServerNum: req.WorkServerNumber, AppUrl: req.CodeObsPath, - BootFileUrl: req.BootFile, + BootFileUrl: req.BootFileUrl, DataUrl: req.DataUrl, EngineID: req.EngineID, TrainUrl: req.TrainUrl, @@ -198,21 +256,38 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error { attach, err := models.GetAttachmentByUUID(req.Uuid) if err != nil { log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 
10), err.Error()) - return nil + return err } err = models.CreateCloudbrain(&models.Cloudbrain{ - Status: TransTrainJobStatus(jobResult.Status), - UserID: ctx.User.ID, - RepoID: ctx.Repo.Repository.ID, - JobID: strconv.FormatInt(jobResult.JobID, 10), - JobName: req.JobName, - JobType: string(models.JobTypeTrain), - Type: models.TypeCloudBrainTwo, - VersionID: jobResult.VersionID, - VersionName: jobResult.VersionName, - Uuid: req.Uuid, - DatasetName: attach.Name, + Status: TransTrainJobStatus(jobResult.Status), + UserID: ctx.User.ID, + RepoID: ctx.Repo.Repository.ID, + JobID: strconv.FormatInt(jobResult.JobID, 10), + JobName: req.JobName, + JobType: string(models.JobTypeTrain), + Type: models.TypeCloudBrainTwo, + VersionID: jobResult.VersionID, + VersionName: jobResult.VersionName, + Uuid: req.Uuid, + DatasetName: attach.Name, + CommitID: req.CommitID, + IsLatestVersion: req.IsLatestVersion, + ComputeResource: ComputeResource, + EngineID: req.EngineID, + TrainUrl: req.TrainUrl, + BranchName: req.BranchName, + Parameters: req.Params, + BootFile: req.BootFile, + DataUrl: req.DataUrl, + LogUrl: req.LogUrl, + FlavorCode: req.FlavorCode, + Description: req.Description, + WorkServerNumber: req.WorkServerNumber, + FlavorName: req.FlavorName, + EngineName: req.EngineName, + VersionCount: req.VersionCount, + TotalVersionCount: req.TotalVersionCount, }) if err != nil { @@ -223,6 +298,96 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error { return nil } +func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) { + jobResult, err := createTrainJobVersion(models.CreateTrainJobVersionParams{ + Description: req.Description, + Config: models.TrainJobVersionConfig{ + WorkServerNum: req.WorkServerNumber, + AppUrl: req.CodeObsPath, + BootFileUrl: req.BootFileUrl, + DataUrl: req.DataUrl, + EngineID: req.EngineID, + TrainUrl: req.TrainUrl, + LogUrl: req.LogUrl, + PoolID: req.PoolID, + Flavor: models.Flavor{ + Code: 
req.FlavorCode, + }, + Parameter: req.Parameters, + PreVersionId: req.PreVersionId, + }, + }, jobId) + if err != nil { + log.Error("CreateJob failed: %v", err.Error()) + return err + } + + attach, err := models.GetAttachmentByUUID(req.Uuid) + if err != nil { + log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error()) + return err + } + + repo := ctx.Repo.Repository + VersionTaskList, VersionListCount, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{ + RepoID: repo.ID, + Type: models.TypeCloudBrainTwo, + JobType: string(models.JobTypeTrain), + JobID: strconv.FormatInt(jobResult.JobID, 10), + }) + if err != nil { + ctx.ServerError("Cloudbrain", err) + return err + } + //将当前版本的isLatestVersion设置为"1"和任务数量更新,任务数量包括当前版本数VersionCount和历史创建的总版本数TotalVersionCount + + err = models.CreateCloudbrain(&models.Cloudbrain{ + Status: TransTrainJobStatus(jobResult.Status), + UserID: ctx.User.ID, + RepoID: ctx.Repo.Repository.ID, + JobID: strconv.FormatInt(jobResult.JobID, 10), + JobName: req.JobName, + JobType: string(models.JobTypeTrain), + Type: models.TypeCloudBrainTwo, + VersionID: jobResult.VersionID, + VersionName: jobResult.VersionName, + Uuid: req.Uuid, + DatasetName: attach.Name, + CommitID: req.CommitID, + IsLatestVersion: req.IsLatestVersion, + PreVersionName: req.PreVersionName, + ComputeResource: ComputeResource, + EngineID: req.EngineID, + TrainUrl: req.TrainUrl, + BranchName: req.BranchName, + Parameters: req.Params, + BootFile: req.BootFile, + DataUrl: req.DataUrl, + LogUrl: req.LogUrl, + PreVersionId: req.PreVersionId, + FlavorCode: req.FlavorCode, + Description: req.Description, + WorkServerNumber: req.WorkServerNumber, + FlavorName: req.FlavorName, + EngineName: req.EngineName, + TotalVersionCount: VersionTaskList[0].TotalVersionCount + 1, + VersionCount: VersionListCount + 1, + }) + if err != nil { + log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error()) + return err + } + + 
//将训练任务的上一版本的isLatestVersion设置为"0" + err = models.SetVersionCountAndLatestVersion(strconv.FormatInt(jobResult.JobID, 10), VersionTaskList[0].VersionName, VersionCount, NotLatestVersion, TotalVersionCount) + if err != nil { + ctx.ServerError("Update IsLatestVersion failed", err) + return err + } + + return err +} + func TransTrainJobStatus(status int) string { switch status { case 0: @@ -273,6 +438,10 @@ func TransTrainJobStatus(status int) string { default: return strconv.Itoa(status) } +} - return "" +func GetVersionOutputPathByTotalVersionCount(TotalVersionCount int) (VersionOutputPath string) { + talVersionCountToString := fmt.Sprintf("%04d", TotalVersionCount) + VersionOutputPath = "V" + talVersionCountToString + return VersionOutputPath } diff --git a/modules/modelarts/resty.go b/modules/modelarts/resty.go index d17478c94..2cc9e34be 100755 --- a/modules/modelarts/resty.go +++ b/modules/modelarts/resty.go @@ -366,6 +366,16 @@ sendjob: return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) } log.Error("createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + BootFileErrorMsg := "Invalid OBS path '" + createJobParams.Config.BootFileUrl + "'." + DataSetErrorMsg := "Invalid OBS path '" + createJobParams.Config.DataUrl + "'." 
+ if temp.ErrorMsg == BootFileErrorMsg { + log.Error("启动文件错误!createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("启动文件错误!") + } + if temp.ErrorMsg == DataSetErrorMsg { + log.Error("数据集错误!createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("数据集错误!") + } return &result, fmt.Errorf("createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) } @@ -377,6 +387,61 @@ sendjob: return &result, nil } +func createTrainJobVersion(createJobVersionParams models.CreateTrainJobVersionParams, jobID string) (*models.CreateTrainJobResult, error) { + checkSetting() + client := getRestyClient() + var result models.CreateTrainJobResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetBody(createJobVersionParams). + SetResult(&result). + Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions") + + if err != nil { + return nil, fmt.Errorf("resty create train-job version: %s", err) + } + + req, _ := json.Marshal(createJobVersionParams) + log.Info("%s", req) + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + BootFileErrorMsg := "Invalid OBS path '" + createJobVersionParams.Config.BootFileUrl + "'." + DataSetErrorMsg := "Invalid OBS path '" + createJobVersionParams.Config.DataUrl + "'." 
+ if temp.ErrorMsg == BootFileErrorMsg { + log.Error("启动文件错误!createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("启动文件错误!") + } + if temp.ErrorMsg == DataSetErrorMsg { + log.Error("数据集错误!createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("数据集错误!") + } + return &result, fmt.Errorf("createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("createTrainJobVersion failed(%s): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("createTrainJobVersion failed(%s): %s", result.ErrorCode, result.ErrorMsg) + } + + return &result, nil +} + func GetResourceSpecs() (*models.GetResourceSpecsResult, error) { checkSetting() client := getRestyClient() @@ -768,3 +833,44 @@ sendjob: return &result, nil } + +func DelTrainJobVersion(jobID string, versionID string) (*models.TrainJobResult, error) { + checkSetting() + client := getRestyClient() + var result models.TrainJobResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetAuthToken(TOKEN). + SetResult(&result). 
+ Delete(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions/" + versionID) + + if err != nil { + return &result, fmt.Errorf("resty DelTrainJobVersion: %v", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if res.StatusCode() != http.StatusOK { + var temp models.ErrorResult + if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { + log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) + } + log.Error("DelTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + return &result, fmt.Errorf("删除训练作业版本失败(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) + } + + if !result.IsSuccess { + log.Error("DelTrainJob(%s) failed", jobID) + return &result, fmt.Errorf("删除训练作业版本失败:%s", result.ErrorMsg) + } + + return &result, nil +} diff --git a/modules/templates/helper.go b/modules/templates/helper.go index c399cc289..da278ba32 100755 --- a/modules/templates/helper.go +++ b/modules/templates/helper.go @@ -92,6 +92,7 @@ func NewFuncMap() []template.FuncMap { "Str2html": Str2html, "TimeSince": timeutil.TimeSince, "TimeSinceUnix": timeutil.TimeSinceUnix, + "TimeSinceUnix1": timeutil.TimeSinceUnix1, "RawTimeSince": timeutil.RawTimeSince, "FileSize": base.FileSize, "PrettyNumber": base.PrettyNumber, @@ -340,6 +341,7 @@ func NewTextFuncMap() []texttmpl.FuncMap { }, "TimeSince": timeutil.TimeSince, "TimeSinceUnix": timeutil.TimeSinceUnix, + "TimeSinceUnix1": timeutil.TimeSinceUnix1, "RawTimeSince": timeutil.RawTimeSince, "DateFmtLong": func(t time.Time) string { return t.Format(time.RFC1123Z) diff --git a/modules/timeutil/since.go b/modules/timeutil/since.go index e6c29c19f..a7854ed91 100755 --- a/modules/timeutil/since.go +++ b/modules/timeutil/since.go @@ -162,3 +162,8 @@ func htmlTimeSinceUnix(then, now TimeStamp, lang string) template.HTML 
{ then.FormatInLocation(GetTimeFormat(lang), setting.DefaultUILocation), timeSinceUnix(int64(then), int64(now), lang))) } +func TimeSinceUnix1(then TimeStamp) string { + format := time.Unix(int64(then), 0).Format("2006-01-02 15:04:05") + return format + +} diff --git a/options/locale/locale_en-US.ini b/options/locale/locale_en-US.ini index b54ca18f2..440147d58 100644 --- a/options/locale/locale_en-US.ini +++ b/options/locale/locale_en-US.ini @@ -823,7 +823,11 @@ modelarts.train_job.new_train=New Train Task modelarts.train_job.config=Configuration information modelarts.train_job.new=New train Task modelarts.train_job.new_place=The description should not exceed 256 characters - +modelarts.modify=Modify +modelarts.current_version=Current version +modelarts.parent_version=Parent Version +modelarts.run_version=Run Version +modelarts.train_job.compute_node=Compute Node modelarts.train_job.basic_info=Basic Info @@ -844,6 +848,8 @@ modelarts.train_job.AI_driver=AI Engine modelarts.train_job.start_file=Start File modelarts.train_job.boot_file_helper=The startup file is the entry file that your program executes, and it must be a file ending in .py modelarts.train_job.dataset=Dataset +modelarts.code_version = Code Version +modelarts.parents_version = Parents Version modelarts.train_job.run_parameter=Run Parameter modelarts.train_job.add_run_parameter=Add Run Parameter modelarts.train_job.parameter_name=Parameter Name diff --git a/options/locale/locale_zh-CN.ini b/options/locale/locale_zh-CN.ini index 7da672c0d..c2e0aa891 100755 --- a/options/locale/locale_zh-CN.ini +++ b/options/locale/locale_zh-CN.ini @@ -816,9 +816,11 @@ total_count_get_error=查询总页数失败。 last_update_time_error=查询最新更新时间失败。 get_repo_stat_error=查询当前仓库的统计信息失败。 get_repo_info_error=查询当前仓库信息失败。 -generate_statistic_file_error=生成文件失败。 -repo_stat_inspect=项目分析 -all=所有 + +modelarts.status=状态 +modelarts.createtime=创建时间 +modelarts.version_nums=版本数 +modelarts.computing_resources=计算资源 modelarts.notebook=调试任务 
modelarts.train_job=训练任务 modelarts.train_job.new_debug=新建调试任务 @@ -826,6 +828,10 @@ modelarts.train_job.new_train=新建训练任务 modelarts.train_job.config=配置信息 modelarts.train_job.new=新建训练任务 modelarts.train_job.new_place=描述字数不超过256个字符 +modelarts.modify=修改 +modelarts.current_version=当前版本 +modelarts.parent_version=父版本 +modelarts.run_version=运行版本 @@ -845,9 +851,14 @@ modelarts.train_job.frames=常用框架 modelarts.train_job.algorithm_origin=算法来源 modelarts.train_job.AI_driver=AI引擎 modelarts.train_job.start_file=启动文件 -modelarts.train_job.boot_file_helper=启动文件是您程序执行的入口文件,必须是以.py结尾的文件。 +modelarts.train_job.boot_file_helper=启动文件是您程序执行的入口文件,必须是以.py结尾的文件。比如train.py、main.py、example/train.py、case/main.py。 modelarts.train_job.boot_file_place=填写启动文件路径,默认为train.py modelarts.train_job.dataset=数据集 +modelarts.code_version=代码分支 +modelarts.parents_version=基于版本 +modelarts.train_job.compute_node=计算节点 +modelarts.train_job.train_dataset=训练数据集 + modelarts.train_job.run_parameter=运行参数 modelarts.train_job.add_run_parameter=增加运行参数 modelarts.train_job.parameter_name=参数名 diff --git a/routers/api/v1/api.go b/routers/api/v1/api.go index 02bfc6731..b20713bca 100755 --- a/routers/api/v1/api.go +++ b/routers/api/v1/api.go @@ -874,8 +874,12 @@ func RegisterRoutes(m *macaron.Macaron) { }) m.Group("/train-job", func() { m.Group("/:jobid", func() { - m.Get("", repo.GetModelArtsTrainJob) + m.Get("", repo.GetModelArtsTrainJobVersion) m.Get("/log", repo.TrainJobGetLog) + m.Post("/del_version", repo.DelTrainJobVersion) + m.Post("/stop_version", repo.StopTrainJobVersion) + m.Get("/model_list", repo.ModelList) + m.Get("/model_download", repo.ModelDownload) }) }) }, reqRepoReader(models.UnitTypeCloudBrain)) diff --git a/routers/api/v1/repo/modelarts.go b/routers/api/v1/repo/modelarts.go index 2e825d8cc..c53e62efa 100755 --- a/routers/api/v1/repo/modelarts.go +++ b/routers/api/v1/repo/modelarts.go @@ -6,12 +6,15 @@ package repo import ( + "net/http" + "strconv" + "strings" + "code.gitea.io/gitea/models" 
"code.gitea.io/gitea/modules/context" "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/modelarts" - "net/http" - "strconv" + "code.gitea.io/gitea/modules/storage" ) func GetModelArtsNotebook(ctx *context.APIContext) { @@ -72,56 +75,274 @@ func GetModelArtsTrainJob(ctx *context.APIContext) { } ctx.JSON(http.StatusOK, map[string]interface{}{ - "JobID": jobID, - "JobStatus": job.Status, - "JobDuration": job.Duration, + "JobID": jobID, + "JobStatus": job.Status, + "JobDuration": job.Duration, }) } -func TrainJobGetLog(ctx *context.APIContext) { +func GetModelArtsTrainJobVersion(ctx *context.APIContext) { var ( err error ) - log.Info("test") + jobID := ctx.Params(":jobid") + versionName := ctx.Query("version_name") + job, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName) + if err != nil { + ctx.NotFound(err) + return + } + result, err := modelarts.GetTrainJob(jobID, strconv.FormatInt(job.VersionID, 10)) + if err != nil { + ctx.NotFound(err) + return + } + + job.Status = modelarts.TransTrainJobStatus(result.IntStatus) + job.Duration = result.Duration + job.TrainJobDuration = result.TrainJobDuration + + if result.Duration != 0 { + job.TrainJobDuration = addZero(result.Duration/3600000) + ":" + addZero(result.Duration%3600000/60000) + ":" + addZero(result.Duration%60000/1000) + + } else { + job.TrainJobDuration = "00:00:00" + } + + err = models.UpdateTrainJobVersion(job) + if err != nil { + log.Error("UpdateJob failed:", err) + } + + ctx.JSON(http.StatusOK, map[string]interface{}{ + "JobID": jobID, + "JobStatus": job.Status, + "JobDuration": job.TrainJobDuration, + }) + +} + +func addZero(t int64) (m string) { + if t < 10 { + m = "0" + strconv.FormatInt(t, 10) + return m + } else { + return strconv.FormatInt(t, 10) + } +} + +func TrainJobGetLog(ctx *context.APIContext) { + var ( + err error + ) var jobID = ctx.Params(":jobid") - var logFileName = ctx.Query("file_name") + var versionName = ctx.Query("version_name") + // var logFileName = 
ctx.Query("file_name") var baseLine = ctx.Query("base_line") var order = ctx.Query("order") + var lines = ctx.Query("lines") + lines_int, err := strconv.Atoi(lines) + if err != nil { + log.Error("change lines(%d) string to int failed", lines_int) + } if order != modelarts.OrderDesc && order != modelarts.OrderAsc { log.Error("order(%s) check failed", order) ctx.JSON(http.StatusBadRequest, map[string]interface{}{ - "err_msg": "order check failed", + "err_msg": "order check failed", }) return } - task, err := models.GetCloudbrainByJobID(jobID) + resultLogFile, result, err := trainJobGetLogContent(jobID, versionName, baseLine, order, lines_int) if err != nil { - log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error()) - ctx.JSON(http.StatusInternalServerError, map[string]interface{}{ - "err_msg": "GetCloudbrainByJobID failed", - }) + log.Error("trainJobGetLog(%s) failed:%v", jobID, err.Error()) + // ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil) return } - result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), baseLine, logFileName, order, modelarts.Lines) + ctx.Data["log_file_name"] = resultLogFile.LogFileList[0] + + ctx.JSON(http.StatusOK, map[string]interface{}{ + "JobID": jobID, + "LogFileName": resultLogFile.LogFileList[0], + "StartLine": result.StartLine, + "EndLine": result.EndLine, + "Content": result.Content, + "Lines": result.Lines, + }) +} + +func trainJobGetLogContent(jobID string, versionName string, baseLine string, order string, lines int) (*models.GetTrainJobLogFileNamesResult, *models.GetTrainJobLogResult, error) { + task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName) + if err != nil { + log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error()) + return nil, nil, err + } + + resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, strconv.FormatInt(task.VersionID, 10)) + if err != nil { + log.Error("GetTrainJobLogFileNames(%s) failed:%v", jobID, err.Error()) 
+ return nil, nil, err + } + + result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), baseLine, resultLogFile.LogFileList[0], order, lines) if err != nil { log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error()) - ctx.JSON(http.StatusInternalServerError, map[string]interface{}{ - "err_msg": "GetTrainJobLog failed", - }) + return nil, nil, err + } + + return resultLogFile, result, err +} + +func DelTrainJobVersion(ctx *context.APIContext) { + var ( + err error + ) + + var jobID = ctx.Params(":jobid") + var versionName = ctx.Query("version_name") + task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName) + if err != nil { + log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error()) + ctx.NotFound(err) return } + //删除modelarts上的记录 + _, err = modelarts.DelTrainJobVersion(jobID, strconv.FormatInt(task.VersionID, 10)) + if err != nil { + log.Error("DelTrainJobVersion(%s) failed:%v", task.JobName, err.Error()) + ctx.NotFound(err) + return + } + + //删除数据库记录 + err = models.DeleteJob(task) + if err != nil { + ctx.ServerError("DeleteJob failed", err) + ctx.NotFound(err) + return + } + + //获取删除后的版本数量 + repo := ctx.Repo.Repository + VersionTaskList, VersionListCount, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{ + RepoID: repo.ID, + Type: models.TypeCloudBrainTwo, + JobType: string(models.JobTypeTrain), + JobID: jobID, + }) + if err != nil { + ctx.ServerError("get VersionListCount faild", err) + return + } + + // 判断当前删掉的任务是否是最新版本,若是,将排序后的TotalVersionCount置为删掉的最新版本的TotalVersionCount,若不是,按时间排序后的版本列表的第一个版本设置为最新版本,TotalVersionCount不变 + if task.IsLatestVersion == modelarts.IsLatestVersion { + err = models.SetVersionCountAndLatestVersion(jobID, VersionTaskList[0].Cloudbrain.VersionName, VersionListCount, modelarts.IsLatestVersion, task.TotalVersionCount) + if err != nil { + ctx.ServerError("UpdateJobVersionCount failed", err) + return + } + } else { + err = 
models.SetVersionCountAndLatestVersion(jobID, VersionTaskList[0].VersionName, VersionListCount, modelarts.IsLatestVersion, VersionTaskList[0].Cloudbrain.TotalVersionCount) + if err != nil { + ctx.ServerError("UpdateJobVersionCount failed", err) + return + } + } + ctx.JSON(http.StatusOK, map[string]interface{}{ - "JobID": jobID, - "StartLine": result.StartLine, - "EndLine": result.EndLine, - "Content": result.Content, - "Lines": result.Lines, + "JobID": jobID, + "VersionName": versionName, + "StatusOK": 0, }) } + +func StopTrainJobVersion(ctx *context.APIContext) { + var ( + err error + ) + var jobID = ctx.Params(":jobid") + var versionName = ctx.Query("version_name") + task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName) + if err != nil { + log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error()) + return + } + + _, err = modelarts.StopTrainJob(jobID, strconv.FormatInt(task.VersionID, 10)) + if err != nil { + log.Error("StopTrainJob(%s) failed:%v", task.JobName, err.Error()) + return + } + + ctx.JSON(http.StatusOK, map[string]interface{}{ + "JobID": jobID, + "VersionName": versionName, + "StatusOK": 0, + }) +} + +func ModelList(ctx *context.APIContext) { + var ( + err error + ) + + var jobID = ctx.Params(":jobid") + var versionName = ctx.Query("version_name") + parentDir := ctx.Query("parentDir") + dirArray := strings.Split(parentDir, "/") + task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName) + if err != nil { + log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error()) + return + } + VersionOutputPath := modelarts.GetVersionOutputPathByTotalVersionCount(task.TotalVersionCount) + parentDir = VersionOutputPath + "/" + parentDir + models, err := storage.GetObsListObject(task.JobName, parentDir) + if err != nil { + log.Info("get TrainJobListModel failed:", err) + ctx.ServerError("GetObsListObject:", err) + return + } + + ctx.JSON(http.StatusOK, map[string]interface{}{ + "JobID": jobID, + 
"VersionName": versionName, + "StatusOK": 0, + "Path": dirArray, + "Dirs": models, + "task": task, + "PageIsCloudBrain": true, + }) +} + +func ModelDownload(ctx *context.APIContext) { + var ( + err error + ) + + var jobID = ctx.Params(":jobid") + versionName := ctx.Query("version_name") + parentDir := ctx.Query("parent_dir") + fileName := ctx.Query("file_name") + task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName) + if err != nil { + log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error()) + return + } + VersionOutputPath := modelarts.GetVersionOutputPathByTotalVersionCount(task.TotalVersionCount) + parentDir = VersionOutputPath + "/" + parentDir + url, err := storage.GetObsCreateSignedUrl(task.JobName, parentDir, fileName) + if err != nil { + log.Error("GetObsCreateSignedUrl failed: %v", err.Error(), ctx.Data["msgID"]) + ctx.ServerError("GetObsCreateSignedUrl", err) + return + } + http.Redirect(ctx.Resp, ctx.Req.Request, url, http.StatusMovedPermanently) +} diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index aec6a024f..a996524d2 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -34,7 +34,7 @@ const ( tplModelArtsTrainJobIndex base.TplName = "repo/modelarts/trainjob/index" tplModelArtsTrainJobNew base.TplName = "repo/modelarts/trainjob/new" tplModelArtsTrainJobShow base.TplName = "repo/modelarts/trainjob/show" - tplModelArtsTrainJobShowModels base.TplName = "repo/modelarts/trainjob/models/index" + tplModelArtsTrainJobVersionNew base.TplName = "repo/modelarts/trainjob/version_new" ) // MustEnableDataset check if repository enable internal cb @@ -58,8 +58,8 @@ func NotebookIndex(ctx *context.Context) { Page: page, PageSize: setting.UI.IssuePagingNum, }, - RepoID: repo.ID, - Type: models.TypeCloudBrainTwo, + RepoID: repo.ID, + Type: models.TypeCloudBrainTwo, JobType: string(models.JobTypeDebug), }) if err != nil { @@ -266,14 +266,6 @@ func NotebookDel(ctx *context.Context) { func 
TrainJobIndex(ctx *context.Context) { MustEnableModelArts(ctx) - //can, err := canUserCreateTrainJob(ctx.User.ID) - //if err != nil { - // ctx.ServerError("canUserCreateTrainJob", err) - // return - //} - // - //ctx.Data["CanCreate"] = can - repo := ctx.Repo.Repository page := ctx.QueryInt("page") if page <= 0 { @@ -285,9 +277,10 @@ func TrainJobIndex(ctx *context.Context) { Page: page, PageSize: setting.UI.IssuePagingNum, }, - RepoID: repo.ID, - Type: models.TypeCloudBrainTwo, - JobType: string(models.JobTypeTrain), + RepoID: repo.ID, + Type: models.TypeCloudBrainTwo, + JobType: string(models.JobTypeTrain), + IsLatestVersion: modelarts.IsLatestVersion, }) if err != nil { ctx.ServerError("Cloudbrain", err) @@ -369,12 +362,116 @@ func trainJobNewDataPrepare(ctx *context.Context) error { outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath ctx.Data["train_url"] = outputObsPath + Branches, err := ctx.Repo.GitRepo.GetBranches() + if err != nil { + ctx.ServerError("GetBranches error:", err) + return err + } + ctx.Data["Branches"] = Branches + ctx.Data["BranchesCount"] = len(Branches) + configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom) if err != nil { ctx.ServerError("getConfigList failed:", err) return err } + ctx.Data["config_list"] = configList.ParaConfigs + + return nil +} + +func TrainJobNewVersion(ctx *context.Context) { + err := trainJobNewVersionDataPrepare(ctx) + if err != nil { + ctx.ServerError("get new train-job info failed", err) + return + } + ctx.HTML(200, tplModelArtsTrainJobVersionNew) +} + +func trainJobNewVersionDataPrepare(ctx *context.Context) error { + ctx.Data["PageIsCloudBrain"] = true + var jobID = ctx.Params(":jobid") + // var versionName = ctx.Params(":version-name") + var versionName = ctx.Query("version_name") + + task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName) + if err != nil { + 
log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error()) + return err + } + + t := time.Now() + var jobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:] + ctx.Data["job_name"] = task.JobName + + attachs, err := models.GetModelArtsUserAttachments(ctx.User.ID) + if err != nil { + ctx.ServerError("GetAllUserAttachments failed:", err) + return err + } + ctx.Data["attachments"] = attachs + + var resourcePools modelarts.ResourcePool + if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil { + ctx.ServerError("json.Unmarshal failed:", err) + return err + } + ctx.Data["resource_pools"] = resourcePools.Info + + var engines modelarts.Engine + if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil { + ctx.ServerError("json.Unmarshal failed:", err) + return err + } + ctx.Data["engines"] = engines.Info + + var versionInfos modelarts.VersionInfo + if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil { + ctx.ServerError("json.Unmarshal failed:", err) + return err + } + ctx.Data["engine_versions"] = versionInfos.Version + + var flavorInfos modelarts.Flavor + if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil { + ctx.ServerError("json.Unmarshal failed:", err) + return err + } + ctx.Data["flavor_infos"] = flavorInfos.Info + + var Parameters modelarts.Parameters + if err = json.Unmarshal([]byte(task.Parameters), &Parameters); err != nil { + ctx.ServerError("json.Unmarshal failed:", err) + return err + } + ctx.Data["params"] = Parameters.Parameter + + outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + ctx.Data["train_url"] = outputObsPath + + Branches, err := ctx.Repo.GitRepo.GetBranches() + if err != nil { + ctx.ServerError("GetBranches error:", err) + return err + } + ctx.Data["branches"] = Branches + ctx.Data["branch_name"] = task.BranchName + 
ctx.Data["description"] = task.Description + ctx.Data["boot_file"] = task.BootFile + ctx.Data["dataset_name"] = task.DatasetName + ctx.Data["work_server_number"] = task.WorkServerNumber + ctx.Data["flavor_name"] = task.FlavorName + ctx.Data["engine_name"] = task.EngineName + ctx.Data["uuid"] = task.Uuid + ctx.Data["flavor_code"] = task.FlavorCode + ctx.Data["engine_id"] = task.EngineID + configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom) + if err != nil { + ctx.ServerError("getConfigList failed:", err) + return err + } ctx.Data["config_list"] = configList.ParaConfigs return nil @@ -382,6 +479,7 @@ func trainJobNewDataPrepare(ctx *context.Context) error { func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) { ctx.Data["PageIsTrainJob"] = true + VersionOutputPath := modelarts.GetVersionOutputPathByTotalVersionCount(modelarts.TotalVersionCount) jobName := form.JobName uuid := form.Attachment description := form.Description @@ -395,23 +493,15 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) repo := ctx.Repo.Repository codeLocalPath := setting.JobPath + jobName + modelarts.CodePath codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath - outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath - logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + VersionOutputPath + "/" + logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/" dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/" + branch_name := form.BranchName + isLatestVersion := modelarts.IsLatestVersion + FlavorName := form.FlavorName + VersionCount := modelarts.VersionCount 
+ EngineName := form.EngineName - //can, err := canUserCreateTrainJob(ctx.User.ID) - //if err != nil { - // ctx.ServerError("canUserCreateTrainJob", err) - // return - //} - // - //if !can { - // log.Error("the user can not create train-job") - // ctx.RenderWithErr("the user can not create train-job", tplModelArtsTrainJobNew, &form) - // return - //} - - //param check if err := paramCheckCreateTrainJob(form); err != nil { log.Error("paramCheckCreateTrainJob failed:(%v)", err) trainJobNewDataPrepare(ctx) @@ -430,30 +520,35 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) if err == nil { os.RemoveAll(codeLocalPath) } - if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{}); err != nil { - log.Error("创建任务失败,任务名称已存在!: %s (%v)", repo.FullName(), err) + + gitRepo, _ := git.OpenRepository(repo.RepoPath()) + commitID, _ := gitRepo.GetBranchCommitID(branch_name) + + if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{ + Branch: branch_name, + }); err != nil { + log.Error("创建任务失败,服务器超时!: %s (%v)", repo.FullName(), err) trainJobNewDataPrepare(ctx) ctx.Data["bootFile"] = form.BootFile ctx.Data["uuid"] = form.Attachment ctx.Data["datasetName"] = attach.Name ctx.Data["params"] = form.Params + ctx.Data["branch_name"] = branch_name trainJobNewDataPrepare(ctx) - // ctx.RenderWithErr("Failed to clone repository", tplModelArtsTrainJobNew, &form) - ctx.RenderWithErr("创建任务失败,任务名称已存在!", tplModelArtsTrainJobNew, &form) - // ctx.RenderWithErr(err, tplModelArtsTrainJobNew, &form) + ctx.RenderWithErr("创建任务失败,服务器超时!", tplModelArtsTrainJobNew, &form) return } //todo: upload code (send to file_server todo this work?) 
- if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil { + if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + VersionOutputPath + "/"); err != nil { log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err) trainJobNewDataPrepare(ctx) ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobNew, &form) return } - if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath); err != nil { + if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil { log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err) trainJobNewDataPrepare(ctx) ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobNew, &form) @@ -532,19 +627,28 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) } req := &modelarts.GenerateTrainJobReq{ - JobName: jobName, - DataUrl: dataPath, - Description: description, - CodeObsPath: codeObsPath, - BootFile: codeObsPath + bootFile, - TrainUrl: outputObsPath, - FlavorCode: flavorCode, - WorkServerNumber: workServerNumber, - EngineID: int64(engineID), - LogUrl: logObsPath, - PoolID: poolID, - Uuid: uuid, - Parameters: param, + JobName: jobName, + DataUrl: dataPath, + Description: description, + CodeObsPath: codeObsPath, + BootFileUrl: codeObsPath + bootFile, + BootFile: bootFile, + TrainUrl: outputObsPath, + FlavorCode: flavorCode, + WorkServerNumber: workServerNumber, + EngineID: int64(engineID), + LogUrl: logObsPath, + PoolID: poolID, + Uuid: uuid, + Parameters: parameters.Parameter, + CommitID: commitID, + IsLatestVersion: isLatestVersion, + BranchName: branch_name, + Params: form.Params, + FlavorName: FlavorName, + EngineName: EngineName, + VersionCount: VersionCount, + TotalVersionCount: modelarts.TotalVersionCount, } err = modelarts.GenerateTrainJob(ctx, req) @@ -555,12 +659,221 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) 
ctx.Data["uuid"] = form.Attachment
 		ctx.Data["datasetName"] = attach.Name
 		ctx.Data["params"] = form.Params
+		ctx.Data["branch_name"] = branch_name
 		ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
 		return
 	}
 	ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
 }

+// TrainJobCreateVersion handles the form post that creates a new version of
+// an existing ModelArts train job.
+//
+// It looks up the latest version of the job, clones the selected branch into
+// the local job directory, creates the per-version output and log directories
+// on OBS, uploads the code, optionally saves the run parameters as a ModelArts
+// job configuration, and finally calls modelarts.GenerateTrainJobVersion.
+// On success it redirects to the train-job list; on failure it re-renders the
+// version form with an error message.
+func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) {
+	ctx.Data["PageIsTrainJob"] = true
+	jobID := ctx.Params(":jobid")
+
+	// renderErr re-populates the page data and re-renders the form with msg.
+	renderErr := func(msg string) {
+		trainJobNewVersionDataPrepare(ctx)
+		ctx.RenderWithErr(msg, tplModelArtsTrainJobVersionNew, &form)
+	}
+
+	latestTask, err := models.GetCloudbrainByJobIDAndIsLatestVersion(jobID, modelarts.IsLatestVersion)
+	if err != nil {
+		ctx.ServerError("GetCloudbrainByJobIDAndIsLatestVersion faild:", err)
+		return
+	}
+	totalVersionCount := latestTask.TotalVersionCount + 1
+	versionOutputPath := modelarts.GetVersionOutputPathByTotalVersionCount(totalVersionCount)
+
+	// Validate the form before deriving anything from its fields.
+	if err := paramCheckCreateTrainJob(form); err != nil {
+		log.Error("paramCheckCreateTrainJob failed:(%v)", err)
+		renderErr(err.Error())
+		return
+	}
+
+	uuid := form.Attachment
+	// dataPath below slices uuid[0:1] and uuid[1:2]; a shorter value would panic.
+	if len(uuid) < 2 {
+		log.Error("invalid dataset uuid: %q", uuid)
+		renderErr("invalid dataset uuid")
+		return
+	}
+
+	attach, err := models.GetAttachmentByUUID(uuid)
+	if err != nil {
+		log.Error("GetAttachmentByUUID(%s) failed:%v", uuid, err.Error())
+		// Always answer the request; returning silently leaves an empty response.
+		renderErr(err.Error())
+		return
+	}
+
+	jobName := form.JobName
+	description := form.Description
+	workServerNumber := form.WorkServerNumber
+	engineID := form.EngineID
+	bootFile := form.BootFile
+	flavorCode := form.Flavor
+	params := form.Params
+	poolID := form.PoolID
+	isSaveParam := form.IsSaveParam
+	branchName := form.BranchName
+	preVersionName := form.VersionName
+	flavorName := form.FlavorName
+	engineName := form.EngineName
+	isLatestVersion := modelarts.IsLatestVersion
+	repo := ctx.Repo.Repository
+
+	codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
+	codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
+	outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + versionOutputPath + "/"
+	logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + versionOutputPath + "/"
+	dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
+
+	// renderFormErr is renderErr plus the submitted form values in ctx.Data.
+	renderFormErr := func(msg string) {
+		trainJobNewVersionDataPrepare(ctx)
+		ctx.Data["bootFile"] = form.BootFile
+		ctx.Data["uuid"] = form.Attachment
+		ctx.Data["datasetName"] = attach.Name
+		ctx.Data["params"] = form.Params
+		ctx.Data["branch_name"] = branchName
+		ctx.RenderWithErr(msg, tplModelArtsTrainJobVersionNew, &form)
+	}
+
+	// Remove a stale local checkout if one exists. The probe error is kept in
+	// its own variable: a missing directory is the normal case and must not
+	// leak into the function-level err.
+	//todo: del the codeLocalPath
+	if _, statErr := ioutil.ReadDir(codeLocalPath); statErr == nil {
+		if rmErr := os.RemoveAll(codeLocalPath); rmErr != nil {
+			// Best effort: the clone below reports the failure if the path is unusable.
+			log.Error("RemoveAll(%s) failed: %v", codeLocalPath, rmErr)
+		}
+	}
+
+	gitRepo, err := git.OpenRepository(repo.RepoPath())
+	if err != nil {
+		log.Error("OpenRepository(%s) failed: %v", repo.FullName(), err)
+		renderErr(err.Error())
+		return
+	}
+	commitID, err := gitRepo.GetBranchCommitID(branchName)
+	if err != nil {
+		log.Error("GetBranchCommitID(%s) failed: %v", branchName, err)
+		renderErr(err.Error())
+		return
+	}
+	if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{
+		Branch: branchName,
+	}); err != nil {
+		log.Error("创建任务失败,任务名称已存在!: %s (%v)", repo.FullName(), err)
+		renderFormErr("创建任务失败,任务名称已存在!")
+		return
+	}
+
+	//todo: upload code (send to file_server todo this work?)
+	if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath + versionOutputPath + "/"); err != nil {
+		log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
+		renderErr("Failed to obsMkdir_output")
+		return
+	}
+
+	if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + versionOutputPath + "/"); err != nil {
+		log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
+		renderErr("Failed to obsMkdir_log")
+		return
+	}
+
+	if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
+		log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
+		renderErr("Failed to uploadCodeToObs")
+		return
+	}
+
+	//todo: del local code?
+
+	// param starts with the train/data URLs and then takes every user
+	// parameter that does not collide with those two labels.
+	// NOTE(review): param is built but never sent; the requests below pass
+	// parameters.Parameter. Confirm which of the two ModelArts should receive.
+	var parameters models.Parameters
+	param := make([]models.Parameter, 0)
+	param = append(param, models.Parameter{
+		Label: modelarts.TrainUrl,
+		Value: outputObsPath,
+	}, models.Parameter{
+		Label: modelarts.DataUrl,
+		Value: dataPath,
+	})
+	if len(params) != 0 {
+		if err := json.Unmarshal([]byte(params), &parameters); err != nil {
+			log.Error("Failed to Unmarshal params: %s (%v)", params, err)
+			renderErr("运行参数错误")
+			return
+		}
+
+		for _, parameter := range parameters.Parameter {
+			if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
+				param = append(param, models.Parameter{
+					Label: parameter.Label,
+					Value: parameter.Value,
+				})
+			}
+		}
+	}
+
+	//save param config
+	if isSaveParam == "on" {
+		if form.ParameterTemplateName == "" {
+			log.Error("ParameterTemplateName is empty")
+			renderErr("保存作业参数时,作业参数名称不能为空")
+			return
+		}
+
+		if _, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{
+			ConfigName:  form.ParameterTemplateName,
+			Description: form.PrameterDescription,
+			DataUrl:     dataPath,
+			AppUrl:      codeObsPath,
+			BootFileUrl: codeObsPath + bootFile,
+			TrainUrl:    outputObsPath,
+			Flavor: models.Flavor{
+				Code: flavorCode,
+			},
+			WorkServerNum: workServerNumber,
+			EngineID:      int64(engineID),
+			LogUrl:        logObsPath,
+			PoolID:        poolID,
+			Parameter:     parameters.Parameter,
+		}); err != nil {
+			log.Error("Failed to CreateTrainJobConfig: %v", err)
+			renderErr("保存作业参数失败:" + err.Error())
+			return
+		}
+	}
+
+	// The version the user started from becomes the parent of the new one.
+	task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, preVersionName)
+	if err != nil {
+		log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
+		renderErr(err.Error())
+		return
+	}
+	req := &modelarts.GenerateTrainJobReq{
+		JobName:           task.JobName,
+		DataUrl:           dataPath,
+		Description:       description,
+		CodeObsPath:       codeObsPath,
+		BootFileUrl:       codeObsPath + bootFile,
+		BootFile:          bootFile,
+		TrainUrl:          outputObsPath,
+		FlavorCode:        flavorCode,
+		WorkServerNumber:  workServerNumber,
+		IsLatestVersion:   isLatestVersion,
+		EngineID:          int64(engineID),
+		LogUrl:            logObsPath,
+		PoolID:            poolID,
+		Uuid:              uuid,
+		Params:            form.Params,
+		Parameters:        parameters.Parameter,
+		PreVersionId:      task.VersionID,
+		CommitID:          commitID,
+		BranchName:        branchName,
+		FlavorName:        flavorName,
+		EngineName:        engineName,
+		PreVersionName:    preVersionName,
+		TotalVersionCount: totalVersionCount,
+	}
+
+	if err := modelarts.GenerateTrainJobVersion(ctx, req, jobID); err != nil {
+		log.Error("GenerateTrainJob failed:%v", err.Error())
+		renderFormErr(err.Error())
+		return
+	}
+	ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
+}
+
 // readDir reads the directory named by dirname and returns
 // a list of directory entries sorted by filename.
func readDir(dirname string) ([]os.FileInfo, error) {
@@ -652,69 +965,59 @@ func TrainJobShow(ctx *context.Context) {
 	ctx.Data["PageIsCloudBrain"] = true

 	var jobID = ctx.Params(":jobid")
-	task, err := models.GetCloudbrainByJobID(jobID)
-	if err != nil {
-		log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
-		ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
-		return
-	}
-	// attach, err := models.GetAttachmentByUUID(task.Uuid)
-	// if err != nil {
-	// 	log.Error("GetAttachmentByUUID(%s) failed:%v", jobID, err.Error())
-	// 	ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
-	// 	return
-	// }
+	repo := ctx.Repo.Repository
+	page := ctx.QueryInt("page")
+	if page <= 0 {
+		page = 1
+	}

+	// List the versions of this train job that belong to the current repository.
+	VersionListTasks, VersionListCount, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
+		ListOptions: models.ListOptions{
+			Page:     page,
+			PageSize: setting.UI.IssuePagingNum,
+		},
+		RepoID:  repo.ID,
+		Type:    models.TypeCloudBrainTwo,
+		JobType: string(models.JobTypeTrain),
+		JobID:   jobID,
+	})
-	result, err := modelarts.GetTrainJob(jobID, strconv.FormatInt(task.VersionID, 10))
 	if err != nil {
-		log.Error("GetJob(%s) failed:%v", jobID, err.Error())
+		log.Error("GetVersionListTasks(%s) failed:%v", jobID, err.Error())
 		ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
 		return
 	}

+	// An unknown job ID or an out-of-range page yields an empty list;
+	// VersionListTasks[0] further down would panic on it.
+	if len(VersionListTasks) == 0 {
+		log.Error("GetVersionListTasks(%s) returned no versions", jobID)
+		ctx.RenderWithErr("train job not found", tplModelArtsTrainJobShow, nil)
+		return
+	}
+
+	// Convert the stored run parameters into the display form
+	// "epoch_size = 3, device_target = Ascend".
+	for i := range VersionListTasks {
-	if result != nil {
-		result.CreateTime = time.Unix(int64(result.LongCreateTime/1000), 0).Format("2006-01-02 15:04:05")
-		if result.Duration != 0 {
-			result.TrainJobDuration = addZero(result.Duration/3600000) + ":" + addZero(result.Duration%3600000/60000) + ":" + addZero(result.Duration%60000/1000)
+		// A version saved without run parameters has nothing to convert, and
+		// an empty string is not valid JSON.
+		if VersionListTasks[i].Parameters == "" {
+			continue
+		}
+		var parameters models.Parameters

-		} else {
-			result.TrainJobDuration = "00:00:00"
-		}
-		result.Status = modelarts.TransTrainJobStatus(result.IntStatus)
-		err = models.SetTrainJobStatusByJobID(jobID, result.Status, result.Duration, string(result.TrainJobDuration))
+		err := json.Unmarshal([]byte(VersionListTasks[i].Parameters), &parameters)
 		if err != nil {
-			ctx.ServerError("UpdateJob failed", err)
+			log.Error("Failed to Unmarshal Parameters: %s (%v)", VersionListTasks[i].Parameters, err)
+			// Answer the request instead of returning with nothing written.
+			ctx.ServerError("Unmarshal Parameters failed", err)
 			return
 		}
-		result.DatasetName = task.DatasetName
-	}
-
-	resultLogFile, resultLog, err := trainJobGetLog(jobID)
-	if err != nil {
-		log.Error("trainJobGetLog(%s) failed:%v", jobID, err.Error())
-		ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
-		return
+		if len(parameters.Parameter) > 0 {
+			paramTemp := ""
+			for _, parameter := range parameters.Parameter {
+				paramTemp += parameter.Label + " = " + parameter.Value + ", "
+			}
+			// Drop the trailing ", " separator.
+			VersionListTasks[i].Parameters = paramTemp[:len(paramTemp)-2]
+		} else {
+			VersionListTasks[i].Parameters = ""
+		}
 	}

-	ctx.Data["log_file_name"] = resultLogFile.LogFileList[0]
-	ctx.Data["log"] = resultLog
-	ctx.Data["task"] = task
 	ctx.Data["jobID"] = jobID
-	ctx.Data["result"] = result
+	ctx.Data["jobName"] = VersionListTasks[0].JobName
+	ctx.Data["version_list_task"] = VersionListTasks
+	ctx.Data["version_list_count"] = VersionListCount
 	ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
 }

-func addZero(t int64) (m string) {
-	if t < 10 {
-		m = "0" + strconv.FormatInt(t, 10)
-		return m
-	} else {
-		return strconv.FormatInt(t, 10)
-	}
-}
-
 func TrainJobGetLog(ctx *context.Context) {
 	ctx.Data["PageIsTrainJob"] = true

@@ -771,24 +1074,34 @@ func trainJobGetLog(jobID string) (*models.GetTrainJobLogFileNamesResult, *model
 func TrainJobDel(ctx *context.Context) {
 	var jobID = ctx.Params(":jobid")
-	task, err := models.GetCloudbrainByJobID(jobID)
+	repo := ctx.Repo.Repository
+
+	VersionListTasks, _, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
+		RepoID:  repo.ID,
+		Type:    models.TypeCloudBrainTwo,
+		JobType: string(models.JobTypeTrain),
+		JobID:   jobID,
+	})
 	if err != nil {
-		log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName,
err.Error()) - ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil) + ctx.ServerError("get VersionListTasks failed", err) return } + //删除modelarts上的任务记录 _, err = modelarts.DelTrainJob(jobID) if err != nil { - log.Error("DelTrainJob(%s) failed:%v", task.JobName, err.Error()) + log.Error("DelTrainJob(%s) failed:%v", jobID, err.Error()) ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil) return } - err = models.DeleteJob(task) - if err != nil { - ctx.ServerError("DeleteJob failed", err) - return + //删除数据库Cloudbrain表的记录 + for _, task := range VersionListTasks { + err = models.DeleteJob(&task.Cloudbrain) + if err != nil { + ctx.ServerError("DeleteJob failed", err) + return + } } ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") @@ -876,43 +1189,3 @@ func getConfigList(perPage, page int, sortBy, order, searchContent, configType s return list, nil } - -func TrainJobShowModels(ctx *context.Context) { - ctx.Data["PageIsCloudBrain"] = true - - jobID := ctx.Params(":jobid") - parentDir := ctx.Query("parentDir") - dirArray := strings.Split(parentDir, "/") - task, err := models.GetCloudbrainByJobID(jobID) - if err != nil { - log.Error("no such job!", ctx.Data["msgID"]) - ctx.ServerError("no such job:", err) - return - } - - models, err := storage.GetObsListObject(task.JobName, parentDir) - if err != nil { - log.Info("get TrainJobListModel failed:", err) - ctx.ServerError("GetObsListObject:", err) - return - } - - ctx.Data["Path"] = dirArray - ctx.Data["Dirs"] = models - ctx.Data["task"] = task - ctx.Data["JobID"] = jobID - ctx.HTML(200, tplModelArtsTrainJobShowModels) -} - -func TrainJobDownloadModel(ctx *context.Context) { - parentDir := ctx.Query("parentDir") - fileName := ctx.Query("fileName") - jobName := ctx.Query("jobName") - url, err := storage.GetObsCreateSignedUrl(jobName, parentDir, fileName) - if err != nil { - log.Error("GetObsCreateSignedUrl failed: %v", err.Error(), ctx.Data["msgID"]) - 
ctx.ServerError("GetObsCreateSignedUrl", err) - return - } - http.Redirect(ctx.Resp, ctx.Req.Request, url, http.StatusMovedPermanently) -} diff --git a/routers/routes/routes.go b/routers/routes/routes.go index c28e76a47..fe2588b25 100755 --- a/routers/routes/routes.go +++ b/routers/routes/routes.go @@ -971,16 +971,6 @@ func RegisterRoutes(m *macaron.Macaron) { }, context.RepoRef()) m.Group("/modelarts", func() { - // m.Get("", reqRepoCloudBrainReader, repo.ModelArtsIndex) - // m.Group("/:jobid", func() { - // m.Get("", reqRepoCloudBrainReader, repo.ModelArtsShow) - // m.Get("/debug", reqRepoCloudBrainReader, repo.ModelArtsDebug) - // m.Post("/stop", reqRepoCloudBrainWriter, repo.ModelArtsStop) - // m.Post("/del", reqRepoCloudBrainWriter, repo.ModelArtsDel) - // }) - // m.Get("/create", reqRepoCloudBrainWriter, repo.ModelArtsNew) - // m.Post("/create", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsForm{}), repo.ModelArtsCreate) - m.Group("/notebook", func() { m.Get("", reqRepoCloudBrainReader, repo.NotebookIndex) m.Group("/:jobid", func() { @@ -999,12 +989,12 @@ func RegisterRoutes(m *macaron.Macaron) { m.Get("", reqRepoCloudBrainReader, repo.TrainJobShow) m.Post("/stop", reqRepoCloudBrainWriter, repo.TrainJobStop) m.Post("/del", reqRepoCloudBrainWriter, repo.TrainJobDel) - m.Get("/log", reqRepoCloudBrainReader, repo.TrainJobGetLog) - m.Get("/models", reqRepoCloudBrainReader, repo.TrainJobShowModels) - m.Get("/download_model", reqRepoCloudBrainReader, repo.TrainJobDownloadModel) + m.Get("/create_version", reqRepoCloudBrainReader, repo.TrainJobNewVersion) + m.Post("/create_version", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreateVersion) }) m.Get("/create", reqRepoCloudBrainReader, repo.TrainJobNew) m.Post("/create", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreate) + m.Get("/para-config-list", reqRepoCloudBrainReader, repo.TrainJobGetConfigList) }) }, 
context.RepoRef()) diff --git a/templates/repo/cloudbrain/index.tmpl b/templates/repo/cloudbrain/index.tmpl index 9099cb17a..d72ffb0c4 100755 --- a/templates/repo/cloudbrain/index.tmpl +++ b/templates/repo/cloudbrain/index.tmpl @@ -239,8 +239,8 @@
diff --git a/templates/repo/cloudbrain/show.tmpl b/templates/repo/cloudbrain/show.tmpl index 842f629c9..8cec8f5d2 100755 --- a/templates/repo/cloudbrain/show.tmpl +++ b/templates/repo/cloudbrain/show.tmpl @@ -6,7 +6,19 @@ {{template "base/alert" .}}

- 返回 +

diff --git a/templates/repo/modelarts/notebook/show.tmpl b/templates/repo/modelarts/notebook/show.tmpl index 3f914b56d..cac87df79 100755 --- a/templates/repo/modelarts/notebook/show.tmpl +++ b/templates/repo/modelarts/notebook/show.tmpl @@ -6,7 +6,19 @@ {{template "base/alert" .}}

- 返回 +

diff --git a/templates/repo/modelarts/trainjob/index.tmpl b/templates/repo/modelarts/trainjob/index.tmpl index 1453da9e5..e93a8000b 100755 --- a/templates/repo/modelarts/trainjob/index.tmpl +++ b/templates/repo/modelarts/trainjob/index.tmpl @@ -180,6 +180,12 @@ cursor: pointer; pointer-events: none; } + .fontsize14{ + font-size: 14px; + } + .padding0{ + padding: 0 !important; + } @@ -232,13 +238,13 @@
- @@ -278,20 +284,29 @@
-
+
{{$.i18n.Tr "repo.cloudbrain_task"}}
-
- {{$.i18n.Tr "repo.cloudbrain_status_createtime"}} +
+ {{$.i18n.Tr "repo.modelarts.version_nums"}}
-
+
+ {{$.i18n.Tr "repo.modelarts.status"}} +
+
+ {{$.i18n.Tr "repo.modelarts.createtime"}} +
+
{{$.i18n.Tr "repo.cloudbrain_status_runtime"}}
-
+
+ {{$.i18n.Tr "repo.modelarts.computing_resources"}} +
+
{{$.i18n.Tr "repo.cloudbrain_creator"}}
-
- {{$.i18n.Tr "repo.cloudbrain_operate"}} +
+ {{$.i18n.Tr "repo.cloudbrain_operate"}}
@@ -305,38 +320,44 @@
-
- + - -
- - + +
+ {{.VersionCount}} +
+ +
+ + {{.Status}} + +
+ +
+ {{TimeSinceUnix .Cloudbrain.CreatedUnix $.Lang}} +
+ + {{TimeSinceUnix .Cloudbrain.CreatedUnix $.Lang}} +
--> + +
+
- -
- - - - - - - + +
+ {{.ComputeResource}}
- -
+ +
{{if .User.Name}} {{else}} @@ -344,56 +365,41 @@ {{end}}
- - - - -
{{end}} {{template "base/paginate" .}} @@ -435,6 +441,8 @@ {{template "base/footer" .}} \ No newline at end of file diff --git a/templates/repo/modelarts/trainjob/version_new.tmpl b/templates/repo/modelarts/trainjob/version_new.tmpl new file mode 100644 index 000000000..442f10a1a --- /dev/null +++ b/templates/repo/modelarts/trainjob/version_new.tmpl @@ -0,0 +1,628 @@ +{{template "base/head" .}} + + +
+
+
+
+
+
+
+
+
+
+ {{template "repo/header" .}} +
+ {{template "base/alert" .}} +

+ {{.i18n.Tr "repo.modelarts.train_job.new"}} +

+
+ +
+ {{.CsrfTokenHtml}} + + + + +

{{.i18n.Tr "repo.modelarts.train_job.basic_info"}}:

+
+ + + +
+
+ + + +
+ +
+ + +
+
+ +

{{.i18n.Tr "repo.modelarts.train_job.parameter_setting"}}:

+ + +
+ + + +
+ + + +
+ +
+ +
+ +
+ + +
+ +
+ +
+ + {{if .boot_file}} + + {{else}} + + {{end}} + + + +
+
+ + +
+ +
+ + {{.i18n.Tr "repo.modelarts.train_job.add_run_parameter"}} + +
+ {{if ne 0 (len .params)}} + {{range $k ,$v := .params}} +
+
+ +
+
+ +
+ + + + +
+ {{end}} + {{end}} +
+
+ + + + + + +
+ + +
+
+ + +
+ + + + +
+
+ +
+ + {{.i18n.Tr "repo.cloudbrain.cancel"}} +
+ + + +
+
+
+
+{{template "base/footer" .}} + + \ No newline at end of file diff --git a/web_src/js/index.js b/web_src/js/index.js index a6d07d0bc..1b718ed53 100755 --- a/web_src/js/index.js +++ b/web_src/js/index.js @@ -2785,67 +2785,6 @@ $(document).ready(async () => { } }); } - - // dataset Dropzone - // const $dataset = $('#dataset'); - // if ($dataset.length > 0) { - // const filenameDict = {}; - // let previewTemplate = ''; - // previewTemplate += '
\n '; - // previewTemplate += '
\n '; - // previewTemplate += '
'; - // previewTemplate += ' '; - // previewTemplate += '
\n '; - // previewTemplate += '
\n '; - // previewTemplate += '
\n '; - // previewTemplate += '
'; - // previewTemplate += '
\n '; - // previewTemplate += '
\n '; - // previewTemplate += '
'; - // previewTemplate += ' 上传成功'; - // previewTemplate += '
\n '; - // previewTemplate += '
'; - // previewTemplate += ' 上传失败'; - // previewTemplate += '
\n '; - // previewTemplate += '
'; - // previewTemplate += ' '; - // previewTemplate += '
\n'; - // previewTemplate += '
'; - - // await createDropzone('#dataset', { - // url: $dataset.data('upload-url'), - // headers: {'X-Csrf-Token': csrf}, - // maxFiles: $dataset.data('max-file'), - // maxFilesize: $dataset.data('max-size'), - // acceptedFiles: ($dataset.data('accepts') === '*/*') ? null : $dataset.data('accepts'), - // addRemoveLinks: true, - // timeout: 0, - // dictDefaultMessage: $dataset.data('default-message'), - // dictInvalidFileType: $dataset.data('invalid-input-type'), - // dictFileTooBig: $dataset.data('file-too-big'), - // dictRemoveFile: $dataset.data('remove-file'), - // previewTemplate, - // init() { - // this.on('success', (file, data) => { - // filenameDict[file.name] = data.uuid; - // const input = $(``).val(data.uuid); - // $('.files').append(input); - // }); - // this.on('removedfile', (file) => { - // if (file.name in filenameDict) { - // $(`#${filenameDict[file.name]}`).remove(); - // } - // if ($dataset.data('remove-url') && $dataset.data('csrf')) { - // $.post($dataset.data('remove-url'), { - // file: filenameDict[file.name], - // _csrf: $dataset.data('csrf') - // }); - // } - // }); - // }, - // }); - // } - // Helpers. 
$('.delete-button').on('click', showDeletePopup); $('.add-all-button').on('click', showAddAllPopup); @@ -3984,243 +3923,6 @@ function initNavbarContentToggle() { }); } -// function initTopicbar() { -// const mgrBtn = $('#manage_topic'); -// const editDiv = $('#topic_edit'); -// const viewDiv = $('#repo-topics'); -// const saveBtn = $('#save_topic'); -// const topicDropdown = $('#topic_edit .dropdown'); -// const topicForm = $('#topic_edit.ui.form'); -// const topicInput = $("#topics_input") -// const topicPrompts = getPrompts(); - - -// mgrBtn.on('click', (e) => { -// // viewDiv.hide(); -// editDiv.css('display', ''); // show Semantic UI Grid -// topicInput.val('') -// console.log("-----------------asdasd",$("#topics_input"),$("#topics_input").val()) -// stopPropagation(e); - -// }); -// $(document).bind('click',function(){ -// editDiv.css('display','none'); - -// }) -// editDiv.click(function(e){ -// stopPropagation(e); -// }) - -// function getPrompts() { -// const hidePrompt = $('div.hide#validate_prompt'); -// const prompts = { -// countPrompt: hidePrompt.children('#count_prompt').text(), -// formatPrompt: hidePrompt.children('#format_prompt').text() -// }; -// hidePrompt.remove(); -// return prompts; -// } - -// function stopPropagation(e) { -// var ev = e || window.event; -// if (ev.stopPropagation) { -// ev.stopPropagation(); -// } -// else if (window.event) { -// window.event.cancelBubble = true;//兼容IE -// } -// } - - -// saveBtn.on('click', () => { -// const topics = $('input[name=topics]').val(); - -// $.post( -// saveBtn.data('link'), -// { -// _csrf: csrf, -// topics -// }, -// (_data, _textStatus, xhr) => { -// if (xhr.responseJSON.status === 'ok') { -// console.log("--------saveBtn------------") -// viewDiv.children('.topic').remove(); -// if (topics.length) { -// const topicArray = topics.split(','); - -// const last = viewDiv.children('a').last(); -// for (let i = 0; i < topicArray.length; i++) { -// const link = $(''); -// link.attr( -// 'href', 
-// `${AppSubUrl}/explore/repos?q=${encodeURIComponent( -// topicArray[i] -// )}&topic=1` -// ); -// link.text(topicArray[i]); -// link.insertBefore(last); -// } -// } -// editDiv.css('display', 'none'); -// viewDiv.show(); -// } -// } -// ) -// .fail((xhr) => { -// if (xhr.status === 422) { -// if (xhr.responseJSON.invalidTopics.length > 0) { -// topicPrompts.formatPrompt = xhr.responseJSON.message; - -// const {invalidTopics} = xhr.responseJSON; -// const topicLables = topicDropdown.children('a.ui.label'); - -// topics.split(',').forEach((value, index) => { -// for (let i = 0; i < invalidTopics.length; i++) { -// if (invalidTopics[i] === value) { -// topicLables -// .eq(index) -// .removeClass('green') -// .addClass('red'); -// } -// } -// }); -// } else { -// topicPrompts.countPrompt = xhr.responseJSON.message; -// } -// } -// }) -// .always(() => { -// topicForm.form('validate form'); -// }); -// }); - -// topicDropdown.dropdown({ -// allowAdditions: true, -// forceSelection: false, -// fields: {name: 'description', value: 'data-value'}, -// saveRemoteData: false, -// label: { -// transition: 'horizontal flip', -// duration: 200, -// variation: false, -// blue: true, -// basic: true -// }, -// className: { -// label: 'ui small label' -// }, -// apiSettings: { -// url: `${AppSubUrl}/api/v1/topics/search?q={query}`, -// throttle: 500, -// cache: false, -// onResponse(res) { -// const formattedResponse = { -// success: false, -// results: [] -// }; -// const stripTags = function (text) { -// return text.replace(/<[^>]*>?/gm, ''); -// }; - -// const query = stripTags(this.urlData.query.trim()); -// let found_query = false; -// const current_topics = []; -// topicDropdown -// .find('div.label.visible.topic,a.label.visible') -// .each((_, e) => { -// current_topics.push(e.dataset.value); -// }); - -// if (res.topics) { -// let found = false; -// for (let i = 0; i < res.topics.length; i++) { -// // skip currently added tags -// if 
(current_topics.includes(res.topics[i].topic_name)) { -// continue; -// } - -// if ( -// res.topics[i].topic_name.toLowerCase() === query.toLowerCase() -// ) { -// found_query = true; -// } -// formattedResponse.results.push({ -// description: res.topics[i].topic_name, -// 'data-value': res.topics[i].topic_name -// }); -// found = true; -// } -// formattedResponse.success = found; -// } - -// if (query.length > 0 && !found_query) { -// formattedResponse.success = true; -// formattedResponse.results.unshift({ -// description: query, -// 'data-value': query -// }); -// } else if (query.length > 0 && found_query) { -// formattedResponse.results.sort((a, b) => { -// if (a.description.toLowerCase() === query.toLowerCase()) return -1; -// if (b.description.toLowerCase() === query.toLowerCase()) return 1; -// if (a.description > b.description) return -1; -// if (a.description < b.description) return 1; -// return 0; -// }); -// } - -// return formattedResponse; -// } -// }, -// onLabelCreate(value) { -// value = value.toLowerCase().trim(); -// this.attr('data-value', value) -// .contents() -// .first() -// .replaceWith(value); -// return $(this); -// }, -// onAdd(addedValue, _addedText, $addedChoice) { -// addedValue = addedValue.toLowerCase().trim(); -// $($addedChoice).attr('data-value', addedValue); -// $($addedChoice).attr('data-text', addedValue); -// } -// }); - -// $.fn.form.settings.rules.validateTopic = function (_values, regExp) { -// const topics = topicDropdown.children('a.ui.label'); -// const status = -// topics.length === 0 || (topics.last().attr('data-value').match(regExp) !== null && topics.last().attr('data-value').length <= 35); -// if (!status) { -// topics -// .last() -// .removeClass('green') -// .addClass('red'); -// } -// return status && topicDropdown.children('a.ui.label.red').length === 0; -// }; - -// topicForm.form({ -// on: 'change', -// inline: true, -// fields: { -// topics: { -// identifier: 'topics', -// rules: [ -// { -// type: 
'validateTopic', -// value: /^[\u4e00-\u9fa5a-z0-9][\u4e00-\u9fa5a-z0-9-]{0,105}$/, -// prompt: topicPrompts.formatPrompt -// }, -// { -// type: 'maxCount[25]', -// prompt: topicPrompts.countPrompt -// } -// ] -// } -// } -// }); -// } window.toggleDeadlineForm = function () { $('#deadlineForm').fadeToggle(150);