Browse Source

增加训练多版本路由

tags/v1.21.12.1
liuzx 4 years ago
parent
commit
08583f1234
6 changed files with 604 additions and 38 deletions
  1. +146
    -8
      models/cloudbrain.go
  2. +94
    -0
      modules/modelarts/modelarts.go
  3. +46
    -0
      modules/modelarts/resty.go
  4. +315
    -30
      routers/repo/modelarts.go
  5. +3
    -0
      routers/routes/routes.go
  6. +0
    -0
      templates/repo/modelarts/trainjob/version_new.tmpl

+ 146
- 8
models/cloudbrain.go View File

@@ -30,6 +30,7 @@ const (
JobTypeSnn4imagenet JobType = "SNN4IMAGENET" JobTypeSnn4imagenet JobType = "SNN4IMAGENET"
JobTypeBrainScore JobType = "BRAINSCORE" JobTypeBrainScore JobType = "BRAINSCORE"
JobTypeTrain JobType = "TRAIN" JobTypeTrain JobType = "TRAIN"
JobVersionName JobType = "V0001"


ModelArtsCreateQueue ModelArtsJobStatus = "CREATE_QUEUING" //免费资源创建排队中 ModelArtsCreateQueue ModelArtsJobStatus = "CREATE_QUEUING" //免费资源创建排队中
ModelArtsCreating ModelArtsJobStatus = "CREATING" //创建中 ModelArtsCreating ModelArtsJobStatus = "CREATING" //创建中
@@ -68,15 +69,33 @@ type Cloudbrain struct {
CanDel bool `xorm:"-"` CanDel bool `xorm:"-"`
Type int `xorm:"INDEX DEFAULT 0"` Type int `xorm:"INDEX DEFAULT 0"`


VersionID int64 `xorm:"INDEX DEFAULT 0"`
VersionName string
Uuid string
DatasetName string
VersionID int64 `xorm:"INDEX DEFAULT 0"`
VersionName string
Uuid string
DatasetName string
VersionCount int64 `xorm:"INDEX DEFAULT 1"`


User *User `xorm:"-"` User *User `xorm:"-"`
Repo *Repository `xorm:"-"` Repo *Repository `xorm:"-"`
} }


type TrainjobConfigDetail struct {
ID int64 `xorm:"pk autoincr"`
JobName string `xorm:"INDEX"`
ResourcePools string `xorm:"INDEX"`
EngineVersions int `xorm:"INDEX"`
FlavorInfos string `xorm:"INDEX"`
TrainUrl string `xorm:"INDEX"`
BootFile string `xorm:"INDEX"`
Uuid string `xorm:"INDEX"`
DatasetName string `xorm:"INDEX"`
Params string `xorm:"deleted"`
BranchName string `xorm:"INDEX"`

// User *User `xorm:"-"`
// Repo *Repository `xorm:"-"`
}

type CloudbrainInfo struct { type CloudbrainInfo struct {
Cloudbrain `xorm:"extends"` Cloudbrain `xorm:"extends"`
User `xorm:"extends"` User `xorm:"extends"`
@@ -150,13 +169,15 @@ type CloudbrainsOptions struct {
ListOptions ListOptions
RepoID int64 // include all repos if empty RepoID int64 // include all repos if empty
UserID int64 UserID int64
JobID int64
JobID string
SortType string SortType string
CloudbrainIDs []int64 CloudbrainIDs []int64
// JobStatus CloudbrainStatus // JobStatus CloudbrainStatus
Type int
JobType string
Type int
JobType string
VersionName string
} }

type TaskPod struct { type TaskPod struct {
TaskRoleStatus struct { TaskRoleStatus struct {
Name string `json:"name"` Name string `json:"name"`
@@ -594,6 +615,33 @@ type Config struct {
PoolID string `json:"pool_id"` PoolID string `json:"pool_id"`
} }


type CreateTrainJobVersionParams struct {
Description string `json:"job_desc"`
Config TrainJobVersionConfig `json:"config"`
}

type TrainJobVersionConfig struct {
WorkServerNum int `json:"worker_server_num"`
AppUrl string `json:"app_url"` //训练作业的代码目录
BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下
Parameter []Parameter `json:"parameter"`
DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL
//DatasetID string `json:"dataset_id"`
//DataVersionID string `json:"dataset_version_id"`
//DataSource []DataSource `json:"data_source"`
//SpecID int64 `json:"spec_id"`
EngineID int64 `json:"engine_id"`
//ModelID int64 `json:"model_id"`
TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL
LogUrl string `json:"log_url"`
//UserImageUrl string `json:"user_image_url"`
//UserCommand string `json:"user_command"`
//Volumes []Volumes `json:"volumes"`
Flavor Flavor `json:"flavor"`
PoolID string `json:"pool_id"`
PreVersionId int64 `json:"pre_version_id"`
}

type CreateConfigParams struct { type CreateConfigParams struct {
ConfigName string `json:"config_name"` ConfigName string `json:"config_name"`
Description string `json:"config_desc"` Description string `json:"config_desc"`
@@ -837,7 +885,7 @@ func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) {
) )
} }


if (opts.JobID) > 0 {
if (opts.JobID) != "" {
cond = cond.And( cond = cond.And(
builder.Eq{"cloudbrain.job_id": opts.JobID}, builder.Eq{"cloudbrain.job_id": opts.JobID},
) )
@@ -855,6 +903,12 @@ func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) {
) )
} }


if (opts.VersionName) != "" {
cond = cond.And(
builder.Eq{"cloudbrain.version_name": opts.VersionName},
)
}

// switch opts.JobStatus { // switch opts.JobStatus {
// case JobWaiting: // case JobWaiting:
// cond.And(builder.Eq{"cloudbrain.status": int(JobWaiting)}) // cond.And(builder.Eq{"cloudbrain.status": int(JobWaiting)})
@@ -897,6 +951,72 @@ func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) {
return cloudbrains, count, nil return cloudbrains, count, nil
} }


func CloudbrainsVersionList(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) {
sess := x.NewSession()
defer sess.Close()

var cond = builder.NewCond()
if opts.RepoID > 0 {
cond = cond.And(
builder.Eq{"cloudbrain.repo_id": opts.RepoID},
)
}

if opts.UserID > 0 {
cond = cond.And(
builder.Eq{"cloudbrain.user_id": opts.UserID},
)
}

if (opts.Type) >= 0 {
cond = cond.And(
builder.Eq{"cloudbrain.type": opts.Type},
)
}

if (opts.JobID) != "" {
cond = cond.And(
builder.Eq{"cloudbrain.job_id": opts.JobID},
)
}

if (opts.JobType) != "" {
cond = cond.And(
builder.Eq{"cloudbrain.job_type": opts.JobType},
)
}

if len(opts.CloudbrainIDs) > 0 {
cond = cond.And(builder.In("cloudbrain.id", opts.CloudbrainIDs))
}

count, err := sess.Where(cond).Count(new(Cloudbrain))
if err != nil {
return nil, 0, fmt.Errorf("Count: %v", err)
}

if opts.Page >= 0 && opts.PageSize > 0 {
var start int
if opts.Page == 0 {
start = 0
} else {
start = (opts.Page - 1) * opts.PageSize
}
sess.Limit(opts.PageSize, start)
}

sess.OrderBy("cloudbrain.created_unix DESC")
cloudbrains := make([]*CloudbrainInfo, 0, setting.UI.IssuePagingNum)
if err := sess.Table(&Cloudbrain{}).Where(cond).
Join("left", "`user`", "cloudbrain.user_id = `user`.id").
Find(&cloudbrains); err != nil {
return nil, 0, fmt.Errorf("Find: %v", err)
}
sess.Close()

return cloudbrains, count, nil
}

func CreateCloudbrain(cloudbrain *Cloudbrain) (err error) { func CreateCloudbrain(cloudbrain *Cloudbrain) (err error) {
if _, err = x.Insert(cloudbrain); err != nil { if _, err = x.Insert(cloudbrain); err != nil {
return err return err
@@ -904,6 +1024,13 @@ func CreateCloudbrain(cloudbrain *Cloudbrain) (err error) {
return nil return nil
} }


func CreateTrainjobConfigDetail(trainjobConfigDetail *TrainjobConfigDetail) (err error) {
if _, err = x.Insert(trainjobConfigDetail); err != nil {
return err
}
return nil
}

func getRepoCloudBrain(cb *Cloudbrain) (*Cloudbrain, error) { func getRepoCloudBrain(cb *Cloudbrain) (*Cloudbrain, error) {
has, err := x.Get(cb) has, err := x.Get(cb)
if err != nil { if err != nil {
@@ -924,6 +1051,11 @@ func GetCloudbrainByJobID(jobID string) (*Cloudbrain, error) {
return getRepoCloudBrain(cb) return getRepoCloudBrain(cb)
} }


func GetCloudbrainByJobIDAndVersionName(jobID string, versionName string) (*Cloudbrain, error) {
cb := &Cloudbrain{JobID: jobID, VersionName: versionName}
return getRepoCloudBrain(cb)
}

func GetCloudbrainsNeededStopByUserID(userID int64) ([]*Cloudbrain, error) { func GetCloudbrainsNeededStopByUserID(userID int64) ([]*Cloudbrain, error) {
cloudBrains := make([]*Cloudbrain, 0) cloudBrains := make([]*Cloudbrain, 0)
err := x.Cols("job_id", "status", "type").Where("user_id=? AND status !=?", userID, string(JobStopped)).Find(&cloudBrains) err := x.Cols("job_id", "status", "type").Where("user_id=? AND status !=?", userID, string(JobStopped)).Find(&cloudBrains)
@@ -948,6 +1080,12 @@ func SetTrainJobStatusByJobID(jobID string, status string, duration int64, train
return return
} }


func SetVersionCountByJobID(jobID string, versionName string, versionCount int64) (err error) {
cb := &Cloudbrain{JobID: jobID, VersionName: versionName, VersionCount: versionCount}
_, err = x.Cols("version_Count").Where("cloudbrain.job_id=? AND cloudbrain.version_name=?", jobID, versionName).Update(cb)
return
}

func UpdateJob(job *Cloudbrain) error { func UpdateJob(job *Cloudbrain) error {
return updateJob(x, job) return updateJob(x, job)
} }


+ 94
- 0
modules/modelarts/modelarts.go View File

@@ -71,6 +71,23 @@ type GenerateTrainJobReq struct {
Parameters []models.Parameter Parameters []models.Parameter
} }


type GenerateTrainJobVersionReq struct {
JobName string
Uuid string
Description string
CodeObsPath string
BootFile string
DataUrl string
TrainUrl string
FlavorCode string
LogUrl string
PoolID string
WorkServerNumber int
EngineID int64
Parameters []models.Parameter
PreVersionId int64
}

type VersionInfo struct { type VersionInfo struct {
Version []struct { Version []struct {
ID int `json:"id"` ID int `json:"id"`
@@ -223,6 +240,83 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error {
return nil return nil
} }


func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobVersionReq, jobId string) error {
jobResult, err := createTrainJobVersion(models.CreateTrainJobVersionParams{
Description: req.Description,
Config: models.TrainJobVersionConfig{
WorkServerNum: req.WorkServerNumber,
AppUrl: req.CodeObsPath,
BootFileUrl: req.BootFile,
DataUrl: req.DataUrl,
EngineID: req.EngineID,
TrainUrl: req.TrainUrl,
LogUrl: req.LogUrl,
PoolID: req.PoolID,
Flavor: models.Flavor{
Code: req.FlavorCode,
},
Parameter: req.Parameters,
PreVersionId: req.PreVersionId,
},
}, jobId)
if err != nil {
log.Error("CreateJob failed: %v", err.Error())
return err
}

attach, err := models.GetAttachmentByUUID(req.Uuid)
if err != nil {
log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error())
return nil
}

err = models.CreateCloudbrain(&models.Cloudbrain{
Status: TransTrainJobStatus(jobResult.Status),
UserID: ctx.User.ID,
RepoID: ctx.Repo.Repository.ID,
JobID: strconv.FormatInt(jobResult.JobID, 10),
JobName: req.JobName,
JobType: string(models.JobTypeTrain),
Type: models.TypeCloudBrainTwo,
VersionID: jobResult.VersionID,
VersionName: jobResult.VersionName,
Uuid: req.Uuid,
DatasetName: attach.Name,
})
if err != nil {
log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
return err
}

repo := ctx.Repo.Repository
page := ctx.QueryInt("page")
if page <= 0 {
page = 1
}
_, VersionListCount, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
ListOptions: models.ListOptions{
Page: page,
PageSize: setting.UI.IssuePagingNum,
},
RepoID: repo.ID,
Type: models.TypeCloudBrainTwo,
JobType: string(models.JobTypeTrain),
JobID: strconv.FormatInt(jobResult.JobID, 10),
})
if err != nil {
ctx.ServerError("Cloudbrain", err)
return nil
}
versionName := "V0001"
err = models.SetVersionCountByJobID(strconv.FormatInt(jobResult.JobID, 10), versionName, VersionListCount)
if err != nil {
ctx.ServerError("UpdateJobVersionCount failed", err)
return nil
}

return nil
}

func TransTrainJobStatus(status int) string { func TransTrainJobStatus(status int) string {
switch status { switch status {
case 0: case 0:


+ 46
- 0
modules/modelarts/resty.go View File

@@ -377,6 +377,52 @@ sendjob:
return &result, nil return &result, nil
} }


func createTrainJobVersion(createJobVersionParams models.CreateTrainJobVersionParams, jobID string) (*models.CreateTrainJobResult, error) {
checkSetting()
client := getRestyClient()
var result models.CreateTrainJobResult

retry := 0

sendjob:
res, err := client.R().
SetHeader("Content-Type", "application/json").
SetAuthToken(TOKEN).
SetBody(createJobVersionParams).
SetResult(&result).
Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions")

if err != nil {
return nil, fmt.Errorf("resty create train-job version: %s", err)
}

req, _ := json.Marshal(createJobVersionParams)
log.Info("%s", req)

if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
retry++
_ = getToken()
goto sendjob
}

if res.StatusCode() != http.StatusOK {
var temp models.ErrorResult
if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
}
log.Error("createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
return &result, fmt.Errorf("createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
}

if !result.IsSuccess {
log.Error("createTrainJobVersion failed(%s): %s", result.ErrorCode, result.ErrorMsg)
return &result, fmt.Errorf("createTrainJobVersion failed(%s): %s", result.ErrorCode, result.ErrorMsg)
}

return &result, nil
}

func GetResourceSpecs() (*models.GetResourceSpecsResult, error) { func GetResourceSpecs() (*models.GetResourceSpecsResult, error) {
checkSetting() checkSetting()
client := getRestyClient() client := getRestyClient()


+ 315
- 30
routers/repo/modelarts.go View File

@@ -40,6 +40,7 @@ const (
tplModelArtsTrainJobNew base.TplName = "repo/modelarts/trainjob/new" tplModelArtsTrainJobNew base.TplName = "repo/modelarts/trainjob/new"
tplModelArtsTrainJobShow base.TplName = "repo/modelarts/trainjob/show" tplModelArtsTrainJobShow base.TplName = "repo/modelarts/trainjob/show"
tplModelArtsTrainJobShowModels base.TplName = "repo/modelarts/trainjob/models/index" tplModelArtsTrainJobShowModels base.TplName = "repo/modelarts/trainjob/models/index"
tplModelArtsTrainJobVersionNew base.TplName = "repo/modelarts/trainjob/version_new"
) )


// MustEnableDataset check if repository enable internal cb // MustEnableDataset check if repository enable internal cb
@@ -286,8 +287,8 @@ func NotebookIndex(ctx *context.Context) {
Page: page, Page: page,
PageSize: setting.UI.IssuePagingNum, PageSize: setting.UI.IssuePagingNum,
}, },
RepoID: repo.ID,
Type: models.TypeCloudBrainTwo,
RepoID: repo.ID,
Type: models.TypeCloudBrainTwo,
JobType: string(models.JobTypeDebug), JobType: string(models.JobTypeDebug),
}) })
if err != nil { if err != nil {
@@ -493,14 +494,6 @@ func NotebookDel(ctx *context.Context) {
func TrainJobIndex(ctx *context.Context) { func TrainJobIndex(ctx *context.Context) {
MustEnableModelArts(ctx) MustEnableModelArts(ctx)


//can, err := canUserCreateTrainJob(ctx.User.ID)
//if err != nil {
// ctx.ServerError("canUserCreateTrainJob", err)
// return
//}
//
//ctx.Data["CanCreate"] = can

repo := ctx.Repo.Repository repo := ctx.Repo.Repository
page := ctx.QueryInt("page") page := ctx.QueryInt("page")
if page <= 0 { if page <= 0 {
@@ -512,9 +505,10 @@ func TrainJobIndex(ctx *context.Context) {
Page: page, Page: page,
PageSize: setting.UI.IssuePagingNum, PageSize: setting.UI.IssuePagingNum,
}, },
RepoID: repo.ID,
Type: models.TypeCloudBrainTwo,
JobType: string(models.JobTypeTrain),
RepoID: repo.ID,
Type: models.TypeCloudBrainTwo,
JobType: string(models.JobTypeTrain),
VersionName: string(models.JobVersionName),
}) })
if err != nil { if err != nil {
ctx.ServerError("Cloudbrain", err) ctx.ServerError("Cloudbrain", err)
@@ -614,6 +608,82 @@ func trainJobNewDataPrepare(ctx *context.Context) error {
return nil return nil
} }


func TrainJobNewVersion(ctx *context.Context) {
err := trainJobNewVersionDataPrepare(ctx)
if err != nil {
ctx.ServerError("get new train-job info failed", err)
return
}
ctx.HTML(200, tplModelArtsTrainJobVersionNew)
}

func trainJobNewVersionDataPrepare(ctx *context.Context) error {
ctx.Data["PageIsCloudBrain"] = true
var jobID = ctx.Params(":jobid")
var versionName = ctx.Query("versionName")

t := time.Now()
var jobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
ctx.Data["job_name"] = jobName

attachs, err := models.GetModelArtsUserAttachments(ctx.User.ID)
if err != nil {
ctx.ServerError("GetAllUserAttachments failed:", err)
return err
}
ctx.Data["attachments"] = attachs

var resourcePools modelarts.ResourcePool
if err = json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
ctx.ServerError("json.Unmarshal failed:", err)
return err
}
ctx.Data["resource_pools"] = resourcePools.Info

var engines modelarts.Engine
if err = json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
ctx.ServerError("json.Unmarshal failed:", err)
return err
}
ctx.Data["engines"] = engines.Info

var versionInfos modelarts.VersionInfo
if err = json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
ctx.ServerError("json.Unmarshal failed:", err)
return err
}
ctx.Data["engine_versions"] = versionInfos.Version

var flavorInfos modelarts.Flavor
if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
ctx.ServerError("json.Unmarshal failed:", err)
return err
}
ctx.Data["flavor_infos"] = flavorInfos.Info

outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath
ctx.Data["train_url"] = outputObsPath

Branches, err := ctx.Repo.GitRepo.GetBranches()
if err != nil {
ctx.ServerError("GetBranches error:", err)
return err
}
ctx.Data["Branches"] = Branches
ctx.Data["BranchesCount"] = len(Branches)
ctx.Data["jobID"] = jobID
ctx.Data["versionName"] = versionName

configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
if err != nil {
ctx.ServerError("getConfigList failed:", err)
return err
}
ctx.Data["config_list"] = configList.ParaConfigs

return nil
}

func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) { func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) {
ctx.Data["PageIsTrainJob"] = true ctx.Data["PageIsTrainJob"] = true
jobName := form.JobName jobName := form.JobName
@@ -634,19 +704,6 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/" dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
branch_name := form.BranchName branch_name := form.BranchName


//can, err := canUserCreateTrainJob(ctx.User.ID)
//if err != nil {
// ctx.ServerError("canUserCreateTrainJob", err)
// return
//}
//
//if !can {
// log.Error("the user can not create train-job")
// ctx.RenderWithErr("the user can not create train-job", tplModelArtsTrainJobNew, &form)
// return
//}

//param check
if err := paramCheckCreateTrainJob(form); err != nil { if err := paramCheckCreateTrainJob(form); err != nil {
log.Error("paramCheckCreateTrainJob failed:(%v)", err) log.Error("paramCheckCreateTrainJob failed:(%v)", err)
trainJobNewDataPrepare(ctx) trainJobNewDataPrepare(ctx)
@@ -665,9 +722,6 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
if err == nil { if err == nil {
os.RemoveAll(codeLocalPath) os.RemoveAll(codeLocalPath)
} }
// branch_name := "testbranch"
// gitRepo, _ := git.OpenRepository(repo.RepoPath())
// commitID, _ := gitRepo.GetBranchCommitID(branch_name)


if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{ if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{
Branch: branch_name, Branch: branch_name,
@@ -786,7 +840,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
LogUrl: logObsPath, LogUrl: logObsPath,
PoolID: poolID, PoolID: poolID,
Uuid: uuid, Uuid: uuid,
Parameters: param,
Parameters: parameters.Parameter,
} }


err = modelarts.GenerateTrainJob(ctx, req) err = modelarts.GenerateTrainJob(ctx, req)
@@ -797,12 +851,220 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
ctx.Data["uuid"] = form.Attachment ctx.Data["uuid"] = form.Attachment
ctx.Data["datasetName"] = attach.Name ctx.Data["datasetName"] = attach.Name
ctx.Data["params"] = form.Params ctx.Data["params"] = form.Params
ctx.Data["branch_name"] = branch_name
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
return return
} }
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
} }


func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) {
ctx.Data["PageIsTrainJob"] = true
var jobID = ctx.Params(":jobid")
var versionName = ctx.Query("versionName")
jobName := form.JobName
uuid := form.Attachment
description := form.Description
workServerNumber := form.WorkServerNumber
engineID := form.EngineID
bootFile := form.BootFile
flavorCode := form.Flavor
params := form.Params
poolID := form.PoolID
isSaveParam := form.IsSaveParam
repo := ctx.Repo.Repository
codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
outputObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath
logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath
dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
branch_name := form.BranchName

if err := paramCheckCreateTrainJob(form); err != nil {
log.Error("paramCheckCreateTrainJob failed:(%v)", err)
trainJobNewVersionDataPrepare(ctx)
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
return
}

attach, err := models.GetAttachmentByUUID(uuid)
if err != nil {
log.Error("GetAttachmentByUUID(%s) failed:%v", uuid, err.Error())
return
}

//todo: del the codeLocalPath
_, err = ioutil.ReadDir(codeLocalPath)
if err == nil {
os.RemoveAll(codeLocalPath)
}

if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{
Branch: branch_name,
}); err != nil {
log.Error("创建任务失败,任务名称已存在!: %s (%v)", repo.FullName(), err)
trainJobNewVersionDataPrepare(ctx)

ctx.Data["bootFile"] = form.BootFile
ctx.Data["uuid"] = form.Attachment
ctx.Data["datasetName"] = attach.Name
ctx.Data["params"] = form.Params
ctx.Data["branch_name"] = branch_name
// ctx.RenderWithErr("Failed to clone repository", tplModelArtsTrainJobNew, &form)
ctx.RenderWithErr("创建任务失败,任务名称已存在!", tplModelArtsTrainJobVersionNew, &form)
// ctx.RenderWithErr(err, tplModelArtsTrainJobNew, &form)
return
}

//todo: upload code (send to file_server todo this work?)
if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil {
log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
trainJobNewVersionDataPrepare(ctx)
ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobVersionNew, &form)
return
}

if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath); err != nil {
log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
trainJobNewVersionDataPrepare(ctx)
ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobVersionNew, &form)
return
}

if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
trainJobNewVersionDataPrepare(ctx)
ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobVersionNew, &form)
return
}

//todo: del local code?

var parameters models.Parameters
param := make([]models.Parameter, 0)
param = append(param, models.Parameter{
Label: modelarts.TrainUrl,
Value: outputObsPath,
}, models.Parameter{
Label: modelarts.DataUrl,
Value: dataPath,
})
if len(params) != 0 {
err := json.Unmarshal([]byte(params), &parameters)
if err != nil {
log.Error("Failed to Unmarshal params: %s (%v)", params, err)
trainJobNewVersionDataPrepare(ctx)
ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobVersionNew, &form)
return
}

for _, parameter := range parameters.Parameter {
if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
param = append(param, models.Parameter{
Label: parameter.Label,
Value: parameter.Value,
})
}
}
}

//save param config
if isSaveParam == "on" {
if form.ParameterTemplateName == "" {
log.Error("ParameterTemplateName is empty")
trainJobNewVersionDataPrepare(ctx)
ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobVersionNew, &form)
return
}

_, err := modelarts.CreateTrainJobConfig(models.CreateConfigParams{
ConfigName: form.ParameterTemplateName,
Description: form.PrameterDescription,
DataUrl: dataPath,
AppUrl: codeObsPath,
BootFileUrl: codeObsPath + bootFile,
TrainUrl: outputObsPath,
Flavor: models.Flavor{
Code: flavorCode,
},
WorkServerNum: workServerNumber,
EngineID: int64(engineID),
LogUrl: logObsPath,
PoolID: poolID,
Parameter: parameters.Parameter,
})

if err != nil {
log.Error("Failed to CreateTrainJobConfig: %v", err)
trainJobNewVersionDataPrepare(ctx)
ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobVersionNew, &form)
return
}
}
// JobVersionName := "V0001"
// PreVersionId := int64(67646)
task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
if err != nil {
log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
return
}
req := &modelarts.GenerateTrainJobVersionReq{
JobName: task.JobName,
DataUrl: dataPath,
Description: description,
CodeObsPath: codeObsPath,
BootFile: codeObsPath + bootFile,
TrainUrl: outputObsPath,
FlavorCode: flavorCode,
WorkServerNumber: workServerNumber,
EngineID: int64(engineID),
LogUrl: logObsPath,
PoolID: poolID,
Uuid: uuid,
Parameters: parameters.Parameter,
PreVersionId: task.VersionID,
}
err = modelarts.GenerateTrainJobVersion(ctx, req, jobID)
if err != nil {
log.Error("GenerateTrainJob failed:%v", err.Error())
trainJobNewVersionDataPrepare(ctx)
ctx.Data["bootFile"] = form.BootFile
ctx.Data["uuid"] = form.Attachment
ctx.Data["datasetName"] = attach.Name
ctx.Data["params"] = form.Params
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
return
}
//保存openi创建训练任务界面的参数
// err = models.CreateTrainjobConfigDetail(&models.TrainjobConfigDetail{

// JobName: req.JobName,
// ResourcePools: form.PoolID,
// EngineVersions: form.EngineID,
// FlavorInfos: form.Flavor,
// TrainUrl: outputObsPath,
// BootFile: form.BootFile,
// Uuid: form.Attachment,
// DatasetName: attach.Name,
// Params: form.Params,
// BranchName: branch_name,
// })

// if err != nil {
// log.Error("CreateTrainjobConfigDetail failed:%v", err.Error())
// trainJobNewVersionDataPrepare(ctx)
// ctx.Data["bootFile"] = form.BootFile
// ctx.Data["uuid"] = form.Attachment
// ctx.Data["datasetName"] = attach.Name
// ctx.Data["params"] = form.Params
// ctx.Data["branch_name"] = branch_name
// ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
// return
// }
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
}

// readDir reads the directory named by dirname and returns // readDir reads the directory named by dirname and returns
// a list of directory entries sorted by filename. // a list of directory entries sorted by filename.
func readDir(dirname string) ([]os.FileInfo, error) { func readDir(dirname string) ([]os.FileInfo, error) {
@@ -895,6 +1157,27 @@ func TrainJobShow(ctx *context.Context) {


var jobID = ctx.Params(":jobid") var jobID = ctx.Params(":jobid")
task, err := models.GetCloudbrainByJobID(jobID) task, err := models.GetCloudbrainByJobID(jobID)

repo := ctx.Repo.Repository
page := ctx.QueryInt("page")
if page <= 0 {
page = 1
}
VersionListTasks, VersionListCount, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
ListOptions: models.ListOptions{
Page: page,
PageSize: setting.UI.IssuePagingNum,
},
RepoID: repo.ID,
Type: models.TypeCloudBrainTwo,
JobType: string(models.JobTypeTrain),
JobID: jobID,
})
if err != nil {
ctx.ServerError("Cloudbrain", err)
return
}

if err != nil { if err != nil {
log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error()) log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil) ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
@@ -945,6 +1228,8 @@ func TrainJobShow(ctx *context.Context) {
ctx.Data["task"] = task ctx.Data["task"] = task
ctx.Data["jobID"] = jobID ctx.Data["jobID"] = jobID
ctx.Data["result"] = result ctx.Data["result"] = result
ctx.Data["VersionListTasks"] = VersionListTasks
ctx.Data["VersionLisCount"] = VersionListCount
ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow) ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
} }




+ 3
- 0
routers/routes/routes.go View File

@@ -993,9 +993,12 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Get("/log", reqRepoCloudBrainReader, repo.TrainJobGetLog) m.Get("/log", reqRepoCloudBrainReader, repo.TrainJobGetLog)
m.Get("/models", reqRepoCloudBrainReader, repo.TrainJobShowModels) m.Get("/models", reqRepoCloudBrainReader, repo.TrainJobShowModels)
m.Get("/download_model", reqRepoCloudBrainReader, repo.TrainJobDownloadModel) m.Get("/download_model", reqRepoCloudBrainReader, repo.TrainJobDownloadModel)
m.Get("/create_version", reqRepoCloudBrainReader, repo.TrainJobNewVersion)
m.Post("/create_version", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreateVersion)
}) })
m.Get("/create", reqRepoCloudBrainReader, repo.TrainJobNew) m.Get("/create", reqRepoCloudBrainReader, repo.TrainJobNew)
m.Post("/create", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreate) m.Post("/create", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreate)

m.Get("/para-config-list", reqRepoCloudBrainReader, repo.TrainJobGetConfigList) m.Get("/para-config-list", reqRepoCloudBrainReader, repo.TrainJobGetConfigList)
}) })
}, context.RepoRef()) }, context.RepoRef())


+ 0
- 0
templates/repo/modelarts/trainjob/version_new.tmpl View File


Loading…
Cancel
Save