Browse Source

训练多版本修改返回的训练列表

tags/v1.21.12.1
liuzx 4 years ago
parent
commit
4e2ec3ebd9
5 changed files with 131 additions and 77 deletions
  1. +24
    -16
      models/cloudbrain.go
  2. +1
    -0
      models/models.go
  3. +52
    -23
      modules/modelarts/modelarts.go
  4. +51
    -38
      routers/repo/modelarts.go
  5. +3
    -0
      routers/routes/routes.go

+ 24
- 16
models/cloudbrain.go View File

@@ -69,11 +69,13 @@ type Cloudbrain struct {
CanDel bool `xorm:"-"`
Type int `xorm:"INDEX DEFAULT 0"`

VersionID int64 `xorm:"INDEX DEFAULT 0"`
VersionName string
Uuid string
DatasetName string
VersionCount int64 `xorm:"INDEX DEFAULT 1"`
VersionID int64 `xorm:"INDEX DEFAULT 0"`
VersionName string
Uuid string
DatasetName string
VersionCount int64 `xorm:"INDEX DEFAULT 1"`
IsLatestVersion string
CommitID string

User *User `xorm:"-"`
Repo *Repository `xorm:"-"`
@@ -89,11 +91,11 @@ type TrainjobConfigDetail struct {
BootFile string `xorm:"INDEX"`
Uuid string `xorm:"INDEX"`
DatasetName string `xorm:"INDEX"`
Params string `xorm:"deleted"`
Params string `xorm:"INDEX"`
BranchName string `xorm:"INDEX"`

// User *User `xorm:"-"`
// Repo *Repository `xorm:"-"`
User *User `xorm:"-"`
Repo *Repository `xorm:"-"`
}

type CloudbrainInfo struct {
@@ -173,9 +175,10 @@ type CloudbrainsOptions struct {
SortType string
CloudbrainIDs []int64
// JobStatus CloudbrainStatus
Type int
JobType string
VersionName string
Type int
JobType string
VersionName string
IsLatestVersion string
}

type TaskPod struct {
@@ -903,9 +906,9 @@ func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) {
)
}

if (opts.VersionName) != "" {
if (opts.IsLatestVersion) != "" {
cond = cond.And(
builder.Eq{"cloudbrain.version_name": opts.VersionName},
builder.Eq{"cloudbrain.is_latest_version": opts.IsLatestVersion},
)
}

@@ -1056,6 +1059,11 @@ func GetCloudbrainByJobIDAndVersionName(jobID string, versionName string) (*Clou
return getRepoCloudBrain(cb)
}

// GetCloudbrainByJobIDAndIsLatestVersion builds a Cloudbrain carrying only the
// given jobID and isLatestVersion flag and passes it to getRepoCloudBrain,
// returning whatever that helper returns.
//
// NOTE(review): getRepoCloudBrain is defined elsewhere; presumably it uses the
// populated fields as lookup conditions. Confirm against its implementation.
func GetCloudbrainByJobIDAndIsLatestVersion(jobID string, isLatestVersion string) (*Cloudbrain, error) {
	return getRepoCloudBrain(&Cloudbrain{
		JobID:           jobID,
		IsLatestVersion: isLatestVersion,
	})
}

func GetCloudbrainsNeededStopByUserID(userID int64) ([]*Cloudbrain, error) {
cloudBrains := make([]*Cloudbrain, 0)
err := x.Cols("job_id", "status", "type").Where("user_id=? AND status !=?", userID, string(JobStopped)).Find(&cloudBrains)
@@ -1080,9 +1088,9 @@ func SetTrainJobStatusByJobID(jobID string, status string, duration int64, train
return
}

func SetVersionCountByJobID(jobID string, versionName string, versionCount int64) (err error) {
cb := &Cloudbrain{JobID: jobID, VersionName: versionName, VersionCount: versionCount}
_, err = x.Cols("version_Count").Where("cloudbrain.job_id=? AND cloudbrain.version_name=?", jobID, versionName).Update(cb)
// SetVersionCountAndLatestVersionByJobIDAndVersionName writes versionCount and
// isLatestVersion to the cloudbrain rows whose job_id and version_name match
// the given jobID and versionName.
//
// The update is restricted to the version_Count and is_latest_version columns.
// The first result of Update is discarded; only the error is returned.
func SetVersionCountAndLatestVersionByJobIDAndVersionName(jobID string, versionName string, versionCount int64, isLatestVersion string) (err error) {
	record := &Cloudbrain{
		JobID:           jobID,
		VersionName:     versionName,
		VersionCount:    versionCount,
		IsLatestVersion: isLatestVersion,
	}

	// Limit the written columns so the other fields of record are not persisted.
	_, err = x.
		Cols("version_Count", "is_latest_version").
		Where("cloudbrain.job_id=? AND cloudbrain.version_name=?", jobID, versionName).
		Update(record)
	return err
}



+ 1
- 0
models/models.go View File

@@ -133,6 +133,7 @@ func init() {
new(FileChunk),
new(BlockChain),
new(RecommendOrg),
new(TrainjobConfigDetail),
)

tablesStatistic = append(tablesStatistic,


+ 52
- 23
modules/modelarts/modelarts.go View File

@@ -35,16 +35,18 @@ const (
// "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," +
// "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" +
// "]}"
CodePath = "/code/"
OutputPath = "/output/"
LogPath = "/log/"
JobPath = "/job/"
OrderDesc = "desc" //向下查询
OrderAsc = "asc" //向上查询
Lines = 20
TrainUrl = "train_url"
DataUrl = "data_url"
PerPage = 10
CodePath = "/code/"
OutputPath = "/output/"
LogPath = "/log/"
JobPath = "/job/"
OrderDesc = "desc" //向下查询
OrderAsc = "asc" //向上查询
Lines = 20
TrainUrl = "train_url"
DataUrl = "data_url"
PerPage = 10
IsLatestVersion = "1"
NotLatestVersion = "0"

SortByCreateTime = "create_time"
ConfigTypeCustom = "custom"
@@ -69,6 +71,8 @@ type GenerateTrainJobReq struct {
WorkServerNumber int
EngineID int64
Parameters []models.Parameter
CommitID string
IsLatestVersion string
}

type GenerateTrainJobVersionReq struct {
@@ -86,6 +90,7 @@ type GenerateTrainJobVersionReq struct {
EngineID int64
Parameters []models.Parameter
PreVersionId int64
CommitID string
}

type VersionInfo struct {
@@ -219,17 +224,19 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error {
}

err = models.CreateCloudbrain(&models.Cloudbrain{
Status: TransTrainJobStatus(jobResult.Status),
UserID: ctx.User.ID,
RepoID: ctx.Repo.Repository.ID,
JobID: strconv.FormatInt(jobResult.JobID, 10),
JobName: req.JobName,
JobType: string(models.JobTypeTrain),
Type: models.TypeCloudBrainTwo,
VersionID: jobResult.VersionID,
VersionName: jobResult.VersionName,
Uuid: req.Uuid,
DatasetName: attach.Name,
Status: TransTrainJobStatus(jobResult.Status),
UserID: ctx.User.ID,
RepoID: ctx.Repo.Repository.ID,
JobID: strconv.FormatInt(jobResult.JobID, 10),
JobName: req.JobName,
JobType: string(models.JobTypeTrain),
Type: models.TypeCloudBrainTwo,
VersionID: jobResult.VersionID,
VersionName: jobResult.VersionName,
Uuid: req.Uuid,
DatasetName: attach.Name,
CommitID: req.CommitID,
IsLatestVersion: req.IsLatestVersion,
})

if err != nil {
@@ -282,6 +289,7 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobVersionR
VersionName: jobResult.VersionName,
Uuid: req.Uuid,
DatasetName: attach.Name,
CommitID: req.CommitID,
})
if err != nil {
log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
@@ -307,8 +315,29 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobVersionR
ctx.ServerError("Cloudbrain", err)
return nil
}
versionName := "V0001"
err = models.SetVersionCountByJobID(strconv.FormatInt(jobResult.JobID, 10), versionName, VersionListCount)

//将训练任务的上一版本的isLatestVersion设置为"0"
latestTask, err := models.GetCloudbrainByJobIDAndIsLatestVersion(strconv.FormatInt(jobResult.JobID, 10), IsLatestVersion)
if err != nil {
ctx.ServerError("GetCloudbrainByJobIDAndIsLatestVersion faild:", err)
return nil
}

// lastVersionNum := jobResult.VersionName[1:]
// lastVersionNumToInt64, err := strconv.ParseInt(lastVersionNum, 10, 64)
// if err != nil {
// ctx.ServerError("lastVersionNumToInt64 faild:", err)
// return nil
// }
// lastVersionName := "V" + strconv.FormatInt(lastVersionNumToInt64-1, 10)
err = models.SetVersionCountAndLatestVersionByJobIDAndVersionName(strconv.FormatInt(jobResult.JobID, 10), latestTask.VersionName, VersionListCount, NotLatestVersion)
if err != nil {
ctx.ServerError("UpdateJobVersionCount failed", err)
return nil
}

//将当前版本的isLatestVersion和任务数量更新
err = models.SetVersionCountAndLatestVersionByJobIDAndVersionName(strconv.FormatInt(jobResult.JobID, 10), jobResult.VersionName, VersionListCount, IsLatestVersion)
if err != nil {
ctx.ServerError("UpdateJobVersionCount failed", err)
return nil


+ 51
- 38
routers/repo/modelarts.go View File

@@ -505,10 +505,10 @@ func TrainJobIndex(ctx *context.Context) {
Page: page,
PageSize: setting.UI.IssuePagingNum,
},
RepoID: repo.ID,
Type: models.TypeCloudBrainTwo,
JobType: string(models.JobTypeTrain),
VersionName: string(models.JobVersionName),
RepoID: repo.ID,
Type: models.TypeCloudBrainTwo,
JobType: string(models.JobTypeTrain),
IsLatestVersion: modelarts.IsLatestVersion,
})
if err != nil {
ctx.ServerError("Cloudbrain", err)
@@ -614,13 +614,14 @@ func TrainJobNewVersion(ctx *context.Context) {
ctx.ServerError("get new train-job info failed", err)
return
}
ctx.HTML(200, tplModelArtsTrainJobVersionNew)
ctx.HTML(200, tplModelArtsTrainJobNew)
}

func trainJobNewVersionDataPrepare(ctx *context.Context) error {
ctx.Data["PageIsCloudBrain"] = true
var jobID = ctx.Params(":jobid")
var versionName = ctx.Query("versionName")
jobID = "19373"

t := time.Now()
var jobName = cutString(ctx.User.Name, 5) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
@@ -703,6 +704,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath
dataPath := "/" + setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + uuid + "/"
branch_name := form.BranchName
isLatestVersion := modelarts.IsLatestVersion

if err := paramCheckCreateTrainJob(form); err != nil {
log.Error("paramCheckCreateTrainJob failed:(%v)", err)
@@ -723,6 +725,9 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
os.RemoveAll(codeLocalPath)
}

gitRepo, _ := git.OpenRepository(repo.RepoPath())
commitID, _ := gitRepo.GetBranchCommitID(branch_name)

if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{
Branch: branch_name,
}); err != nil {
@@ -841,6 +846,8 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
PoolID: poolID,
Uuid: uuid,
Parameters: parameters.Parameter,
CommitID: commitID,
IsLatestVersion: isLatestVersion,
}

err = modelarts.GenerateTrainJob(ctx, req)
@@ -862,6 +869,9 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
ctx.Data["PageIsTrainJob"] = true
var jobID = ctx.Params(":jobid")
var versionName = ctx.Query("versionName")
jobID = "19373"
versionName = "V0009"

jobName := form.JobName
uuid := form.Attachment
description := form.Description
@@ -883,7 +893,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
if err := paramCheckCreateTrainJob(form); err != nil {
log.Error("paramCheckCreateTrainJob failed:(%v)", err)
trainJobNewVersionDataPrepare(ctx)
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
return
}

@@ -899,6 +909,8 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
os.RemoveAll(codeLocalPath)
}

gitRepo, _ := git.OpenRepository(repo.RepoPath())
commitID, _ := gitRepo.GetBranchCommitID(branch_name)
if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{
Branch: branch_name,
}); err != nil {
@@ -911,7 +923,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
ctx.Data["params"] = form.Params
ctx.Data["branch_name"] = branch_name
// ctx.RenderWithErr("Failed to clone repository", tplModelArtsTrainJobNew, &form)
ctx.RenderWithErr("创建任务失败,任务名称已存在!", tplModelArtsTrainJobVersionNew, &form)
ctx.RenderWithErr("创建任务失败,任务名称已存在!", tplModelArtsTrainJobNew, &form)
// ctx.RenderWithErr(err, tplModelArtsTrainJobNew, &form)
return
}
@@ -920,21 +932,21 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil {
log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
trainJobNewVersionDataPrepare(ctx)
ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobVersionNew, &form)
ctx.RenderWithErr("Failed to obsMkdir_output", tplModelArtsTrainJobNew, &form)
return
}

if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath); err != nil {
log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
trainJobNewVersionDataPrepare(ctx)
ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobVersionNew, &form)
ctx.RenderWithErr("Failed to obsMkdir_log", tplModelArtsTrainJobNew, &form)
return
}

if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
trainJobNewVersionDataPrepare(ctx)
ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobVersionNew, &form)
ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobNew, &form)
return
}

@@ -954,7 +966,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
if err != nil {
log.Error("Failed to Unmarshal params: %s (%v)", params, err)
trainJobNewVersionDataPrepare(ctx)
ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobVersionNew, &form)
ctx.RenderWithErr("运行参数错误", tplModelArtsTrainJobNew, &form)
return
}

@@ -973,7 +985,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
if form.ParameterTemplateName == "" {
log.Error("ParameterTemplateName is empty")
trainJobNewVersionDataPrepare(ctx)
ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobVersionNew, &form)
ctx.RenderWithErr("保存作业参数时,作业参数名称不能为空", tplModelArtsTrainJobNew, &form)
return
}

@@ -997,7 +1009,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
if err != nil {
log.Error("Failed to CreateTrainJobConfig: %v", err)
trainJobNewVersionDataPrepare(ctx)
ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobVersionNew, &form)
ctx.RenderWithErr("保存作业参数失败:"+err.Error(), tplModelArtsTrainJobNew, &form)
return
}
}
@@ -1006,7 +1018,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
if err != nil {
log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
return
}
req := &modelarts.GenerateTrainJobVersionReq{
@@ -1024,6 +1036,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
Uuid: uuid,
Parameters: parameters.Parameter,
PreVersionId: task.VersionID,
CommitID: commitID,
}
err = modelarts.GenerateTrainJobVersion(ctx, req, jobID)
if err != nil {
@@ -1036,32 +1049,32 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobVersionNew, &form)
return
}
//保存openi创建训练任务界面的参数
// err = models.CreateTrainjobConfigDetail(&models.TrainjobConfigDetail{
// 保存openi创建训练任务界面的参数
err = models.CreateTrainjobConfigDetail(&models.TrainjobConfigDetail{

// JobName: req.JobName,
// ResourcePools: form.PoolID,
// EngineVersions: form.EngineID,
// FlavorInfos: form.Flavor,
// TrainUrl: outputObsPath,
// BootFile: form.BootFile,
// Uuid: form.Attachment,
// DatasetName: attach.Name,
// Params: form.Params,
// BranchName: branch_name,
// })
JobName: req.JobName,
ResourcePools: form.PoolID,
EngineVersions: form.EngineID,
FlavorInfos: form.Flavor,
TrainUrl: outputObsPath,
BootFile: form.BootFile,
Uuid: form.Attachment,
DatasetName: attach.Name,
Params: form.Params,
BranchName: branch_name,
})

// if err != nil {
// log.Error("CreateTrainjobConfigDetail failed:%v", err.Error())
// trainJobNewVersionDataPrepare(ctx)
// ctx.Data["bootFile"] = form.BootFile
// ctx.Data["uuid"] = form.Attachment
// ctx.Data["datasetName"] = attach.Name
// ctx.Data["params"] = form.Params
// ctx.Data["branch_name"] = branch_name
// ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
// return
// }
if err != nil {
log.Error("CreateTrainjobConfigDetail failed:%v", err.Error())
trainJobNewVersionDataPrepare(ctx)
ctx.Data["bootFile"] = form.BootFile
ctx.Data["uuid"] = form.Attachment
ctx.Data["datasetName"] = attach.Name
ctx.Data["params"] = form.Params
ctx.Data["branch_name"] = branch_name
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
return
}
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
}



+ 3
- 0
routers/routes/routes.go View File

@@ -999,6 +999,9 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Get("/create", reqRepoCloudBrainReader, repo.TrainJobNew)
m.Post("/create", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreate)

// m.Get("/create", reqRepoCloudBrainReader, repo.TrainJobNewVersion)
// m.Post("/create", reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreateVersion)

m.Get("/para-config-list", reqRepoCloudBrainReader, repo.TrainJobGetConfigList)
})
}, context.RepoRef())


Loading…
Cancel
Save