Browse Source

notebook api

tags/v1.21.12.2^2
lewis 3 years ago
parent
commit
74ad6132b3
4 changed files with 79 additions and 60 deletions
  1. +1
    -1
      models/cloudbrain.go
  2. +1
    -1
      modules/modelarts/modelarts.go
  3. +6
    -2
      routers/repo/cloudbrain.go
  4. +71
    -56
      routers/repo/modelarts.go

+ 1
- 1
models/cloudbrain.go View File

@@ -105,7 +105,7 @@ type Cloudbrain struct {
IsLatestVersion string //是否是最新版本,1是,0否
CommitID string //提交的仓库代码id
PreVersionName string //父版本名称
ComputeResource string //计算资源,例如npu
ComputeResource string `xorm:"-"` //计算资源,例如npu
EngineID int64 //引擎id

TrainUrl string //输出的obs路径


+ 1
- 1
modules/modelarts/modelarts.go View File

@@ -277,7 +277,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error
DatasetName: attach.Name,
CommitID: req.CommitID,
IsLatestVersion: req.IsLatestVersion,
ComputeResource: NPUResource,
//ComputeResource: NPUResource,
EngineID: req.EngineID,
TrainUrl: req.TrainUrl,
BranchName: req.BranchName,


+ 6
- 2
routers/repo/cloudbrain.go View File

@@ -256,13 +256,17 @@ func CloudBrainRestart(ctx *context.Context) {
3、更新此任务的状态
*/

//todo: 是否启用事务?处理时间太长,容易卡住
/*
todo: 是否启用事务?
启用:处理时间可能比较长,容易出现大事务
不启用:容易出现脏数据
*/

var jobID = ctx.Params(":jobid")
task, err := models.GetCloudbrainByJobID(jobID)
if err != nil {
log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error(), ctx.Data["MsgID"])
ctx.RenderWithErr(err.Error(), tplCloudBrainIndex, nil)
ctx.RenderWithErr(err.Error(), tplDebugJobIndex, nil)
return
}



+ 71
- 56
routers/repo/modelarts.go View File

@@ -73,17 +73,14 @@ func DebugJobIndex(ctx *context.Context) {
}

for i, task := range ciTasks {
ciTasks[i].CanDebug = cloudbrain.CanCreateOrDebugJob(ctx)
ciTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)

if task.Cloudbrain.Type == models.TypeCloudBrainOne {
ciTasks[i].CanDebug = cloudbrain.CanCreateOrDebugJob(ctx)
ciTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
ciTasks[i].Cloudbrain.ComputeResource = modelarts.GPUResource
}
if task.Cloudbrain.Type == models.TypeCloudBrainTwo {
ciTasks[i].CanDebug = cloudbrain.CanCreateOrDebugJob(ctx)
ciTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
} else if task.Cloudbrain.Type == models.TypeCloudBrainTwo {
ciTasks[i].Cloudbrain.ComputeResource = modelarts.NPUResource
}

}

pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5)
@@ -235,65 +232,82 @@ func NotebookDebug(ctx *context.Context) {
// NotebookManage applies the action named by the ":action" route parameter to
// the notebook job named by ":jobid". It looks the job up with
// models.GetCloudbrainByJobID, checks the job's status against the requested
// action, forwards the action to modelarts.ManageNotebook, and stores the
// returned status with models.UpdateJob.
//
// NOTE(review): this listing appears to be a rendered diff with the +/- markers
// stripped, so two variants of the body are interleaved below: one reports
// failures via ctx.RenderWithErr and returns (ending in ctx.Redirect), the
// other records resultCode/errorMsg, breaks out of a for-loop and answers with
// ctx.JSON. Presumably the JSON variant is the added side; confirm against the
// real file before relying on either.
func NotebookManage(ctx *context.Context) {
var jobID = ctx.Params(":jobid")
var action = ctx.Params(":action")
// resultCode stays "0" unless a failure path below sets it to "-1"; it is
// returned together with errorMsg in the JSON response at the end.
var resultCode = "0"
var errorMsg = ""

task, err := models.GetCloudbrainByJobID(jobID)
if err != nil {
log.Error("GetCloudbrainByJobID failed:%v", err, ctx.Data["MsgID"])
ctx.RenderWithErr("system error", tplDebugJobIndex, nil)
return
}

if action == models.ActionStop {
if task.Status != string(models.ModelArtsRunning) {
log.Error("the job(%s) is not running", task.JobName, ctx.Data["MsgID"])
ctx.RenderWithErr("the job is not running", tplDebugJobIndex, nil)
return
// NOTE(review): the loop looks like a break-on-error wrapper so that every
// failure path falls through to the single ctx.JSON call; confirm.
for {
task, err := models.GetCloudbrainByJobID(jobID)
if err != nil {
log.Error("GetCloudbrainByJobID failed:%v", err, ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "system error"
break
}
} else if action == models.ActionRestart {
if task.Status != string(models.ModelArtsStopped) && task.Status != string(models.ModelArtsStartFailed) && task.Status != string(models.ModelArtsCreateFailed) {
log.Error("the job(%s) is not stopped", task.JobName, ctx.Data["MsgID"])
ctx.RenderWithErr("the job is not stopped", tplDebugJobIndex, nil)
return

// A stop request is rejected unless the job status is ModelArtsRunning.
if action == models.ActionStop {
if task.Status != string(models.ModelArtsRunning) {
log.Error("the job(%s) is not running", task.JobName, ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "the job is not running"
break
}
} else if action == models.ActionRestart {
// A restart request is rejected unless the job status is one of
// ModelArtsStopped, ModelArtsStartFailed or ModelArtsCreateFailed.
if task.Status != string(models.ModelArtsStopped) && task.Status != string(models.ModelArtsStartFailed) && task.Status != string(models.ModelArtsCreateFailed) {
log.Error("the job(%s) is not stopped", task.JobName, ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "the job is not stopped"
break
}

// A restart is also refused when the notebook count returned for the
// current user is already 1 or more.
count, err := models.GetCloudbrainNotebookCountByUserID(ctx.User.ID)
if err != nil {
log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "system error"
break
} else {
if count >= 1 {
log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "you have already a running or waiting task, can not create more"
break
}
}

// The request sent to ManageNotebook carries ActionStart in place of
// ActionRestart.
action = models.ActionStart
} else {
log.Error("the action(%s) is illegal", action, ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "非法操作"
break
}

count, err := models.GetCloudbrainNotebookCountByUserID(ctx.User.ID)
param := models.NotebookAction{
Action: action,
}
res, err := modelarts.ManageNotebook(jobID, param)
if err != nil {
log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"])
ctx.RenderWithErr("system error", tplDebugJobIndex, nil)
return
} else {
if count >= 1 {
log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplDebugJobIndex, nil)
return
}
log.Error("ManageNotebook(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "启动失败"
break
}

action = models.ActionStart
} else {
log.Error("the action(%s) is illegal", action, ctx.Data["MsgID"])
ctx.RenderWithErr("非法操作", tplDebugJobIndex, nil)
return
// Persist the status reported back by ManageNotebook.
task.Status = res.CurrentStatus
err = models.UpdateJob(task)
if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "system error"
break
}
// NOTE(review): no `break` is visible between a successful UpdateJob and the
// closing brace below. If that brace closes the `for`, the success path runs
// the loop again and would then exit through one of the error branches above;
// confirm against the real file.
}

param := models.NotebookAction{
Action: action,
}
res, err := modelarts.ManageNotebook(jobID, param)
if err != nil {
log.Error("ManageNotebook(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
ctx.RenderWithErr("启动失败", tplDebugJobIndex, nil)
return
}

task.Status = res.CurrentStatus
err = models.UpdateJob(task)
if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
ctx.RenderWithErr("system error", tplDebugJobIndex, nil)
return
}
// NOTE(review): ctx.Redirect and ctx.JSON both appear here, presumably one from
// each side of the diff. The JSON form answers with HTTP 200 and the
// result_code / error_msg pair collected above.
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=all")
ctx.JSON(200, map[string]string{
"result_code": resultCode,
"error_msg": errorMsg,
})
}

func NotebookDel(ctx *context.Context) {
@@ -353,6 +367,7 @@ func TrainJobIndex(ctx *context.Context) {
for i, task := range tasks {
tasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain)
tasks[i].CanModify = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain)
tasks[i].ComputeResource = modelarts.NPUResource
}

pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5)


Loading…
Cancel
Save