Browse Source

debug

fix-2419
lewis 3 years ago
parent
commit
7fd4bef198
6 changed files with 121 additions and 87 deletions
  1. +6
    -1
      models/cloudbrain.go
  2. +95
    -23
      modules/modelarts/modelarts.go
  3. +2
    -1
      modules/modelarts/resty.go
  4. +2
    -15
      routers/api/v1/repo/modelarts.go
  5. +8
    -26
      routers/repo/cloudbrain.go
  6. +8
    -21
      routers/repo/modelarts.go

+ 6
- 1
models/cloudbrain.go View File

@@ -1275,7 +1275,7 @@ type JobVersionList struct {


type GetTrainJobVersionListResult struct { type GetTrainJobVersionListResult struct {
ErrorResult ErrorResult
JobID string `json:"job_id"`
JobID int64 `json:"job_id"`
JobName string `json:"job_name"` JobName string `json:"job_name"`
JobDesc string `json:"job_desc"` JobDesc string `json:"job_desc"`
VersionCount int64 `json:"version_count"` VersionCount int64 `json:"version_count"`
@@ -2169,3 +2169,8 @@ func GetCloudbrainByIDs(ids []int64) ([]*Cloudbrain, error) {
In("id", ids). In("id", ids).
Find(&cloudbrains) Find(&cloudbrains)
} }

// GetCloudbrainCountByJobName counts the Cloudbrain rows whose job_name and
// job_type columns equal the given values.
//
// It returns the row count converted to int together with whatever error the
// query produced; the count is passed through unchanged even when err is
// non-nil.
func GetCloudbrainCountByJobName(jobName, jobType string) (int, error) {
	query := x.Where("job_name = ? and job_type= ?", jobName, jobType)
	total, err := query.Count(new(Cloudbrain))
	return int(total), err
}

+ 95
- 23
modules/modelarts/modelarts.go View File

@@ -509,7 +509,7 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job
RepoID: repo.ID, RepoID: repo.ID,
Type: models.TypeCloudBrainTwo, Type: models.TypeCloudBrainTwo,
JobTypes: jobTypes, JobTypes: jobTypes,
JobName: req.JobName,
JobID: jobId,
}) })
if err != nil { if err != nil {
ctx.ServerError("Cloudbrain", err) ctx.ServerError("Cloudbrain", err)
@@ -519,10 +519,10 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job


createTime := timeutil.TimeStampNow() createTime := timeutil.TimeStampNow()
task := &models.Cloudbrain{ task := &models.Cloudbrain{
Status: string(models.ModelArtsTrainJobWaiting),
Status: models.JobStatusTemp,
UserID: ctx.User.ID, UserID: ctx.User.ID,
RepoID: ctx.Repo.Repository.ID, RepoID: ctx.Repo.Repository.ID,
JobID: models.TempJobIdPrefix + jobId,
JobID: jobId,
JobName: req.JobName, JobName: req.JobName,
DisplayJobName: req.DisplayJobName, DisplayJobName: req.DisplayJobName,
JobType: string(models.JobTypeTrain), JobType: string(models.JobTypeTrain),
@@ -846,17 +846,17 @@ func GetNotebookImageName(imageId string) (string, error) {
} }


func HandleTrainJobInfo(task *models.Cloudbrain) error { func HandleTrainJobInfo(task *models.Cloudbrain) error {
if strings.HasPrefix(task.JobID, models.TempJobIdPrefix) {
if isTempJob(task.JobID, task.Status) {
if task.VersionCount > VersionCountOne { if task.VersionCount > VersionCountOne {
//multi version //multi version
result, err := GetTrainJobVersionList(1000, 1, strings.TrimPrefix(task.JobID, models.TempJobIdPrefix)) result, err := GetTrainJobVersionList(1000, 1, strings.TrimPrefix(task.JobID, models.TempJobIdPrefix))
if err != nil { if err != nil {
log.Error("GetTrainJobVersionList(%s) failed:%v", task.JobName, err)
log.Error("GetTrainJobVersionList failed:%v", err)
return err return err
} }


if result != nil { if result != nil {
if result.JobID == task.JobID && result.JobName == task.JobName {
if strconv.FormatInt(result.JobID, 10) == task.JobID && result.JobName == task.JobName {
if result.VersionCount == int64(task.VersionCount) { if result.VersionCount == int64(task.VersionCount) {
log.Info("find the record(%s)", task.DisplayJobName) log.Info("find the record(%s)", task.DisplayJobName)
task.Status = TransTrainJobStatus(result.JobVersionList[0].IntStatus) task.Status = TransTrainJobStatus(result.JobVersionList[0].IntStatus)
@@ -871,13 +871,13 @@ func HandleTrainJobInfo(task *models.Cloudbrain) error {
temp, err := models.GetCloudbrainTempByCloudbrainID(task.ID) temp, err := models.GetCloudbrainTempByCloudbrainID(task.ID)
if err != nil { if err != nil {
log.Error("no such temp record(%s):%v", task.DisplayJobName, err.Error()) log.Error("no such temp record(%s):%v", task.DisplayJobName, err.Error())
return err
}
err = models.DeleteCloudbrainTemp(temp)
if err != nil {
log.Error("DeleteCloudbrainTemp(%s) failed:%v", task.DisplayJobName, err)
return err
} else {
err = models.DeleteCloudbrainTemp(temp)
if err != nil {
log.Error("DeleteCloudbrainTemp(%s) failed:%v", task.DisplayJobName, err)
}
} }

return nil return nil
} else { } else {
log.Error("can not find the record(%s) until now", task.DisplayJobName) log.Error("can not find the record(%s) until now", task.DisplayJobName)
@@ -890,16 +890,14 @@ func HandleTrainJobInfo(task *models.Cloudbrain) error {
//inference or one version //inference or one version
result, err := GetTrainJobList(1000, 1, "create_time", "desc", task.JobName) result, err := GetTrainJobList(1000, 1, "create_time", "desc", task.JobName)
if err != nil { if err != nil {
log.Error("GetTrainJobList(%s) failed:%v", task.DisplayJobName, err)
log.Error("GetTrainJobList failed:%v", err)
return err return err
} }


if result != nil { if result != nil {
isExist := false
for _, job := range result.JobList { for _, job := range result.JobList {
if task.JobName == job.JobName { if task.JobName == job.JobName {
log.Info("find the record(%s)", task.DisplayJobName) log.Info("find the record(%s)", task.DisplayJobName)
isExist = true
task.Status = TransTrainJobStatus(job.IntStatus) task.Status = TransTrainJobStatus(job.IntStatus)
task.JobID = strconv.FormatInt(job.JobID, 10) task.JobID = strconv.FormatInt(job.JobID, 10)


@@ -921,14 +919,6 @@ func HandleTrainJobInfo(task *models.Cloudbrain) error {
return nil return nil
} }
} }

//todo: move
if !isExist {
log.Error("can not find the record(%s) until now", task.DisplayJobName)
//temp.QueryTimes = temp.QueryTimes + 1
} else {
log.Info("find the record(%s)", task.DisplayJobName)
}
} }


} }
@@ -963,3 +953,85 @@ func HandleTrainJobInfo(task *models.Cloudbrain) error {


return nil return nil
} }

// HandleNotebookInfo refreshes the stored status of a notebook task from the
// remote notebook service and persists the result with models.UpdateJob.
//
// Two cases are handled, selected by isTempJob(task.JobID, task.Status):
//
//   - Normal task: the notebook is fetched by task.JobID via GetNotebook2 and
//     the task's Status, StartTime, EndTime and duration are updated.
//   - Temporary task: the notebook list is searched by task.JobName. When a
//     matching notebook is found, its status and job ID are copied onto the
//     task, the task is saved, and the associated temp record is deleted.
//     When no match is found yet, an error is logged and nil is returned so
//     the caller can try again later.
//
// A non-nil error is returned only when one of the remote or database calls
// fails; each such failure is also logged here.
func HandleNotebookInfo(task *models.Cloudbrain) error {
	if !isTempJob(task.JobID, task.Status) {
		//normal
		result, err := GetNotebook2(task.JobID)
		if err != nil {
			log.Error("GetNotebook2(%s) failed:%v", task.DisplayJobName, err)
			return err
		}
		if result == nil {
			return nil
		}

		task.Status = result.Status
		if task.StartTime == 0 && result.Lease.UpdateTime > 0 {
			// NOTE(review): the division by 1000 suggests UpdateTime is in
			// milliseconds — confirm against the API.
			task.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000)
		}
		if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) {
			task.EndTime = timeutil.TimeStampNow()
		}
		task.CorrectCreateUnix()
		task.ComputeAndSetDuration()
		if err = models.UpdateJob(task); err != nil {
			log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err)
			return err
		}
		return nil
	}

	// Temporary record: look the notebook up by name (limit 1000, offset 0).
	result, err := GetNotebookList(1000, 0, "createTime", "DESC", task.JobName)
	if err != nil {
		log.Error("GetNotebookList failed:%v", err)
		return err
	}
	if result == nil {
		log.Error("can not find the record(%s) until now", task.DisplayJobName)
		return nil
	}

	count, err := models.GetCloudbrainCountByJobName(task.JobName, task.JobType)
	if err != nil {
		log.Error("GetCloudbrainCountByJobName failed:%v", err)
		return err
	}

	// The emptiness check must come first: with an empty list and a zero
	// count the length comparison alone would pass and indexing element 0
	// would panic.
	// NOTE(review): comparing the remote list length with the local record
	// count presumably ensures the newest remote notebook belongs to this
	// record — confirm the intent.
	notebooks := result.NotebookList
	if len(notebooks) == 0 || len(notebooks) != count || notebooks[0].JobName != task.JobName {
		log.Error("can not find the record(%s) until now", task.DisplayJobName)
		return nil
	}

	log.Info("find the record(%s)", task.DisplayJobName)
	task.Status = notebooks[0].Status
	task.JobID = notebooks[0].JobID

	if err = models.UpdateJob(task); err != nil {
		log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
		return err
	}

	// The task now carries its real job ID, so the temp record is obsolete.
	temp, err := models.GetCloudbrainTempByCloudbrainID(task.ID)
	if err != nil {
		log.Error("no such temp record(%s):%v", task.DisplayJobName, err.Error())
		return err
	}
	if err = models.DeleteCloudbrainTemp(temp); err != nil {
		log.Error("DeleteCloudbrainTemp(%s) failed:%v", task.DisplayJobName, err)
		return err
	}

	return nil
}

// isTempJob reports whether a task is still a temporary placeholder record.
//
// A task is temporary when its status is models.JobStatusTemp, or when its
// status is models.ModelArtsTrainJobWaiting and its job ID starts with
// models.TempJobIdPrefix.
func isTempJob(jobID, status string) bool {
	if status == models.JobStatusTemp {
		return true
	}
	return status == string(models.ModelArtsTrainJobWaiting) &&
		strings.HasPrefix(jobID, models.TempJobIdPrefix)
}

+ 2
- 1
modules/modelarts/resty.go View File

@@ -1379,7 +1379,7 @@ sendjob:
return &result, nil return &result, nil
} }


func GetNotebookList(limit, page int, sortBy, order, searchContent, status string) (*models.GetNotebookListResult, error) {
func GetNotebookList(limit, offset int, sortBy, order, searchContent string) (*models.GetNotebookListResult, error) {
checkSetting() checkSetting()
client := getRestyClient() client := getRestyClient()
var result models.GetNotebookListResult var result models.GetNotebookListResult
@@ -1390,6 +1390,7 @@ sendjob:
res, err := client.R(). res, err := client.R().
SetQueryParams(map[string]string{ SetQueryParams(map[string]string{
"limit": strconv.Itoa(limit), "limit": strconv.Itoa(limit),
"offset": strconv.Itoa(offset),
"name": searchContent, "name": searchContent,
"sort_key": sortBy, "sort_key": sortBy,
"sort_dir": order, "sort_dir": order,


+ 2
- 15
routers/api/v1/repo/modelarts.go View File

@@ -36,29 +36,16 @@ func GetModelArtsNotebook2(ctx *context.APIContext) {
ctx.NotFound(err) ctx.NotFound(err)
return return
} }
result, err := modelarts.GetNotebook2(job.JobID)
err = modelarts.HandleNotebookInfo(job)
if err != nil { if err != nil {
ctx.NotFound(err) ctx.NotFound(err)
return return
} }
if job.StartTime == 0 && result.Lease.UpdateTime > 0 {
job.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000)
}
job.Status = result.Status
if job.EndTime == 0 && models.IsModelArtsDebugJobTerminal(job.Status) {
job.EndTime = timeutil.TimeStampNow()
}
job.CorrectCreateUnix()
job.ComputeAndSetDuration()
err = models.UpdateJob(job)
if err != nil {
log.Error("UpdateJob failed:", err)
}


ctx.JSON(http.StatusOK, map[string]interface{}{ ctx.JSON(http.StatusOK, map[string]interface{}{
"ID": ID, "ID": ID,
"JobName": job.JobName, "JobName": job.JobName,
"JobStatus": result.Status,
"JobStatus": job.Status,
}) })


} }


+ 8
- 26
routers/repo/cloudbrain.go View File

@@ -368,7 +368,6 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
} }
} }



func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBrainInferencForm) { func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBrainInferencForm) {
ctx.Data["PageIsCloudBrain"] = true ctx.Data["PageIsCloudBrain"] = true
displayJobName := form.DisplayJobName displayJobName := form.DisplayJobName
@@ -489,6 +488,7 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBra
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/inference-job") ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/inference-job")


} }

/** /**
检查用户传输的参数是否符合专属资源池 检查用户传输的参数是否符合专属资源池
*/ */
@@ -1706,42 +1706,24 @@ func SyncCloudbrainStatus() {
} }
} else if task.Type == models.TypeCloudBrainTwo { } else if task.Type == models.TypeCloudBrainTwo {
if task.JobType == string(models.JobTypeDebug) { if task.JobType == string(models.JobTypeDebug) {
//result, err := modelarts.GetJob(task.JobID)
result, err := modelarts.GetNotebook2(task.JobID)
err := modelarts.HandleNotebookInfo(task)
if err != nil { if err != nil {
log.Error("GetJob(%s) failed:%v", task.JobName, err)
log.Error("HandleNotebookInfo(%s) failed:%v", task.DisplayJobName, err)
continue continue
} }

if result != nil {
task.Status = result.Status
if task.StartTime == 0 && result.Lease.UpdateTime > 0 {
task.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000)
}
if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) {
task.EndTime = timeutil.TimeStampNow()
}
task.CorrectCreateUnix()
task.ComputeAndSetDuration()
err = models.UpdateJob(task)
if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
continue
}
}
} else if task.JobType == string(models.JobTypeTrain) || task.JobType == string(models.JobTypeInference) { } else if task.JobType == string(models.JobTypeTrain) || task.JobType == string(models.JobTypeInference) {
err := modelarts.HandleTrainJobInfo(task) err := modelarts.HandleTrainJobInfo(task)
if err != nil { if err != nil {
log.Error("HandleTrainJobInfo(%s) failed:%v", task.JobName, err)
log.Error("HandleTrainJobInfo(%s) failed:%v", task.DisplayJobName, err)
continue continue
} }
} else { } else {
log.Error("task.JobType(%s) is error:%s", task.JobName, task.JobType)
log.Error("task.JobType(%s) is error:%s", task.DisplayJobName, task.JobType)
} }
} else if task.Type == models.TypeC2Net { } else if task.Type == models.TypeC2Net {
result, err := grampus.GetJob(task.JobID) result, err := grampus.GetJob(task.JobID)
if err != nil { if err != nil {
log.Error("GetTrainJob(%s) failed:%v", task.JobName, err)
log.Error("GetTrainJob(%s) failed:%v", task.DisplayJobName, err)
continue continue
} }


@@ -1762,12 +1744,12 @@ func SyncCloudbrainStatus() {
task.CorrectCreateUnix() task.CorrectCreateUnix()
err = models.UpdateJob(task) err = models.UpdateJob(task)
if err != nil { if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err)
continue continue
} }
} }
} else { } else {
log.Error("task.Type(%s) is error:%d", task.JobName, task.Type)
log.Error("task.Type(%s) is error:%d", task.DisplayJobName, task.Type)
} }
} }




+ 8
- 21
routers/repo/modelarts.go View File

@@ -263,28 +263,15 @@ func NotebookShow(ctx *context.Context) {
return return
} }


result, err := modelarts.GetNotebook2(task.JobID)
if err != nil {
ctx.Data["error"] = err.Error()
ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil)
return
}

if result != nil {
if task.DeletedAt.IsZero() { //normal record
if task.Status != result.Status {
task.Status = result.Status
models.ParseAndSetDurationFromModelArtsNotebook(result, task)
err = models.UpdateJob(task)
if err != nil {
ctx.Data["error"] = err.Error()
ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil)
return
}
}
} else { //deleted record

if task.DeletedAt.IsZero() { //normal record
err := modelarts.HandleNotebookInfo(task)
if err != nil {
ctx.Data["error"] = err.Error()
ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil)
return
} }
} else { //deleted record

} }


datasetDownload := make([]models.DatasetDownload, 0) datasetDownload := make([]models.DatasetDownload, 0)


Loading…
Cancel
Save