Browse Source

debug

fix-2419
lewis 3 years ago
parent
commit
aabb0c3852
4 changed files with 125 additions and 27 deletions
  1. +0
    -2
      models/cloudbrain.go
  2. +25
    -0
      models/cloudbrain_temp.go
  3. +99
    -24
      modules/modelarts/modelarts.go
  4. +1
    -1
      modules/modelarts/resty.go

+ 0
- 2
models/cloudbrain.go View File

@@ -1200,7 +1200,6 @@ type JobList struct {
VersionCount int64 `json:"version_count"`
Description string `json:"job_desc"`
IntStatus int `json:"status"`
Status string
}

type GetTrainJobListResult struct {
@@ -1215,7 +1214,6 @@ type JobVersionList struct {
VersionName string `json:"version_name"`
VersionID int64 `json:"version_id"`
IntStatus int `json:"status"`
Status string
}

type GetTrainJobVersionListResult struct {


+ 25
- 0
models/cloudbrain_temp.go View File

@@ -17,6 +17,7 @@ type CloudbrainTemp struct {
Type int
JobType string `xorm:"INDEX NOT NULL DEFAULT 'DEBUG'"`
Status string `xorm:"INDEX NOT NULL DEFAULT 'TEMP'"`
VersionCount int `xorm:"NOT NULL DEFAULT 0"`
QueryTimes int `xorm:"INDEX NOT NULL DEFAULT 0"`
CreatedUnix timeutil.TimeStamp `xorm:"INDEX"`
UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"`
@@ -30,3 +31,27 @@ func InsertCloudbrainTemp(temp *CloudbrainTemp) (err error) {

return nil
}

// getCloudBrainTemp loads a CloudbrainTemp record through x.Get, passing
// temp as the bean to be looked up and filled.
//
// It returns the error from x.Get unchanged, ErrJobNotExist{} when no record
// was found, and otherwise the same temp pointer that was passed in.
func getCloudBrainTemp(temp *CloudbrainTemp) (*CloudbrainTemp, error) {
	found, err := x.Get(temp)
	switch {
	case err != nil:
		return nil, err
	case !found:
		return nil, ErrJobNotExist{}
	default:
		return temp, nil
	}
}

// GetCloudbrainTempByCloudbrainID returns the CloudbrainTemp record looked up
// with CloudbrainID set to id, delegating the query and the not-found handling
// to getCloudBrainTemp.
//
// NOTE(review): the lookup relies on the bean's populated fields; presumably a
// zero id would not act as a filter — confirm callers never pass 0.
func GetCloudbrainTempByCloudbrainID(id int64) (*CloudbrainTemp, error) {
	return getCloudBrainTemp(&CloudbrainTemp{CloudbrainID: id})
}

// DeleteCloudbrainTemp deletes the given temp record using the package-level
// engine x; the actual delete is performed by deleteCloudbrainTemp.
func DeleteCloudbrainTemp(temp *CloudbrainTemp) error {
	err := deleteCloudbrainTemp(x, temp)
	return err
}

// deleteCloudbrainTemp issues a delete through e, restricted to rows whose
// cloudbrain_id equals temp.CloudbrainID. The affected-row count is discarded;
// only the error from the delete is reported.
func deleteCloudbrainTemp(e Engine, temp *CloudbrainTemp) error {
	session := e.Where("cloudbrain_id = ?", temp.CloudbrainID)
	if _, err := session.Delete(temp); err != nil {
		return err
	}
	return nil
}

+ 99
- 24
modules/modelarts/modelarts.go View File

@@ -768,43 +768,118 @@ func GetNotebookImageName(imageId string) (string, error) {
return imageName, nil
}

// NOTE(review): this span looks like a rendered diff hunk without +/- markers:
// lines of the earlier ProcessTrainJobInfo and of the newer HandleTrainJobInfo
// appear interleaved (two func headers, two copies of the GetTrainJob handling),
// so it is not compilable as shown — confirm against the repository.
//
// HandleTrainJobInfo refreshes the given train-job task.
//
// When task.JobID starts with models.TempJobIdPrefix, the job is looked up
// remotely: through GetTrainJobVersionList (with the prefix trimmed from the
// JobID) if task.VersionCount > VersionCountOne, otherwise through
// GetTrainJobList searched by task.JobName. On a match the task is saved with
// models.UpdateJob and its CloudbrainTemp record is fetched and deleted.
// When the JobID has no such prefix, GetTrainJob is called and status, duration,
// start time and end time are copied onto the task before models.UpdateJob.
//
// Any error from the lookups, UpdateJob, or the temp-record handling is logged
// and returned.
func ProcessTrainJobInfo(task *models.Cloudbrain) error {
func HandleTrainJobInfo(task *models.Cloudbrain) error {
if strings.HasPrefix(task.JobID, models.TempJobIdPrefix) {

if task.VersionCount > VersionCountOne {
//multi version
result, err := GetTrainJobVersionList(1000, 1, strings.TrimPrefix(task.JobID, models.TempJobIdPrefix))
if err != nil {
log.Error("GetTrainJobVersionList(%s) failed:%v", task.JobName, err)
return err
}

if result != nil {
//todo: get the cb record count and check the count ==
// The record counts as found only when the number of listed versions
// equals task.VersionCount.
if len(result.JobVersionList) == task.VersionCount {
log.Info("find the record(%s)", task.DisplayJobName)
// Status and version fields are taken from the first list entry
// (presumably the most recent version — confirm the list ordering).
task.Status = TransTrainJobStatus(result.JobVersionList[0].IntStatus)
task.VersionName = result.JobVersionList[0].VersionName
task.VersionID = result.JobVersionList[0].VersionID
err = models.UpdateJob(task)
if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
return err
}
// The task has been saved, so its temp record is fetched and deleted.
temp, err := models.GetCloudbrainTempByCloudbrainID(task.ID)
if err != nil {
log.Error("no such temp record(%s):%v", task.DisplayJobName, err.Error())
return err
}
err = models.DeleteCloudbrainTemp(temp)
if err != nil {
log.Error("DeleteCloudbrainTemp(%s) failed:%v", task.DisplayJobName, err)
return err
}
return nil
} else {
log.Error("can not find the record(%s) until now", task.DisplayJobName)
}

}
} else {
//inference or one version
result, err := GetTrainJobList(1000, 1, "create_time", "desc", task.JobName)
if err != nil {
log.Error("GetTrainJobList(%s) failed:%v", task.DisplayJobName, err)
return err
}

if result != nil {
isExist := false
// Only an entry whose JobName equals task.JobName is accepted.
for _, job := range result.JobList {
if task.JobName == job.JobName {
log.Info("find the record(%s)", task.DisplayJobName)
isExist = true
task.Status = TransTrainJobStatus(job.IntStatus)
// The temp-prefixed JobID is replaced by the decimal form of the listed job's ID.
task.JobID = strconv.FormatInt(job.JobID, 10)
err = models.UpdateJob(task)
if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err)
return err
}
temp, err := models.GetCloudbrainTempByCloudbrainID(task.ID)
if err != nil {
log.Error("no such temp record(%s):%v", task.DisplayJobName, err.Error())
return err
}
err = models.DeleteCloudbrainTemp(temp)
if err != nil {
log.Error("DeleteCloudbrainTemp(%s) failed:%v", task.DisplayJobName, err)
return err
}
return nil
}
}

//todo: move
// NOTE(review): every match returns from inside the loop above, so isExist
// is still false whenever this point is reached and the else branch below
// looks unreachable.
if !isExist {
log.Error("can not find the record(%s) until now", task.DisplayJobName)
//temp.QueryTimes = temp.QueryTimes + 1
} else {
log.Info("find the record(%s)", task.DisplayJobName)
}
}

}
} else {
//normal
}
result, err := GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
if err != nil {
log.Error("GetTrainJob(%s) failed:%v", task.JobName, err)
return err
}

if result != nil {
task.Status = TransTrainJobStatus(result.IntStatus)
// Duration is divided by 1000 (presumably milliseconds to seconds — confirm).
task.Duration = result.Duration / 1000
task.TrainJobDuration = result.TrainJobDuration

if task.StartTime == 0 && result.StartTime > 0 {
task.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
}
// This overwrites the TrainJobDuration value copied from result above.
task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
task.EndTime = task.StartTime.Add(task.Duration)
}
task.CorrectCreateUnix()
err = models.UpdateJob(task)
result, err := GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
log.Error("GetTrainJob(%s) failed:%v", task.DisplayJobName, err)
return err
}

if result != nil {
task.Status = TransTrainJobStatus(result.IntStatus)
// Duration is divided by 1000 (presumably milliseconds to seconds — confirm).
task.Duration = result.Duration / 1000
task.TrainJobDuration = result.TrainJobDuration

// StartTime is filled in only when the task has none yet and the result carries one.
if task.StartTime == 0 && result.StartTime > 0 {
task.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
}
// This overwrites the TrainJobDuration value copied from result above.
task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
// EndTime is derived as StartTime + Duration once the status is terminal.
if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
task.EndTime = task.StartTime.Add(task.Duration)
}
task.CorrectCreateUnix()
err = models.UpdateJob(task)
if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
return err
}
}
}

//temp
return nil
}

+ 1
- 1
modules/modelarts/resty.go View File

@@ -1175,7 +1175,7 @@ sendjob:
return &result, nil
}

func GetTrainJobList(perPage, page int, sortBy, order, searchContent, status string) (*models.GetTrainJobListResult, error) {
func GetTrainJobList(perPage, page int, sortBy, order, searchContent string) (*models.GetTrainJobListResult, error) {
checkSetting()
client := getRestyClient()
var result models.GetTrainJobListResult


Loading…
Cancel
Save