| @@ -31,9 +31,11 @@ const ( | |||
| ) | |||
| const ( | |||
| NPUResource = "NPU" | |||
| GPUResource = "CPU/GPU" | |||
| AllResource = "all" | |||
| TempJobIdPrefix = "TEMP" | |||
| JobStatusTemp = "TEMP" | |||
| NPUResource = "NPU" | |||
| GPUResource = "CPU/GPU" | |||
| AllResource = "all" | |||
| //notebook storage category | |||
| EVSCategory = "EVS" | |||
| @@ -353,6 +355,7 @@ type CloudbrainsOptions struct { | |||
| RepoID int64 // include all repos if empty | |||
| UserID int64 | |||
| JobID string | |||
| JobName string | |||
| SortType string | |||
| CloudbrainIDs []int64 | |||
| JobStatus []string | |||
| @@ -1256,6 +1259,52 @@ type LogFile struct { | |||
| Name string | |||
| } | |||
// JobList is one element of the "jobs" array carried by
// GetTrainJobListResult. Each field is decoded from the JSON key named in
// its tag; IntStatus holds the integer value found under "status".
type JobList struct {
	JobName      string `json:"job_name"`
	JobID        int64  `json:"job_id"`
	VersionID    int64  `json:"version_id"`
	VersionCount int64  `json:"version_count"`
	Description  string `json:"job_desc"`
	IntStatus    int    `json:"status"`
}
// GetTrainJobListResult is the JSON shape of a train-job list response.
// It embeds ErrorResult, so that type's fields are decoded from the same
// JSON object as the fields below.
type GetTrainJobListResult struct {
	ErrorResult
	JobTotalCount int       `json:"job_total_count"` // total number of user-created jobs matched by the query
	JobCountLimit int       `json:"job_count_limit"` // number of training jobs the user may still create
	Quotas        int       `json:"quotas"`          // upper limit on the number of running training jobs
	JobList       []JobList `json:"jobs"`
}
// JobVersionList is one element of the "versions" array carried by
// GetTrainJobVersionListResult. IntStatus is the integer found under
// "status"; callers translate it to a status string with
// TransTrainJobStatus.
type JobVersionList struct {
	VersionName string `json:"version_name"`
	VersionID   int64  `json:"version_id"`
	IntStatus   int    `json:"status"`
}
// GetTrainJobVersionListResult is the JSON shape of a response listing the
// versions of a single train job. It embeds ErrorResult, so that type's
// fields are decoded from the same JSON object as the fields below.
//
// NOTE(review): presumably this is what GetTrainJobVersionList returns —
// confirm against that function's signature.
type GetTrainJobVersionListResult struct {
	ErrorResult
	JobID          int64            `json:"job_id"`
	JobName        string           `json:"job_name"`
	JobDesc        string           `json:"job_desc"`
	VersionCount   int64            `json:"version_count"`
	JobVersionList []JobVersionList `json:"versions"`
}
// NotebookList is one element of the "data" array carried by
// GetNotebookListResult. Note that JobName and JobID are decoded from the
// JSON keys "name" and "id" respectively, and that JobID is a string here.
type NotebookList struct {
	JobName string `json:"name"`
	JobID   string `json:"id"`
	Status  string `json:"status"`
}
// GetNotebookListResult is the JSON shape of one page of a paginated
// notebook list response.
type GetNotebookListResult struct {
	TotalCount   int64          `json:"total"`   // total number of records
	CurrentPage  int            `json:"current"` // current page number
	TotalPages   int            `json:"pages"`   // total number of pages
	Size         int            `json:"size"`    // number of records per page
	NotebookList []NotebookList `json:"data"`
}
| //Grampus | |||
| type GrampusResult struct { | |||
| ErrorCode int `json:"errorCode"` | |||
| @@ -1568,6 +1617,12 @@ func CloudbrainsVersionList(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int, e | |||
| ) | |||
| } | |||
| if (opts.JobName) != "" { | |||
| cond = cond.And( | |||
| builder.Eq{"cloudbrain.job_name": opts.JobName}, | |||
| ) | |||
| } | |||
| if len(opts.JobTypes) > 0 { | |||
| cond = cond.And( | |||
| builder.In("cloudbrain.job_type", opts.JobTypes), | |||
| @@ -1701,9 +1756,9 @@ func SetTrainJobStatusByJobID(jobID string, status string, duration int64, train | |||
| return | |||
| } | |||
| func SetVersionCountAndLatestVersion(jobID string, versionName string, versionCount int, isLatestVersion string, totalVersionCount int) (err error) { | |||
| cb := &Cloudbrain{JobID: jobID, VersionName: versionName, VersionCount: versionCount, IsLatestVersion: isLatestVersion, TotalVersionCount: totalVersionCount} | |||
| _, err = x.Cols("version_Count", "is_latest_version", "total_version_count").Where("cloudbrain.job_id=? AND cloudbrain.version_name=?", jobID, versionName).Update(cb) | |||
| func SetVersionCountAndLatestVersion(jobName string, versionName string, versionCount int, isLatestVersion string, totalVersionCount int) (err error) { | |||
| cb := &Cloudbrain{JobName: jobName, VersionName: versionName, VersionCount: versionCount, IsLatestVersion: isLatestVersion, TotalVersionCount: totalVersionCount} | |||
| _, err = x.Cols("version_Count", "is_latest_version", "total_version_count").Where("cloudbrain.job_name=? AND cloudbrain.version_name=?", jobName, versionName).Update(cb) | |||
| return | |||
| } | |||
| @@ -2123,3 +2178,8 @@ func GetCloudbrainByIDs(ids []int64) ([]*Cloudbrain, error) { | |||
| In("id", ids). | |||
| Find(&cloudbrains) | |||
| } | |||
| func GetCloudbrainCountByJobName(jobName, jobType string) (int, error) { | |||
| count, err := x.Where("job_name = ? and job_type= ?", jobName, jobType).Count(new(Cloudbrain)) | |||
| return int(count), err | |||
| } | |||
| @@ -0,0 +1,57 @@ | |||
| package models | |||
| import ( | |||
| "time" | |||
| "code.gitea.io/gitea/modules/timeutil" | |||
| ) | |||
const (
	// NOTE(review): a live `TempJobIdPrefix = "TEMP"` constant is declared in
	// another const block of this package; the commented-out copy below looks
	// like a leftover and this empty block can probably be removed — confirm.
	//TempJobIdPrefix = "TEMP"
)
// CloudbrainTemp is the xorm model for a bookkeeping row keyed by the ID of
// a Cloudbrain task (CloudbrainID is the primary key). The job-creation
// helpers insert one of these, with Status set to JobStatusTemp, when the
// remote create call fails with an error whose message starts with
// UnknownErrorPrefix.
//
// JobType and Status are indexed, non-null columns defaulting to 'DEBUG' and
// 'TEMP'. UpdatedUnix carries xorm's `updated` tag and DeletedAt carries the
// `deleted` tag.
//
// NOTE(review): QueryTimes presumably counts how often the remote status has
// been polled for this row — confirm against the code that updates it.
type CloudbrainTemp struct {
	CloudbrainID int64 `xorm:"pk"`
	JobName      string
	Type         int
	JobType      string             `xorm:"INDEX NOT NULL DEFAULT 'DEBUG'"`
	Status       string             `xorm:"INDEX NOT NULL DEFAULT 'TEMP'"`
	VersionCount int                `xorm:"NOT NULL DEFAULT 0"`
	QueryTimes   int                `xorm:"INDEX NOT NULL DEFAULT 0"`
	CreatedUnix  timeutil.TimeStamp `xorm:"INDEX"`
	UpdatedUnix  timeutil.TimeStamp `xorm:"INDEX updated"`
	DeletedAt    time.Time          `xorm:"deleted"`
}
| func InsertCloudbrainTemp(temp *CloudbrainTemp) (err error) { | |||
| if _, err = x.Insert(temp); err != nil { | |||
| return err | |||
| } | |||
| return nil | |||
| } | |||
| func getCloudBrainTemp(temp *CloudbrainTemp) (*CloudbrainTemp, error) { | |||
| has, err := x.Get(temp) | |||
| if err != nil { | |||
| return nil, err | |||
| } else if !has { | |||
| return nil, ErrJobNotExist{} | |||
| } | |||
| return temp, nil | |||
| } | |||
| func GetCloudbrainTempByCloudbrainID(id int64) (*CloudbrainTemp, error) { | |||
| temp := &CloudbrainTemp{CloudbrainID: id} | |||
| return getCloudBrainTemp(temp) | |||
| } | |||
| func DeleteCloudbrainTemp(temp *CloudbrainTemp) error { | |||
| return deleteCloudbrainTemp(x, temp) | |||
| } | |||
| func deleteCloudbrainTemp(e Engine, temp *CloudbrainTemp) error { | |||
| _, err := e.Where("cloudbrain_id = ?", temp.CloudbrainID).Delete(temp) | |||
| return err | |||
| } | |||
| @@ -142,8 +142,8 @@ func isAdminOrImageCreater(ctx *context.Context, image *models.Image, err error) | |||
| func AdminOrOwnerOrJobCreaterRight(ctx *context.Context) { | |||
| var ID = ctx.Params(":id") | |||
| job, err := models.GetCloudbrainByID(ID) | |||
| var id = ctx.Params(":id") | |||
| job, err := models.GetCloudbrainByID(id) | |||
| if err != nil { | |||
| log.Error("GetCloudbrainByID failed:%v", err.Error()) | |||
| ctx.NotFound(ctx.Req.URL.RequestURI(), nil) | |||
| @@ -158,8 +158,8 @@ func AdminOrOwnerOrJobCreaterRight(ctx *context.Context) { | |||
| func AdminOrJobCreaterRight(ctx *context.Context) { | |||
| var ID = ctx.Params(":id") | |||
| job, err := models.GetCloudbrainByID(ID) | |||
| var id = ctx.Params(":id") | |||
| job, err := models.GetCloudbrainByID(id) | |||
| if err != nil { | |||
| log.Error("GetCloudbrainByID failed:%v", err.Error()) | |||
| ctx.NotFound(ctx.Req.URL.RequestURI(), nil) | |||
| @@ -4,8 +4,11 @@ import ( | |||
| "encoding/json" | |||
| "errors" | |||
| "fmt" | |||
| "math/rand" | |||
| "path" | |||
| "strconv" | |||
| "strings" | |||
| "time" | |||
| "code.gitea.io/gitea/modules/timeutil" | |||
| @@ -59,7 +62,7 @@ const ( | |||
| PerPage = 10 | |||
| IsLatestVersion = "1" | |||
| NotLatestVersion = "0" | |||
| VersionCount = 1 | |||
| VersionCountOne = 1 | |||
| SortByCreateTime = "create_time" | |||
| ConfigTypeCustom = "custom" | |||
| @@ -264,31 +267,13 @@ func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, desc | |||
| log.Error("GetNotebookImageName failed: %v", err.Error()) | |||
| return err | |||
| } | |||
| createTime := timeutil.TimeStampNow() | |||
| jobResult, err := createNotebook2(models.CreateNotebook2Params{ | |||
| JobName: jobName, | |||
| Description: description, | |||
| Flavor: flavor, | |||
| Duration: autoStopDurationMs, | |||
| ImageID: imageId, | |||
| PoolID: poolInfos.PoolInfo[0].PoolId, | |||
| Feature: models.NotebookFeature, | |||
| Volume: models.VolumeReq{ | |||
| Capacity: setting.Capacity, | |||
| Category: models.EVSCategory, | |||
| Ownership: models.ManagedOwnership, | |||
| }, | |||
| WorkspaceID: "0", | |||
| }) | |||
| if err != nil { | |||
| log.Error("createNotebook2 failed: %v", err.Error()) | |||
| return err | |||
| } | |||
| err = models.CreateCloudbrain(&models.Cloudbrain{ | |||
| Status: jobResult.Status, | |||
| task := &models.Cloudbrain{ | |||
| Status: string(models.ModelArtsTrainJobWaiting), | |||
| UserID: ctx.User.ID, | |||
| RepoID: ctx.Repo.Repository.ID, | |||
| JobID: jobResult.ID, | |||
| JobID: models.TempJobIdPrefix + jobName + strconv.Itoa(int(rand.New(rand.NewSource(time.Now().UnixNano())).Int31n(100000))), | |||
| JobName: jobName, | |||
| FlavorCode: flavor, | |||
| DisplayJobName: displayJobName, | |||
| @@ -300,16 +285,66 @@ func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, desc | |||
| Description: description, | |||
| CreatedUnix: createTime, | |||
| UpdatedUnix: createTime, | |||
| }) | |||
| } | |||
| err = models.CreateCloudbrain(task) | |||
| if err != nil { | |||
| log.Error("CreateCloudbrain(%s) failed:%v", displayJobName, err.Error()) | |||
| return err | |||
| } | |||
| task, err := models.GetCloudbrainByName(jobName) | |||
| jobResult, err := createNotebook2(models.CreateNotebook2Params{ | |||
| JobName: jobName, | |||
| Description: description, | |||
| Flavor: flavor, | |||
| Duration: autoStopDurationMs, | |||
| ImageID: imageId, | |||
| PoolID: poolInfos.PoolInfo[0].PoolId, | |||
| Feature: models.NotebookFeature, | |||
| Volume: models.VolumeReq{ | |||
| Capacity: setting.Capacity, | |||
| Category: models.EVSCategory, | |||
| Ownership: models.ManagedOwnership, | |||
| }, | |||
| WorkspaceID: "0", | |||
| }) | |||
| if err != nil { | |||
| log.Error("GetCloudbrainByName failed: %v", err.Error()) | |||
| return err | |||
| log.Error("createNotebook2 failed: %v", err.Error()) | |||
| if strings.HasPrefix(err.Error(), UnknownErrorPrefix) { | |||
| log.Info("(%s)unknown error, set temp status", displayJobName) | |||
| errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{ | |||
| CloudbrainID: task.ID, | |||
| Status: models.JobStatusTemp, | |||
| Type: task.Type, | |||
| JobName: task.JobName, | |||
| JobType: task.JobType, | |||
| }) | |||
| if errTemp != nil { | |||
| log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error()) | |||
| return errTemp | |||
| } | |||
| } else { | |||
| task.Status = string(models.ModelArtsCreateFailed) | |||
| errTemp := models.UpdateJob(task) | |||
| if errTemp != nil { | |||
| log.Error("UpdateJob failed: %v", errTemp.Error()) | |||
| } | |||
| errTemp = models.DeleteJob(task) | |||
| if errTemp != nil { | |||
| log.Error("DeleteJob failed: %v", errTemp.Error()) | |||
| } | |||
| return err | |||
| } | |||
| } else { | |||
| task.Status = jobResult.Status | |||
| task.JobID = jobResult.ID | |||
| err = models.UpdateJob(task) | |||
| if err != nil { | |||
| log.Error("UpdateJob failed: %v", err.Error()) | |||
| return err | |||
| } | |||
| } | |||
| stringId := strconv.FormatInt(task.ID, 10) | |||
| notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateDebugNPUTask) | |||
| return nil | |||
| @@ -317,66 +352,15 @@ func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, desc | |||
| func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) { | |||
| createTime := timeutil.TimeStampNow() | |||
| var jobResult *models.CreateTrainJobResult | |||
| var createErr error | |||
| if req.EngineID < 0 { | |||
| jobResult, createErr = createTrainJobUserImage(models.CreateUserImageTrainJobParams{ | |||
| JobName: req.JobName, | |||
| Description: req.Description, | |||
| Config: models.UserImageConfig{ | |||
| WorkServerNum: req.WorkServerNumber, | |||
| AppUrl: req.CodeObsPath, | |||
| BootFileUrl: req.BootFileUrl, | |||
| DataUrl: req.DataUrl, | |||
| TrainUrl: req.TrainUrl, | |||
| LogUrl: req.LogUrl, | |||
| PoolID: req.PoolID, | |||
| CreateVersion: true, | |||
| Flavor: models.Flavor{ | |||
| Code: req.FlavorCode, | |||
| }, | |||
| Parameter: req.Parameters, | |||
| UserImageUrl: req.UserImageUrl, | |||
| UserCommand: req.UserCommand, | |||
| }, | |||
| }) | |||
| } else { | |||
| jobResult, createErr = createTrainJob(models.CreateTrainJobParams{ | |||
| JobName: req.JobName, | |||
| Description: req.Description, | |||
| Config: models.Config{ | |||
| WorkServerNum: req.WorkServerNumber, | |||
| AppUrl: req.CodeObsPath, | |||
| BootFileUrl: req.BootFileUrl, | |||
| DataUrl: req.DataUrl, | |||
| EngineID: req.EngineID, | |||
| TrainUrl: req.TrainUrl, | |||
| LogUrl: req.LogUrl, | |||
| PoolID: req.PoolID, | |||
| CreateVersion: true, | |||
| Flavor: models.Flavor{ | |||
| Code: req.FlavorCode, | |||
| }, | |||
| Parameter: req.Parameters, | |||
| }, | |||
| }) | |||
| } | |||
| if createErr != nil { | |||
| log.Error("CreateJob failed: %v", createErr.Error()) | |||
| return createErr | |||
| } | |||
| jobId := strconv.FormatInt(jobResult.JobID, 10) | |||
| createErr = models.CreateCloudbrain(&models.Cloudbrain{ | |||
| Status: TransTrainJobStatus(jobResult.Status), | |||
| task := &models.Cloudbrain{ | |||
| Status: string(models.ModelArtsTrainJobWaiting), | |||
| UserID: ctx.User.ID, | |||
| RepoID: ctx.Repo.Repository.ID, | |||
| JobID: jobId, | |||
| JobID: models.TempJobIdPrefix + req.JobName + strconv.Itoa(int(rand.New(rand.NewSource(time.Now().UnixNano())).Int31n(100000))), | |||
| JobName: req.JobName, | |||
| DisplayJobName: req.DisplayJobName, | |||
| JobType: string(models.JobTypeTrain), | |||
| Type: models.TypeCloudBrainTwo, | |||
| VersionID: jobResult.VersionID, | |||
| VersionName: jobResult.VersionName, | |||
| Uuid: req.Uuid, | |||
| DatasetName: req.DatasetName, | |||
| CommitID: req.CommitID, | |||
| @@ -398,49 +382,21 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error | |||
| TotalVersionCount: req.TotalVersionCount, | |||
| CreatedUnix: createTime, | |||
| UpdatedUnix: createTime, | |||
| }) | |||
| if createErr != nil { | |||
| log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, createErr.Error()) | |||
| return createErr | |||
| } | |||
| notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobId, req.DisplayJobName, models.ActionCreateTrainTask) | |||
| return nil | |||
| } | |||
| func GenerateModelConvertTrainJob(req *GenerateTrainJobReq) (*models.CreateTrainJobResult, error) { | |||
| return createTrainJobUserImage(models.CreateUserImageTrainJobParams{ | |||
| JobName: req.JobName, | |||
| Description: req.Description, | |||
| Config: models.UserImageConfig{ | |||
| WorkServerNum: req.WorkServerNumber, | |||
| AppUrl: req.CodeObsPath, | |||
| BootFileUrl: req.BootFileUrl, | |||
| DataUrl: req.DataUrl, | |||
| TrainUrl: req.TrainUrl, | |||
| LogUrl: req.LogUrl, | |||
| PoolID: req.PoolID, | |||
| CreateVersion: true, | |||
| Flavor: models.Flavor{ | |||
| Code: req.FlavorCode, | |||
| }, | |||
| Parameter: req.Parameters, | |||
| UserImageUrl: req.UserImageUrl, | |||
| UserCommand: req.UserCommand, | |||
| }, | |||
| }) | |||
| } | |||
| err = models.CreateCloudbrain(task) | |||
| if err != nil { | |||
| log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, err.Error()) | |||
| return err | |||
| } | |||
| func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) { | |||
| createTime := timeutil.TimeStampNow() | |||
| var jobResult *models.CreateTrainJobResult | |||
| var createErr error | |||
| log.Info(" req.EngineID =" + fmt.Sprint(req.EngineID)) | |||
| if req.EngineID < 0 { | |||
| jobResult, createErr = createTrainJobVersionUserImage(models.CreateTrainJobVersionUserImageParams{ | |||
| jobResult, createErr = createTrainJobUserImage(models.CreateUserImageTrainJobParams{ | |||
| JobName: req.JobName, | |||
| Description: req.Description, | |||
| Config: models.TrainJobVersionUserImageConfig{ | |||
| Config: models.UserImageConfig{ | |||
| WorkServerNum: req.WorkServerNumber, | |||
| AppUrl: req.CodeObsPath, | |||
| BootFileUrl: req.BootFileUrl, | |||
| @@ -448,19 +404,20 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job | |||
| TrainUrl: req.TrainUrl, | |||
| LogUrl: req.LogUrl, | |||
| PoolID: req.PoolID, | |||
| CreateVersion: true, | |||
| Flavor: models.Flavor{ | |||
| Code: req.FlavorCode, | |||
| }, | |||
| Parameter: req.Parameters, | |||
| PreVersionId: req.PreVersionId, | |||
| UserImageUrl: req.UserImageUrl, | |||
| UserCommand: req.UserCommand, | |||
| }, | |||
| }, jobId) | |||
| }) | |||
| } else { | |||
| jobResult, createErr = createTrainJobVersion(models.CreateTrainJobVersionParams{ | |||
| jobResult, createErr = createTrainJob(models.CreateTrainJobParams{ | |||
| JobName: req.JobName, | |||
| Description: req.Description, | |||
| Config: models.TrainJobVersionConfig{ | |||
| Config: models.Config{ | |||
| WorkServerNum: req.WorkServerNumber, | |||
| AppUrl: req.CodeObsPath, | |||
| BootFileUrl: req.BootFileUrl, | |||
| @@ -469,87 +426,60 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job | |||
| TrainUrl: req.TrainUrl, | |||
| LogUrl: req.LogUrl, | |||
| PoolID: req.PoolID, | |||
| CreateVersion: true, | |||
| Flavor: models.Flavor{ | |||
| Code: req.FlavorCode, | |||
| }, | |||
| Parameter: req.Parameters, | |||
| PreVersionId: req.PreVersionId, | |||
| Parameter: req.Parameters, | |||
| }, | |||
| }, jobId) | |||
| } | |||
| if createErr != nil { | |||
| log.Error("CreateJob failed: %v", createErr.Error()) | |||
| return createErr | |||
| } | |||
| var jobTypes []string | |||
| jobTypes = append(jobTypes, string(models.JobTypeTrain)) | |||
| repo := ctx.Repo.Repository | |||
| VersionTaskList, VersionListCount, createErr := models.CloudbrainsVersionList(&models.CloudbrainsOptions{ | |||
| RepoID: repo.ID, | |||
| Type: models.TypeCloudBrainTwo, | |||
| JobTypes: jobTypes, | |||
| JobID: strconv.FormatInt(jobResult.JobID, 10), | |||
| }) | |||
| if createErr != nil { | |||
| ctx.ServerError("Cloudbrain", createErr) | |||
| return createErr | |||
| } | |||
| //将当前版本的isLatestVersion设置为"1"和任务数量更新,任务数量包括当前版本数VersionCount和历史创建的总版本数TotalVersionCount | |||
| createErr = models.CreateCloudbrain(&models.Cloudbrain{ | |||
| Status: TransTrainJobStatus(jobResult.Status), | |||
| UserID: ctx.User.ID, | |||
| RepoID: ctx.Repo.Repository.ID, | |||
| JobID: strconv.FormatInt(jobResult.JobID, 10), | |||
| JobName: req.JobName, | |||
| DisplayJobName: req.DisplayJobName, | |||
| JobType: string(models.JobTypeTrain), | |||
| Type: models.TypeCloudBrainTwo, | |||
| VersionID: jobResult.VersionID, | |||
| VersionName: jobResult.VersionName, | |||
| Uuid: req.Uuid, | |||
| DatasetName: req.DatasetName, | |||
| CommitID: req.CommitID, | |||
| IsLatestVersion: req.IsLatestVersion, | |||
| PreVersionName: req.PreVersionName, | |||
| ComputeResource: models.NPUResource, | |||
| EngineID: req.EngineID, | |||
| TrainUrl: req.TrainUrl, | |||
| BranchName: req.BranchName, | |||
| Parameters: req.Params, | |||
| BootFile: req.BootFile, | |||
| DataUrl: req.DataUrl, | |||
| LogUrl: req.LogUrl, | |||
| PreVersionId: req.PreVersionId, | |||
| FlavorCode: req.FlavorCode, | |||
| Description: req.Description, | |||
| WorkServerNumber: req.WorkServerNumber, | |||
| FlavorName: req.FlavorName, | |||
| EngineName: req.EngineName, | |||
| TotalVersionCount: VersionTaskList[0].TotalVersionCount + 1, | |||
| VersionCount: VersionListCount + 1, | |||
| CreatedUnix: createTime, | |||
| UpdatedUnix: createTime, | |||
| }) | |||
| if createErr != nil { | |||
| log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, createErr.Error()) | |||
| return createErr | |||
| }) | |||
| } | |||
| //将训练任务的上一版本的isLatestVersion设置为"0" | |||
| createErr = models.SetVersionCountAndLatestVersion(strconv.FormatInt(jobResult.JobID, 10), VersionTaskList[0].VersionName, VersionCount, NotLatestVersion, TotalVersionCount) | |||
| if createErr != nil { | |||
| ctx.ServerError("Update IsLatestVersion failed", createErr) | |||
| return createErr | |||
| log.Error("createTrainJob failed: %v", createErr.Error()) | |||
| if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) { | |||
| log.Info("(%s)unknown error, set temp status", req.DisplayJobName) | |||
| errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{ | |||
| CloudbrainID: task.ID, | |||
| Status: models.JobStatusTemp, | |||
| Type: task.Type, | |||
| JobName: task.JobName, | |||
| JobType: task.JobType, | |||
| }) | |||
| if errTemp != nil { | |||
| log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error()) | |||
| return errTemp | |||
| } | |||
| } else { | |||
| task.Status = string(models.ModelArtsTrainJobFailed) | |||
| errTemp := models.UpdateJob(task) | |||
| if errTemp != nil { | |||
| log.Error("UpdateJob failed: %v", errTemp.Error()) | |||
| } | |||
| errTemp = models.DeleteJob(task) | |||
| if errTemp != nil { | |||
| log.Error("DeleteJob failed: %v", errTemp.Error()) | |||
| } | |||
| return createErr | |||
| } | |||
| } else { | |||
| task.Status = TransTrainJobStatus(jobResult.Status) | |||
| task.JobID = strconv.FormatInt(jobResult.JobID, 10) | |||
| task.VersionID = jobResult.VersionID | |||
| task.VersionName = jobResult.VersionName | |||
| err = models.UpdateJob(task) | |||
| if err != nil { | |||
| log.Error("UpdateJob failed: %v", err.Error()) | |||
| return err | |||
| } | |||
| } | |||
| return createErr | |||
| notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, task.JobID, req.DisplayJobName, models.ActionCreateTrainTask) | |||
| return nil | |||
| } | |||
| func GenerateTrainJobVersionByUserImage(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) { | |||
| createTime := timeutil.TimeStampNow() | |||
| jobResult, err := createTrainJobUserImage(models.CreateUserImageTrainJobParams{ | |||
| func GenerateModelConvertTrainJob(req *GenerateTrainJobReq) (*models.CreateTrainJobResult, error) { | |||
| return createTrainJobUserImage(models.CreateUserImageTrainJobParams{ | |||
| JobName: req.JobName, | |||
| Description: req.Description, | |||
| Config: models.UserImageConfig{ | |||
| @@ -569,11 +499,9 @@ func GenerateTrainJobVersionByUserImage(ctx *context.Context, req *GenerateTrain | |||
| UserCommand: req.UserCommand, | |||
| }, | |||
| }) | |||
| if err != nil { | |||
| log.Error("CreateJob failed: %v", err.Error()) | |||
| return err | |||
| } | |||
| } | |||
| func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) { | |||
| var jobTypes []string | |||
| jobTypes = append(jobTypes, string(models.JobTypeTrain)) | |||
| repo := ctx.Repo.Repository | |||
| @@ -581,7 +509,7 @@ func GenerateTrainJobVersionByUserImage(ctx *context.Context, req *GenerateTrain | |||
| RepoID: repo.ID, | |||
| Type: models.TypeCloudBrainTwo, | |||
| JobTypes: jobTypes, | |||
| JobID: strconv.FormatInt(jobResult.JobID, 10), | |||
| JobID: jobId, | |||
| }) | |||
| if err != nil { | |||
| ctx.ServerError("Cloudbrain", err) | |||
| @@ -589,25 +517,23 @@ func GenerateTrainJobVersionByUserImage(ctx *context.Context, req *GenerateTrain | |||
| } | |||
| //将当前版本的isLatestVersion设置为"1"和任务数量更新,任务数量包括当前版本数VersionCount和历史创建的总版本数TotalVersionCount | |||
| err = models.CreateCloudbrain(&models.Cloudbrain{ | |||
| Status: TransTrainJobStatus(jobResult.Status), | |||
| createTime := timeutil.TimeStampNow() | |||
| task := &models.Cloudbrain{ | |||
| Status: models.JobStatusTemp, | |||
| UserID: ctx.User.ID, | |||
| RepoID: ctx.Repo.Repository.ID, | |||
| JobID: strconv.FormatInt(jobResult.JobID, 10), | |||
| JobID: jobId, | |||
| JobName: req.JobName, | |||
| DisplayJobName: req.DisplayJobName, | |||
| JobType: string(models.JobTypeTrain), | |||
| Type: models.TypeCloudBrainTwo, | |||
| VersionID: jobResult.VersionID, | |||
| VersionName: jobResult.VersionName, | |||
| Uuid: req.Uuid, | |||
| DatasetName: req.DatasetName, | |||
| CommitID: req.CommitID, | |||
| IsLatestVersion: req.IsLatestVersion, | |||
| PreVersionName: req.PreVersionName, | |||
| ComputeResource: models.NPUResource, | |||
| EngineID: MORDELART_USER_IMAGE_ENGINE_ID, | |||
| Image: req.UserImageUrl, | |||
| EngineID: req.EngineID, | |||
| TrainUrl: req.TrainUrl, | |||
| BranchName: req.BranchName, | |||
| Parameters: req.Params, | |||
| @@ -624,20 +550,103 @@ func GenerateTrainJobVersionByUserImage(ctx *context.Context, req *GenerateTrain | |||
| VersionCount: VersionListCount + 1, | |||
| CreatedUnix: createTime, | |||
| UpdatedUnix: createTime, | |||
| }) | |||
| } | |||
| err = models.CreateCloudbrain(task) | |||
| if err != nil { | |||
| log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error()) | |||
| return err | |||
| } | |||
| //将训练任务的上一版本的isLatestVersion设置为"0" | |||
| err = models.SetVersionCountAndLatestVersion(strconv.FormatInt(jobResult.JobID, 10), VersionTaskList[0].VersionName, VersionCount, NotLatestVersion, TotalVersionCount) | |||
| err = models.SetVersionCountAndLatestVersion(req.JobName, VersionTaskList[0].VersionName, VersionListCount, NotLatestVersion, VersionTaskList[0].TotalVersionCount) | |||
| if err != nil { | |||
| ctx.ServerError("Update IsLatestVersion failed", err) | |||
| return err | |||
| } | |||
| return err | |||
| var jobResult *models.CreateTrainJobResult | |||
| var createErr error | |||
| if req.EngineID < 0 { | |||
| jobResult, createErr = createTrainJobVersionUserImage(models.CreateTrainJobVersionUserImageParams{ | |||
| Description: req.Description, | |||
| Config: models.TrainJobVersionUserImageConfig{ | |||
| WorkServerNum: req.WorkServerNumber, | |||
| AppUrl: req.CodeObsPath, | |||
| BootFileUrl: req.BootFileUrl, | |||
| DataUrl: req.DataUrl, | |||
| TrainUrl: req.TrainUrl, | |||
| LogUrl: req.LogUrl, | |||
| PoolID: req.PoolID, | |||
| Flavor: models.Flavor{ | |||
| Code: req.FlavorCode, | |||
| }, | |||
| Parameter: req.Parameters, | |||
| PreVersionId: req.PreVersionId, | |||
| UserImageUrl: req.UserImageUrl, | |||
| UserCommand: req.UserCommand, | |||
| }, | |||
| }, jobId) | |||
| } else { | |||
| jobResult, createErr = createTrainJobVersion(models.CreateTrainJobVersionParams{ | |||
| Description: req.Description, | |||
| Config: models.TrainJobVersionConfig{ | |||
| WorkServerNum: req.WorkServerNumber, | |||
| AppUrl: req.CodeObsPath, | |||
| BootFileUrl: req.BootFileUrl, | |||
| DataUrl: req.DataUrl, | |||
| EngineID: req.EngineID, | |||
| TrainUrl: req.TrainUrl, | |||
| LogUrl: req.LogUrl, | |||
| PoolID: req.PoolID, | |||
| Flavor: models.Flavor{ | |||
| Code: req.FlavorCode, | |||
| }, | |||
| Parameter: req.Parameters, | |||
| PreVersionId: req.PreVersionId, | |||
| }, | |||
| }, jobId) | |||
| } | |||
| if createErr != nil { | |||
| log.Error("createTrainJobVersion failed: %v", err.Error()) | |||
| if strings.HasPrefix(err.Error(), UnknownErrorPrefix) { | |||
| log.Info("(%s)unknown error, set temp status", req.DisplayJobName) | |||
| errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{ | |||
| CloudbrainID: task.ID, | |||
| Status: models.JobStatusTemp, | |||
| Type: task.Type, | |||
| JobName: task.JobName, | |||
| JobType: task.JobType, | |||
| }) | |||
| if errTemp != nil { | |||
| log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error()) | |||
| return errTemp | |||
| } | |||
| } else { | |||
| task.Status = string(models.ModelArtsTrainJobFailed) | |||
| errTemp := models.UpdateJob(task) | |||
| if errTemp != nil { | |||
| log.Error("UpdateJob failed: %v", errTemp.Error()) | |||
| } | |||
| errTemp = models.DeleteJob(task) | |||
| if errTemp != nil { | |||
| log.Error("DeleteJob failed: %v", errTemp.Error()) | |||
| } | |||
| return createErr | |||
| } | |||
| } else { | |||
| task.Status = TransTrainJobStatus(jobResult.Status) | |||
| task.JobID = strconv.FormatInt(jobResult.JobID, 10) | |||
| task.VersionID = jobResult.VersionID | |||
| task.VersionName = jobResult.VersionName | |||
| err = models.UpdateJob(task) | |||
| if err != nil { | |||
| log.Error("UpdateJob failed: %v", err.Error()) | |||
| return err | |||
| } | |||
| } | |||
| return nil | |||
| } | |||
| func TransTrainJobStatus(status int) string { | |||
| @@ -700,47 +709,22 @@ func GetOutputPathByCount(TotalVersionCount int) (VersionOutputPath string) { | |||
| func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (err error) { | |||
| createTime := timeutil.TimeStampNow() | |||
| jobResult, err := createInferenceJob(models.CreateInferenceJobParams{ | |||
| JobName: req.JobName, | |||
| Description: req.Description, | |||
| InfConfig: models.InfConfig{ | |||
| WorkServerNum: req.WorkServerNumber, | |||
| AppUrl: req.CodeObsPath, | |||
| BootFileUrl: req.BootFileUrl, | |||
| DataUrl: req.DataUrl, | |||
| EngineID: req.EngineID, | |||
| // TrainUrl: req.TrainUrl, | |||
| LogUrl: req.LogUrl, | |||
| PoolID: req.PoolID, | |||
| CreateVersion: true, | |||
| Flavor: models.Flavor{ | |||
| Code: req.FlavorCode, | |||
| }, | |||
| Parameter: req.Parameters, | |||
| }, | |||
| }) | |||
| if err != nil { | |||
| log.Error("CreateJob failed: %v", err.Error()) | |||
| return err | |||
| } | |||
| attach, err := models.GetAttachmentByUUID(req.Uuid) | |||
| if err != nil { | |||
| log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error()) | |||
| log.Error("GetAttachmentByUUID(%s) failed:%v", req.DisplayJobName, err.Error()) | |||
| return err | |||
| } | |||
| jobID := strconv.FormatInt(jobResult.JobID, 10) | |||
| err = models.CreateCloudbrain(&models.Cloudbrain{ | |||
| Status: TransTrainJobStatus(jobResult.Status), | |||
| task := &models.Cloudbrain{ | |||
| Status: string(models.ModelArtsTrainJobWaiting), | |||
| UserID: ctx.User.ID, | |||
| RepoID: ctx.Repo.Repository.ID, | |||
| JobID: jobID, | |||
| JobID: models.TempJobIdPrefix + req.JobName + strconv.Itoa(int(rand.New(rand.NewSource(time.Now().UnixNano())).Int31n(100000))), | |||
| JobName: req.JobName, | |||
| DisplayJobName: req.DisplayJobName, | |||
| JobType: string(models.JobTypeInference), | |||
| Type: models.TypeCloudBrainTwo, | |||
| VersionID: jobResult.VersionID, | |||
| VersionName: jobResult.VersionName, | |||
| Uuid: req.Uuid, | |||
| DatasetName: attach.Name, | |||
| CommitID: req.CommitID, | |||
| @@ -767,13 +751,74 @@ func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (e | |||
| ResultUrl: req.ResultUrl, | |||
| CreatedUnix: createTime, | |||
| UpdatedUnix: createTime, | |||
| }) | |||
| } | |||
| err = models.CreateCloudbrain(task) | |||
| if err != nil { | |||
| log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error()) | |||
| return err | |||
| } | |||
| notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateInferenceTask) | |||
| jobResult, err := createInferenceJob(models.CreateInferenceJobParams{ | |||
| JobName: req.JobName, | |||
| Description: req.Description, | |||
| InfConfig: models.InfConfig{ | |||
| WorkServerNum: req.WorkServerNumber, | |||
| AppUrl: req.CodeObsPath, | |||
| BootFileUrl: req.BootFileUrl, | |||
| DataUrl: req.DataUrl, | |||
| EngineID: req.EngineID, | |||
| // TrainUrl: req.TrainUrl, | |||
| LogUrl: req.LogUrl, | |||
| PoolID: req.PoolID, | |||
| CreateVersion: true, | |||
| Flavor: models.Flavor{ | |||
| Code: req.FlavorCode, | |||
| }, | |||
| Parameter: req.Parameters, | |||
| }, | |||
| }) | |||
| if err != nil { | |||
| log.Error("createTrainJob failed: %v", err.Error()) | |||
| if strings.HasPrefix(err.Error(), UnknownErrorPrefix) { | |||
| log.Info("(%s)unknown error, set temp status", req.DisplayJobName) | |||
| err = models.InsertCloudbrainTemp(&models.CloudbrainTemp{ | |||
| CloudbrainID: task.ID, | |||
| Status: models.JobStatusTemp, | |||
| Type: task.Type, | |||
| JobName: task.JobName, | |||
| JobType: task.JobType, | |||
| }) | |||
| if err != nil { | |||
| log.Error("InsertCloudbrainTemp failed: %v", err.Error()) | |||
| return err | |||
| } | |||
| } else { | |||
| task.Status = string(models.ModelArtsTrainJobFailed) | |||
| errTemp := models.UpdateJob(task) | |||
| if errTemp != nil { | |||
| log.Error("UpdateJob failed: %v", errTemp.Error()) | |||
| } | |||
| errTemp = models.DeleteJob(task) | |||
| if errTemp != nil { | |||
| log.Error("DeleteJob failed: %v", errTemp.Error()) | |||
| } | |||
| return err | |||
| } | |||
| } else { | |||
| task.Status = TransTrainJobStatus(jobResult.Status) | |||
| task.JobID = strconv.FormatInt(jobResult.JobID, 10) | |||
| task.VersionID = jobResult.VersionID | |||
| task.VersionName = jobResult.VersionName | |||
| err = models.UpdateJob(task) | |||
| if err != nil { | |||
| log.Error("UpdateJob failed: %v", err.Error()) | |||
| return err | |||
| } | |||
| } | |||
| notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, task.JobID, req.DisplayJobName, models.ActionCreateInferenceTask) | |||
| return nil | |||
| } | |||
| @@ -799,3 +844,194 @@ func GetNotebookImageName(imageId string) (string, error) { | |||
| return imageName, nil | |||
| } | |||
| func HandleTrainJobInfo(task *models.Cloudbrain) error { | |||
| if isTempJob(task.JobID, task.Status) { | |||
| if task.VersionCount > VersionCountOne { | |||
| //multi version | |||
| result, err := GetTrainJobVersionList(1000, 1, strings.TrimPrefix(task.JobID, models.TempJobIdPrefix)) | |||
| if err != nil { | |||
| log.Error("GetTrainJobVersionList failed:%v", err) | |||
| return err | |||
| } | |||
| if result != nil { | |||
| if strconv.FormatInt(result.JobID, 10) == task.JobID && result.JobName == task.JobName { | |||
| if result.VersionCount == int64(task.VersionCount) { | |||
| log.Info("find the record(%s)", task.DisplayJobName) | |||
| task.Status = TransTrainJobStatus(result.JobVersionList[0].IntStatus) | |||
| task.VersionName = result.JobVersionList[0].VersionName | |||
| task.VersionID = result.JobVersionList[0].VersionID | |||
| err = models.UpdateJob(task) | |||
| if err != nil { | |||
| log.Error("UpdateJob(%s) failed:%v", task.JobName, err) | |||
| return err | |||
| } | |||
| temp, err := models.GetCloudbrainTempByCloudbrainID(task.ID) | |||
| if err != nil { | |||
| log.Error("no such temp record(%s):%v", task.DisplayJobName, err.Error()) | |||
| } else { | |||
| err = models.DeleteCloudbrainTemp(temp) | |||
| if err != nil { | |||
| log.Error("DeleteCloudbrainTemp(%s) failed:%v", task.DisplayJobName, err) | |||
| } | |||
| } | |||
| return nil | |||
| } else { | |||
| log.Error("can not find the record(%s) until now", task.DisplayJobName) | |||
| } | |||
| } else { | |||
| log.Error("can not find the record(%s) until now", task.DisplayJobName) | |||
| } | |||
| } | |||
| } else { | |||
| //inference or one version | |||
| result, err := GetTrainJobList(1000, 1, "create_time", "desc", task.JobName) | |||
| if err != nil { | |||
| log.Error("GetTrainJobList failed:%v", err) | |||
| return err | |||
| } | |||
| if result != nil { | |||
| for _, job := range result.JobList { | |||
| if task.JobName == job.JobName { | |||
| log.Info("find the record(%s)", task.DisplayJobName) | |||
| task.Status = TransTrainJobStatus(job.IntStatus) | |||
| task.JobID = strconv.FormatInt(job.JobID, 10) | |||
| err = models.UpdateJob(task) | |||
| if err != nil { | |||
| log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err) | |||
| return err | |||
| } | |||
| temp, err := models.GetCloudbrainTempByCloudbrainID(task.ID) | |||
| if err != nil { | |||
| log.Error("no such temp record(%s):%v", task.DisplayJobName, err.Error()) | |||
| return err | |||
| } | |||
| err = models.DeleteCloudbrainTemp(temp) | |||
| if err != nil { | |||
| log.Error("DeleteCloudbrainTemp(%s) failed:%v", task.DisplayJobName, err) | |||
| return err | |||
| } | |||
| return nil | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } else { | |||
| //normal | |||
| result, err := GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10)) | |||
| if err != nil { | |||
| log.Error("GetTrainJob(%s) failed:%v", task.DisplayJobName, err) | |||
| return err | |||
| } | |||
| if result != nil { | |||
| task.Status = TransTrainJobStatus(result.IntStatus) | |||
| task.Duration = result.Duration / 1000 | |||
| task.TrainJobDuration = result.TrainJobDuration | |||
| if task.StartTime == 0 && result.StartTime > 0 { | |||
| task.StartTime = timeutil.TimeStamp(result.StartTime / 1000) | |||
| } | |||
| task.TrainJobDuration = models.ConvertDurationToStr(task.Duration) | |||
| if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 { | |||
| task.EndTime = task.StartTime.Add(task.Duration) | |||
| } | |||
| task.CorrectCreateUnix() | |||
| err = models.UpdateJob(task) | |||
| if err != nil { | |||
| log.Error("UpdateJob(%s) failed:%v", task.JobName, err) | |||
| return err | |||
| } | |||
| } | |||
| } | |||
| return nil | |||
| } | |||
| func HandleNotebookInfo(task *models.Cloudbrain) error { | |||
| if isTempJob(task.JobID, task.Status) { | |||
| result, err := GetNotebookList(1000, 0, "createTime", "DESC", task.JobName) | |||
| if err != nil { | |||
| log.Error("GetNotebookList failed:%v", err) | |||
| return err | |||
| } | |||
| if result != nil { | |||
| count, err := models.GetCloudbrainCountByJobName(task.JobName, task.JobType) | |||
| if err != nil { | |||
| log.Error("GetCloudbrainCountByJobName failed:%v", err) | |||
| return err | |||
| } | |||
| if len(result.NotebookList) == count { | |||
| if result.NotebookList[0].JobName == task.JobName { | |||
| log.Info("find the record(%s)", task.DisplayJobName) | |||
| task.Status = result.NotebookList[0].Status | |||
| task.JobID = result.NotebookList[0].JobID | |||
| err = models.UpdateJob(task) | |||
| if err != nil { | |||
| log.Error("UpdateJob(%s) failed:%v", task.JobName, err) | |||
| return err | |||
| } | |||
| temp, err := models.GetCloudbrainTempByCloudbrainID(task.ID) | |||
| if err != nil { | |||
| log.Error("no such temp record(%s):%v", task.DisplayJobName, err.Error()) | |||
| return err | |||
| } | |||
| err = models.DeleteCloudbrainTemp(temp) | |||
| if err != nil { | |||
| log.Error("DeleteCloudbrainTemp(%s) failed:%v", task.DisplayJobName, err) | |||
| return err | |||
| } | |||
| return nil | |||
| } else { | |||
| log.Error("can not find the record(%s) until now", task.DisplayJobName) | |||
| } | |||
| } else { | |||
| log.Error("can not find the record(%s) until now", task.DisplayJobName) | |||
| } | |||
| } else { | |||
| log.Error("can not find the record(%s) until now", task.DisplayJobName) | |||
| } | |||
| } else { | |||
| //normal | |||
| result, err := GetNotebook2(task.JobID) | |||
| if err != nil { | |||
| log.Error("GetNotebook2(%s) failed:%v", task.DisplayJobName, err) | |||
| return err | |||
| } | |||
| if result != nil { | |||
| task.Status = result.Status | |||
| if task.StartTime == 0 && result.Lease.UpdateTime > 0 { | |||
| task.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000) | |||
| } | |||
| if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) { | |||
| task.EndTime = timeutil.TimeStampNow() | |||
| } | |||
| task.CorrectCreateUnix() | |||
| task.ComputeAndSetDuration() | |||
| err = models.UpdateJob(task) | |||
| if err != nil { | |||
| log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err) | |||
| return err | |||
| } | |||
| } | |||
| } | |||
| return nil | |||
| } | |||
| func isTempJob(jobID, status string) bool { | |||
| if (strings.HasPrefix(jobID, models.TempJobIdPrefix) && status == string(models.ModelArtsTrainJobWaiting)) || status == models.JobStatusTemp { | |||
| return true | |||
| } | |||
| return false | |||
| } | |||
| @@ -37,6 +37,7 @@ const ( | |||
| NotebookNotFound = "ModelArts.6404" | |||
| NotebookNoPermission = "ModelArts.6407" | |||
| NotebookInvalid = "ModelArts.6400" | |||
| UnknownErrorPrefix = "UNKNOWN:" | |||
| ) | |||
| func getRestyClient() *resty.Client { | |||
| @@ -298,6 +299,10 @@ sendjob: | |||
| return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) | |||
| } | |||
| if res.StatusCode() == http.StatusBadGateway { | |||
| return &result, fmt.Errorf(UnknownErrorPrefix+"createNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||
| } | |||
| if len(response.ErrorCode) != 0 { | |||
| log.Error("ManageNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||
| if response.ErrorCode == modelartsIllegalToken && retry < 1 { | |||
| @@ -547,9 +552,6 @@ sendjob: | |||
| return nil, fmt.Errorf("resty create train-job: %s", err) | |||
| } | |||
| req, _ := json.Marshal(createJobParams) | |||
| log.Info("%s", req) | |||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||
| retry++ | |||
| _ = getToken() | |||
| @@ -563,17 +565,21 @@ sendjob: | |||
| return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||
| } | |||
| log.Error("createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| BootFileErrorMsg := "Invalid OBS path '" + createJobParams.Config.BootFileUrl + "'." | |||
| DataSetErrorMsg := "Invalid OBS path '" + createJobParams.Config.DataUrl + "'." | |||
| if temp.ErrorMsg == BootFileErrorMsg { | |||
| bootFileErrorMsg := "Invalid OBS path '" + createJobParams.Config.BootFileUrl + "'." | |||
| dataSetErrorMsg := "Invalid OBS path '" + createJobParams.Config.DataUrl + "'." | |||
| if temp.ErrorMsg == bootFileErrorMsg { | |||
| log.Error("启动文件错误!createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| return &result, fmt.Errorf("启动文件错误!") | |||
| } | |||
| if temp.ErrorMsg == DataSetErrorMsg { | |||
| if temp.ErrorMsg == dataSetErrorMsg { | |||
| log.Error("数据集错误!createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| return &result, fmt.Errorf("数据集错误!") | |||
| } | |||
| return &result, fmt.Errorf("createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| if res.StatusCode() == http.StatusBadGateway { | |||
| return &result, fmt.Errorf(UnknownErrorPrefix+"createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| } else { | |||
| return &result, fmt.Errorf("createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| } | |||
| } | |||
| if !result.IsSuccess { | |||
| @@ -603,9 +609,6 @@ sendjob: | |||
| return nil, fmt.Errorf("resty create train-job version: %s", err) | |||
| } | |||
| req, _ := json.Marshal(createJobVersionParams) | |||
| log.Info("%s", req) | |||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||
| retry++ | |||
| _ = getToken() | |||
| @@ -618,17 +621,23 @@ sendjob: | |||
| log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||
| return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||
| } | |||
| BootFileErrorMsg := "Invalid OBS path '" + createJobVersionParams.Config.BootFileUrl + "'." | |||
| DataSetErrorMsg := "Invalid OBS path '" + createJobVersionParams.Config.DataUrl + "'." | |||
| if temp.ErrorMsg == BootFileErrorMsg { | |||
| log.Error("createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| bootFileErrorMsg := "Invalid OBS path '" + createJobVersionParams.Config.BootFileUrl + "'." | |||
| dataSetErrorMsg := "Invalid OBS path '" + createJobVersionParams.Config.DataUrl + "'." | |||
| if temp.ErrorMsg == bootFileErrorMsg { | |||
| log.Error("启动文件错误!createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| return &result, fmt.Errorf("启动文件错误!") | |||
| } | |||
| if temp.ErrorMsg == DataSetErrorMsg { | |||
| if temp.ErrorMsg == dataSetErrorMsg { | |||
| log.Error("数据集错误!createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| return &result, fmt.Errorf("数据集错误!") | |||
| } | |||
| return &result, fmt.Errorf("createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| if res.StatusCode() == http.StatusBadGateway { | |||
| return &result, fmt.Errorf(UnknownErrorPrefix+"createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| } else { | |||
| return &result, fmt.Errorf("createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| } | |||
| } | |||
| if !result.IsSuccess { | |||
| @@ -761,9 +770,6 @@ sendjob: | |||
| goto sendjob | |||
| } | |||
| //temp, _ := json.Marshal(req) | |||
| //log.Info("%s", temp) | |||
| if res.StatusCode() != http.StatusOK { | |||
| var temp models.ErrorResult | |||
| if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { | |||
| @@ -1172,7 +1178,11 @@ sendjob: | |||
| log.Error("数据集错误!createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| return &result, fmt.Errorf("数据集错误!") | |||
| } | |||
| return &result, fmt.Errorf("createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| if res.StatusCode() == http.StatusBadGateway { | |||
| return &result, fmt.Errorf(UnknownErrorPrefix+"createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| } else { | |||
| return &result, fmt.Errorf("createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| } | |||
| } | |||
| if !result.IsSuccess { | |||
| @@ -1212,7 +1222,11 @@ sendjob: | |||
| err = json.Unmarshal(res.Body(), &response) | |||
| if err != nil { | |||
| log.Error("json.Unmarshal failed: %s", err.Error()) | |||
| return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error()) | |||
| return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error()) | |||
| } | |||
| if res.StatusCode() == http.StatusBadGateway { | |||
| return &result, fmt.Errorf(UnknownErrorPrefix+"createNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||
| } | |||
| if len(response.ErrorCode) != 0 { | |||
| @@ -1271,3 +1285,139 @@ sendjob: | |||
| return &result, nil | |||
| } | |||
| func GetTrainJobList(perPage, page int, sortBy, order, searchContent string) (*models.GetTrainJobListResult, error) { | |||
| checkSetting() | |||
| client := getRestyClient() | |||
| var result models.GetTrainJobListResult | |||
| retry := 0 | |||
| sendjob: | |||
| res, err := client.R(). | |||
| SetQueryParams(map[string]string{ | |||
| "per_page": strconv.Itoa(perPage), | |||
| "page": strconv.Itoa(page), | |||
| "sortBy": sortBy, | |||
| "order": order, | |||
| "search_content": searchContent, | |||
| }). | |||
| SetAuthToken(TOKEN). | |||
| SetResult(&result). | |||
| Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob) | |||
| if err != nil { | |||
| return nil, fmt.Errorf("resty GetTrainJobList: %v", err) | |||
| } | |||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||
| retry++ | |||
| _ = getToken() | |||
| goto sendjob | |||
| } | |||
| if res.StatusCode() != http.StatusOK { | |||
| var temp models.ErrorResult | |||
| if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { | |||
| log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||
| return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||
| } | |||
| log.Error("GetTrainJobList failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| return &result, fmt.Errorf(temp.ErrorMsg) | |||
| } | |||
| if !result.IsSuccess { | |||
| log.Error("GetTrainJobList failed(%s): %s", result.ErrorCode, result.ErrorMsg) | |||
| return &result, fmt.Errorf(result.ErrorMsg) | |||
| } | |||
| return &result, nil | |||
| } | |||
| func GetTrainJobVersionList(perPage, page int, jobID string) (*models.GetTrainJobVersionListResult, error) { | |||
| checkSetting() | |||
| client := getRestyClient() | |||
| var result models.GetTrainJobVersionListResult | |||
| retry := 0 | |||
| sendjob: | |||
| res, err := client.R(). | |||
| SetQueryParams(map[string]string{ | |||
| "per_page": strconv.Itoa(perPage), | |||
| "page": strconv.Itoa(page), | |||
| }). | |||
| SetAuthToken(TOKEN). | |||
| SetResult(&result). | |||
| Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions") | |||
| if err != nil { | |||
| return nil, fmt.Errorf("resty GetTrainJobVersionList: %v", err) | |||
| } | |||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||
| retry++ | |||
| _ = getToken() | |||
| goto sendjob | |||
| } | |||
| if res.StatusCode() != http.StatusOK { | |||
| var temp models.ErrorResult | |||
| if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { | |||
| log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||
| return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||
| } | |||
| log.Error("GetTrainJobVersionList failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| return &result, fmt.Errorf(temp.ErrorMsg) | |||
| } | |||
| if !result.IsSuccess { | |||
| log.Error("GetTrainJobVersionList failed(%s): %s", result.ErrorCode, result.ErrorMsg) | |||
| return &result, fmt.Errorf(result.ErrorMsg) | |||
| } | |||
| return &result, nil | |||
| } | |||
| func GetNotebookList(limit, offset int, sortBy, order, searchContent string) (*models.GetNotebookListResult, error) { | |||
| checkSetting() | |||
| client := getRestyClient() | |||
| var result models.GetNotebookListResult | |||
| retry := 0 | |||
| sendjob: | |||
| res, err := client.R(). | |||
| SetQueryParams(map[string]string{ | |||
| "limit": strconv.Itoa(limit), | |||
| "offset": strconv.Itoa(offset), | |||
| "name": searchContent, | |||
| "sort_key": sortBy, | |||
| "sort_dir": order, | |||
| }). | |||
| SetAuthToken(TOKEN). | |||
| SetResult(&result). | |||
| Get(HOST + "/v1/" + setting.ProjectID + urlNotebook2) | |||
| if err != nil { | |||
| return nil, fmt.Errorf("resty GetNotebookList: %v", err) | |||
| } | |||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||
| retry++ | |||
| _ = getToken() | |||
| goto sendjob | |||
| } | |||
| if res.StatusCode() != http.StatusOK { | |||
| var temp models.ErrorResult | |||
| if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { | |||
| log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||
| return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||
| } | |||
| log.Error("GetNotebookList failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| return &result, fmt.Errorf(temp.ErrorMsg) | |||
| } | |||
| return &result, nil | |||
| } | |||
| @@ -26,40 +26,6 @@ import ( | |||
| routerRepo "code.gitea.io/gitea/routers/repo" | |||
| ) | |||
| func GetModelArtsNotebook(ctx *context.APIContext) { | |||
| var ( | |||
| err error | |||
| ) | |||
| jobID := ctx.Params(":jobid") | |||
| repoID := ctx.Repo.Repository.ID | |||
| job, err := models.GetRepoCloudBrainByJobID(repoID, jobID) | |||
| if err != nil { | |||
| ctx.NotFound(err) | |||
| return | |||
| } | |||
| result, err := modelarts.GetJob(jobID) | |||
| if err != nil { | |||
| ctx.NotFound(err) | |||
| return | |||
| } | |||
| oldStatus := job.Status | |||
| job.Status = result.Status | |||
| if oldStatus != result.Status { | |||
| notification.NotifyChangeCloudbrainStatus(job, oldStatus) | |||
| } | |||
| err = models.UpdateJob(job) | |||
| if err != nil { | |||
| log.Error("UpdateJob failed:", err) | |||
| } | |||
| ctx.JSON(http.StatusOK, map[string]interface{}{ | |||
| "JobID": jobID, | |||
| "JobStatus": result.Status, | |||
| }) | |||
| } | |||
| func GetModelArtsNotebook2(ctx *context.APIContext) { | |||
| var ( | |||
| err error | |||
| @@ -71,70 +37,16 @@ func GetModelArtsNotebook2(ctx *context.APIContext) { | |||
| ctx.NotFound(err) | |||
| return | |||
| } | |||
| result, err := modelarts.GetNotebook2(job.JobID) | |||
| err = modelarts.HandleNotebookInfo(job) | |||
| if err != nil { | |||
| ctx.NotFound(err) | |||
| return | |||
| } | |||
| if job.StartTime == 0 && result.Lease.UpdateTime > 0 { | |||
| job.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000) | |||
| } | |||
| oldStatus := job.Status | |||
| job.Status = result.Status | |||
| if job.EndTime == 0 && models.IsModelArtsDebugJobTerminal(job.Status) { | |||
| job.EndTime = timeutil.TimeStampNow() | |||
| } | |||
| job.CorrectCreateUnix() | |||
| job.ComputeAndSetDuration() | |||
| if oldStatus != result.Status { | |||
| notification.NotifyChangeCloudbrainStatus(job, oldStatus) | |||
| } | |||
| err = models.UpdateJob(job) | |||
| if err != nil { | |||
| log.Error("UpdateJob failed:", err) | |||
| } | |||
| ctx.JSON(http.StatusOK, map[string]interface{}{ | |||
| "ID": ID, | |||
| "JobName": job.JobName, | |||
| "JobStatus": result.Status, | |||
| }) | |||
| } | |||
| func GetModelArtsTrainJob(ctx *context.APIContext) { | |||
| var ( | |||
| err error | |||
| ) | |||
| jobID := ctx.Params(":jobid") | |||
| repoID := ctx.Repo.Repository.ID | |||
| job, err := models.GetRepoCloudBrainByJobID(repoID, jobID) | |||
| if err != nil { | |||
| ctx.NotFound(err) | |||
| return | |||
| } | |||
| result, err := modelarts.GetTrainJob(jobID, strconv.FormatInt(job.VersionID, 10)) | |||
| if err != nil { | |||
| ctx.NotFound(err) | |||
| return | |||
| } | |||
| oldStatus := job.Status | |||
| job.Status = modelarts.TransTrainJobStatus(result.IntStatus) | |||
| job.Duration = result.Duration | |||
| job.TrainJobDuration = result.TrainJobDuration | |||
| if oldStatus != job.Status { | |||
| notification.NotifyChangeCloudbrainStatus(job, oldStatus) | |||
| } | |||
| err = models.UpdateJob(job) | |||
| if err != nil { | |||
| log.Error("UpdateJob failed:", err) | |||
| } | |||
| ctx.JSON(http.StatusOK, map[string]interface{}{ | |||
| "JobID": jobID, | |||
| "JobStatus": job.Status, | |||
| "JobDuration": job.Duration, | |||
| "JobStatus": job.Status, | |||
| }) | |||
| } | |||
| @@ -188,27 +100,11 @@ func GetModelArtsTrainJobVersion(ctx *context.APIContext) { | |||
| } | |||
| } | |||
| } else if job.Type == models.TypeCloudBrainTwo { | |||
| result, err := modelarts.GetTrainJob(jobID, strconv.FormatInt(job.VersionID, 10)) | |||
| err := modelarts.HandleTrainJobInfo(job) | |||
| if err != nil { | |||
| ctx.NotFound(err) | |||
| return | |||
| } | |||
| if job.StartTime == 0 && result.StartTime > 0 { | |||
| job.StartTime = timeutil.TimeStamp(result.StartTime / 1000) | |||
| } | |||
| job.Status = modelarts.TransTrainJobStatus(result.IntStatus) | |||
| job.Duration = result.Duration / 1000 | |||
| job.TrainJobDuration = models.ConvertDurationToStr(job.Duration) | |||
| if job.EndTime == 0 && models.IsTrainJobTerminal(job.Status) && job.StartTime > 0 { | |||
| job.EndTime = job.StartTime.Add(job.Duration) | |||
| } | |||
| job.CorrectCreateUnix() | |||
| err = models.UpdateTrainJobVersion(job) | |||
| if err != nil { | |||
| log.Error("UpdateJob failed:", err) | |||
| } | |||
| } else if job.Type == models.TypeC2Net { | |||
| result, err := grampus.GetJob(jobID) | |||
| if err != nil { | |||
| @@ -557,26 +453,11 @@ func GetModelArtsInferenceJob(ctx *context.APIContext) { | |||
| ctx.NotFound(err) | |||
| return | |||
| } | |||
| result, err := modelarts.GetTrainJob(jobID, strconv.FormatInt(job.VersionID, 10)) | |||
| err = modelarts.HandleTrainJobInfo(job) | |||
| if err != nil { | |||
| ctx.NotFound(err) | |||
| return | |||
| } | |||
| if job.StartTime == 0 && result.StartTime > 0 { | |||
| job.StartTime = timeutil.TimeStamp(result.StartTime / 1000) | |||
| } | |||
| job.Status = modelarts.TransTrainJobStatus(result.IntStatus) | |||
| job.Duration = result.Duration / 1000 | |||
| job.TrainJobDuration = models.ConvertDurationToStr(job.Duration) | |||
| if job.EndTime == 0 && models.IsTrainJobTerminal(job.Status) && job.StartTime > 0 { | |||
| job.EndTime = job.StartTime.Add(job.Duration) | |||
| } | |||
| job.CorrectCreateUnix() | |||
| err = models.UpdateInferenceJob(job) | |||
| if err != nil { | |||
| log.Error("UpdateJob failed:", err) | |||
| } | |||
| ctx.JSON(http.StatusOK, map[string]interface{}{ | |||
| "JobID": jobID, | |||
| @@ -1784,70 +1784,24 @@ func SyncCloudbrainStatus() { | |||
| } | |||
| } else if task.Type == models.TypeCloudBrainTwo { | |||
| if task.JobType == string(models.JobTypeDebug) { | |||
| //result, err := modelarts.GetJob(task.JobID) | |||
| result, err := modelarts.GetNotebook2(task.JobID) | |||
| err := modelarts.HandleNotebookInfo(task) | |||
| if err != nil { | |||
| log.Error("GetJob(%s) failed:%v", task.JobName, err) | |||
| log.Error("HandleNotebookInfo(%s) failed:%v", task.DisplayJobName, err) | |||
| continue | |||
| } | |||
| if result != nil { | |||
| oldStatus := task.Status | |||
| task.Status = result.Status | |||
| if task.StartTime == 0 && result.Lease.UpdateTime > 0 { | |||
| task.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000) | |||
| } | |||
| if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) { | |||
| task.EndTime = timeutil.TimeStampNow() | |||
| } | |||
| task.CorrectCreateUnix() | |||
| task.ComputeAndSetDuration() | |||
| if oldStatus != task.Status { | |||
| notification.NotifyChangeCloudbrainStatus(task, oldStatus) | |||
| } | |||
| err = models.UpdateJob(task) | |||
| if err != nil { | |||
| log.Error("UpdateJob(%s) failed:%v", task.JobName, err) | |||
| continue | |||
| } | |||
| } | |||
| } else if task.JobType == string(models.JobTypeTrain) || task.JobType == string(models.JobTypeInference) { | |||
| result, err := modelarts.GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10)) | |||
| err := modelarts.HandleTrainJobInfo(task) | |||
| if err != nil { | |||
| log.Error("GetTrainJob(%s) failed:%v", task.JobName, err) | |||
| log.Error("HandleTrainJobInfo(%s) failed:%v", task.DisplayJobName, err) | |||
| continue | |||
| } | |||
| if result != nil { | |||
| oldStatus := task.Status | |||
| task.Status = modelarts.TransTrainJobStatus(result.IntStatus) | |||
| task.Duration = result.Duration / 1000 | |||
| task.TrainJobDuration = result.TrainJobDuration | |||
| if task.StartTime == 0 && result.StartTime > 0 { | |||
| task.StartTime = timeutil.TimeStamp(result.StartTime / 1000) | |||
| } | |||
| task.TrainJobDuration = models.ConvertDurationToStr(task.Duration) | |||
| if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 { | |||
| task.EndTime = task.StartTime.Add(task.Duration) | |||
| } | |||
| task.CorrectCreateUnix() | |||
| if oldStatus != task.Status { | |||
| notification.NotifyChangeCloudbrainStatus(task, oldStatus) | |||
| } | |||
| err = models.UpdateJob(task) | |||
| if err != nil { | |||
| log.Error("UpdateJob(%s) failed:%v", task.JobName, err) | |||
| continue | |||
| } | |||
| } | |||
| } else { | |||
| log.Error("task.JobType(%s) is error:%s", task.JobName, task.JobType) | |||
| log.Error("task.JobType(%s) is error:%s", task.DisplayJobName, task.JobType) | |||
| } | |||
| } else if task.Type == models.TypeC2Net { | |||
| result, err := grampus.GetJob(task.JobID) | |||
| if err != nil { | |||
| log.Error("GetTrainJob(%s) failed:%v", task.JobName, err) | |||
| log.Error("GetTrainJob(%s) failed:%v", task.DisplayJobName, err) | |||
| continue | |||
| } | |||
| @@ -337,7 +337,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain | |||
| EngineName: image, | |||
| DatasetName: attachment.Name, | |||
| IsLatestVersion: modelarts.IsLatestVersion, | |||
| VersionCount: modelarts.VersionCount, | |||
| VersionCount: modelarts.VersionCountOne, | |||
| WorkServerNumber: 1, | |||
| } | |||
| @@ -387,7 +387,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain | |||
| branchName := form.BranchName | |||
| isLatestVersion := modelarts.IsLatestVersion | |||
| flavorName := form.FlavorName | |||
| versionCount := modelarts.VersionCount | |||
| versionCount := modelarts.VersionCountOne | |||
| engineName := form.EngineName | |||
| if !jobNamePattern.MatchString(displayJobName) { | |||
| @@ -7,6 +7,7 @@ import ( | |||
| "fmt" | |||
| "io" | |||
| "io/ioutil" | |||
| "math/rand" | |||
| "net/http" | |||
| "os" | |||
| "path" | |||
| @@ -262,30 +263,15 @@ func NotebookShow(ctx *context.Context) { | |||
| return | |||
| } | |||
| result, err := modelarts.GetNotebook2(task.JobID) | |||
| if err != nil { | |||
| log.Error("GET job error", err.Error()) | |||
| ctx.NotFound(ctx.Req.URL.RequestURI(), nil) | |||
| return | |||
| } | |||
| if result != nil { | |||
| if task.DeletedAt.IsZero() { //normal record | |||
| if task.Status != result.Status { | |||
| oldStatus := task.Status | |||
| task.Status = result.Status | |||
| models.ParseAndSetDurationFromModelArtsNotebook(result, task) | |||
| notification.NotifyChangeCloudbrainStatus(task, oldStatus) | |||
| err = models.UpdateJob(task) | |||
| if err != nil { | |||
| log.Error("GET job error", err.Error()) | |||
| ctx.NotFound(ctx.Req.URL.RequestURI(), nil) | |||
| return | |||
| } | |||
| } | |||
| } else { //deleted record | |||
| if task.DeletedAt.IsZero() { //normal record | |||
| err := modelarts.HandleNotebookInfo(task) | |||
| if err != nil { | |||
| ctx.Data["error"] = err.Error() | |||
| ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil) | |||
| return | |||
| } | |||
| } else { //deleted record | |||
| } | |||
| datasetDownload := make([]models.DatasetDownload, 0) | |||
| @@ -396,82 +382,141 @@ func NotebookDebug2(ctx *context.Context) { | |||
| ctx.Redirect(result.Url + "?token=" + result.Token) | |||
| } | |||
| func NotebookManage(ctx *context.Context) { | |||
| func NotebookRestart(ctx *context.Context) { | |||
| var ID = ctx.Params(":id") | |||
| var action = ctx.Params(":action") | |||
| var resultCode = "0" | |||
| var resultCode = "-1" | |||
| var errorMsg = "" | |||
| var status = "" | |||
| task := ctx.Cloudbrain | |||
| for { | |||
| task, err := models.GetCloudbrainByID(ID) | |||
| if err != nil { | |||
| log.Error("get task(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"]) | |||
| resultCode = "-1" | |||
| errorMsg = "system error" | |||
| ctx.CheckWechatBind() | |||
| if ctx.Written() { | |||
| return | |||
| } | |||
| if task.Status != string(models.ModelArtsStopped) && task.Status != string(models.ModelArtsStartFailed) && task.Status != string(models.ModelArtsCreateFailed) { | |||
| log.Error("the job(%s) is not stopped", task.JobName, ctx.Data["MsgID"]) | |||
| errorMsg = "the job is not stopped" | |||
| break | |||
| } | |||
| if action == models.ActionStop { | |||
| if task.Status != string(models.ModelArtsRunning) { | |||
| log.Error("the job(%s) is not running", task.JobName, ctx.Data["MsgID"]) | |||
| resultCode = "-1" | |||
| errorMsg = "the job is not running" | |||
| count, err := models.GetCloudbrainNotebookCountByUserID(ctx.User.ID) | |||
| if err != nil { | |||
| log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"]) | |||
| errorMsg = "system error" | |||
| break | |||
| } else { | |||
| if count >= 1 { | |||
| log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) | |||
| errorMsg = "you have already a running or waiting task, can not create more" | |||
| break | |||
| } | |||
| } | |||
| if !ctx.IsSigned || (ctx.User.ID != task.UserID && !ctx.IsUserSiteAdmin() && !ctx.IsUserRepoOwner()) { | |||
| log.Error("the user has no right ro stop the job", task.JobName, ctx.Data["MsgID"]) | |||
| resultCode = "-1" | |||
| errorMsg = "you have no right to stop the job" | |||
| break | |||
| } | |||
| } else if action == models.ActionRestart { | |||
| ctx.CheckWechatBind() | |||
| if ctx.Written() { | |||
| return | |||
| } | |||
| if task.Status != string(models.ModelArtsStopped) && task.Status != string(models.ModelArtsStartFailed) && task.Status != string(models.ModelArtsCreateFailed) { | |||
| log.Error("the job(%s) is not stopped", task.JobName, ctx.Data["MsgID"]) | |||
| resultCode = "-1" | |||
| errorMsg = "the job is not stopped" | |||
| break | |||
| } | |||
| createTime := timeutil.TimeStampNow() | |||
| newTask := &models.Cloudbrain{ | |||
| Status: string(models.ModelArtsTrainJobWaiting), | |||
| UserID: task.UserID, | |||
| RepoID: task.RepoID, | |||
| JobID: models.TempJobIdPrefix + task.JobName + strconv.Itoa(int(rand.New(rand.NewSource(time.Now().UnixNano())).Int31n(100000))), | |||
| JobName: task.JobName, | |||
| DisplayJobName: task.DisplayJobName, | |||
| JobType: task.JobType, | |||
| Type: task.Type, | |||
| Uuid: task.Uuid, | |||
| Image: task.Image, | |||
| ComputeResource: task.ComputeResource, | |||
| Description: task.Description, | |||
| CreatedUnix: createTime, | |||
| UpdatedUnix: createTime, | |||
| } | |||
| if !ctx.IsSigned || (ctx.User.ID != task.UserID && !ctx.IsUserSiteAdmin()) { | |||
| log.Error("the user has no right ro restart the job", task.JobName, ctx.Data["MsgID"]) | |||
| resultCode = "-1" | |||
| errorMsg = "you have no right to restart the job" | |||
| err = models.RestartCloudbrain(task, newTask) | |||
| if err != nil { | |||
| log.Error("RestartCloudbrain(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"]) | |||
| errorMsg = "system error" | |||
| break | |||
| } | |||
| param := models.NotebookAction{ | |||
| Action: models.ActionStart, | |||
| } | |||
| res, err := modelarts.ManageNotebook2(task.JobID, param) | |||
| if err != nil { | |||
| log.Error("ManageNotebook2(%s) failed:%v", task.DisplayJobName, err.Error(), ctx.Data["MsgID"]) | |||
| if strings.HasPrefix(err.Error(), modelarts.UnknownErrorPrefix) { | |||
| log.Info("(%s)unknown error, set temp status", newTask.DisplayJobName) | |||
| errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{ | |||
| CloudbrainID: newTask.ID, | |||
| Status: models.JobStatusTemp, | |||
| Type: newTask.Type, | |||
| JobName: newTask.JobName, | |||
| JobType: newTask.JobType, | |||
| }) | |||
| if errTemp != nil { | |||
| log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error()) | |||
| } | |||
| } else { | |||
| newTask.Status = string(models.ModelArtsTrainJobFailed) | |||
| errTemp := models.UpdateJob(newTask) | |||
| if errTemp != nil { | |||
| log.Error("UpdateJob failed: %v", errTemp.Error()) | |||
| } | |||
| errTemp = models.DeleteJob(newTask) | |||
| if errTemp != nil { | |||
| log.Error("DeleteJob failed: %v", errTemp.Error()) | |||
| } | |||
| errorMsg = err.Error() | |||
| break | |||
| } | |||
| count, err := models.GetCloudbrainNotebookCountByUserID(ctx.User.ID) | |||
| } else { | |||
| newTask.Status = res.Status | |||
| newTask.JobID = task.JobID | |||
| err = models.UpdateJob(newTask) | |||
| if err != nil { | |||
| log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"]) | |||
| resultCode = "-1" | |||
| errorMsg = "system error" | |||
| log.Error("UpdateJob failed: %v", err.Error()) | |||
| errorMsg = err.Error() | |||
| break | |||
| } else { | |||
| if count >= 1 { | |||
| log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) | |||
| resultCode = "-1" | |||
| errorMsg = "you have already a running or waiting task, can not create more" | |||
| break | |||
| } | |||
| } | |||
| } | |||
| action = models.ActionStart | |||
| } else { | |||
| log.Error("the action(%s) is illegal", action, ctx.Data["MsgID"]) | |||
| status = res.Status | |||
| resultCode = "0" | |||
| notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, strconv.FormatInt(newTask.ID, 10), newTask.DisplayJobName, models.ActionCreateDebugNPUTask) | |||
| break | |||
| } | |||
| ctx.JSON(200, map[string]string{ | |||
| "result_code": resultCode, | |||
| "error_msg": errorMsg, | |||
| "status": status, | |||
| "id": ID, | |||
| }) | |||
| } | |||
| func NotebookStop(ctx *context.Context) { | |||
| var ID = ctx.Params(":id") | |||
| var resultCode = "0" | |||
| var errorMsg = "" | |||
| var status = "" | |||
| task := ctx.Cloudbrain | |||
| for { | |||
| if task.Status != string(models.ModelArtsRunning) { | |||
| log.Error("the job(%s) is not running", task.JobName, ctx.Data["MsgID"]) | |||
| resultCode = "-1" | |||
| errorMsg = "非法操作" | |||
| errorMsg = "the job is not running" | |||
| break | |||
| } | |||
| param := models.NotebookAction{ | |||
| Action: action, | |||
| Action: models.ActionStop, | |||
| } | |||
| createTime := timeutil.TimeStampNow() | |||
| res, err := modelarts.ManageNotebook2(task.JobID, param) | |||
| if err != nil { | |||
| log.Error("ManageNotebook2(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"]) | |||
| @@ -484,50 +529,17 @@ func NotebookManage(ctx *context.Context) { | |||
| } | |||
| status = res.Status | |||
| if action == models.ActionStart { | |||
| newTask := &models.Cloudbrain{ | |||
| Status: status, | |||
| UserID: task.UserID, | |||
| RepoID: task.RepoID, | |||
| JobID: task.JobID, | |||
| JobName: task.JobName, | |||
| DisplayJobName: task.DisplayJobName, | |||
| JobType: task.JobType, | |||
| Type: task.Type, | |||
| Uuid: task.Uuid, | |||
| Image: task.Image, | |||
| ComputeResource: task.ComputeResource, | |||
| Description: task.Description, | |||
| CreatedUnix: createTime, | |||
| UpdatedUnix: createTime, | |||
| } | |||
| err = models.RestartCloudbrain(task, newTask) | |||
| if err != nil { | |||
| log.Error("RestartCloudbrain(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"]) | |||
| resultCode = "-1" | |||
| errorMsg = "system error" | |||
| break | |||
| } | |||
| ID = strconv.FormatInt(newTask.ID, 10) | |||
| notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, ID, task.DisplayJobName, models.ActionCreateDebugNPUTask) | |||
| } else { | |||
| oldStatus := task.Status | |||
| task.Status = res.Status | |||
| if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) { | |||
| task.EndTime = timeutil.TimeStampNow() | |||
| } | |||
| task.ComputeAndSetDuration() | |||
| if oldStatus != task.Status { | |||
| notification.NotifyChangeCloudbrainStatus(task, oldStatus) | |||
| } | |||
| err = models.UpdateJob(task) | |||
| if err != nil { | |||
| log.Error("UpdateJob(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"]) | |||
| resultCode = "-1" | |||
| errorMsg = "system error" | |||
| break | |||
| } | |||
| task.Status = res.Status | |||
| if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) { | |||
| task.EndTime = timeutil.TimeStampNow() | |||
| } | |||
| task.ComputeAndSetDuration() | |||
| err = models.UpdateJob(task) | |||
| if err != nil { | |||
| log.Error("UpdateJob(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"]) | |||
| resultCode = "-1" | |||
| errorMsg = "system error" | |||
| break | |||
| } | |||
| break | |||
| @@ -1000,7 +1012,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) | |||
| branch_name := form.BranchName | |||
| isLatestVersion := modelarts.IsLatestVersion | |||
| FlavorName := form.FlavorName | |||
| VersionCount := modelarts.VersionCount | |||
| VersionCount := modelarts.VersionCountOne | |||
| EngineName := form.EngineName | |||
| count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID) | |||
| @@ -1702,60 +1714,6 @@ func TrainJobShow(ctx *context.Context) { | |||
| ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow) | |||
| } | |||
| func TrainJobGetLog(ctx *context.Context) { | |||
| ctx.Data["PageIsTrainJob"] = true | |||
| var jobID = ctx.Params(":jobid") | |||
| var logFileName = ctx.Query("file_name") | |||
| var baseLine = ctx.Query("base_line") | |||
| var order = ctx.Query("order") | |||
| if order != modelarts.OrderDesc && order != modelarts.OrderAsc { | |||
| log.Error("order(%s) check failed", order) | |||
| ctx.HTML(http.StatusBadRequest, tplModelArtsTrainJobShow) | |||
| return | |||
| } | |||
| task, err := models.GetCloudbrainByJobID(jobID) | |||
| if err != nil { | |||
| log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error()) | |||
| ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil) | |||
| return | |||
| } | |||
| result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), baseLine, logFileName, order, modelarts.Lines) | |||
| if err != nil { | |||
| log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error()) | |||
| ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil) | |||
| return | |||
| } | |||
| ctx.Data["log"] = result | |||
| //ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow) | |||
| } | |||
| func trainJobGetLog(jobID string) (*models.GetTrainJobLogFileNamesResult, *models.GetTrainJobLogResult, error) { | |||
| task, err := models.GetCloudbrainByJobID(jobID) | |||
| if err != nil { | |||
| log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error()) | |||
| return nil, nil, err | |||
| } | |||
| resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, strconv.FormatInt(task.VersionID, 10)) | |||
| if err != nil { | |||
| log.Error("GetTrainJobLogFileNames(%s) failed:%v", jobID, err.Error()) | |||
| return nil, nil, err | |||
| } | |||
| result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), "", resultLogFile.LogFileList[0], modelarts.OrderDesc, modelarts.Lines) | |||
| if err != nil { | |||
| log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error()) | |||
| return nil, nil, err | |||
| } | |||
| return resultLogFile, result, err | |||
| } | |||
| func TrainJobDel(ctx *context.Context) { | |||
| var jobID = ctx.Params(":jobid") | |||
| var listType = ctx.Query("listType") | |||
| @@ -1822,15 +1780,6 @@ func TrainJobStop(ctx *context.Context) { | |||
| ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job?listType=" + listType) | |||
| } | |||
| func canUserCreateTrainJob(uid int64) (bool, error) { | |||
| org, err := models.GetOrgByName(setting.AllowedOrg) | |||
| if err != nil { | |||
| log.Error("get allowed org failed: ", setting.AllowedOrg) | |||
| return false, err | |||
| } | |||
| return org.IsOrgMember(uid) | |||
| } | |||
| func canUserCreateTrainJobVersion(ctx *context.Context, userID int64) (bool, error) { | |||
| if ctx == nil || ctx.User == nil { | |||
| log.Error("user unlogin!") | |||
| @@ -1922,7 +1871,7 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference | |||
| EngineName := form.EngineName | |||
| LabelName := form.LabelName | |||
| isLatestVersion := modelarts.IsLatestVersion | |||
| VersionCount := modelarts.VersionCount | |||
| VersionCount := modelarts.VersionCountOne | |||
| trainUrl := form.TrainUrl | |||
| modelName := form.ModelName | |||
| modelVersion := form.ModelVersion | |||
| @@ -1183,7 +1183,8 @@ func RegisterRoutes(m *macaron.Macaron) { | |||
| m.Group("/:id", func() { | |||
| m.Get("", reqRepoCloudBrainReader, repo.NotebookShow) | |||
| m.Get("/debug", cloudbrain.AdminOrJobCreaterRight, repo.NotebookDebug2) | |||
| m.Post("/:action", reqRepoCloudBrainWriter, repo.NotebookManage) | |||
| m.Post("/restart", cloudbrain.AdminOrJobCreaterRight, repo.NotebookRestart) | |||
| m.Post("/stop", cloudbrain.AdminOrJobCreaterRight, repo.NotebookStop) | |||
| m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.NotebookDel) | |||
| }) | |||
| m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.NotebookNew) | |||