| @@ -60,6 +60,8 @@ type Cloudbrain struct { | |||
| DeletedAt time.Time `xorm:"deleted"` | |||
| CanDebug bool `xorm:"-"` | |||
| Type int `xorm:"INDEX DEFAULT 0"` | |||
| VersionID int64 `xorm:"INDEX DEFAULT 0"` | |||
| VersionName string | |||
| User *User `xorm:"-"` | |||
| Repo *Repository `xorm:"-"` | |||
| @@ -499,7 +501,7 @@ type Config struct { | |||
| LogUrl string `json:"log_url"` | |||
| //UserImageUrl string `json:"user_image_url"` | |||
| //UserCommand string `json:"user_command"` | |||
| //CreateVersion bool `json:"create_version"` | |||
| CreateVersion bool `json:"create_version"` | |||
| //Volumes []Volumes `json:"volumes"` | |||
| Flavor Flavor `json:"flavor"` | |||
| PoolID string `json:"pool_id"` | |||
| @@ -507,7 +509,7 @@ type Config struct { | |||
| type CreateConfigParams struct { | |||
| ConfigName string `json:"config_name"` | |||
| Description string `json:"config_desc"` | |||
| Description string `json:"config_desc"` | |||
| WorkServerNum int `json:"worker_server_num"` | |||
| AppUrl string `json:"app_url"` //训练作业的代码目录 | |||
| BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下 | |||
| @@ -570,7 +572,7 @@ type CreateTrainJobResult struct { | |||
| JobName string `json:"job_name"` | |||
| JobID int64 `json:"job_id"` | |||
| Status int `json:"status"` | |||
| CreationTime int64 `json:"create_time"` | |||
| CreateTime int64 `json:"create_time"` | |||
| VersionID int64 `json:"version_id"` | |||
| ResourceID string `json:"resource_id"` | |||
| VersionName string `json:"version_name"` | |||
| @@ -610,6 +612,43 @@ type ErrorResult struct { | |||
| IsSuccess bool `json:"is_success"` | |||
| } | |||
| type GetTrainJobResult struct { | |||
| IsSuccess bool `json:"is_success"` | |||
| JobName string `json:"job_name"` | |||
| JobID int64 `json:"job_id"` | |||
| Description string `json:"job_desc"` | |||
| Status int `json:"status"` | |||
| LongCreateTime int64 `json:"create_time"` | |||
| CreateTime string | |||
| Duration int64 `json:"duration"` //训练作业的运行时间,单位为毫秒 | |||
| VersionID int64 `json:"version_id"` | |||
| ResourceID string `json:"resource_id"` | |||
| VersionName string `json:"version_name"` | |||
| PreVersionID int64 `json:"pre_version_id"` | |||
| WorkServerNum int `json:"worker_server_num"` | |||
| AppUrl string `json:"app_url"` //训练作业的代码目录 | |||
| BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下 | |||
| Parameter []Parameter `json:"parameter"` | |||
| DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL | |||
| //DatasetID string `json:"dataset_id"` | |||
| //DataVersionID string `json:"dataset_version_id"` | |||
| //DataSource []DataSource `json:"data_source"` | |||
| //SpecID int64 `json:"spec_id"` | |||
| EngineID int64 `json:"engine_id"` | |||
| //ModelID int64 `json:"model_id"` | |||
| TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL | |||
| LogUrl string `json:"log_url"` | |||
| //UserImageUrl string `json:"user_image_url"` | |||
| //UserCommand string `json:"user_command"` | |||
| CreateVersion bool `json:"create_version"` | |||
| //Volumes []Volumes `json:"volumes"` | |||
| Flavor Flavor `json:"flavor"` | |||
| PoolID string `json:"pool_id"` | |||
| PoolName string `json:"pool_name"` | |||
| NasMountPath string `json:"nas_mount_path"` | |||
| NasShareAddr string `json:"nas_share_addr"` | |||
| } | |||
| func Cloudbrains(opts *CloudbrainsOptions) ([]*Cloudbrain, int64, error) { | |||
| sess := x.NewSession() | |||
| defer sess.Close() | |||
| @@ -149,6 +149,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error { | |||
| TrainUrl: req.TrainUrl, | |||
| LogUrl: req.LogUrl, | |||
| PoolID: req.PoolID, | |||
| CreateVersion: true, | |||
| Flavor: models.Flavor{ | |||
| Code: req.FlavorCode, | |||
| }, | |||
| @@ -161,18 +162,75 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error { | |||
| } | |||
| err = models.CreateCloudbrain(&models.Cloudbrain{ | |||
| Status: strconv.Itoa(jobResult.Status), | |||
| Status: transTrainJobStatus(jobResult.Status), | |||
| UserID: ctx.User.ID, | |||
| RepoID: ctx.Repo.Repository.ID, | |||
| JobID: strconv.FormatInt(jobResult.JobID, 10), | |||
| JobName: req.JobName, | |||
| JobType: string(models.JobTypeDebug), | |||
| Type: models.TypeCloudBrainTrainJob, | |||
| VersionID: jobResult.VersionID, | |||
| VersionName: jobResult.VersionName, | |||
| }) | |||
| if err != nil { | |||
| log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error()) | |||
| return err | |||
| } | |||
| return nil | |||
| } | |||
// trainJobStatusNames holds the textual name of every known numeric training
// job status; the array index is the status code.
var trainJobStatusNames = [...]string{
	0:  "UNKNOWN",
	1:  "INIT",
	2:  "IMAGE_CREATING",
	3:  "IMAGE_FAILED",
	4:  "SUBMIT_TRYING",
	5:  "SUBMIT_FAILED",
	6:  "DELETE_FAILED",
	7:  "WAITING",
	8:  "RUNNING",
	9:  "KILLING",
	10: "COMPLETED",
	11: "FAILED",
	12: "KILLED",
	13: "CANCELED",
	14: "LOST",
	15: "SCALING",
	16: "SUBMIT_MODEL_FAILED",
	17: "DEPLOY_SERVICE_FAILED",
	18: "CHECK_INIT",
	19: "CHECK_RUNNING",
	20: "CHECK_RUNNING_COMPLETED",
	21: "CHECK_FAILED",
}

// transTrainJobStatus converts a numeric training-job status code into its
// textual name.
//
// Codes 0 through 21 map to the names in trainJobStatusNames. Any other value
// (negative or above the table) is returned as its decimal representation so
// an unrecognised code is still stored in a readable form.
func transTrainJobStatus(status int) string {
	if status >= 0 && status < len(trainJobStatusNames) {
		return trainJobStatusNames[status]
	}
	return strconv.Itoa(status)
}
| @@ -425,3 +425,87 @@ sendjob: | |||
| return &result, nil | |||
| } | |||
| func GetConfigList() (*models.GetResourceSpecsResult, error) { | |||
| checkSetting() | |||
| client := getRestyClient() | |||
| var result models.GetResourceSpecsResult | |||
| retry := 0 | |||
| sendjob: | |||
| res, err := client.R(). | |||
| SetHeader("Content-Type", "application/json"). | |||
| SetAuthToken(TOKEN). | |||
| SetResult(&result). | |||
| Get(HOST + "/v1/" + setting.ProjectID + urlTrainJobConfig) | |||
| if err != nil { | |||
| return nil, fmt.Errorf("resty GetResourceSpecs: %v", err) | |||
| } | |||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||
| retry++ | |||
| _ = getToken() | |||
| goto sendjob | |||
| } | |||
| if res.StatusCode() != http.StatusOK { | |||
| var temp models.ErrorResult | |||
| if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { | |||
| log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||
| return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||
| } | |||
| log.Error("GetResourceSpecs failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| return &result, fmt.Errorf("GetResourceSpecs failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| } | |||
| if !result.IsSuccess { | |||
| log.Error("GetResourceSpecs failed(%s): %s", result.ErrorCode, result.ErrorMsg) | |||
| return &result, fmt.Errorf("GetResourceSpecs failed(%s): %s", result.ErrorCode, result.ErrorMsg) | |||
| } | |||
| return &result, nil | |||
| } | |||
| func GetTrainJob(jobID, versionID string) (*models.GetTrainJobResult, error) { | |||
| checkSetting() | |||
| client := getRestyClient() | |||
| var result models.GetTrainJobResult | |||
| retry := 0 | |||
| sendjob: | |||
| res, err := client.R(). | |||
| SetHeader("Content-Type", "application/json"). | |||
| SetAuthToken(TOKEN). | |||
| SetResult(&result). | |||
| Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/" + versionID) | |||
| if err != nil { | |||
| return nil, fmt.Errorf("resty GetTrainJob: %v", err) | |||
| } | |||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||
| retry++ | |||
| _ = getToken() | |||
| goto sendjob | |||
| } | |||
| if res.StatusCode() != http.StatusOK { | |||
| var temp models.ErrorResult | |||
| if err = json.Unmarshal([]byte(res.String()), &temp); err != nil { | |||
| log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||
| return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error()) | |||
| } | |||
| log.Error("GetTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| return &result, fmt.Errorf("GetTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg) | |||
| } | |||
| if !result.IsSuccess { | |||
| log.Error("GetTrainJob(%s) failed", jobID) | |||
| return &result, fmt.Errorf("获取作业详情失败") | |||
| } | |||
| return &result, nil | |||
| } | |||
| @@ -9,6 +9,7 @@ import ( | |||
| "errors" | |||
| "github.com/unknwon/com" | |||
| "io" | |||
| "net/http" | |||
| "os" | |||
| "path" | |||
| "strconv" | |||
| @@ -379,6 +380,8 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) | |||
| return | |||
| } | |||
| //todo: del local code? | |||
| if isSaveParam == "on" { | |||
| if form.ParameterTemplateName == "" { | |||
| log.Error("ParameterTemplateName is empty") | |||
| @@ -522,3 +525,32 @@ func paramCheckCreateTrainJob(form auth.CreateModelArtsTrainJobForm) error { | |||
| return nil | |||
| } | |||
| func TrainJobShow(ctx *context.Context) { | |||
| ctx.Data["PageIsCloudBrain"] = true | |||
| var jobID = ctx.Params(":jobid") | |||
| task, err := models.GetCloudbrainByJobID(jobID) | |||
| if err != nil { | |||
| log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error()) | |||
| ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil) | |||
| return | |||
| } | |||
| result, err := modelarts.GetTrainJob(jobID, strconv.FormatInt(task.VersionID, 10)) | |||
| if err != nil { | |||
| log.Error("GetJob(%s) failed:%v", jobID, err.Error()) | |||
| ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil) | |||
| return | |||
| } | |||
| if result != nil { | |||
| createTime, _ := com.StrTo(result.LongCreateTime).Int64() | |||
| result.CreateTime = time.Unix(int64(createTime/1000), 0).Format("2006-01-02 15:04:05") | |||
| } | |||
| ctx.Data["task"] = task | |||
| ctx.Data["jobID"] = jobID | |||
| ctx.Data["result"] = result | |||
| ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow) | |||
| } | |||
| @@ -932,7 +932,7 @@ func RegisterRoutes(m *macaron.Macaron) { | |||
| m.Group("/train-job", func() { | |||
| m.Get("", reqRepoCloudBrainReader, repo.TrainJobIndex) | |||
| m.Group("/:jobid", func() { | |||
| m.Get("", reqRepoCloudBrainReader, repo.NotebookShow) | |||
| m.Get("", reqRepoCloudBrainReader, repo.TrainJobShow) | |||
| m.Get("/debug", reqRepoCloudBrainReader, repo.NotebookDebug) | |||
| m.Post("/stop", reqRepoCloudBrainWriter, repo.NotebookStop) | |||
| m.Post("/del", reqRepoCloudBrainWriter, repo.NotebookDel) | |||