// CreateTrainJobParams is the JSON request body used to create a training
// job.
type CreateTrainJobParams struct {
	JobName     string `json:"job_name"`
	Description string `json:"job_desc"`
	Config      Config `json:"config"`
	WorkspaceID string `json:"workspace_id"`
}

// Config is the "config" object of CreateTrainJobParams.
//
// Optional fields carry omitempty so that a caller which fills in only one
// way of describing the input (for example DataUrl) does not also send empty
// dataset_id / data_source / model_id keys.
// NOTE(review): this assumes the service treats data_url, dataset_id +
// dataset_version_id and data_source (and likewise engine_id and model_id)
// as alternatives — confirm against the ModelArts API reference.
type Config struct {
	WorkServerNum int          `json:"worker_server_num"`
	AppUrl        string       `json:"app_url"`       // code directory of the training job
	BootFileUrl   string       `json:"boot_file_url"` // boot file of the training job's code; must be located under the code directory
	Parameter     []Parameter  `json:"parameter,omitempty"`
	DataUrl       string       `json:"data_url,omitempty"` // OBS path URL of the dataset the training job needs
	DatasetID     string       `json:"dataset_id,omitempty"`
	DataVersionID string       `json:"dataset_version_id,omitempty"`
	DataSource    []DataSource `json:"data_source,omitempty"`
	SpecID        int64        `json:"spec_id"`
	EngineID      int64        `json:"engine_id,omitempty"`
	ModelID       int64        `json:"model_id,omitempty"`
	TrainUrl      string       `json:"train_url"` // OBS path URL for the training job's output files
	LogUrl        string       `json:"log_url,omitempty"`
	UserImageUrl  string       `json:"user_image_url,omitempty"`
	UserCommand   string       `json:"user_command,omitempty"`
	CreateVersion bool         `json:"create_version"`
	Volumes       []Volumes    `json:"volumes,omitempty"`
}

// Parameter is one label/value entry of Config.Parameter.
type Parameter struct {
	Label string `json:"label"`
	Value string `json:"value"`
}

// DataSource is one entry of Config.DataSource.
type DataSource struct {
	DatasetID      string `json:"dataset_id"`
	DatasetVersion string `json:"dataset_version"`
	Type           string `json:"type"`
	DataUrl        string `json:"data_url"`
}

// Volumes is one entry of Config.Volumes; it carries an "nfs" and a
// "host_path" object.
type Volumes struct {
	Nfs      Nfs      `json:"nfs"`
	HostPath HostPath `json:"host_path"`
}

// Nfs is the "nfs" object of a Volumes entry.
type Nfs struct {
	ID         string `json:"id"`
	SourcePath string `json:"src_path"`
	DestPath   string `json:"dest_path"`
	ReadOnly   bool   `json:"read_only"`
}

// HostPath is the "host_path" object of a Volumes entry.
type HostPath struct {
	SourcePath string `json:"src_path"`
	DestPath   string `json:"dest_path"`
	ReadOnly   bool   `json:"read_only"`
}

// CreateTrainJobResult is the JSON reply decoded after a create-training-job
// request. IsSuccess reports whether the request was accepted; ErrorCode and
// ErrorMsg describe the failure otherwise.
type CreateTrainJobResult struct {
	ErrorCode    string `json:"error_code"`
	ErrorMsg     string `json:"error_msg"`
	IsSuccess    bool   `json:"is_success"`
	JobName      string `json:"job_name"`
	JobID        int64  `json:"job_id"`
	Status       int    `json:"status"`
	CreationTime int64  `json:"create_time"`
	VersionID    int64  `json:"version_id"`
	ResourceID   string `json:"resource_id"`
	VersionName  string `json:"version_name"`
}
setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/" - jobResult, err := CreateJob(models.CreateNotebookParams{ + jobResult, err := createNotebook(models.CreateNotebookParams{ JobName: jobName, Description:description, ProfileID: profileID, @@ -64,3 +80,39 @@ func GenerateTask(ctx *context.Context, jobName, uuid, description string) error return nil } + +func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error { + jobResult, err := createTrainJob(models.CreateTrainJobParams{ + JobName: req.JobName, + Description: req.Description, + Config: models.Config{ + WorkServerNum: req.WorkServerNumber, + AppUrl: req.CodeObsPath, + BootFileUrl: req.CodeObsPath + req.BootFile, + DataUrl: req.DataUrl, + EngineID: engineID, + TrainUrl: req.TrainUrl, + }, + + }) + if err != nil { + log.Error("CreateJob failed: %v", err.Error()) + return err + } + + err = models.CreateCloudbrain(&models.Cloudbrain{ + Status: strconv.Itoa(jobResult.Status), + UserID: ctx.User.ID, + RepoID: ctx.Repo.Repository.ID, + JobID: strconv.FormatInt(jobResult.JobID, 10), + JobName: req.JobName, + JobType: string(models.JobTypeDebug), + Type: models.TypeCloudBrainTrainJob, + }) + + if err != nil { + return err + } + + return nil +} diff --git a/modules/modelarts/resty.go b/modules/modelarts/resty.go index df020decb..b698c1d65 100755 --- a/modules/modelarts/resty.go +++ b/modules/modelarts/resty.go @@ -23,6 +23,7 @@ const ( urlGetToken = "/v3/auth/tokens" urlNotebook = "/demanager/instances" + urlTrainJob = "/training-jobs" errorCodeExceedLimit = "ModelArts.0118" ) func getRestyClient() *resty.Client { @@ -87,7 +88,7 @@ func getToken() error { return nil } -func CreateJob(createJobParams models.CreateNotebookParams) (*models.CreateNotebookResult, error) { +func createNotebook(createJobParams models.CreateNotebookParams) (*models.CreateNotebookResult, error) { checkSetting() client := getRestyClient() var result models.CreateNotebookResult @@ -103,7 
+104,7 @@ sendjob: Post(HOST + "/v1/" + setting.ProjectID + urlNotebook) if err != nil { - return nil, fmt.Errorf("resty create job: %s", err) + return nil, fmt.Errorf("resty create notebook: %s", err) } if res.StatusCode() == http.StatusUnauthorized && retry < 1 { @@ -120,11 +121,11 @@ sendjob: } if len(response.ErrorCode) != 0 { - log.Error("CreateJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) + log.Error("createNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg) if response.ErrorCode == errorCodeExceedLimit { response.ErrorMsg = "所选规格使用数量已超过最大配额限制。" } - return &result, fmt.Errorf("CreateJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) + return &result, fmt.Errorf("createNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg) } return &result, nil @@ -286,3 +287,36 @@ sendjob: return &result, nil } + +func createTrainJob(createJobParams models.CreateTrainJobParams) (*models.CreateTrainJobResult, error) { + checkSetting() + client := getRestyClient() + var result models.CreateTrainJobResult + + retry := 0 + +sendjob: + res, err := client.R(). + SetHeader("Content-Type", "application/json"). + SetAuthToken(TOKEN). + SetBody(createJobParams). + SetResult(&result). 
+ Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob) + + if err != nil { + return nil, fmt.Errorf("resty create train-job: %s", err) + } + + if res.StatusCode() == http.StatusUnauthorized && retry < 1 { + retry++ + _ = getToken() + goto sendjob + } + + if !result.IsSuccess { + log.Error("createTrainJob failed(%s): %s", result.ErrorCode, result.ErrorMsg) + return &result, fmt.Errorf("createTrainJob failed(%s): %s", result.ErrorCode, result.ErrorMsg) + } + + return &result, nil +} diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index 29e4773cc..fb00f1ee6 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -9,6 +9,7 @@ import ( "github.com/unknwon/com" "io" "os" + "path" "strconv" "strings" "time" @@ -306,36 +307,45 @@ func TrainJobNew(ctx *context.Context) { func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) { ctx.Data["PageIsCloudBrain"] = true jobName := form.JobName - /* uuid := form.Attachment description := form.Description - - */ + workServerNumber := form.WorkServerNumber + bootFile := form.BootFile repo := ctx.Repo.Repository - codePath := setting.JobPath + jobName + modelarts.CodeLocalPath + codeLocalPath := setting.JobPath + jobName + modelarts.CodePath + codeObsPath := setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath + outputObsPath := setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath + dataPath := setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/" - if err := git.Clone(repo.RepoPath(), codePath, git.CloneRepoOptions{}); err != nil { + if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{}); err != nil { log.Error("Failed to clone repository: %s (%v)", repo.FullName(), err) ctx.RenderWithErr("Failed to clone repository", tplModelArtsTrainJobNew, &form) return } //todo: upload code (send to file_server todo this work?) 
- if err := uploadCodeToObs(codePath, jobName, ""); err != nil { + if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil { log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err) ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobNew, &form) return } - /* - err := modelarts.GenerateTask(ctx, jobName, uuid, description) + req := &modelarts.GenerateTrainJobReq{ + JobName: jobName, + DataUrl: dataPath, + Description: description, + CodeObsPath: codeObsPath, + BootFile: bootFile, + TrainUrl: outputObsPath, + WorkServerNumber: workServerNumber, + } + + err := modelarts.GenerateTrainJob(ctx, req) if err != nil { - ctx.RenderWithErr(err.Error(), tplModelArtsNotebookNew, &form) + ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) return } - */ - ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") } @@ -350,6 +360,7 @@ func readDir(dirname string) ([]os.FileInfo, error) { list, err := f.Readdir(100) f.Close() if err != nil { + //todo: can not upload empty folder if err == io.EOF { return nil, nil } @@ -361,7 +372,6 @@ func readDir(dirname string) ([]os.FileInfo, error) { } func uploadCodeToObs(codePath, jobName, parentDir string) error { - log.Info(codePath) files, err := readDir(codePath) if err != nil { log.Error("readDir(%s) failed: %s", codePath, err.Error()) @@ -373,7 +383,6 @@ func uploadCodeToObs(codePath, jobName, parentDir string) error { input := &obs.PutObjectInput{} input.Bucket = setting.Bucket input.Key = codePath + file.Name() + "/" - log.Info(input.Key) _, err = storage.ObsCli.PutObject(input) if err != nil { log.Error("PutObject(%s) failed: %s", input.Key, err.Error()) @@ -388,9 +397,7 @@ func uploadCodeToObs(codePath, jobName, parentDir string) error { input := &obs.PutFileInput{} input.Bucket = setting.Bucket input.Key = setting.CodePathPrefix + jobName + "/" + parentDir + file.Name() - log.Info(input.Key) input.SourceFile = codePath + file.Name() - log.Info(input.SourceFile) 
_, err = storage.ObsCli.PutFile(input) if err != nil { log.Error("PutFile(%s) failed: %s", input.SourceFile, err.Error())