| @@ -469,6 +469,76 @@ type NotebookDelResult struct { | |||||
| InstanceID string `json:"instance_id"` | InstanceID string `json:"instance_id"` | ||||
| } | } | ||||
| type CreateTrainJobParams struct { | |||||
| JobName string `json:"job_name"` | |||||
| Description string `json:"job_desc"` | |||||
| Config Config `json:"config"` | |||||
| WorkspaceID string `json:"workspace_id"` | |||||
| } | |||||
| type Config struct { | |||||
| WorkServerNum int `json:"worker_server_num"` | |||||
| AppUrl string `json:"app_url"` //训练作业的代码目录 | |||||
| BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下 | |||||
| Parameter []Parameter `json:"parameter"` | |||||
| DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL | |||||
| DatasetID string `json:"dataset_id"` | |||||
| DataVersionID string `json:"dataset_version_id"` | |||||
| DataSource []DataSource `json:"data_source"` | |||||
| SpecID int64 `json:"spec_id"` | |||||
| EngineID int64 `json:"engine_id"` | |||||
| ModelID int64 `json:"model_id"` | |||||
| TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL | |||||
| LogUrl string `json:"log_url"` | |||||
| UserImageUrl string `json:"user_image_url"` | |||||
| UserCommand string `json:"user_command"` | |||||
| CreateVersion bool `json:"create_version"` | |||||
| Volumes []Volumes `json:"volumes"` | |||||
| } | |||||
// Parameter is a single label/value pair; it serialises to the JSON keys
// "label" and "value".
type Parameter struct {
	Label string `json:"label"`
	Value string `json:"value"`
}
// DataSource is one entry of a "data_source" list. It serialises to the JSON
// keys "dataset_id", "dataset_version", "type" and "data_url".
type DataSource struct {
	// Dataset reference.
	DatasetID      string `json:"dataset_id"`
	DatasetVersion string `json:"dataset_version"`

	Type    string `json:"type"`
	DataUrl string `json:"data_url"`
}
| type Volumes struct { | |||||
| Nfs Nfs `json:"nfs"` | |||||
| HostPath HostPath `json:"host_path"` | |||||
| } | |||||
// Nfs is the "nfs" object of a Volumes entry. It serialises to the JSON keys
// "id", "src_path", "dest_path" and "read_only".
type Nfs struct {
	ID string `json:"id"`

	// Source and destination paths.
	SourcePath string `json:"src_path"`
	DestPath   string `json:"dest_path"`

	ReadOnly bool `json:"read_only"`
}
// HostPath is the "host_path" object of a Volumes entry. It serialises to the
// JSON keys "src_path", "dest_path" and "read_only".
type HostPath struct {
	// Source and destination paths.
	SourcePath string `json:"src_path"`
	DestPath   string `json:"dest_path"`

	ReadOnly bool `json:"read_only"`
}
// CreateTrainJobResult is the JSON document decoded from the reply to a
// training-job creation request.
type CreateTrainJobResult struct {
	// Outcome of the call: an error code/message pair and a success flag.
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_msg"`
	IsSuccess bool   `json:"is_success"`

	// Description of the job as reported in the reply.
	JobName      string `json:"job_name"`
	JobID        int64  `json:"job_id"`
	Status       int    `json:"status"`
	CreationTime int64  `json:"create_time"`
	VersionID    int64  `json:"version_id"`
	ResourceID   string `json:"resource_id"`
	VersionName  string `json:"version_name"`
}
| func Cloudbrains(opts *CloudbrainsOptions) ([]*Cloudbrain, int64, error) { | func Cloudbrains(opts *CloudbrainsOptions) ([]*Cloudbrain, int64, error) { | ||||
| sess := x.NewSession() | sess := x.NewSession() | ||||
| defer sess.Close() | defer sess.Close() | ||||
| @@ -18,6 +18,8 @@ func (f *CreateModelArtsNotebookForm) Validate(ctx *macaron.Context, errs bindin | |||||
// CreateModelArtsTrainJobForm holds the fields submitted when a train job is
// created. Every field except Description is bound as Required.
type CreateModelArtsTrainJobForm struct {
	// Required fields.
	JobName          string `form:"job_name" binding:"Required"`
	Attachment       string `form:"attachment" binding:"Required"`
	BootFile         string `form:"boot_file" binding:"Required"`
	WorkServerNumber int    `form:"work_server_number" binding:"Required"`

	// Optional free-text description.
	Description string `form:"description"`
}
| @@ -1,12 +1,13 @@ | |||||
| package modelarts | package modelarts | ||||
| import ( | import ( | ||||
| "code.gitea.io/gitea/modules/setting" | |||||
| "path" | "path" | ||||
| "strconv" | |||||
| "code.gitea.io/gitea/models" | "code.gitea.io/gitea/models" | ||||
| "code.gitea.io/gitea/modules/context" | "code.gitea.io/gitea/modules/context" | ||||
| "code.gitea.io/gitea/modules/log" | "code.gitea.io/gitea/modules/log" | ||||
| "code.gitea.io/gitea/modules/setting" | |||||
| ) | ) | ||||
| const ( | const ( | ||||
| @@ -19,12 +20,27 @@ const ( | |||||
| NotebookEnv = "Python3" | NotebookEnv = "Python3" | ||||
| NotebookType = "Ascend" | NotebookType = "Ascend" | ||||
| FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)" | FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)" | ||||
| CodeLocalPath = "/code/" | |||||
| engineID = 118 | |||||
| CodePath = "/code/" | |||||
| OutputPath = "/output/" | |||||
| JobPath = "/job/" | |||||
| ) | ) | ||||
// GenerateTrainJobReq bundles the inputs of GenerateTrainJob.
type GenerateTrainJobReq struct {
	// Identification of the job.
	JobName     string
	Uuid        string
	Description string

	// Code location and the boot file; GenerateTrainJob appends BootFile to
	// CodeObsPath to build the boot file URL.
	CodeObsPath string
	BootFile    string

	// Input data URL and output (training) URL.
	DataUrl  string
	TrainUrl string

	WorkServerNumber int
}
| func GenerateTask(ctx *context.Context, jobName, uuid, description string) error { | func GenerateTask(ctx *context.Context, jobName, uuid, description string) error { | ||||
| dataActualPath := setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/" | dataActualPath := setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/" | ||||
| jobResult, err := CreateJob(models.CreateNotebookParams{ | |||||
| jobResult, err := createNotebook(models.CreateNotebookParams{ | |||||
| JobName: jobName, | JobName: jobName, | ||||
| Description:description, | Description:description, | ||||
| ProfileID: profileID, | ProfileID: profileID, | ||||
| @@ -64,3 +80,39 @@ func GenerateTask(ctx *context.Context, jobName, uuid, description string) error | |||||
| return nil | return nil | ||||
| } | } | ||||
| func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error { | |||||
| jobResult, err := createTrainJob(models.CreateTrainJobParams{ | |||||
| JobName: req.JobName, | |||||
| Description: req.Description, | |||||
| Config: models.Config{ | |||||
| WorkServerNum: req.WorkServerNumber, | |||||
| AppUrl: req.CodeObsPath, | |||||
| BootFileUrl: req.CodeObsPath + req.BootFile, | |||||
| DataUrl: req.DataUrl, | |||||
| EngineID: engineID, | |||||
| TrainUrl: req.TrainUrl, | |||||
| }, | |||||
| }) | |||||
| if err != nil { | |||||
| log.Error("CreateJob failed: %v", err.Error()) | |||||
| return err | |||||
| } | |||||
| err = models.CreateCloudbrain(&models.Cloudbrain{ | |||||
| Status: strconv.Itoa(jobResult.Status), | |||||
| UserID: ctx.User.ID, | |||||
| RepoID: ctx.Repo.Repository.ID, | |||||
| JobID: strconv.FormatInt(jobResult.JobID, 10), | |||||
| JobName: req.JobName, | |||||
| JobType: string(models.JobTypeDebug), | |||||
| Type: models.TypeCloudBrainTrainJob, | |||||
| }) | |||||
| if err != nil { | |||||
| return err | |||||
| } | |||||
| return nil | |||||
| } | |||||
| @@ -23,6 +23,7 @@ const ( | |||||
| urlGetToken = "/v3/auth/tokens" | urlGetToken = "/v3/auth/tokens" | ||||
| urlNotebook = "/demanager/instances" | urlNotebook = "/demanager/instances" | ||||
| urlTrainJob = "/training-jobs" | |||||
| errorCodeExceedLimit = "ModelArts.0118" | errorCodeExceedLimit = "ModelArts.0118" | ||||
| ) | ) | ||||
| func getRestyClient() *resty.Client { | func getRestyClient() *resty.Client { | ||||
| @@ -87,7 +88,7 @@ func getToken() error { | |||||
| return nil | return nil | ||||
| } | } | ||||
| func CreateJob(createJobParams models.CreateNotebookParams) (*models.CreateNotebookResult, error) { | |||||
| func createNotebook(createJobParams models.CreateNotebookParams) (*models.CreateNotebookResult, error) { | |||||
| checkSetting() | checkSetting() | ||||
| client := getRestyClient() | client := getRestyClient() | ||||
| var result models.CreateNotebookResult | var result models.CreateNotebookResult | ||||
| @@ -103,7 +104,7 @@ sendjob: | |||||
| Post(HOST + "/v1/" + setting.ProjectID + urlNotebook) | Post(HOST + "/v1/" + setting.ProjectID + urlNotebook) | ||||
| if err != nil { | if err != nil { | ||||
| return nil, fmt.Errorf("resty create job: %s", err) | |||||
| return nil, fmt.Errorf("resty create notebook: %s", err) | |||||
| } | } | ||||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | ||||
| @@ -120,11 +121,11 @@ sendjob: | |||||
| } | } | ||||
| if len(response.ErrorCode) != 0 { | if len(response.ErrorCode) != 0 { | ||||
| log.Error("CreateJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||||
| log.Error("createNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||||
| if response.ErrorCode == errorCodeExceedLimit { | if response.ErrorCode == errorCodeExceedLimit { | ||||
| response.ErrorMsg = "所选规格使用数量已超过最大配额限制。" | response.ErrorMsg = "所选规格使用数量已超过最大配额限制。" | ||||
| } | } | ||||
| return &result, fmt.Errorf("CreateJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||||
| return &result, fmt.Errorf("createNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||||
| } | } | ||||
| return &result, nil | return &result, nil | ||||
| @@ -286,3 +287,36 @@ sendjob: | |||||
| return &result, nil | return &result, nil | ||||
| } | } | ||||
| func createTrainJob(createJobParams models.CreateTrainJobParams) (*models.CreateTrainJobResult, error) { | |||||
| checkSetting() | |||||
| client := getRestyClient() | |||||
| var result models.CreateTrainJobResult | |||||
| retry := 0 | |||||
| sendjob: | |||||
| res, err := client.R(). | |||||
| SetHeader("Content-Type", "application/json"). | |||||
| SetAuthToken(TOKEN). | |||||
| SetBody(createJobParams). | |||||
| SetResult(&result). | |||||
| Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob) | |||||
| if err != nil { | |||||
| return nil, fmt.Errorf("resty create train-job: %s", err) | |||||
| } | |||||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||||
| retry++ | |||||
| _ = getToken() | |||||
| goto sendjob | |||||
| } | |||||
| if !result.IsSuccess { | |||||
| log.Error("createTrainJob failed(%s): %s", result.ErrorCode, result.ErrorMsg) | |||||
| return &result, fmt.Errorf("createTrainJob failed(%s): %s", result.ErrorCode, result.ErrorMsg) | |||||
| } | |||||
| return &result, nil | |||||
| } | |||||
| @@ -9,6 +9,7 @@ import ( | |||||
| "github.com/unknwon/com" | "github.com/unknwon/com" | ||||
| "io" | "io" | ||||
| "os" | "os" | ||||
| "path" | |||||
| "strconv" | "strconv" | ||||
| "strings" | "strings" | ||||
| "time" | "time" | ||||
| @@ -306,36 +307,45 @@ func TrainJobNew(ctx *context.Context) { | |||||
| func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) { | func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) { | ||||
| ctx.Data["PageIsCloudBrain"] = true | ctx.Data["PageIsCloudBrain"] = true | ||||
| jobName := form.JobName | jobName := form.JobName | ||||
| /* | |||||
| uuid := form.Attachment | uuid := form.Attachment | ||||
| description := form.Description | description := form.Description | ||||
| */ | |||||
| workServerNumber := form.WorkServerNumber | |||||
| bootFile := form.BootFile | |||||
| repo := ctx.Repo.Repository | repo := ctx.Repo.Repository | ||||
| codePath := setting.JobPath + jobName + modelarts.CodeLocalPath | |||||
| codeLocalPath := setting.JobPath + jobName + modelarts.CodePath | |||||
| codeObsPath := setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath | |||||
| outputObsPath := setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath | |||||
| dataPath := setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/" | |||||
| if err := git.Clone(repo.RepoPath(), codePath, git.CloneRepoOptions{}); err != nil { | |||||
| if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{}); err != nil { | |||||
| log.Error("Failed to clone repository: %s (%v)", repo.FullName(), err) | log.Error("Failed to clone repository: %s (%v)", repo.FullName(), err) | ||||
| ctx.RenderWithErr("Failed to clone repository", tplModelArtsTrainJobNew, &form) | ctx.RenderWithErr("Failed to clone repository", tplModelArtsTrainJobNew, &form) | ||||
| return | return | ||||
| } | } | ||||
| //todo: upload code (send to file_server todo this work?) | //todo: upload code (send to file_server todo this work?) | ||||
| if err := uploadCodeToObs(codePath, jobName, ""); err != nil { | |||||
| if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil { | |||||
| log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err) | log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err) | ||||
| ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobNew, &form) | ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobNew, &form) | ||||
| return | return | ||||
| } | } | ||||
| /* | |||||
| err := modelarts.GenerateTask(ctx, jobName, uuid, description) | |||||
| req := &modelarts.GenerateTrainJobReq{ | |||||
| JobName: jobName, | |||||
| DataUrl: dataPath, | |||||
| Description: description, | |||||
| CodeObsPath: codeObsPath, | |||||
| BootFile: bootFile, | |||||
| TrainUrl: outputObsPath, | |||||
| WorkServerNumber: workServerNumber, | |||||
| } | |||||
| err := modelarts.GenerateTrainJob(ctx, req) | |||||
| if err != nil { | if err != nil { | ||||
| ctx.RenderWithErr(err.Error(), tplModelArtsNotebookNew, &form) | |||||
| ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) | |||||
| return | return | ||||
| } | } | ||||
| */ | |||||
| ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") | ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") | ||||
| } | } | ||||
| @@ -350,6 +360,7 @@ func readDir(dirname string) ([]os.FileInfo, error) { | |||||
| list, err := f.Readdir(100) | list, err := f.Readdir(100) | ||||
| f.Close() | f.Close() | ||||
| if err != nil { | if err != nil { | ||||
| //todo: can not upload empty folder | |||||
| if err == io.EOF { | if err == io.EOF { | ||||
| return nil, nil | return nil, nil | ||||
| } | } | ||||
| @@ -361,7 +372,6 @@ func readDir(dirname string) ([]os.FileInfo, error) { | |||||
| } | } | ||||
| func uploadCodeToObs(codePath, jobName, parentDir string) error { | func uploadCodeToObs(codePath, jobName, parentDir string) error { | ||||
| log.Info(codePath) | |||||
| files, err := readDir(codePath) | files, err := readDir(codePath) | ||||
| if err != nil { | if err != nil { | ||||
| log.Error("readDir(%s) failed: %s", codePath, err.Error()) | log.Error("readDir(%s) failed: %s", codePath, err.Error()) | ||||
| @@ -373,7 +383,6 @@ func uploadCodeToObs(codePath, jobName, parentDir string) error { | |||||
| input := &obs.PutObjectInput{} | input := &obs.PutObjectInput{} | ||||
| input.Bucket = setting.Bucket | input.Bucket = setting.Bucket | ||||
| input.Key = codePath + file.Name() + "/" | input.Key = codePath + file.Name() + "/" | ||||
| log.Info(input.Key) | |||||
| _, err = storage.ObsCli.PutObject(input) | _, err = storage.ObsCli.PutObject(input) | ||||
| if err != nil { | if err != nil { | ||||
| log.Error("PutObject(%s) failed: %s", input.Key, err.Error()) | log.Error("PutObject(%s) failed: %s", input.Key, err.Error()) | ||||
| @@ -388,9 +397,7 @@ func uploadCodeToObs(codePath, jobName, parentDir string) error { | |||||
| input := &obs.PutFileInput{} | input := &obs.PutFileInput{} | ||||
| input.Bucket = setting.Bucket | input.Bucket = setting.Bucket | ||||
| input.Key = setting.CodePathPrefix + jobName + "/" + parentDir + file.Name() | input.Key = setting.CodePathPrefix + jobName + "/" + parentDir + file.Name() | ||||
| log.Info(input.Key) | |||||
| input.SourceFile = codePath + file.Name() | input.SourceFile = codePath + file.Name() | ||||
| log.Info(input.SourceFile) | |||||
| _, err = storage.ObsCli.PutFile(input) | _, err = storage.ObsCli.PutFile(input) | ||||
| if err != nil { | if err != nil { | ||||
| log.Error("PutFile(%s) failed: %s", input.SourceFile, err.Error()) | log.Error("PutFile(%s) failed: %s", input.SourceFile, err.Error()) | ||||