| @@ -469,6 +469,76 @@ type NotebookDelResult struct { | |||
| InstanceID string `json:"instance_id"` | |||
| } | |||
| type CreateTrainJobParams struct { | |||
| JobName string `json:"job_name"` | |||
| Description string `json:"job_desc"` | |||
| Config Config `json:"config"` | |||
| WorkspaceID string `json:"workspace_id"` | |||
| } | |||
| type Config struct { | |||
| WorkServerNum int `json:"worker_server_num"` | |||
| AppUrl string `json:"app_url"` //训练作业的代码目录 | |||
| BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下 | |||
| Parameter []Parameter `json:"parameter"` | |||
| DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL | |||
| DatasetID string `json:"dataset_id"` | |||
| DataVersionID string `json:"dataset_version_id"` | |||
| DataSource []DataSource `json:"data_source"` | |||
| SpecID int64 `json:"spec_id"` | |||
| EngineID int64 `json:"engine_id"` | |||
| ModelID int64 `json:"model_id"` | |||
| TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL | |||
| LogUrl string `json:"log_url"` | |||
| UserImageUrl string `json:"user_image_url"` | |||
| UserCommand string `json:"user_command"` | |||
| CreateVersion bool `json:"create_version"` | |||
| Volumes []Volumes `json:"volumes"` | |||
| } | |||
// Parameter is one label/value pair in Config.Parameter.
type Parameter struct {
	Label string `json:"label"`
	Value string `json:"value"`
}
// DataSource is one entry of Config.DataSource. It identifies input data by
// dataset ID and version, a type, and a data URL.
type DataSource struct {
	DatasetID      string `json:"dataset_id"`
	DatasetVersion string `json:"dataset_version"`
	Type           string `json:"type"`
	DataUrl        string `json:"data_url"`
}
| type Volumes struct { | |||
| Nfs Nfs `json:"nfs"` | |||
| HostPath HostPath `json:"host_path"` | |||
| } | |||
// Nfs is the "nfs" member of Volumes: an ID, a source path, a destination
// path and a read-only flag.
type Nfs struct {
	ID         string `json:"id"`
	SourcePath string `json:"src_path"`
	DestPath   string `json:"dest_path"`
	ReadOnly   bool   `json:"read_only"`
}
// HostPath is the "host_path" member of Volumes: a source path, a destination
// path and a read-only flag.
type HostPath struct {
	SourcePath string `json:"src_path"`
	DestPath   string `json:"dest_path"`
	ReadOnly   bool   `json:"read_only"`
}
// CreateTrainJobResult is the decoded response to a training-job creation
// request. ErrorCode and ErrorMsg describe a failure and IsSuccess is the
// success flag; the remaining fields identify the created job.
type CreateTrainJobResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_msg"`
	IsSuccess bool   `json:"is_success"`

	JobName      string `json:"job_name"`
	JobID        int64  `json:"job_id"`
	Status       int    `json:"status"`
	CreationTime int64  `json:"create_time"`
	VersionID    int64  `json:"version_id"`
	ResourceID   string `json:"resource_id"`
	VersionName  string `json:"version_name"`
}
| func Cloudbrains(opts *CloudbrainsOptions) ([]*Cloudbrain, int64, error) { | |||
| sess := x.NewSession() | |||
| defer sess.Close() | |||
| @@ -18,6 +18,8 @@ func (f *CreateModelArtsNotebookForm) Validate(ctx *macaron.Context, errs bindin | |||
// CreateModelArtsTrainJobForm is the form submitted to create a ModelArts
// training job. Every field except Description is bound as Required.
type CreateModelArtsTrainJobForm struct {
	JobName          string `form:"job_name" binding:"Required"`
	Attachment       string `form:"attachment" binding:"Required"`
	BootFile         string `form:"boot_file" binding:"Required"`
	WorkServerNumber int    `form:"work_server_number" binding:"Required"`
	Description      string `form:"description"`
}
| @@ -1,12 +1,13 @@ | |||
| package modelarts | |||
| import ( | |||
| "code.gitea.io/gitea/modules/setting" | |||
| "path" | |||
| "strconv" | |||
| "code.gitea.io/gitea/models" | |||
| "code.gitea.io/gitea/modules/context" | |||
| "code.gitea.io/gitea/modules/log" | |||
| "code.gitea.io/gitea/modules/setting" | |||
| ) | |||
| const ( | |||
| @@ -19,12 +20,27 @@ const ( | |||
| NotebookEnv = "Python3" | |||
| NotebookType = "Ascend" | |||
| FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)" | |||
| CodeLocalPath = "/code/" | |||
| engineID = 118 | |||
| CodePath = "/code/" | |||
| OutputPath = "/output/" | |||
| JobPath = "/job/" | |||
| ) | |||
// GenerateTrainJobReq bundles the inputs passed to GenerateTrainJob: the job's
// name and description, the code path and boot file, the data and output
// URLs, and the worker count.
// NOTE(review): Uuid is not read by GenerateTrainJob in this file - confirm
// whether it is still needed.
type GenerateTrainJobReq struct {
	JobName     string
	Uuid        string
	Description string

	CodeObsPath string
	BootFile    string
	DataUrl     string
	TrainUrl    string

	WorkServerNumber int
}
| func GenerateTask(ctx *context.Context, jobName, uuid, description string) error { | |||
| dataActualPath := setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/" | |||
| jobResult, err := CreateJob(models.CreateNotebookParams{ | |||
| jobResult, err := createNotebook(models.CreateNotebookParams{ | |||
| JobName: jobName, | |||
| Description:description, | |||
| ProfileID: profileID, | |||
| @@ -64,3 +80,39 @@ func GenerateTask(ctx *context.Context, jobName, uuid, description string) error | |||
| return nil | |||
| } | |||
| func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error { | |||
| jobResult, err := createTrainJob(models.CreateTrainJobParams{ | |||
| JobName: req.JobName, | |||
| Description: req.Description, | |||
| Config: models.Config{ | |||
| WorkServerNum: req.WorkServerNumber, | |||
| AppUrl: req.CodeObsPath, | |||
| BootFileUrl: req.CodeObsPath + req.BootFile, | |||
| DataUrl: req.DataUrl, | |||
| EngineID: engineID, | |||
| TrainUrl: req.TrainUrl, | |||
| }, | |||
| }) | |||
| if err != nil { | |||
| log.Error("CreateJob failed: %v", err.Error()) | |||
| return err | |||
| } | |||
| err = models.CreateCloudbrain(&models.Cloudbrain{ | |||
| Status: strconv.Itoa(jobResult.Status), | |||
| UserID: ctx.User.ID, | |||
| RepoID: ctx.Repo.Repository.ID, | |||
| JobID: strconv.FormatInt(jobResult.JobID, 10), | |||
| JobName: req.JobName, | |||
| JobType: string(models.JobTypeDebug), | |||
| Type: models.TypeCloudBrainTrainJob, | |||
| }) | |||
| if err != nil { | |||
| return err | |||
| } | |||
| return nil | |||
| } | |||
| @@ -23,6 +23,7 @@ const ( | |||
| urlGetToken = "/v3/auth/tokens" | |||
| urlNotebook = "/demanager/instances" | |||
| urlTrainJob = "/training-jobs" | |||
| errorCodeExceedLimit = "ModelArts.0118" | |||
| ) | |||
| func getRestyClient() *resty.Client { | |||
| @@ -87,7 +88,7 @@ func getToken() error { | |||
| return nil | |||
| } | |||
| func CreateJob(createJobParams models.CreateNotebookParams) (*models.CreateNotebookResult, error) { | |||
| func createNotebook(createJobParams models.CreateNotebookParams) (*models.CreateNotebookResult, error) { | |||
| checkSetting() | |||
| client := getRestyClient() | |||
| var result models.CreateNotebookResult | |||
| @@ -103,7 +104,7 @@ sendjob: | |||
| Post(HOST + "/v1/" + setting.ProjectID + urlNotebook) | |||
| if err != nil { | |||
| return nil, fmt.Errorf("resty create job: %s", err) | |||
| return nil, fmt.Errorf("resty create notebook: %s", err) | |||
| } | |||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||
| @@ -120,11 +121,11 @@ sendjob: | |||
| } | |||
| if len(response.ErrorCode) != 0 { | |||
| log.Error("CreateJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||
| log.Error("createNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||
| if response.ErrorCode == errorCodeExceedLimit { | |||
| response.ErrorMsg = "所选规格使用数量已超过最大配额限制。" | |||
| } | |||
| return &result, fmt.Errorf("CreateJob failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||
| return &result, fmt.Errorf("createNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg) | |||
| } | |||
| return &result, nil | |||
| @@ -286,3 +287,36 @@ sendjob: | |||
| return &result, nil | |||
| } | |||
| func createTrainJob(createJobParams models.CreateTrainJobParams) (*models.CreateTrainJobResult, error) { | |||
| checkSetting() | |||
| client := getRestyClient() | |||
| var result models.CreateTrainJobResult | |||
| retry := 0 | |||
| sendjob: | |||
| res, err := client.R(). | |||
| SetHeader("Content-Type", "application/json"). | |||
| SetAuthToken(TOKEN). | |||
| SetBody(createJobParams). | |||
| SetResult(&result). | |||
| Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob) | |||
| if err != nil { | |||
| return nil, fmt.Errorf("resty create train-job: %s", err) | |||
| } | |||
| if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||
| retry++ | |||
| _ = getToken() | |||
| goto sendjob | |||
| } | |||
| if !result.IsSuccess { | |||
| log.Error("createTrainJob failed(%s): %s", result.ErrorCode, result.ErrorMsg) | |||
| return &result, fmt.Errorf("createTrainJob failed(%s): %s", result.ErrorCode, result.ErrorMsg) | |||
| } | |||
| return &result, nil | |||
| } | |||
| @@ -9,6 +9,7 @@ import ( | |||
| "github.com/unknwon/com" | |||
| "io" | |||
| "os" | |||
| "path" | |||
| "strconv" | |||
| "strings" | |||
| "time" | |||
| @@ -306,36 +307,45 @@ func TrainJobNew(ctx *context.Context) { | |||
| func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) { | |||
| ctx.Data["PageIsCloudBrain"] = true | |||
| jobName := form.JobName | |||
| /* | |||
| uuid := form.Attachment | |||
| description := form.Description | |||
| */ | |||
| workServerNumber := form.WorkServerNumber | |||
| bootFile := form.BootFile | |||
| repo := ctx.Repo.Repository | |||
| codePath := setting.JobPath + jobName + modelarts.CodeLocalPath | |||
| codeLocalPath := setting.JobPath + jobName + modelarts.CodePath | |||
| codeObsPath := setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath | |||
| outputObsPath := setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath | |||
| dataPath := setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/" | |||
| if err := git.Clone(repo.RepoPath(), codePath, git.CloneRepoOptions{}); err != nil { | |||
| if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{}); err != nil { | |||
| log.Error("Failed to clone repository: %s (%v)", repo.FullName(), err) | |||
| ctx.RenderWithErr("Failed to clone repository", tplModelArtsTrainJobNew, &form) | |||
| return | |||
| } | |||
| //todo: upload code (send to file_server todo this work?) | |||
| if err := uploadCodeToObs(codePath, jobName, ""); err != nil { | |||
| if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil { | |||
| log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err) | |||
| ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobNew, &form) | |||
| return | |||
| } | |||
| /* | |||
| err := modelarts.GenerateTask(ctx, jobName, uuid, description) | |||
| req := &modelarts.GenerateTrainJobReq{ | |||
| JobName: jobName, | |||
| DataUrl: dataPath, | |||
| Description: description, | |||
| CodeObsPath: codeObsPath, | |||
| BootFile: bootFile, | |||
| TrainUrl: outputObsPath, | |||
| WorkServerNumber: workServerNumber, | |||
| } | |||
| err := modelarts.GenerateTrainJob(ctx, req) | |||
| if err != nil { | |||
| ctx.RenderWithErr(err.Error(), tplModelArtsNotebookNew, &form) | |||
| ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) | |||
| return | |||
| } | |||
| */ | |||
| ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") | |||
| } | |||
| @@ -350,6 +360,7 @@ func readDir(dirname string) ([]os.FileInfo, error) { | |||
| list, err := f.Readdir(100) | |||
| f.Close() | |||
| if err != nil { | |||
| //todo: can not upload empty folder | |||
| if err == io.EOF { | |||
| return nil, nil | |||
| } | |||
| @@ -361,7 +372,6 @@ func readDir(dirname string) ([]os.FileInfo, error) { | |||
| } | |||
| func uploadCodeToObs(codePath, jobName, parentDir string) error { | |||
| log.Info(codePath) | |||
| files, err := readDir(codePath) | |||
| if err != nil { | |||
| log.Error("readDir(%s) failed: %s", codePath, err.Error()) | |||
| @@ -373,7 +383,6 @@ func uploadCodeToObs(codePath, jobName, parentDir string) error { | |||
| input := &obs.PutObjectInput{} | |||
| input.Bucket = setting.Bucket | |||
| input.Key = codePath + file.Name() + "/" | |||
| log.Info(input.Key) | |||
| _, err = storage.ObsCli.PutObject(input) | |||
| if err != nil { | |||
| log.Error("PutObject(%s) failed: %s", input.Key, err.Error()) | |||
| @@ -388,9 +397,7 @@ func uploadCodeToObs(codePath, jobName, parentDir string) error { | |||
| input := &obs.PutFileInput{} | |||
| input.Bucket = setting.Bucket | |||
| input.Key = setting.CodePathPrefix + jobName + "/" + parentDir + file.Name() | |||
| log.Info(input.Key) | |||
| input.SourceFile = codePath + file.Name() | |||
| log.Info(input.SourceFile) | |||
| _, err = storage.ObsCli.PutFile(input) | |||
| if err != nil { | |||
| log.Error("PutFile(%s) failed: %s", input.SourceFile, err.Error()) | |||