Browse Source

create train job

tags/v1.21.12.1
lewis 4 years ago
parent
commit
05d7620f51
5 changed files with 187 additions and 22 deletions
  1. +70
    -0
      models/cloudbrain.go
  2. +2
    -0
      modules/auth/modelarts.go
  3. +55
    -3
      modules/modelarts/modelarts.go
  4. +38
    -4
      modules/modelarts/resty.go
  5. +22
    -15
      routers/repo/modelarts.go

+ 70
- 0
models/cloudbrain.go View File

@@ -469,6 +469,76 @@ type NotebookDelResult struct {
InstanceID string `json:"instance_id"` InstanceID string `json:"instance_id"`
} }


// CreateTrainJobParams is the JSON payload describing a training job to be
// created: its name, a free-text description, the job configuration and the
// workspace it belongs to.
type CreateTrainJobParams struct {
	// JobName is serialized as "job_name".
	JobName string `json:"job_name"`
	// Description is serialized as "job_desc".
	Description string `json:"job_desc"`

	// Config carries the run configuration of the job.
	Config Config `json:"config"`

	// WorkspaceID is serialized as "workspace_id".
	WorkspaceID string `json:"workspace_id"`
}

// Config is the "config" object of a training-job creation request.
//
// No field is tagged omitempty, so every field is always serialized, including
// zero values and nil slices (which encode as JSON null).
type Config struct {
	// WorkServerNum is serialized as "worker_server_num".
	WorkServerNum int `json:"worker_server_num"`

	// AppUrl is the code directory of the training job.
	AppUrl string `json:"app_url"`
	// BootFileUrl is the boot file of the training job; it must live under
	// the code directory.
	BootFileUrl string      `json:"boot_file_url"`
	Parameter   []Parameter `json:"parameter"`

	// DataUrl is the OBS path URL of the dataset the training job needs.
	DataUrl       string       `json:"data_url"`
	DatasetID     string       `json:"dataset_id"`
	DataVersionID string       `json:"dataset_version_id"`
	DataSource    []DataSource `json:"data_source"`

	SpecID   int64 `json:"spec_id"`
	EngineID int64 `json:"engine_id"`
	ModelID  int64 `json:"model_id"`

	// TrainUrl is the OBS path URL for the output files of the training job.
	TrainUrl string `json:"train_url"`
	LogUrl   string `json:"log_url"`

	UserImageUrl  string    `json:"user_image_url"`
	UserCommand   string    `json:"user_command"`
	CreateVersion bool      `json:"create_version"`
	Volumes       []Volumes `json:"volumes"`
}

// Parameter is a single label/value pair in a training job's parameter list.
type Parameter struct {
	// Label is serialized as "label".
	Label string `json:"label"`
	// Value is serialized as "value".
	Value string `json:"value"`
}

// DataSource is one entry of the "data_source" list in a training job Config.
type DataSource struct {
	// DatasetID and DatasetVersion are serialized as "dataset_id" and
	// "dataset_version".
	DatasetID      string `json:"dataset_id"`
	DatasetVersion string `json:"dataset_version"`

	// Type is serialized as "type".
	Type string `json:"type"`
	// DataUrl is serialized as "data_url".
	DataUrl string `json:"data_url"`
}

// Volumes is one entry of the "volumes" list in a training job Config. Both
// members are plain values, so both are always serialized.
type Volumes struct {
	// Nfs is serialized as "nfs".
	Nfs Nfs `json:"nfs"`
	// HostPath is serialized as "host_path".
	HostPath HostPath `json:"host_path"`
}

// Nfs is the "nfs" member of a Volumes entry.
type Nfs struct {
	// ID is serialized as "id".
	ID string `json:"id"`

	// SourcePath and DestPath are serialized as "src_path" and "dest_path".
	SourcePath string `json:"src_path"`
	DestPath   string `json:"dest_path"`

	// ReadOnly is serialized as "read_only".
	ReadOnly bool `json:"read_only"`
}

// HostPath is the "host_path" member of a Volumes entry.
type HostPath struct {
	// SourcePath and DestPath are serialized as "src_path" and "dest_path".
	SourcePath string `json:"src_path"`
	DestPath   string `json:"dest_path"`

	// ReadOnly is serialized as "read_only".
	ReadOnly bool `json:"read_only"`
}

// CreateTrainJobResult is the JSON response to a training-job creation
// request. It holds both the outcome (IsSuccess, ErrorCode, ErrorMsg) and the
// identifiers of the created job.
type CreateTrainJobResult struct {
	// Outcome of the request.
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_msg"`
	IsSuccess bool   `json:"is_success"`

	// Description of the created job.
	JobName      string `json:"job_name"`
	JobID        int64  `json:"job_id"`
	Status       int    `json:"status"`
	CreationTime int64  `json:"create_time"`
	VersionID    int64  `json:"version_id"`
	ResourceID   string `json:"resource_id"`
	VersionName  string `json:"version_name"`
}

func Cloudbrains(opts *CloudbrainsOptions) ([]*Cloudbrain, int64, error) { func Cloudbrains(opts *CloudbrainsOptions) ([]*Cloudbrain, int64, error) {
sess := x.NewSession() sess := x.NewSession()
defer sess.Close() defer sess.Close()


+ 2
- 0
modules/auth/modelarts.go View File

@@ -18,6 +18,8 @@ func (f *CreateModelArtsNotebookForm) Validate(ctx *macaron.Context, errs bindin
type CreateModelArtsTrainJobForm struct { type CreateModelArtsTrainJobForm struct {
JobName string `form:"job_name" binding:"Required"` JobName string `form:"job_name" binding:"Required"`
Attachment string `form:"attachment" binding:"Required"` Attachment string `form:"attachment" binding:"Required"`
BootFile string `form:"boot_file" binding:"Required"`
WorkServerNumber int `form:"work_server_number" binding:"Required"`
Description string `form:"description"` Description string `form:"description"`
} }




+ 55
- 3
modules/modelarts/modelarts.go View File

@@ -1,12 +1,13 @@
package modelarts package modelarts


import ( import (
"code.gitea.io/gitea/modules/setting"
"path" "path"
"strconv"


"code.gitea.io/gitea/models" "code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/context" "code.gitea.io/gitea/modules/context"
"code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
) )


const ( const (
@@ -19,12 +20,27 @@ const (
NotebookEnv = "Python3" NotebookEnv = "Python3"
NotebookType = "Ascend" NotebookType = "Ascend"
FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)" FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)"
CodeLocalPath = "/code/"

engineID = 118
CodePath = "/code/"
OutputPath = "/output/"
JobPath = "/job/"
) )


// GenerateTrainJobReq bundles the inputs GenerateTrainJob needs to create a
// training job.
type GenerateTrainJobReq struct {
	// Identification of the job.
	JobName     string
	Uuid        string
	Description string

	// CodeObsPath is used as the job's code directory; BootFile is appended
	// to it to form the boot file location.
	CodeObsPath string
	BootFile    string

	// DataUrl and TrainUrl are passed through as the job's data and output
	// locations.
	DataUrl  string
	TrainUrl string

	// WorkServerNumber is passed through as the job's worker server count.
	WorkServerNumber int
}

func GenerateTask(ctx *context.Context, jobName, uuid, description string) error { func GenerateTask(ctx *context.Context, jobName, uuid, description string) error {
dataActualPath := setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/" dataActualPath := setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/"
jobResult, err := CreateJob(models.CreateNotebookParams{
jobResult, err := createNotebook(models.CreateNotebookParams{
JobName: jobName, JobName: jobName,
Description:description, Description:description,
ProfileID: profileID, ProfileID: profileID,
@@ -64,3 +80,39 @@ func GenerateTask(ctx *context.Context, jobName, uuid, description string) error


return nil return nil
} }

// GenerateTrainJob creates a training job from req through createTrainJob and
// then records it as a Cloudbrain entry owned by the current user and
// repository taken from ctx.
//
// It returns the error from createTrainJob when the remote creation fails
// (nothing is recorded in that case), otherwise whatever
// models.CreateCloudbrain returns.
func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error {
	jobResult, err := createTrainJob(models.CreateTrainJobParams{
		JobName:     req.JobName,
		Description: req.Description,
		Config: models.Config{
			WorkServerNum: req.WorkServerNumber,
			AppUrl:        req.CodeObsPath,
			// The boot file is addressed by appending its name to the code
			// directory, so CodeObsPath is expected to end with a separator.
			BootFileUrl: req.CodeObsPath + req.BootFile,
			DataUrl:     req.DataUrl,
			EngineID:    engineID,
			TrainUrl:    req.TrainUrl,
		},
	})
	if err != nil {
		log.Error("createTrainJob failed: %v", err.Error())
		return err
	}

	return models.CreateCloudbrain(&models.Cloudbrain{
		Status:  strconv.Itoa(jobResult.Status),
		UserID:  ctx.User.ID,
		RepoID:  ctx.Repo.Repository.ID,
		JobID:   strconv.FormatInt(jobResult.JobID, 10),
		JobName: req.JobName,
		// NOTE(review): a train job is stored with JobTypeDebug while Type is
		// TypeCloudBrainTrainJob — confirm this job type is intended.
		JobType: string(models.JobTypeDebug),
		Type:    models.TypeCloudBrainTrainJob,
	})
}

+ 38
- 4
modules/modelarts/resty.go View File

@@ -23,6 +23,7 @@ const (


urlGetToken = "/v3/auth/tokens" urlGetToken = "/v3/auth/tokens"
urlNotebook = "/demanager/instances" urlNotebook = "/demanager/instances"
urlTrainJob = "/training-jobs"
errorCodeExceedLimit = "ModelArts.0118" errorCodeExceedLimit = "ModelArts.0118"
) )
func getRestyClient() *resty.Client { func getRestyClient() *resty.Client {
@@ -87,7 +88,7 @@ func getToken() error {
return nil return nil
} }


func CreateJob(createJobParams models.CreateNotebookParams) (*models.CreateNotebookResult, error) {
func createNotebook(createJobParams models.CreateNotebookParams) (*models.CreateNotebookResult, error) {
checkSetting() checkSetting()
client := getRestyClient() client := getRestyClient()
var result models.CreateNotebookResult var result models.CreateNotebookResult
@@ -103,7 +104,7 @@ sendjob:
Post(HOST + "/v1/" + setting.ProjectID + urlNotebook) Post(HOST + "/v1/" + setting.ProjectID + urlNotebook)


if err != nil { if err != nil {
return nil, fmt.Errorf("resty create job: %s", err)
return nil, fmt.Errorf("resty create notebook: %s", err)
} }


if res.StatusCode() == http.StatusUnauthorized && retry < 1 { if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
@@ -120,11 +121,11 @@ sendjob:
} }


if len(response.ErrorCode) != 0 { if len(response.ErrorCode) != 0 {
log.Error("CreateJob failed(%s): %s", response.ErrorCode, response.ErrorMsg)
log.Error("createNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg)
if response.ErrorCode == errorCodeExceedLimit { if response.ErrorCode == errorCodeExceedLimit {
response.ErrorMsg = "所选规格使用数量已超过最大配额限制。" response.ErrorMsg = "所选规格使用数量已超过最大配额限制。"
} }
return &result, fmt.Errorf("CreateJob failed(%s): %s", response.ErrorCode, response.ErrorMsg)
return &result, fmt.Errorf("createNotebook failed(%s): %s", response.ErrorCode, response.ErrorMsg)
} }


return &result, nil return &result, nil
@@ -286,3 +287,36 @@ sendjob:


return &result, nil return &result, nil
} }

// createTrainJob posts createJobParams as JSON to the training-job endpoint
// (HOST + "/v1/" + setting.ProjectID + urlTrainJob) and returns the decoded
// response.
//
// When the service answers 401 Unauthorized, the token is refreshed with
// getToken and the request is sent exactly once more. A transport error or a
// failed token refresh yields a nil result. A response whose is_success flag
// is false yields the decoded result together with an error carrying the
// service's error code and message.
func createTrainJob(createJobParams models.CreateTrainJobParams) (*models.CreateTrainJobResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.CreateTrainJobResult

	for retry := 0; ; retry++ {
		// Start every attempt from a clean value so that fields decoded from
		// a rejected attempt cannot leak into the final result.
		result = models.CreateTrainJobResult{}

		res, err := client.R().
			SetHeader("Content-Type", "application/json").
			SetAuthToken(TOKEN).
			SetBody(createJobParams).
			SetResult(&result).
			// resty fills the SetResult target only for successful responses;
			// registering the same struct for error responses lets
			// error_code/error_msg be decoded instead of being reported empty.
			SetError(&result).
			Post(HOST + "/v1/" + setting.ProjectID + urlTrainJob)
		if err != nil {
			return nil, fmt.Errorf("resty create train-job: %s", err)
		}

		if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
			// Retrying with the stale token cannot succeed, so a failed
			// refresh is reported instead of being ignored.
			if err := getToken(); err != nil {
				return nil, fmt.Errorf("createTrainJob refresh token: %v", err)
			}
			continue
		}
		break
	}

	if !result.IsSuccess {
		log.Error("createTrainJob failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		return &result, fmt.Errorf("createTrainJob failed(%s): %s", result.ErrorCode, result.ErrorMsg)
	}

	return &result, nil
}

+ 22
- 15
routers/repo/modelarts.go View File

@@ -9,6 +9,7 @@ import (
"github.com/unknwon/com" "github.com/unknwon/com"
"io" "io"
"os" "os"
"path"
"strconv" "strconv"
"strings" "strings"
"time" "time"
@@ -306,36 +307,45 @@ func TrainJobNew(ctx *context.Context) {
func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) { func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) {
ctx.Data["PageIsCloudBrain"] = true ctx.Data["PageIsCloudBrain"] = true
jobName := form.JobName jobName := form.JobName
/*
uuid := form.Attachment uuid := form.Attachment
description := form.Description description := form.Description
*/
workServerNumber := form.WorkServerNumber
bootFile := form.BootFile
repo := ctx.Repo.Repository repo := ctx.Repo.Repository
codePath := setting.JobPath + jobName + modelarts.CodeLocalPath
codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
codeObsPath := setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
outputObsPath := setting.Bucket + modelarts.JobPath + jobName + modelarts.OutputPath
dataPath := setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/"


if err := git.Clone(repo.RepoPath(), codePath, git.CloneRepoOptions{}); err != nil {
if err := git.Clone(repo.RepoPath(), codeLocalPath, git.CloneRepoOptions{}); err != nil {
log.Error("Failed to clone repository: %s (%v)", repo.FullName(), err) log.Error("Failed to clone repository: %s (%v)", repo.FullName(), err)
ctx.RenderWithErr("Failed to clone repository", tplModelArtsTrainJobNew, &form) ctx.RenderWithErr("Failed to clone repository", tplModelArtsTrainJobNew, &form)
return return
} }


//todo: upload code (send to file_server todo this work?) //todo: upload code (send to file_server todo this work?)
if err := uploadCodeToObs(codePath, jobName, ""); err != nil {
if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err) log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobNew, &form) ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobNew, &form)
return return
} }


/*
err := modelarts.GenerateTask(ctx, jobName, uuid, description)
req := &modelarts.GenerateTrainJobReq{
JobName: jobName,
DataUrl: dataPath,
Description: description,
CodeObsPath: codeObsPath,
BootFile: bootFile,
TrainUrl: outputObsPath,
WorkServerNumber: workServerNumber,
}

err := modelarts.GenerateTrainJob(ctx, req)
if err != nil { if err != nil {
ctx.RenderWithErr(err.Error(), tplModelArtsNotebookNew, &form)
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
return return
} }


*/

ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
} }


@@ -350,6 +360,7 @@ func readDir(dirname string) ([]os.FileInfo, error) {
list, err := f.Readdir(100) list, err := f.Readdir(100)
f.Close() f.Close()
if err != nil { if err != nil {
//todo: can not upload empty folder
if err == io.EOF { if err == io.EOF {
return nil, nil return nil, nil
} }
@@ -361,7 +372,6 @@ func readDir(dirname string) ([]os.FileInfo, error) {
} }


func uploadCodeToObs(codePath, jobName, parentDir string) error { func uploadCodeToObs(codePath, jobName, parentDir string) error {
log.Info(codePath)
files, err := readDir(codePath) files, err := readDir(codePath)
if err != nil { if err != nil {
log.Error("readDir(%s) failed: %s", codePath, err.Error()) log.Error("readDir(%s) failed: %s", codePath, err.Error())
@@ -373,7 +383,6 @@ func uploadCodeToObs(codePath, jobName, parentDir string) error {
input := &obs.PutObjectInput{} input := &obs.PutObjectInput{}
input.Bucket = setting.Bucket input.Bucket = setting.Bucket
input.Key = codePath + file.Name() + "/" input.Key = codePath + file.Name() + "/"
log.Info(input.Key)
_, err = storage.ObsCli.PutObject(input) _, err = storage.ObsCli.PutObject(input)
if err != nil { if err != nil {
log.Error("PutObject(%s) failed: %s", input.Key, err.Error()) log.Error("PutObject(%s) failed: %s", input.Key, err.Error())
@@ -388,9 +397,7 @@ func uploadCodeToObs(codePath, jobName, parentDir string) error {
input := &obs.PutFileInput{} input := &obs.PutFileInput{}
input.Bucket = setting.Bucket input.Bucket = setting.Bucket
input.Key = setting.CodePathPrefix + jobName + "/" + parentDir + file.Name() input.Key = setting.CodePathPrefix + jobName + "/" + parentDir + file.Name()
log.Info(input.Key)
input.SourceFile = codePath + file.Name() input.SourceFile = codePath + file.Name()
log.Info(input.SourceFile)
_, err = storage.ObsCli.PutFile(input) _, err = storage.ObsCli.PutFile(input)
if err != nil { if err != nil {
log.Error("PutFile(%s) failed: %s", input.SourceFile, err.Error()) log.Error("PutFile(%s) failed: %s", input.SourceFile, err.Error())


Loading…
Cancel
Save