Browse Source

create train job

tags/v1.21.12.1
lewis 4 years ago
parent
commit
085e0c6a4a
6 changed files with 131 additions and 24 deletions
  1. +29
    -1
      models/cloudbrain.go
  2. +2
    -0
      modules/auth/modelarts.go
  3. +18
    -10
      modules/modelarts/modelarts.go
  4. +43
    -0
      modules/modelarts/resty.go
  5. +29
    -3
      routers/repo/modelarts.go
  6. +10
    -10
      templates/repo/modelarts/trainjob/new.tmpl

+ 29
- 1
models/cloudbrain.go View File

@@ -494,6 +494,8 @@ type Config struct {
UserCommand string `json:"user_command"`
CreateVersion bool `json:"create_version"`
Volumes []Volumes `json:"volumes"`
Flavor Flavor `json:"flavor"`
PoolID string `json:"pool_id"`
}

type Parameter struct {
@@ -509,7 +511,7 @@ type DataSource struct {
}

type Volumes struct {
Nfs Nfs `json:"nfs"`
Nfs Nfs `json:"nfs"`
HostPath HostPath `json:"host_path"`
}

@@ -526,6 +528,10 @@ type HostPath struct {
ReadOnly bool `json:"read_only"`
}

// Flavor carries the resource flavor selected for a train job; it is
// serialized as a JSON object with a single "code" key.
type Flavor struct {
// Code is the flavor code string sent under the "code" key.
Code string `json:"code"`
}

type CreateTrainJobResult struct {
ErrorCode string `json:"error_code"`
ErrorMsg string `json:"error_msg"`
@@ -539,6 +545,28 @@ type CreateTrainJobResult struct {
VersionName string `json:"version_name"`
}

// GetResourceSpecsResult is the decoded body of a resource-specs query.
type GetResourceSpecsResult struct {
// ErrorCode and ErrorMsg are decoded from "error_code" / "error_msg".
ErrorCode string `json:"error_code"`
ErrorMsg string `json:"error_msg"`
// IsSuccess is decoded from "is_success".
IsSuccess bool `json:"is_success"`
// SpecTotalCount is decoded from "spec_total_count".
SpecTotalCount int `json:"spec_total_count"`
// Specs holds the entries decoded from the "specs" array.
Specs []Specs `json:"specs"`
}

// Specs is one entry of the "specs" array in GetResourceSpecsResult.
//
// NOTE(review): most Go field names below do not describe the JSON key they
// decode (they look copied from CreateTrainJobResult). Read the struct tag,
// not the field name, to know what a field holds. Renaming is left for a
// follow-up because it would change the exported field set.
type Specs struct {
// Decoded from "core".
ErrorCode string `json:"core"`
// Decoded from "cpu".
ErrorMsg string `json:"cpu"`
// Decoded from "no_resource".
IsSuccess bool `json:"no_resource"`
// Decoded from "gpu_type".
JobName string `json:"gpu_type"`
// Decoded from "spec_id".
JobID int64 `json:"spec_id"`
// Decoded from "gpu_num".
Status int `json:"gpu_num"`
// Decoded from "spec_code".
ResourceID string `json:"spec_code"`
// Decoded from "storage".
VersionName string `json:"storage"`
// Decoded from "max_num".
MaxNum int `json:"max_num"`
// Decoded from "unit_num".
UnitNum int `json:"unit_num"`
// Decoded from "interface_type".
InterfaceType int `json:"interface_type"`
}

func Cloudbrains(opts *CloudbrainsOptions) ([]*Cloudbrain, int64, error) {
sess := x.NewSession()
defer sess.Close()


+ 2
- 0
modules/auth/modelarts.go View File

@@ -21,7 +21,9 @@ type CreateModelArtsTrainJobForm struct {
BootFile string `form:"boot_file" binding:"Required"`
WorkServerNumber int `form:"work_server_number" binding:"Required"`
EngineID int `form:"engine_id" binding:"Required"`
SpecID int `form:"spec_id" binding:"Required"`
Flavor string `form:"flavor" binding:"Required"`
PoolID string `form:"pool_id" binding:"Required"`
Description string `form:"description"`
}



+ 18
- 10
modules/modelarts/modelarts.go View File

@@ -23,17 +23,17 @@ const (
FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)"

//train-job
ResourcePools = "{\"resource_pool\":[{\"id\":1, \"value\":\"专属资源池\"}]}"
ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}"
Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}"
EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," +
"{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," +
"{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," +
"{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" +
"]}"
FlavorInfos = "{\"flavor\":[{\"id\":1,\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," +
"{\"id\":2,\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," +
"{\"id\":3,\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," +
"{\"id\":4,\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" +
FlavorInfos = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," +
"{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," +
"{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," +
"{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" +
"]}"
CodePath = "/code/"
OutputPath = "/output/"
@@ -48,6 +48,9 @@ type GenerateTrainJobReq struct {
BootFile string
DataUrl string
TrainUrl string
FlavorCode string
PoolID string
SpecID int64
WorkServerNumber int
EngineID int64
}
@@ -61,8 +64,8 @@ type VersionInfo struct {

type Flavor struct {
Info []struct {
ID int `json:"id"`
Value string `json:"value"`
Code string `json:"code"`
Value string `json:"value"`
} `json:"flavor"`
}

@@ -75,8 +78,8 @@ type Engine struct {

type ResourcePool struct {
Info []struct {
ID int `json:"id"`
Value string `json:"value"`
ID string `json:"id"`
Value string `json:"value"`
} `json:"resource_pool"`
}

@@ -130,10 +133,15 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error {
Config: models.Config{
WorkServerNum: req.WorkServerNumber,
AppUrl: req.CodeObsPath,
BootFileUrl: req.CodeObsPath + req.BootFile,
BootFileUrl: req.BootFile,
DataUrl: req.DataUrl,
EngineID: req.EngineID,
TrainUrl: req.TrainUrl,
PoolID: req.PoolID,
SpecID: req.SpecID,
Flavor: models.Flavor{
Code: req.FlavorCode,
},
},

})


+ 43
- 0
modules/modelarts/resty.go View File

@@ -24,6 +24,8 @@ const (
urlGetToken = "/v3/auth/tokens"
urlNotebook = "/demanager/instances"
urlTrainJob = "/training-jobs"
urlResourceSpecs = "/job/resource-specs"

errorCodeExceedLimit = "ModelArts.0118"
)
func getRestyClient() *resty.Client {
@@ -293,6 +295,8 @@ func createTrainJob(createJobParams models.CreateTrainJobParams) (*models.Create
client := getRestyClient()
var result models.CreateTrainJobResult

log.Info("%+v",createJobParams)

retry := 0

sendjob:
@@ -307,6 +311,8 @@ sendjob:
return nil, fmt.Errorf("resty create train-job: %s", err)
}

log.Info("", res.StatusCode(), res.Request.Body)

if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
retry++
_ = getToken()
@@ -320,3 +326,40 @@ sendjob:

return &result, nil
}

// GetResourceSpecs queries the resource-specs endpoint
// (HOST + "/v1/" + setting.ProjectID + urlResourceSpecs) and decodes the
// response into a models.GetResourceSpecsResult.
//
// If the first attempt is answered with 401 Unauthorized, the token is
// refreshed via getToken and the request is sent exactly once more.
//
// Returns:
//   - (nil, err) when the HTTP request itself fails;
//   - (&result, err) when the status is not 200 or the decoded body has
//     is_success == false, so callers can still inspect the decoded fields;
//   - (&result, nil) on success.
func GetResourceSpecs() (*models.GetResourceSpecsResult, error) {
	checkSetting()
	client := getRestyClient()
	var result models.GetResourceSpecsResult

	// A conditionless loop with no break is a terminating statement, so every
	// path out of the function is one of the returns below.
	for retry := 0; ; retry++ {
		res, err := client.R().
			SetHeader("Content-Type", "application/json").
			SetAuthToken(TOKEN).
			SetResult(&result).
			Get(HOST + "/v1/" + setting.ProjectID + urlResourceSpecs)
		if err != nil {
			// Name the operation that actually failed (was "resty GetJob").
			return nil, fmt.Errorf("resty GetResourceSpecs: %v", err)
		}

		if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
			// The refresh result is deliberately ignored: if the token is
			// still rejected, the retry gets 401 again and is reported by
			// the status check below.
			_ = getToken()
			continue
		}

		if res.StatusCode() != http.StatusOK {
			log.Error("GetResourceSpecs failed(%d)", res.StatusCode())
			return &result, fmt.Errorf("GetResourceSpecs failed(%d)", res.StatusCode())
		}

		if !result.IsSuccess {
			log.Error("GetResourceSpecs failed(%s): %s", result.ErrorCode, result.ErrorMsg)
			return &result, fmt.Errorf("GetResourceSpecs failed(%s): %s", result.ErrorCode, result.ErrorMsg)
		}

		return &result, nil
	}
}

+ 29
- 3
routers/repo/modelarts.go View File

@@ -336,6 +336,9 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
workServerNumber := form.WorkServerNumber
engineID := form.EngineID
bootFile := form.BootFile
flavorCode := form.Flavor
poolID := form.PoolID
specID := form.SpecID
repo := ctx.Repo.Repository
codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
@@ -349,6 +352,12 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
}

//todo: upload code (send to file_server todo this work?)
if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil {
log.Error("Failed to obsMkdir: %s (%v)", repo.FullName(), err)
ctx.RenderWithErr("Failed to obsMkdir", tplModelArtsTrainJobNew, &form)
return
}

if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobNew, &form)
@@ -360,14 +369,18 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
DataUrl: dataPath,
Description: description,
CodeObsPath: codeObsPath,
BootFile: codeObsPath + "/" + bootFile,
BootFile: codeObsPath + bootFile,
TrainUrl: outputObsPath,
FlavorCode: flavorCode,
PoolID: poolID,
WorkServerNumber: workServerNumber,
EngineID: int64(engineID),
SpecID: int64(specID),
}

err := modelarts.GenerateTrainJob(ctx, req)
if err != nil {
log.Error("GenerateTrainJob failed:%v", err.Error())
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form)
return
}
@@ -408,7 +421,7 @@ func uploadCodeToObs(codePath, jobName, parentDir string) error {
if file.IsDir() {
input := &obs.PutObjectInput{}
input.Bucket = setting.Bucket
input.Key = codePath + file.Name() + "/"
input.Key = parentDir + file.Name() + "/"
_, err = storage.ObsCli.PutObject(input)
if err != nil {
log.Error("PutObject(%s) failed: %s", input.Key, err.Error())
@@ -422,7 +435,7 @@ func uploadCodeToObs(codePath, jobName, parentDir string) error {
} else {
input := &obs.PutFileInput{}
input.Bucket = setting.Bucket
input.Key = setting.CodePathPrefix + jobName + "/" + parentDir + file.Name()
input.Key = setting.CodePathPrefix + jobName + "/code/" + parentDir + file.Name()
input.SourceFile = codePath + file.Name()
_, err = storage.ObsCli.PutFile(input)
if err != nil {
@@ -434,3 +447,16 @@ func uploadCodeToObs(codePath, jobName, parentDir string) error {

return nil
}

// obsMkdir issues a PutObject with no body against setting.Bucket using dir
// as the object key. Any error from the OBS client is logged and returned
// unchanged; nil is returned on success.
func obsMkdir(dir string) error {
	req := &obs.PutObjectInput{}
	req.Key = dir
	req.Bucket = setting.Bucket

	if _, err := storage.ObsCli.PutObject(req); err != nil {
		log.Error("PutObject(%s) failed: %s", req.Key, err.Error())
		return err
	}

	return nil
}

+ 10
- 10
templates/repo/modelarts/trainjob/new.tmpl View File

@@ -128,7 +128,7 @@
</div>
<div class="inline required field">
<label>{{.i18n.Tr "repo.modelarts.train_job.dataset"}}</label>
<select class="ui search dropdown" id="trainjob_datasets" style='width:385px'>
<select class="ui search dropdown" id="trainjob_datasets" style='width:385px' name="attachment">
{{range .attachments}}
<option name="attachment" value="{{.UUID}}">{{.Attachment.Name}}</option>
{{end}}
@@ -151,7 +151,7 @@
</select>
</div>
<div class="field">
<select class="ui search dropdown" id="trainjob_engine_versions" style='width:385px'>
<select class="ui search dropdown" id="trainjob_engine_versions" style='width:385px' name="engine_id">
{{range .engine_versions}}
<option name="engine_id" value="{{.ID}}">{{.Value}}</option>
{{end}}
@@ -161,13 +161,13 @@
</div>
<div class="inline required field">
<label>{{.i18n.Tr "repo.modelarts.train_job.start_file"}}</label>
<input name="boot_file" id="trainjob_boot_file" value="{{.dataset_path}}" tabindex="3" autofocus required maxlength="255" readonly="readonly">
<input name="boot_file" id="trainjob_boot_file" value="{{.dataset_path}}" tabindex="3" autofocus required maxlength="255">
</div>
</div>
</div>
<div class="required field">
<label>{{.i18n.Tr "repo.modelarts.train_job.dataset"}}</label>
<select class="ui search dropdown" id="trainjob_datasets" style='width:385px'>
<select class="ui search dropdown" id="trainjob_datasets" style='width:385px' name="attachment">
{{range .attachments}}
<option name="attachment" value="{{.UUID}}">{{.Attachment.Name}}</option>
{{end}}
@@ -184,9 +184,9 @@
<h4 class="ui dividing header">{{.i18n.Tr "repo.modelarts.train_job.resource_setting"}}</h4>
<div class="required field">
<label>{{.i18n.Tr "repo.modelarts.train_job.resource_pool"}}</label>
<select class="ui search dropdown" id="trainjob_resource_pool" style='width:385px'>
<select class="ui search dropdown" id="trainjob_resource_pool" style='width:385px' name="pool_id">
{{range .resource_pools}}
<option value="{{.Value}}">{{.Value}}</option>
<option value="{{.ID}}">{{.Value}}</option>
{{end}}
</select>
</div>
@@ -211,9 +211,9 @@

<div class="required field">
<label>{{.i18n.Tr "repo.modelarts.train_job.standard"}}</label>
<select class="ui search dropdown" id="trainjob-flavor" style='width:385px'>
<select class="ui search dropdown" id="trainjob-flavor" style='width:385px' name="flavor">
{{range .flavor_infos}}
<option name="flavor" value="{{.Value}}">{{.Value}}</option>
<option name="flavor" value="{{.Code}}">{{.Value}}</option>
{{end}}
</select>
</div>
@@ -232,9 +232,9 @@
</div>
</div>
<div class="disabled field" id="save_para">
<div class="required field">
<div class="field">
<label>{{.i18n.Tr "repo.modelarts.train_job.job_parameter_name"}}</label>
<input name="job_type" id="cloudbrain_job_type" value="{{.notebook_type}}" tabindex="3" autofocus required maxlength="255">
<input name="job_type" id="cloudbrain_job_type" value="{{.notebook_type}}" tabindex="3" autofocus maxlength="255">
</div>
<div class="field">
<label for="parameter_description">{{.i18n.Tr "repo.modelarts.train_job.parameter_description"}}</label>


Loading…
Cancel
Save