@@ -494,6 +494,8 @@ type Config struct { | |||
UserCommand string `json:"user_command"` | |||
CreateVersion bool `json:"create_version"` | |||
Volumes []Volumes `json:"volumes"` | |||
Flavor Flavor `json:"flavor"` | |||
PoolID string `json:"pool_id"` | |||
} | |||
type Parameter struct { | |||
@@ -509,7 +511,7 @@ type DataSource struct { | |||
} | |||
type Volumes struct { | |||
Nfs Nfs `json:"nfs"` | |||
Nfs Nfs `json:"nfs"` | |||
HostPath HostPath `json:"host_path"` | |||
} | |||
@@ -526,6 +528,10 @@ type HostPath struct { | |||
ReadOnly bool `json:"read_only"` | |||
} | |||
// Flavor names a resource flavor by its code. It is embedded in Config under
// the "flavor" key and serializes as {"code": "..."}.
type Flavor struct {
	Code string `json:"code"`
}
type CreateTrainJobResult struct { | |||
ErrorCode string `json:"error_code"` | |||
ErrorMsg string `json:"error_msg"` | |||
@@ -539,6 +545,28 @@ type CreateTrainJobResult struct { | |||
VersionName string `json:"version_name"` | |||
} | |||
type GetResourceSpecsResult struct { | |||
ErrorCode string `json:"error_code"` | |||
ErrorMsg string `json:"error_msg"` | |||
IsSuccess bool `json:"is_success"` | |||
SpecTotalCount int `json:"spec_total_count"` | |||
Specs []Specs `json:"specs"` | |||
} | |||
// Specs is one element of GetResourceSpecsResult.Specs.
//
// NOTE(review): several Go field names do not describe the JSON key they are
// bound to (for example ErrorCode is filled from "core" and JobID from
// "spec_id"). The names look carried over from another result type. They are
// left untouched here because renaming exported fields is an API change;
// read the tag, not the field name, to learn what a field holds.
type Specs struct {
	ErrorCode     string `json:"core"`           // value of the "core" key
	ErrorMsg      string `json:"cpu"`            // value of the "cpu" key
	IsSuccess     bool   `json:"no_resource"`    // value of the "no_resource" key
	JobName       string `json:"gpu_type"`       // value of the "gpu_type" key
	JobID         int64  `json:"spec_id"`        // value of the "spec_id" key
	Status        int    `json:"gpu_num"`        // value of the "gpu_num" key
	ResourceID    string `json:"spec_code"`      // value of the "spec_code" key
	VersionName   string `json:"storage"`        // value of the "storage" key
	MaxNum        int    `json:"max_num"`        // value of the "max_num" key
	UnitNum       int    `json:"unit_num"`       // value of the "unit_num" key
	InterfaceType int    `json:"interface_type"` // value of the "interface_type" key
}
func Cloudbrains(opts *CloudbrainsOptions) ([]*Cloudbrain, int64, error) { | |||
sess := x.NewSession() | |||
defer sess.Close() | |||
@@ -21,7 +21,9 @@ type CreateModelArtsTrainJobForm struct { | |||
BootFile string `form:"boot_file" binding:"Required"` | |||
WorkServerNumber int `form:"work_server_number" binding:"Required"` | |||
EngineID int `form:"engine_id" binding:"Required"` | |||
SpecID int `form:"spec_id" binding:"Required"` | |||
Flavor string `form:"flavor" binding:"Required"` | |||
PoolID string `form:"pool_id" binding:"Required"` | |||
Description string `form:"description"` | |||
} | |||
@@ -23,17 +23,17 @@ const ( | |||
FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)" | |||
//train-job | |||
ResourcePools = "{\"resource_pool\":[{\"id\":1, \"value\":\"专属资源池\"}]}" | |||
ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}" | |||
Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}" | |||
EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," + | |||
"{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," + | |||
"{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," + | |||
"{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" + | |||
"]}" | |||
FlavorInfos = "{\"flavor\":[{\"id\":1,\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," + | |||
"{\"id\":2,\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," + | |||
"{\"id\":3,\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," + | |||
"{\"id\":4,\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" + | |||
FlavorInfos = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," + | |||
"{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," + | |||
"{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," + | |||
"{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" + | |||
"]}" | |||
CodePath = "/code/" | |||
OutputPath = "/output/" | |||
@@ -48,6 +48,9 @@ type GenerateTrainJobReq struct { | |||
BootFile string | |||
DataUrl string | |||
TrainUrl string | |||
FlavorCode string | |||
PoolID string | |||
SpecID int64 | |||
WorkServerNumber int | |||
EngineID int64 | |||
} | |||
@@ -61,8 +64,8 @@ type VersionInfo struct { | |||
// Flavor is the decoded form of a flavor list document: a JSON object whose
// "flavor" key holds an array of entries.
//
// Each entry may carry a numeric "id", a string "code" and a display "value".
// A key that is absent from the document leaves the matching field at its
// zero value.
type Flavor struct {
	Info []struct {
		ID    int    `json:"id"`
		Code  string `json:"code"`
		Value string `json:"value"`
	} `json:"flavor"`
}
@@ -75,8 +78,8 @@ type Engine struct { | |||
// ResourcePool is the decoded form of a resource-pool list document: a JSON
// object whose "resource_pool" key holds an array of entries.
//
// ID is a string because pool identifiers are opaque strings (for example
// "pool1328035d"), not numbers; Value is the display label.
type ResourcePool struct {
	Info []struct {
		ID    string `json:"id"`
		Value string `json:"value"`
	} `json:"resource_pool"`
}
@@ -130,10 +133,15 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error { | |||
Config: models.Config{ | |||
WorkServerNum: req.WorkServerNumber, | |||
AppUrl: req.CodeObsPath, | |||
BootFileUrl: req.CodeObsPath + req.BootFile, | |||
BootFileUrl: req.BootFile, | |||
DataUrl: req.DataUrl, | |||
EngineID: req.EngineID, | |||
TrainUrl: req.TrainUrl, | |||
PoolID: req.PoolID, | |||
SpecID: req.SpecID, | |||
Flavor: models.Flavor{ | |||
Code: req.FlavorCode, | |||
}, | |||
}, | |||
}) | |||
@@ -24,6 +24,8 @@ const ( | |||
urlGetToken = "/v3/auth/tokens" | |||
urlNotebook = "/demanager/instances" | |||
urlTrainJob = "/training-jobs" | |||
urlResourceSpecs = "/job/resource-specs" | |||
errorCodeExceedLimit = "ModelArts.0118" | |||
) | |||
func getRestyClient() *resty.Client { | |||
@@ -293,6 +295,8 @@ func createTrainJob(createJobParams models.CreateTrainJobParams) (*models.Create | |||
client := getRestyClient() | |||
var result models.CreateTrainJobResult | |||
log.Info("%+v",createJobParams) | |||
retry := 0 | |||
sendjob: | |||
@@ -307,6 +311,8 @@ sendjob: | |||
return nil, fmt.Errorf("resty create train-job: %s", err) | |||
} | |||
log.Info("", res.StatusCode(), res.Request.Body) | |||
if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||
retry++ | |||
_ = getToken() | |||
@@ -320,3 +326,40 @@ sendjob: | |||
return &result, nil | |||
} | |||
func GetResourceSpecs() (*models.GetResourceSpecsResult, error) { | |||
checkSetting() | |||
client := getRestyClient() | |||
var result models.GetResourceSpecsResult | |||
retry := 0 | |||
sendjob: | |||
res, err := client.R(). | |||
SetHeader("Content-Type", "application/json"). | |||
SetAuthToken(TOKEN). | |||
SetResult(&result). | |||
Get(HOST + "/v1/" + setting.ProjectID + urlResourceSpecs) | |||
if err != nil { | |||
return nil, fmt.Errorf("resty GetJob: %v", err) | |||
} | |||
if res.StatusCode() == http.StatusUnauthorized && retry < 1 { | |||
retry++ | |||
_ = getToken() | |||
goto sendjob | |||
} | |||
if res.StatusCode() != http.StatusOK { | |||
log.Error("GetResourceSpecs failed(%d)", res.StatusCode()) | |||
return &result, fmt.Errorf("GetResourceSpecs failed(%d)", res.StatusCode()) | |||
} | |||
if !result.IsSuccess { | |||
log.Error("GetResourceSpecs failed(%s): %s", result.ErrorCode, result.ErrorMsg) | |||
return &result, fmt.Errorf("GetResourceSpecs failed(%s): %s", result.ErrorCode, result.ErrorMsg) | |||
} | |||
return &result, nil | |||
} |
@@ -336,6 +336,9 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) | |||
workServerNumber := form.WorkServerNumber | |||
engineID := form.EngineID | |||
bootFile := form.BootFile | |||
flavorCode := form.Flavor | |||
poolID := form.PoolID | |||
specID := form.SpecID | |||
repo := ctx.Repo.Repository | |||
codeLocalPath := setting.JobPath + jobName + modelarts.CodePath | |||
codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath | |||
@@ -349,6 +352,12 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) | |||
} | |||
//todo: upload code (send to file_server todo this work?) | |||
if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil { | |||
log.Error("Failed to obsMkdir: %s (%v)", repo.FullName(), err) | |||
ctx.RenderWithErr("Failed to obsMkdir", tplModelArtsTrainJobNew, &form) | |||
return | |||
} | |||
if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil { | |||
log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err) | |||
ctx.RenderWithErr("Failed to uploadCodeToObs", tplModelArtsTrainJobNew, &form) | |||
@@ -360,14 +369,18 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) | |||
DataUrl: dataPath, | |||
Description: description, | |||
CodeObsPath: codeObsPath, | |||
BootFile: codeObsPath + "/" + bootFile, | |||
BootFile: codeObsPath + bootFile, | |||
TrainUrl: outputObsPath, | |||
FlavorCode: flavorCode, | |||
PoolID: poolID, | |||
WorkServerNumber: workServerNumber, | |||
EngineID: int64(engineID), | |||
SpecID: int64(specID), | |||
} | |||
err := modelarts.GenerateTrainJob(ctx, req) | |||
if err != nil { | |||
log.Error("GenerateTrainJob failed:%v", err.Error()) | |||
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobNew, &form) | |||
return | |||
} | |||
@@ -408,7 +421,7 @@ func uploadCodeToObs(codePath, jobName, parentDir string) error { | |||
if file.IsDir() { | |||
input := &obs.PutObjectInput{} | |||
input.Bucket = setting.Bucket | |||
input.Key = codePath + file.Name() + "/" | |||
input.Key = parentDir + file.Name() + "/" | |||
_, err = storage.ObsCli.PutObject(input) | |||
if err != nil { | |||
log.Error("PutObject(%s) failed: %s", input.Key, err.Error()) | |||
@@ -422,7 +435,7 @@ func uploadCodeToObs(codePath, jobName, parentDir string) error { | |||
} else { | |||
input := &obs.PutFileInput{} | |||
input.Bucket = setting.Bucket | |||
input.Key = setting.CodePathPrefix + jobName + "/" + parentDir + file.Name() | |||
input.Key = setting.CodePathPrefix + jobName + "/code/" + parentDir + file.Name() | |||
input.SourceFile = codePath + file.Name() | |||
_, err = storage.ObsCli.PutFile(input) | |||
if err != nil { | |||
@@ -434,3 +447,16 @@ func uploadCodeToObs(codePath, jobName, parentDir string) error { | |||
return nil | |||
} | |||
func obsMkdir(dir string) error { | |||
input := &obs.PutObjectInput{} | |||
input.Bucket = setting.Bucket | |||
input.Key = dir | |||
_, err := storage.ObsCli.PutObject(input) | |||
if err != nil { | |||
log.Error("PutObject(%s) failed: %s", input.Key, err.Error()) | |||
return err | |||
} | |||
return nil | |||
} |
@@ -128,7 +128,7 @@ | |||
</div> | |||
<div class="inline required field"> | |||
<label>{{.i18n.Tr "repo.modelarts.train_job.dataset"}}</label> | |||
<select class="ui search dropdown" id="trainjob_datasets" style='width:385px'> | |||
<select class="ui search dropdown" id="trainjob_datasets" style='width:385px' name="attachment"> | |||
{{range .attachments}} | |||
<option name="attachment" value="{{.UUID}}">{{.Attachment.Name}}</option> | |||
{{end}} | |||
@@ -151,7 +151,7 @@ | |||
</select> | |||
</div> | |||
<div class="field"> | |||
<select class="ui search dropdown" id="trainjob_engine_versions" style='width:385px'> | |||
<select class="ui search dropdown" id="trainjob_engine_versions" style='width:385px' name="engine_id"> | |||
{{range .engine_versions}} | |||
<option name="engine_id" value="{{.ID}}">{{.Value}}</option> | |||
{{end}} | |||
@@ -161,13 +161,13 @@ | |||
</div> | |||
<div class="inline required field"> | |||
<label>{{.i18n.Tr "repo.modelarts.train_job.start_file"}}</label> | |||
<input name="boot_file" id="trainjob_boot_file" value="{{.dataset_path}}" tabindex="3" autofocus required maxlength="255" readonly="readonly"> | |||
<input name="boot_file" id="trainjob_boot_file" value="{{.dataset_path}}" tabindex="3" autofocus required maxlength="255"> | |||
</div> | |||
</div> | |||
</div> | |||
<div class="required field"> | |||
<label>{{.i18n.Tr "repo.modelarts.train_job.dataset"}}</label> | |||
<select class="ui search dropdown" id="trainjob_datasets" style='width:385px'> | |||
<select class="ui search dropdown" id="trainjob_datasets" style='width:385px' name="attachment"> | |||
{{range .attachments}} | |||
<option name="attachment" value="{{.UUID}}">{{.Attachment.Name}}</option> | |||
{{end}} | |||
@@ -184,9 +184,9 @@ | |||
<h4 class="ui dividing header">{{.i18n.Tr "repo.modelarts.train_job.resource_setting"}}</h4> | |||
<div class="required field"> | |||
<label>{{.i18n.Tr "repo.modelarts.train_job.resource_pool"}}</label> | |||
<select class="ui search dropdown" id="trainjob_resource_pool" style='width:385px'> | |||
<select class="ui search dropdown" id="trainjob_resource_pool" style='width:385px' name="pool_id"> | |||
{{range .resource_pools}} | |||
<option value="{{.Value}}">{{.Value}}</option> | |||
<option value="{{.ID}}">{{.Value}}</option> | |||
{{end}} | |||
</select> | |||
</div> | |||
@@ -211,9 +211,9 @@ | |||
<div class="required field"> | |||
<label>{{.i18n.Tr "repo.modelarts.train_job.standard"}}</label> | |||
<select class="ui search dropdown" id="trainjob-flavor" style='width:385px'> | |||
<select class="ui search dropdown" id="trainjob-flavor" style='width:385px' name="flavor"> | |||
{{range .flavor_infos}} | |||
<option name="flavor" value="{{.Value}}">{{.Value}}</option> | |||
<option name="flavor" value="{{.Code}}">{{.Value}}</option> | |||
{{end}} | |||
</select> | |||
</div> | |||
@@ -232,9 +232,9 @@ | |||
</div> | |||
</div> | |||
<div class="disabled field" id="save_para"> | |||
<div class="required field"> | |||
<div class="field"> | |||
<label>{{.i18n.Tr "repo.modelarts.train_job.job_parameter_name"}}</label> | |||
<input name="job_type" id="cloudbrain_job_type" value="{{.notebook_type}}" tabindex="3" autofocus required maxlength="255"> | |||
<input name="job_type" id="cloudbrain_job_type" value="{{.notebook_type}}" tabindex="3" autofocus maxlength="255"> | |||
</div> | |||
<div class="field"> | |||
<label for="parameter_description">{{.i18n.Tr "repo.modelarts.train_job.parameter_description"}}</label> | |||