Browse Source

提交代码

tags/v1.22.11.2^2
ychao_1983 3 years ago
parent
commit
f676bbb943
3 changed files with 65 additions and 50 deletions
  1. +17
    -2
      models/base_message.go
  2. +33
    -33
      services/cloudbrain/cloudbrainTask/inference.go
  3. +15
    -15
      services/cloudbrain/cloudbrainTask/train.go

+ 17
- 2
models/base_message.go View File

@@ -1,8 +1,8 @@
package models

type BaseMessage struct {
Code int `json:"code"`
Message string `json:"message"`
Code int
Message string
}

var BaseOKMessage = BaseMessage{
@@ -14,3 +14,18 @@ func BaseErrorMessage(message string) BaseMessage {
1, message,
}
}

type BaseMessageApi struct {
Code int `json:"code"`
Message string `json:"message"`
}

var BaseOKMessageApi = BaseMessageApi{
0, "",
}

func BaseErrorMessageApi(message string) BaseMessageApi {
return BaseMessageApi{
1, message,
}
}

+ 33
- 33
services/cloudbrain/cloudbrainTask/inference.go View File

@@ -54,7 +54,7 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, option api.CreateTrainJo
if !isOk {
log.Error("lock processed failed:%v", err, ctx.Data["MsgID"])

ctx.JSON(http.StatusOK, models.BaseErrorMessage(ctx.Tr("repo.cloudbrain_samejob_err")))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("repo.cloudbrain_samejob_err")))
return
}

@@ -63,7 +63,7 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, option api.CreateTrainJo
command, err := getInferenceJobCommand(option)
if err != nil {
log.Error("getTrainJobCommand failed: %v", err)
ctx.JSON(http.StatusOK, models.BaseErrorMessage(err.Error()))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(err.Error()))
return
}

@@ -71,40 +71,40 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, option api.CreateTrainJo
if err == nil {
if len(tasks) != 0 {
log.Error("the job name did already exist", ctx.Data["MsgID"])
ctx.JSON(http.StatusOK, models.BaseErrorMessage("the job name did already exist"))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi("the job name did already exist"))
return
}
} else {
if !models.IsErrJobNotExist(err) {
log.Error("system error, %v", err, ctx.Data["MsgID"])

ctx.JSON(http.StatusOK, models.BaseErrorMessage("system error"))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi("system error"))
return
}
}

if !jobNamePattern.MatchString(displayJobName) {

ctx.JSON(http.StatusOK, models.BaseErrorMessage(ctx.Tr("repo.cloudbrain_jobname_err")))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("repo.cloudbrain_jobname_err")))
return
}

bootFileExist, err := ctx.Repo.FileExists(bootFile, branchName)
if err != nil || !bootFileExist {
log.Error("Get bootfile error:", err, ctx.Data["MsgID"])
ctx.JSON(http.StatusOK, models.BaseErrorMessage(ctx.Tr("repo.cloudbrain_bootfile_err")))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("repo.cloudbrain_bootfile_err")))
return
}

count, err := GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeCloudBrainOne, jobType)
if err != nil {
log.Error("GetCloudbrainCountByUserID failed:%v", err, ctx.Data["MsgID"])
ctx.JSON(http.StatusOK, models.BaseErrorMessage("system error"))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi("system error"))
return
} else {
if count >= 1 {
log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
ctx.JSON(http.StatusOK, models.BaseErrorMessage(ctx.Tr("repo.cloudbrain.morethanonejob")))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("repo.cloudbrain.morethanonejob")))
return
}
}
@@ -115,7 +115,7 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, option api.CreateTrainJo
errStr := loadCodeAndMakeModelPath(repo, codePath, branchName, jobName, cloudbrain.ResultPath)
if errStr != "" {

ctx.JSON(http.StatusOK, models.BaseErrorMessage(ctx.Tr(errStr)))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr(errStr)))
return
}

@@ -125,7 +125,7 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, option api.CreateTrainJo
if err != nil {
log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])

ctx.JSON(http.StatusOK, models.BaseErrorMessage(ctx.Tr("cloudbrain.error.dataset_select")))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("cloudbrain.error.dataset_select")))
return
}
spec, err := resource.GetAndCheckSpec(ctx.User.ID, option.SpecId, models.FindSpecsOptions{
@@ -135,12 +135,12 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, option api.CreateTrainJo
AiCenterCode: models.AICenterOfCloudBrainOne})
if err != nil || spec == nil {

ctx.JSON(http.StatusOK, models.BaseErrorMessage("Resource specification is not available"))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi("Resource specification is not available"))
return
}
if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
ctx.JSON(http.StatusOK, models.BaseErrorMessage(ctx.Tr("points.insufficient_points_balance")))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("points.insufficient_points_balance")))
return
}
req := cloudbrain.GenerateCloudBrainTaskReq{
@@ -175,10 +175,10 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, option api.CreateTrainJo
jobId, err := cloudbrain.GenerateTask(req)
if err != nil {

ctx.JSON(http.StatusOK, models.BaseErrorMessage(err.Error()))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(err.Error()))
return
}
ctx.JSON(http.StatusOK, models.BaseMessage{Code: 0, Message: jobId})
ctx.JSON(http.StatusOK, models.BaseMessageApi{Code: 0, Message: jobId})
}

func ModelArtsInferenceJobCreate(ctx *context.Context, option api.CreateTrainJobOption) {
@@ -212,7 +212,7 @@ func ModelArtsInferenceJobCreate(ctx *context.Context, option api.CreateTrainJob
errStr := checkInferenceJobMultiNode(ctx.User.ID, option.WorkServerNumber)
if errStr != "" {

ctx.JSON(http.StatusOK, models.BaseErrorMessage(ctx.Tr(errStr)))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr(errStr)))
return
}

@@ -221,7 +221,7 @@ func ModelArtsInferenceJobCreate(ctx *context.Context, option api.CreateTrainJob
if !isOk {
log.Error("lock processed failed:%v", err, ctx.Data["MsgID"])

ctx.JSON(http.StatusOK, models.BaseErrorMessage(ctx.Tr("repo.cloudbrain_samejob_err")))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("repo.cloudbrain_samejob_err")))
return
}
defer lock.UnLock()
@@ -230,13 +230,13 @@ func ModelArtsInferenceJobCreate(ctx *context.Context, option api.CreateTrainJob
if err != nil {
log.Error("GetCloudbrainInferenceJobCountByUserID failed:%v", err, ctx.Data["MsgID"])

ctx.JSON(http.StatusOK, models.BaseErrorMessage("system error"))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi("system error"))
return
} else {
if count >= 1 {
log.Error("the user already has running or waiting inference task", ctx.Data["MsgID"])

ctx.JSON(http.StatusOK, models.BaseErrorMessage("you have already a running or waiting inference task, can not create more"))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi("you have already a running or waiting inference task, can not create more"))
return
}
}
@@ -244,14 +244,14 @@ func ModelArtsInferenceJobCreate(ctx *context.Context, option api.CreateTrainJob
if err := paramCheckCreateInferenceJob(option); err != nil {
log.Error("paramCheckCreateInferenceJob failed:(%v)", err)

ctx.JSON(http.StatusOK, models.BaseErrorMessage(err.Error()))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(err.Error()))
return
}

bootFileExist, err := ctx.Repo.FileExists(bootFile, branchName)
if err != nil || !bootFileExist {
log.Error("Get bootfile error:", err)
ctx.JSON(http.StatusOK, models.BaseErrorMessage(ctx.Tr("repo.cloudbrain_bootfile_err")))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("repo.cloudbrain_bootfile_err")))
return
}

@@ -261,14 +261,14 @@ func ModelArtsInferenceJobCreate(ctx *context.Context, option api.CreateTrainJob
if len(tasks) != 0 {
log.Error("the job name did already exist", ctx.Data["MsgID"])

ctx.JSON(http.StatusOK, models.BaseErrorMessage("the job name did already exist"))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi("the job name did already exist"))
return
}
} else {
if !models.IsErrJobNotExist(err) {
log.Error("system error, %v", err, ctx.Data["MsgID"])

ctx.JSON(http.StatusOK, models.BaseErrorMessage("system error"))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi("system error"))
return
}
}
@@ -280,12 +280,12 @@ func ModelArtsInferenceJobCreate(ctx *context.Context, option api.CreateTrainJob
AiCenterCode: models.AICenterOfCloudBrainTwo})
if err != nil || spec == nil {

ctx.JSON(http.StatusOK, models.BaseErrorMessage("Resource specification not available"))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi("Resource specification not available"))
return
}
if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
log.Error("point balance is not enough,userId=%d specId=%d ", ctx.User.ID, spec.ID)
ctx.JSON(http.StatusOK, models.BaseErrorMessage(ctx.Tr("points.insufficient_points_balance")))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("points.insufficient_points_balance")))
return
}

@@ -301,7 +301,7 @@ func ModelArtsInferenceJobCreate(ctx *context.Context, option api.CreateTrainJob
if err := downloadCode(repo, codeLocalPath, branchName); err != nil {
log.Error("Create task failed, server timed out: %s (%v)", repo.FullName(), err)

ctx.JSON(http.StatusOK, models.BaseErrorMessage(ctx.Tr("cloudbrain.load_code_failed")))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("cloudbrain.load_code_failed")))
return
}

@@ -309,19 +309,19 @@ func ModelArtsInferenceJobCreate(ctx *context.Context, option api.CreateTrainJob
if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.ResultPath + VersionOutputPath + "/"); err != nil {
log.Error("Failed to obsMkdir_result: %s (%v)", repo.FullName(), err)

ctx.JSON(http.StatusOK, models.BaseErrorMessage("Failed to obsMkdir_result"))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi("Failed to obsMkdir_result"))
return
}

if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
ctx.JSON(http.StatusOK, models.BaseErrorMessage("Failed to obsMkdir_log"))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi("Failed to obsMkdir_log"))
return
}

if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
ctx.JSON(http.StatusOK, models.BaseErrorMessage(ctx.Tr("cloudbrain.load_code_failed")))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("cloudbrain.load_code_failed")))
return
}

@@ -338,7 +338,7 @@ func ModelArtsInferenceJobCreate(ctx *context.Context, option api.CreateTrainJob
datasUrlList, dataUrl, datasetNames, isMultiDataset, err := getDatasUrlListByUUIDS(uuid)
if err != nil {

ctx.JSON(http.StatusOK, models.BaseErrorMessage(err.Error()))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(err.Error()))
return
}
dataPath := dataUrl
@@ -346,7 +346,7 @@ func ModelArtsInferenceJobCreate(ctx *context.Context, option api.CreateTrainJob
if err != nil {
log.Error("Failed to Marshal: %v", err)

ctx.JSON(http.StatusOK, models.BaseErrorMessage("json error:"+err.Error()))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi("json error:"+err.Error()))
return
}
if isMultiDataset {
@@ -362,7 +362,7 @@ func ModelArtsInferenceJobCreate(ctx *context.Context, option api.CreateTrainJob
if err != nil {
log.Error("Failed to Unmarshal params: %s (%v)", params, err)

ctx.JSON(http.StatusOK, models.BaseErrorMessage("运行参数错误"))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi("运行参数错误"))
return
}

@@ -421,10 +421,10 @@ func ModelArtsInferenceJobCreate(ctx *context.Context, option api.CreateTrainJob
if err != nil {
log.Error("GenerateTrainJob failed:%v", err.Error())

ctx.JSON(http.StatusOK, models.BaseErrorMessage(err.Error()))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(err.Error()))
return
}
ctx.JSON(http.StatusOK, models.BaseMessage{Code: 0, Message: jobId})
ctx.JSON(http.StatusOK, models.BaseMessageApi{Code: 0, Message: jobId})
}

func getDatasUrlListByUUIDS(uuidStr string) ([]models.Datasurl, string, string, bool, error) {


+ 15
- 15
services/cloudbrain/cloudbrainTask/train.go View File

@@ -53,7 +53,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, option api.CreateTrainJobOpt
defer lock.UnLock()
spec, datasetInfos, datasetNames, err := checkParameters(ctx, option, lock, repo)
if err != nil {
ctx.JSON(http.StatusOK, models.BaseErrorMessage(err.Error()))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(err.Error()))
return
}

@@ -65,7 +65,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, option api.CreateTrainJobOpt

if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil {
log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
ctx.JSON(http.StatusOK, models.BaseErrorMessage(ctx.Tr("cloudbrain.load_code_failed")))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("cloudbrain.load_code_failed")))

}

@@ -73,21 +73,21 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, option api.CreateTrainJobOpt
//upload code
if err := uploadCodeToMinio(codeLocalPath+"/", jobName, cloudbrain.CodeMountPath+"/"); err != nil {
log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
ctx.JSON(http.StatusOK, models.BaseErrorMessage(ctx.Tr("cloudbrain.load_code_failed")))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("cloudbrain.load_code_failed")))
return
}

modelPath := setting.JobPath + jobName + cloudbrain.ModelMountPath + "/"
if err := mkModelPath(modelPath); err != nil {
log.Error("Failed to mkModelPath: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
ctx.JSON(http.StatusOK, models.BaseErrorMessage(ctx.Tr("cloudbrain.load_code_failed")))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("cloudbrain.load_code_failed")))
return
}

//init model readme
if err := uploadCodeToMinio(modelPath, jobName, cloudbrain.ModelMountPath+"/"); err != nil {
log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
ctx.JSON(http.StatusOK, models.BaseErrorMessage(ctx.Tr("cloudbrain.load_code_failed")))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("cloudbrain.load_code_failed")))
return
}

@@ -109,7 +109,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, option api.CreateTrainJobOpt
command, err := generateCommand(repo.Name, grampus.ProcessorTypeGPU, codeMinioPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, bootFile, params, setting.CBCodePathPrefix+jobName+cloudbrain.ModelMountPath+"/", allFileName, preTrainModelPath, option.CkptName)
if err != nil {
log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"])
ctx.JSON(http.StatusOK, models.BaseErrorMessage("Create task failed, internal error"))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi("Create task failed, internal error"))
return
}

@@ -150,10 +150,10 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, option api.CreateTrainJobOpt
jobId, err := grampus.GenerateTrainJob(ctx, req)
if err != nil {
log.Error("GenerateTrainJob failed:%v", err.Error(), ctx.Data["MsgID"])
ctx.JSON(http.StatusOK, models.BaseErrorMessage(err.Error()))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(err.Error()))
return
}
ctx.JSON(http.StatusOK, models.BaseMessage{Code: 0, Message: jobId})
ctx.JSON(http.StatusOK, models.BaseMessageApi{Code: 0, Message: jobId})
}

func checkParameters(ctx *context.Context, option api.CreateTrainJobOption, lock *redis_lock.DistributeLock, repo *models.Repository) (*models.Specification, map[string]models.DatasetInfo, string, error) {
@@ -258,7 +258,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, option api.CreateTrainJobOpt
defer lock.UnLock()
spec, datasetInfos, datasetNames, err := checkParameters(ctx, option, lock, repo)
if err != nil {
ctx.JSON(http.StatusOK, models.BaseErrorMessage(err.Error()))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(err.Error()))
return
}

@@ -271,7 +271,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, option api.CreateTrainJobOpt
if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil {
log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err)

ctx.JSON(http.StatusOK, models.BaseErrorMessage(ctx.Tr("cloudbrain.load_code_failed")))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("cloudbrain.load_code_failed")))
return
}

@@ -279,14 +279,14 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, option api.CreateTrainJobOpt
if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil {
log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)

ctx.JSON(http.StatusOK, models.BaseErrorMessage(ctx.Tr("cloudbrain.load_code_failed")))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("cloudbrain.load_code_failed")))
return
}

if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)

ctx.JSON(http.StatusOK, models.BaseErrorMessage(ctx.Tr("cloudbrain.load_code_failed")))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("cloudbrain.load_code_failed")))
return
}

@@ -308,7 +308,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, option api.CreateTrainJobOpt
if err != nil {
log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"])

ctx.JSON(http.StatusOK, models.BaseErrorMessage("Create task failed, internal error"))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi("Create task failed, internal error"))
return
}

@@ -352,10 +352,10 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, option api.CreateTrainJobOpt
if err != nil {
log.Error("GenerateTrainJob failed:%v", err.Error())

ctx.JSON(http.StatusOK, models.BaseErrorMessage(err.Error()))
ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(err.Error()))
return
}
ctx.JSON(http.StatusOK, models.BaseMessage{Code: 0, Message: jobId})
ctx.JSON(http.StatusOK, models.BaseMessageApi{Code: 0, Message: jobId})
}

func obsMkdir(dir string) error {


Loading…
Cancel
Save