Compare commits

...

25 Commits

Author SHA1 Message Date
  lewis 47d650f9f5 get log 3 years ago
  lewis 87237ea557 merge 3 years ago
  lewis e56a6c7d4f inference 3 years ago
  lewis 040453f08b Merge branch 'V20220801' of git.openi.org.cn:OpenI/aiforge into fix-2419 3 years ago
  lewis 6221ecbb8f test 3 years ago
  lewis 29ea504f64 Merge branch 'V20220718' of git.openi.org.cn:OpenI/aiforge into fix-2419 3 years ago
  lewis 96e0d00d80 debug 3 years ago
  lewis ad5f1b0705 Merge branch 'V20220718' of git.openi.org.cn:OpenI/aiforge into fix-2419 3 years ago
  lewis 7fd4bef198 debug 3 years ago
  lewis e59280fe62 Merge branch 'V20220718' of git.openi.org.cn:OpenI/aiforge into fix-2419 3 years ago
  lewis d1d320c6a2 Merge branch 'V20220718' of git.openi.org.cn:OpenI/aiforge into fix-2419 3 years ago
  lewis 5ac07f4e68 del unused 3 years ago
  lewis f4142e3555 Merge branch 'V20220718' of git.openi.org.cn:OpenI/aiforge into fix-2419 3 years ago
  lewis 9ed7137063 merge 3 years ago
  lewis 370cee5a83 train 3 years ago
  lewis aabb0c3852 debug 3 years ago
  lewis 95bed2d711 Merge branch 'V20220718' of git.openi.org.cn:OpenI/aiforge into fix-2419 3 years ago
  lewis 561e37b239 add inter 3 years ago
  lewis ef8e67aea5 add modelarts interface 3 years ago
  lewis cce0176d5a Merge branch 'V20220718' of git.openi.org.cn:OpenI/aiforge into fix-2419 3 years ago
  lewis b1ac061310 debug 3 years ago
  lewis 912fbbfe23 debug 3 years ago
  lewis 2f7e4d34c6 debug 3 years ago
  lewis 20de52c10d debug 3 years ago
  lewis 52c648fca3 create failed temp 3 years ago
14 changed files with 1055 additions and 766 deletions
Split View
  1. +66
    -6
      models/cloudbrain.go
  2. +57
    -0
      models/cloudbrain_temp.go
  3. +1
    -0
      models/models.go
  4. +5
    -5
      modules/cloudbrain/cloudbrain.go
  5. +476
    -240
      modules/modelarts/modelarts.go
  6. +171
    -21
      modules/modelarts/resty.go
  7. +2
    -3
      routers/api/v1/api.go
  8. +66
    -173
      routers/api/v1/repo/modelarts.go
  9. +10
    -48
      routers/repo/cloudbrain.go
  10. +2
    -2
      routers/repo/grampus.go
  11. +161
    -233
      routers/repo/modelarts.go
  12. +12
    -11
      routers/routes/routes.go
  13. +7
    -7
      templates/repo/modelarts/trainjob/index.tmpl
  14. +19
    -17
      templates/repo/modelarts/trainjob/show.tmpl

+ 66
- 6
models/cloudbrain.go View File

@@ -31,9 +31,11 @@ const (
)

const (
NPUResource = "NPU"
GPUResource = "CPU/GPU"
AllResource = "all"
TempJobIdPrefix = "TEMP"
JobStatusTemp = "TEMP"
NPUResource = "NPU"
GPUResource = "CPU/GPU"
AllResource = "all"

//notebook storage category
EVSCategory = "EVS"
@@ -344,6 +346,7 @@ type CloudbrainsOptions struct {
RepoID int64 // include all repos if empty
UserID int64
JobID string
JobName string
SortType string
CloudbrainIDs []int64
JobStatus []string
@@ -1247,6 +1250,52 @@ type LogFile struct {
Name string
}

type JobList struct {
JobName string `json:"job_name"`
JobID int64 `json:"job_id"`
VersionID int64 `json:"version_id"`
VersionCount int64 `json:"version_count"`
Description string `json:"job_desc"`
IntStatus int `json:"status"`
}

type GetTrainJobListResult struct {
ErrorResult
JobTotalCount int `json:"job_total_count"` //查询到的用户创建作业总数
JobCountLimit int `json:"job_count_limit"` //用户还可以创建训练作业的数量
Quotas int `json:"quotas"` //训练作业的运行数量上限
JobList []JobList `json:"jobs"`
}

type JobVersionList struct {
VersionName string `json:"version_name"`
VersionID int64 `json:"version_id"`
IntStatus int `json:"status"`
}

type GetTrainJobVersionListResult struct {
ErrorResult
JobID int64 `json:"job_id"`
JobName string `json:"job_name"`
JobDesc string `json:"job_desc"`
VersionCount int64 `json:"version_count"`
JobVersionList []JobVersionList `json:"versions"`
}

type NotebookList struct {
JobName string `json:"name"`
JobID string `json:"id"`
Status string `json:"status"`
}

type GetNotebookListResult struct {
TotalCount int64 `json:"total"` //总的记录数量
CurrentPage int `json:"current"` //当前页数
TotalPages int `json:"pages"` //总的页数
Size int `json:"size"` //每一页的数量
NotebookList []NotebookList `json:"data"`
}

//Grampus
type GrampusResult struct {
ErrorCode int `json:"errorCode"`
@@ -1559,6 +1608,12 @@ func CloudbrainsVersionList(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int, e
)
}

if (opts.JobName) != "" {
cond = cond.And(
builder.Eq{"cloudbrain.job_name": opts.JobName},
)
}

if len(opts.JobTypes) > 0 {
cond = cond.And(
builder.In("cloudbrain.job_type", opts.JobTypes),
@@ -1692,9 +1747,9 @@ func SetTrainJobStatusByJobID(jobID string, status string, duration int64, train
return
}

func SetVersionCountAndLatestVersion(jobID string, versionName string, versionCount int, isLatestVersion string, totalVersionCount int) (err error) {
cb := &Cloudbrain{JobID: jobID, VersionName: versionName, VersionCount: versionCount, IsLatestVersion: isLatestVersion, TotalVersionCount: totalVersionCount}
_, err = x.Cols("version_Count", "is_latest_version", "total_version_count").Where("cloudbrain.job_id=? AND cloudbrain.version_name=?", jobID, versionName).Update(cb)
func SetVersionCountAndLatestVersion(jobName string, versionName string, versionCount int, isLatestVersion string, totalVersionCount int) (err error) {
cb := &Cloudbrain{JobName: jobName, VersionName: versionName, VersionCount: versionCount, IsLatestVersion: isLatestVersion, TotalVersionCount: totalVersionCount}
_, err = x.Cols("version_Count", "is_latest_version", "total_version_count").Where("cloudbrain.job_name=? AND cloudbrain.version_name=?", jobName, versionName).Update(cb)
return
}

@@ -2114,3 +2169,8 @@ func GetCloudbrainByIDs(ids []int64) ([]*Cloudbrain, error) {
In("id", ids).
Find(&cloudbrains)
}

func GetCloudbrainCountByJobName(jobName, jobType string) (int, error) {
count, err := x.Where("job_name = ? and job_type= ?", jobName, jobType).Count(new(Cloudbrain))
return int(count), err
}

+ 57
- 0
models/cloudbrain_temp.go View File

@@ -0,0 +1,57 @@
package models

import (
"time"

"code.gitea.io/gitea/modules/timeutil"
)

const (
//TempJobIdPrefix = "TEMP"

)

type CloudbrainTemp struct {
CloudbrainID int64 `xorm:"pk"`
JobName string
Type int
JobType string `xorm:"INDEX NOT NULL DEFAULT 'DEBUG'"`
Status string `xorm:"INDEX NOT NULL DEFAULT 'TEMP'"`
VersionCount int `xorm:"NOT NULL DEFAULT 0"`
QueryTimes int `xorm:"INDEX NOT NULL DEFAULT 0"`
CreatedUnix timeutil.TimeStamp `xorm:"INDEX"`
UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"`
DeletedAt time.Time `xorm:"deleted"`
}

func InsertCloudbrainTemp(temp *CloudbrainTemp) (err error) {
if _, err = x.Insert(temp); err != nil {
return err
}

return nil
}

func getCloudBrainTemp(temp *CloudbrainTemp) (*CloudbrainTemp, error) {
has, err := x.Get(temp)
if err != nil {
return nil, err
} else if !has {
return nil, ErrJobNotExist{}
}
return temp, nil
}

func GetCloudbrainTempByCloudbrainID(id int64) (*CloudbrainTemp, error) {
temp := &CloudbrainTemp{CloudbrainID: id}
return getCloudBrainTemp(temp)
}

func DeleteCloudbrainTemp(temp *CloudbrainTemp) error {
return deleteCloudbrainTemp(x, temp)
}

func deleteCloudbrainTemp(e Engine, temp *CloudbrainTemp) error {
_, err := e.Where("cloudbrain_id = ?", temp.CloudbrainID).Delete(temp)
return err
}

+ 1
- 0
models/models.go View File

@@ -144,6 +144,7 @@ func init() {
new(WechatBindLog),
new(OrgStatistic),
new(SearchRecord),
new(CloudbrainTemp),
new(AiModelConvert),
)



+ 5
- 5
modules/cloudbrain/cloudbrain.go View File

@@ -142,8 +142,8 @@ func isAdminOrImageCreater(ctx *context.Context, image *models.Image, err error)

func AdminOrOwnerOrJobCreaterRight(ctx *context.Context) {

var ID = ctx.Params(":id")
job, err := models.GetCloudbrainByID(ID)
var id = ctx.Params(":id")
job, err := models.GetCloudbrainByID(id)
if err != nil {
log.Error("GetCloudbrainByID failed:%v", err.Error())
ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
@@ -158,8 +158,8 @@ func AdminOrOwnerOrJobCreaterRight(ctx *context.Context) {

func AdminOrJobCreaterRight(ctx *context.Context) {

var ID = ctx.Params(":id")
job, err := models.GetCloudbrainByID(ID)
var id = ctx.Params(":id")
job, err := models.GetCloudbrainByID(id)
if err != nil {
log.Error("GetCloudbrainByID failed:%v", err.Error())
ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
@@ -547,7 +547,7 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e
GPUNumber: resourceSpec.GpuNum,
MemoryMB: resourceSpec.MemMiB,
ShmMB: resourceSpec.ShareMemMiB,
Command: GetCloudbrainDebugCommand(),//Command,
Command: GetCloudbrainDebugCommand(), //Command,
NeedIBDevice: false,
IsMainRole: false,
UseNNI: false,


+ 476
- 240
modules/modelarts/modelarts.go View File

@@ -4,8 +4,11 @@ import (
"encoding/json"
"errors"
"fmt"
"math/rand"
"path"
"strconv"
"strings"
"time"

"code.gitea.io/gitea/modules/timeutil"

@@ -59,7 +62,7 @@ const (
PerPage = 10
IsLatestVersion = "1"
NotLatestVersion = "0"
VersionCount = 1
VersionCountOne = 1

SortByCreateTime = "create_time"
ConfigTypeCustom = "custom"
@@ -264,31 +267,13 @@ func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, desc
log.Error("GetNotebookImageName failed: %v", err.Error())
return err
}

createTime := timeutil.TimeStampNow()
jobResult, err := createNotebook2(models.CreateNotebook2Params{
JobName: jobName,
Description: description,
Flavor: flavor,
Duration: autoStopDurationMs,
ImageID: imageId,
PoolID: poolInfos.PoolInfo[0].PoolId,
Feature: models.NotebookFeature,
Volume: models.VolumeReq{
Capacity: setting.Capacity,
Category: models.EVSCategory,
Ownership: models.ManagedOwnership,
},
WorkspaceID: "0",
})
if err != nil {
log.Error("createNotebook2 failed: %v", err.Error())
return err
}
err = models.CreateCloudbrain(&models.Cloudbrain{
Status: jobResult.Status,
task := &models.Cloudbrain{
Status: string(models.ModelArtsTrainJobWaiting),
UserID: ctx.User.ID,
RepoID: ctx.Repo.Repository.ID,
JobID: jobResult.ID,
JobID: models.TempJobIdPrefix + jobName + strconv.Itoa(int(rand.New(rand.NewSource(time.Now().UnixNano())).Int31n(100000))),
JobName: jobName,
FlavorCode: flavor,
DisplayJobName: displayJobName,
@@ -300,16 +285,66 @@ func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, desc
Description: description,
CreatedUnix: createTime,
UpdatedUnix: createTime,
})
}

err = models.CreateCloudbrain(task)
if err != nil {
log.Error("CreateCloudbrain(%s) failed:%v", displayJobName, err.Error())
return err
}
task, err := models.GetCloudbrainByName(jobName)

jobResult, err := createNotebook2(models.CreateNotebook2Params{
JobName: jobName,
Description: description,
Flavor: flavor,
Duration: autoStopDurationMs,
ImageID: imageId,
PoolID: poolInfos.PoolInfo[0].PoolId,
Feature: models.NotebookFeature,
Volume: models.VolumeReq{
Capacity: setting.Capacity,
Category: models.EVSCategory,
Ownership: models.ManagedOwnership,
},
WorkspaceID: "0",
})
if err != nil {
log.Error("GetCloudbrainByName failed: %v", err.Error())
return err
log.Error("createNotebook2 failed: %v", err.Error())
if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
log.Info("(%s)unknown error, set temp status", displayJobName)
errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
CloudbrainID: task.ID,
Status: models.JobStatusTemp,
Type: task.Type,
JobName: task.JobName,
JobType: task.JobType,
})
if errTemp != nil {
log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
return errTemp
}
} else {
task.Status = string(models.ModelArtsCreateFailed)
errTemp := models.UpdateJob(task)
if errTemp != nil {
log.Error("UpdateJob failed: %v", errTemp.Error())
}
errTemp = models.DeleteJob(task)
if errTemp != nil {
log.Error("DeleteJob failed: %v", errTemp.Error())
}
return err
}
} else {
task.Status = jobResult.Status
task.JobID = jobResult.ID
err = models.UpdateJob(task)
if err != nil {
log.Error("UpdateJob failed: %v", err.Error())
return err
}
}

stringId := strconv.FormatInt(task.ID, 10)
notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateDebugNPUTask)
return nil
@@ -317,66 +352,15 @@ func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, desc

func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) {
createTime := timeutil.TimeStampNow()
var jobResult *models.CreateTrainJobResult
var createErr error
if req.EngineID < 0 {
jobResult, createErr = createTrainJobUserImage(models.CreateUserImageTrainJobParams{
JobName: req.JobName,
Description: req.Description,
Config: models.UserImageConfig{
WorkServerNum: req.WorkServerNumber,
AppUrl: req.CodeObsPath,
BootFileUrl: req.BootFileUrl,
DataUrl: req.DataUrl,
TrainUrl: req.TrainUrl,
LogUrl: req.LogUrl,
PoolID: req.PoolID,
CreateVersion: true,
Flavor: models.Flavor{
Code: req.FlavorCode,
},
Parameter: req.Parameters,
UserImageUrl: req.UserImageUrl,
UserCommand: req.UserCommand,
},
})
} else {
jobResult, createErr = createTrainJob(models.CreateTrainJobParams{
JobName: req.JobName,
Description: req.Description,
Config: models.Config{
WorkServerNum: req.WorkServerNumber,
AppUrl: req.CodeObsPath,
BootFileUrl: req.BootFileUrl,
DataUrl: req.DataUrl,
EngineID: req.EngineID,
TrainUrl: req.TrainUrl,
LogUrl: req.LogUrl,
PoolID: req.PoolID,
CreateVersion: true,
Flavor: models.Flavor{
Code: req.FlavorCode,
},
Parameter: req.Parameters,
},
})
}
if createErr != nil {
log.Error("CreateJob failed: %v", createErr.Error())
return createErr
}
jobId := strconv.FormatInt(jobResult.JobID, 10)
createErr = models.CreateCloudbrain(&models.Cloudbrain{
Status: TransTrainJobStatus(jobResult.Status),
task := &models.Cloudbrain{
Status: string(models.ModelArtsTrainJobWaiting),
UserID: ctx.User.ID,
RepoID: ctx.Repo.Repository.ID,
JobID: jobId,
JobID: models.TempJobIdPrefix + req.JobName + strconv.Itoa(int(rand.New(rand.NewSource(time.Now().UnixNano())).Int31n(100000))),
JobName: req.JobName,
DisplayJobName: req.DisplayJobName,
JobType: string(models.JobTypeTrain),
Type: models.TypeCloudBrainTwo,
VersionID: jobResult.VersionID,
VersionName: jobResult.VersionName,
Uuid: req.Uuid,
DatasetName: req.DatasetName,
CommitID: req.CommitID,
@@ -398,49 +382,21 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error
TotalVersionCount: req.TotalVersionCount,
CreatedUnix: createTime,
UpdatedUnix: createTime,
})

if createErr != nil {
log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, createErr.Error())
return createErr
}
notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobId, req.DisplayJobName, models.ActionCreateTrainTask)
return nil
}

func GenerateModelConvertTrainJob(req *GenerateTrainJobReq) (*models.CreateTrainJobResult, error) {

return createTrainJobUserImage(models.CreateUserImageTrainJobParams{
JobName: req.JobName,
Description: req.Description,
Config: models.UserImageConfig{
WorkServerNum: req.WorkServerNumber,
AppUrl: req.CodeObsPath,
BootFileUrl: req.BootFileUrl,
DataUrl: req.DataUrl,
TrainUrl: req.TrainUrl,
LogUrl: req.LogUrl,
PoolID: req.PoolID,
CreateVersion: true,
Flavor: models.Flavor{
Code: req.FlavorCode,
},
Parameter: req.Parameters,
UserImageUrl: req.UserImageUrl,
UserCommand: req.UserCommand,
},
})
}
err = models.CreateCloudbrain(task)
if err != nil {
log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, err.Error())
return err
}

func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) {
createTime := timeutil.TimeStampNow()
var jobResult *models.CreateTrainJobResult
var createErr error
log.Info(" req.EngineID =" + fmt.Sprint(req.EngineID))

if req.EngineID < 0 {
jobResult, createErr = createTrainJobVersionUserImage(models.CreateTrainJobVersionUserImageParams{
jobResult, createErr = createTrainJobUserImage(models.CreateUserImageTrainJobParams{
JobName: req.JobName,
Description: req.Description,
Config: models.TrainJobVersionUserImageConfig{
Config: models.UserImageConfig{
WorkServerNum: req.WorkServerNumber,
AppUrl: req.CodeObsPath,
BootFileUrl: req.BootFileUrl,
@@ -448,19 +404,20 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job
TrainUrl: req.TrainUrl,
LogUrl: req.LogUrl,
PoolID: req.PoolID,
CreateVersion: true,
Flavor: models.Flavor{
Code: req.FlavorCode,
},
Parameter: req.Parameters,
PreVersionId: req.PreVersionId,
UserImageUrl: req.UserImageUrl,
UserCommand: req.UserCommand,
},
}, jobId)
})
} else {
jobResult, createErr = createTrainJobVersion(models.CreateTrainJobVersionParams{
jobResult, createErr = createTrainJob(models.CreateTrainJobParams{
JobName: req.JobName,
Description: req.Description,
Config: models.TrainJobVersionConfig{
Config: models.Config{
WorkServerNum: req.WorkServerNumber,
AppUrl: req.CodeObsPath,
BootFileUrl: req.BootFileUrl,
@@ -469,87 +426,60 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job
TrainUrl: req.TrainUrl,
LogUrl: req.LogUrl,
PoolID: req.PoolID,
CreateVersion: true,
Flavor: models.Flavor{
Code: req.FlavorCode,
},
Parameter: req.Parameters,
PreVersionId: req.PreVersionId,
Parameter: req.Parameters,
},
}, jobId)
}
if createErr != nil {
log.Error("CreateJob failed: %v", createErr.Error())
return createErr
}

var jobTypes []string
jobTypes = append(jobTypes, string(models.JobTypeTrain))
repo := ctx.Repo.Repository
VersionTaskList, VersionListCount, createErr := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
RepoID: repo.ID,
Type: models.TypeCloudBrainTwo,
JobTypes: jobTypes,
JobID: strconv.FormatInt(jobResult.JobID, 10),
})
if createErr != nil {
ctx.ServerError("Cloudbrain", createErr)
return createErr
}
//将当前版本的isLatestVersion设置为"1"和任务数量更新,任务数量包括当前版本数VersionCount和历史创建的总版本数TotalVersionCount

createErr = models.CreateCloudbrain(&models.Cloudbrain{
Status: TransTrainJobStatus(jobResult.Status),
UserID: ctx.User.ID,
RepoID: ctx.Repo.Repository.ID,
JobID: strconv.FormatInt(jobResult.JobID, 10),
JobName: req.JobName,
DisplayJobName: req.DisplayJobName,
JobType: string(models.JobTypeTrain),
Type: models.TypeCloudBrainTwo,
VersionID: jobResult.VersionID,
VersionName: jobResult.VersionName,
Uuid: req.Uuid,
DatasetName: req.DatasetName,
CommitID: req.CommitID,
IsLatestVersion: req.IsLatestVersion,
PreVersionName: req.PreVersionName,
ComputeResource: models.NPUResource,
EngineID: req.EngineID,
TrainUrl: req.TrainUrl,
BranchName: req.BranchName,
Parameters: req.Params,
BootFile: req.BootFile,
DataUrl: req.DataUrl,
LogUrl: req.LogUrl,
PreVersionId: req.PreVersionId,
FlavorCode: req.FlavorCode,
Description: req.Description,
WorkServerNumber: req.WorkServerNumber,
FlavorName: req.FlavorName,
EngineName: req.EngineName,
TotalVersionCount: VersionTaskList[0].TotalVersionCount + 1,
VersionCount: VersionListCount + 1,
CreatedUnix: createTime,
UpdatedUnix: createTime,
})
if createErr != nil {
log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, createErr.Error())
return createErr
})
}

//将训练任务的上一版本的isLatestVersion设置为"0"
createErr = models.SetVersionCountAndLatestVersion(strconv.FormatInt(jobResult.JobID, 10), VersionTaskList[0].VersionName, VersionCount, NotLatestVersion, TotalVersionCount)
if createErr != nil {
ctx.ServerError("Update IsLatestVersion failed", createErr)
return createErr
log.Error("createTrainJob failed: %v", createErr.Error())
if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
CloudbrainID: task.ID,
Status: models.JobStatusTemp,
Type: task.Type,
JobName: task.JobName,
JobType: task.JobType,
})
if errTemp != nil {
log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
return errTemp
}
} else {
task.Status = string(models.ModelArtsTrainJobFailed)
errTemp := models.UpdateJob(task)
if errTemp != nil {
log.Error("UpdateJob failed: %v", errTemp.Error())
}
errTemp = models.DeleteJob(task)
if errTemp != nil {
log.Error("DeleteJob failed: %v", errTemp.Error())
}
return createErr
}
} else {
task.Status = TransTrainJobStatus(jobResult.Status)
task.JobID = strconv.FormatInt(jobResult.JobID, 10)
task.VersionID = jobResult.VersionID
task.VersionName = jobResult.VersionName
err = models.UpdateJob(task)
if err != nil {
log.Error("UpdateJob failed: %v", err.Error())
return err
}
}

return createErr
notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, task.JobID, req.DisplayJobName, models.ActionCreateTrainTask)
return nil
}

func GenerateTrainJobVersionByUserImage(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) {
createTime := timeutil.TimeStampNow()
jobResult, err := createTrainJobUserImage(models.CreateUserImageTrainJobParams{
func GenerateModelConvertTrainJob(req *GenerateTrainJobReq) (*models.CreateTrainJobResult, error) {

return createTrainJobUserImage(models.CreateUserImageTrainJobParams{
JobName: req.JobName,
Description: req.Description,
Config: models.UserImageConfig{
@@ -569,11 +499,9 @@ func GenerateTrainJobVersionByUserImage(ctx *context.Context, req *GenerateTrain
UserCommand: req.UserCommand,
},
})
if err != nil {
log.Error("CreateJob failed: %v", err.Error())
return err
}
}

func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) {
var jobTypes []string
jobTypes = append(jobTypes, string(models.JobTypeTrain))
repo := ctx.Repo.Repository
@@ -581,7 +509,7 @@ func GenerateTrainJobVersionByUserImage(ctx *context.Context, req *GenerateTrain
RepoID: repo.ID,
Type: models.TypeCloudBrainTwo,
JobTypes: jobTypes,
JobID: strconv.FormatInt(jobResult.JobID, 10),
JobID: jobId,
})
if err != nil {
ctx.ServerError("Cloudbrain", err)
@@ -589,25 +517,23 @@ func GenerateTrainJobVersionByUserImage(ctx *context.Context, req *GenerateTrain
}
//将当前版本的isLatestVersion设置为"1"和任务数量更新,任务数量包括当前版本数VersionCount和历史创建的总版本数TotalVersionCount

err = models.CreateCloudbrain(&models.Cloudbrain{
Status: TransTrainJobStatus(jobResult.Status),
createTime := timeutil.TimeStampNow()
task := &models.Cloudbrain{
Status: models.JobStatusTemp,
UserID: ctx.User.ID,
RepoID: ctx.Repo.Repository.ID,
JobID: strconv.FormatInt(jobResult.JobID, 10),
JobID: jobId,
JobName: req.JobName,
DisplayJobName: req.DisplayJobName,
JobType: string(models.JobTypeTrain),
Type: models.TypeCloudBrainTwo,
VersionID: jobResult.VersionID,
VersionName: jobResult.VersionName,
Uuid: req.Uuid,
DatasetName: req.DatasetName,
CommitID: req.CommitID,
IsLatestVersion: req.IsLatestVersion,
PreVersionName: req.PreVersionName,
ComputeResource: models.NPUResource,
EngineID: MORDELART_USER_IMAGE_ENGINE_ID,
Image: req.UserImageUrl,
EngineID: req.EngineID,
TrainUrl: req.TrainUrl,
BranchName: req.BranchName,
Parameters: req.Params,
@@ -624,20 +550,103 @@ func GenerateTrainJobVersionByUserImage(ctx *context.Context, req *GenerateTrain
VersionCount: VersionListCount + 1,
CreatedUnix: createTime,
UpdatedUnix: createTime,
})
}
err = models.CreateCloudbrain(task)
if err != nil {
log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
return err
}

//将训练任务的上一版本的isLatestVersion设置为"0"
err = models.SetVersionCountAndLatestVersion(strconv.FormatInt(jobResult.JobID, 10), VersionTaskList[0].VersionName, VersionCount, NotLatestVersion, TotalVersionCount)
err = models.SetVersionCountAndLatestVersion(req.JobName, VersionTaskList[0].VersionName, VersionListCount, NotLatestVersion, VersionTaskList[0].TotalVersionCount)
if err != nil {
ctx.ServerError("Update IsLatestVersion failed", err)
return err
}

return err
var jobResult *models.CreateTrainJobResult
var createErr error

if req.EngineID < 0 {
jobResult, createErr = createTrainJobVersionUserImage(models.CreateTrainJobVersionUserImageParams{
Description: req.Description,
Config: models.TrainJobVersionUserImageConfig{
WorkServerNum: req.WorkServerNumber,
AppUrl: req.CodeObsPath,
BootFileUrl: req.BootFileUrl,
DataUrl: req.DataUrl,
TrainUrl: req.TrainUrl,
LogUrl: req.LogUrl,
PoolID: req.PoolID,
Flavor: models.Flavor{
Code: req.FlavorCode,
},
Parameter: req.Parameters,
PreVersionId: req.PreVersionId,
UserImageUrl: req.UserImageUrl,
UserCommand: req.UserCommand,
},
}, jobId)
} else {
jobResult, createErr = createTrainJobVersion(models.CreateTrainJobVersionParams{
Description: req.Description,
Config: models.TrainJobVersionConfig{
WorkServerNum: req.WorkServerNumber,
AppUrl: req.CodeObsPath,
BootFileUrl: req.BootFileUrl,
DataUrl: req.DataUrl,
EngineID: req.EngineID,
TrainUrl: req.TrainUrl,
LogUrl: req.LogUrl,
PoolID: req.PoolID,
Flavor: models.Flavor{
Code: req.FlavorCode,
},
Parameter: req.Parameters,
PreVersionId: req.PreVersionId,
},
}, jobId)
}
if createErr != nil {
log.Error("createTrainJobVersion failed: %v", err.Error())
if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
CloudbrainID: task.ID,
Status: models.JobStatusTemp,
Type: task.Type,
JobName: task.JobName,
JobType: task.JobType,
})
if errTemp != nil {
log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
return errTemp
}
} else {
task.Status = string(models.ModelArtsTrainJobFailed)
errTemp := models.UpdateJob(task)
if errTemp != nil {
log.Error("UpdateJob failed: %v", errTemp.Error())
}
errTemp = models.DeleteJob(task)
if errTemp != nil {
log.Error("DeleteJob failed: %v", errTemp.Error())
}
return createErr
}
} else {
task.Status = TransTrainJobStatus(jobResult.Status)
task.JobID = strconv.FormatInt(jobResult.JobID, 10)
task.VersionID = jobResult.VersionID
task.VersionName = jobResult.VersionName
err = models.UpdateJob(task)
if err != nil {
log.Error("UpdateJob failed: %v", err.Error())
return err
}
}

return nil
}

func TransTrainJobStatus(status int) string {
@@ -700,47 +709,22 @@ func GetOutputPathByCount(TotalVersionCount int) (VersionOutputPath string) {

func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (err error) {
createTime := timeutil.TimeStampNow()
jobResult, err := createInferenceJob(models.CreateInferenceJobParams{
JobName: req.JobName,
Description: req.Description,
InfConfig: models.InfConfig{
WorkServerNum: req.WorkServerNumber,
AppUrl: req.CodeObsPath,
BootFileUrl: req.BootFileUrl,
DataUrl: req.DataUrl,
EngineID: req.EngineID,
// TrainUrl: req.TrainUrl,
LogUrl: req.LogUrl,
PoolID: req.PoolID,
CreateVersion: true,
Flavor: models.Flavor{
Code: req.FlavorCode,
},
Parameter: req.Parameters,
},
})
if err != nil {
log.Error("CreateJob failed: %v", err.Error())
return err
}

attach, err := models.GetAttachmentByUUID(req.Uuid)
if err != nil {
log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error())
log.Error("GetAttachmentByUUID(%s) failed:%v", req.DisplayJobName, err.Error())
return err
}
jobID := strconv.FormatInt(jobResult.JobID, 10)
err = models.CreateCloudbrain(&models.Cloudbrain{
Status: TransTrainJobStatus(jobResult.Status),

task := &models.Cloudbrain{
Status: string(models.ModelArtsTrainJobWaiting),
UserID: ctx.User.ID,
RepoID: ctx.Repo.Repository.ID,
JobID: jobID,
JobID: models.TempJobIdPrefix + req.JobName + strconv.Itoa(int(rand.New(rand.NewSource(time.Now().UnixNano())).Int31n(100000))),
JobName: req.JobName,
DisplayJobName: req.DisplayJobName,
JobType: string(models.JobTypeInference),
Type: models.TypeCloudBrainTwo,
VersionID: jobResult.VersionID,
VersionName: jobResult.VersionName,
Uuid: req.Uuid,
DatasetName: attach.Name,
CommitID: req.CommitID,
@@ -767,13 +751,74 @@ func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (e
ResultUrl: req.ResultUrl,
CreatedUnix: createTime,
UpdatedUnix: createTime,
})
}

err = models.CreateCloudbrain(task)
if err != nil {
log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
return err
}
notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateInferenceTask)

jobResult, err := createInferenceJob(models.CreateInferenceJobParams{
JobName: req.JobName,
Description: req.Description,
InfConfig: models.InfConfig{
WorkServerNum: req.WorkServerNumber,
AppUrl: req.CodeObsPath,
BootFileUrl: req.BootFileUrl,
DataUrl: req.DataUrl,
EngineID: req.EngineID,
// TrainUrl: req.TrainUrl,
LogUrl: req.LogUrl,
PoolID: req.PoolID,
CreateVersion: true,
Flavor: models.Flavor{
Code: req.FlavorCode,
},
Parameter: req.Parameters,
},
})
if err != nil {
log.Error("createTrainJob failed: %v", err.Error())
if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
err = models.InsertCloudbrainTemp(&models.CloudbrainTemp{
CloudbrainID: task.ID,
Status: models.JobStatusTemp,
Type: task.Type,
JobName: task.JobName,
JobType: task.JobType,
})
if err != nil {
log.Error("InsertCloudbrainTemp failed: %v", err.Error())
return err
}
} else {
task.Status = string(models.ModelArtsTrainJobFailed)
errTemp := models.UpdateJob(task)
if errTemp != nil {
log.Error("UpdateJob failed: %v", errTemp.Error())
}
errTemp = models.DeleteJob(task)
if errTemp != nil {
log.Error("DeleteJob failed: %v", errTemp.Error())
}
return err
}
} else {
task.Status = TransTrainJobStatus(jobResult.Status)
task.JobID = strconv.FormatInt(jobResult.JobID, 10)
task.VersionID = jobResult.VersionID
task.VersionName = jobResult.VersionName
err = models.UpdateJob(task)
if err != nil {
log.Error("UpdateJob failed: %v", err.Error())
return err
}
}

notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, task.JobID, req.DisplayJobName, models.ActionCreateInferenceTask)

return nil
}

@@ -799,3 +844,194 @@ func GetNotebookImageName(imageId string) (string, error) {

return imageName, nil
}

func HandleTrainJobInfo(task *models.Cloudbrain) error {
if isTempJob(task.JobID, task.Status) {
if task.VersionCount > VersionCountOne {
//multi version
result, err := GetTrainJobVersionList(1000, 1, strings.TrimPrefix(task.JobID, models.TempJobIdPrefix))
if err != nil {
log.Error("GetTrainJobVersionList failed:%v", err)
return err
}

if result != nil {
if strconv.FormatInt(result.JobID, 10) == task.JobID && result.JobName == task.JobName {
if result.VersionCount == int64(task.VersionCount) {
log.Info("find the record(%s)", task.DisplayJobName)
task.Status = TransTrainJobStatus(result.JobVersionList[0].IntStatus)
task.VersionName = result.JobVersionList[0].VersionName
task.VersionID = result.JobVersionList[0].VersionID

err = models.UpdateJob(task)
if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
return err
}
temp, err := models.GetCloudbrainTempByCloudbrainID(task.ID)
if err != nil {
log.Error("no such temp record(%s):%v", task.DisplayJobName, err.Error())
} else {
err = models.DeleteCloudbrainTemp(temp)
if err != nil {
log.Error("DeleteCloudbrainTemp(%s) failed:%v", task.DisplayJobName, err)
}
}

return nil
} else {
log.Error("can not find the record(%s) until now", task.DisplayJobName)
}
} else {
log.Error("can not find the record(%s) until now", task.DisplayJobName)
}
}
} else {
//inference or one version
result, err := GetTrainJobList(1000, 1, "create_time", "desc", task.JobName)
if err != nil {
log.Error("GetTrainJobList failed:%v", err)
return err
}

if result != nil {
for _, job := range result.JobList {
if task.JobName == job.JobName {
log.Info("find the record(%s)", task.DisplayJobName)
task.Status = TransTrainJobStatus(job.IntStatus)
task.JobID = strconv.FormatInt(job.JobID, 10)

err = models.UpdateJob(task)
if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err)
return err
}
temp, err := models.GetCloudbrainTempByCloudbrainID(task.ID)
if err != nil {
log.Error("no such temp record(%s):%v", task.DisplayJobName, err.Error())
return err
}
err = models.DeleteCloudbrainTemp(temp)
if err != nil {
log.Error("DeleteCloudbrainTemp(%s) failed:%v", task.DisplayJobName, err)
return err
}
return nil
}
}
}

}
} else {
//normal
result, err := GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
if err != nil {
log.Error("GetTrainJob(%s) failed:%v", task.DisplayJobName, err)
return err
}

if result != nil {
task.Status = TransTrainJobStatus(result.IntStatus)
task.Duration = result.Duration / 1000
task.TrainJobDuration = result.TrainJobDuration

if task.StartTime == 0 && result.StartTime > 0 {
task.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
}
task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
task.EndTime = task.StartTime.Add(task.Duration)
}
task.CorrectCreateUnix()
err = models.UpdateJob(task)
if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
return err
}
}
}

return nil
}

func HandleNotebookInfo(task *models.Cloudbrain) error {
if isTempJob(task.JobID, task.Status) {
result, err := GetNotebookList(1000, 0, "createTime", "DESC", task.JobName)
if err != nil {
log.Error("GetNotebookList failed:%v", err)
return err
}

if result != nil {
count, err := models.GetCloudbrainCountByJobName(task.JobName, task.JobType)
if err != nil {
log.Error("GetCloudbrainCountByJobName failed:%v", err)
return err
}

if len(result.NotebookList) == count {
if result.NotebookList[0].JobName == task.JobName {
log.Info("find the record(%s)", task.DisplayJobName)
task.Status = result.NotebookList[0].Status
task.JobID = result.NotebookList[0].JobID

err = models.UpdateJob(task)
if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
return err
}
temp, err := models.GetCloudbrainTempByCloudbrainID(task.ID)
if err != nil {
log.Error("no such temp record(%s):%v", task.DisplayJobName, err.Error())
return err
}
err = models.DeleteCloudbrainTemp(temp)
if err != nil {
log.Error("DeleteCloudbrainTemp(%s) failed:%v", task.DisplayJobName, err)
return err
}
return nil
} else {
log.Error("can not find the record(%s) until now", task.DisplayJobName)
}
} else {
log.Error("can not find the record(%s) until now", task.DisplayJobName)
}
} else {
log.Error("can not find the record(%s) until now", task.DisplayJobName)
}
} else {
//normal
result, err := GetNotebook2(task.JobID)
if err != nil {
log.Error("GetNotebook2(%s) failed:%v", task.DisplayJobName, err)
return err
}

if result != nil {
task.Status = result.Status
if task.StartTime == 0 && result.Lease.UpdateTime > 0 {
task.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000)
}
if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) {
task.EndTime = timeutil.TimeStampNow()
}
task.CorrectCreateUnix()
task.ComputeAndSetDuration()
err = models.UpdateJob(task)
if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err)
return err
}
}
}

return nil
}

func isTempJob(jobID, status string) bool {
if (strings.HasPrefix(jobID, models.TempJobIdPrefix) && status == string(models.ModelArtsTrainJobWaiting)) || status == models.JobStatusTemp {
return true
}
return false
}

+ 171
- 21
modules/modelarts/resty.go View File

@@ -37,6 +37,7 @@ const (
NotebookNotFound = "ModelArts.6404"
NotebookNoPermission = "ModelArts.6407"
NotebookInvalid = "ModelArts.6400"
UnknownErrorPrefix = "UNKNOWN:"
)

func getRestyClient() *resty.Client {
@@ -298,6 +299,10 @@ sendjob:
return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error())
}

if res.StatusCode() == http.StatusBadGateway {
return &result, fmt.Errorf(UnknownErrorPrefix+"createNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
}

if len(response.ErrorCode) != 0 {
log.Error("ManageNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
if response.ErrorCode == modelartsIllegalToken && retry < 1 {
@@ -547,9 +552,6 @@ sendjob:
return nil, fmt.Errorf("resty create train-job: %s", err)
}

req, _ := json.Marshal(createJobParams)
log.Info("%s", req)

if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
retry++
_ = getToken()
@@ -563,17 +565,21 @@ sendjob:
return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
}
log.Error("createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
BootFileErrorMsg := "Invalid OBS path '" + createJobParams.Config.BootFileUrl + "'."
DataSetErrorMsg := "Invalid OBS path '" + createJobParams.Config.DataUrl + "'."
if temp.ErrorMsg == BootFileErrorMsg {
bootFileErrorMsg := "Invalid OBS path '" + createJobParams.Config.BootFileUrl + "'."
dataSetErrorMsg := "Invalid OBS path '" + createJobParams.Config.DataUrl + "'."
if temp.ErrorMsg == bootFileErrorMsg {
log.Error("启动文件错误!createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
return &result, fmt.Errorf("启动文件错误!")
}
if temp.ErrorMsg == DataSetErrorMsg {
if temp.ErrorMsg == dataSetErrorMsg {
log.Error("数据集错误!createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
return &result, fmt.Errorf("数据集错误!")
}
return &result, fmt.Errorf("createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
if res.StatusCode() == http.StatusBadGateway {
return &result, fmt.Errorf(UnknownErrorPrefix+"createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
} else {
return &result, fmt.Errorf("createTrainJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
}
}

if !result.IsSuccess {
@@ -603,9 +609,6 @@ sendjob:
return nil, fmt.Errorf("resty create train-job version: %s", err)
}

req, _ := json.Marshal(createJobVersionParams)
log.Info("%s", req)

if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
retry++
_ = getToken()
@@ -618,17 +621,23 @@ sendjob:
log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
}
BootFileErrorMsg := "Invalid OBS path '" + createJobVersionParams.Config.BootFileUrl + "'."
DataSetErrorMsg := "Invalid OBS path '" + createJobVersionParams.Config.DataUrl + "'."
if temp.ErrorMsg == BootFileErrorMsg {

log.Error("createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
bootFileErrorMsg := "Invalid OBS path '" + createJobVersionParams.Config.BootFileUrl + "'."
dataSetErrorMsg := "Invalid OBS path '" + createJobVersionParams.Config.DataUrl + "'."
if temp.ErrorMsg == bootFileErrorMsg {
log.Error("启动文件错误!createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
return &result, fmt.Errorf("启动文件错误!")
}
if temp.ErrorMsg == DataSetErrorMsg {
if temp.ErrorMsg == dataSetErrorMsg {
log.Error("数据集错误!createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
return &result, fmt.Errorf("数据集错误!")
}
return &result, fmt.Errorf("createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
if res.StatusCode() == http.StatusBadGateway {
return &result, fmt.Errorf(UnknownErrorPrefix+"createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
} else {
return &result, fmt.Errorf("createTrainJobVersion failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
}
}

if !result.IsSuccess {
@@ -761,9 +770,6 @@ sendjob:
goto sendjob
}

//temp, _ := json.Marshal(req)
//log.Info("%s", temp)

if res.StatusCode() != http.StatusOK {
var temp models.ErrorResult
if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
@@ -1172,7 +1178,11 @@ sendjob:
log.Error("数据集错误!createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
return &result, fmt.Errorf("数据集错误!")
}
return &result, fmt.Errorf("createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
if res.StatusCode() == http.StatusBadGateway {
return &result, fmt.Errorf(UnknownErrorPrefix+"createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
} else {
return &result, fmt.Errorf("createInferenceJob failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
}
}

if !result.IsSuccess {
@@ -1212,7 +1222,11 @@ sendjob:
err = json.Unmarshal(res.Body(), &response)
if err != nil {
log.Error("json.Unmarshal failed: %s", err.Error())
return &result, fmt.Errorf("son.Unmarshal failed: %s", err.Error())
return &result, fmt.Errorf("json.Unmarshal failed: %s", err.Error())
}

if res.StatusCode() == http.StatusBadGateway {
return &result, fmt.Errorf(UnknownErrorPrefix+"createNotebook2 failed(%s): %s", response.ErrorCode, response.ErrorMsg)
}

if len(response.ErrorCode) != 0 {
@@ -1271,3 +1285,139 @@ sendjob:

return &result, nil
}

func GetTrainJobList(perPage, page int, sortBy, order, searchContent string) (*models.GetTrainJobListResult, error) {
checkSetting()
client := getRestyClient()
var result models.GetTrainJobListResult

retry := 0

sendjob:
res, err := client.R().
SetQueryParams(map[string]string{
"per_page": strconv.Itoa(perPage),
"page": strconv.Itoa(page),
"sortBy": sortBy,
"order": order,
"search_content": searchContent,
}).
SetAuthToken(TOKEN).
SetResult(&result).
Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob)

if err != nil {
return nil, fmt.Errorf("resty GetTrainJobList: %v", err)
}

if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
retry++
_ = getToken()
goto sendjob
}

if res.StatusCode() != http.StatusOK {
var temp models.ErrorResult
if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
}
log.Error("GetTrainJobList failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
return &result, fmt.Errorf(temp.ErrorMsg)
}

if !result.IsSuccess {
log.Error("GetTrainJobList failed(%s): %s", result.ErrorCode, result.ErrorMsg)
return &result, fmt.Errorf(result.ErrorMsg)
}

return &result, nil
}

func GetTrainJobVersionList(perPage, page int, jobID string) (*models.GetTrainJobVersionListResult, error) {
checkSetting()
client := getRestyClient()
var result models.GetTrainJobVersionListResult

retry := 0

sendjob:
res, err := client.R().
SetQueryParams(map[string]string{
"per_page": strconv.Itoa(perPage),
"page": strconv.Itoa(page),
}).
SetAuthToken(TOKEN).
SetResult(&result).
Get(HOST + "/v1/" + setting.ProjectID + urlTrainJob + "/" + jobID + "/versions")

if err != nil {
return nil, fmt.Errorf("resty GetTrainJobVersionList: %v", err)
}

if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
retry++
_ = getToken()
goto sendjob
}

if res.StatusCode() != http.StatusOK {
var temp models.ErrorResult
if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
}
log.Error("GetTrainJobVersionList failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
return &result, fmt.Errorf(temp.ErrorMsg)
}

if !result.IsSuccess {
log.Error("GetTrainJobVersionList failed(%s): %s", result.ErrorCode, result.ErrorMsg)
return &result, fmt.Errorf(result.ErrorMsg)
}

return &result, nil
}

func GetNotebookList(limit, offset int, sortBy, order, searchContent string) (*models.GetNotebookListResult, error) {
checkSetting()
client := getRestyClient()
var result models.GetNotebookListResult

retry := 0

sendjob:
res, err := client.R().
SetQueryParams(map[string]string{
"limit": strconv.Itoa(limit),
"offset": strconv.Itoa(offset),
"name": searchContent,
"sort_key": sortBy,
"sort_dir": order,
}).
SetAuthToken(TOKEN).
SetResult(&result).
Get(HOST + "/v1/" + setting.ProjectID + urlNotebook2)

if err != nil {
return nil, fmt.Errorf("resty GetNotebookList: %v", err)
}

if res.StatusCode() == http.StatusUnauthorized && retry < 1 {
retry++
_ = getToken()
goto sendjob
}

if res.StatusCode() != http.StatusOK {
var temp models.ErrorResult
if err = json.Unmarshal([]byte(res.String()), &temp); err != nil {
log.Error("json.Unmarshal failed(%s): %v", res.String(), err.Error())
return &result, fmt.Errorf("json.Unmarshal failed(%s): %v", res.String(), err.Error())
}
log.Error("GetNotebookList failed(%d):%s(%s)", res.StatusCode(), temp.ErrorCode, temp.ErrorMsg)
return &result, fmt.Errorf(temp.ErrorMsg)
}

return &result, nil
}

+ 2
- 3
routers/api/v1/api.go View File

@@ -938,11 +938,10 @@ func RegisterRoutes(m *macaron.Macaron) {
}, reqRepoReader(models.UnitTypeModelManage))
m.Group("/modelarts", func() {
m.Group("/notebook", func() {
//m.Get("/:jobid", repo.GetModelArtsNotebook)
m.Get("/:id", repo.GetModelArtsNotebook2)
})
m.Group("/train-job", func() {
m.Group("/:jobid", func() {
m.Group("/:id", func() {
m.Get("", repo.GetModelArtsTrainJobVersion)
m.Get("/log", repo.TrainJobGetLog)
m.Post("/del_version", repo.DelTrainJobVersion)
@@ -952,7 +951,7 @@ func RegisterRoutes(m *macaron.Macaron) {
})
})
m.Group("/inference-job", func() {
m.Group("/:jobid", func() {
m.Group("/:id", func() {
m.Get("", repo.GetModelArtsInferenceJob)
m.Get("/log", repo.TrainJobGetLog)
m.Post("/del_version", repo.DelTrainJobVersion)


+ 66
- 173
routers/api/v1/repo/modelarts.go View File

@@ -25,105 +25,27 @@ import (
routerRepo "code.gitea.io/gitea/routers/repo"
)

func GetModelArtsNotebook(ctx *context.APIContext) {
var (
err error
)

jobID := ctx.Params(":jobid")
repoID := ctx.Repo.Repository.ID
job, err := models.GetRepoCloudBrainByJobID(repoID, jobID)
if err != nil {
ctx.NotFound(err)
return
}
result, err := modelarts.GetJob(jobID)
if err != nil {
ctx.NotFound(err)
return
}

job.Status = result.Status
err = models.UpdateJob(job)
if err != nil {
log.Error("UpdateJob failed:", err)
}

ctx.JSON(http.StatusOK, map[string]interface{}{
"JobID": jobID,
"JobStatus": result.Status,
})

}

func GetModelArtsNotebook2(ctx *context.APIContext) {
var (
err error
)

ID := ctx.Params(":id")
job, err := models.GetCloudbrainByID(ID)
id := ctx.Params(":id")
job, err := models.GetCloudbrainByID(id)
if err != nil {
ctx.NotFound(err)
return
}
result, err := modelarts.GetNotebook2(job.JobID)
err = modelarts.HandleNotebookInfo(job)
if err != nil {
ctx.NotFound(err)
return
}
if job.StartTime == 0 && result.Lease.UpdateTime > 0 {
job.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000)
}
job.Status = result.Status
if job.EndTime == 0 && models.IsModelArtsDebugJobTerminal(job.Status) {
job.EndTime = timeutil.TimeStampNow()
}
job.CorrectCreateUnix()
job.ComputeAndSetDuration()
err = models.UpdateJob(job)
if err != nil {
log.Error("UpdateJob failed:", err)
}

ctx.JSON(http.StatusOK, map[string]interface{}{
"ID": ID,
"ID": id,
"JobName": job.JobName,
"JobStatus": result.Status,
})

}

func GetModelArtsTrainJob(ctx *context.APIContext) {
var (
err error
)

jobID := ctx.Params(":jobid")
repoID := ctx.Repo.Repository.ID
job, err := models.GetRepoCloudBrainByJobID(repoID, jobID)
if err != nil {
ctx.NotFound(err)
return
}
result, err := modelarts.GetTrainJob(jobID, strconv.FormatInt(job.VersionID, 10))
if err != nil {
ctx.NotFound(err)
return
}

job.Status = modelarts.TransTrainJobStatus(result.IntStatus)
job.Duration = result.Duration
job.TrainJobDuration = result.TrainJobDuration
err = models.UpdateJob(job)
if err != nil {
log.Error("UpdateJob failed:", err)
}

ctx.JSON(http.StatusOK, map[string]interface{}{
"JobID": jobID,
"JobStatus": job.Status,
"JobDuration": job.Duration,
"JobStatus": job.Status,
})

}
@@ -134,9 +56,8 @@ func GetModelArtsTrainJobVersion(ctx *context.APIContext) {
aiCenterName string
)

jobID := ctx.Params(":jobid")
versionName := ctx.Query("version_name")
job, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
id := ctx.Params(":id")
job, err := models.GetCloudbrainByID(id)
if err != nil {
ctx.NotFound(err)
return
@@ -174,29 +95,13 @@ func GetModelArtsTrainJobVersion(ctx *context.APIContext) {
}
}
} else if job.Type == models.TypeCloudBrainTwo {
result, err := modelarts.GetTrainJob(jobID, strconv.FormatInt(job.VersionID, 10))
err := modelarts.HandleTrainJobInfo(job)
if err != nil {
ctx.NotFound(err)
return
}

if job.StartTime == 0 && result.StartTime > 0 {
job.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
}
job.Status = modelarts.TransTrainJobStatus(result.IntStatus)
job.Duration = result.Duration / 1000
job.TrainJobDuration = models.ConvertDurationToStr(job.Duration)

if job.EndTime == 0 && models.IsTrainJobTerminal(job.Status) && job.StartTime > 0 {
job.EndTime = job.StartTime.Add(job.Duration)
}
job.CorrectCreateUnix()
err = models.UpdateTrainJobVersion(job)
if err != nil {
log.Error("UpdateJob failed:", err)
}
} else if job.Type == models.TypeC2Net {
result, err := grampus.GetJob(jobID)
result, err := grampus.GetJob(job.JobID)
if err != nil {
log.Error("GetJob(%s) failed:%v", job.JobName, err)
ctx.NotFound(err)
@@ -235,7 +140,8 @@ func GetModelArtsTrainJobVersion(ctx *context.APIContext) {
}

ctx.JSON(http.StatusOK, map[string]interface{}{
"JobID": jobID,
"JobID": job.JobID,
"ID": id,
"JobStatus": job.Status,
"JobDuration": job.TrainJobDuration,
"AiCenter": aiCenterName,
@@ -317,7 +223,7 @@ func TrainJobGetLog(ctx *context.APIContext) {
err error
)

var jobID = ctx.Params(":jobid")
var id = ctx.Params(":id")
var versionName = ctx.Query("version_name")
var baseLine = ctx.Query("base_line")
var order = ctx.Query("order")
@@ -335,14 +241,19 @@ func TrainJobGetLog(ctx *context.APIContext) {
return
}

task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
temp, err := models.GetCloudbrainByID(id)
if err != nil {
log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
log.Error("GetCloudbrainByID(%s) failed:%v", id, err.Error())
return
}
resultLogFile, result, err := trainJobGetLogContent(jobID, task.VersionID, baseLine, order, lines_int)
task, err := models.GetCloudbrainByJobIDAndVersionName(temp.JobID, versionName)
if err != nil {
log.Error("trainJobGetLog(%s) failed:%v", jobID, err.Error())
log.Error("GetCloudbrainByID(%s) failed:%v", id, err.Error())
return
}
resultLogFile, result, err := trainJobGetLogContent(task.JobID, task.VersionID, baseLine, order, lines_int)
if err != nil {
log.Error("trainJobGetLog(%s) failed:%v", task.JobID, err.Error())
// ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
return
}
@@ -359,7 +270,7 @@ func TrainJobGetLog(ctx *context.APIContext) {
ctx.Data["log_file_name"] = resultLogFile.LogFileList[0]

ctx.JSON(http.StatusOK, map[string]interface{}{
"JobID": jobID,
"ID": id,
"LogFileName": resultLogFile.LogFileList[0],
"StartLine": result.StartLine,
"EndLine": result.EndLine,
@@ -391,17 +302,16 @@ func DelTrainJobVersion(ctx *context.APIContext) {
err error
)

var jobID = ctx.Params(":jobid")
var versionName = ctx.Query("version_name")
task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
var id = ctx.Params(":id")
task, err := models.GetCloudbrainByID(id)
if err != nil {
log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error())
log.Error("GetCloudbrainByID(%s) failed:%v", id, err.Error())
ctx.NotFound(err)
return
}

//删除modelarts上的记录
_, err = modelarts.DelTrainJobVersion(jobID, strconv.FormatInt(task.VersionID, 10))
_, err = modelarts.DelTrainJobVersion(task.JobID, strconv.FormatInt(task.VersionID, 10))
if err != nil {
log.Error("DelTrainJobVersion(%s) failed:%v", task.JobName, err.Error())
ctx.NotFound(err)
@@ -424,7 +334,7 @@ func DelTrainJobVersion(ctx *context.APIContext) {
RepoID: repo.ID,
Type: models.TypeCloudBrainTwo,
JobTypes: jobTypes,
JobID: jobID,
JobID: task.JobID,
})
if err != nil {
ctx.ServerError("get VersionListCount failed", err)
@@ -433,13 +343,13 @@ func DelTrainJobVersion(ctx *context.APIContext) {
if VersionListCount > 0 {
// 判断当前删掉的任务是否是最新版本,若是,将排序后的TotalVersionCount置为删掉的最新版本的TotalVersionCount,若不是,按时间排序后的版本列表的第一个版本设置为最新版本,TotalVersionCount不变
if task.IsLatestVersion == modelarts.IsLatestVersion {
err = models.SetVersionCountAndLatestVersion(jobID, VersionTaskList[0].Cloudbrain.VersionName, VersionListCount, modelarts.IsLatestVersion, task.TotalVersionCount)
err = models.SetVersionCountAndLatestVersion(task.JobName, VersionTaskList[0].Cloudbrain.VersionName, VersionListCount, modelarts.IsLatestVersion, task.TotalVersionCount)
if err != nil {
ctx.ServerError("UpdateJobVersionCount failed", err)
return
}
} else {
err = models.SetVersionCountAndLatestVersion(jobID, VersionTaskList[0].VersionName, VersionListCount, modelarts.IsLatestVersion, VersionTaskList[0].Cloudbrain.TotalVersionCount)
err = models.SetVersionCountAndLatestVersion(task.JobName, VersionTaskList[0].VersionName, VersionListCount, modelarts.IsLatestVersion, VersionTaskList[0].Cloudbrain.TotalVersionCount)
if err != nil {
ctx.ServerError("UpdateJobVersionCount failed", err)
return
@@ -450,8 +360,8 @@ func DelTrainJobVersion(ctx *context.APIContext) {
}

ctx.JSON(http.StatusOK, map[string]interface{}{
"JobID": jobID,
"VersionName": versionName,
"ID": id,
"VersionName": task.VersionName,
"StatusOK": 0,
"VersionListCount": VersionListCount,
})
@@ -461,23 +371,23 @@ func StopTrainJobVersion(ctx *context.APIContext) {
var (
err error
)
var jobID = ctx.Params(":jobid")
var versionName = ctx.Query("version_name")
task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
var id = ctx.Params(":id")
task, err := models.GetCloudbrainByID(id)
if err != nil {
log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error())
log.Error("GetCloudbrainByID(%s) failed:%v", id, err.Error())
ctx.NotFound(err)
return
}

_, err = modelarts.StopTrainJob(jobID, strconv.FormatInt(task.VersionID, 10))
_, err = modelarts.StopTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
if err != nil {
log.Error("StopTrainJob(%s) failed:%v", task.JobName, err.Error())
return
}

ctx.JSON(http.StatusOK, map[string]interface{}{
"JobID": jobID,
"VersionName": versionName,
"ID": id,
"VersionName": task.VersionName,
"StatusOK": 0,
})
}
@@ -487,19 +397,19 @@ func ModelList(ctx *context.APIContext) {
err error
)

var jobID = ctx.Params(":jobid")
var versionName = ctx.Query("version_name")
parentDir := ctx.Query("parentDir")
dirArray := strings.Split(parentDir, "/")
task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
var id = ctx.Params(":id")
task, err := models.GetCloudbrainByID(id)
if err != nil {
log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error())
log.Error("GetCloudbrainByID(%s) failed:%v", id, err.Error())
ctx.NotFound(err)
return
}

var fileInfos []storage.FileInfo
if task.ComputeResource == models.NPUResource {
fileInfos, err = storage.GetObsListObject(task.JobName, "output/", parentDir, versionName)
fileInfos, err = storage.GetObsListObject(task.JobName, "output/", parentDir, task.VersionName)
if err != nil {
log.Info("get TrainJobListModel failed:", err)
ctx.ServerError("GetObsListObject:", err)
@@ -522,8 +432,8 @@ func ModelList(ctx *context.APIContext) {
}

ctx.JSON(http.StatusOK, map[string]interface{}{
"JobID": jobID,
"VersionName": versionName,
"ID": id,
"VersionName": task.VersionName,
"StatusOK": 0,
"Path": dirArray,
"Dirs": fileInfos,
@@ -537,35 +447,20 @@ func GetModelArtsInferenceJob(ctx *context.APIContext) {
err error
)

jobID := ctx.Params(":jobid")
job, err := models.GetCloudbrainByJobID(jobID)
id := ctx.Params(":id")
job, err := models.GetCloudbrainByID(id)
if err != nil {
ctx.NotFound(err)
return
}
result, err := modelarts.GetTrainJob(jobID, strconv.FormatInt(job.VersionID, 10))
err = modelarts.HandleTrainJobInfo(job)
if err != nil {
ctx.NotFound(err)
return
}
if job.StartTime == 0 && result.StartTime > 0 {
job.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
}
job.Status = modelarts.TransTrainJobStatus(result.IntStatus)
job.Duration = result.Duration / 1000
job.TrainJobDuration = models.ConvertDurationToStr(job.Duration)

if job.EndTime == 0 && models.IsTrainJobTerminal(job.Status) && job.StartTime > 0 {
job.EndTime = job.StartTime.Add(job.Duration)
}
job.CorrectCreateUnix()
err = models.UpdateInferenceJob(job)
if err != nil {
log.Error("UpdateJob failed:", err)
}

ctx.JSON(http.StatusOK, map[string]interface{}{
"JobID": jobID,
"ID": id,
"JobStatus": job.Status,
"JobDuration": job.TrainJobDuration,
})
@@ -577,16 +472,15 @@ func ResultList(ctx *context.APIContext) {
err error
)

var jobID = ctx.Params(":jobid")
var versionName = ctx.Query("version_name")
var id = ctx.Params(":id")
parentDir := ctx.Query("parentDir")
dirArray := strings.Split(parentDir, "/")
task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
task, err := models.GetCloudbrainByID(id)
if err != nil {
log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error())
log.Error("GetCloudbrainByID(%s) failed:%v", id, err.Error())
return
}
models, err := storage.GetObsListObject(task.JobName, "result/", parentDir, versionName)
models, err := storage.GetObsListObject(task.JobName, "result/", parentDir, task.VersionName)
if err != nil {
log.Info("get TrainJobListModel failed:", err)
ctx.ServerError("GetObsListObject:", err)
@@ -594,8 +488,8 @@ func ResultList(ctx *context.APIContext) {
}

ctx.JSON(http.StatusOK, map[string]interface{}{
"JobID": jobID,
"VersionName": versionName,
"ID": id,
"VersionName": task.VersionName,
"StatusOK": 0,
"Path": dirArray,
"Dirs": models,
@@ -609,38 +503,37 @@ func TrainJobGetMetricStatistic(ctx *context.APIContext) {
err error
)

var jobID = ctx.Params(":jobid")
var versionName = ctx.Query("version_name")
var id = ctx.Params(":id")

result, err := trainJobGetMetricStatistic(jobID, versionName)
result, err := trainJobGetMetricStatistic(id)
if err != nil {
log.Error("trainJobGetMetricStatistic(%s) failed:%v", jobID, err.Error())
log.Error("trainJobGetMetricStatistic(%s) failed:%v", id, err.Error())
return
}

ctx.JSON(http.StatusOK, map[string]interface{}{
"JobID": jobID,
"ID": id,
"Interval": result.Interval,
"MetricsInfo": result.MetricsInfo,
})
}

func trainJobGetMetricStatistic(jobID string, versionName string) (*models.GetTrainJobMetricStatisticResult, error) {
task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
func trainJobGetMetricStatistic(id string) (*models.GetTrainJobMetricStatisticResult, error) {
task, err := models.GetCloudbrainByID(id)
if err != nil {
log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
log.Error("GetCloudbrainByID(%s) failed:%v", id, err.Error())
return nil, err
}

resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, strconv.FormatInt(task.VersionID, 10))
resultLogFile, err := modelarts.GetTrainJobLogFileNames(task.JobID, strconv.FormatInt(task.VersionID, 10))
if err != nil {
log.Error("GetTrainJobLogFileNames(%s) failed:%v", jobID, err.Error())
log.Error("GetTrainJobLogFileNames(%s) failed:%v", task.JobID, err.Error())
return nil, err
}

result, err := modelarts.GetTrainJobMetricStatistic(jobID, strconv.FormatInt(task.VersionID, 10), resultLogFile.LogFileList[0])
result, err := modelarts.GetTrainJobMetricStatistic(task.JobID, strconv.FormatInt(task.VersionID, 10), resultLogFile.LogFileList[0])
if err != nil {
log.Error("GetTrainJobMetricStatistic(%s) failed:%v", jobID, err.Error())
log.Error("GetTrainJobMetricStatistic(%s) failed:%v", task.JobID, err.Error())
return nil, err
}



+ 10
- 48
routers/repo/cloudbrain.go View File

@@ -373,7 +373,6 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
}
}


func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBrainInferencForm) {
ctx.Data["PageIsCloudBrain"] = true
displayJobName := form.DisplayJobName
@@ -494,6 +493,7 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBra
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/inference-job")

}

/**
检查用户传输的参数是否符合专属资源池
*/
@@ -1214,7 +1214,7 @@ func StopJobs(cloudBrains []*models.Cloudbrain) {
Action: models.ActionStop,
}
err := retry(3, time.Second*30, func() error {
_, err := modelarts.ManageNotebook(taskInfo.JobID, param)
_, err := modelarts.ManageNotebook2(taskInfo.JobID, param)
return err
})
logErrorAndUpdateJobStatus(err, taskInfo)
@@ -1765,62 +1765,24 @@ func SyncCloudbrainStatus() {
}
} else if task.Type == models.TypeCloudBrainTwo {
if task.JobType == string(models.JobTypeDebug) {
//result, err := modelarts.GetJob(task.JobID)
result, err := modelarts.GetNotebook2(task.JobID)
err := modelarts.HandleNotebookInfo(task)
if err != nil {
log.Error("GetJob(%s) failed:%v", task.JobName, err)
log.Error("HandleNotebookInfo(%s) failed:%v", task.DisplayJobName, err)
continue
}

if result != nil {
task.Status = result.Status
if task.StartTime == 0 && result.Lease.UpdateTime > 0 {
task.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000)
}
if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) {
task.EndTime = timeutil.TimeStampNow()
}
task.CorrectCreateUnix()
task.ComputeAndSetDuration()
err = models.UpdateJob(task)
if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
continue
}
}
} else if task.JobType == string(models.JobTypeTrain) || task.JobType == string(models.JobTypeInference) {
result, err := modelarts.GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
err := modelarts.HandleTrainJobInfo(task)
if err != nil {
log.Error("GetTrainJob(%s) failed:%v", task.JobName, err)
log.Error("HandleTrainJobInfo(%s) failed:%v", task.DisplayJobName, err)
continue
}

if result != nil {
task.Status = modelarts.TransTrainJobStatus(result.IntStatus)
task.Duration = result.Duration / 1000
task.TrainJobDuration = result.TrainJobDuration

if task.StartTime == 0 && result.StartTime > 0 {
task.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
}
task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
task.EndTime = task.StartTime.Add(task.Duration)
}
task.CorrectCreateUnix()
err = models.UpdateJob(task)
if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
continue
}
}
} else {
log.Error("task.JobType(%s) is error:%s", task.JobName, task.JobType)
log.Error("task.JobType(%s) is error:%s", task.DisplayJobName, task.JobType)
}
} else if task.Type == models.TypeC2Net {
result, err := grampus.GetJob(task.JobID)
if err != nil {
log.Error("GetTrainJob(%s) failed:%v", task.JobName, err)
log.Error("GetTrainJob(%s) failed:%v", task.DisplayJobName, err)
continue
}

@@ -1841,12 +1803,12 @@ func SyncCloudbrainStatus() {
task.CorrectCreateUnix()
err = models.UpdateJob(task)
if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err)
continue
}
}
} else {
log.Error("task.Type(%s) is error:%d", task.JobName, task.Type)
log.Error("task.Type(%s) is error:%d", task.DisplayJobName, task.Type)
}
}



+ 2
- 2
routers/repo/grampus.go View File

@@ -336,7 +336,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
EngineName: image,
DatasetName: attachment.Name,
IsLatestVersion: modelarts.IsLatestVersion,
VersionCount: modelarts.VersionCount,
VersionCount: modelarts.VersionCountOne,
WorkServerNumber: 1,
}

@@ -386,7 +386,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
branchName := form.BranchName
isLatestVersion := modelarts.IsLatestVersion
flavorName := form.FlavorName
versionCount := modelarts.VersionCount
versionCount := modelarts.VersionCountOne
engineName := form.EngineName

if !jobNamePattern.MatchString(displayJobName) {


+ 161
- 233
routers/repo/modelarts.go View File

@@ -7,6 +7,7 @@ import (
"fmt"
"io"
"io/ioutil"
"math/rand"
"net/http"
"os"
"path"
@@ -262,28 +263,15 @@ func NotebookShow(ctx *context.Context) {
return
}

result, err := modelarts.GetNotebook2(task.JobID)
if err != nil {
log.Error("GET job error", err.Error())
ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
return
}

if result != nil {
if task.DeletedAt.IsZero() { //normal record
if task.Status != result.Status {
task.Status = result.Status
models.ParseAndSetDurationFromModelArtsNotebook(result, task)
err = models.UpdateJob(task)
if err != nil {
log.Error("GET job error", err.Error())
ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
return
}
}
} else { //deleted record

if task.DeletedAt.IsZero() { //normal record
err := modelarts.HandleNotebookInfo(task)
if err != nil {
ctx.Data["error"] = err.Error()
ctx.RenderWithErr(err.Error(), tplModelArtsNotebookShow, nil)
return
}
} else { //deleted record

}

datasetDownload := make([]models.DatasetDownload, 0)
@@ -394,82 +382,141 @@ func NotebookDebug2(ctx *context.Context) {
ctx.Redirect(result.Url + "?token=" + result.Token)
}

func NotebookManage(ctx *context.Context) {
func NotebookRestart(ctx *context.Context) {
var ID = ctx.Params(":id")
var action = ctx.Params(":action")
var resultCode = "0"
var resultCode = "-1"
var errorMsg = ""
var status = ""

task := ctx.Cloudbrain

for {
task, err := models.GetCloudbrainByID(ID)
if err != nil {
log.Error("get task(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "system error"
ctx.CheckWechatBind()
if ctx.Written() {
return
}
if task.Status != string(models.ModelArtsStopped) && task.Status != string(models.ModelArtsStartFailed) && task.Status != string(models.ModelArtsCreateFailed) {
log.Error("the job(%s) is not stopped", task.JobName, ctx.Data["MsgID"])
errorMsg = "the job is not stopped"
break
}

if action == models.ActionStop {
if task.Status != string(models.ModelArtsRunning) {
log.Error("the job(%s) is not running", task.JobName, ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "the job is not running"
count, err := models.GetCloudbrainNotebookCountByUserID(ctx.User.ID)
if err != nil {
log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"])
errorMsg = "system error"
break
} else {
if count >= 1 {
log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
errorMsg = "you have already a running or waiting task, can not create more"
break
}
}

if !ctx.IsSigned || (ctx.User.ID != task.UserID && !ctx.IsUserSiteAdmin() && !ctx.IsUserRepoOwner()) {
log.Error("the user has no right ro stop the job", task.JobName, ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "you have no right to stop the job"
break
}
} else if action == models.ActionRestart {
ctx.CheckWechatBind()
if ctx.Written() {
return
}
if task.Status != string(models.ModelArtsStopped) && task.Status != string(models.ModelArtsStartFailed) && task.Status != string(models.ModelArtsCreateFailed) {
log.Error("the job(%s) is not stopped", task.JobName, ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "the job is not stopped"
break
}
createTime := timeutil.TimeStampNow()
newTask := &models.Cloudbrain{
Status: string(models.ModelArtsTrainJobWaiting),
UserID: task.UserID,
RepoID: task.RepoID,
JobID: models.TempJobIdPrefix + task.JobName + strconv.Itoa(int(rand.New(rand.NewSource(time.Now().UnixNano())).Int31n(100000))),
JobName: task.JobName,
DisplayJobName: task.DisplayJobName,
JobType: task.JobType,
Type: task.Type,
Uuid: task.Uuid,
Image: task.Image,
ComputeResource: task.ComputeResource,
Description: task.Description,
CreatedUnix: createTime,
UpdatedUnix: createTime,
}

if !ctx.IsSigned || (ctx.User.ID != task.UserID && !ctx.IsUserSiteAdmin()) {
log.Error("the user has no right ro restart the job", task.JobName, ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "you have no right to restart the job"
err = models.RestartCloudbrain(task, newTask)
if err != nil {
log.Error("RestartCloudbrain(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
errorMsg = "system error"
break
}

param := models.NotebookAction{
Action: models.ActionStart,
}

res, err := modelarts.ManageNotebook2(task.JobID, param)
if err != nil {
log.Error("ManageNotebook2(%s) failed:%v", task.DisplayJobName, err.Error(), ctx.Data["MsgID"])
if strings.HasPrefix(err.Error(), modelarts.UnknownErrorPrefix) {
log.Info("(%s)unknown error, set temp status", newTask.DisplayJobName)
errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
CloudbrainID: newTask.ID,
Status: models.JobStatusTemp,
Type: newTask.Type,
JobName: newTask.JobName,
JobType: newTask.JobType,
})
if errTemp != nil {
log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
}
} else {
newTask.Status = string(models.ModelArtsTrainJobFailed)
errTemp := models.UpdateJob(newTask)
if errTemp != nil {
log.Error("UpdateJob failed: %v", errTemp.Error())
}
errTemp = models.DeleteJob(newTask)
if errTemp != nil {
log.Error("DeleteJob failed: %v", errTemp.Error())
}
errorMsg = err.Error()
break
}

count, err := models.GetCloudbrainNotebookCountByUserID(ctx.User.ID)
} else {
newTask.Status = res.Status
newTask.JobID = task.JobID
err = models.UpdateJob(newTask)
if err != nil {
log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "system error"
log.Error("UpdateJob failed: %v", err.Error())
errorMsg = err.Error()
break
} else {
if count >= 1 {
log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "you have already a running or waiting task, can not create more"
break
}
}
}

action = models.ActionStart
} else {
log.Error("the action(%s) is illegal", action, ctx.Data["MsgID"])
status = res.Status
resultCode = "0"
notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, strconv.FormatInt(newTask.ID, 10), newTask.DisplayJobName, models.ActionCreateDebugNPUTask)

break
}

ctx.JSON(200, map[string]string{
"result_code": resultCode,
"error_msg": errorMsg,
"status": status,
"id": ID,
})
}

func NotebookStop(ctx *context.Context) {
var ID = ctx.Params(":id")
var resultCode = "0"
var errorMsg = ""
var status = ""

task := ctx.Cloudbrain

for {
if task.Status != string(models.ModelArtsRunning) {
log.Error("the job(%s) is not running", task.JobName, ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "非法操作"
errorMsg = "the job is not running"
break
}

param := models.NotebookAction{
Action: action,
Action: models.ActionStop,
}
createTime := timeutil.TimeStampNow()
res, err := modelarts.ManageNotebook2(task.JobID, param)
if err != nil {
log.Error("ManageNotebook2(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
@@ -482,46 +529,17 @@ func NotebookManage(ctx *context.Context) {
}

status = res.Status
if action == models.ActionStart {
newTask := &models.Cloudbrain{
Status: status,
UserID: task.UserID,
RepoID: task.RepoID,
JobID: task.JobID,
JobName: task.JobName,
DisplayJobName: task.DisplayJobName,
JobType: task.JobType,
Type: task.Type,
Uuid: task.Uuid,
Image: task.Image,
ComputeResource: task.ComputeResource,
Description: task.Description,
CreatedUnix: createTime,
UpdatedUnix: createTime,
}

err = models.RestartCloudbrain(task, newTask)
if err != nil {
log.Error("RestartCloudbrain(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "system error"
break
}
ID = strconv.FormatInt(newTask.ID, 10)
notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, ID, task.DisplayJobName, models.ActionCreateDebugNPUTask)
} else {
task.Status = res.Status
if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) {
task.EndTime = timeutil.TimeStampNow()
}
task.ComputeAndSetDuration()
err = models.UpdateJob(task)
if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "system error"
break
}
task.Status = res.Status
if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) {
task.EndTime = timeutil.TimeStampNow()
}
task.ComputeAndSetDuration()
err = models.UpdateJob(task)
if err != nil {
log.Error("UpdateJob(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
resultCode = "-1"
errorMsg = "system error"
break
}

break
@@ -792,18 +810,11 @@ func TrainJobNewVersion(ctx *context.Context) {

func trainJobNewVersionDataPrepare(ctx *context.Context) error {
ctx.Data["PageIsCloudBrain"] = true
var jobID = ctx.Params(":jobid")
var versionName = ctx.Query("version_name")
var id = ctx.Params(":id")

// canNewJob, err := canUserCreateTrainJobVersion(ctx, jobID, versionName)
// if err != nil {
// ctx.ServerError("canNewJob can info failed", err)
// return err
// }

task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
task, err := models.GetCloudbrainByID(id)
if err != nil {
log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
log.Error("GetCloudbrainByID(%s) failed:%v", id, err.Error())
return err
}

@@ -883,13 +894,10 @@ func trainJobNewVersionDataPrepare(ctx *context.Context) error {

func versionErrorDataPrepare(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) error {
ctx.Data["PageIsCloudBrain"] = true
var jobID = ctx.Params(":jobid")
// var versionName = ctx.Params(":version-name")
var versionName = ctx.Query("version_name")

task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
var id = ctx.Params(":id")
task, err := models.GetCloudbrainByID(id)
if err != nil {
log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", jobID, err.Error())
log.Error("GetCloudbrainByID(%s) failed:%v", id, err.Error())
return err
}

@@ -994,7 +1002,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
branch_name := form.BranchName
isLatestVersion := modelarts.IsLatestVersion
FlavorName := form.FlavorName
VersionCount := modelarts.VersionCount
VersionCount := modelarts.VersionCountOne
EngineName := form.EngineName

count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID)
@@ -1248,7 +1256,7 @@ func getUserCommand(engineId int, req *modelarts.GenerateTrainJobReq) (string, s

func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) {
ctx.Data["PageIsTrainJob"] = true
var jobID = ctx.Params(":jobid")
var jobID = ctx.Cloudbrain.JobID

count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID)
if err != nil {
@@ -1620,7 +1628,13 @@ func paramCheckCreateInferenceJob(form auth.CreateModelArtsInferenceJobForm) err

func TrainJobShow(ctx *context.Context) {
ctx.Data["PageIsCloudBrain"] = true
var jobID = ctx.Params(":jobid")
var id = ctx.Params(":id")

job, err := models.GetCloudbrainByID(id)
if err != nil {
log.Error("GetCloudbrainByID failed:%v", err.Error())
ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
}

repo := ctx.Repo.Repository
page := ctx.QueryInt("page")
@@ -1638,11 +1652,11 @@ func TrainJobShow(ctx *context.Context) {
RepoID: repo.ID,
Type: models.TypeCloudBrainTwo,
JobTypes: jobTypes,
JobID: jobID,
JobID: job.JobID,
})

if err != nil {
log.Error("GetVersionListTasks(%s) failed:%v", jobID, err.Error())
log.Error("GetVersionListTasks(%s) failed:%v", id, err.Error())
ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
return
}
@@ -1688,7 +1702,7 @@ func TrainJobShow(ctx *context.Context) {
pager := context.NewPagination(VersionListCount, setting.UI.IssuePagingNum, page, 5)
pager.SetDefaultParams(ctx)
ctx.Data["Page"] = pager
ctx.Data["jobID"] = jobID
ctx.Data["jobID"] = job.JobID
ctx.Data["displayJobName"] = VersionListTasks[0].DisplayJobName
ctx.Data["version_list_task"] = VersionListTasks
ctx.Data["version_list_count"] = VersionListCount
@@ -1696,62 +1710,8 @@ func TrainJobShow(ctx *context.Context) {
ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
}

func TrainJobGetLog(ctx *context.Context) {
ctx.Data["PageIsTrainJob"] = true

var jobID = ctx.Params(":jobid")
var logFileName = ctx.Query("file_name")
var baseLine = ctx.Query("base_line")
var order = ctx.Query("order")

if order != modelarts.OrderDesc && order != modelarts.OrderAsc {
log.Error("order(%s) check failed", order)
ctx.HTML(http.StatusBadRequest, tplModelArtsTrainJobShow)
return
}

task, err := models.GetCloudbrainByJobID(jobID)
if err != nil {
log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
return
}

result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), baseLine, logFileName, order, modelarts.Lines)
if err != nil {
log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error())
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil)
return
}

ctx.Data["log"] = result
//ctx.HTML(http.StatusOK, tplModelArtsTrainJobShow)
}

func trainJobGetLog(jobID string) (*models.GetTrainJobLogFileNamesResult, *models.GetTrainJobLogResult, error) {
task, err := models.GetCloudbrainByJobID(jobID)
if err != nil {
log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error())
return nil, nil, err
}

resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, strconv.FormatInt(task.VersionID, 10))
if err != nil {
log.Error("GetTrainJobLogFileNames(%s) failed:%v", jobID, err.Error())
return nil, nil, err
}

result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), "", resultLogFile.LogFileList[0], modelarts.OrderDesc, modelarts.Lines)
if err != nil {
log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error())
return nil, nil, err
}

return resultLogFile, result, err
}

func TrainJobDel(ctx *context.Context) {
var jobID = ctx.Params(":jobid")
var jobID = ctx.Cloudbrain.JobID
var listType = ctx.Query("listType")
repo := ctx.Repo.Repository

@@ -1802,11 +1762,10 @@ func TrainJobDel(ctx *context.Context) {
}

func TrainJobStop(ctx *context.Context) {
var jobID = ctx.Params(":jobid")
var listType = ctx.Query("listType")
task := ctx.Cloudbrain

_, err := modelarts.StopTrainJob(jobID, strconv.FormatInt(task.VersionID, 10))
_, err := modelarts.StopTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
if err != nil {
log.Error("StopTrainJob(%s) failed:%v", task.JobName, err.Error())
ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil)
@@ -1816,15 +1775,6 @@ func TrainJobStop(ctx *context.Context) {
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job?listType=" + listType)
}

func canUserCreateTrainJob(uid int64) (bool, error) {
org, err := models.GetOrgByName(setting.AllowedOrg)
if err != nil {
log.Error("get allowed org failed: ", setting.AllowedOrg)
return false, err
}

return org.IsOrgMember(uid)
}
func canUserCreateTrainJobVersion(ctx *context.Context, userID int64) (bool, error) {
if ctx == nil || ctx.User == nil {
log.Error("user unlogin!")
@@ -1916,7 +1866,7 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference
EngineName := form.EngineName
LabelName := form.LabelName
isLatestVersion := modelarts.IsLatestVersion
VersionCount := modelarts.VersionCount
VersionCount := modelarts.VersionCountOne
trainUrl := form.TrainUrl
modelName := form.ModelName
modelVersion := form.ModelVersion
@@ -2291,16 +2241,15 @@ func inferenceJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModel
}
func InferenceJobShow(ctx *context.Context) {
ctx.Data["PageIsCloudBrain"] = true
var jobID = ctx.Params(":jobid")

page := ctx.QueryInt("page")
if page <= 0 {
page = 1
}
task, err := models.GetCloudbrainByJobID(jobID)

var id = ctx.Params(":id")
task, err := models.GetCloudbrainByID(id)
if err != nil {
log.Error("GetInferenceTask(%s) failed:%v", jobID, err.Error())
log.Error("GetCloudbrainByID(%s) failed:%v", id, err.Error())
ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
return
}
@@ -2334,11 +2283,12 @@ func InferenceJobShow(ctx *context.Context) {

LabelName := strings.Fields(task.LabelName)
ctx.Data["labelName"] = LabelName
ctx.Data["jobID"] = jobID
ctx.Data["jobID"] = task.JobID
ctx.Data["jobName"] = task.JobName
ctx.Data["displayJobName"] = task.DisplayJobName
ctx.Data["task"] = task
ctx.Data["canDownload"] = cloudbrain.CanModifyJob(ctx, task)
ctx.Data["id"] = id

tempUids := []int64{}
tempUids = append(tempUids, task.UserID)
@@ -2355,15 +2305,10 @@ func ModelDownload(ctx *context.Context) {
err error
)

jobID := ctx.Params(":jobid")
versionName := ctx.Query("version_name")
parentDir := ctx.Query("parent_dir")
fileName := ctx.Query("file_name")
task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
if err != nil {
log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", task.JobName, err.Error())
return
}
task := ctx.Cloudbrain

var url string
if task.ComputeResource == models.NPUResource {
@@ -2430,19 +2375,8 @@ func DeleteJobStorage(jobName string) error {
}

func DownloadMultiResultFile(ctx *context.Context) {
var jobID = ctx.Params(":jobid")
var versionName = ctx.Query("version_name")
task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
if err != nil {
log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error())
return
}
// if !isCanDeleteOrDownload(ctx, task) {
// ctx.ServerError("no right.", errors.New(ctx.Tr("repo.model_noright")))
// return
// }

// path := Model_prefix + models.AttachmentRelativePath(id) + "/"
task := ctx.Cloudbrain
path := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, "result/", versionName), "/") + "/"

allFile, err := storage.GetAllObjectByBucketAndPrefix(setting.Bucket, path)
@@ -2513,19 +2447,13 @@ func TrainJobDownloadLogFile(ctx *context.Context) {
err error
)

var jobID = ctx.Params(":jobid")
versionName := ctx.Query("version_name")
task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName)
if err != nil {
log.Error("GetCloudbrainByJobIDAndVersionName(%s) failed:%v", task.JobName, err.Error(), ctx.Data["msgID"])
ctx.ServerError("GetCloudbrainByJobIDAndVersionName", err)
return
}
task := ctx.Cloudbrain

prefix := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, modelarts.LogPath, versionName), "/") + "/job"
key, err := storage.GetObsLogFileName(prefix)
if err != nil {
log.Error("GetObsLogFileName(%s) failed:%v", jobID, err.Error(), ctx.Data["msgID"])
log.Error("GetObsLogFileName(%s) failed:%v", task.DisplayJobName, err.Error(), ctx.Data["msgID"])
ctx.ServerError("GetObsLogFileName", err)
return
}


+ 12
- 11
routers/routes/routes.go View File

@@ -1183,7 +1183,8 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Group("/:id", func() {
m.Get("", reqRepoCloudBrainReader, repo.NotebookShow)
m.Get("/debug", cloudbrain.AdminOrJobCreaterRight, repo.NotebookDebug2)
m.Post("/:action", reqRepoCloudBrainWriter, repo.NotebookManage)
m.Post("/restart", cloudbrain.AdminOrJobCreaterRight, repo.NotebookRestart)
m.Post("/stop", cloudbrain.AdminOrJobCreaterRight, repo.NotebookStop)
m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.NotebookDel)
})
m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.NotebookNew)
@@ -1192,14 +1193,14 @@ func RegisterRoutes(m *macaron.Macaron) {

m.Group("/train-job", func() {
m.Get("", reqRepoCloudBrainReader, repo.TrainJobIndex)
m.Group("/:jobid", func() {
m.Group("/:id", func() {
m.Get("", reqRepoCloudBrainReader, repo.TrainJobShow)
m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.TrainJobStop)
m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.TrainJobDel)
m.Get("/model_download", cloudbrain.AdminOrJobCreaterRightForTrain, repo.ModelDownload)
m.Get("/download_log_file", cloudbrain.AdminOrJobCreaterRightForTrain, repo.TrainJobDownloadLogFile)
m.Get("/create_version", reqWechatBind, cloudbrain.AdminOrJobCreaterRightForTrain, repo.TrainJobNewVersion)
m.Post("/create_version", reqWechatBind, cloudbrain.AdminOrJobCreaterRightForTrain, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreateVersion)
m.Post("/stop", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.TrainJobStop)
m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRight, repo.TrainJobDel)
m.Get("/model_download", cloudbrain.AdminOrJobCreaterRight, repo.ModelDownload)
m.Get("/download_log_file", cloudbrain.AdminOrJobCreaterRight, repo.TrainJobDownloadLogFile)
m.Get("/create_version", reqWechatBind, cloudbrain.AdminOrJobCreaterRight, repo.TrainJobNewVersion)
m.Post("/create_version", reqWechatBind, cloudbrain.AdminOrJobCreaterRight, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreateVersion)
})
m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.TrainJobNew)
m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), repo.TrainJobCreate)
@@ -1209,10 +1210,10 @@ func RegisterRoutes(m *macaron.Macaron) {

m.Group("/inference-job", func() {
m.Get("", reqRepoCloudBrainReader, repo.InferenceJobIndex)
m.Group("/:jobid", func() {
m.Group("/:id", func() {
m.Get("", reqRepoCloudBrainReader, repo.InferenceJobShow)
m.Get("/result_download", cloudbrain.AdminOrJobCreaterRightForTrain, repo.ResultDownload)
m.Get("/downloadall", repo.DownloadMultiResultFile)
m.Get("/result_download", cloudbrain.AdminOrJobCreaterRight, repo.ResultDownload)
m.Get("/downloadall", cloudbrain.AdminOrJobCreaterRight, repo.DownloadMultiResultFile)
})
m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.InferenceJobNew)
m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateModelArtsInferenceJobForm{}), repo.InferenceJobCreate)


+ 7
- 7
templates/repo/modelarts/trainjob/index.tmpl View File

@@ -112,7 +112,7 @@

<!-- 任务名 -->
<div class="three wide column padding0">
<a class="title" href='{{if eq .Cloudbrain.Type 1 }}{{$.Link}}/{{.JobID}}{{else if eq .Cloudbrain.Type 0}}{{$.RepoLink}}/cloudbrain/train-job/{{.JobID}}{{else if eq .Cloudbrain.Type 2}}{{$.RepoLink}}/grampus/train-job/{{.JobID}}{{end}}' title="{{.DisplayJobName}}" style="font-size: 14px;">
<a class="title" href='{{if eq .Cloudbrain.Type 1 }}{{$.Link}}/{{.Cloudbrain.ID}}{{else if eq .Cloudbrain.Type 0}}{{$.RepoLink}}/cloudbrain/train-job/{{.JobID}}{{else if eq .Cloudbrain.Type 2}}{{$.RepoLink}}/grampus/train-job/{{.JobID}}{{end}}' title="{{.DisplayJobName}}" style="font-size: 14px;">

<span class="fitted" style="width: 90%;vertical-align: middle;">{{.DisplayJobName}}</span>
</a>
@@ -123,8 +123,8 @@
</div>
<!-- 任务状态 -->
<div class="two wide column padding0" style="padding-left: 2.2rem !important;">
<span class="job-status" id="{{.JobID}}" data-repopath="{{$.RepoRelPath}}/modelarts/train-job" data-jobid="{{.JobID}}" data-version="{{.VersionName}}">
<span><i id="{{.JobID}}-icon" style="vertical-align: middle;" class="{{.Status}}"></i><span id="{{.JobID}}-text" style="margin-left: 0.4em;font-size: 12px;">{{.Status}}</span></span>
<span class="job-status" id="{{if eq .Cloudbrain.Type 1 }}{{.Cloudbrain.ID}}{{else }}{{.JobID}}{{end}}" data-repopath="{{$.RepoRelPath}}/modelarts/train-job" data-jobid="{{if eq .Cloudbrain.Type 1 }}{{.Cloudbrain.ID}}{{else }}{{.JobID}}{{end}}" data-version="{{.VersionName}}">
<span><i id="{{if eq .Cloudbrain.Type 1 }}{{.Cloudbrain.ID}}{{else }}{{.JobID}}{{end}}-icon" style="vertical-align: middle;" class="{{.Status}}"></i><span id="{{if eq .Cloudbrain.Type 1 }}{{.Cloudbrain.ID}}{{else }}{{.JobID}}{{end}}-text" style="margin-left: 0.4em;font-size: 12px;">{{.Status}}</span></span>
</span>
</div>
<!-- 任务创建时间 -->
@@ -133,7 +133,7 @@
</div>
<!-- 任务运行时间 -->
<div class="two wide column text center padding0">
<span style="font-size: 12px;" id="duration-{{.JobID}}">{{.TrainJobDuration}}</span>
<span style="font-size: 12px;" id="duration-{{if eq .Cloudbrain.Type 1 }}{{.Cloudbrain.ID}}{{else }}{{.JobID}}{{end}}">{{.TrainJobDuration}}</span>
</div>
<!-- 计算资源 -->
<div class="two wide column text center padding0">
@@ -153,7 +153,7 @@
<div class="ui compact buttons">
{{$.CsrfTokenHtml}}
{{if .CanDel}}
<a style="padding: 0.5rem 1rem;" id="ai-stop-{{.JobID}}" class="ui basic ai_stop_version {{if eq .Status "KILLED" "FAILED" "START_FAILED" "KILLING" "COMPLETED" "SUCCEEDED" "STOPPED"}}disabled {{else}} blue {{end}}button" data-repopath='{{$.RepoRelPath}}{{if eq .Cloudbrain.Type 1}}/modelarts/train-job{{else if eq .Cloudbrain.Type 0}}/cloudbrain/train-job{{else if eq .Cloudbrain.Type 2}}/grampus/train-job{{end}}' data-jobid="{{.JobID}}" data-version="{{.VersionName}}">
<a style="padding: 0.5rem 1rem;" id="ai-stop-{{if eq .Cloudbrain.Type 1 }}{{.Cloudbrain.ID}}{{else }}{{.JobID}}{{end}}" class="ui basic ai_stop_version {{if eq .Status "KILLED" "FAILED" "START_FAILED" "KILLING" "COMPLETED" "SUCCEEDED" "STOPPED"}}disabled {{else}} blue {{end}}button" data-repopath='{{$.RepoRelPath}}{{if eq .Cloudbrain.Type 1}}/modelarts/train-job{{else if eq .Cloudbrain.Type 0}}/cloudbrain/train-job{{else if eq .Cloudbrain.Type 2}}/grampus/train-job{{end}}' data-jobid="{{if eq .Cloudbrain.Type 1 }}{{.Cloudbrain.ID}}{{else }}{{.JobID}}{{end}}" data-version="{{.VersionName}}">
{{$.i18n.Tr "repo.stop"}}
</a>
{{else}}
@@ -164,11 +164,11 @@

</div>
<!-- 删除任务 -->
<form class="ui compact buttons" id="delForm-{{.JobID}}" action='{{if eq .Cloudbrain.Type 1}}{{$.Link}}/{{.JobID}}{{else if eq .Cloudbrain.Type 0}}{{$.RepoLink}}/cloudbrain/train-job/{{.JobID}}{{else if eq .Cloudbrain.Type 2}}{{$.RepoLink}}/grampus/train-job/{{.JobID}}{{end}}/del' method="post">
<form class="ui compact buttons" id="delForm-{{if eq .Cloudbrain.Type 1 }}{{.Cloudbrain.ID}}{{else }}{{.JobID}}{{end}}" action='{{if eq .Cloudbrain.Type 1}}{{$.Link}}/{{.Cloudbrain.ID}}{{else if eq .Cloudbrain.Type 0}}{{$.RepoLink}}/cloudbrain/train-job/{{.JobID}}{{else if eq .Cloudbrain.Type 2}}{{$.RepoLink}}/grampus/train-job/{{.JobID}}{{end}}/del' method="post">
<input type="hidden" name="listType" value="{{$.ListType}}">
{{$.CsrfTokenHtml}}
{{if .CanDel}}
<a style="padding: 0.5rem 1rem;margin-left:0.2rem" id="ai-delete-{{.JobID}}" class="ui basic ai_delete blue button" style="border-radius: .28571429rem;">
<a style="padding: 0.5rem 1rem;margin-left:0.2rem" id="ai-delete-{{if eq .Cloudbrain.Type 1 }}{{.Cloudbrain.ID}}{{else }}{{.JobID}}{{end}}" class="ui basic ai_delete blue button" style="border-radius: .28571429rem;">
{{$.i18n.Tr "repo.delete"}}
</a>
{{else}}


+ 19
- 17
templates/repo/modelarts/trainjob/show.tmpl View File

@@ -243,7 +243,7 @@

{{range $k ,$v := .version_list_task}}
<div class="ui accordion border-according" id="accordion{{.VersionName}}"
data-repopath="{{$.RepoRelPath}}/modelarts/train-job" data-jobid="{{.JobID}}"
data-repopath="{{$.RepoRelPath}}/modelarts/train-job" data-id="{{.Cloudbrain.ID}}"
data-version="{{.VersionName}}">
<div class="{{if eq $k 0}}active{{end}} title padding0">
<div class="according-panel-heading">
@@ -263,16 +263,16 @@

{{if .CanModify}}
<a class="ti-action-menu-item"
href="{{$.RepoLink}}/modelarts/train-job/{{.JobID}}/create_version?version_name={{.VersionName}}">{{$.i18n.Tr "repo.modelarts.modify"}}</a>
href="{{$.RepoLink}}/modelarts/train-job/{{.Cloudbrain.ID}}/create_version?version_name={{.VersionName}}">{{$.i18n.Tr "repo.modelarts.modify"}}</a>
{{else}}
<a class="ti-action-menu-item disabled"
href="{{$.RepoLink}}/modelarts/train-job/{{.JobID}}/create_version?version_name={{.VersionName}}">{{$.i18n.Tr "repo.modelarts.modify"}}</a>
href="{{$.RepoLink}}/modelarts/train-job/{{.Cloudbrain.ID}}/create_version?version_name={{.VersionName}}">{{$.i18n.Tr "repo.modelarts.modify"}}</a>
{{end}}

{{if .CanDel}}
<a class="ti-action-menu-item stop-show-version {{if eq .Status "KILLED" "FAILED" "START_FAILED" "KILLING" "COMPLETED"}}disabled {{end}}"
id="{{.VersionName}}-stop"
data-jobid="{{.JobID}}"
data-id="{{.Cloudbrain.ID}}"
data-repopath="{{$.RepoRelPath}}/modelarts/train-job"
data-version = "{{.VersionName}}"
>{{$.i18n.Tr "repo.stop"}}</a>
@@ -325,7 +325,7 @@
<a class="item log_bottom" data-tab="second{{$k}}"
data-version="{{.VersionName}}">{{$.i18n.Tr "repo.modelarts.log"}}</a>
<a class="item metric_chart" data-tab="four{{$k}}" data-version="{{.VersionName}}">资源占用情况</a>
<a class="item load-model-file" data-tab="third{{$k}}" data-download-flag="{{$.canDownload}}" data-path="{{$.RepoLink}}/modelarts/train-job/{{.JobID}}/model_list" data-version="{{.VersionName}}" data-parents="" data-filename="" data-init="init" >{{$.i18n.Tr "repo.model_download"}}</a>
<a class="item load-model-file" data-tab="third{{$k}}" data-download-flag="{{$.canDownload}}" data-path="{{$.RepoLink}}/modelarts/train-job/{{.Cloudbrain.ID}}/model_list" data-version="{{.VersionName}}" data-parents="" data-filename="" data-init="init" >{{$.i18n.Tr "repo.model_download"}}</a>
</div>
<div class="ui tab active" data-tab="first{{$k}}">
<div style="padding-top: 10px;">
@@ -500,7 +500,7 @@
<div>
<a id="{{.VersionName}}-log-down"
class='{{if and (.CanModify) (eq .Status "KILLED" "FAILED" "START_FAILED" "STOPPED" "COMPLETED") }}ti-download-file{{else}}disabled{{end}}'
href="{{$.RepoLink}}/modelarts/train-job/{{.JobID}}/download_log_file?version_name={{.VersionName}}">
href="{{$.RepoLink}}/modelarts/train-job/{{.Cloudbrain.ID}}/download_log_file?version_name={{.VersionName}}">
<i class="ri-download-cloud-2-line"></i>
<span style="margin-left: 0.3rem;">{{$.i18n.Tr "repo.modelarts.download_log"}}</span>
</a>
@@ -597,7 +597,7 @@
<div class="two inline fields ">
<div class="required ten wide field">
<label style="margin-left: -23px;">选择训练任务</label>
<input type="hidden" class="width83" id="JobId" name="JobId" readonly required>
<input type="hidden" class="width83" id="ID" name="ID" readonly required>
<input class="width83" id="JobName" readonly required>

</div>
@@ -793,13 +793,15 @@
let userName
let repoPath
let jobID
let id
let downlaodFlag = {{ $.canDownload }}
$(document).ready(function () {
let url = window.location.href;
let urlArr = url.split('/')
userName = urlArr.slice(-5)[0]
repoPath = urlArr.slice(-4)[0]
jobID = urlArr.slice(-1)[0]
id = urlArr.slice(-1)[0]
jobID = id
})
function stopBubbling(e) {
e = window.event || e;
@@ -816,9 +818,9 @@
centered: false,
onShow: function () {
$('input[name="Version"]').addClass('model_disabled')
// $('input[name="JobId"]').text(obj.JobName)
// $('input[name="ID"]').text(obj.JobName)
$('#JobName').val(obj.DisplayJobName).addClass('model_disabled')
$('input[name="JobId"]').val(obj.JobID)
$('input[name="ID"]').val(obj.ID)
$('input[name="VersionName"]').val(obj.VersionName).addClass('model_disabled')
if(obj.EngineID ==122 || obj.EngineID ==35 || obj.EngineID ==-1){
$('input[name="Engine_name"]').val("MindSpore").addClass('model_disabled');
@@ -883,7 +885,7 @@
flag = false
},
onApprove: function () {
$.post(`/api/v1/repos/${userName}/${repoPath}/modelarts/train-job/${jobID}/del_version`, { version_name: version_name }, (data) => {
$.post(`/api/v1/repos/${userName}/${repoPath}/modelarts/train-job/${id}/del_version`, { version_name: version_name }, (data) => {
if (data.VersionListCount === 0) {
location.href = `/${userName}/${repoPath}/modelarts/train-job`
} else {
@@ -906,7 +908,7 @@
}

function loadLog(version_name) {
$.get(`/api/v1/repos/${userName}/${repoPath}/modelarts/train-job/${jobID}/log?version_name=${version_name}&lines=50&order=asc`, (data) => {
$.get(`/api/v1/repos/${userName}/${repoPath}/modelarts/train-job/${id}/log?version_name=${version_name}&lines=50&order=asc`, (data) => {
$('input[name=end_line]').val(data.EndLine)
$('input[name=start_line]').val(data.StartLine)
$(`#log_file${version_name}`).text(data.Content)
@@ -940,7 +942,7 @@
let scrollLeft = container.scrollLeft
if (((parseInt(scrollTop) + clientHeight == scrollHeight || parseInt(scrollTop) + clientHeight + 1 == scrollHeight || parseInt(scrollTop) + clientHeight - 1 == scrollHeight)) && parseInt(scrollTop) !== 0 && scrollLeft == 0) {
let end_line = $(`#log${version_name} input[name=end_line]`).val()
$.get(`/api/v1/repos/${userName}/${repoPath}/modelarts/train-job/${jobID}/log?version_name=${version_name}&base_line=${end_line}&lines=50&order=desc`, (data) => {
$.get(`/api/v1/repos/${userName}/${repoPath}/modelarts/train-job/${id}/log?version_name=${version_name}&base_line=${end_line}&lines=50&order=desc`, (data) => {
if (data.Lines == 0) {
$(`.message${version_name} #header`).text('您已翻阅至日志底部')
$(`.message${version_name}`).css('display', 'block')
@@ -963,7 +965,7 @@
}
if ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10].includes(scrollTop) && scrollLeft == 0) {
let start_line = $(`#log${version_name} input[name=start_line]`).val()
$.get(`/api/v1/repos/${userName}/${repoPath}/modelarts/train-job/${jobID}/log?version_name=${version_name}&base_line=${start_line}&lines=50&order=asc`, (data) => {
$.get(`/api/v1/repos/${userName}/${repoPath}/modelarts/train-job/${id}/log?version_name=${version_name}&base_line=${start_line}&lines=50&order=asc`, (data) => {
if (data.Lines == 0) {
$(`.message${version_name} #header`).text('您已翻阅至日志顶部')
$(`.message${version_name}`).css('display', 'block')
@@ -1007,7 +1009,7 @@
let logContentDom = document.querySelector(`#log${version_name}`)

$(`#log_file${version_name}`).siblings('pre').remove()
$.get(`/api/v1/repos/${userName}/${repoPath}/modelarts/train-job/${jobID}/log?version_name=${version_name}&base_line=&lines=50&order=asc`, (data) => {
$.get(`/api/v1/repos/${userName}/${repoPath}/modelarts/train-job/${id}/log?version_name=${version_name}&base_line=&lines=50&order=asc`, (data) => {

$(`#log${version_name} input[name=end_line]`).val(data.EndLine) //如果变动就改变所对应的值
$(`#log${version_name} input[name=start_line]`).val(data.StartLine)
@@ -1025,12 +1027,12 @@
let version_name = $(this).data('version')
let logContentDom = document.querySelector(`#log${version_name}`)
$(`#log_file${version_name}`).siblings('pre').remove()
$.get(`/api/v1/repos/${userName}/${repoPath}/modelarts/train-job/${jobID}/log?version_name=${version_name}&base_line=&lines=50&order=desc`, (data) => {
$.get(`/api/v1/repos/${userName}/${repoPath}/modelarts/train-job/${id}/log?version_name=${version_name}&base_line=&lines=50&order=desc`, (data) => {

$(`#log${version_name} input[name=end_line]`).val(data.EndLine) //如果变动就改变所对应的值
$(`#log${version_name} input[name=start_line]`).val(data.StartLine)
$(`#log${version_name}`).append('<pre>' + data.Content)
$.get(`/api/v1/repos/${userName}/${repoPath}/modelarts/train-job/${jobID}/log?version_name=${version_name}&base_line=${data.EndLine}&lines=50&order=desc`, (data) => {
$.get(`/api/v1/repos/${userName}/${repoPath}/modelarts/train-job/${id}/log?version_name=${version_name}&base_line=${data.EndLine}&lines=50&order=desc`, (data) => {
if (data.Lines == 0) {
$(`.message${version_name} #header`).text('您已翻阅至日志底部')
$(`.message${version_name}`).css('display', 'block')


Loading…
Cancel
Save