|
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037 |
- package modelarts
-
- import (
- "encoding/json"
- "errors"
- "fmt"
- "math/rand"
- "path"
- "strconv"
- "strings"
- "time"
-
- "code.gitea.io/gitea/modules/timeutil"
-
- "code.gitea.io/gitea/models"
- "code.gitea.io/gitea/modules/context"
- "code.gitea.io/gitea/modules/log"
- "code.gitea.io/gitea/modules/notification"
- "code.gitea.io/gitea/modules/setting"
- "code.gitea.io/gitea/modules/storage"
- )
-
// Shared constants for the ModelArts notebook, train-job and inference-job helpers.
const (
	// Notebook (debug job) settings.
	storageTypeOBS = "obs"
	// Auto-stop period, presumably four hours in seconds (4*60*60);
	// the Ms variant is the same period multiplied by 1000.
	autoStopDuration   = 4 * 60 * 60
	autoStopDurationMs = autoStopDuration * 1000
	// Engine id below zero; GenerateTrainJob routes negative engine ids to the user-image API.
	MORDELART_USER_IMAGE_ENGINE_ID = -1
	DataSetMountPath               = "/home/ma-user/work"
	NotebookEnv                    = "Python3"
	NotebookType                   = "Ascend"
	FlavorInfo                     = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)"

	// Train-job settings.
	// Former hard-coded JSON samples, kept for reference:
	// ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"dedicated resource pool\"}]}"
	// Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}"
	// EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," +
	// "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," +
	// "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," +
	// "{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" +
	// "]}"
	// TrainJobFlavorInfo = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 cores 512GiB\"}," +
	// "{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 cores 2048GiB\"}," +
	// "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 cores 1024GiB\"}," +
	// "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 cores 256GiB\"}" +
	// "]}"

	// Path segments.
	CodePath   = "/code/"
	OutputPath = "/output/"
	ResultPath = "/result/"
	LogPath    = "/log/"
	JobPath    = "/job/"

	// Query ordering.
	OrderDesc = "desc" // query downward
	OrderAsc  = "asc"  // query upward
	Lines     = 500

	// Parameter labels.
	TrainUrl     = "train_url"
	DataUrl      = "data_url"
	MultiDataUrl = "multi_data_url"
	ResultUrl    = "result_url"
	CkptUrl      = "ckpt_url"
	DeviceTarget = "device_target"
	Ascend       = "Ascend"

	// Paging and version bookkeeping.
	PerPage          = 10
	IsLatestVersion  = "1"
	NotLatestVersion = "0"
	VersionCountOne  = 1

	SortByCreateTime  = "create_time"
	ConfigTypeCustom  = "custom"
	TotalVersionCount = 1
)
-
- var (
- poolInfos *models.PoolInfos
- FlavorInfos *models.FlavorInfos
- ImageInfos *models.ImageInfosModelArts
- )
-
- type GenerateTrainJobReq struct {
- JobName string
- DisplayJobName string
- Uuid string
- Description string
- CodeObsPath string
- BootFile string
- BootFileUrl string
- DataUrl string
- TrainUrl string
- FlavorCode string
- LogUrl string
- PoolID string
- WorkServerNumber int
- EngineID int64
- Parameters []models.Parameter
- CommitID string
- IsLatestVersion string
- Params string
- BranchName string
- PreVersionId int64
- PreVersionName string
- FlavorName string
- VersionCount int
- EngineName string
- TotalVersionCount int
- UserImageUrl string
- UserCommand string
- DatasetName string
- }
-
- type GenerateInferenceJobReq struct {
- JobName string
- DisplayJobName string
- Uuid string
- Description string
- CodeObsPath string
- BootFile string
- BootFileUrl string
- DataUrl string
- TrainUrl string
- FlavorCode string
- LogUrl string
- PoolID string
- WorkServerNumber int
- EngineID int64
- Parameters []models.Parameter
- CommitID string
- Params string
- BranchName string
- FlavorName string
- EngineName string
- LabelName string
- IsLatestVersion string
- VersionCount int
- TotalVersionCount int
- ModelName string
- ModelVersion string
- CkptName string
- ResultUrl string
- }
-
// VersionInfo is the JSON shape {"version":[{"id":..,"value":..,"url":..}, ...]}
// describing selectable engine versions.
type VersionInfo struct {
	// Version lists every entry found under the "version" key.
	Version []struct {
		ID    int    `json:"id"`
		Value string `json:"value"`
		Url   string `json:"url"`
	} `json:"version"`
}
-
// Flavor is the JSON shape {"flavor":[{"code":..,"value":..}, ...]} describing
// selectable resource flavors.
type Flavor struct {
	// Info lists every entry found under the "flavor" key.
	Info []struct {
		Code  string `json:"code"`
		Value string `json:"value"`
	} `json:"flavor"`
}
-
// Engine is the JSON shape {"engine":[{"id":..,"value":..}, ...]} describing
// selectable training engines.
type Engine struct {
	// Info lists every entry found under the "engine" key.
	Info []struct {
		ID    int    `json:"id"`
		Value string `json:"value"`
	} `json:"engine"`
}
-
// ResourcePool is the JSON shape {"resource_pool":[{"id":..,"value":..}, ...]}
// describing selectable resource pools.
type ResourcePool struct {
	// Info lists every entry found under the "resource_pool" key.
	Info []struct {
		ID    string `json:"id"`
		Value string `json:"value"`
	} `json:"resource_pool"`
}
-
- // type Parameter struct {
- // Label string `json:"label"`
- // Value string `json:"value"`
- // }
-
- // type Parameters struct {
- // Parameter []Parameter `json:"parameter"`
- // }
-
// Parameters is the JSON shape {"parameter":[{"label":..,"value":..}, ...]}
// holding a job's run parameters as label/value pairs.
type Parameters struct {
	// Parameter lists every entry found under the "parameter" key.
	Parameter []struct {
		Label string `json:"label"`
		Value string `json:"value"`
	} `json:"parameter"`
}
-
- func GenerateTask(ctx *context.Context, jobName, uuid, description, flavor string) error {
- var dataActualPath string
- if uuid != "" {
- dataActualPath = setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/"
- } else {
- userPath := setting.UserBasePath + ctx.User.Name + "/"
- isExist, err := storage.ObsHasObject(userPath)
- if err != nil {
- log.Error("ObsHasObject failed:%v", err.Error(), ctx.Data["MsgID"])
- return err
- }
-
- if !isExist {
- if err = storage.ObsCreateObject(userPath); err != nil {
- log.Error("ObsCreateObject failed:%v", err.Error(), ctx.Data["MsgID"])
- return err
- }
- }
-
- dataActualPath = setting.Bucket + "/" + userPath
- }
-
- if poolInfos == nil {
- json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
- }
- createTime := timeutil.TimeStampNow()
- jobResult, err := CreateJob(models.CreateNotebookParams{
- JobName: jobName,
- Description: description,
- ProfileID: setting.ProfileID,
- Flavor: flavor,
- Pool: models.Pool{
- ID: poolInfos.PoolInfo[0].PoolId,
- Name: poolInfos.PoolInfo[0].PoolName,
- Type: poolInfos.PoolInfo[0].PoolType,
- },
- Spec: models.Spec{
- Storage: models.Storage{
- Type: storageTypeOBS,
- Location: models.Location{
- Path: dataActualPath,
- },
- },
- AutoStop: models.AutoStop{
- Enable: true,
- Duration: autoStopDuration,
- },
- },
- })
- if err != nil {
- log.Error("CreateJob failed: %v", err.Error())
- return err
- }
- err = models.CreateCloudbrain(&models.Cloudbrain{
-
- Status: string(models.JobWaiting),
- UserID: ctx.User.ID,
- RepoID: ctx.Repo.Repository.ID,
- JobID: jobResult.ID,
- JobName: jobName,
- JobType: string(models.JobTypeDebug),
- Type: models.TypeCloudBrainTwo,
- Uuid: uuid,
- ComputeResource: models.NPUResource,
- CreatedUnix: createTime,
- UpdatedUnix: createTime,
- })
-
- if err != nil {
- return err
- }
- notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobResult.ID, jobName, models.ActionCreateDebugNPUTask)
- return nil
- }
-
- func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, description, flavor, imageId string) error {
- if poolInfos == nil {
- json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
- }
-
- imageName, err := GetNotebookImageName(imageId)
- if err != nil {
- log.Error("GetNotebookImageName failed: %v", err.Error())
- return err
- }
-
- createTime := timeutil.TimeStampNow()
- task := &models.Cloudbrain{
- Status: string(models.ModelArtsTrainJobWaiting),
- UserID: ctx.User.ID,
- RepoID: ctx.Repo.Repository.ID,
- JobID: models.TempJobIdPrefix + jobName + strconv.Itoa(int(rand.New(rand.NewSource(time.Now().UnixNano())).Int31n(100000))),
- JobName: jobName,
- FlavorCode: flavor,
- DisplayJobName: displayJobName,
- JobType: string(models.JobTypeDebug),
- Type: models.TypeCloudBrainTwo,
- Uuid: uuid,
- ComputeResource: models.NPUResource,
- Image: imageName,
- Description: description,
- CreatedUnix: createTime,
- UpdatedUnix: createTime,
- }
-
- err = models.CreateCloudbrain(task)
- if err != nil {
- log.Error("CreateCloudbrain(%s) failed:%v", displayJobName, err.Error())
- return err
- }
-
- jobResult, err := createNotebook2(models.CreateNotebook2Params{
- JobName: jobName,
- Description: description,
- Flavor: flavor,
- Duration: autoStopDurationMs,
- ImageID: imageId,
- PoolID: poolInfos.PoolInfo[0].PoolId,
- Feature: models.NotebookFeature,
- Volume: models.VolumeReq{
- Capacity: setting.Capacity,
- Category: models.EVSCategory,
- Ownership: models.ManagedOwnership,
- },
- WorkspaceID: "0",
- })
- if err != nil {
- log.Error("createNotebook2 failed: %v", err.Error())
- if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
- log.Info("(%s)unknown error, set temp status", displayJobName)
- errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
- CloudbrainID: task.ID,
- Status: models.JobStatusTemp,
- Type: task.Type,
- JobName: task.JobName,
- JobType: task.JobType,
- })
- if errTemp != nil {
- log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
- return errTemp
- }
- } else {
- task.Status = string(models.ModelArtsCreateFailed)
- errTemp := models.UpdateJob(task)
- if errTemp != nil {
- log.Error("UpdateJob failed: %v", errTemp.Error())
- }
- errTemp = models.DeleteJob(task)
- if errTemp != nil {
- log.Error("DeleteJob failed: %v", errTemp.Error())
- }
- return err
- }
- } else {
- task.Status = jobResult.Status
- task.JobID = jobResult.ID
- err = models.UpdateJob(task)
- if err != nil {
- log.Error("UpdateJob failed: %v", err.Error())
- return err
- }
- }
-
- stringId := strconv.FormatInt(task.ID, 10)
- notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateDebugNPUTask)
- return nil
- }
-
- func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) {
- createTime := timeutil.TimeStampNow()
- task := &models.Cloudbrain{
- Status: string(models.ModelArtsTrainJobWaiting),
- UserID: ctx.User.ID,
- RepoID: ctx.Repo.Repository.ID,
- JobID: models.TempJobIdPrefix + req.JobName + strconv.Itoa(int(rand.New(rand.NewSource(time.Now().UnixNano())).Int31n(100000))),
- JobName: req.JobName,
- DisplayJobName: req.DisplayJobName,
- JobType: string(models.JobTypeTrain),
- Type: models.TypeCloudBrainTwo,
- Uuid: req.Uuid,
- DatasetName: req.DatasetName,
- CommitID: req.CommitID,
- IsLatestVersion: req.IsLatestVersion,
- ComputeResource: models.NPUResource,
- EngineID: req.EngineID,
- TrainUrl: req.TrainUrl,
- BranchName: req.BranchName,
- Parameters: req.Params,
- BootFile: req.BootFile,
- DataUrl: req.DataUrl,
- LogUrl: req.LogUrl,
- FlavorCode: req.FlavorCode,
- Description: req.Description,
- WorkServerNumber: req.WorkServerNumber,
- FlavorName: req.FlavorName,
- EngineName: req.EngineName,
- VersionCount: req.VersionCount,
- TotalVersionCount: req.TotalVersionCount,
- CreatedUnix: createTime,
- UpdatedUnix: createTime,
- }
- err = models.CreateCloudbrain(task)
- if err != nil {
- log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, err.Error())
- return err
- }
-
- var jobResult *models.CreateTrainJobResult
- var createErr error
-
- if req.EngineID < 0 {
- jobResult, createErr = createTrainJobUserImage(models.CreateUserImageTrainJobParams{
- JobName: req.JobName,
- Description: req.Description,
- Config: models.UserImageConfig{
- WorkServerNum: req.WorkServerNumber,
- AppUrl: req.CodeObsPath,
- BootFileUrl: req.BootFileUrl,
- DataUrl: req.DataUrl,
- TrainUrl: req.TrainUrl,
- LogUrl: req.LogUrl,
- PoolID: req.PoolID,
- CreateVersion: true,
- Flavor: models.Flavor{
- Code: req.FlavorCode,
- },
- Parameter: req.Parameters,
- UserImageUrl: req.UserImageUrl,
- UserCommand: req.UserCommand,
- },
- })
- } else {
- jobResult, createErr = createTrainJob(models.CreateTrainJobParams{
- JobName: req.JobName,
- Description: req.Description,
- Config: models.Config{
- WorkServerNum: req.WorkServerNumber,
- AppUrl: req.CodeObsPath,
- BootFileUrl: req.BootFileUrl,
- DataUrl: req.DataUrl,
- EngineID: req.EngineID,
- TrainUrl: req.TrainUrl,
- LogUrl: req.LogUrl,
- PoolID: req.PoolID,
- CreateVersion: true,
- Flavor: models.Flavor{
- Code: req.FlavorCode,
- },
- Parameter: req.Parameters,
- },
- })
- }
- if createErr != nil {
- log.Error("createTrainJob failed: %v", createErr.Error())
- if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
- log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
- errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
- CloudbrainID: task.ID,
- Status: models.JobStatusTemp,
- Type: task.Type,
- JobName: task.JobName,
- JobType: task.JobType,
- })
- if errTemp != nil {
- log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
- return errTemp
- }
- } else {
- task.Status = string(models.ModelArtsTrainJobFailed)
- errTemp := models.UpdateJob(task)
- if errTemp != nil {
- log.Error("UpdateJob failed: %v", errTemp.Error())
- }
- errTemp = models.DeleteJob(task)
- if errTemp != nil {
- log.Error("DeleteJob failed: %v", errTemp.Error())
- }
- return createErr
- }
- } else {
- task.Status = TransTrainJobStatus(jobResult.Status)
- task.JobID = strconv.FormatInt(jobResult.JobID, 10)
- task.VersionID = jobResult.VersionID
- task.VersionName = jobResult.VersionName
- err = models.UpdateJob(task)
- if err != nil {
- log.Error("UpdateJob failed: %v", err.Error())
- return err
- }
- }
-
- notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, task.JobID, req.DisplayJobName, models.ActionCreateTrainTask)
- return nil
- }
-
- func GenerateModelConvertTrainJob(req *GenerateTrainJobReq) (*models.CreateTrainJobResult, error) {
-
- return createTrainJobUserImage(models.CreateUserImageTrainJobParams{
- JobName: req.JobName,
- Description: req.Description,
- Config: models.UserImageConfig{
- WorkServerNum: req.WorkServerNumber,
- AppUrl: req.CodeObsPath,
- BootFileUrl: req.BootFileUrl,
- DataUrl: req.DataUrl,
- TrainUrl: req.TrainUrl,
- LogUrl: req.LogUrl,
- PoolID: req.PoolID,
- CreateVersion: true,
- Flavor: models.Flavor{
- Code: req.FlavorCode,
- },
- Parameter: req.Parameters,
- UserImageUrl: req.UserImageUrl,
- UserCommand: req.UserCommand,
- },
- })
- }
-
- func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) {
- var jobTypes []string
- jobTypes = append(jobTypes, string(models.JobTypeTrain))
- repo := ctx.Repo.Repository
- VersionTaskList, VersionListCount, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
- RepoID: repo.ID,
- Type: models.TypeCloudBrainTwo,
- JobTypes: jobTypes,
- JobID: jobId,
- })
- if err != nil {
- ctx.ServerError("Cloudbrain", err)
- return err
- }
- //将当前版本的isLatestVersion设置为"1"和任务数量更新,任务数量包括当前版本数VersionCount和历史创建的总版本数TotalVersionCount
-
- createTime := timeutil.TimeStampNow()
- task := &models.Cloudbrain{
- Status: models.JobStatusTemp,
- UserID: ctx.User.ID,
- RepoID: ctx.Repo.Repository.ID,
- JobID: jobId,
- JobName: req.JobName,
- DisplayJobName: req.DisplayJobName,
- JobType: string(models.JobTypeTrain),
- Type: models.TypeCloudBrainTwo,
- Uuid: req.Uuid,
- DatasetName: req.DatasetName,
- CommitID: req.CommitID,
- IsLatestVersion: req.IsLatestVersion,
- PreVersionName: req.PreVersionName,
- ComputeResource: models.NPUResource,
- EngineID: req.EngineID,
- TrainUrl: req.TrainUrl,
- BranchName: req.BranchName,
- Parameters: req.Params,
- BootFile: req.BootFile,
- DataUrl: req.DataUrl,
- LogUrl: req.LogUrl,
- PreVersionId: req.PreVersionId,
- FlavorCode: req.FlavorCode,
- Description: req.Description,
- WorkServerNumber: req.WorkServerNumber,
- FlavorName: req.FlavorName,
- EngineName: req.EngineName,
- TotalVersionCount: VersionTaskList[0].TotalVersionCount + 1,
- VersionCount: VersionListCount + 1,
- CreatedUnix: createTime,
- UpdatedUnix: createTime,
- }
- err = models.CreateCloudbrain(task)
- if err != nil {
- log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
- return err
- }
-
- //将训练任务的上一版本的isLatestVersion设置为"0"
- err = models.SetVersionCountAndLatestVersion(req.JobName, VersionTaskList[0].VersionName, VersionListCount, NotLatestVersion, VersionTaskList[0].TotalVersionCount)
- if err != nil {
- ctx.ServerError("Update IsLatestVersion failed", err)
- return err
- }
-
- var jobResult *models.CreateTrainJobResult
- var createErr error
-
- if req.EngineID < 0 {
- jobResult, createErr = createTrainJobVersionUserImage(models.CreateTrainJobVersionUserImageParams{
- Description: req.Description,
- Config: models.TrainJobVersionUserImageConfig{
- WorkServerNum: req.WorkServerNumber,
- AppUrl: req.CodeObsPath,
- BootFileUrl: req.BootFileUrl,
- DataUrl: req.DataUrl,
- TrainUrl: req.TrainUrl,
- LogUrl: req.LogUrl,
- PoolID: req.PoolID,
- Flavor: models.Flavor{
- Code: req.FlavorCode,
- },
- Parameter: req.Parameters,
- PreVersionId: req.PreVersionId,
- UserImageUrl: req.UserImageUrl,
- UserCommand: req.UserCommand,
- },
- }, jobId)
- } else {
- jobResult, createErr = createTrainJobVersion(models.CreateTrainJobVersionParams{
- Description: req.Description,
- Config: models.TrainJobVersionConfig{
- WorkServerNum: req.WorkServerNumber,
- AppUrl: req.CodeObsPath,
- BootFileUrl: req.BootFileUrl,
- DataUrl: req.DataUrl,
- EngineID: req.EngineID,
- TrainUrl: req.TrainUrl,
- LogUrl: req.LogUrl,
- PoolID: req.PoolID,
- Flavor: models.Flavor{
- Code: req.FlavorCode,
- },
- Parameter: req.Parameters,
- PreVersionId: req.PreVersionId,
- },
- }, jobId)
- }
- if createErr != nil {
- log.Error("createTrainJobVersion failed: %v", err.Error())
- if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
- log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
- errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
- CloudbrainID: task.ID,
- Status: models.JobStatusTemp,
- Type: task.Type,
- JobName: task.JobName,
- JobType: task.JobType,
- })
- if errTemp != nil {
- log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
- return errTemp
- }
- } else {
- task.Status = string(models.ModelArtsTrainJobFailed)
- errTemp := models.UpdateJob(task)
- if errTemp != nil {
- log.Error("UpdateJob failed: %v", errTemp.Error())
- }
- errTemp = models.DeleteJob(task)
- if errTemp != nil {
- log.Error("DeleteJob failed: %v", errTemp.Error())
- }
- return createErr
- }
- } else {
- task.Status = TransTrainJobStatus(jobResult.Status)
- task.JobID = strconv.FormatInt(jobResult.JobID, 10)
- task.VersionID = jobResult.VersionID
- task.VersionName = jobResult.VersionName
- err = models.UpdateJob(task)
- if err != nil {
- log.Error("UpdateJob failed: %v", err.Error())
- return err
- }
- }
-
- return nil
- }
-
// trainJobStatusNames maps the numeric status codes 0..21 to their names;
// the slice index is the status code.
var trainJobStatusNames = [...]string{
	0:  "UNKNOWN",
	1:  "INIT",
	2:  "IMAGE_CREATING",
	3:  "IMAGE_FAILED",
	4:  "SUBMIT_TRYING",
	5:  "SUBMIT_FAILED",
	6:  "DELETE_FAILED",
	7:  "WAITING",
	8:  "RUNNING",
	9:  "KILLING",
	10: "COMPLETED",
	11: "FAILED",
	12: "KILLED",
	13: "CANCELED",
	14: "LOST",
	15: "SCALING",
	16: "SUBMIT_MODEL_FAILED",
	17: "DEPLOY_SERVICE_FAILED",
	18: "CHECK_INIT",
	19: "CHECK_RUNNING",
	20: "CHECK_RUNNING_COMPLETED",
	21: "CHECK_FAILED",
}

// TransTrainJobStatus converts a numeric train-job status code into its name.
// Codes outside 0..21 are returned as their decimal representation.
func TransTrainJobStatus(status int) string {
	if status >= 0 && status < len(trainJobStatusNames) {
		return trainJobStatusNames[status]
	}
	return strconv.Itoa(status)
}
-
// GetOutputPathByCount builds the version directory name for a total version
// count: the letter "V" followed by the count zero-padded to at least four
// digits (for example 1 -> "V0001").
func GetOutputPathByCount(TotalVersionCount int) (VersionOutputPath string) {
	return fmt.Sprintf("V%04d", TotalVersionCount)
}
-
- func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (err error) {
- createTime := timeutil.TimeStampNow()
-
- attach, err := models.GetAttachmentByUUID(req.Uuid)
- if err != nil {
- log.Error("GetAttachmentByUUID(%s) failed:%v", req.DisplayJobName, err.Error())
- return err
- }
-
- task := &models.Cloudbrain{
- Status: string(models.ModelArtsTrainJobWaiting),
- UserID: ctx.User.ID,
- RepoID: ctx.Repo.Repository.ID,
- JobID: models.TempJobIdPrefix + req.JobName + strconv.Itoa(int(rand.New(rand.NewSource(time.Now().UnixNano())).Int31n(100000))),
- JobName: req.JobName,
- DisplayJobName: req.DisplayJobName,
- JobType: string(models.JobTypeInference),
- Type: models.TypeCloudBrainTwo,
- Uuid: req.Uuid,
- DatasetName: attach.Name,
- CommitID: req.CommitID,
- EngineID: req.EngineID,
- TrainUrl: req.TrainUrl,
- BranchName: req.BranchName,
- Parameters: req.Params,
- BootFile: req.BootFile,
- DataUrl: req.DataUrl,
- LogUrl: req.LogUrl,
- FlavorCode: req.FlavorCode,
- Description: req.Description,
- WorkServerNumber: req.WorkServerNumber,
- FlavorName: req.FlavorName,
- EngineName: req.EngineName,
- LabelName: req.LabelName,
- IsLatestVersion: req.IsLatestVersion,
- ComputeResource: models.NPUResource,
- VersionCount: req.VersionCount,
- TotalVersionCount: req.TotalVersionCount,
- ModelName: req.ModelName,
- ModelVersion: req.ModelVersion,
- CkptName: req.CkptName,
- ResultUrl: req.ResultUrl,
- CreatedUnix: createTime,
- UpdatedUnix: createTime,
- }
-
- err = models.CreateCloudbrain(task)
- if err != nil {
- log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
- return err
- }
-
- jobResult, err := createInferenceJob(models.CreateInferenceJobParams{
- JobName: req.JobName,
- Description: req.Description,
- InfConfig: models.InfConfig{
- WorkServerNum: req.WorkServerNumber,
- AppUrl: req.CodeObsPath,
- BootFileUrl: req.BootFileUrl,
- DataUrl: req.DataUrl,
- EngineID: req.EngineID,
- // TrainUrl: req.TrainUrl,
- LogUrl: req.LogUrl,
- PoolID: req.PoolID,
- CreateVersion: true,
- Flavor: models.Flavor{
- Code: req.FlavorCode,
- },
- Parameter: req.Parameters,
- },
- })
- if err != nil {
- log.Error("createTrainJob failed: %v", err.Error())
- if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
- log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
- err = models.InsertCloudbrainTemp(&models.CloudbrainTemp{
- CloudbrainID: task.ID,
- Status: models.JobStatusTemp,
- Type: task.Type,
- JobName: task.JobName,
- JobType: task.JobType,
- })
- if err != nil {
- log.Error("InsertCloudbrainTemp failed: %v", err.Error())
- return err
- }
- } else {
- task.Status = string(models.ModelArtsTrainJobFailed)
- errTemp := models.UpdateJob(task)
- if errTemp != nil {
- log.Error("UpdateJob failed: %v", errTemp.Error())
- }
- errTemp = models.DeleteJob(task)
- if errTemp != nil {
- log.Error("DeleteJob failed: %v", errTemp.Error())
- }
- return err
- }
- } else {
- task.Status = TransTrainJobStatus(jobResult.Status)
- task.JobID = strconv.FormatInt(jobResult.JobID, 10)
- task.VersionID = jobResult.VersionID
- task.VersionName = jobResult.VersionName
- err = models.UpdateJob(task)
- if err != nil {
- log.Error("UpdateJob failed: %v", err.Error())
- return err
- }
- }
-
- notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, task.JobID, req.DisplayJobName, models.ActionCreateInferenceTask)
-
- return nil
- }
-
- func GetNotebookImageName(imageId string) (string, error) {
- var validImage = false
- var imageName = ""
-
- if ImageInfos == nil {
- json.Unmarshal([]byte(setting.ImageInfos), &ImageInfos)
- }
-
- for _, imageInfo := range ImageInfos.ImageInfo {
- if imageInfo.Id == imageId {
- validImage = true
- imageName = imageInfo.Value
- }
- }
-
- if !validImage {
- log.Error("the image id(%s) is invalid", imageId)
- return imageName, errors.New("the image id is invalid")
- }
-
- return imageName, nil
- }
-
- func HandleTrainJobInfo(task *models.Cloudbrain) error {
- if isTempJob(task.JobID, task.Status) {
- if task.VersionCount > VersionCountOne {
- //multi version
- result, err := GetTrainJobVersionList(1000, 1, strings.TrimPrefix(task.JobID, models.TempJobIdPrefix))
- if err != nil {
- log.Error("GetTrainJobVersionList failed:%v", err)
- return err
- }
-
- if result != nil {
- if strconv.FormatInt(result.JobID, 10) == task.JobID && result.JobName == task.JobName {
- if result.VersionCount == int64(task.VersionCount) {
- log.Info("find the record(%s)", task.DisplayJobName)
- task.Status = TransTrainJobStatus(result.JobVersionList[0].IntStatus)
- task.VersionName = result.JobVersionList[0].VersionName
- task.VersionID = result.JobVersionList[0].VersionID
-
- err = models.UpdateJob(task)
- if err != nil {
- log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
- return err
- }
- temp, err := models.GetCloudbrainTempByCloudbrainID(task.ID)
- if err != nil {
- log.Error("no such temp record(%s):%v", task.DisplayJobName, err.Error())
- } else {
- err = models.DeleteCloudbrainTemp(temp)
- if err != nil {
- log.Error("DeleteCloudbrainTemp(%s) failed:%v", task.DisplayJobName, err)
- }
- }
-
- return nil
- } else {
- log.Error("can not find the record(%s) until now", task.DisplayJobName)
- }
- } else {
- log.Error("can not find the record(%s) until now", task.DisplayJobName)
- }
- }
- } else {
- //inference or one version
- result, err := GetTrainJobList(1000, 1, "create_time", "desc", task.JobName)
- if err != nil {
- log.Error("GetTrainJobList failed:%v", err)
- return err
- }
-
- if result != nil {
- for _, job := range result.JobList {
- if task.JobName == job.JobName {
- log.Info("find the record(%s)", task.DisplayJobName)
- task.Status = TransTrainJobStatus(job.IntStatus)
- task.JobID = strconv.FormatInt(job.JobID, 10)
-
- err = models.UpdateJob(task)
- if err != nil {
- log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err)
- return err
- }
- temp, err := models.GetCloudbrainTempByCloudbrainID(task.ID)
- if err != nil {
- log.Error("no such temp record(%s):%v", task.DisplayJobName, err.Error())
- return err
- }
- err = models.DeleteCloudbrainTemp(temp)
- if err != nil {
- log.Error("DeleteCloudbrainTemp(%s) failed:%v", task.DisplayJobName, err)
- return err
- }
- return nil
- }
- }
- }
-
- }
- } else {
- //normal
- result, err := GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
- if err != nil {
- log.Error("GetTrainJob(%s) failed:%v", task.DisplayJobName, err)
- return err
- }
-
- if result != nil {
- task.Status = TransTrainJobStatus(result.IntStatus)
- task.Duration = result.Duration / 1000
- task.TrainJobDuration = result.TrainJobDuration
-
- if task.StartTime == 0 && result.StartTime > 0 {
- task.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
- }
- task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
- if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
- task.EndTime = task.StartTime.Add(task.Duration)
- }
- task.CorrectCreateUnix()
- err = models.UpdateJob(task)
- if err != nil {
- log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
- return err
- }
- }
- }
-
- return nil
- }
-
- func HandleNotebookInfo(task *models.Cloudbrain) error {
- if isTempJob(task.JobID, task.Status) {
- result, err := GetNotebookList(1000, 0, "createTime", "DESC", task.JobName)
- if err != nil {
- log.Error("GetNotebookList failed:%v", err)
- return err
- }
-
- if result != nil {
- count, err := models.GetCloudbrainCountByJobName(task.JobName, task.JobType)
- if err != nil {
- log.Error("GetCloudbrainCountByJobName failed:%v", err)
- return err
- }
-
- if len(result.NotebookList) == count {
- if result.NotebookList[0].JobName == task.JobName {
- log.Info("find the record(%s)", task.DisplayJobName)
- task.Status = result.NotebookList[0].Status
- task.JobID = result.NotebookList[0].JobID
-
- err = models.UpdateJob(task)
- if err != nil {
- log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
- return err
- }
- temp, err := models.GetCloudbrainTempByCloudbrainID(task.ID)
- if err != nil {
- log.Error("no such temp record(%s):%v", task.DisplayJobName, err.Error())
- return err
- }
- err = models.DeleteCloudbrainTemp(temp)
- if err != nil {
- log.Error("DeleteCloudbrainTemp(%s) failed:%v", task.DisplayJobName, err)
- return err
- }
- return nil
- } else {
- log.Error("can not find the record(%s) until now", task.DisplayJobName)
- }
- } else {
- log.Error("can not find the record(%s) until now", task.DisplayJobName)
- }
- } else {
- log.Error("can not find the record(%s) until now", task.DisplayJobName)
- }
- } else {
- //normal
- result, err := GetNotebook2(task.JobID)
- if err != nil {
- log.Error("GetNotebook2(%s) failed:%v", task.DisplayJobName, err)
- return err
- }
-
- if result != nil {
- task.Status = result.Status
- if task.StartTime == 0 && result.Lease.UpdateTime > 0 {
- task.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000)
- }
- if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) {
- task.EndTime = timeutil.TimeStampNow()
- }
- task.CorrectCreateUnix()
- task.ComputeAndSetDuration()
- err = models.UpdateJob(task)
- if err != nil {
- log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err)
- return err
- }
- }
- }
-
- return nil
- }
-
- func isTempJob(jobID, status string) bool {
- if (strings.HasPrefix(jobID, models.TempJobIdPrefix) && status == string(models.ModelArtsTrainJobWaiting)) || status == models.JobStatusTemp {
- return true
- }
- return false
- }
|