You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

modelarts.go 36 kB

4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248
  1. package modelarts
  2. import (
  3. "code.gitea.io/gitea/modules/modelarts_cd"
  4. "encoding/json"
  5. "errors"
  6. "fmt"
  7. "path"
  8. "strconv"
  9. "strings"
  10. "code.gitea.io/gitea/models"
  11. "code.gitea.io/gitea/modules/context"
  12. "code.gitea.io/gitea/modules/log"
  13. "code.gitea.io/gitea/modules/notification"
  14. "code.gitea.io/gitea/modules/setting"
  15. "code.gitea.io/gitea/modules/storage"
  16. "code.gitea.io/gitea/modules/timeutil"
  17. )
  // Constants used when creating notebook (debug) tasks.
  const (
  	// storageTypeOBS is the storage type sent with notebook creation requests.
  	storageTypeOBS = "obs"
  	// autoStopDuration is the auto-stop delay (4h), presumably in seconds.
  	autoStopDuration = 4 * 60 * 60
  	// autoStopDurationMs is the same 4h delay scaled by 1000 (milliseconds, judging by the name).
  	autoStopDurationMs = autoStopDuration * 1000
  	// MORDELART_USER_IMAGE_ENGINE_ID is the engine ID reserved for user images;
  	// GenerateTrainJob routes any negative engine ID to the user-image API.
  	MORDELART_USER_IMAGE_ENGINE_ID = -1

  	DataSetMountPath = "/home/ma-user/work"
  	NotebookEnv      = "Python3"
  	NotebookType     = "Ascend"
  	FlavorInfo       = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)"
  )

  // Constants used by train and inference jobs.
  //
  // NOTE(review): resource pools, engines, engine versions and train-job flavors
  // used to be hard-coded JSON string constants here (ResourcePools, Engines,
  // EngineVersions, TrainJobFlavorInfo); they were commented out and are
  // presumably supplied by configuration now.
  const (
  	// Sub-paths used for job artifacts.
  	CodePath   = "/code/"
  	OutputPath = "/output/"
  	ResultPath = "/result/"
  	LogPath    = "/log/"
  	JobPath    = "/job/"

  	OrderDesc = "desc" // query downwards
  	OrderAsc  = "asc"  // query upwards
  	Lines     = 500

  	// Parameter names.
  	TrainUrl     = "train_url"
  	DataUrl      = "data_url"
  	MultiDataUrl = "multi_data_url"
  	ResultUrl    = "result_url"
  	CkptUrl      = "ckpt_url"
  	DeviceTarget = "device_target"
  	Ascend       = "Ascend"

  	PerPage = 10

  	// Version bookkeeping values.
  	IsLatestVersion   = "1"
  	NotLatestVersion  = "0"
  	VersionCountOne   = 1
  	SortByCreateTime  = "create_time"
  	ConfigTypeCustom  = "custom"
  	TotalVersionCount = 1
  )
  64. var (
  65. poolInfos *models.PoolInfos
  66. TrainFlavorInfos *Flavor
  67. SpecialPools *models.SpecialPools
  68. MultiNodeConfig *MultiNodes
  69. )
  70. type GenerateTrainJobReq struct {
  71. JobName string
  72. DisplayJobName string
  73. Uuid string
  74. Description string
  75. CodeObsPath string
  76. BootFile string
  77. BootFileUrl string
  78. DataUrl string
  79. TrainUrl string
  80. LogUrl string
  81. PoolID string
  82. WorkServerNumber int
  83. EngineID int64
  84. Parameters []models.Parameter
  85. CommitID string
  86. IsLatestVersion string
  87. Params string
  88. BranchName string
  89. PreVersionId int64
  90. PreVersionName string
  91. FlavorCode string
  92. FlavorName string
  93. VersionCount int
  94. EngineName string
  95. TotalVersionCount int
  96. UserImageUrl string
  97. UserCommand string
  98. DatasetName string
  99. Spec *models.Specification
  100. }
  101. type GenerateInferenceJobReq struct {
  102. JobName string
  103. DisplayJobName string
  104. Uuid string
  105. Description string
  106. CodeObsPath string
  107. BootFile string
  108. BootFileUrl string
  109. DataUrl string
  110. TrainUrl string
  111. LogUrl string
  112. PoolID string
  113. WorkServerNumber int
  114. EngineID int64
  115. Parameters []models.Parameter
  116. CommitID string
  117. Params string
  118. BranchName string
  119. FlavorName string
  120. EngineName string
  121. LabelName string
  122. IsLatestVersion string
  123. VersionCount int
  124. TotalVersionCount int
  125. ModelName string
  126. ModelVersion string
  127. CkptName string
  128. ResultUrl string
  129. Spec *models.Specification
  130. DatasetName string
  131. }
  // VersionInfo is the JSON shape {"version":[{"id":..,"value":..,"url":..}]}
  // describing a list of versions.
  type VersionInfo struct {
  	Version []struct {
  		ID    int    `json:"id"`
  		Value string `json:"value"`
  		Url   string `json:"url"`
  	} `json:"version"`
  }
  // Flavor is the JSON shape {"flavor":[{"code":..,"value":..,"unitPrice":..}]}
  // describing the selectable flavors.
  type Flavor struct {
  	Info []struct {
  		Code      string `json:"code"`
  		Value     string `json:"value"`
  		UnitPrice int64  `json:"unitPrice"`
  	} `json:"flavor"`
  }
  // Engine is the JSON shape {"engine":[{"id":..,"value":..}]} describing the
  // selectable engines.
  type Engine struct {
  	Info []struct {
  		ID    int    `json:"id"`
  		Value string `json:"value"`
  	} `json:"engine"`
  }
  // ResourcePool is the JSON shape {"resource_pool":[{"id":..,"value":..}]}
  // describing the selectable resource pools.
  type ResourcePool struct {
  	Info []struct {
  		ID    string `json:"id"`
  		Value string `json:"value"`
  	} `json:"resource_pool"`
  }
  158. type MultiNodes struct{
  159. Info []OrgMultiNode `json:"multinode"`
  160. }
  // OrgMultiNode pairs an organization ("org") with a list of integers
  // ("node"), presumably the node counts that organization may use.
  type OrgMultiNode struct {
  	Org  string `json:"org"`
  	Node []int  `json:"node"`
  }
  165. // type Parameter struct {
  166. // Label string `json:"label"`
  167. // Value string `json:"value"`
  168. // }
  169. // type Parameters struct {
  170. // Parameter []Parameter `json:"parameter"`
  171. // }
  // Parameters is the JSON shape {"parameter":[{"label":..,"value":..}]}: a
  // list of label/value pairs.
  type Parameters struct {
  	Parameter []struct {
  		Label string `json:"label"`
  		Value string `json:"value"`
  	} `json:"parameter"`
  }
  178. func GenerateTask(ctx *context.Context, jobName, uuid, description, flavor string) error {
  179. var dataActualPath string
  180. if uuid != "" {
  181. dataActualPath = setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/"
  182. } else {
  183. userPath := setting.UserBasePath + ctx.User.Name + "/"
  184. isExist, err := storage.ObsHasObject(userPath)
  185. if err != nil {
  186. log.Error("ObsHasObject failed:%v", err.Error(), ctx.Data["MsgID"])
  187. return err
  188. }
  189. if !isExist {
  190. if err = storage.ObsCreateObject(userPath); err != nil {
  191. log.Error("ObsCreateObject failed:%v", err.Error(), ctx.Data["MsgID"])
  192. return err
  193. }
  194. }
  195. dataActualPath = setting.Bucket + "/" + userPath
  196. }
  197. if poolInfos == nil {
  198. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  199. }
  200. createTime := timeutil.TimeStampNow()
  201. jobResult, err := CreateJob(models.CreateNotebookParams{
  202. JobName: jobName,
  203. Description: description,
  204. ProfileID: setting.ProfileID,
  205. Flavor: flavor,
  206. Pool: models.Pool{
  207. ID: poolInfos.PoolInfo[0].PoolId,
  208. Name: poolInfos.PoolInfo[0].PoolName,
  209. Type: poolInfos.PoolInfo[0].PoolType,
  210. },
  211. Spec: models.Spec{
  212. Storage: models.Storage{
  213. Type: storageTypeOBS,
  214. Location: models.Location{
  215. Path: dataActualPath,
  216. },
  217. },
  218. AutoStop: models.AutoStop{
  219. Enable: true,
  220. Duration: autoStopDuration,
  221. },
  222. },
  223. })
  224. if err != nil {
  225. log.Error("CreateJob failed: %v", err.Error())
  226. return err
  227. }
  228. err = models.CreateCloudbrain(&models.Cloudbrain{
  229. Status: string(models.JobWaiting),
  230. UserID: ctx.User.ID,
  231. RepoID: ctx.Repo.Repository.ID,
  232. JobID: jobResult.ID,
  233. JobName: jobName,
  234. JobType: string(models.JobTypeDebug),
  235. Type: models.TypeCloudBrainTwo,
  236. Uuid: uuid,
  237. ComputeResource: models.NPUResource,
  238. CreatedUnix: createTime,
  239. UpdatedUnix: createTime,
  240. })
  241. if err != nil {
  242. return err
  243. }
  244. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobResult.ID, jobName, models.ActionCreateDebugNPUTask)
  245. return nil
  246. }
  247. func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, description, imageId string, spec *models.Specification) error {
  248. if poolInfos == nil {
  249. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  250. }
  251. imageName, err := GetNotebookImageName(imageId)
  252. if err != nil {
  253. log.Error("GetNotebookImageName failed: %v", err.Error())
  254. return err
  255. }
  256. createTime := timeutil.TimeStampNow()
  257. jobResult, err := createNotebook2(models.CreateNotebook2Params{
  258. JobName: jobName,
  259. Description: description,
  260. Flavor: spec.SourceSpecId,
  261. Duration: autoStopDurationMs,
  262. ImageID: imageId,
  263. PoolID: poolInfos.PoolInfo[0].PoolId,
  264. Feature: models.NotebookFeature,
  265. Volume: models.VolumeReq{
  266. Capacity: setting.Capacity,
  267. Category: models.EVSCategory,
  268. Ownership: models.ManagedOwnership,
  269. },
  270. WorkspaceID: "0",
  271. })
  272. if err != nil {
  273. log.Error("createNotebook2 failed: %v", err.Error())
  274. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  275. log.Info("(%s)unknown error, set temp status", displayJobName)
  276. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  277. JobID: models.TempJobId,
  278. VersionID: models.TempVersionId,
  279. Status: models.TempJobStatus,
  280. Type: models.TypeCloudBrainTwo,
  281. JobName: jobName,
  282. JobType: string(models.JobTypeDebug),
  283. })
  284. if errTemp != nil {
  285. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  286. return errTemp
  287. }
  288. }
  289. return err
  290. }
  291. task := &models.Cloudbrain{
  292. Status: jobResult.Status,
  293. UserID: ctx.User.ID,
  294. RepoID: ctx.Repo.Repository.ID,
  295. JobID: jobResult.ID,
  296. JobName: jobName,
  297. FlavorCode: spec.SourceSpecId,
  298. DisplayJobName: displayJobName,
  299. JobType: string(models.JobTypeDebug),
  300. Type: models.TypeCloudBrainTwo,
  301. Uuid: uuid,
  302. ComputeResource: models.NPUResource,
  303. Image: imageName,
  304. Description: description,
  305. CreatedUnix: createTime,
  306. UpdatedUnix: createTime,
  307. Spec: spec,
  308. }
  309. err = models.CreateCloudbrain(task)
  310. if err != nil {
  311. return err
  312. }
  313. stringId := strconv.FormatInt(task.ID, 10)
  314. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateDebugNPUTask)
  315. return nil
  316. }
  317. func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) {
  318. createTime := timeutil.TimeStampNow()
  319. var jobResult *models.CreateTrainJobResult
  320. var createErr error
  321. if req.EngineID < 0 {
  322. jobResult, createErr = createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  323. JobName: req.JobName,
  324. Description: req.Description,
  325. Config: models.UserImageConfig{
  326. WorkServerNum: req.WorkServerNumber,
  327. AppUrl: req.CodeObsPath,
  328. BootFileUrl: req.BootFileUrl,
  329. DataUrl: req.DataUrl,
  330. TrainUrl: req.TrainUrl,
  331. LogUrl: req.LogUrl,
  332. PoolID: req.PoolID,
  333. CreateVersion: true,
  334. Flavor: models.Flavor{
  335. Code: req.Spec.SourceSpecId,
  336. },
  337. Parameter: req.Parameters,
  338. UserImageUrl: req.UserImageUrl,
  339. UserCommand: req.UserCommand,
  340. },
  341. })
  342. } else {
  343. jobResult, createErr = createTrainJob(models.CreateTrainJobParams{
  344. JobName: req.JobName,
  345. Description: req.Description,
  346. Config: models.Config{
  347. WorkServerNum: req.WorkServerNumber,
  348. AppUrl: req.CodeObsPath,
  349. BootFileUrl: req.BootFileUrl,
  350. DataUrl: req.DataUrl,
  351. EngineID: req.EngineID,
  352. TrainUrl: req.TrainUrl,
  353. LogUrl: req.LogUrl,
  354. PoolID: req.PoolID,
  355. CreateVersion: true,
  356. Flavor: models.Flavor{
  357. Code: req.Spec.SourceSpecId,
  358. },
  359. Parameter: req.Parameters,
  360. },
  361. })
  362. }
  363. if createErr != nil {
  364. log.Error("createTrainJob failed: %v", createErr.Error())
  365. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  366. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  367. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  368. JobID: models.TempJobId,
  369. VersionID: models.TempVersionId,
  370. Status: models.TempJobStatus,
  371. Type: models.TypeCloudBrainTwo,
  372. JobName: req.JobName,
  373. JobType: string(models.JobTypeTrain),
  374. })
  375. if errTemp != nil {
  376. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  377. return errTemp
  378. }
  379. }
  380. return createErr
  381. }
  382. jobId := strconv.FormatInt(jobResult.JobID, 10)
  383. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  384. Status: TransTrainJobStatus(jobResult.Status),
  385. UserID: ctx.User.ID,
  386. RepoID: ctx.Repo.Repository.ID,
  387. JobID: jobId,
  388. JobName: req.JobName,
  389. DisplayJobName: req.DisplayJobName,
  390. JobType: string(models.JobTypeTrain),
  391. Type: models.TypeCloudBrainTwo,
  392. VersionID: jobResult.VersionID,
  393. VersionName: jobResult.VersionName,
  394. Uuid: req.Uuid,
  395. DatasetName: req.DatasetName,
  396. CommitID: req.CommitID,
  397. IsLatestVersion: req.IsLatestVersion,
  398. ComputeResource: models.NPUResource,
  399. EngineID: req.EngineID,
  400. TrainUrl: req.TrainUrl,
  401. BranchName: req.BranchName,
  402. Parameters: req.Params,
  403. BootFile: req.BootFile,
  404. DataUrl: req.DataUrl,
  405. LogUrl: req.LogUrl,
  406. FlavorCode: req.Spec.SourceSpecId,
  407. Description: req.Description,
  408. WorkServerNumber: req.WorkServerNumber,
  409. FlavorName: req.FlavorName,
  410. EngineName: req.EngineName,
  411. VersionCount: req.VersionCount,
  412. TotalVersionCount: req.TotalVersionCount,
  413. CreatedUnix: createTime,
  414. UpdatedUnix: createTime,
  415. Spec: req.Spec,
  416. })
  417. if createErr != nil {
  418. log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, createErr.Error())
  419. return createErr
  420. }
  421. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobId, req.DisplayJobName, models.ActionCreateTrainTask)
  422. return nil
  423. }
  424. func GenerateModelConvertTrainJob(req *GenerateTrainJobReq) (*models.CreateTrainJobResult, error) {
  425. return createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  426. JobName: req.JobName,
  427. Description: req.Description,
  428. Config: models.UserImageConfig{
  429. WorkServerNum: req.WorkServerNumber,
  430. AppUrl: req.CodeObsPath,
  431. BootFileUrl: req.BootFileUrl,
  432. DataUrl: req.DataUrl,
  433. TrainUrl: req.TrainUrl,
  434. LogUrl: req.LogUrl,
  435. PoolID: req.PoolID,
  436. CreateVersion: true,
  437. Flavor: models.Flavor{
  438. Code: req.FlavorCode,
  439. },
  440. Parameter: req.Parameters,
  441. UserImageUrl: req.UserImageUrl,
  442. UserCommand: req.UserCommand,
  443. },
  444. })
  445. }
  446. func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) {
  447. createTime := timeutil.TimeStampNow()
  448. var jobResult *models.CreateTrainJobResult
  449. var createErr error
  450. if req.EngineID < 0 {
  451. jobResult, createErr = createTrainJobVersionUserImage(models.CreateTrainJobVersionUserImageParams{
  452. Description: req.Description,
  453. Config: models.TrainJobVersionUserImageConfig{
  454. WorkServerNum: req.WorkServerNumber,
  455. AppUrl: req.CodeObsPath,
  456. BootFileUrl: req.BootFileUrl,
  457. DataUrl: req.DataUrl,
  458. TrainUrl: req.TrainUrl,
  459. LogUrl: req.LogUrl,
  460. PoolID: req.PoolID,
  461. Flavor: models.Flavor{
  462. Code: req.Spec.SourceSpecId,
  463. },
  464. Parameter: req.Parameters,
  465. PreVersionId: req.PreVersionId,
  466. UserImageUrl: req.UserImageUrl,
  467. UserCommand: req.UserCommand,
  468. },
  469. }, jobId)
  470. } else {
  471. jobResult, createErr = createTrainJobVersion(models.CreateTrainJobVersionParams{
  472. Description: req.Description,
  473. Config: models.TrainJobVersionConfig{
  474. WorkServerNum: req.WorkServerNumber,
  475. AppUrl: req.CodeObsPath,
  476. BootFileUrl: req.BootFileUrl,
  477. DataUrl: req.DataUrl,
  478. EngineID: req.EngineID,
  479. TrainUrl: req.TrainUrl,
  480. LogUrl: req.LogUrl,
  481. PoolID: req.PoolID,
  482. Flavor: models.Flavor{
  483. Code: req.Spec.SourceSpecId,
  484. },
  485. Parameter: req.Parameters,
  486. PreVersionId: req.PreVersionId,
  487. },
  488. }, jobId)
  489. }
  490. if createErr != nil {
  491. log.Error("createTrainJobVersion failed: %v", createErr.Error())
  492. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  493. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  494. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  495. JobID: jobId,
  496. VersionID: models.TempVersionId,
  497. Status: models.TempJobStatus,
  498. Type: models.TypeCloudBrainTwo,
  499. JobName: req.JobName,
  500. JobType: string(models.JobTypeTrain),
  501. })
  502. if errTemp != nil {
  503. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  504. return errTemp
  505. }
  506. }
  507. return createErr
  508. }
  509. var jobTypes []string
  510. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  511. repo := ctx.Repo.Repository
  512. VersionTaskList, VersionListCount, createErr := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  513. RepoID: repo.ID,
  514. Type: models.TypeCloudBrainTwo,
  515. JobTypes: jobTypes,
  516. JobID: strconv.FormatInt(jobResult.JobID, 10),
  517. })
  518. if createErr != nil {
  519. ctx.ServerError("Cloudbrain", createErr)
  520. return createErr
  521. }
  522. //将当前版本的isLatestVersion设置为"1"和任务数量更新,任务数量包括当前版本数VersionCount和历史创建的总版本数TotalVersionCount
  523. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  524. Status: TransTrainJobStatus(jobResult.Status),
  525. UserID: ctx.User.ID,
  526. RepoID: ctx.Repo.Repository.ID,
  527. JobID: strconv.FormatInt(jobResult.JobID, 10),
  528. JobName: req.JobName,
  529. DisplayJobName: req.DisplayJobName,
  530. JobType: string(models.JobTypeTrain),
  531. Type: models.TypeCloudBrainTwo,
  532. VersionID: jobResult.VersionID,
  533. VersionName: jobResult.VersionName,
  534. Uuid: req.Uuid,
  535. DatasetName: req.DatasetName,
  536. CommitID: req.CommitID,
  537. IsLatestVersion: req.IsLatestVersion,
  538. PreVersionName: req.PreVersionName,
  539. ComputeResource: models.NPUResource,
  540. EngineID: req.EngineID,
  541. TrainUrl: req.TrainUrl,
  542. BranchName: req.BranchName,
  543. Parameters: req.Params,
  544. BootFile: req.BootFile,
  545. DataUrl: req.DataUrl,
  546. LogUrl: req.LogUrl,
  547. PreVersionId: req.PreVersionId,
  548. FlavorCode: req.Spec.SourceSpecId,
  549. Description: req.Description,
  550. WorkServerNumber: req.WorkServerNumber,
  551. FlavorName: req.FlavorName,
  552. EngineName: req.EngineName,
  553. TotalVersionCount: VersionTaskList[0].TotalVersionCount + 1,
  554. VersionCount: VersionListCount + 1,
  555. CreatedUnix: createTime,
  556. UpdatedUnix: createTime,
  557. Spec: req.Spec,
  558. })
  559. if createErr != nil {
  560. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, createErr.Error())
  561. return createErr
  562. }
  563. //将训练任务的上一版本的isLatestVersion设置为"0"
  564. createErr = models.SetVersionCountAndLatestVersion(strconv.FormatInt(jobResult.JobID, 10), VersionTaskList[0].VersionName, VersionCountOne, NotLatestVersion, TotalVersionCount)
  565. if createErr != nil {
  566. ctx.ServerError("Update IsLatestVersion failed", createErr)
  567. return createErr
  568. }
  569. return createErr
  570. }
  // TransTrainJobStatus converts a numeric train-job status code into its
  // status name. Codes 0 through 21 have names; any other value is returned as
  // its decimal string.
  func TransTrainJobStatus(status int) string {
  	// The index in this table is the status code.
  	names := [...]string{
  		"UNKNOWN",
  		"INIT",
  		"IMAGE_CREATING",
  		"IMAGE_FAILED",
  		"SUBMIT_TRYING",
  		"SUBMIT_FAILED",
  		"DELETE_FAILED",
  		"WAITING",
  		"RUNNING",
  		"KILLING",
  		"COMPLETED",
  		"FAILED",
  		"KILLED",
  		"CANCELED",
  		"LOST",
  		"SCALING",
  		"SUBMIT_MODEL_FAILED",
  		"DEPLOY_SERVICE_FAILED",
  		"CHECK_INIT",
  		"CHECK_RUNNING",
  		"CHECK_RUNNING_COMPLETED",
  		"CHECK_FAILED",
  	}
  	if status < 0 || status >= len(names) {
  		return strconv.Itoa(status)
  	}
  	return names[status]
  }
  // GetOutputPathByCount returns the version directory name for the given
  // count: "V" followed by the count zero-padded to at least four digits
  // (1 -> "V0001").
  func GetOutputPathByCount(TotalVersionCount int) (VersionOutputPath string) {
  	return fmt.Sprintf("V%04d", TotalVersionCount)
  }
  626. func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (err error) {
  627. createTime := timeutil.TimeStampNow()
  628. jobResult, err := createInferenceJob(models.CreateInferenceJobParams{
  629. JobName: req.JobName,
  630. Description: req.Description,
  631. InfConfig: models.InfConfig{
  632. WorkServerNum: req.WorkServerNumber,
  633. AppUrl: req.CodeObsPath,
  634. BootFileUrl: req.BootFileUrl,
  635. DataUrl: req.DataUrl,
  636. EngineID: req.EngineID,
  637. // TrainUrl: req.TrainUrl,
  638. LogUrl: req.LogUrl,
  639. PoolID: req.PoolID,
  640. CreateVersion: true,
  641. Flavor: models.Flavor{
  642. Code: req.Spec.SourceSpecId,
  643. },
  644. Parameter: req.Parameters,
  645. },
  646. })
  647. if err != nil {
  648. log.Error("createInferenceJob failed: %v", err.Error())
  649. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  650. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  651. err = models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  652. JobID: models.TempJobId,
  653. VersionID: models.TempVersionId,
  654. Status: models.TempJobStatus,
  655. Type: models.TypeCloudBrainTwo,
  656. JobName: req.JobName,
  657. JobType: string(models.JobTypeInference),
  658. })
  659. if err != nil {
  660. log.Error("InsertCloudbrainTemp failed: %v", err.Error())
  661. return err
  662. }
  663. }
  664. return err
  665. }
  666. // attach, err := models.GetAttachmentByUUID(req.Uuid)
  667. // if err != nil {
  668. // log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error())
  669. // return err
  670. // }
  671. jobID := strconv.FormatInt(jobResult.JobID, 10)
  672. err = models.CreateCloudbrain(&models.Cloudbrain{
  673. Status: TransTrainJobStatus(jobResult.Status),
  674. UserID: ctx.User.ID,
  675. RepoID: ctx.Repo.Repository.ID,
  676. JobID: jobID,
  677. JobName: req.JobName,
  678. DisplayJobName: req.DisplayJobName,
  679. JobType: string(models.JobTypeInference),
  680. Type: models.TypeCloudBrainTwo,
  681. VersionID: jobResult.VersionID,
  682. VersionName: jobResult.VersionName,
  683. Uuid: req.Uuid,
  684. DatasetName: req.DatasetName,
  685. CommitID: req.CommitID,
  686. EngineID: req.EngineID,
  687. TrainUrl: req.TrainUrl,
  688. BranchName: req.BranchName,
  689. Parameters: req.Params,
  690. BootFile: req.BootFile,
  691. DataUrl: req.DataUrl,
  692. LogUrl: req.LogUrl,
  693. FlavorCode: req.Spec.SourceSpecId,
  694. Description: req.Description,
  695. WorkServerNumber: req.WorkServerNumber,
  696. FlavorName: req.FlavorName,
  697. EngineName: req.EngineName,
  698. LabelName: req.LabelName,
  699. IsLatestVersion: req.IsLatestVersion,
  700. ComputeResource: models.NPUResource,
  701. VersionCount: req.VersionCount,
  702. TotalVersionCount: req.TotalVersionCount,
  703. ModelName: req.ModelName,
  704. ModelVersion: req.ModelVersion,
  705. CkptName: req.CkptName,
  706. ResultUrl: req.ResultUrl,
  707. CreatedUnix: createTime,
  708. UpdatedUnix: createTime,
  709. Spec: req.Spec,
  710. })
  711. if err != nil {
  712. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
  713. return err
  714. }
  715. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateInferenceTask)
  716. return nil
  717. }
  718. func GetNotebookImageName(imageId string) (string, error) {
  719. var validImage = false
  720. var imageName = ""
  721. for _, imageInfo := range setting.StImageInfos.ImageInfo {
  722. if imageInfo.Id == imageId {
  723. validImage = true
  724. imageName = imageInfo.Value
  725. }
  726. }
  727. if !validImage {
  728. log.Error("the image id(%s) is invalid", imageId)
  729. return imageName, errors.New("the image id is invalid")
  730. }
  731. return imageName, nil
  732. }
  733. func InitSpecialPool() {
  734. if SpecialPools == nil && setting.ModelArtsSpecialPools != "" {
  735. json.Unmarshal([]byte(setting.ModelArtsSpecialPools), &SpecialPools)
  736. }
  737. }
  738. func InitMultiNode(){
  739. if MultiNodeConfig ==nil && setting.ModelArtsMultiNode!=""{
  740. json.Unmarshal([]byte(setting.ModelArtsMultiNode), &MultiNodeConfig)
  741. }
  742. }
  743. func HandleTrainJobInfo(task *models.Cloudbrain) error {
  744. result, err := GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
  745. if err != nil {
  746. log.Error("GetTrainJob(%s) failed:%v", task.DisplayJobName, err)
  747. return err
  748. }
  749. if result != nil {
  750. oldStatus := task.Status
  751. task.Status = TransTrainJobStatus(result.IntStatus)
  752. task.Duration = result.Duration / 1000
  753. task.TrainJobDuration = result.TrainJobDuration
  754. if task.StartTime == 0 && result.StartTime > 0 {
  755. task.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
  756. }
  757. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  758. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  759. task.EndTime = task.StartTime.Add(task.Duration)
  760. }
  761. task.CorrectCreateUnix()
  762. if oldStatus != task.Status {
  763. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  764. }
  765. err = models.UpdateJob(task)
  766. if err != nil {
  767. log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
  768. return err
  769. }
  770. }
  771. return nil
  772. }
  773. func HandleNotebookInfo(task *models.Cloudbrain) error {
  774. var result *models.GetNotebook2Result
  775. var err error
  776. if task.Type == models.TypeCloudBrainTwo {
  777. result, err = GetNotebook2(task.JobID)
  778. } else if task.Type == models.TypeCDCenter {
  779. result, err = modelarts_cd.GetNotebook(task.JobID)
  780. }
  781. if err != nil {
  782. log.Error("GetNotebook2(%s) failed:%v", task.DisplayJobName, err)
  783. return err
  784. }
  785. if result != nil {
  786. oldStatus := task.Status
  787. task.Status = result.Status
  788. if task.StartTime == 0 && result.Lease.UpdateTime > 0 {
  789. task.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000)
  790. }
  791. if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) {
  792. task.EndTime = timeutil.TimeStampNow()
  793. }
  794. task.CorrectCreateUnix()
  795. task.ComputeAndSetDuration()
  796. if oldStatus != task.Status {
  797. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  798. }
  799. if task.FlavorCode == "" {
  800. task.FlavorCode = result.Flavor
  801. }
  802. err = models.UpdateJob(task)
  803. if err != nil {
  804. log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err)
  805. return err
  806. }
  807. }
  808. return nil
  809. }
  810. func SyncTempStatusJob() {
  811. jobs, err := models.GetCloudBrainTempJobs()
  812. if err != nil {
  813. log.Error("GetCloudBrainTempJobs failed:%v", err.Error())
  814. return
  815. }
  816. for _, temp := range jobs {
  817. log.Info("start to handle record: %s", temp.JobName)
  818. if temp.Type == models.TypeCloudBrainTwo {
  819. if temp.JobType == string(models.JobTypeDebug) {
  820. err = handleNotebook(temp)
  821. if err != nil {
  822. log.Error("handleNotebook falied:%v", err)
  823. break
  824. }
  825. } else if temp.JobType == string(models.JobTypeTrain) || temp.JobType == string(models.JobTypeInference) {
  826. _, err = models.GetCloudbrainByJobID(temp.JobID)
  827. if err != nil {
  828. //one version
  829. err = handleTrainJob(temp)
  830. if err != nil {
  831. log.Error("handleTrainJob falied:%v", err)
  832. break
  833. }
  834. } else {
  835. //multi version
  836. err = handleTrainJobMultiVersion(temp)
  837. if err != nil {
  838. log.Error("handleTrainJobMultiVersion falied:%v", err)
  839. break
  840. }
  841. }
  842. }
  843. }
  844. }
  845. return
  846. }
  847. func handleNotebook(temp *models.CloudbrainTemp) error {
  848. if temp.Status == models.TempJobStatus {
  849. err := handleTempNotebook(temp)
  850. if err != nil {
  851. log.Error("handleTempNotebook failed:%v", err)
  852. return err
  853. }
  854. } else if temp.Status == string(models.ModelArtsStopping) {
  855. res, err := GetNotebook2(temp.JobID)
  856. if err != nil {
  857. log.Error("GetNotebook2 failed:%v", err)
  858. return err
  859. }
  860. temp.Status = res.Status
  861. if temp.Status == string(models.ModelArtsStopped) {
  862. err = models.UpdateCloudbrainTemp(temp)
  863. if err != nil {
  864. log.Error("UpdateCloudbrainTemp failed:%v", err)
  865. return err
  866. }
  867. _, err := DelNotebook2(temp.JobID)
  868. if err != nil {
  869. log.Error("DelNotebook2 failed:%v", err)
  870. return err
  871. }
  872. temp.Status = string(models.ModelArtsDeleted)
  873. err = models.UpdateCloudbrainTemp(temp)
  874. if err != nil {
  875. log.Error("UpdateCloudbrainTemp failed:%v", err)
  876. return err
  877. }
  878. }
  879. }
  880. return nil
  881. }
  882. func handleTempNotebook(temp *models.CloudbrainTemp) error {
  883. var err error
  884. var isExist bool
  885. for {
  886. result, err := GetNotebookList(1000, 0, "createTime", "DESC", temp.JobName)
  887. if err != nil {
  888. log.Error("GetNotebookList failed:%v", err)
  889. break
  890. }
  891. temp.QueryTimes++
  892. err = models.UpdateCloudbrainTemp(temp)
  893. if err != nil {
  894. log.Error("UpdateCloudbrainTemp failed:%v", err)
  895. }
  896. if result != nil {
  897. for _, notebook := range result.NotebookList {
  898. if temp.JobID == models.TempJobId {
  899. //new notebook
  900. if notebook.JobName == temp.JobName {
  901. isExist = true
  902. temp.Status = notebook.Status
  903. temp.JobID = notebook.JobID
  904. break
  905. }
  906. } else {
  907. //restart: always can find one record
  908. if notebook.JobName == temp.JobName {
  909. if notebook.Status != string(models.ModelArtsStopped) {
  910. isExist = true
  911. temp.Status = notebook.Status
  912. temp.JobID = notebook.JobID
  913. break
  914. }
  915. }
  916. }
  917. }
  918. if isExist {
  919. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  920. if temp.Status == string(models.ModelArtsCreateFailed) {
  921. err = models.UpdateCloudbrainTemp(temp)
  922. if err != nil {
  923. log.Error("UpdateCloudbrainTemp failed:%v", err)
  924. break
  925. }
  926. _, err := DelNotebook2(temp.JobID)
  927. if err != nil {
  928. log.Error("DelNotebook2(%s) failed:%v", temp.JobName, err)
  929. break
  930. }
  931. temp.Status = string(models.ModelArtsDeleted)
  932. } else {
  933. _, err := ManageNotebook2(temp.JobID, models.NotebookAction{Action: models.ActionStop})
  934. if err != nil {
  935. log.Error("ManageNotebook2(%s) failed:%v", temp.JobName, err)
  936. break
  937. }
  938. temp.Status = string(models.ModelArtsStopping)
  939. }
  940. models.UpdateCloudbrainTemp(temp)
  941. } else {
  942. log.Error("can not find the record(%s) till now", temp.JobName)
  943. err = errors.New("not found")
  944. break
  945. }
  946. } else {
  947. log.Error("can not find the record(%s) till now", temp.JobName)
  948. err = errors.New("not found")
  949. break
  950. }
  951. break
  952. }
  953. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  954. log.Info("reach MaxTempQueryTimes, set the job failed")
  955. temp.Status = string(models.ModelArtsTrainJobFailed)
  956. err = models.UpdateCloudbrainTemp(temp)
  957. if err != nil {
  958. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  959. return err
  960. }
  961. }
  962. return err
  963. }
  964. func handleTrainJob(temp *models.CloudbrainTemp) error {
  965. if temp.Status == models.TempJobStatus {
  966. err := handleTempTrainJob(temp)
  967. if err != nil {
  968. log.Error("handleTempTrainJob failed:%v", err)
  969. return err
  970. }
  971. } else if temp.Status == string(models.ModelArtsTrainJobKilling) {
  972. res, err := GetTrainJob(temp.JobID, temp.VersionID)
  973. if err != nil {
  974. log.Error("GetTrainJob failed:%v", err)
  975. return err
  976. }
  977. temp.Status = TransTrainJobStatus(res.IntStatus)
  978. if temp.Status == string(models.ModelArtsTrainJobKilled) {
  979. err = models.UpdateCloudbrainTemp(temp)
  980. if err != nil {
  981. log.Error("UpdateCloudbrainTemp failed:%v", err)
  982. return err
  983. }
  984. _, err := DelTrainJob(temp.JobID)
  985. if err != nil {
  986. log.Error("DelTrainJob failed:%v", err)
  987. return err
  988. }
  989. temp.Status = string(models.ModelArtsDeleted)
  990. err = models.UpdateCloudbrainTemp(temp)
  991. if err != nil {
  992. log.Error("UpdateCloudbrainTemp failed:%v", err)
  993. return err
  994. }
  995. }
  996. }
  997. return nil
  998. }
  999. func handleTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
  1000. if temp.Status == models.TempJobStatus {
  1001. err := handleTempTrainJobMultiVersion(temp)
  1002. if err != nil {
  1003. log.Error("handleTempTrainJobMultiVersion failed:%v", err)
  1004. return err
  1005. }
  1006. } else if temp.Status == string(models.ModelArtsTrainJobKilling) {
  1007. res, err := GetTrainJob(temp.JobID, temp.VersionID)
  1008. if err != nil {
  1009. log.Error("GetTrainJob failed:%v", err)
  1010. return err
  1011. }
  1012. temp.Status = TransTrainJobStatus(res.IntStatus)
  1013. if temp.Status == string(models.ModelArtsTrainJobKilled) {
  1014. err = models.UpdateCloudbrainTemp(temp)
  1015. if err != nil {
  1016. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1017. return err
  1018. }
  1019. _, err := DelTrainJobVersion(temp.JobID, temp.VersionID)
  1020. if err != nil {
  1021. log.Error("DelTrainJob failed:%v", err)
  1022. return err
  1023. }
  1024. temp.Status = string(models.ModelArtsDeleted)
  1025. err = models.UpdateCloudbrainTemp(temp)
  1026. if err != nil {
  1027. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1028. return err
  1029. }
  1030. }
  1031. }
  1032. return nil
  1033. }
  1034. func handleTempTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
  1035. var err error
  1036. var isExist bool
  1037. for {
  1038. result, err := GetTrainJobVersionList(1000, 1, temp.JobID)
  1039. if err != nil {
  1040. log.Error("GetTrainJobVersionList failed:%v", err)
  1041. break
  1042. }
  1043. temp.QueryTimes++
  1044. err = models.UpdateCloudbrainTemp(temp)
  1045. if err != nil {
  1046. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1047. }
  1048. if result != nil {
  1049. count, _ := models.GetCloudbrainCountByJobName(temp.JobName, temp.JobType, temp.Type)
  1050. if result.VersionCount == int64(count+1) {
  1051. isExist = true
  1052. temp.Status = TransTrainJobStatus(result.JobVersionList[0].IntStatus)
  1053. temp.VersionID = strconv.FormatInt(result.JobVersionList[0].VersionID, 10)
  1054. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  1055. _, err := StopTrainJob(temp.JobID, temp.VersionID)
  1056. if err != nil {
  1057. log.Error("StopTrainJob failed:%v", err)
  1058. break
  1059. }
  1060. temp.Status = string(models.ModelArtsTrainJobKilling)
  1061. err = models.UpdateCloudbrainTemp(temp)
  1062. if err != nil {
  1063. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1064. break
  1065. }
  1066. } else {
  1067. log.Error("can not find the record(%s) till now", temp.JobName)
  1068. err = errors.New("not found")
  1069. break
  1070. }
  1071. }
  1072. break
  1073. }
  1074. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1075. log.Info("reach MaxTempQueryTimes, set the job failed")
  1076. temp.Status = string(models.ModelArtsTrainJobFailed)
  1077. err = models.UpdateCloudbrainTemp(temp)
  1078. if err != nil {
  1079. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1080. return err
  1081. }
  1082. }
  1083. return err
  1084. }
  1085. func handleTempTrainJob(temp *models.CloudbrainTemp) error {
  1086. var err error
  1087. var isExist bool
  1088. for {
  1089. result, err := GetTrainJobList(1000, 1, "create_time", "desc", temp.JobName)
  1090. if err != nil {
  1091. log.Error("GetTrainJobList failed:%v", err)
  1092. break
  1093. }
  1094. temp.QueryTimes++
  1095. err = models.UpdateCloudbrainTemp(temp)
  1096. if err != nil {
  1097. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1098. }
  1099. if result != nil {
  1100. for _, job := range result.JobList {
  1101. if temp.JobName == job.JobName && TransTrainJobStatus(job.IntStatus) != string(models.ModelArtsTrainJobFailed) {
  1102. isExist = true
  1103. temp.Status = TransTrainJobStatus(job.IntStatus)
  1104. temp.JobID = strconv.FormatInt(job.JobID, 10)
  1105. temp.VersionID = strconv.FormatInt(job.VersionID, 10)
  1106. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  1107. _, err = StopTrainJob(temp.JobID, temp.VersionID)
  1108. if err != nil {
  1109. log.Error("StopTrainJob(%s) failed:%v", temp.JobName, err)
  1110. break
  1111. }
  1112. temp.Status = string(models.ModelArtsTrainJobKilling)
  1113. err = models.UpdateCloudbrainTemp(temp)
  1114. if err != nil {
  1115. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1116. break
  1117. }
  1118. }
  1119. }
  1120. if !isExist {
  1121. log.Error("can not find the record(%s) till now", temp.JobName)
  1122. err = errors.New("not found")
  1123. break
  1124. }
  1125. }
  1126. break
  1127. }
  1128. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1129. log.Info("reach MaxTempQueryTimes, set the job failed")
  1130. temp.Status = string(models.ModelArtsTrainJobFailed)
  1131. err = models.UpdateCloudbrainTemp(temp)
  1132. if err != nil {
  1133. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1134. return err
  1135. }
  1136. }
  1137. return err
  1138. }