You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

modelarts.go 35 kB

4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242
  1. package modelarts
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "fmt"
  6. "path"
  7. "strconv"
  8. "strings"
  9. "code.gitea.io/gitea/models"
  10. "code.gitea.io/gitea/modules/context"
  11. "code.gitea.io/gitea/modules/log"
  12. "code.gitea.io/gitea/modules/notification"
  13. "code.gitea.io/gitea/modules/setting"
  14. "code.gitea.io/gitea/modules/storage"
  15. "code.gitea.io/gitea/modules/timeutil"
  16. )
  17. const (
  18. //notebook
  19. storageTypeOBS = "obs"
  20. autoStopDuration = 4 * 60 * 60
  21. autoStopDurationMs = 4 * 60 * 60 * 1000
  22. MORDELART_USER_IMAGE_ENGINE_ID = -1
  23. DataSetMountPath = "/home/ma-user/work"
  24. NotebookEnv = "Python3"
  25. NotebookType = "Ascend"
  26. FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)"
  27. //train-job
  28. // ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}"
  29. // Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}"
  30. // EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," +
  31. // "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," +
  32. // "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," +
  33. // "{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" +
  34. // "]}"
  35. // TrainJobFlavorInfo = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," +
  36. // "{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," +
  37. // "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," +
  38. // "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" +
  39. // "]}"
  40. CodePath = "/code/"
  41. OutputPath = "/output/"
  42. ResultPath = "/result/"
  43. LogPath = "/log/"
  44. JobPath = "/job/"
  45. OrderDesc = "desc" //向下查询
  46. OrderAsc = "asc" //向上查询
  47. Lines = 500
  48. TrainUrl = "train_url"
  49. DataUrl = "data_url"
  50. MultiDataUrl = "multi_data_url"
  51. ResultUrl = "result_url"
  52. CkptUrl = "ckpt_url"
  53. DeviceTarget = "device_target"
  54. Ascend = "Ascend"
  55. PerPage = 10
  56. IsLatestVersion = "1"
  57. NotLatestVersion = "0"
  58. VersionCountOne = 1
  59. SortByCreateTime = "create_time"
  60. ConfigTypeCustom = "custom"
  61. TotalVersionCount = 1
  62. )
  63. var (
  64. poolInfos *models.PoolInfos
  65. FlavorInfos *models.FlavorInfos
  66. ImageInfos *models.ImageInfosModelArts
  67. TrainFlavorInfos *Flavor
  68. SpecialPools *models.SpecialPools
  69. MultiNodeConfig *MultiNodes
  70. )
  71. type GenerateTrainJobReq struct {
  72. JobName string
  73. DisplayJobName string
  74. Uuid string
  75. Description string
  76. CodeObsPath string
  77. BootFile string
  78. BootFileUrl string
  79. DataUrl string
  80. TrainUrl string
  81. FlavorCode string
  82. LogUrl string
  83. PoolID string
  84. WorkServerNumber int
  85. EngineID int64
  86. Parameters []models.Parameter
  87. CommitID string
  88. IsLatestVersion string
  89. Params string
  90. BranchName string
  91. PreVersionId int64
  92. PreVersionName string
  93. FlavorName string
  94. VersionCount int
  95. EngineName string
  96. TotalVersionCount int
  97. UserImageUrl string
  98. UserCommand string
  99. DatasetName string
  100. }
  101. type GenerateInferenceJobReq struct {
  102. JobName string
  103. DisplayJobName string
  104. Uuid string
  105. Description string
  106. CodeObsPath string
  107. BootFile string
  108. BootFileUrl string
  109. DataUrl string
  110. TrainUrl string
  111. FlavorCode string
  112. LogUrl string
  113. PoolID string
  114. WorkServerNumber int
  115. EngineID int64
  116. Parameters []models.Parameter
  117. CommitID string
  118. Params string
  119. BranchName string
  120. FlavorName string
  121. EngineName string
  122. LabelName string
  123. IsLatestVersion string
  124. VersionCount int
  125. TotalVersionCount int
  126. ModelName string
  127. ModelVersion string
  128. CkptName string
  129. ResultUrl string
  130. DatasetName string
  131. }
  132. type VersionInfo struct {
  133. Version []struct {
  134. ID int `json:"id"`
  135. Value string `json:"value"`
  136. Url string `json:"url"`
  137. } `json:"version"`
  138. }
  139. type Flavor struct {
  140. Info []struct {
  141. Code string `json:"code"`
  142. Value string `json:"value"`
  143. } `json:"flavor"`
  144. }
  145. type Engine struct {
  146. Info []struct {
  147. ID int `json:"id"`
  148. Value string `json:"value"`
  149. } `json:"engine"`
  150. }
  151. type ResourcePool struct {
  152. Info []struct {
  153. ID string `json:"id"`
  154. Value string `json:"value"`
  155. } `json:"resource_pool"`
  156. }
  157. type MultiNodes struct{
  158. Info []OrgMultiNode `json:"multinode"`
  159. }
  160. type OrgMultiNode struct{
  161. Org string `json:"org"`
  162. Node []int `json:"node"`
  163. }
  164. // type Parameter struct {
  165. // Label string `json:"label"`
  166. // Value string `json:"value"`
  167. // }
  168. // type Parameters struct {
  169. // Parameter []Parameter `json:"parameter"`
  170. // }
  171. type Parameters struct {
  172. Parameter []struct {
  173. Label string `json:"label"`
  174. Value string `json:"value"`
  175. } `json:"parameter"`
  176. }
  177. func GenerateTask(ctx *context.Context, jobName, uuid, description, flavor string) error {
  178. var dataActualPath string
  179. if uuid != "" {
  180. dataActualPath = setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/"
  181. } else {
  182. userPath := setting.UserBasePath + ctx.User.Name + "/"
  183. isExist, err := storage.ObsHasObject(userPath)
  184. if err != nil {
  185. log.Error("ObsHasObject failed:%v", err.Error(), ctx.Data["MsgID"])
  186. return err
  187. }
  188. if !isExist {
  189. if err = storage.ObsCreateObject(userPath); err != nil {
  190. log.Error("ObsCreateObject failed:%v", err.Error(), ctx.Data["MsgID"])
  191. return err
  192. }
  193. }
  194. dataActualPath = setting.Bucket + "/" + userPath
  195. }
  196. if poolInfos == nil {
  197. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  198. }
  199. createTime := timeutil.TimeStampNow()
  200. jobResult, err := CreateJob(models.CreateNotebookParams{
  201. JobName: jobName,
  202. Description: description,
  203. ProfileID: setting.ProfileID,
  204. Flavor: flavor,
  205. Pool: models.Pool{
  206. ID: poolInfos.PoolInfo[0].PoolId,
  207. Name: poolInfos.PoolInfo[0].PoolName,
  208. Type: poolInfos.PoolInfo[0].PoolType,
  209. },
  210. Spec: models.Spec{
  211. Storage: models.Storage{
  212. Type: storageTypeOBS,
  213. Location: models.Location{
  214. Path: dataActualPath,
  215. },
  216. },
  217. AutoStop: models.AutoStop{
  218. Enable: true,
  219. Duration: autoStopDuration,
  220. },
  221. },
  222. })
  223. if err != nil {
  224. log.Error("CreateJob failed: %v", err.Error())
  225. return err
  226. }
  227. err = models.CreateCloudbrain(&models.Cloudbrain{
  228. Status: string(models.JobWaiting),
  229. UserID: ctx.User.ID,
  230. RepoID: ctx.Repo.Repository.ID,
  231. JobID: jobResult.ID,
  232. JobName: jobName,
  233. JobType: string(models.JobTypeDebug),
  234. Type: models.TypeCloudBrainTwo,
  235. Uuid: uuid,
  236. ComputeResource: models.NPUResource,
  237. CreatedUnix: createTime,
  238. UpdatedUnix: createTime,
  239. })
  240. if err != nil {
  241. return err
  242. }
  243. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobResult.ID, jobName, models.ActionCreateDebugNPUTask)
  244. return nil
  245. }
  246. func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, description, flavor, imageId string) error {
  247. if poolInfos == nil {
  248. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  249. }
  250. imageName, err := GetNotebookImageName(imageId)
  251. if err != nil {
  252. log.Error("GetNotebookImageName failed: %v", err.Error())
  253. return err
  254. }
  255. createTime := timeutil.TimeStampNow()
  256. jobResult, err := createNotebook2(models.CreateNotebook2Params{
  257. JobName: jobName,
  258. Description: description,
  259. Flavor: flavor,
  260. Duration: autoStopDurationMs,
  261. ImageID: imageId,
  262. PoolID: poolInfos.PoolInfo[0].PoolId,
  263. Feature: models.NotebookFeature,
  264. Volume: models.VolumeReq{
  265. Capacity: setting.Capacity,
  266. Category: models.EVSCategory,
  267. Ownership: models.ManagedOwnership,
  268. },
  269. WorkspaceID: "0",
  270. })
  271. if err != nil {
  272. log.Error("createNotebook2 failed: %v", err.Error())
  273. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  274. log.Info("(%s)unknown error, set temp status", displayJobName)
  275. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  276. JobID: models.TempJobId,
  277. VersionID: models.TempVersionId,
  278. Status: models.TempJobStatus,
  279. Type: models.TypeCloudBrainTwo,
  280. JobName: jobName,
  281. JobType: string(models.JobTypeDebug),
  282. })
  283. if errTemp != nil {
  284. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  285. return errTemp
  286. }
  287. }
  288. return err
  289. }
  290. task := &models.Cloudbrain{
  291. Status: jobResult.Status,
  292. UserID: ctx.User.ID,
  293. RepoID: ctx.Repo.Repository.ID,
  294. JobID: jobResult.ID,
  295. JobName: jobName,
  296. FlavorCode: flavor,
  297. DisplayJobName: displayJobName,
  298. JobType: string(models.JobTypeDebug),
  299. Type: models.TypeCloudBrainTwo,
  300. Uuid: uuid,
  301. ComputeResource: models.NPUResource,
  302. Image: imageName,
  303. Description: description,
  304. CreatedUnix: createTime,
  305. UpdatedUnix: createTime,
  306. }
  307. err = models.CreateCloudbrain(task)
  308. if err != nil {
  309. return err
  310. }
  311. stringId := strconv.FormatInt(task.ID, 10)
  312. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateDebugNPUTask)
  313. return nil
  314. }
  315. func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) {
  316. createTime := timeutil.TimeStampNow()
  317. var jobResult *models.CreateTrainJobResult
  318. var createErr error
  319. if req.EngineID < 0 {
  320. jobResult, createErr = createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  321. JobName: req.JobName,
  322. Description: req.Description,
  323. Config: models.UserImageConfig{
  324. WorkServerNum: req.WorkServerNumber,
  325. AppUrl: req.CodeObsPath,
  326. BootFileUrl: req.BootFileUrl,
  327. DataUrl: req.DataUrl,
  328. TrainUrl: req.TrainUrl,
  329. LogUrl: req.LogUrl,
  330. PoolID: req.PoolID,
  331. CreateVersion: true,
  332. Flavor: models.Flavor{
  333. Code: req.FlavorCode,
  334. },
  335. Parameter: req.Parameters,
  336. UserImageUrl: req.UserImageUrl,
  337. UserCommand: req.UserCommand,
  338. },
  339. })
  340. } else {
  341. jobResult, createErr = createTrainJob(models.CreateTrainJobParams{
  342. JobName: req.JobName,
  343. Description: req.Description,
  344. Config: models.Config{
  345. WorkServerNum: req.WorkServerNumber,
  346. AppUrl: req.CodeObsPath,
  347. BootFileUrl: req.BootFileUrl,
  348. DataUrl: req.DataUrl,
  349. EngineID: req.EngineID,
  350. TrainUrl: req.TrainUrl,
  351. LogUrl: req.LogUrl,
  352. PoolID: req.PoolID,
  353. CreateVersion: true,
  354. Flavor: models.Flavor{
  355. Code: req.FlavorCode,
  356. },
  357. Parameter: req.Parameters,
  358. },
  359. })
  360. }
  361. if createErr != nil {
  362. log.Error("createTrainJob failed: %v", createErr.Error())
  363. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  364. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  365. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  366. JobID: models.TempJobId,
  367. VersionID: models.TempVersionId,
  368. Status: models.TempJobStatus,
  369. Type: models.TypeCloudBrainTwo,
  370. JobName: req.JobName,
  371. JobType: string(models.JobTypeTrain),
  372. })
  373. if errTemp != nil {
  374. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  375. return errTemp
  376. }
  377. }
  378. return createErr
  379. }
  380. jobId := strconv.FormatInt(jobResult.JobID, 10)
  381. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  382. Status: TransTrainJobStatus(jobResult.Status),
  383. UserID: ctx.User.ID,
  384. RepoID: ctx.Repo.Repository.ID,
  385. JobID: jobId,
  386. JobName: req.JobName,
  387. DisplayJobName: req.DisplayJobName,
  388. JobType: string(models.JobTypeTrain),
  389. Type: models.TypeCloudBrainTwo,
  390. VersionID: jobResult.VersionID,
  391. VersionName: jobResult.VersionName,
  392. Uuid: req.Uuid,
  393. DatasetName: req.DatasetName,
  394. CommitID: req.CommitID,
  395. IsLatestVersion: req.IsLatestVersion,
  396. ComputeResource: models.NPUResource,
  397. EngineID: req.EngineID,
  398. TrainUrl: req.TrainUrl,
  399. BranchName: req.BranchName,
  400. Parameters: req.Params,
  401. BootFile: req.BootFile,
  402. DataUrl: req.DataUrl,
  403. LogUrl: req.LogUrl,
  404. FlavorCode: req.FlavorCode,
  405. Description: req.Description,
  406. WorkServerNumber: req.WorkServerNumber,
  407. FlavorName: req.FlavorName,
  408. EngineName: req.EngineName,
  409. VersionCount: req.VersionCount,
  410. TotalVersionCount: req.TotalVersionCount,
  411. CreatedUnix: createTime,
  412. UpdatedUnix: createTime,
  413. })
  414. if createErr != nil {
  415. log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, createErr.Error())
  416. return createErr
  417. }
  418. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobId, req.DisplayJobName, models.ActionCreateTrainTask)
  419. return nil
  420. }
  421. func GenerateModelConvertTrainJob(req *GenerateTrainJobReq) (*models.CreateTrainJobResult, error) {
  422. return createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  423. JobName: req.JobName,
  424. Description: req.Description,
  425. Config: models.UserImageConfig{
  426. WorkServerNum: req.WorkServerNumber,
  427. AppUrl: req.CodeObsPath,
  428. BootFileUrl: req.BootFileUrl,
  429. DataUrl: req.DataUrl,
  430. TrainUrl: req.TrainUrl,
  431. LogUrl: req.LogUrl,
  432. PoolID: req.PoolID,
  433. CreateVersion: true,
  434. Flavor: models.Flavor{
  435. Code: req.FlavorCode,
  436. },
  437. Parameter: req.Parameters,
  438. UserImageUrl: req.UserImageUrl,
  439. UserCommand: req.UserCommand,
  440. },
  441. })
  442. }
  443. func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) {
  444. createTime := timeutil.TimeStampNow()
  445. var jobResult *models.CreateTrainJobResult
  446. var createErr error
  447. if req.EngineID < 0 {
  448. jobResult, createErr = createTrainJobVersionUserImage(models.CreateTrainJobVersionUserImageParams{
  449. Description: req.Description,
  450. Config: models.TrainJobVersionUserImageConfig{
  451. WorkServerNum: req.WorkServerNumber,
  452. AppUrl: req.CodeObsPath,
  453. BootFileUrl: req.BootFileUrl,
  454. DataUrl: req.DataUrl,
  455. TrainUrl: req.TrainUrl,
  456. LogUrl: req.LogUrl,
  457. PoolID: req.PoolID,
  458. Flavor: models.Flavor{
  459. Code: req.FlavorCode,
  460. },
  461. Parameter: req.Parameters,
  462. PreVersionId: req.PreVersionId,
  463. UserImageUrl: req.UserImageUrl,
  464. UserCommand: req.UserCommand,
  465. },
  466. }, jobId)
  467. } else {
  468. jobResult, createErr = createTrainJobVersion(models.CreateTrainJobVersionParams{
  469. Description: req.Description,
  470. Config: models.TrainJobVersionConfig{
  471. WorkServerNum: req.WorkServerNumber,
  472. AppUrl: req.CodeObsPath,
  473. BootFileUrl: req.BootFileUrl,
  474. DataUrl: req.DataUrl,
  475. EngineID: req.EngineID,
  476. TrainUrl: req.TrainUrl,
  477. LogUrl: req.LogUrl,
  478. PoolID: req.PoolID,
  479. Flavor: models.Flavor{
  480. Code: req.FlavorCode,
  481. },
  482. Parameter: req.Parameters,
  483. PreVersionId: req.PreVersionId,
  484. },
  485. }, jobId)
  486. }
  487. if createErr != nil {
  488. log.Error("createTrainJobVersion failed: %v", createErr.Error())
  489. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  490. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  491. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  492. JobID: jobId,
  493. VersionID: models.TempVersionId,
  494. Status: models.TempJobStatus,
  495. Type: models.TypeCloudBrainTwo,
  496. JobName: req.JobName,
  497. JobType: string(models.JobTypeTrain),
  498. })
  499. if errTemp != nil {
  500. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  501. return errTemp
  502. }
  503. }
  504. return createErr
  505. }
  506. var jobTypes []string
  507. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  508. repo := ctx.Repo.Repository
  509. VersionTaskList, VersionListCount, createErr := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  510. RepoID: repo.ID,
  511. Type: models.TypeCloudBrainTwo,
  512. JobTypes: jobTypes,
  513. JobID: strconv.FormatInt(jobResult.JobID, 10),
  514. })
  515. if createErr != nil {
  516. ctx.ServerError("Cloudbrain", createErr)
  517. return createErr
  518. }
  519. //将当前版本的isLatestVersion设置为"1"和任务数量更新,任务数量包括当前版本数VersionCount和历史创建的总版本数TotalVersionCount
  520. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  521. Status: TransTrainJobStatus(jobResult.Status),
  522. UserID: ctx.User.ID,
  523. RepoID: ctx.Repo.Repository.ID,
  524. JobID: strconv.FormatInt(jobResult.JobID, 10),
  525. JobName: req.JobName,
  526. DisplayJobName: req.DisplayJobName,
  527. JobType: string(models.JobTypeTrain),
  528. Type: models.TypeCloudBrainTwo,
  529. VersionID: jobResult.VersionID,
  530. VersionName: jobResult.VersionName,
  531. Uuid: req.Uuid,
  532. DatasetName: req.DatasetName,
  533. CommitID: req.CommitID,
  534. IsLatestVersion: req.IsLatestVersion,
  535. PreVersionName: req.PreVersionName,
  536. ComputeResource: models.NPUResource,
  537. EngineID: req.EngineID,
  538. TrainUrl: req.TrainUrl,
  539. BranchName: req.BranchName,
  540. Parameters: req.Params,
  541. BootFile: req.BootFile,
  542. DataUrl: req.DataUrl,
  543. LogUrl: req.LogUrl,
  544. PreVersionId: req.PreVersionId,
  545. FlavorCode: req.FlavorCode,
  546. Description: req.Description,
  547. WorkServerNumber: req.WorkServerNumber,
  548. FlavorName: req.FlavorName,
  549. EngineName: req.EngineName,
  550. TotalVersionCount: VersionTaskList[0].TotalVersionCount + 1,
  551. VersionCount: VersionListCount + 1,
  552. CreatedUnix: createTime,
  553. UpdatedUnix: createTime,
  554. })
  555. if createErr != nil {
  556. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, createErr.Error())
  557. return createErr
  558. }
  559. //将训练任务的上一版本的isLatestVersion设置为"0"
  560. createErr = models.SetVersionCountAndLatestVersion(strconv.FormatInt(jobResult.JobID, 10), VersionTaskList[0].VersionName, VersionCountOne, NotLatestVersion, TotalVersionCount)
  561. if createErr != nil {
  562. ctx.ServerError("Update IsLatestVersion failed", createErr)
  563. return createErr
  564. }
  565. return createErr
  566. }
  567. func TransTrainJobStatus(status int) string {
  568. switch status {
  569. case 0:
  570. return "UNKNOWN"
  571. case 1:
  572. return "INIT"
  573. case 2:
  574. return "IMAGE_CREATING"
  575. case 3:
  576. return "IMAGE_FAILED"
  577. case 4:
  578. return "SUBMIT_TRYING"
  579. case 5:
  580. return "SUBMIT_FAILED"
  581. case 6:
  582. return "DELETE_FAILED"
  583. case 7:
  584. return "WAITING"
  585. case 8:
  586. return "RUNNING"
  587. case 9:
  588. return "KILLING"
  589. case 10:
  590. return "COMPLETED"
  591. case 11:
  592. return "FAILED"
  593. case 12:
  594. return "KILLED"
  595. case 13:
  596. return "CANCELED"
  597. case 14:
  598. return "LOST"
  599. case 15:
  600. return "SCALING"
  601. case 16:
  602. return "SUBMIT_MODEL_FAILED"
  603. case 17:
  604. return "DEPLOY_SERVICE_FAILED"
  605. case 18:
  606. return "CHECK_INIT"
  607. case 19:
  608. return "CHECK_RUNNING"
  609. case 20:
  610. return "CHECK_RUNNING_COMPLETED"
  611. case 21:
  612. return "CHECK_FAILED"
  613. default:
  614. return strconv.Itoa(status)
  615. }
  616. }
  617. func GetOutputPathByCount(TotalVersionCount int) (VersionOutputPath string) {
  618. talVersionCountToString := fmt.Sprintf("%04d", TotalVersionCount)
  619. VersionOutputPath = "V" + talVersionCountToString
  620. return VersionOutputPath
  621. }
  622. func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (err error) {
  623. createTime := timeutil.TimeStampNow()
  624. jobResult, err := createInferenceJob(models.CreateInferenceJobParams{
  625. JobName: req.JobName,
  626. Description: req.Description,
  627. InfConfig: models.InfConfig{
  628. WorkServerNum: req.WorkServerNumber,
  629. AppUrl: req.CodeObsPath,
  630. BootFileUrl: req.BootFileUrl,
  631. DataUrl: req.DataUrl,
  632. EngineID: req.EngineID,
  633. // TrainUrl: req.TrainUrl,
  634. LogUrl: req.LogUrl,
  635. PoolID: req.PoolID,
  636. CreateVersion: true,
  637. Flavor: models.Flavor{
  638. Code: req.FlavorCode,
  639. },
  640. Parameter: req.Parameters,
  641. },
  642. })
  643. if err != nil {
  644. log.Error("createInferenceJob failed: %v", err.Error())
  645. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  646. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  647. err = models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  648. JobID: models.TempJobId,
  649. VersionID: models.TempVersionId,
  650. Status: models.TempJobStatus,
  651. Type: models.TypeCloudBrainTwo,
  652. JobName: req.JobName,
  653. JobType: string(models.JobTypeInference),
  654. })
  655. if err != nil {
  656. log.Error("InsertCloudbrainTemp failed: %v", err.Error())
  657. return err
  658. }
  659. }
  660. return err
  661. }
  662. // attach, err := models.GetAttachmentByUUID(req.Uuid)
  663. // if err != nil {
  664. // log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error())
  665. // return err
  666. // }
  667. jobID := strconv.FormatInt(jobResult.JobID, 10)
  668. err = models.CreateCloudbrain(&models.Cloudbrain{
  669. Status: TransTrainJobStatus(jobResult.Status),
  670. UserID: ctx.User.ID,
  671. RepoID: ctx.Repo.Repository.ID,
  672. JobID: jobID,
  673. JobName: req.JobName,
  674. DisplayJobName: req.DisplayJobName,
  675. JobType: string(models.JobTypeInference),
  676. Type: models.TypeCloudBrainTwo,
  677. VersionID: jobResult.VersionID,
  678. VersionName: jobResult.VersionName,
  679. Uuid: req.Uuid,
  680. DatasetName: req.DatasetName,
  681. CommitID: req.CommitID,
  682. EngineID: req.EngineID,
  683. TrainUrl: req.TrainUrl,
  684. BranchName: req.BranchName,
  685. Parameters: req.Params,
  686. BootFile: req.BootFile,
  687. DataUrl: req.DataUrl,
  688. LogUrl: req.LogUrl,
  689. FlavorCode: req.FlavorCode,
  690. Description: req.Description,
  691. WorkServerNumber: req.WorkServerNumber,
  692. FlavorName: req.FlavorName,
  693. EngineName: req.EngineName,
  694. LabelName: req.LabelName,
  695. IsLatestVersion: req.IsLatestVersion,
  696. ComputeResource: models.NPUResource,
  697. VersionCount: req.VersionCount,
  698. TotalVersionCount: req.TotalVersionCount,
  699. ModelName: req.ModelName,
  700. ModelVersion: req.ModelVersion,
  701. CkptName: req.CkptName,
  702. ResultUrl: req.ResultUrl,
  703. CreatedUnix: createTime,
  704. UpdatedUnix: createTime,
  705. })
  706. if err != nil {
  707. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
  708. return err
  709. }
  710. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateInferenceTask)
  711. return nil
  712. }
  713. func GetNotebookImageName(imageId string) (string, error) {
  714. var validImage = false
  715. var imageName = ""
  716. if ImageInfos == nil {
  717. json.Unmarshal([]byte(setting.ImageInfos), &ImageInfos)
  718. }
  719. for _, imageInfo := range ImageInfos.ImageInfo {
  720. if imageInfo.Id == imageId {
  721. validImage = true
  722. imageName = imageInfo.Value
  723. }
  724. }
  725. if !validImage {
  726. log.Error("the image id(%s) is invalid", imageId)
  727. return imageName, errors.New("the image id is invalid")
  728. }
  729. return imageName, nil
  730. }
  731. func InitSpecialPool() {
  732. if SpecialPools == nil && setting.ModelArtsSpecialPools != "" {
  733. json.Unmarshal([]byte(setting.ModelArtsSpecialPools), &SpecialPools)
  734. }
  735. }
  736. func InitMultiNode(){
  737. if MultiNodeConfig ==nil && setting.ModelArtsMultiNode!=""{
  738. json.Unmarshal([]byte(setting.ModelArtsMultiNode), &MultiNodeConfig)
  739. }
  740. }
  741. func HandleTrainJobInfo(task *models.Cloudbrain) error {
  742. result, err := GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
  743. if err != nil {
  744. log.Error("GetTrainJob(%s) failed:%v", task.DisplayJobName, err)
  745. return err
  746. }
  747. if result != nil {
  748. oldStatus := task.Status
  749. task.Status = TransTrainJobStatus(result.IntStatus)
  750. task.Duration = result.Duration / 1000
  751. task.TrainJobDuration = result.TrainJobDuration
  752. if task.StartTime == 0 && result.StartTime > 0 {
  753. task.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
  754. }
  755. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  756. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  757. task.EndTime = task.StartTime.Add(task.Duration)
  758. }
  759. task.CorrectCreateUnix()
  760. if oldStatus != task.Status {
  761. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  762. }
  763. err = models.UpdateJob(task)
  764. if err != nil {
  765. log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
  766. return err
  767. }
  768. }
  769. return nil
  770. }
  771. func HandleNotebookInfo(task *models.Cloudbrain) error {
  772. result, err := GetNotebook2(task.JobID)
  773. if err != nil {
  774. log.Error("GetNotebook2(%s) failed:%v", task.DisplayJobName, err)
  775. return err
  776. }
  777. if result != nil {
  778. oldStatus := task.Status
  779. task.Status = result.Status
  780. if task.StartTime == 0 && result.Lease.UpdateTime > 0 {
  781. task.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000)
  782. }
  783. if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) {
  784. task.EndTime = timeutil.TimeStampNow()
  785. }
  786. task.CorrectCreateUnix()
  787. task.ComputeAndSetDuration()
  788. if oldStatus != task.Status {
  789. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  790. }
  791. if task.FlavorCode == "" {
  792. task.FlavorCode = result.Flavor
  793. }
  794. err = models.UpdateJob(task)
  795. if err != nil {
  796. log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err)
  797. return err
  798. }
  799. }
  800. return nil
  801. }
  802. func SyncTempStatusJob() {
  803. jobs, err := models.GetCloudBrainTempJobs()
  804. if err != nil {
  805. log.Error("GetCloudBrainTempJobs failed:%v", err.Error())
  806. return
  807. }
  808. for _, temp := range jobs {
  809. log.Info("start to handle record: %s", temp.JobName)
  810. if temp.Type == models.TypeCloudBrainTwo {
  811. if temp.JobType == string(models.JobTypeDebug) {
  812. err = handleNotebook(temp)
  813. if err != nil {
  814. log.Error("handleNotebook falied:%v", err)
  815. break
  816. }
  817. } else if temp.JobType == string(models.JobTypeTrain) || temp.JobType == string(models.JobTypeInference) {
  818. _, err = models.GetCloudbrainByJobID(temp.JobID)
  819. if err != nil {
  820. //one version
  821. err = handleTrainJob(temp)
  822. if err != nil {
  823. log.Error("handleTrainJob falied:%v", err)
  824. break
  825. }
  826. } else {
  827. //multi version
  828. err = handleTrainJobMultiVersion(temp)
  829. if err != nil {
  830. log.Error("handleTrainJobMultiVersion falied:%v", err)
  831. break
  832. }
  833. }
  834. }
  835. }
  836. }
  837. return
  838. }
  839. func handleNotebook(temp *models.CloudbrainTemp) error {
  840. if temp.Status == models.TempJobStatus {
  841. err := handleTempNotebook(temp)
  842. if err != nil {
  843. log.Error("handleTempNotebook failed:%v", err)
  844. return err
  845. }
  846. } else if temp.Status == string(models.ModelArtsStopping) {
  847. res, err := GetNotebook2(temp.JobID)
  848. if err != nil {
  849. log.Error("GetNotebook2 failed:%v", err)
  850. return err
  851. }
  852. temp.Status = res.Status
  853. if temp.Status == string(models.ModelArtsStopped) {
  854. err = models.UpdateCloudbrainTemp(temp)
  855. if err != nil {
  856. log.Error("UpdateCloudbrainTemp failed:%v", err)
  857. return err
  858. }
  859. _, err := DelNotebook2(temp.JobID)
  860. if err != nil {
  861. log.Error("DelNotebook2 failed:%v", err)
  862. return err
  863. }
  864. temp.Status = string(models.ModelArtsDeleted)
  865. err = models.UpdateCloudbrainTemp(temp)
  866. if err != nil {
  867. log.Error("UpdateCloudbrainTemp failed:%v", err)
  868. return err
  869. }
  870. }
  871. }
  872. return nil
  873. }
  874. func handleTempNotebook(temp *models.CloudbrainTemp) error {
  875. var err error
  876. var isExist bool
  877. for {
  878. result, err := GetNotebookList(1000, 0, "createTime", "DESC", temp.JobName)
  879. if err != nil {
  880. log.Error("GetNotebookList failed:%v", err)
  881. break
  882. }
  883. temp.QueryTimes++
  884. err = models.UpdateCloudbrainTemp(temp)
  885. if err != nil {
  886. log.Error("UpdateCloudbrainTemp failed:%v", err)
  887. }
  888. if result != nil {
  889. for _, notebook := range result.NotebookList {
  890. if temp.JobID == models.TempJobId {
  891. //new notebook
  892. if notebook.JobName == temp.JobName {
  893. isExist = true
  894. temp.Status = notebook.Status
  895. temp.JobID = notebook.JobID
  896. break
  897. }
  898. } else {
  899. //restart: always can find one record
  900. if notebook.JobName == temp.JobName {
  901. if notebook.Status != string(models.ModelArtsStopped) {
  902. isExist = true
  903. temp.Status = notebook.Status
  904. temp.JobID = notebook.JobID
  905. break
  906. }
  907. }
  908. }
  909. }
  910. if isExist {
  911. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  912. if temp.Status == string(models.ModelArtsCreateFailed) {
  913. err = models.UpdateCloudbrainTemp(temp)
  914. if err != nil {
  915. log.Error("UpdateCloudbrainTemp failed:%v", err)
  916. break
  917. }
  918. _, err := DelNotebook2(temp.JobID)
  919. if err != nil {
  920. log.Error("DelNotebook2(%s) failed:%v", temp.JobName, err)
  921. break
  922. }
  923. temp.Status = string(models.ModelArtsDeleted)
  924. } else {
  925. _, err := ManageNotebook2(temp.JobID, models.NotebookAction{Action: models.ActionStop})
  926. if err != nil {
  927. log.Error("ManageNotebook2(%s) failed:%v", temp.JobName, err)
  928. break
  929. }
  930. temp.Status = string(models.ModelArtsStopping)
  931. }
  932. models.UpdateCloudbrainTemp(temp)
  933. } else {
  934. log.Error("can not find the record(%s) till now", temp.JobName)
  935. err = errors.New("not found")
  936. break
  937. }
  938. } else {
  939. log.Error("can not find the record(%s) till now", temp.JobName)
  940. err = errors.New("not found")
  941. break
  942. }
  943. break
  944. }
  945. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  946. log.Info("reach MaxTempQueryTimes, set the job failed")
  947. temp.Status = string(models.ModelArtsTrainJobFailed)
  948. err = models.UpdateCloudbrainTemp(temp)
  949. if err != nil {
  950. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  951. return err
  952. }
  953. }
  954. return err
  955. }
  956. func handleTrainJob(temp *models.CloudbrainTemp) error {
  957. if temp.Status == models.TempJobStatus {
  958. err := handleTempTrainJob(temp)
  959. if err != nil {
  960. log.Error("handleTempTrainJob failed:%v", err)
  961. return err
  962. }
  963. } else if temp.Status == string(models.ModelArtsTrainJobKilling) {
  964. res, err := GetTrainJob(temp.JobID, temp.VersionID)
  965. if err != nil {
  966. log.Error("GetTrainJob failed:%v", err)
  967. return err
  968. }
  969. temp.Status = TransTrainJobStatus(res.IntStatus)
  970. if temp.Status == string(models.ModelArtsTrainJobKilled) {
  971. err = models.UpdateCloudbrainTemp(temp)
  972. if err != nil {
  973. log.Error("UpdateCloudbrainTemp failed:%v", err)
  974. return err
  975. }
  976. _, err := DelTrainJob(temp.JobID)
  977. if err != nil {
  978. log.Error("DelTrainJob failed:%v", err)
  979. return err
  980. }
  981. temp.Status = string(models.ModelArtsDeleted)
  982. err = models.UpdateCloudbrainTemp(temp)
  983. if err != nil {
  984. log.Error("UpdateCloudbrainTemp failed:%v", err)
  985. return err
  986. }
  987. }
  988. }
  989. return nil
  990. }
  991. func handleTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
  992. if temp.Status == models.TempJobStatus {
  993. err := handleTempTrainJobMultiVersion(temp)
  994. if err != nil {
  995. log.Error("handleTempTrainJobMultiVersion failed:%v", err)
  996. return err
  997. }
  998. } else if temp.Status == string(models.ModelArtsTrainJobKilling) {
  999. res, err := GetTrainJob(temp.JobID, temp.VersionID)
  1000. if err != nil {
  1001. log.Error("GetTrainJob failed:%v", err)
  1002. return err
  1003. }
  1004. temp.Status = TransTrainJobStatus(res.IntStatus)
  1005. if temp.Status == string(models.ModelArtsTrainJobKilled) {
  1006. err = models.UpdateCloudbrainTemp(temp)
  1007. if err != nil {
  1008. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1009. return err
  1010. }
  1011. _, err := DelTrainJobVersion(temp.JobID, temp.VersionID)
  1012. if err != nil {
  1013. log.Error("DelTrainJob failed:%v", err)
  1014. return err
  1015. }
  1016. temp.Status = string(models.ModelArtsDeleted)
  1017. err = models.UpdateCloudbrainTemp(temp)
  1018. if err != nil {
  1019. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1020. return err
  1021. }
  1022. }
  1023. }
  1024. return nil
  1025. }
  1026. func handleTempTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
  1027. var err error
  1028. var isExist bool
  1029. for {
  1030. result, err := GetTrainJobVersionList(1000, 1, temp.JobID)
  1031. if err != nil {
  1032. log.Error("GetTrainJobVersionList failed:%v", err)
  1033. break
  1034. }
  1035. temp.QueryTimes++
  1036. err = models.UpdateCloudbrainTemp(temp)
  1037. if err != nil {
  1038. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1039. }
  1040. if result != nil {
  1041. count, _ := models.GetCloudbrainCountByJobName(temp.JobName, temp.JobType, temp.Type)
  1042. if result.VersionCount == int64(count+1) {
  1043. isExist = true
  1044. temp.Status = TransTrainJobStatus(result.JobVersionList[0].IntStatus)
  1045. temp.VersionID = strconv.FormatInt(result.JobVersionList[0].VersionID, 10)
  1046. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  1047. _, err := StopTrainJob(temp.JobID, temp.VersionID)
  1048. if err != nil {
  1049. log.Error("StopTrainJob failed:%v", err)
  1050. break
  1051. }
  1052. temp.Status = string(models.ModelArtsTrainJobKilling)
  1053. err = models.UpdateCloudbrainTemp(temp)
  1054. if err != nil {
  1055. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1056. break
  1057. }
  1058. } else {
  1059. log.Error("can not find the record(%s) till now", temp.JobName)
  1060. err = errors.New("not found")
  1061. break
  1062. }
  1063. }
  1064. break
  1065. }
  1066. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1067. log.Info("reach MaxTempQueryTimes, set the job failed")
  1068. temp.Status = string(models.ModelArtsTrainJobFailed)
  1069. err = models.UpdateCloudbrainTemp(temp)
  1070. if err != nil {
  1071. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1072. return err
  1073. }
  1074. }
  1075. return err
  1076. }
  1077. func handleTempTrainJob(temp *models.CloudbrainTemp) error {
  1078. var err error
  1079. var isExist bool
  1080. for {
  1081. result, err := GetTrainJobList(1000, 1, "create_time", "desc", temp.JobName)
  1082. if err != nil {
  1083. log.Error("GetTrainJobList failed:%v", err)
  1084. break
  1085. }
  1086. temp.QueryTimes++
  1087. err = models.UpdateCloudbrainTemp(temp)
  1088. if err != nil {
  1089. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1090. }
  1091. if result != nil {
  1092. for _, job := range result.JobList {
  1093. if temp.JobName == job.JobName && TransTrainJobStatus(job.IntStatus) != string(models.ModelArtsTrainJobFailed) {
  1094. isExist = true
  1095. temp.Status = TransTrainJobStatus(job.IntStatus)
  1096. temp.JobID = strconv.FormatInt(job.JobID, 10)
  1097. temp.VersionID = strconv.FormatInt(job.VersionID, 10)
  1098. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  1099. _, err = StopTrainJob(temp.JobID, temp.VersionID)
  1100. if err != nil {
  1101. log.Error("StopTrainJob(%s) failed:%v", temp.JobName, err)
  1102. break
  1103. }
  1104. temp.Status = string(models.ModelArtsTrainJobKilling)
  1105. err = models.UpdateCloudbrainTemp(temp)
  1106. if err != nil {
  1107. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1108. break
  1109. }
  1110. }
  1111. }
  1112. if !isExist {
  1113. log.Error("can not find the record(%s) till now", temp.JobName)
  1114. err = errors.New("not found")
  1115. break
  1116. }
  1117. }
  1118. break
  1119. }
  1120. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1121. log.Info("reach MaxTempQueryTimes, set the job failed")
  1122. temp.Status = string(models.ModelArtsTrainJobFailed)
  1123. err = models.UpdateCloudbrainTemp(temp)
  1124. if err != nil {
  1125. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1126. return err
  1127. }
  1128. }
  1129. return err
  1130. }