You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

modelarts.go 37 kB

4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago

  1. package modelarts
  2. import (
  3. "encoding/base64"
  4. "encoding/json"
  5. "errors"
  6. "fmt"
  7. "io/ioutil"
  8. "net/http"
  9. "path"
  10. "strconv"
  11. "strings"
  12. "code.gitea.io/gitea/modules/cloudbrain"
  13. "code.gitea.io/gitea/modules/modelarts_cd"
  14. "code.gitea.io/gitea/models"
  15. "code.gitea.io/gitea/modules/context"
  16. "code.gitea.io/gitea/modules/log"
  17. "code.gitea.io/gitea/modules/notification"
  18. "code.gitea.io/gitea/modules/setting"
  19. "code.gitea.io/gitea/modules/timeutil"
  20. )
  21. const (
  22. //notebook
  23. storageTypeOBS = "obs"
  24. autoStopDuration = 4 * 60 * 60
  25. AutoStopDurationMs = 4 * 60 * 60 * 1000
  26. CodePath = "/code/"
  27. OutputPath = "/output/"
  28. ResultPath = "/result/"
  29. LogPath = "/log/"
  30. JobPath = "/job/"
  31. OrderDesc = "desc" //向下查询
  32. OrderAsc = "asc" //向上查询
  33. Lines = 500
  34. TrainUrl = "train_url"
  35. DataUrl = "data_url"
  36. MultiDataUrl = "multi_data_url"
  37. ResultUrl = "result_url"
  38. CkptUrl = "ckpt_url"
  39. DeviceTarget = "device_target"
  40. Ascend = "Ascend"
  41. PerPage = 10
  42. IsLatestVersion = "1"
  43. NotLatestVersion = "0"
  44. VersionCountOne = 1
  45. SortByCreateTime = "create_time"
  46. ConfigTypeCustom = "custom"
  47. TotalVersionCount = 1
  48. )
  49. var (
  50. poolInfos *models.PoolInfos
  51. TrainFlavorInfos *Flavor
  52. SpecialPools *models.SpecialPools
  53. MultiNodeConfig *MultiNodes
  54. )
  55. type GenerateTrainJobReq struct {
  56. JobName string
  57. DisplayJobName string
  58. Uuid string
  59. Description string
  60. CodeObsPath string
  61. BootFile string
  62. BootFileUrl string
  63. DataUrl string
  64. TrainUrl string
  65. LogUrl string
  66. PoolID string
  67. WorkServerNumber int
  68. EngineID int64
  69. Parameters []models.Parameter
  70. CommitID string
  71. IsLatestVersion string
  72. Params string
  73. BranchName string
  74. PreVersionId int64
  75. PreVersionName string
  76. FlavorCode string
  77. FlavorName string
  78. VersionCount int
  79. EngineName string
  80. TotalVersionCount int
  81. UserImageUrl string
  82. UserCommand string
  83. DatasetName string
  84. Spec *models.Specification
  85. ModelName string
  86. LabelName string
  87. CkptName string
  88. ModelVersion string
  89. PreTrainModelUrl string
  90. }
  91. type GenerateInferenceJobReq struct {
  92. JobName string
  93. DisplayJobName string
  94. Uuid string
  95. Description string
  96. CodeObsPath string
  97. BootFile string
  98. BootFileUrl string
  99. DataUrl string
  100. TrainUrl string
  101. LogUrl string
  102. PoolID string
  103. WorkServerNumber int
  104. EngineID int64
  105. Parameters []models.Parameter
  106. CommitID string
  107. Params string
  108. BranchName string
  109. FlavorName string
  110. EngineName string
  111. LabelName string
  112. IsLatestVersion string
  113. VersionCount int
  114. TotalVersionCount int
  115. ModelName string
  116. ModelVersion string
  117. CkptName string
  118. ResultUrl string
  119. Spec *models.Specification
  120. DatasetName string
  121. JobType string
  122. UserImageUrl string
  123. UserCommand string
  124. }
  125. type VersionInfo struct {
  126. Version []struct {
  127. ID int `json:"id"`
  128. Value string `json:"value"`
  129. Url string `json:"url"`
  130. } `json:"version"`
  131. }
  132. type Flavor struct {
  133. Info []struct {
  134. Code string `json:"code"`
  135. Value string `json:"value"`
  136. UnitPrice int64 `json:"unitPrice"`
  137. } `json:"flavor"`
  138. }
  139. type Engine struct {
  140. Info []struct {
  141. ID int `json:"id"`
  142. Value string `json:"value"`
  143. } `json:"engine"`
  144. }
  145. type ResourcePool struct {
  146. Info []struct {
  147. ID string `json:"id"`
  148. Value string `json:"value"`
  149. } `json:"resource_pool"`
  150. }
  151. type MultiNodes struct {
  152. Info []OrgMultiNode `json:"multinode"`
  153. }
  154. type OrgMultiNode struct {
  155. Org string `json:"org"`
  156. Node []int `json:"node"`
  157. }
  158. type Parameters struct {
  159. Parameter []struct {
  160. Label string `json:"label"`
  161. Value string `json:"value"`
  162. } `json:"parameter"`
  163. }
  164. func GenerateNotebook2(ctx *context.Context, req cloudbrain.GenerateModelArtsNotebookReq) (string, error) {
  165. if poolInfos == nil {
  166. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  167. }
  168. imageName, err := GetNotebookImageName(req.ImageId)
  169. if err != nil {
  170. log.Error("GetNotebookImageName failed: %v", err.Error())
  171. return "", err
  172. }
  173. createTime := timeutil.TimeStampNow()
  174. jobResult, err := createNotebook2(models.CreateNotebook2Params{
  175. JobName: req.JobName,
  176. Description: req.Description,
  177. Flavor: req.Spec.SourceSpecId,
  178. Duration: req.AutoStopDurationMs,
  179. ImageID: req.ImageId,
  180. PoolID: poolInfos.PoolInfo[0].PoolId,
  181. Feature: models.NotebookFeature,
  182. Volume: models.VolumeReq{
  183. Capacity: setting.Capacity,
  184. Category: models.EVSCategory,
  185. Ownership: models.ManagedOwnership,
  186. },
  187. WorkspaceID: "0",
  188. })
  189. if err != nil {
  190. log.Error("createNotebook2 failed: %v", err.Error())
  191. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  192. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  193. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  194. JobID: models.TempJobId,
  195. VersionID: models.TempVersionId,
  196. Status: models.TempJobStatus,
  197. Type: models.TypeCloudBrainTwo,
  198. JobName: req.JobName,
  199. JobType: string(models.JobTypeDebug),
  200. })
  201. if errTemp != nil {
  202. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  203. return "", errTemp
  204. }
  205. }
  206. return "", err
  207. }
  208. task := &models.Cloudbrain{
  209. Status: jobResult.Status,
  210. UserID: ctx.User.ID,
  211. RepoID: ctx.Repo.Repository.ID,
  212. JobID: jobResult.ID,
  213. JobName: req.JobName,
  214. FlavorCode: req.Spec.SourceSpecId,
  215. DisplayJobName: req.DisplayJobName,
  216. JobType: string(models.JobTypeDebug),
  217. Type: models.TypeCloudBrainTwo,
  218. Uuid: req.Uuid,
  219. ComputeResource: models.NPUResource,
  220. Image: imageName,
  221. BootFile: req.BootFile,
  222. Description: req.Description,
  223. CreatedUnix: createTime,
  224. UpdatedUnix: createTime,
  225. Spec: req.Spec,
  226. ModelName: req.ModelName,
  227. ModelVersion: req.ModelVersion,
  228. LabelName: req.LabelName,
  229. PreTrainModelUrl: req.PreTrainModelUrl,
  230. CkptName: req.CkptName,
  231. }
  232. err = models.CreateCloudbrain(task)
  233. if err != nil {
  234. return "", err
  235. }
  236. stringId := strconv.FormatInt(task.ID, 10)
  237. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, req.DisplayJobName, models.ActionCreateDebugNPUTask)
  238. return jobResult.ID, nil
  239. }
  240. func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (jobId string, err error) {
  241. createTime := timeutil.TimeStampNow()
  242. var jobResult *models.CreateTrainJobResult
  243. var createErr error
  244. if req.EngineID < 0 {
  245. jobResult, createErr = createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  246. JobName: req.JobName,
  247. Description: req.Description,
  248. Config: models.UserImageConfig{
  249. WorkServerNum: req.WorkServerNumber,
  250. AppUrl: req.CodeObsPath,
  251. BootFileUrl: req.BootFileUrl,
  252. DataUrl: req.DataUrl,
  253. TrainUrl: req.TrainUrl,
  254. LogUrl: req.LogUrl,
  255. PoolID: req.PoolID,
  256. CreateVersion: true,
  257. Flavor: models.Flavor{
  258. Code: req.Spec.SourceSpecId,
  259. },
  260. Parameter: req.Parameters,
  261. UserImageUrl: req.UserImageUrl,
  262. UserCommand: req.UserCommand,
  263. ShareAddr: setting.ModelArtsShareAddr,
  264. MountPath: setting.ModelArtsMountPath,
  265. NasType: setting.ModelArtsNasType,
  266. },
  267. })
  268. } else {
  269. jobResult, createErr = createTrainJob(models.CreateTrainJobParams{
  270. JobName: req.JobName,
  271. Description: req.Description,
  272. Config: models.Config{
  273. WorkServerNum: req.WorkServerNumber,
  274. AppUrl: req.CodeObsPath,
  275. BootFileUrl: req.BootFileUrl,
  276. DataUrl: req.DataUrl,
  277. EngineID: req.EngineID,
  278. TrainUrl: req.TrainUrl,
  279. LogUrl: req.LogUrl,
  280. PoolID: req.PoolID,
  281. CreateVersion: true,
  282. Flavor: models.Flavor{
  283. Code: req.Spec.SourceSpecId,
  284. },
  285. Parameter: req.Parameters,
  286. ShareAddr: setting.ModelArtsShareAddr,
  287. MountPath: setting.ModelArtsMountPath,
  288. NasType: setting.ModelArtsNasType,
  289. },
  290. })
  291. }
  292. if createErr != nil {
  293. log.Error("createTrainJob failed: %v", createErr.Error())
  294. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  295. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  296. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  297. JobID: models.TempJobId,
  298. VersionID: models.TempVersionId,
  299. Status: models.TempJobStatus,
  300. Type: models.TypeCloudBrainTwo,
  301. JobName: req.JobName,
  302. JobType: string(models.JobTypeTrain),
  303. })
  304. if errTemp != nil {
  305. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  306. return "", errTemp
  307. }
  308. }
  309. return "", createErr
  310. }
  311. jobID := strconv.FormatInt(jobResult.JobID, 10)
  312. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  313. Status: TransTrainJobStatus(jobResult.Status),
  314. UserID: ctx.User.ID,
  315. RepoID: ctx.Repo.Repository.ID,
  316. JobID: jobID,
  317. JobName: req.JobName,
  318. DisplayJobName: req.DisplayJobName,
  319. JobType: string(models.JobTypeTrain),
  320. Type: models.TypeCloudBrainTwo,
  321. VersionID: jobResult.VersionID,
  322. VersionName: jobResult.VersionName,
  323. Uuid: req.Uuid,
  324. DatasetName: req.DatasetName,
  325. CommitID: req.CommitID,
  326. IsLatestVersion: req.IsLatestVersion,
  327. ComputeResource: models.NPUResource,
  328. EngineID: req.EngineID,
  329. TrainUrl: req.TrainUrl,
  330. BranchName: req.BranchName,
  331. Parameters: req.Params,
  332. BootFile: req.BootFile,
  333. DataUrl: req.DataUrl,
  334. LogUrl: req.LogUrl,
  335. FlavorCode: req.Spec.SourceSpecId,
  336. Description: req.Description,
  337. WorkServerNumber: req.WorkServerNumber,
  338. FlavorName: req.FlavorName,
  339. EngineName: req.EngineName,
  340. VersionCount: req.VersionCount,
  341. TotalVersionCount: req.TotalVersionCount,
  342. CreatedUnix: createTime,
  343. UpdatedUnix: createTime,
  344. Spec: req.Spec,
  345. ModelName: req.ModelName,
  346. ModelVersion: req.ModelVersion,
  347. LabelName: req.LabelName,
  348. PreTrainModelUrl: req.PreTrainModelUrl,
  349. CkptName: req.CkptName,
  350. })
  351. if createErr != nil {
  352. log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, createErr.Error())
  353. return "", createErr
  354. }
  355. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateTrainTask)
  356. return jobID, nil
  357. }
  358. func GenerateModelConvertTrainJob(req *GenerateTrainJobReq) (*models.CreateTrainJobResult, error) {
  359. return createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  360. JobName: req.JobName,
  361. Description: req.Description,
  362. Config: models.UserImageConfig{
  363. WorkServerNum: req.WorkServerNumber,
  364. AppUrl: req.CodeObsPath,
  365. BootFileUrl: req.BootFileUrl,
  366. DataUrl: req.DataUrl,
  367. TrainUrl: req.TrainUrl,
  368. LogUrl: req.LogUrl,
  369. PoolID: req.PoolID,
  370. CreateVersion: true,
  371. Flavor: models.Flavor{
  372. Code: req.FlavorCode,
  373. },
  374. Parameter: req.Parameters,
  375. UserImageUrl: req.UserImageUrl,
  376. UserCommand: req.UserCommand,
  377. },
  378. })
  379. }
  380. func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) {
  381. createTime := timeutil.TimeStampNow()
  382. var jobResult *models.CreateTrainJobResult
  383. var createErr error
  384. if req.EngineID < 0 {
  385. jobResult, createErr = createTrainJobVersionUserImage(models.CreateTrainJobVersionUserImageParams{
  386. Description: req.Description,
  387. Config: models.TrainJobVersionUserImageConfig{
  388. WorkServerNum: req.WorkServerNumber,
  389. AppUrl: req.CodeObsPath,
  390. BootFileUrl: req.BootFileUrl,
  391. DataUrl: req.DataUrl,
  392. TrainUrl: req.TrainUrl,
  393. LogUrl: req.LogUrl,
  394. PoolID: req.PoolID,
  395. Flavor: models.Flavor{
  396. Code: req.Spec.SourceSpecId,
  397. },
  398. Parameter: req.Parameters,
  399. PreVersionId: req.PreVersionId,
  400. UserImageUrl: req.UserImageUrl,
  401. UserCommand: req.UserCommand,
  402. ShareAddr: setting.ModelArtsShareAddr,
  403. MountPath: setting.ModelArtsMountPath,
  404. NasType: setting.ModelArtsNasType,
  405. },
  406. }, jobId)
  407. } else {
  408. jobResult, createErr = createTrainJobVersion(models.CreateTrainJobVersionParams{
  409. Description: req.Description,
  410. Config: models.TrainJobVersionConfig{
  411. WorkServerNum: req.WorkServerNumber,
  412. AppUrl: req.CodeObsPath,
  413. BootFileUrl: req.BootFileUrl,
  414. DataUrl: req.DataUrl,
  415. EngineID: req.EngineID,
  416. TrainUrl: req.TrainUrl,
  417. LogUrl: req.LogUrl,
  418. PoolID: req.PoolID,
  419. Flavor: models.Flavor{
  420. Code: req.Spec.SourceSpecId,
  421. },
  422. Parameter: req.Parameters,
  423. PreVersionId: req.PreVersionId,
  424. ShareAddr: setting.ModelArtsShareAddr,
  425. MountPath: setting.ModelArtsMountPath,
  426. NasType: setting.ModelArtsNasType,
  427. },
  428. }, jobId)
  429. }
  430. if createErr != nil {
  431. log.Error("createTrainJobVersion failed: %v", createErr.Error())
  432. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  433. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  434. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  435. JobID: jobId,
  436. VersionID: models.TempVersionId,
  437. Status: models.TempJobStatus,
  438. Type: models.TypeCloudBrainTwo,
  439. JobName: req.JobName,
  440. JobType: string(models.JobTypeTrain),
  441. })
  442. if errTemp != nil {
  443. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  444. return errTemp
  445. }
  446. }
  447. return createErr
  448. }
  449. var jobTypes []string
  450. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  451. repo := ctx.Repo.Repository
  452. VersionTaskList, VersionListCount, createErr := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  453. RepoID: repo.ID,
  454. Type: models.TypeCloudBrainTwo,
  455. JobTypes: jobTypes,
  456. JobID: strconv.FormatInt(jobResult.JobID, 10),
  457. })
  458. if createErr != nil {
  459. ctx.ServerError("Cloudbrain", createErr)
  460. return createErr
  461. }
  462. //将当前版本的isLatestVersion设置为"1"和任务数量更新,任务数量包括当前版本数VersionCount和历史创建的总版本数TotalVersionCount
  463. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  464. Status: TransTrainJobStatus(jobResult.Status),
  465. UserID: ctx.User.ID,
  466. RepoID: ctx.Repo.Repository.ID,
  467. JobID: strconv.FormatInt(jobResult.JobID, 10),
  468. JobName: req.JobName,
  469. DisplayJobName: req.DisplayJobName,
  470. JobType: string(models.JobTypeTrain),
  471. Type: models.TypeCloudBrainTwo,
  472. VersionID: jobResult.VersionID,
  473. VersionName: jobResult.VersionName,
  474. Uuid: req.Uuid,
  475. DatasetName: req.DatasetName,
  476. CommitID: req.CommitID,
  477. IsLatestVersion: req.IsLatestVersion,
  478. PreVersionName: req.PreVersionName,
  479. ComputeResource: models.NPUResource,
  480. EngineID: req.EngineID,
  481. TrainUrl: req.TrainUrl,
  482. BranchName: req.BranchName,
  483. Parameters: req.Params,
  484. BootFile: req.BootFile,
  485. DataUrl: req.DataUrl,
  486. LogUrl: req.LogUrl,
  487. PreVersionId: req.PreVersionId,
  488. FlavorCode: req.Spec.SourceSpecId,
  489. Description: req.Description,
  490. WorkServerNumber: req.WorkServerNumber,
  491. FlavorName: req.FlavorName,
  492. EngineName: req.EngineName,
  493. TotalVersionCount: VersionTaskList[0].TotalVersionCount + 1,
  494. VersionCount: VersionListCount + 1,
  495. CreatedUnix: createTime,
  496. UpdatedUnix: createTime,
  497. Spec: req.Spec,
  498. ModelName: req.ModelName,
  499. ModelVersion: req.ModelVersion,
  500. LabelName: req.LabelName,
  501. PreTrainModelUrl: req.PreTrainModelUrl,
  502. CkptName: req.CkptName,
  503. })
  504. if createErr != nil {
  505. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, createErr.Error())
  506. return createErr
  507. }
  508. //将训练任务的上一版本的isLatestVersion设置为"0"
  509. createErr = models.SetVersionCountAndLatestVersion(strconv.FormatInt(jobResult.JobID, 10), VersionTaskList[0].VersionName, VersionCountOne, NotLatestVersion, TotalVersionCount)
  510. if createErr != nil {
  511. ctx.ServerError("Update IsLatestVersion failed", createErr)
  512. return createErr
  513. }
  514. return createErr
  515. }
  516. func TransTrainJobStatus(status int) string {
  517. switch status {
  518. case 0:
  519. return "UNKNOWN"
  520. case 1:
  521. return "INIT"
  522. case 2:
  523. return "IMAGE_CREATING"
  524. case 3:
  525. return "IMAGE_FAILED"
  526. case 4:
  527. return "SUBMIT_TRYING"
  528. case 5:
  529. return "SUBMIT_FAILED"
  530. case 6:
  531. return "DELETE_FAILED"
  532. case 7:
  533. return "WAITING"
  534. case 8:
  535. return "RUNNING"
  536. case 9:
  537. return "KILLING"
  538. case 10:
  539. return "COMPLETED"
  540. case 11:
  541. return "FAILED"
  542. case 12:
  543. return "KILLED"
  544. case 13:
  545. return "CANCELED"
  546. case 14:
  547. return "LOST"
  548. case 15:
  549. return "SCALING"
  550. case 16:
  551. return "SUBMIT_MODEL_FAILED"
  552. case 17:
  553. return "DEPLOY_SERVICE_FAILED"
  554. case 18:
  555. return "CHECK_INIT"
  556. case 19:
  557. return "CHECK_RUNNING"
  558. case 20:
  559. return "CHECK_RUNNING_COMPLETED"
  560. case 21:
  561. return "CHECK_FAILED"
  562. default:
  563. return strconv.Itoa(status)
  564. }
  565. }
  566. func GetOutputPathByCount(TotalVersionCount int) (VersionOutputPath string) {
  567. talVersionCountToString := fmt.Sprintf("%04d", TotalVersionCount)
  568. VersionOutputPath = "V" + talVersionCountToString
  569. return VersionOutputPath
  570. }
  571. func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (jobId string, err error) {
  572. createTime := timeutil.TimeStampNow()
  573. var jobResult *models.CreateTrainJobResult
  574. var createErr error
  575. if req.EngineID < 0 {
  576. jobResult, createErr = createInferenceJobUserImage(models.CreateInfUserImageParams{
  577. JobName: req.JobName,
  578. Description: req.Description,
  579. Config: models.InfUserImageConfig{
  580. WorkServerNum: req.WorkServerNumber,
  581. AppUrl: req.CodeObsPath,
  582. BootFileUrl: req.BootFileUrl,
  583. DataUrl: req.DataUrl,
  584. // TrainUrl: req.TrainUrl,
  585. LogUrl: req.LogUrl,
  586. PoolID: req.PoolID,
  587. CreateVersion: true,
  588. Flavor: models.Flavor{
  589. Code: req.Spec.SourceSpecId,
  590. },
  591. Parameter: req.Parameters,
  592. UserImageUrl: req.UserImageUrl,
  593. UserCommand: req.UserCommand,
  594. },
  595. })
  596. } else {
  597. jobResult, createErr = createInferenceJob(models.CreateInferenceJobParams{
  598. JobName: req.JobName,
  599. Description: req.Description,
  600. InfConfig: models.InfConfig{
  601. WorkServerNum: req.WorkServerNumber,
  602. AppUrl: req.CodeObsPath,
  603. BootFileUrl: req.BootFileUrl,
  604. DataUrl: req.DataUrl,
  605. EngineID: req.EngineID,
  606. // TrainUrl: req.TrainUrl,
  607. LogUrl: req.LogUrl,
  608. PoolID: req.PoolID,
  609. CreateVersion: true,
  610. Flavor: models.Flavor{
  611. Code: req.Spec.SourceSpecId,
  612. },
  613. Parameter: req.Parameters,
  614. },
  615. })
  616. }
  617. if createErr != nil {
  618. log.Error("createInferenceJob failed: %v", err.Error())
  619. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  620. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  621. err = models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  622. JobID: models.TempJobId,
  623. VersionID: models.TempVersionId,
  624. Status: models.TempJobStatus,
  625. Type: models.TypeCloudBrainTwo,
  626. JobName: req.JobName,
  627. JobType: req.JobType,
  628. })
  629. if err != nil {
  630. log.Error("InsertCloudbrainTemp failed: %v", err.Error())
  631. return "", err
  632. }
  633. }
  634. return "", err
  635. }
  636. // attach, err := models.GetAttachmentByUUID(req.Uuid)
  637. // if err != nil {
  638. // log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error())
  639. // return err
  640. // }
  641. jobID := strconv.FormatInt(jobResult.JobID, 10)
  642. err = models.CreateCloudbrain(&models.Cloudbrain{
  643. Status: TransTrainJobStatus(jobResult.Status),
  644. UserID: ctx.User.ID,
  645. RepoID: ctx.Repo.Repository.ID,
  646. JobID: jobID,
  647. JobName: req.JobName,
  648. DisplayJobName: req.DisplayJobName,
  649. JobType: req.JobType,
  650. Type: models.TypeCloudBrainTwo,
  651. VersionID: jobResult.VersionID,
  652. VersionName: jobResult.VersionName,
  653. Uuid: req.Uuid,
  654. DatasetName: req.DatasetName,
  655. CommitID: req.CommitID,
  656. EngineID: req.EngineID,
  657. TrainUrl: req.TrainUrl,
  658. BranchName: req.BranchName,
  659. Parameters: req.Params,
  660. BootFile: req.BootFile,
  661. DataUrl: req.DataUrl,
  662. LogUrl: req.LogUrl,
  663. FlavorCode: req.Spec.SourceSpecId,
  664. Description: req.Description,
  665. WorkServerNumber: req.WorkServerNumber,
  666. FlavorName: req.FlavorName,
  667. EngineName: req.EngineName,
  668. LabelName: req.LabelName,
  669. IsLatestVersion: req.IsLatestVersion,
  670. ComputeResource: models.NPUResource,
  671. VersionCount: req.VersionCount,
  672. TotalVersionCount: req.TotalVersionCount,
  673. ModelName: req.ModelName,
  674. ModelVersion: req.ModelVersion,
  675. CkptName: req.CkptName,
  676. ResultUrl: req.ResultUrl,
  677. CreatedUnix: createTime,
  678. UpdatedUnix: createTime,
  679. Spec: req.Spec,
  680. })
  681. if err != nil {
  682. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
  683. return "", err
  684. }
  685. if req.JobType == string(models.JobTypeModelSafety) {
  686. task, err := models.GetCloudbrainByJobID(jobID)
  687. if err == nil {
  688. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, fmt.Sprint(task.ID), req.DisplayJobName, models.ActionCreateBenchMarkTask)
  689. }
  690. } else {
  691. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateInferenceTask)
  692. }
  693. return jobID, nil
  694. }
  695. func GetNotebookImageName(imageId string) (string, error) {
  696. var validImage = false
  697. var imageName = ""
  698. for _, imageInfo := range setting.StImageInfos.ImageInfo {
  699. if imageInfo.Id == imageId {
  700. validImage = true
  701. imageName = imageInfo.Value
  702. }
  703. }
  704. if !validImage {
  705. log.Error("the image id(%s) is invalid", imageId)
  706. return imageName, errors.New("the image id is invalid")
  707. }
  708. return imageName, nil
  709. }
  710. func InitSpecialPool() {
  711. if SpecialPools == nil && setting.ModelArtsSpecialPools != "" {
  712. json.Unmarshal([]byte(setting.ModelArtsSpecialPools), &SpecialPools)
  713. }
  714. }
  715. func InitMultiNode() {
  716. if MultiNodeConfig == nil && setting.ModelArtsMultiNode != "" {
  717. json.Unmarshal([]byte(setting.ModelArtsMultiNode), &MultiNodeConfig)
  718. }
  719. }
  720. func HandleTrainJobInfo(task *models.Cloudbrain) error {
  721. result, err := GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
  722. if err != nil {
  723. log.Error("GetTrainJob(%s) failed:%v", task.DisplayJobName, err)
  724. return err
  725. }
  726. if result != nil {
  727. oldStatus := task.Status
  728. task.Status = TransTrainJobStatus(result.IntStatus)
  729. task.Duration = result.Duration / 1000
  730. task.TrainJobDuration = result.TrainJobDuration
  731. if task.StartTime == 0 && result.StartTime > 0 {
  732. task.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
  733. }
  734. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  735. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  736. task.EndTime = task.StartTime.Add(task.Duration)
  737. }
  738. task.CorrectCreateUnix()
  739. if oldStatus != task.Status {
  740. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  741. }
  742. err = models.UpdateJob(task)
  743. if err != nil {
  744. log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
  745. return err
  746. }
  747. }
  748. return nil
  749. }
  750. func HandleNotebookInfo(task *models.Cloudbrain) error {
  751. var result *models.GetNotebook2Result
  752. var err error
  753. if task.Type == models.TypeCloudBrainTwo {
  754. result, err = GetNotebook2(task.JobID)
  755. } else if task.Type == models.TypeCDCenter {
  756. result, err = modelarts_cd.GetNotebook(task.JobID)
  757. }
  758. if err != nil {
  759. log.Error("GetNotebook2(%s) failed:%v", task.DisplayJobName, err)
  760. return err
  761. }
  762. if result != nil {
  763. oldStatus := task.Status
  764. task.Status = result.Status
  765. if task.StartTime == 0 && result.Lease.UpdateTime > 0 {
  766. task.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000)
  767. }
  768. if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) {
  769. task.EndTime = timeutil.TimeStampNow()
  770. }
  771. task.CorrectCreateUnix()
  772. task.ComputeAndSetDuration()
  773. if oldStatus != task.Status {
  774. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  775. }
  776. if task.FlavorCode == "" {
  777. task.FlavorCode = result.Flavor
  778. }
  779. if oldStatus != task.Status && task.Status == string(models.ModelArtsRunning) && task.BootFile != "" {
  780. uploadNoteBookFile(task, result)
  781. }
  782. err = models.UpdateJob(task)
  783. if err != nil {
  784. log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err)
  785. return err
  786. }
  787. }
  788. return nil
  789. }
  790. func uploadNoteBookFile(task *models.Cloudbrain, result *models.GetNotebook2Result) {
  791. jupyterUrl := result.Url + "?token=" + result.Token
  792. cookies, xsrf := getCookiesAndCsrf(jupyterUrl)
  793. if xsrf == "" {
  794. log.Error("browser jupyterUrl failed:%v", task.DisplayJobName)
  795. } else {
  796. codePath := setting.JobPath + task.JobName + cloudbrain.CodeMountPath
  797. fileContents, err := ioutil.ReadFile(codePath + "/" + task.BootFile)
  798. if err != nil {
  799. log.Error("read jupyter file failed:%v", task.DisplayJobName, err)
  800. }
  801. base64Content := base64.StdEncoding.EncodeToString(fileContents)
  802. client := getRestyClient()
  803. uploadUrl := getJupyterBaseUrl(result.Url) + "api/contents/" + path.Base(task.BootFile)
  804. res, err := client.R().
  805. SetCookies(cookies).
  806. SetHeader("X-XSRFToken", xsrf).
  807. SetBody(map[string]interface{}{
  808. "type": "file",
  809. "format": "base64",
  810. "name": path.Base(task.BootFile),
  811. "path": path.Base(task.BootFile),
  812. "content": base64Content}).
  813. Put(uploadUrl)
  814. if err != nil {
  815. log.Error("upload jupyter file failed:%v", task.DisplayJobName, err)
  816. } else if res.StatusCode() != http.StatusCreated {
  817. log.Error("upload jupyter file failed:%v", task.DisplayJobName, err)
  818. }
  819. }
  820. }
  821. func getJupyterBaseUrl(url string) string {
  822. jupyterUrlLength := len(url)
  823. baseUrl := url[0 : jupyterUrlLength-len(path.Base(url))]
  824. return baseUrl
  825. }
  826. func getCookiesAndCsrf(jupyterUrl string) ([]*http.Cookie, string) {
  827. log.Info("jupyter url:" + jupyterUrl)
  828. var cookies []*http.Cookie
  829. const retryTimes = 10
  830. for i := 0; i < retryTimes; i++ {
  831. res, err := http.Get(jupyterUrl)
  832. if err != nil {
  833. log.Error("browser jupyterUrl failed.", err)
  834. if i == retryTimes-1 {
  835. return cookies, ""
  836. }
  837. } else {
  838. cookies = res.Cookies()
  839. xsrf := ""
  840. for _, cookie := range cookies {
  841. if cookie.Name == "_xsrf" {
  842. xsrf = cookie.Value
  843. break
  844. }
  845. }
  846. if xsrf != "" {
  847. return cookies, xsrf
  848. }
  849. }
  850. }
  851. return cookies, ""
  852. }
  853. func SyncTempStatusJob() {
  854. jobs, err := models.GetCloudBrainTempJobs()
  855. if err != nil {
  856. log.Error("GetCloudBrainTempJobs failed:%v", err.Error())
  857. return
  858. }
  859. for _, temp := range jobs {
  860. log.Info("start to handle record: %s", temp.JobName)
  861. if temp.Type == models.TypeCloudBrainTwo {
  862. if temp.JobType == string(models.JobTypeDebug) {
  863. err = handleNotebook(temp)
  864. if err != nil {
  865. log.Error("handleNotebook falied:%v", err)
  866. break
  867. }
  868. } else if temp.JobType == string(models.JobTypeTrain) || temp.JobType == string(models.JobTypeInference) {
  869. _, err = models.GetCloudbrainByJobID(temp.JobID)
  870. if err != nil {
  871. //one version
  872. err = handleTrainJob(temp)
  873. if err != nil {
  874. log.Error("handleTrainJob falied:%v", err)
  875. break
  876. }
  877. } else {
  878. //multi version
  879. err = handleTrainJobMultiVersion(temp)
  880. if err != nil {
  881. log.Error("handleTrainJobMultiVersion falied:%v", err)
  882. break
  883. }
  884. }
  885. }
  886. }
  887. }
  888. return
  889. }
  890. func handleNotebook(temp *models.CloudbrainTemp) error {
  891. if temp.Status == models.TempJobStatus {
  892. err := handleTempNotebook(temp)
  893. if err != nil {
  894. log.Error("handleTempNotebook failed:%v", err)
  895. return err
  896. }
  897. } else if temp.Status == string(models.ModelArtsStopping) {
  898. res, err := GetNotebook2(temp.JobID)
  899. if err != nil {
  900. log.Error("GetNotebook2 failed:%v", err)
  901. return err
  902. }
  903. temp.Status = res.Status
  904. if temp.Status == string(models.ModelArtsStopped) {
  905. err = models.UpdateCloudbrainTemp(temp)
  906. if err != nil {
  907. log.Error("UpdateCloudbrainTemp failed:%v", err)
  908. return err
  909. }
  910. _, err := DelNotebook2(temp.JobID)
  911. if err != nil {
  912. log.Error("DelNotebook2 failed:%v", err)
  913. return err
  914. }
  915. temp.Status = string(models.ModelArtsDeleted)
  916. err = models.UpdateCloudbrainTemp(temp)
  917. if err != nil {
  918. log.Error("UpdateCloudbrainTemp failed:%v", err)
  919. return err
  920. }
  921. }
  922. }
  923. return nil
  924. }
  925. func handleTempNotebook(temp *models.CloudbrainTemp) error {
  926. var err error
  927. var isExist bool
  928. for {
  929. result, err := GetNotebookList(1000, 0, "createTime", "DESC", temp.JobName)
  930. if err != nil {
  931. log.Error("GetNotebookList failed:%v", err)
  932. break
  933. }
  934. temp.QueryTimes++
  935. err = models.UpdateCloudbrainTemp(temp)
  936. if err != nil {
  937. log.Error("UpdateCloudbrainTemp failed:%v", err)
  938. }
  939. if result != nil {
  940. for _, notebook := range result.NotebookList {
  941. if temp.JobID == models.TempJobId {
  942. //new notebook
  943. if notebook.JobName == temp.JobName {
  944. isExist = true
  945. temp.Status = notebook.Status
  946. temp.JobID = notebook.JobID
  947. break
  948. }
  949. } else {
  950. //restart: always can find one record
  951. if notebook.JobName == temp.JobName {
  952. if notebook.Status != string(models.ModelArtsStopped) {
  953. isExist = true
  954. temp.Status = notebook.Status
  955. temp.JobID = notebook.JobID
  956. break
  957. }
  958. }
  959. }
  960. }
  961. if isExist {
  962. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  963. if temp.Status == string(models.ModelArtsCreateFailed) {
  964. err = models.UpdateCloudbrainTemp(temp)
  965. if err != nil {
  966. log.Error("UpdateCloudbrainTemp failed:%v", err)
  967. break
  968. }
  969. _, err := DelNotebook2(temp.JobID)
  970. if err != nil {
  971. log.Error("DelNotebook2(%s) failed:%v", temp.JobName, err)
  972. break
  973. }
  974. temp.Status = string(models.ModelArtsDeleted)
  975. } else {
  976. _, err := ManageNotebook2(temp.JobID, models.NotebookAction{Action: models.ActionStop})
  977. if err != nil {
  978. log.Error("ManageNotebook2(%s) failed:%v", temp.JobName, err)
  979. break
  980. }
  981. temp.Status = string(models.ModelArtsStopping)
  982. }
  983. models.UpdateCloudbrainTemp(temp)
  984. } else {
  985. log.Error("can not find the record(%s) till now", temp.JobName)
  986. err = errors.New("not found")
  987. break
  988. }
  989. } else {
  990. log.Error("can not find the record(%s) till now", temp.JobName)
  991. err = errors.New("not found")
  992. break
  993. }
  994. break
  995. }
  996. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  997. log.Info("reach MaxTempQueryTimes, set the job failed")
  998. temp.Status = string(models.ModelArtsTrainJobFailed)
  999. err = models.UpdateCloudbrainTemp(temp)
  1000. if err != nil {
  1001. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1002. return err
  1003. }
  1004. }
  1005. return err
  1006. }
  1007. func handleTrainJob(temp *models.CloudbrainTemp) error {
  1008. if temp.Status == models.TempJobStatus {
  1009. err := handleTempTrainJob(temp)
  1010. if err != nil {
  1011. log.Error("handleTempTrainJob failed:%v", err)
  1012. return err
  1013. }
  1014. } else if temp.Status == string(models.ModelArtsTrainJobKilling) {
  1015. res, err := GetTrainJob(temp.JobID, temp.VersionID)
  1016. if err != nil {
  1017. log.Error("GetTrainJob failed:%v", err)
  1018. return err
  1019. }
  1020. temp.Status = TransTrainJobStatus(res.IntStatus)
  1021. if temp.Status == string(models.ModelArtsTrainJobKilled) {
  1022. err = models.UpdateCloudbrainTemp(temp)
  1023. if err != nil {
  1024. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1025. return err
  1026. }
  1027. _, err := DelTrainJob(temp.JobID)
  1028. if err != nil {
  1029. log.Error("DelTrainJob failed:%v", err)
  1030. return err
  1031. }
  1032. temp.Status = string(models.ModelArtsDeleted)
  1033. err = models.UpdateCloudbrainTemp(temp)
  1034. if err != nil {
  1035. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1036. return err
  1037. }
  1038. }
  1039. }
  1040. return nil
  1041. }
  1042. func handleTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
  1043. if temp.Status == models.TempJobStatus {
  1044. err := handleTempTrainJobMultiVersion(temp)
  1045. if err != nil {
  1046. log.Error("handleTempTrainJobMultiVersion failed:%v", err)
  1047. return err
  1048. }
  1049. } else if temp.Status == string(models.ModelArtsTrainJobKilling) {
  1050. res, err := GetTrainJob(temp.JobID, temp.VersionID)
  1051. if err != nil {
  1052. log.Error("GetTrainJob failed:%v", err)
  1053. return err
  1054. }
  1055. temp.Status = TransTrainJobStatus(res.IntStatus)
  1056. if temp.Status == string(models.ModelArtsTrainJobKilled) {
  1057. err = models.UpdateCloudbrainTemp(temp)
  1058. if err != nil {
  1059. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1060. return err
  1061. }
  1062. _, err := DelTrainJobVersion(temp.JobID, temp.VersionID)
  1063. if err != nil {
  1064. log.Error("DelTrainJob failed:%v", err)
  1065. return err
  1066. }
  1067. temp.Status = string(models.ModelArtsDeleted)
  1068. err = models.UpdateCloudbrainTemp(temp)
  1069. if err != nil {
  1070. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1071. return err
  1072. }
  1073. }
  1074. }
  1075. return nil
  1076. }
  1077. func handleTempTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
  1078. var err error
  1079. var isExist bool
  1080. for {
  1081. result, err := GetTrainJobVersionList(1000, 1, temp.JobID)
  1082. if err != nil {
  1083. log.Error("GetTrainJobVersionList failed:%v", err)
  1084. break
  1085. }
  1086. temp.QueryTimes++
  1087. err = models.UpdateCloudbrainTemp(temp)
  1088. if err != nil {
  1089. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1090. }
  1091. if result != nil {
  1092. count, _ := models.GetCloudbrainCountByJobName(temp.JobName, temp.JobType, temp.Type)
  1093. if result.VersionCount == int64(count+1) {
  1094. isExist = true
  1095. temp.Status = TransTrainJobStatus(result.JobVersionList[0].IntStatus)
  1096. temp.VersionID = strconv.FormatInt(result.JobVersionList[0].VersionID, 10)
  1097. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  1098. _, err := StopTrainJob(temp.JobID, temp.VersionID)
  1099. if err != nil {
  1100. log.Error("StopTrainJob failed:%v", err)
  1101. break
  1102. }
  1103. temp.Status = string(models.ModelArtsTrainJobKilling)
  1104. err = models.UpdateCloudbrainTemp(temp)
  1105. if err != nil {
  1106. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1107. break
  1108. }
  1109. } else {
  1110. log.Error("can not find the record(%s) till now", temp.JobName)
  1111. err = errors.New("not found")
  1112. break
  1113. }
  1114. }
  1115. break
  1116. }
  1117. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1118. log.Info("reach MaxTempQueryTimes, set the job failed")
  1119. temp.Status = string(models.ModelArtsTrainJobFailed)
  1120. err = models.UpdateCloudbrainTemp(temp)
  1121. if err != nil {
  1122. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1123. return err
  1124. }
  1125. }
  1126. return err
  1127. }
  1128. func handleTempTrainJob(temp *models.CloudbrainTemp) error {
  1129. var err error
  1130. var isExist bool
  1131. for {
  1132. result, err := GetTrainJobList(1000, 1, "create_time", "desc", temp.JobName)
  1133. if err != nil {
  1134. log.Error("GetTrainJobList failed:%v", err)
  1135. break
  1136. }
  1137. temp.QueryTimes++
  1138. err = models.UpdateCloudbrainTemp(temp)
  1139. if err != nil {
  1140. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1141. }
  1142. if result != nil {
  1143. for _, job := range result.JobList {
  1144. if temp.JobName == job.JobName && TransTrainJobStatus(job.IntStatus) != string(models.ModelArtsTrainJobFailed) {
  1145. isExist = true
  1146. temp.Status = TransTrainJobStatus(job.IntStatus)
  1147. temp.JobID = strconv.FormatInt(job.JobID, 10)
  1148. temp.VersionID = strconv.FormatInt(job.VersionID, 10)
  1149. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  1150. _, err = StopTrainJob(temp.JobID, temp.VersionID)
  1151. if err != nil {
  1152. log.Error("StopTrainJob(%s) failed:%v", temp.JobName, err)
  1153. break
  1154. }
  1155. temp.Status = string(models.ModelArtsTrainJobKilling)
  1156. err = models.UpdateCloudbrainTemp(temp)
  1157. if err != nil {
  1158. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1159. break
  1160. }
  1161. }
  1162. }
  1163. if !isExist {
  1164. log.Error("can not find the record(%s) till now", temp.JobName)
  1165. err = errors.New("not found")
  1166. break
  1167. }
  1168. }
  1169. break
  1170. }
  1171. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1172. log.Info("reach MaxTempQueryTimes, set the job failed")
  1173. temp.Status = string(models.ModelArtsTrainJobFailed)
  1174. err = models.UpdateCloudbrainTemp(temp)
  1175. if err != nil {
  1176. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1177. return err
  1178. }
  1179. }
  1180. return err
  1181. }