You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

modelarts.go 40 kB

4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago

  1. package modelarts
  2. import (
  3. "encoding/base64"
  4. "encoding/json"
  5. "errors"
  6. "fmt"
  7. "io/ioutil"
  8. "net/http"
  9. "path"
  10. "strconv"
  11. "strings"
  12. "code.gitea.io/gitea/modules/cloudbrain"
  13. "code.gitea.io/gitea/modules/modelarts_cd"
  14. "code.gitea.io/gitea/models"
  15. "code.gitea.io/gitea/modules/context"
  16. "code.gitea.io/gitea/modules/log"
  17. "code.gitea.io/gitea/modules/notification"
  18. "code.gitea.io/gitea/modules/setting"
  19. "code.gitea.io/gitea/modules/storage"
  20. "code.gitea.io/gitea/modules/timeutil"
  21. )
  22. const (
  23. //notebook
  24. storageTypeOBS = "obs"
  25. autoStopDuration = 4 * 60 * 60
  26. AutoStopDurationMs = 4 * 60 * 60 * 1000
  27. MORDELART_USER_IMAGE_ENGINE_ID = -1
  28. DataSetMountPath = "/home/ma-user/work"
  29. NotebookEnv = "Python3"
  30. NotebookType = "Ascend"
  31. FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)"
  32. //train-job
  33. // ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}"
  34. // Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}"
  35. // EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," +
  36. // "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," +
  37. // "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," +
  38. // "{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" +
  39. // "]}"
  40. // TrainJobFlavorInfo = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," +
  41. // "{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," +
  42. // "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," +
  43. // "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" +
  44. // "]}"
  45. CodePath = "/code/"
  46. OutputPath = "/output/"
  47. ResultPath = "/result/"
  48. LogPath = "/log/"
  49. JobPath = "/job/"
  50. OrderDesc = "desc" //向下查询
  51. OrderAsc = "asc" //向上查询
  52. Lines = 500
  53. TrainUrl = "train_url"
  54. DataUrl = "data_url"
  55. MultiDataUrl = "multi_data_url"
  56. ResultUrl = "result_url"
  57. CkptUrl = "ckpt_url"
  58. DeviceTarget = "device_target"
  59. Ascend = "Ascend"
  60. PerPage = 10
  61. IsLatestVersion = "1"
  62. NotLatestVersion = "0"
  63. VersionCountOne = 1
  64. SortByCreateTime = "create_time"
  65. ConfigTypeCustom = "custom"
  66. TotalVersionCount = 1
  67. )
  68. var (
  69. poolInfos *models.PoolInfos
  70. TrainFlavorInfos *Flavor
  71. SpecialPools *models.SpecialPools
  72. MultiNodeConfig *MultiNodes
  73. )
  74. type GenerateTrainJobReq struct {
  75. JobName string
  76. DisplayJobName string
  77. Uuid string
  78. Description string
  79. CodeObsPath string
  80. BootFile string
  81. BootFileUrl string
  82. DataUrl string
  83. TrainUrl string
  84. LogUrl string
  85. PoolID string
  86. WorkServerNumber int
  87. EngineID int64
  88. Parameters []models.Parameter
  89. CommitID string
  90. IsLatestVersion string
  91. Params string
  92. BranchName string
  93. PreVersionId int64
  94. PreVersionName string
  95. FlavorCode string
  96. FlavorName string
  97. VersionCount int
  98. EngineName string
  99. TotalVersionCount int
  100. UserImageUrl string
  101. UserCommand string
  102. DatasetName string
  103. Spec *models.Specification
  104. ModelName string
  105. LabelName string
  106. CkptName string
  107. ModelVersion string
  108. PreTrainModelUrl string
  109. }
  110. type GenerateInferenceJobReq struct {
  111. JobName string
  112. DisplayJobName string
  113. Uuid string
  114. Description string
  115. CodeObsPath string
  116. BootFile string
  117. BootFileUrl string
  118. DataUrl string
  119. TrainUrl string
  120. LogUrl string
  121. PoolID string
  122. WorkServerNumber int
  123. EngineID int64
  124. Parameters []models.Parameter
  125. CommitID string
  126. Params string
  127. BranchName string
  128. FlavorName string
  129. EngineName string
  130. LabelName string
  131. IsLatestVersion string
  132. VersionCount int
  133. TotalVersionCount int
  134. ModelName string
  135. ModelVersion string
  136. CkptName string
  137. ResultUrl string
  138. Spec *models.Specification
  139. DatasetName string
  140. JobType string
  141. UserImageUrl string
  142. UserCommand string
  143. }
  144. type VersionInfo struct {
  145. Version []struct {
  146. ID int `json:"id"`
  147. Value string `json:"value"`
  148. Url string `json:"url"`
  149. } `json:"version"`
  150. }
  151. type Flavor struct {
  152. Info []struct {
  153. Code string `json:"code"`
  154. Value string `json:"value"`
  155. UnitPrice int64 `json:"unitPrice"`
  156. } `json:"flavor"`
  157. }
  158. type Engine struct {
  159. Info []struct {
  160. ID int `json:"id"`
  161. Value string `json:"value"`
  162. } `json:"engine"`
  163. }
  164. type ResourcePool struct {
  165. Info []struct {
  166. ID string `json:"id"`
  167. Value string `json:"value"`
  168. } `json:"resource_pool"`
  169. }
  170. type MultiNodes struct {
  171. Info []OrgMultiNode `json:"multinode"`
  172. }
  173. type OrgMultiNode struct {
  174. Org string `json:"org"`
  175. Node []int `json:"node"`
  176. }
  177. // type Parameter struct {
  178. // Label string `json:"label"`
  179. // Value string `json:"value"`
  180. // }
  181. // type Parameters struct {
  182. // Parameter []Parameter `json:"parameter"`
  183. // }
  184. type Parameters struct {
  185. Parameter []struct {
  186. Label string `json:"label"`
  187. Value string `json:"value"`
  188. } `json:"parameter"`
  189. }
  190. func GenerateTask(ctx *context.Context, jobName, uuid, description, flavor string) error {
  191. var dataActualPath string
  192. if uuid != "" {
  193. dataActualPath = setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/"
  194. } else {
  195. userPath := setting.UserBasePath + ctx.User.Name + "/"
  196. isExist, err := storage.ObsHasObject(userPath)
  197. if err != nil {
  198. log.Error("ObsHasObject failed:%v", err.Error(), ctx.Data["MsgID"])
  199. return err
  200. }
  201. if !isExist {
  202. if err = storage.ObsCreateObject(userPath); err != nil {
  203. log.Error("ObsCreateObject failed:%v", err.Error(), ctx.Data["MsgID"])
  204. return err
  205. }
  206. }
  207. dataActualPath = setting.Bucket + "/" + userPath
  208. }
  209. if poolInfos == nil {
  210. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  211. }
  212. createTime := timeutil.TimeStampNow()
  213. jobResult, err := CreateJob(models.CreateNotebookParams{
  214. JobName: jobName,
  215. Description: description,
  216. ProfileID: setting.ProfileID,
  217. Flavor: flavor,
  218. Pool: models.Pool{
  219. ID: poolInfos.PoolInfo[0].PoolId,
  220. Name: poolInfos.PoolInfo[0].PoolName,
  221. Type: poolInfos.PoolInfo[0].PoolType,
  222. },
  223. Spec: models.Spec{
  224. Storage: models.Storage{
  225. Type: storageTypeOBS,
  226. Location: models.Location{
  227. Path: dataActualPath,
  228. },
  229. },
  230. AutoStop: models.AutoStop{
  231. Enable: true,
  232. Duration: autoStopDuration,
  233. },
  234. },
  235. })
  236. if err != nil {
  237. log.Error("CreateJob failed: %v", err.Error())
  238. return err
  239. }
  240. err = models.CreateCloudbrain(&models.Cloudbrain{
  241. Status: string(models.JobWaiting),
  242. UserID: ctx.User.ID,
  243. RepoID: ctx.Repo.Repository.ID,
  244. JobID: jobResult.ID,
  245. JobName: jobName,
  246. JobType: string(models.JobTypeDebug),
  247. Type: models.TypeCloudBrainTwo,
  248. Uuid: uuid,
  249. ComputeResource: models.NPUResource,
  250. CreatedUnix: createTime,
  251. UpdatedUnix: createTime,
  252. })
  253. if err != nil {
  254. return err
  255. }
  256. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobResult.ID, jobName, models.ActionCreateDebugNPUTask)
  257. return nil
  258. }
  259. func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, description, imageId string, spec *models.Specification, bootFile string,autoStopDurationInMs int64) (string, error) {
  260. if poolInfos == nil {
  261. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  262. }
  263. imageName, err := GetNotebookImageName(imageId)
  264. if err != nil {
  265. log.Error("GetNotebookImageName failed: %v", err.Error())
  266. return "", err
  267. }
  268. createTime := timeutil.TimeStampNow()
  269. jobResult, err := createNotebook2(models.CreateNotebook2Params{
  270. JobName: jobName,
  271. Description: description,
  272. Flavor: spec.SourceSpecId,
  273. Duration: autoStopDurationInMs,
  274. ImageID: imageId,
  275. PoolID: poolInfos.PoolInfo[0].PoolId,
  276. Feature: models.NotebookFeature,
  277. Volume: models.VolumeReq{
  278. Capacity: setting.Capacity,
  279. Category: models.EVSCategory,
  280. Ownership: models.ManagedOwnership,
  281. },
  282. WorkspaceID: "0",
  283. })
  284. if err != nil {
  285. log.Error("createNotebook2 failed: %v", err.Error())
  286. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  287. log.Info("(%s)unknown error, set temp status", displayJobName)
  288. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  289. JobID: models.TempJobId,
  290. VersionID: models.TempVersionId,
  291. Status: models.TempJobStatus,
  292. Type: models.TypeCloudBrainTwo,
  293. JobName: jobName,
  294. JobType: string(models.JobTypeDebug),
  295. })
  296. if errTemp != nil {
  297. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  298. return "", errTemp
  299. }
  300. }
  301. return "", err
  302. }
  303. task := &models.Cloudbrain{
  304. Status: jobResult.Status,
  305. UserID: ctx.User.ID,
  306. RepoID: ctx.Repo.Repository.ID,
  307. JobID: jobResult.ID,
  308. JobName: jobName,
  309. FlavorCode: spec.SourceSpecId,
  310. DisplayJobName: displayJobName,
  311. JobType: string(models.JobTypeDebug),
  312. Type: models.TypeCloudBrainTwo,
  313. Uuid: uuid,
  314. ComputeResource: models.NPUResource,
  315. Image: imageName,
  316. BootFile: bootFile,
  317. Description: description,
  318. CreatedUnix: createTime,
  319. UpdatedUnix: createTime,
  320. Spec: spec,
  321. }
  322. err = models.CreateCloudbrain(task)
  323. if err != nil {
  324. return "", err
  325. }
  326. stringId := strconv.FormatInt(task.ID, 10)
  327. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateDebugNPUTask)
  328. return jobResult.ID, nil
  329. }
  330. func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (jobId string, err error) {
  331. createTime := timeutil.TimeStampNow()
  332. var jobResult *models.CreateTrainJobResult
  333. var createErr error
  334. if req.EngineID < 0 {
  335. jobResult, createErr = createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  336. JobName: req.JobName,
  337. Description: req.Description,
  338. Config: models.UserImageConfig{
  339. WorkServerNum: req.WorkServerNumber,
  340. AppUrl: req.CodeObsPath,
  341. BootFileUrl: req.BootFileUrl,
  342. DataUrl: req.DataUrl,
  343. TrainUrl: req.TrainUrl,
  344. LogUrl: req.LogUrl,
  345. PoolID: req.PoolID,
  346. CreateVersion: true,
  347. Flavor: models.Flavor{
  348. Code: req.Spec.SourceSpecId,
  349. },
  350. Parameter: req.Parameters,
  351. UserImageUrl: req.UserImageUrl,
  352. UserCommand: req.UserCommand,
  353. },
  354. })
  355. } else {
  356. jobResult, createErr = createTrainJob(models.CreateTrainJobParams{
  357. JobName: req.JobName,
  358. Description: req.Description,
  359. Config: models.Config{
  360. WorkServerNum: req.WorkServerNumber,
  361. AppUrl: req.CodeObsPath,
  362. BootFileUrl: req.BootFileUrl,
  363. DataUrl: req.DataUrl,
  364. EngineID: req.EngineID,
  365. TrainUrl: req.TrainUrl,
  366. LogUrl: req.LogUrl,
  367. PoolID: req.PoolID,
  368. CreateVersion: true,
  369. Flavor: models.Flavor{
  370. Code: req.Spec.SourceSpecId,
  371. },
  372. Parameter: req.Parameters,
  373. },
  374. })
  375. }
  376. if createErr != nil {
  377. log.Error("createTrainJob failed: %v", createErr.Error())
  378. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  379. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  380. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  381. JobID: models.TempJobId,
  382. VersionID: models.TempVersionId,
  383. Status: models.TempJobStatus,
  384. Type: models.TypeCloudBrainTwo,
  385. JobName: req.JobName,
  386. JobType: string(models.JobTypeTrain),
  387. })
  388. if errTemp != nil {
  389. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  390. return "", errTemp
  391. }
  392. }
  393. return "", createErr
  394. }
  395. jobID := strconv.FormatInt(jobResult.JobID, 10)
  396. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  397. Status: TransTrainJobStatus(jobResult.Status),
  398. UserID: ctx.User.ID,
  399. RepoID: ctx.Repo.Repository.ID,
  400. JobID: jobID,
  401. JobName: req.JobName,
  402. DisplayJobName: req.DisplayJobName,
  403. JobType: string(models.JobTypeTrain),
  404. Type: models.TypeCloudBrainTwo,
  405. VersionID: jobResult.VersionID,
  406. VersionName: jobResult.VersionName,
  407. Uuid: req.Uuid,
  408. DatasetName: req.DatasetName,
  409. CommitID: req.CommitID,
  410. IsLatestVersion: req.IsLatestVersion,
  411. ComputeResource: models.NPUResource,
  412. EngineID: req.EngineID,
  413. TrainUrl: req.TrainUrl,
  414. BranchName: req.BranchName,
  415. Parameters: req.Params,
  416. BootFile: req.BootFile,
  417. DataUrl: req.DataUrl,
  418. LogUrl: req.LogUrl,
  419. FlavorCode: req.Spec.SourceSpecId,
  420. Description: req.Description,
  421. WorkServerNumber: req.WorkServerNumber,
  422. FlavorName: req.FlavorName,
  423. EngineName: req.EngineName,
  424. VersionCount: req.VersionCount,
  425. TotalVersionCount: req.TotalVersionCount,
  426. CreatedUnix: createTime,
  427. UpdatedUnix: createTime,
  428. Spec: req.Spec,
  429. ModelName: req.ModelName,
  430. ModelVersion: req.ModelVersion,
  431. LabelName: req.LabelName,
  432. PreTrainModelUrl: req.PreTrainModelUrl,
  433. CkptName: req.CkptName,
  434. })
  435. if createErr != nil {
  436. log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, createErr.Error())
  437. return "", createErr
  438. }
  439. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateTrainTask)
  440. return jobID, nil
  441. }
  442. func GenerateModelConvertTrainJob(req *GenerateTrainJobReq) (*models.CreateTrainJobResult, error) {
  443. return createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  444. JobName: req.JobName,
  445. Description: req.Description,
  446. Config: models.UserImageConfig{
  447. WorkServerNum: req.WorkServerNumber,
  448. AppUrl: req.CodeObsPath,
  449. BootFileUrl: req.BootFileUrl,
  450. DataUrl: req.DataUrl,
  451. TrainUrl: req.TrainUrl,
  452. LogUrl: req.LogUrl,
  453. PoolID: req.PoolID,
  454. CreateVersion: true,
  455. Flavor: models.Flavor{
  456. Code: req.FlavorCode,
  457. },
  458. Parameter: req.Parameters,
  459. UserImageUrl: req.UserImageUrl,
  460. UserCommand: req.UserCommand,
  461. },
  462. })
  463. }
  464. func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) {
  465. createTime := timeutil.TimeStampNow()
  466. var jobResult *models.CreateTrainJobResult
  467. var createErr error
  468. if req.EngineID < 0 {
  469. jobResult, createErr = createTrainJobVersionUserImage(models.CreateTrainJobVersionUserImageParams{
  470. Description: req.Description,
  471. Config: models.TrainJobVersionUserImageConfig{
  472. WorkServerNum: req.WorkServerNumber,
  473. AppUrl: req.CodeObsPath,
  474. BootFileUrl: req.BootFileUrl,
  475. DataUrl: req.DataUrl,
  476. TrainUrl: req.TrainUrl,
  477. LogUrl: req.LogUrl,
  478. PoolID: req.PoolID,
  479. Flavor: models.Flavor{
  480. Code: req.Spec.SourceSpecId,
  481. },
  482. Parameter: req.Parameters,
  483. PreVersionId: req.PreVersionId,
  484. UserImageUrl: req.UserImageUrl,
  485. UserCommand: req.UserCommand,
  486. },
  487. }, jobId)
  488. } else {
  489. jobResult, createErr = createTrainJobVersion(models.CreateTrainJobVersionParams{
  490. Description: req.Description,
  491. Config: models.TrainJobVersionConfig{
  492. WorkServerNum: req.WorkServerNumber,
  493. AppUrl: req.CodeObsPath,
  494. BootFileUrl: req.BootFileUrl,
  495. DataUrl: req.DataUrl,
  496. EngineID: req.EngineID,
  497. TrainUrl: req.TrainUrl,
  498. LogUrl: req.LogUrl,
  499. PoolID: req.PoolID,
  500. Flavor: models.Flavor{
  501. Code: req.Spec.SourceSpecId,
  502. },
  503. Parameter: req.Parameters,
  504. PreVersionId: req.PreVersionId,
  505. },
  506. }, jobId)
  507. }
  508. if createErr != nil {
  509. log.Error("createTrainJobVersion failed: %v", createErr.Error())
  510. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  511. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  512. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  513. JobID: jobId,
  514. VersionID: models.TempVersionId,
  515. Status: models.TempJobStatus,
  516. Type: models.TypeCloudBrainTwo,
  517. JobName: req.JobName,
  518. JobType: string(models.JobTypeTrain),
  519. })
  520. if errTemp != nil {
  521. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  522. return errTemp
  523. }
  524. }
  525. return createErr
  526. }
  527. var jobTypes []string
  528. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  529. repo := ctx.Repo.Repository
  530. VersionTaskList, VersionListCount, createErr := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  531. RepoID: repo.ID,
  532. Type: models.TypeCloudBrainTwo,
  533. JobTypes: jobTypes,
  534. JobID: strconv.FormatInt(jobResult.JobID, 10),
  535. })
  536. if createErr != nil {
  537. ctx.ServerError("Cloudbrain", createErr)
  538. return createErr
  539. }
  540. //将当前版本的isLatestVersion设置为"1"和任务数量更新,任务数量包括当前版本数VersionCount和历史创建的总版本数TotalVersionCount
  541. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  542. Status: TransTrainJobStatus(jobResult.Status),
  543. UserID: ctx.User.ID,
  544. RepoID: ctx.Repo.Repository.ID,
  545. JobID: strconv.FormatInt(jobResult.JobID, 10),
  546. JobName: req.JobName,
  547. DisplayJobName: req.DisplayJobName,
  548. JobType: string(models.JobTypeTrain),
  549. Type: models.TypeCloudBrainTwo,
  550. VersionID: jobResult.VersionID,
  551. VersionName: jobResult.VersionName,
  552. Uuid: req.Uuid,
  553. DatasetName: req.DatasetName,
  554. CommitID: req.CommitID,
  555. IsLatestVersion: req.IsLatestVersion,
  556. PreVersionName: req.PreVersionName,
  557. ComputeResource: models.NPUResource,
  558. EngineID: req.EngineID,
  559. TrainUrl: req.TrainUrl,
  560. BranchName: req.BranchName,
  561. Parameters: req.Params,
  562. BootFile: req.BootFile,
  563. DataUrl: req.DataUrl,
  564. LogUrl: req.LogUrl,
  565. PreVersionId: req.PreVersionId,
  566. FlavorCode: req.Spec.SourceSpecId,
  567. Description: req.Description,
  568. WorkServerNumber: req.WorkServerNumber,
  569. FlavorName: req.FlavorName,
  570. EngineName: req.EngineName,
  571. TotalVersionCount: VersionTaskList[0].TotalVersionCount + 1,
  572. VersionCount: VersionListCount + 1,
  573. CreatedUnix: createTime,
  574. UpdatedUnix: createTime,
  575. Spec: req.Spec,
  576. ModelName: req.ModelName,
  577. ModelVersion: req.ModelVersion,
  578. LabelName: req.LabelName,
  579. PreTrainModelUrl: req.PreTrainModelUrl,
  580. CkptName: req.CkptName,
  581. })
  582. if createErr != nil {
  583. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, createErr.Error())
  584. return createErr
  585. }
  586. //将训练任务的上一版本的isLatestVersion设置为"0"
  587. createErr = models.SetVersionCountAndLatestVersion(strconv.FormatInt(jobResult.JobID, 10), VersionTaskList[0].VersionName, VersionCountOne, NotLatestVersion, TotalVersionCount)
  588. if createErr != nil {
  589. ctx.ServerError("Update IsLatestVersion failed", createErr)
  590. return createErr
  591. }
  592. return createErr
  593. }
  594. func TransTrainJobStatus(status int) string {
  595. switch status {
  596. case 0:
  597. return "UNKNOWN"
  598. case 1:
  599. return "INIT"
  600. case 2:
  601. return "IMAGE_CREATING"
  602. case 3:
  603. return "IMAGE_FAILED"
  604. case 4:
  605. return "SUBMIT_TRYING"
  606. case 5:
  607. return "SUBMIT_FAILED"
  608. case 6:
  609. return "DELETE_FAILED"
  610. case 7:
  611. return "WAITING"
  612. case 8:
  613. return "RUNNING"
  614. case 9:
  615. return "KILLING"
  616. case 10:
  617. return "COMPLETED"
  618. case 11:
  619. return "FAILED"
  620. case 12:
  621. return "KILLED"
  622. case 13:
  623. return "CANCELED"
  624. case 14:
  625. return "LOST"
  626. case 15:
  627. return "SCALING"
  628. case 16:
  629. return "SUBMIT_MODEL_FAILED"
  630. case 17:
  631. return "DEPLOY_SERVICE_FAILED"
  632. case 18:
  633. return "CHECK_INIT"
  634. case 19:
  635. return "CHECK_RUNNING"
  636. case 20:
  637. return "CHECK_RUNNING_COMPLETED"
  638. case 21:
  639. return "CHECK_FAILED"
  640. default:
  641. return strconv.Itoa(status)
  642. }
  643. }
  644. func GetOutputPathByCount(TotalVersionCount int) (VersionOutputPath string) {
  645. talVersionCountToString := fmt.Sprintf("%04d", TotalVersionCount)
  646. VersionOutputPath = "V" + talVersionCountToString
  647. return VersionOutputPath
  648. }
  649. func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (jobId string, err error) {
  650. createTime := timeutil.TimeStampNow()
  651. var jobResult *models.CreateTrainJobResult
  652. var createErr error
  653. if req.EngineID < 0 {
  654. jobResult, createErr = createInferenceJobUserImage(models.CreateInfUserImageParams{
  655. JobName: req.JobName,
  656. Description: req.Description,
  657. Config: models.InfUserImageConfig{
  658. WorkServerNum: req.WorkServerNumber,
  659. AppUrl: req.CodeObsPath,
  660. BootFileUrl: req.BootFileUrl,
  661. DataUrl: req.DataUrl,
  662. // TrainUrl: req.TrainUrl,
  663. LogUrl: req.LogUrl,
  664. PoolID: req.PoolID,
  665. CreateVersion: true,
  666. Flavor: models.Flavor{
  667. Code: req.Spec.SourceSpecId,
  668. },
  669. Parameter: req.Parameters,
  670. UserImageUrl: req.UserImageUrl,
  671. UserCommand: req.UserCommand,
  672. },
  673. })
  674. } else {
  675. jobResult, createErr = createInferenceJob(models.CreateInferenceJobParams{
  676. JobName: req.JobName,
  677. Description: req.Description,
  678. InfConfig: models.InfConfig{
  679. WorkServerNum: req.WorkServerNumber,
  680. AppUrl: req.CodeObsPath,
  681. BootFileUrl: req.BootFileUrl,
  682. DataUrl: req.DataUrl,
  683. EngineID: req.EngineID,
  684. // TrainUrl: req.TrainUrl,
  685. LogUrl: req.LogUrl,
  686. PoolID: req.PoolID,
  687. CreateVersion: true,
  688. Flavor: models.Flavor{
  689. Code: req.Spec.SourceSpecId,
  690. },
  691. Parameter: req.Parameters,
  692. },
  693. })
  694. }
  695. if createErr != nil {
  696. log.Error("createInferenceJob failed: %v", err.Error())
  697. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  698. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  699. err = models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  700. JobID: models.TempJobId,
  701. VersionID: models.TempVersionId,
  702. Status: models.TempJobStatus,
  703. Type: models.TypeCloudBrainTwo,
  704. JobName: req.JobName,
  705. JobType: req.JobType,
  706. })
  707. if err != nil {
  708. log.Error("InsertCloudbrainTemp failed: %v", err.Error())
  709. return "", err
  710. }
  711. }
  712. return "", err
  713. }
  714. // attach, err := models.GetAttachmentByUUID(req.Uuid)
  715. // if err != nil {
  716. // log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error())
  717. // return err
  718. // }
  719. jobID := strconv.FormatInt(jobResult.JobID, 10)
  720. err = models.CreateCloudbrain(&models.Cloudbrain{
  721. Status: TransTrainJobStatus(jobResult.Status),
  722. UserID: ctx.User.ID,
  723. RepoID: ctx.Repo.Repository.ID,
  724. JobID: jobID,
  725. JobName: req.JobName,
  726. DisplayJobName: req.DisplayJobName,
  727. JobType: req.JobType,
  728. Type: models.TypeCloudBrainTwo,
  729. VersionID: jobResult.VersionID,
  730. VersionName: jobResult.VersionName,
  731. Uuid: req.Uuid,
  732. DatasetName: req.DatasetName,
  733. CommitID: req.CommitID,
  734. EngineID: req.EngineID,
  735. TrainUrl: req.TrainUrl,
  736. BranchName: req.BranchName,
  737. Parameters: req.Params,
  738. BootFile: req.BootFile,
  739. DataUrl: req.DataUrl,
  740. LogUrl: req.LogUrl,
  741. FlavorCode: req.Spec.SourceSpecId,
  742. Description: req.Description,
  743. WorkServerNumber: req.WorkServerNumber,
  744. FlavorName: req.FlavorName,
  745. EngineName: req.EngineName,
  746. LabelName: req.LabelName,
  747. IsLatestVersion: req.IsLatestVersion,
  748. ComputeResource: models.NPUResource,
  749. VersionCount: req.VersionCount,
  750. TotalVersionCount: req.TotalVersionCount,
  751. ModelName: req.ModelName,
  752. ModelVersion: req.ModelVersion,
  753. CkptName: req.CkptName,
  754. ResultUrl: req.ResultUrl,
  755. CreatedUnix: createTime,
  756. UpdatedUnix: createTime,
  757. Spec: req.Spec,
  758. })
  759. if err != nil {
  760. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
  761. return "", err
  762. }
  763. if req.JobType == string(models.JobTypeModelSafety) {
  764. task, err := models.GetCloudbrainByJobID(jobID)
  765. if err == nil {
  766. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, fmt.Sprint(task.ID), req.DisplayJobName, models.ActionCreateBenchMarkTask)
  767. }
  768. } else {
  769. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateInferenceTask)
  770. }
  771. return jobID, nil
  772. }
  773. func GetNotebookImageName(imageId string) (string, error) {
  774. var validImage = false
  775. var imageName = ""
  776. for _, imageInfo := range setting.StImageInfos.ImageInfo {
  777. if imageInfo.Id == imageId {
  778. validImage = true
  779. imageName = imageInfo.Value
  780. }
  781. }
  782. if !validImage {
  783. log.Error("the image id(%s) is invalid", imageId)
  784. return imageName, errors.New("the image id is invalid")
  785. }
  786. return imageName, nil
  787. }
  788. func InitSpecialPool() {
  789. if SpecialPools == nil && setting.ModelArtsSpecialPools != "" {
  790. json.Unmarshal([]byte(setting.ModelArtsSpecialPools), &SpecialPools)
  791. }
  792. }
  793. func InitMultiNode() {
  794. if MultiNodeConfig == nil && setting.ModelArtsMultiNode != "" {
  795. json.Unmarshal([]byte(setting.ModelArtsMultiNode), &MultiNodeConfig)
  796. }
  797. }
  798. func HandleTrainJobInfo(task *models.Cloudbrain) error {
  799. result, err := GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
  800. if err != nil {
  801. log.Error("GetTrainJob(%s) failed:%v", task.DisplayJobName, err)
  802. return err
  803. }
  804. if result != nil {
  805. oldStatus := task.Status
  806. task.Status = TransTrainJobStatus(result.IntStatus)
  807. task.Duration = result.Duration / 1000
  808. task.TrainJobDuration = result.TrainJobDuration
  809. if task.StartTime == 0 && result.StartTime > 0 {
  810. task.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
  811. }
  812. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  813. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  814. task.EndTime = task.StartTime.Add(task.Duration)
  815. }
  816. task.CorrectCreateUnix()
  817. if oldStatus != task.Status {
  818. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  819. }
  820. err = models.UpdateJob(task)
  821. if err != nil {
  822. log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
  823. return err
  824. }
  825. }
  826. return nil
  827. }
  828. func HandleNotebookInfo(task *models.Cloudbrain) error {
  829. var result *models.GetNotebook2Result
  830. var err error
  831. if task.Type == models.TypeCloudBrainTwo {
  832. result, err = GetNotebook2(task.JobID)
  833. } else if task.Type == models.TypeCDCenter {
  834. result, err = modelarts_cd.GetNotebook(task.JobID)
  835. }
  836. if err != nil {
  837. log.Error("GetNotebook2(%s) failed:%v", task.DisplayJobName, err)
  838. return err
  839. }
  840. if result != nil {
  841. oldStatus := task.Status
  842. task.Status = result.Status
  843. if task.StartTime == 0 && result.Lease.UpdateTime > 0 {
  844. task.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000)
  845. }
  846. if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) {
  847. task.EndTime = timeutil.TimeStampNow()
  848. }
  849. task.CorrectCreateUnix()
  850. task.ComputeAndSetDuration()
  851. if oldStatus != task.Status {
  852. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  853. }
  854. if task.FlavorCode == "" {
  855. task.FlavorCode = result.Flavor
  856. }
  857. if oldStatus != task.Status && task.Status == string(models.ModelArtsRunning) && task.BootFile != "" {
  858. uploadNoteBookFile(task, result)
  859. }
  860. err = models.UpdateJob(task)
  861. if err != nil {
  862. log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err)
  863. return err
  864. }
  865. }
  866. return nil
  867. }
  868. func uploadNoteBookFile(task *models.Cloudbrain, result *models.GetNotebook2Result) {
  869. jupyterUrl := result.Url + "?token=" + result.Token
  870. cookies, xsrf := getCookiesAndCsrf(jupyterUrl)
  871. if xsrf == "" {
  872. log.Error("browser jupyterUrl failed:%v", task.DisplayJobName)
  873. } else {
  874. codePath := setting.JobPath + task.JobName + cloudbrain.CodeMountPath
  875. fileContents, err := ioutil.ReadFile(codePath + "/" + task.BootFile)
  876. if err != nil {
  877. log.Error("read jupyter file failed:%v", task.DisplayJobName, err)
  878. }
  879. base64Content := base64.StdEncoding.EncodeToString(fileContents)
  880. client := getRestyClient()
  881. uploadUrl := getJupyterBaseUrl(result.Url) + "api/contents/" + path.Base(task.BootFile)
  882. res, err := client.R().
  883. SetCookies(cookies).
  884. SetHeader("X-XSRFToken", xsrf).
  885. SetBody(map[string]interface{}{
  886. "type": "file",
  887. "format": "base64",
  888. "name": path.Base(task.BootFile),
  889. "path": path.Base(task.BootFile),
  890. "content": base64Content}).
  891. Put(uploadUrl)
  892. if err != nil {
  893. log.Error("upload jupyter file failed:%v", task.DisplayJobName, err)
  894. } else if res.StatusCode() != http.StatusCreated {
  895. log.Error("upload jupyter file failed:%v", task.DisplayJobName, err)
  896. }
  897. }
  898. }
  // getJupyterBaseUrl returns url with its last path element removed, keeping
  // the slash that precedes that element, e.g. "https://host/a/lab" becomes
  // "https://host/a/".
  //
  // Trailing slashes are dropped before the last element is located, because
  // path.Base ignores them and the length arithmetic below would otherwise cut
  // the string in the wrong place. An empty url (or one made only of slashes)
  // yields "".
  func getJupyterBaseUrl(url string) string {
  	for len(url) > 0 && url[len(url)-1] == '/' {
  		url = url[:len(url)-1]
  	}
  	if url == "" {
  		// path.Base("") is ".", which is longer than the input.
  		return ""
  	}
  	return url[:len(url)-len(path.Base(url))]
  }
  904. func getCookiesAndCsrf(jupyterUrl string) ([]*http.Cookie, string) {
  905. log.Info("jupyter url:"+jupyterUrl)
  906. var cookies []*http.Cookie
  907. const retryTimes = 10
  908. for i := 0; i < retryTimes; i++ {
  909. res, err := http.Get(jupyterUrl)
  910. if err != nil {
  911. log.Error("browser jupyterUrl failed.",err)
  912. if i==retryTimes-1{
  913. return cookies, ""
  914. }
  915. } else {
  916. cookies = res.Cookies()
  917. xsrf := ""
  918. for _, cookie := range cookies {
  919. if cookie.Name == "_xsrf" {
  920. xsrf = cookie.Value
  921. break
  922. }
  923. }
  924. if xsrf != "" {
  925. return cookies, xsrf
  926. }
  927. }
  928. }
  929. return cookies, ""
  930. }
  931. func SyncTempStatusJob() {
  932. jobs, err := models.GetCloudBrainTempJobs()
  933. if err != nil {
  934. log.Error("GetCloudBrainTempJobs failed:%v", err.Error())
  935. return
  936. }
  937. for _, temp := range jobs {
  938. log.Info("start to handle record: %s", temp.JobName)
  939. if temp.Type == models.TypeCloudBrainTwo {
  940. if temp.JobType == string(models.JobTypeDebug) {
  941. err = handleNotebook(temp)
  942. if err != nil {
  943. log.Error("handleNotebook falied:%v", err)
  944. break
  945. }
  946. } else if temp.JobType == string(models.JobTypeTrain) || temp.JobType == string(models.JobTypeInference) {
  947. _, err = models.GetCloudbrainByJobID(temp.JobID)
  948. if err != nil {
  949. //one version
  950. err = handleTrainJob(temp)
  951. if err != nil {
  952. log.Error("handleTrainJob falied:%v", err)
  953. break
  954. }
  955. } else {
  956. //multi version
  957. err = handleTrainJobMultiVersion(temp)
  958. if err != nil {
  959. log.Error("handleTrainJobMultiVersion falied:%v", err)
  960. break
  961. }
  962. }
  963. }
  964. }
  965. }
  966. return
  967. }
  968. func handleNotebook(temp *models.CloudbrainTemp) error {
  969. if temp.Status == models.TempJobStatus {
  970. err := handleTempNotebook(temp)
  971. if err != nil {
  972. log.Error("handleTempNotebook failed:%v", err)
  973. return err
  974. }
  975. } else if temp.Status == string(models.ModelArtsStopping) {
  976. res, err := GetNotebook2(temp.JobID)
  977. if err != nil {
  978. log.Error("GetNotebook2 failed:%v", err)
  979. return err
  980. }
  981. temp.Status = res.Status
  982. if temp.Status == string(models.ModelArtsStopped) {
  983. err = models.UpdateCloudbrainTemp(temp)
  984. if err != nil {
  985. log.Error("UpdateCloudbrainTemp failed:%v", err)
  986. return err
  987. }
  988. _, err := DelNotebook2(temp.JobID)
  989. if err != nil {
  990. log.Error("DelNotebook2 failed:%v", err)
  991. return err
  992. }
  993. temp.Status = string(models.ModelArtsDeleted)
  994. err = models.UpdateCloudbrainTemp(temp)
  995. if err != nil {
  996. log.Error("UpdateCloudbrainTemp failed:%v", err)
  997. return err
  998. }
  999. }
  1000. }
  1001. return nil
  1002. }
  1003. func handleTempNotebook(temp *models.CloudbrainTemp) error {
  1004. var err error
  1005. var isExist bool
  1006. for {
  1007. result, err := GetNotebookList(1000, 0, "createTime", "DESC", temp.JobName)
  1008. if err != nil {
  1009. log.Error("GetNotebookList failed:%v", err)
  1010. break
  1011. }
  1012. temp.QueryTimes++
  1013. err = models.UpdateCloudbrainTemp(temp)
  1014. if err != nil {
  1015. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1016. }
  1017. if result != nil {
  1018. for _, notebook := range result.NotebookList {
  1019. if temp.JobID == models.TempJobId {
  1020. //new notebook
  1021. if notebook.JobName == temp.JobName {
  1022. isExist = true
  1023. temp.Status = notebook.Status
  1024. temp.JobID = notebook.JobID
  1025. break
  1026. }
  1027. } else {
  1028. //restart: always can find one record
  1029. if notebook.JobName == temp.JobName {
  1030. if notebook.Status != string(models.ModelArtsStopped) {
  1031. isExist = true
  1032. temp.Status = notebook.Status
  1033. temp.JobID = notebook.JobID
  1034. break
  1035. }
  1036. }
  1037. }
  1038. }
  1039. if isExist {
  1040. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  1041. if temp.Status == string(models.ModelArtsCreateFailed) {
  1042. err = models.UpdateCloudbrainTemp(temp)
  1043. if err != nil {
  1044. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1045. break
  1046. }
  1047. _, err := DelNotebook2(temp.JobID)
  1048. if err != nil {
  1049. log.Error("DelNotebook2(%s) failed:%v", temp.JobName, err)
  1050. break
  1051. }
  1052. temp.Status = string(models.ModelArtsDeleted)
  1053. } else {
  1054. _, err := ManageNotebook2(temp.JobID, models.NotebookAction{Action: models.ActionStop})
  1055. if err != nil {
  1056. log.Error("ManageNotebook2(%s) failed:%v", temp.JobName, err)
  1057. break
  1058. }
  1059. temp.Status = string(models.ModelArtsStopping)
  1060. }
  1061. models.UpdateCloudbrainTemp(temp)
  1062. } else {
  1063. log.Error("can not find the record(%s) till now", temp.JobName)
  1064. err = errors.New("not found")
  1065. break
  1066. }
  1067. } else {
  1068. log.Error("can not find the record(%s) till now", temp.JobName)
  1069. err = errors.New("not found")
  1070. break
  1071. }
  1072. break
  1073. }
  1074. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1075. log.Info("reach MaxTempQueryTimes, set the job failed")
  1076. temp.Status = string(models.ModelArtsTrainJobFailed)
  1077. err = models.UpdateCloudbrainTemp(temp)
  1078. if err != nil {
  1079. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1080. return err
  1081. }
  1082. }
  1083. return err
  1084. }
  1085. func handleTrainJob(temp *models.CloudbrainTemp) error {
  1086. if temp.Status == models.TempJobStatus {
  1087. err := handleTempTrainJob(temp)
  1088. if err != nil {
  1089. log.Error("handleTempTrainJob failed:%v", err)
  1090. return err
  1091. }
  1092. } else if temp.Status == string(models.ModelArtsTrainJobKilling) {
  1093. res, err := GetTrainJob(temp.JobID, temp.VersionID)
  1094. if err != nil {
  1095. log.Error("GetTrainJob failed:%v", err)
  1096. return err
  1097. }
  1098. temp.Status = TransTrainJobStatus(res.IntStatus)
  1099. if temp.Status == string(models.ModelArtsTrainJobKilled) {
  1100. err = models.UpdateCloudbrainTemp(temp)
  1101. if err != nil {
  1102. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1103. return err
  1104. }
  1105. _, err := DelTrainJob(temp.JobID)
  1106. if err != nil {
  1107. log.Error("DelTrainJob failed:%v", err)
  1108. return err
  1109. }
  1110. temp.Status = string(models.ModelArtsDeleted)
  1111. err = models.UpdateCloudbrainTemp(temp)
  1112. if err != nil {
  1113. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1114. return err
  1115. }
  1116. }
  1117. }
  1118. return nil
  1119. }
  1120. func handleTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
  1121. if temp.Status == models.TempJobStatus {
  1122. err := handleTempTrainJobMultiVersion(temp)
  1123. if err != nil {
  1124. log.Error("handleTempTrainJobMultiVersion failed:%v", err)
  1125. return err
  1126. }
  1127. } else if temp.Status == string(models.ModelArtsTrainJobKilling) {
  1128. res, err := GetTrainJob(temp.JobID, temp.VersionID)
  1129. if err != nil {
  1130. log.Error("GetTrainJob failed:%v", err)
  1131. return err
  1132. }
  1133. temp.Status = TransTrainJobStatus(res.IntStatus)
  1134. if temp.Status == string(models.ModelArtsTrainJobKilled) {
  1135. err = models.UpdateCloudbrainTemp(temp)
  1136. if err != nil {
  1137. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1138. return err
  1139. }
  1140. _, err := DelTrainJobVersion(temp.JobID, temp.VersionID)
  1141. if err != nil {
  1142. log.Error("DelTrainJob failed:%v", err)
  1143. return err
  1144. }
  1145. temp.Status = string(models.ModelArtsDeleted)
  1146. err = models.UpdateCloudbrainTemp(temp)
  1147. if err != nil {
  1148. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1149. return err
  1150. }
  1151. }
  1152. }
  1153. return nil
  1154. }
  1155. func handleTempTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
  1156. var err error
  1157. var isExist bool
  1158. for {
  1159. result, err := GetTrainJobVersionList(1000, 1, temp.JobID)
  1160. if err != nil {
  1161. log.Error("GetTrainJobVersionList failed:%v", err)
  1162. break
  1163. }
  1164. temp.QueryTimes++
  1165. err = models.UpdateCloudbrainTemp(temp)
  1166. if err != nil {
  1167. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1168. }
  1169. if result != nil {
  1170. count, _ := models.GetCloudbrainCountByJobName(temp.JobName, temp.JobType, temp.Type)
  1171. if result.VersionCount == int64(count+1) {
  1172. isExist = true
  1173. temp.Status = TransTrainJobStatus(result.JobVersionList[0].IntStatus)
  1174. temp.VersionID = strconv.FormatInt(result.JobVersionList[0].VersionID, 10)
  1175. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  1176. _, err := StopTrainJob(temp.JobID, temp.VersionID)
  1177. if err != nil {
  1178. log.Error("StopTrainJob failed:%v", err)
  1179. break
  1180. }
  1181. temp.Status = string(models.ModelArtsTrainJobKilling)
  1182. err = models.UpdateCloudbrainTemp(temp)
  1183. if err != nil {
  1184. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1185. break
  1186. }
  1187. } else {
  1188. log.Error("can not find the record(%s) till now", temp.JobName)
  1189. err = errors.New("not found")
  1190. break
  1191. }
  1192. }
  1193. break
  1194. }
  1195. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1196. log.Info("reach MaxTempQueryTimes, set the job failed")
  1197. temp.Status = string(models.ModelArtsTrainJobFailed)
  1198. err = models.UpdateCloudbrainTemp(temp)
  1199. if err != nil {
  1200. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1201. return err
  1202. }
  1203. }
  1204. return err
  1205. }
  1206. func handleTempTrainJob(temp *models.CloudbrainTemp) error {
  1207. var err error
  1208. var isExist bool
  1209. for {
  1210. result, err := GetTrainJobList(1000, 1, "create_time", "desc", temp.JobName)
  1211. if err != nil {
  1212. log.Error("GetTrainJobList failed:%v", err)
  1213. break
  1214. }
  1215. temp.QueryTimes++
  1216. err = models.UpdateCloudbrainTemp(temp)
  1217. if err != nil {
  1218. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1219. }
  1220. if result != nil {
  1221. for _, job := range result.JobList {
  1222. if temp.JobName == job.JobName && TransTrainJobStatus(job.IntStatus) != string(models.ModelArtsTrainJobFailed) {
  1223. isExist = true
  1224. temp.Status = TransTrainJobStatus(job.IntStatus)
  1225. temp.JobID = strconv.FormatInt(job.JobID, 10)
  1226. temp.VersionID = strconv.FormatInt(job.VersionID, 10)
  1227. log.Info("find the record(%s), status(%s)", temp.JobName, temp.Status)
  1228. _, err = StopTrainJob(temp.JobID, temp.VersionID)
  1229. if err != nil {
  1230. log.Error("StopTrainJob(%s) failed:%v", temp.JobName, err)
  1231. break
  1232. }
  1233. temp.Status = string(models.ModelArtsTrainJobKilling)
  1234. err = models.UpdateCloudbrainTemp(temp)
  1235. if err != nil {
  1236. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1237. break
  1238. }
  1239. }
  1240. }
  1241. if !isExist {
  1242. log.Error("can not find the record(%s) till now", temp.JobName)
  1243. err = errors.New("not found")
  1244. break
  1245. }
  1246. }
  1247. break
  1248. }
  1249. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1250. log.Info("reach MaxTempQueryTimes, set the job failed")
  1251. temp.Status = string(models.ModelArtsTrainJobFailed)
  1252. err = models.UpdateCloudbrainTemp(temp)
  1253. if err != nil {
  1254. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1255. return err
  1256. }
  1257. }
  1258. return err
  1259. }