You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

modelarts.go 34 kB

4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago

  1. package modelarts
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "fmt"
  6. "path"
  7. "strconv"
  8. "strings"
  9. "code.gitea.io/gitea/models"
  10. "code.gitea.io/gitea/modules/context"
  11. "code.gitea.io/gitea/modules/log"
  12. "code.gitea.io/gitea/modules/notification"
  13. "code.gitea.io/gitea/modules/setting"
  14. "code.gitea.io/gitea/modules/storage"
  15. "code.gitea.io/gitea/modules/timeutil"
  16. )
  17. const (
  18. //notebook
  19. storageTypeOBS = "obs"
  20. autoStopDuration = 4 * 60 * 60
  21. autoStopDurationMs = 4 * 60 * 60 * 1000
  22. MORDELART_USER_IMAGE_ENGINE_ID = -1
  23. DataSetMountPath = "/home/ma-user/work"
  24. NotebookEnv = "Python3"
  25. NotebookType = "Ascend"
  26. FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)"
  27. //train-job
  28. // ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}"
  29. // Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}"
  30. // EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," +
  31. // "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," +
  32. // "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," +
  33. // "{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" +
  34. // "]}"
  35. // TrainJobFlavorInfo = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," +
  36. // "{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," +
  37. // "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," +
  38. // "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" +
  39. // "]}"
  40. CodePath = "/code/"
  41. OutputPath = "/output/"
  42. ResultPath = "/result/"
  43. LogPath = "/log/"
  44. JobPath = "/job/"
  45. OrderDesc = "desc" //向下查询
  46. OrderAsc = "asc" //向上查询
  47. Lines = 500
  48. TrainUrl = "train_url"
  49. DataUrl = "data_url"
  50. MultiDataUrl = "multi_data_url"
  51. ResultUrl = "result_url"
  52. CkptUrl = "ckpt_url"
  53. DeviceTarget = "device_target"
  54. Ascend = "Ascend"
  55. PerPage = 10
  56. IsLatestVersion = "1"
  57. NotLatestVersion = "0"
  58. VersionCountOne = 1
  59. SortByCreateTime = "create_time"
  60. ConfigTypeCustom = "custom"
  61. TotalVersionCount = 1
  62. )
  63. var (
  64. poolInfos *models.PoolInfos
  65. FlavorInfos *models.FlavorInfos
  66. ImageInfos *models.ImageInfosModelArts
  67. TrainFlavorInfos *Flavor
  68. SpecialPools *models.SpecialPools
  69. )
  70. type GenerateTrainJobReq struct {
  71. JobName string
  72. DisplayJobName string
  73. Uuid string
  74. Description string
  75. CodeObsPath string
  76. BootFile string
  77. BootFileUrl string
  78. DataUrl string
  79. TrainUrl string
  80. FlavorCode string
  81. LogUrl string
  82. PoolID string
  83. WorkServerNumber int
  84. EngineID int64
  85. Parameters []models.Parameter
  86. CommitID string
  87. IsLatestVersion string
  88. Params string
  89. BranchName string
  90. PreVersionId int64
  91. PreVersionName string
  92. FlavorName string
  93. VersionCount int
  94. EngineName string
  95. TotalVersionCount int
  96. UserImageUrl string
  97. UserCommand string
  98. DatasetName string
  99. }
  100. type GenerateInferenceJobReq struct {
  101. JobName string
  102. DisplayJobName string
  103. Uuid string
  104. Description string
  105. CodeObsPath string
  106. BootFile string
  107. BootFileUrl string
  108. DataUrl string
  109. TrainUrl string
  110. FlavorCode string
  111. LogUrl string
  112. PoolID string
  113. WorkServerNumber int
  114. EngineID int64
  115. Parameters []models.Parameter
  116. CommitID string
  117. Params string
  118. BranchName string
  119. FlavorName string
  120. EngineName string
  121. LabelName string
  122. IsLatestVersion string
  123. VersionCount int
  124. TotalVersionCount int
  125. ModelName string
  126. ModelVersion string
  127. CkptName string
  128. ResultUrl string
  129. }
  130. type VersionInfo struct {
  131. Version []struct {
  132. ID int `json:"id"`
  133. Value string `json:"value"`
  134. Url string `json:"url"`
  135. } `json:"version"`
  136. }
  137. type Flavor struct {
  138. Info []struct {
  139. Code string `json:"code"`
  140. Value string `json:"value"`
  141. } `json:"flavor"`
  142. }
  143. type Engine struct {
  144. Info []struct {
  145. ID int `json:"id"`
  146. Value string `json:"value"`
  147. } `json:"engine"`
  148. }
  149. type ResourcePool struct {
  150. Info []struct {
  151. ID string `json:"id"`
  152. Value string `json:"value"`
  153. } `json:"resource_pool"`
  154. }
  155. // type Parameter struct {
  156. // Label string `json:"label"`
  157. // Value string `json:"value"`
  158. // }
  159. // type Parameters struct {
  160. // Parameter []Parameter `json:"parameter"`
  161. // }
  162. type Parameters struct {
  163. Parameter []struct {
  164. Label string `json:"label"`
  165. Value string `json:"value"`
  166. } `json:"parameter"`
  167. }
  168. func GenerateTask(ctx *context.Context, jobName, uuid, description, flavor string) error {
  169. var dataActualPath string
  170. if uuid != "" {
  171. dataActualPath = setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/"
  172. } else {
  173. userPath := setting.UserBasePath + ctx.User.Name + "/"
  174. isExist, err := storage.ObsHasObject(userPath)
  175. if err != nil {
  176. log.Error("ObsHasObject failed:%v", err.Error(), ctx.Data["MsgID"])
  177. return err
  178. }
  179. if !isExist {
  180. if err = storage.ObsCreateObject(userPath); err != nil {
  181. log.Error("ObsCreateObject failed:%v", err.Error(), ctx.Data["MsgID"])
  182. return err
  183. }
  184. }
  185. dataActualPath = setting.Bucket + "/" + userPath
  186. }
  187. if poolInfos == nil {
  188. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  189. }
  190. createTime := timeutil.TimeStampNow()
  191. jobResult, err := CreateJob(models.CreateNotebookParams{
  192. JobName: jobName,
  193. Description: description,
  194. ProfileID: setting.ProfileID,
  195. Flavor: flavor,
  196. Pool: models.Pool{
  197. ID: poolInfos.PoolInfo[0].PoolId,
  198. Name: poolInfos.PoolInfo[0].PoolName,
  199. Type: poolInfos.PoolInfo[0].PoolType,
  200. },
  201. Spec: models.Spec{
  202. Storage: models.Storage{
  203. Type: storageTypeOBS,
  204. Location: models.Location{
  205. Path: dataActualPath,
  206. },
  207. },
  208. AutoStop: models.AutoStop{
  209. Enable: true,
  210. Duration: autoStopDuration,
  211. },
  212. },
  213. })
  214. if err != nil {
  215. log.Error("CreateJob failed: %v", err.Error())
  216. return err
  217. }
  218. err = models.CreateCloudbrain(&models.Cloudbrain{
  219. Status: string(models.JobWaiting),
  220. UserID: ctx.User.ID,
  221. RepoID: ctx.Repo.Repository.ID,
  222. JobID: jobResult.ID,
  223. JobName: jobName,
  224. JobType: string(models.JobTypeDebug),
  225. Type: models.TypeCloudBrainTwo,
  226. Uuid: uuid,
  227. ComputeResource: models.NPUResource,
  228. CreatedUnix: createTime,
  229. UpdatedUnix: createTime,
  230. })
  231. if err != nil {
  232. return err
  233. }
  234. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobResult.ID, jobName, models.ActionCreateDebugNPUTask)
  235. return nil
  236. }
  237. func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, description, flavor, imageId string) error {
  238. if poolInfos == nil {
  239. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  240. }
  241. imageName, err := GetNotebookImageName(imageId)
  242. if err != nil {
  243. log.Error("GetNotebookImageName failed: %v", err.Error())
  244. return err
  245. }
  246. createTime := timeutil.TimeStampNow()
  247. jobResult, err := createNotebook2(models.CreateNotebook2Params{
  248. JobName: jobName,
  249. Description: description,
  250. Flavor: flavor,
  251. Duration: autoStopDurationMs,
  252. ImageID: imageId,
  253. PoolID: poolInfos.PoolInfo[0].PoolId,
  254. Feature: models.NotebookFeature,
  255. Volume: models.VolumeReq{
  256. Capacity: setting.Capacity,
  257. Category: models.EVSCategory,
  258. Ownership: models.ManagedOwnership,
  259. },
  260. WorkspaceID: "0",
  261. })
  262. if err != nil {
  263. log.Error("createNotebook2 failed: %v", err.Error())
  264. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  265. log.Info("(%s)unknown error, set temp status", displayJobName)
  266. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  267. JobID: models.TempJobId,
  268. VersionID: models.TempVersionId,
  269. Status: models.TempJobStatus,
  270. Type: models.TypeCloudBrainTwo,
  271. JobName: jobName,
  272. JobType: string(models.JobTypeDebug),
  273. })
  274. if errTemp != nil {
  275. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  276. return errTemp
  277. }
  278. }
  279. return err
  280. }
  281. task := &models.Cloudbrain{
  282. Status: jobResult.Status,
  283. UserID: ctx.User.ID,
  284. RepoID: ctx.Repo.Repository.ID,
  285. JobID: jobResult.ID,
  286. JobName: jobName,
  287. FlavorCode: flavor,
  288. DisplayJobName: displayJobName,
  289. JobType: string(models.JobTypeDebug),
  290. Type: models.TypeCloudBrainTwo,
  291. Uuid: uuid,
  292. ComputeResource: models.NPUResource,
  293. Image: imageName,
  294. Description: description,
  295. CreatedUnix: createTime,
  296. UpdatedUnix: createTime,
  297. }
  298. err = models.CreateCloudbrain(task)
  299. if err != nil {
  300. return err
  301. }
  302. stringId := strconv.FormatInt(task.ID, 10)
  303. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateDebugNPUTask)
  304. return nil
  305. }
  306. func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) {
  307. createTime := timeutil.TimeStampNow()
  308. var jobResult *models.CreateTrainJobResult
  309. var createErr error
  310. if req.EngineID < 0 {
  311. jobResult, createErr = createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  312. JobName: req.JobName,
  313. Description: req.Description,
  314. Config: models.UserImageConfig{
  315. WorkServerNum: req.WorkServerNumber,
  316. AppUrl: req.CodeObsPath,
  317. BootFileUrl: req.BootFileUrl,
  318. DataUrl: req.DataUrl,
  319. TrainUrl: req.TrainUrl,
  320. LogUrl: req.LogUrl,
  321. PoolID: req.PoolID,
  322. CreateVersion: true,
  323. Flavor: models.Flavor{
  324. Code: req.FlavorCode,
  325. },
  326. Parameter: req.Parameters,
  327. UserImageUrl: req.UserImageUrl,
  328. UserCommand: req.UserCommand,
  329. },
  330. })
  331. } else {
  332. jobResult, createErr = createTrainJob(models.CreateTrainJobParams{
  333. JobName: req.JobName,
  334. Description: req.Description,
  335. Config: models.Config{
  336. WorkServerNum: req.WorkServerNumber,
  337. AppUrl: req.CodeObsPath,
  338. BootFileUrl: req.BootFileUrl,
  339. DataUrl: req.DataUrl,
  340. EngineID: req.EngineID,
  341. TrainUrl: req.TrainUrl,
  342. LogUrl: req.LogUrl,
  343. PoolID: req.PoolID,
  344. CreateVersion: true,
  345. Flavor: models.Flavor{
  346. Code: req.FlavorCode,
  347. },
  348. Parameter: req.Parameters,
  349. },
  350. })
  351. }
  352. if createErr != nil {
  353. log.Error("createTrainJob failed: %v", createErr.Error())
  354. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  355. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  356. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  357. JobID: models.TempJobId,
  358. VersionID: models.TempVersionId,
  359. Status: models.TempJobStatus,
  360. Type: models.TypeCloudBrainTwo,
  361. JobName: req.JobName,
  362. JobType: string(models.JobTypeTrain),
  363. })
  364. if errTemp != nil {
  365. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  366. return errTemp
  367. }
  368. }
  369. return createErr
  370. }
  371. jobId := strconv.FormatInt(jobResult.JobID, 10)
  372. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  373. Status: TransTrainJobStatus(jobResult.Status),
  374. UserID: ctx.User.ID,
  375. RepoID: ctx.Repo.Repository.ID,
  376. JobID: jobId,
  377. JobName: req.JobName,
  378. DisplayJobName: req.DisplayJobName,
  379. JobType: string(models.JobTypeTrain),
  380. Type: models.TypeCloudBrainTwo,
  381. VersionID: jobResult.VersionID,
  382. VersionName: jobResult.VersionName,
  383. Uuid: req.Uuid,
  384. DatasetName: req.DatasetName,
  385. CommitID: req.CommitID,
  386. IsLatestVersion: req.IsLatestVersion,
  387. ComputeResource: models.NPUResource,
  388. EngineID: req.EngineID,
  389. TrainUrl: req.TrainUrl,
  390. BranchName: req.BranchName,
  391. Parameters: req.Params,
  392. BootFile: req.BootFile,
  393. DataUrl: req.DataUrl,
  394. LogUrl: req.LogUrl,
  395. FlavorCode: req.FlavorCode,
  396. Description: req.Description,
  397. WorkServerNumber: req.WorkServerNumber,
  398. FlavorName: req.FlavorName,
  399. EngineName: req.EngineName,
  400. VersionCount: req.VersionCount,
  401. TotalVersionCount: req.TotalVersionCount,
  402. CreatedUnix: createTime,
  403. UpdatedUnix: createTime,
  404. })
  405. if createErr != nil {
  406. log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, createErr.Error())
  407. return createErr
  408. }
  409. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobId, req.DisplayJobName, models.ActionCreateTrainTask)
  410. return nil
  411. }
  412. func GenerateModelConvertTrainJob(req *GenerateTrainJobReq) (*models.CreateTrainJobResult, error) {
  413. return createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  414. JobName: req.JobName,
  415. Description: req.Description,
  416. Config: models.UserImageConfig{
  417. WorkServerNum: req.WorkServerNumber,
  418. AppUrl: req.CodeObsPath,
  419. BootFileUrl: req.BootFileUrl,
  420. DataUrl: req.DataUrl,
  421. TrainUrl: req.TrainUrl,
  422. LogUrl: req.LogUrl,
  423. PoolID: req.PoolID,
  424. CreateVersion: true,
  425. Flavor: models.Flavor{
  426. Code: req.FlavorCode,
  427. },
  428. Parameter: req.Parameters,
  429. UserImageUrl: req.UserImageUrl,
  430. UserCommand: req.UserCommand,
  431. },
  432. })
  433. }
  434. func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) {
  435. createTime := timeutil.TimeStampNow()
  436. var jobResult *models.CreateTrainJobResult
  437. var createErr error
  438. if req.EngineID < 0 {
  439. jobResult, createErr = createTrainJobVersionUserImage(models.CreateTrainJobVersionUserImageParams{
  440. Description: req.Description,
  441. Config: models.TrainJobVersionUserImageConfig{
  442. WorkServerNum: req.WorkServerNumber,
  443. AppUrl: req.CodeObsPath,
  444. BootFileUrl: req.BootFileUrl,
  445. DataUrl: req.DataUrl,
  446. TrainUrl: req.TrainUrl,
  447. LogUrl: req.LogUrl,
  448. PoolID: req.PoolID,
  449. Flavor: models.Flavor{
  450. Code: req.FlavorCode,
  451. },
  452. Parameter: req.Parameters,
  453. PreVersionId: req.PreVersionId,
  454. UserImageUrl: req.UserImageUrl,
  455. UserCommand: req.UserCommand,
  456. },
  457. }, jobId)
  458. } else {
  459. jobResult, createErr = createTrainJobVersion(models.CreateTrainJobVersionParams{
  460. Description: req.Description,
  461. Config: models.TrainJobVersionConfig{
  462. WorkServerNum: req.WorkServerNumber,
  463. AppUrl: req.CodeObsPath,
  464. BootFileUrl: req.BootFileUrl,
  465. DataUrl: req.DataUrl,
  466. EngineID: req.EngineID,
  467. TrainUrl: req.TrainUrl,
  468. LogUrl: req.LogUrl,
  469. PoolID: req.PoolID,
  470. Flavor: models.Flavor{
  471. Code: req.FlavorCode,
  472. },
  473. Parameter: req.Parameters,
  474. PreVersionId: req.PreVersionId,
  475. },
  476. }, jobId)
  477. }
  478. if createErr != nil {
  479. log.Error("createTrainJobVersion failed: %v", createErr.Error())
  480. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  481. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  482. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  483. JobID: jobId,
  484. VersionID: models.TempVersionId,
  485. Status: models.TempJobStatus,
  486. Type: models.TypeCloudBrainTwo,
  487. JobName: req.JobName,
  488. JobType: string(models.JobTypeTrain),
  489. })
  490. if errTemp != nil {
  491. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  492. return errTemp
  493. }
  494. }
  495. return createErr
  496. }
  497. var jobTypes []string
  498. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  499. repo := ctx.Repo.Repository
  500. VersionTaskList, VersionListCount, createErr := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  501. RepoID: repo.ID,
  502. Type: models.TypeCloudBrainTwo,
  503. JobTypes: jobTypes,
  504. JobID: strconv.FormatInt(jobResult.JobID, 10),
  505. })
  506. if createErr != nil {
  507. ctx.ServerError("Cloudbrain", createErr)
  508. return createErr
  509. }
  510. //将当前版本的isLatestVersion设置为"1"和任务数量更新,任务数量包括当前版本数VersionCount和历史创建的总版本数TotalVersionCount
  511. createErr = models.CreateCloudbrain(&models.Cloudbrain{
  512. Status: TransTrainJobStatus(jobResult.Status),
  513. UserID: ctx.User.ID,
  514. RepoID: ctx.Repo.Repository.ID,
  515. JobID: strconv.FormatInt(jobResult.JobID, 10),
  516. JobName: req.JobName,
  517. DisplayJobName: req.DisplayJobName,
  518. JobType: string(models.JobTypeTrain),
  519. Type: models.TypeCloudBrainTwo,
  520. VersionID: jobResult.VersionID,
  521. VersionName: jobResult.VersionName,
  522. Uuid: req.Uuid,
  523. DatasetName: req.DatasetName,
  524. CommitID: req.CommitID,
  525. IsLatestVersion: req.IsLatestVersion,
  526. PreVersionName: req.PreVersionName,
  527. ComputeResource: models.NPUResource,
  528. EngineID: req.EngineID,
  529. TrainUrl: req.TrainUrl,
  530. BranchName: req.BranchName,
  531. Parameters: req.Params,
  532. BootFile: req.BootFile,
  533. DataUrl: req.DataUrl,
  534. LogUrl: req.LogUrl,
  535. PreVersionId: req.PreVersionId,
  536. FlavorCode: req.FlavorCode,
  537. Description: req.Description,
  538. WorkServerNumber: req.WorkServerNumber,
  539. FlavorName: req.FlavorName,
  540. EngineName: req.EngineName,
  541. TotalVersionCount: VersionTaskList[0].TotalVersionCount + 1,
  542. VersionCount: VersionListCount + 1,
  543. CreatedUnix: createTime,
  544. UpdatedUnix: createTime,
  545. })
  546. if createErr != nil {
  547. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, createErr.Error())
  548. return createErr
  549. }
  550. //将训练任务的上一版本的isLatestVersion设置为"0"
  551. createErr = models.SetVersionCountAndLatestVersion(strconv.FormatInt(jobResult.JobID, 10), VersionTaskList[0].VersionName, VersionCountOne, NotLatestVersion, TotalVersionCount)
  552. if createErr != nil {
  553. ctx.ServerError("Update IsLatestVersion failed", createErr)
  554. return createErr
  555. }
  556. return createErr
  557. }
  558. func TransTrainJobStatus(status int) string {
  559. switch status {
  560. case 0:
  561. return "UNKNOWN"
  562. case 1:
  563. return "INIT"
  564. case 2:
  565. return "IMAGE_CREATING"
  566. case 3:
  567. return "IMAGE_FAILED"
  568. case 4:
  569. return "SUBMIT_TRYING"
  570. case 5:
  571. return "SUBMIT_FAILED"
  572. case 6:
  573. return "DELETE_FAILED"
  574. case 7:
  575. return "WAITING"
  576. case 8:
  577. return "RUNNING"
  578. case 9:
  579. return "KILLING"
  580. case 10:
  581. return "COMPLETED"
  582. case 11:
  583. return "FAILED"
  584. case 12:
  585. return "KILLED"
  586. case 13:
  587. return "CANCELED"
  588. case 14:
  589. return "LOST"
  590. case 15:
  591. return "SCALING"
  592. case 16:
  593. return "SUBMIT_MODEL_FAILED"
  594. case 17:
  595. return "DEPLOY_SERVICE_FAILED"
  596. case 18:
  597. return "CHECK_INIT"
  598. case 19:
  599. return "CHECK_RUNNING"
  600. case 20:
  601. return "CHECK_RUNNING_COMPLETED"
  602. case 21:
  603. return "CHECK_FAILED"
  604. default:
  605. return strconv.Itoa(status)
  606. }
  607. }
  608. func GetOutputPathByCount(TotalVersionCount int) (VersionOutputPath string) {
  609. talVersionCountToString := fmt.Sprintf("%04d", TotalVersionCount)
  610. VersionOutputPath = "V" + talVersionCountToString
  611. return VersionOutputPath
  612. }
  613. func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (err error) {
  614. createTime := timeutil.TimeStampNow()
  615. jobResult, err := createInferenceJob(models.CreateInferenceJobParams{
  616. JobName: req.JobName,
  617. Description: req.Description,
  618. InfConfig: models.InfConfig{
  619. WorkServerNum: req.WorkServerNumber,
  620. AppUrl: req.CodeObsPath,
  621. BootFileUrl: req.BootFileUrl,
  622. DataUrl: req.DataUrl,
  623. EngineID: req.EngineID,
  624. // TrainUrl: req.TrainUrl,
  625. LogUrl: req.LogUrl,
  626. PoolID: req.PoolID,
  627. CreateVersion: true,
  628. Flavor: models.Flavor{
  629. Code: req.FlavorCode,
  630. },
  631. Parameter: req.Parameters,
  632. },
  633. })
  634. if err != nil {
  635. log.Error("createInferenceJob failed: %v", err.Error())
  636. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  637. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  638. err = models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  639. JobID: models.TempJobId,
  640. VersionID: models.TempVersionId,
  641. Status: models.TempJobStatus,
  642. Type: models.TypeCloudBrainTwo,
  643. JobName: req.JobName,
  644. JobType: string(models.JobTypeInference),
  645. })
  646. if err != nil {
  647. log.Error("InsertCloudbrainTemp failed: %v", err.Error())
  648. return err
  649. }
  650. }
  651. return err
  652. }
  653. attach, err := models.GetAttachmentByUUID(req.Uuid)
  654. if err != nil {
  655. log.Error("GetAttachmentByUUID(%s) failed:%v", strconv.FormatInt(jobResult.JobID, 10), err.Error())
  656. return err
  657. }
  658. jobID := strconv.FormatInt(jobResult.JobID, 10)
  659. err = models.CreateCloudbrain(&models.Cloudbrain{
  660. Status: TransTrainJobStatus(jobResult.Status),
  661. UserID: ctx.User.ID,
  662. RepoID: ctx.Repo.Repository.ID,
  663. JobID: jobID,
  664. JobName: req.JobName,
  665. DisplayJobName: req.DisplayJobName,
  666. JobType: string(models.JobTypeInference),
  667. Type: models.TypeCloudBrainTwo,
  668. VersionID: jobResult.VersionID,
  669. VersionName: jobResult.VersionName,
  670. Uuid: req.Uuid,
  671. DatasetName: attach.Name,
  672. CommitID: req.CommitID,
  673. EngineID: req.EngineID,
  674. TrainUrl: req.TrainUrl,
  675. BranchName: req.BranchName,
  676. Parameters: req.Params,
  677. BootFile: req.BootFile,
  678. DataUrl: req.DataUrl,
  679. LogUrl: req.LogUrl,
  680. FlavorCode: req.FlavorCode,
  681. Description: req.Description,
  682. WorkServerNumber: req.WorkServerNumber,
  683. FlavorName: req.FlavorName,
  684. EngineName: req.EngineName,
  685. LabelName: req.LabelName,
  686. IsLatestVersion: req.IsLatestVersion,
  687. ComputeResource: models.NPUResource,
  688. VersionCount: req.VersionCount,
  689. TotalVersionCount: req.TotalVersionCount,
  690. ModelName: req.ModelName,
  691. ModelVersion: req.ModelVersion,
  692. CkptName: req.CkptName,
  693. ResultUrl: req.ResultUrl,
  694. CreatedUnix: createTime,
  695. UpdatedUnix: createTime,
  696. })
  697. if err != nil {
  698. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
  699. return err
  700. }
  701. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, models.ActionCreateInferenceTask)
  702. return nil
  703. }
  704. func GetNotebookImageName(imageId string) (string, error) {
  705. var validImage = false
  706. var imageName = ""
  707. if ImageInfos == nil {
  708. json.Unmarshal([]byte(setting.ImageInfos), &ImageInfos)
  709. }
  710. for _, imageInfo := range ImageInfos.ImageInfo {
  711. if imageInfo.Id == imageId {
  712. validImage = true
  713. imageName = imageInfo.Value
  714. }
  715. }
  716. if !validImage {
  717. log.Error("the image id(%s) is invalid", imageId)
  718. return imageName, errors.New("the image id is invalid")
  719. }
  720. return imageName, nil
  721. }
  722. func InitSpecialPool() {
  723. if SpecialPools == nil && setting.ModelArtsSpecialPools != "" {
  724. json.Unmarshal([]byte(setting.ModelArtsSpecialPools), &SpecialPools)
  725. }
  726. }
  727. func HandleTrainJobInfo(task *models.Cloudbrain) error {
  728. result, err := GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
  729. if err != nil {
  730. log.Error("GetTrainJob(%s) failed:%v", task.DisplayJobName, err)
  731. return err
  732. }
  733. if result != nil {
  734. oldStatus := task.Status
  735. task.Status = TransTrainJobStatus(result.IntStatus)
  736. task.Duration = result.Duration / 1000
  737. task.TrainJobDuration = result.TrainJobDuration
  738. if task.StartTime == 0 && result.StartTime > 0 {
  739. task.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
  740. }
  741. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  742. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  743. task.EndTime = task.StartTime.Add(task.Duration)
  744. }
  745. task.CorrectCreateUnix()
  746. if oldStatus != task.Status {
  747. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  748. }
  749. err = models.UpdateJob(task)
  750. if err != nil {
  751. log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
  752. return err
  753. }
  754. }
  755. return nil
  756. }
  757. func HandleNotebookInfo(task *models.Cloudbrain) error {
  758. result, err := GetNotebook2(task.JobID)
  759. if err != nil {
  760. log.Error("GetNotebook2(%s) failed:%v", task.DisplayJobName, err)
  761. return err
  762. }
  763. if result != nil {
  764. oldStatus := task.Status
  765. task.Status = result.Status
  766. if task.StartTime == 0 && result.Lease.UpdateTime > 0 {
  767. task.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000)
  768. }
  769. if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) {
  770. task.EndTime = timeutil.TimeStampNow()
  771. }
  772. task.CorrectCreateUnix()
  773. task.ComputeAndSetDuration()
  774. if oldStatus != task.Status {
  775. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  776. }
  777. if task.FlavorCode == "" {
  778. task.FlavorCode = result.Flavor
  779. }
  780. err = models.UpdateJob(task)
  781. if err != nil {
  782. log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err)
  783. return err
  784. }
  785. }
  786. return nil
  787. }
  788. func SyncTempStatusJob() {
  789. jobs, err := models.GetCloudBrainTempJobs()
  790. if err != nil {
  791. log.Error("GetCloudBrainTempJobs failed:%v", err.Error())
  792. return
  793. }
  794. for _, temp := range jobs {
  795. log.Info("start to handle record: %s", temp.JobName)
  796. if temp.Type == models.TypeCloudBrainTwo {
  797. if temp.JobType == string(models.JobTypeDebug) {
  798. err = handleNotebook(temp)
  799. if err != nil {
  800. log.Error("handleNotebook falied:%v", err)
  801. break
  802. }
  803. } else if temp.JobType == string(models.JobTypeTrain) || temp.JobType == string(models.JobTypeInference) {
  804. _, err = models.GetCloudbrainByJobID(temp.JobID)
  805. if err != nil {
  806. //one version
  807. err = handleTrainJob(temp)
  808. if err != nil {
  809. log.Error("handleTrainJob falied:%v", err)
  810. break
  811. }
  812. } else {
  813. //multi version
  814. err = handleTrainJobMultiVersion(temp)
  815. if err != nil {
  816. log.Error("handleTrainJobMultiVersion falied:%v", err)
  817. break
  818. }
  819. }
  820. }
  821. }
  822. }
  823. return
  824. }
  825. func handleNotebook(temp *models.CloudbrainTemp) error {
  826. if temp.Status == models.TempJobStatus {
  827. err := handleTempNotebook(temp)
  828. if err != nil {
  829. log.Error("handleTempNotebook failed:%v", err)
  830. return err
  831. }
  832. } else if temp.Status == string(models.ModelArtsStopping) {
  833. res, err := GetNotebook2(temp.JobID)
  834. if err != nil {
  835. log.Error("GetNotebook2 failed:%v", err)
  836. return err
  837. }
  838. temp.Status = res.Status
  839. if temp.Status == string(models.ModelArtsStopped) {
  840. err = models.UpdateCloudbrainTemp(temp)
  841. if err != nil {
  842. log.Error("UpdateCloudbrainTemp failed:%v", err)
  843. return err
  844. }
  845. _, err := DelNotebook2(temp.JobID)
  846. if err != nil {
  847. log.Error("DelNotebook2 failed:%v", err)
  848. return err
  849. }
  850. }
  851. }
  852. return nil
  853. }
  854. func handleTempNotebook(temp *models.CloudbrainTemp) error {
  855. var err error
  856. var isExist bool
  857. for {
  858. result, err := GetNotebookList(1000, 0, "createTime", "DESC", temp.JobName)
  859. if err != nil {
  860. log.Error("GetNotebookList failed:%v", err)
  861. break
  862. }
  863. temp.QueryTimes++
  864. err = models.UpdateCloudbrainTemp(temp)
  865. if err != nil {
  866. log.Error("UpdateCloudbrainTemp failed:%v", err)
  867. }
  868. if result != nil {
  869. for _, notebook := range result.NotebookList {
  870. if temp.JobID == models.TempJobId {
  871. //new notebook
  872. if notebook.JobName == temp.JobName {
  873. isExist = true
  874. temp.Status = notebook.Status
  875. temp.JobID = notebook.JobID
  876. break
  877. }
  878. } else {
  879. //restart: always can find one record
  880. if notebook.JobName == temp.JobName {
  881. if notebook.Status != string(models.ModelArtsStopped) {
  882. isExist = true
  883. temp.Status = notebook.Status
  884. temp.JobID = notebook.JobID
  885. break
  886. }
  887. }
  888. }
  889. }
  890. if isExist {
  891. log.Info("find the record(%s)", temp.JobName)
  892. res, err := ManageNotebook2(temp.JobID, models.NotebookAction{Action: models.ActionStop})
  893. if err != nil {
  894. log.Error("ManageNotebook2(%s) failed:%v", temp.JobName, err)
  895. break
  896. }
  897. temp.Status = res.Status
  898. models.UpdateCloudbrainTemp(temp)
  899. } else {
  900. log.Error("can not find the record(%s) till now", temp.JobName)
  901. err = errors.New("not found")
  902. break
  903. }
  904. } else {
  905. log.Error("can not find the record(%s) till now", temp.JobName)
  906. err = errors.New("not found")
  907. break
  908. }
  909. break
  910. }
  911. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  912. log.Info("reach MaxTempQueryTimes, set the job failed")
  913. temp.Status = string(models.ModelArtsTrainJobFailed)
  914. err = models.UpdateCloudbrainTemp(temp)
  915. if err != nil {
  916. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  917. return err
  918. }
  919. }
  920. return err
  921. }
  922. func handleTrainJob(temp *models.CloudbrainTemp) error {
  923. if temp.Status == models.TempJobStatus {
  924. err := handleTempTrainJob(temp)
  925. if err != nil {
  926. log.Error("handleTempTrainJob failed:%v", err)
  927. return err
  928. }
  929. } else if temp.Status == string(models.ModelArtsStopping) {
  930. res, err := GetTrainJob(temp.JobID, temp.VersionID)
  931. if err != nil {
  932. log.Error("GetTrainJob failed:%v", err)
  933. return err
  934. }
  935. temp.Status = TransTrainJobStatus(res.IntStatus)
  936. if temp.Status == string(models.ModelArtsStopped) {
  937. err = models.UpdateCloudbrainTemp(temp)
  938. if err != nil {
  939. log.Error("UpdateCloudbrainTemp failed:%v", err)
  940. return err
  941. }
  942. _, err := DelTrainJob(temp.JobID)
  943. if err != nil {
  944. log.Error("DelTrainJob failed:%v", err)
  945. return err
  946. }
  947. }
  948. }
  949. return nil
  950. }
  951. func handleTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
  952. if temp.Status == models.TempJobStatus {
  953. err := handleTempTrainJobMultiVersion(temp)
  954. if err != nil {
  955. log.Error("handleTempTrainJobMultiVersion failed:%v", err)
  956. return err
  957. }
  958. } else if temp.Status == string(models.ModelArtsStopping) {
  959. res, err := GetTrainJob(temp.JobID, temp.VersionID)
  960. if err != nil {
  961. log.Error("GetTrainJob failed:%v", err)
  962. return err
  963. }
  964. temp.Status = TransTrainJobStatus(res.IntStatus)
  965. if temp.Status == string(models.ModelArtsStopped) {
  966. err = models.UpdateCloudbrainTemp(temp)
  967. if err != nil {
  968. log.Error("UpdateCloudbrainTemp failed:%v", err)
  969. return err
  970. }
  971. _, err := DelTrainJobVersion(temp.JobID, temp.VersionID)
  972. if err != nil {
  973. log.Error("DelTrainJob failed:%v", err)
  974. return err
  975. }
  976. }
  977. }
  978. return nil
  979. }
  980. func handleTempTrainJobMultiVersion(temp *models.CloudbrainTemp) error {
  981. var err error
  982. var isExist bool
  983. for {
  984. result, err := GetTrainJobVersionList(1000, 1, temp.JobID)
  985. if err != nil {
  986. log.Error("GetTrainJobVersionList failed:%v", err)
  987. break
  988. }
  989. temp.QueryTimes++
  990. err = models.UpdateCloudbrainTemp(temp)
  991. if err != nil {
  992. log.Error("UpdateCloudbrainTemp failed:%v", err)
  993. }
  994. if result != nil {
  995. //todo: check find
  996. count, _ := models.GetCloudbrainCountByJobName(temp.JobName, temp.JobType, temp.Type)
  997. if result.VersionCount == int64(count+1) {
  998. log.Info("find the record(%s)", temp.JobName)
  999. isExist = true
  1000. temp.Status = TransTrainJobStatus(result.JobVersionList[0].IntStatus)
  1001. temp.VersionID = strconv.FormatInt(result.JobVersionList[0].VersionID, 10)
  1002. _, err := StopTrainJob(temp.JobID, temp.VersionID)
  1003. if err != nil {
  1004. log.Error("StopTrainJob failed:%v", err)
  1005. break
  1006. }
  1007. temp.Status = string(models.ModelArtsStopping)
  1008. err = models.UpdateCloudbrainTemp(temp)
  1009. if err != nil {
  1010. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1011. break
  1012. }
  1013. }
  1014. }
  1015. break
  1016. }
  1017. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1018. log.Info("reach MaxTempQueryTimes, set the job failed")
  1019. temp.Status = string(models.ModelArtsTrainJobFailed)
  1020. err = models.UpdateCloudbrainTemp(temp)
  1021. if err != nil {
  1022. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1023. return err
  1024. }
  1025. }
  1026. return err
  1027. }
  1028. func handleTempTrainJob(temp *models.CloudbrainTemp) error {
  1029. var err error
  1030. var isExist bool
  1031. for {
  1032. result, err := GetTrainJobList(1000, 1, "create_time", "desc", temp.JobName)
  1033. if err != nil {
  1034. log.Error("GetTrainJobList failed:%v", err)
  1035. break
  1036. }
  1037. temp.QueryTimes++
  1038. err = models.UpdateCloudbrainTemp(temp)
  1039. if err != nil {
  1040. log.Error("UpdateCloudbrainTemp failed:%v", err)
  1041. }
  1042. if result != nil {
  1043. for _, job := range result.JobList {
  1044. if temp.JobName == job.JobName && TransTrainJobStatus(job.IntStatus) != string(models.ModelArtsTrainJobFailed) {
  1045. log.Info("find the record(%s)", temp.JobName)
  1046. isExist = true
  1047. temp.Status = TransTrainJobStatus(job.IntStatus)
  1048. temp.JobID = strconv.FormatInt(job.JobID, 10)
  1049. temp.VersionID = strconv.FormatInt(job.VersionID, 10)
  1050. _, err = StopTrainJob(temp.JobID, temp.VersionID)
  1051. if err != nil {
  1052. log.Error("StopTrainJob(%s) failed:%v", temp.JobName, err)
  1053. break
  1054. }
  1055. temp.Status = string(models.ModelArtsStopping)
  1056. err = models.UpdateCloudbrainTemp(temp)
  1057. if err != nil {
  1058. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1059. break
  1060. }
  1061. }
  1062. }
  1063. }
  1064. break
  1065. }
  1066. if temp.QueryTimes >= setting.MaxTempQueryTimes && !isExist {
  1067. log.Info("reach MaxTempQueryTimes, set the job failed")
  1068. temp.Status = string(models.ModelArtsTrainJobFailed)
  1069. err = models.UpdateCloudbrainTemp(temp)
  1070. if err != nil {
  1071. log.Error("UpdateCloudbrainTemp(%s) failed:%v", temp.JobName, err)
  1072. return err
  1073. }
  1074. }
  1075. return err
  1076. }