You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

modelarts.go 31 kB

4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
3 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
3 years ago
4 years ago
4 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037
  1. package modelarts
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "fmt"
  6. "math/rand"
  7. "path"
  8. "strconv"
  9. "strings"
  10. "time"
  11. "code.gitea.io/gitea/modules/timeutil"
  12. "code.gitea.io/gitea/models"
  13. "code.gitea.io/gitea/modules/context"
  14. "code.gitea.io/gitea/modules/log"
  15. "code.gitea.io/gitea/modules/notification"
  16. "code.gitea.io/gitea/modules/setting"
  17. "code.gitea.io/gitea/modules/storage"
  18. )
// Package-level constants used when creating and querying ModelArts jobs.
const (
	// Notebook (debug task) settings.
	storageTypeOBS = "obs"
	// NOTE(review): 4*60*60 and 4*60*60*1000 are presumably four hours in
	// seconds and in milliseconds respectively — confirm against the ModelArts API.
	autoStopDuration = 4 * 60 * 60
	autoStopDurationMs = 4 * 60 * 60 * 1000
	MORDELART_USER_IMAGE_ENGINE_ID = -1
	DataSetMountPath = "/home/ma-user/work"
	NotebookEnv = "Python3"
	NotebookType = "Ascend"
	FlavorInfo = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)"
	// Train-job settings.
	// The commented-out values below are kept verbatim from the original file.
	// ResourcePools = "{\"resource_pool\":[{\"id\":\"pool1328035d\", \"value\":\"专属资源池\"}]}"
	// Engines = "{\"engine\":[{\"id\":1, \"value\":\"Ascend-Powered-Engine\"}]}"
	// EngineVersions = "{\"version\":[{\"id\":118,\"value\":\"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64\"}," +
	// "{\"id\":119,\"value\":\"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64\"}," +
	// "{\"id\":120,\"value\":\"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64\"}," +
	// "{\"id\":117,\"value\":\"TF-1.15-c75-python3.7-euleros2.8-aarch64\"}" +
	// "]}"
	// TrainJobFlavorInfo = "{\"flavor\":[{\"code\":\"modelarts.bm.910.arm.public.2\",\"value\":\"Ascend : 2 * Ascend 910 CPU:48 核 512GiB\"}," +
	// "{\"code\":\"modelarts.bm.910.arm.public.8\",\"value\":\"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB\"}," +
	// "{\"code\":\"modelarts.bm.910.arm.public.4\",\"value\":\"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB\"}," +
	// "{\"code\":\"modelarts.bm.910.arm.public.1\",\"value\":\"Ascend : 1 * Ascend 910 CPU:24 核 256GiB\"}" +
	// "]}"
	// Sub-directory names (with surrounding slashes) used for job storage.
	CodePath = "/code/"
	OutputPath = "/output/"
	ResultPath = "/result/"
	LogPath = "/log/"
	JobPath = "/job/"
	OrderDesc = "desc" // query downward
	OrderAsc = "asc" // query upward
	Lines = 500
	// Parameter labels.
	TrainUrl = "train_url"
	DataUrl = "data_url"
	MultiDataUrl = "multi_data_url"
	ResultUrl = "result_url"
	CkptUrl = "ckpt_url"
	DeviceTarget = "device_target"
	Ascend = "Ascend"
	PerPage = 10
	// String flags stored in a task's IsLatestVersion field.
	IsLatestVersion = "1"
	NotLatestVersion = "0"
	// VersionCountOne is compared against a task's VersionCount to tell a
	// single-version job from a multi-version one.
	VersionCountOne = 1
	SortByCreateTime = "create_time"
	ConfigTypeCustom = "custom"
	TotalVersionCount = 1
)
// Package-level caches of configuration parsed from the settings.
var (
	// poolInfos is filled lazily from the JSON in setting.PoolInfos the
	// first time a notebook job is generated.
	poolInfos *models.PoolInfos
	// FlavorInfos is not populated anywhere in the visible part of this file.
	FlavorInfos *models.FlavorInfos
	// ImageInfos is filled lazily from the JSON in setting.ImageInfos by
	// GetNotebookImageName.
	ImageInfos *models.ImageInfosModelArts
)
// GenerateTrainJobReq carries the inputs consumed by GenerateTrainJob,
// GenerateTrainJobVersion and GenerateModelConvertTrainJob.
type GenerateTrainJobReq struct {
	JobName string
	DisplayJobName string
	Uuid string
	Description string
	// CodeObsPath is sent to ModelArts as the job's AppUrl.
	CodeObsPath string
	BootFile string
	BootFileUrl string
	DataUrl string
	TrainUrl string
	FlavorCode string
	LogUrl string
	PoolID string
	WorkServerNumber int
	// EngineID selects the creation path: a negative value creates the job
	// from a user image, any other value uses the built-in engine.
	EngineID int64
	// Parameters is sent to ModelArts with the job configuration.
	Parameters []models.Parameter
	CommitID string
	IsLatestVersion string
	// Params is stored on the Cloudbrain record as its Parameters field.
	Params string
	BranchName string
	PreVersionId int64
	PreVersionName string
	FlavorName string
	VersionCount int
	EngineName string
	TotalVersionCount int
	// UserImageUrl and UserCommand are only used on the user-image path.
	UserImageUrl string
	UserCommand string
	DatasetName string
}
// GenerateInferenceJobReq carries the inputs consumed by GenerateInferenceJob.
type GenerateInferenceJobReq struct {
	JobName string
	DisplayJobName string
	// Uuid is used to look up the attachment whose name is stored as the
	// task's dataset name.
	Uuid string
	Description string
	// CodeObsPath is sent to ModelArts as the job's AppUrl.
	CodeObsPath string
	BootFile string
	BootFileUrl string
	DataUrl string
	// TrainUrl is stored on the Cloudbrain record only; it is not sent to
	// ModelArts when the inference job is created.
	TrainUrl string
	FlavorCode string
	LogUrl string
	PoolID string
	WorkServerNumber int
	EngineID int64
	// Parameters is sent to ModelArts with the job configuration.
	Parameters []models.Parameter
	CommitID string
	// Params is stored on the Cloudbrain record as its Parameters field.
	Params string
	BranchName string
	FlavorName string
	EngineName string
	LabelName string
	IsLatestVersion string
	VersionCount int
	TotalVersionCount int
	ModelName string
	ModelVersion string
	CkptName string
	ResultUrl string
}
// VersionInfo maps a JSON document of the form
// {"version":[{"id":...,"value":"...","url":"..."}]}.
type VersionInfo struct {
	Version []struct {
		ID int `json:"id"`
		Value string `json:"value"`
		Url string `json:"url"`
	} `json:"version"`
}
// Flavor maps a JSON document of the form
// {"flavor":[{"code":"...","value":"..."}]}.
type Flavor struct {
	Info []struct {
		Code string `json:"code"`
		Value string `json:"value"`
	} `json:"flavor"`
}
// Engine maps a JSON document of the form
// {"engine":[{"id":...,"value":"..."}]}.
type Engine struct {
	Info []struct {
		ID int `json:"id"`
		Value string `json:"value"`
	} `json:"engine"`
}
// ResourcePool maps a JSON document of the form
// {"resource_pool":[{"id":"...","value":"..."}]}.
type ResourcePool struct {
	Info []struct {
		ID string `json:"id"`
		Value string `json:"value"`
	} `json:"resource_pool"`
}
  155. // type Parameter struct {
  156. // Label string `json:"label"`
  157. // Value string `json:"value"`
  158. // }
  159. // type Parameters struct {
  160. // Parameter []Parameter `json:"parameter"`
  161. // }
// Parameters maps a JSON document of the form
// {"parameter":[{"label":"...","value":"..."}]}.
type Parameters struct {
	Parameter []struct {
		Label string `json:"label"`
		Value string `json:"value"`
	} `json:"parameter"`
}
  168. func GenerateTask(ctx *context.Context, jobName, uuid, description, flavor string) error {
  169. var dataActualPath string
  170. if uuid != "" {
  171. dataActualPath = setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/"
  172. } else {
  173. userPath := setting.UserBasePath + ctx.User.Name + "/"
  174. isExist, err := storage.ObsHasObject(userPath)
  175. if err != nil {
  176. log.Error("ObsHasObject failed:%v", err.Error(), ctx.Data["MsgID"])
  177. return err
  178. }
  179. if !isExist {
  180. if err = storage.ObsCreateObject(userPath); err != nil {
  181. log.Error("ObsCreateObject failed:%v", err.Error(), ctx.Data["MsgID"])
  182. return err
  183. }
  184. }
  185. dataActualPath = setting.Bucket + "/" + userPath
  186. }
  187. if poolInfos == nil {
  188. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  189. }
  190. createTime := timeutil.TimeStampNow()
  191. jobResult, err := CreateJob(models.CreateNotebookParams{
  192. JobName: jobName,
  193. Description: description,
  194. ProfileID: setting.ProfileID,
  195. Flavor: flavor,
  196. Pool: models.Pool{
  197. ID: poolInfos.PoolInfo[0].PoolId,
  198. Name: poolInfos.PoolInfo[0].PoolName,
  199. Type: poolInfos.PoolInfo[0].PoolType,
  200. },
  201. Spec: models.Spec{
  202. Storage: models.Storage{
  203. Type: storageTypeOBS,
  204. Location: models.Location{
  205. Path: dataActualPath,
  206. },
  207. },
  208. AutoStop: models.AutoStop{
  209. Enable: true,
  210. Duration: autoStopDuration,
  211. },
  212. },
  213. })
  214. if err != nil {
  215. log.Error("CreateJob failed: %v", err.Error())
  216. return err
  217. }
  218. err = models.CreateCloudbrain(&models.Cloudbrain{
  219. Status: string(models.JobWaiting),
  220. UserID: ctx.User.ID,
  221. RepoID: ctx.Repo.Repository.ID,
  222. JobID: jobResult.ID,
  223. JobName: jobName,
  224. JobType: string(models.JobTypeDebug),
  225. Type: models.TypeCloudBrainTwo,
  226. Uuid: uuid,
  227. ComputeResource: models.NPUResource,
  228. CreatedUnix: createTime,
  229. UpdatedUnix: createTime,
  230. })
  231. if err != nil {
  232. return err
  233. }
  234. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobResult.ID, jobName, models.ActionCreateDebugNPUTask)
  235. return nil
  236. }
  237. func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, description, flavor, imageId string) error {
  238. if poolInfos == nil {
  239. json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
  240. }
  241. imageName, err := GetNotebookImageName(imageId)
  242. if err != nil {
  243. log.Error("GetNotebookImageName failed: %v", err.Error())
  244. return err
  245. }
  246. createTime := timeutil.TimeStampNow()
  247. task := &models.Cloudbrain{
  248. Status: string(models.ModelArtsTrainJobWaiting),
  249. UserID: ctx.User.ID,
  250. RepoID: ctx.Repo.Repository.ID,
  251. JobID: models.TempJobIdPrefix + jobName + strconv.Itoa(int(rand.New(rand.NewSource(time.Now().UnixNano())).Int31n(100000))),
  252. JobName: jobName,
  253. FlavorCode: flavor,
  254. DisplayJobName: displayJobName,
  255. JobType: string(models.JobTypeDebug),
  256. Type: models.TypeCloudBrainTwo,
  257. Uuid: uuid,
  258. ComputeResource: models.NPUResource,
  259. Image: imageName,
  260. Description: description,
  261. CreatedUnix: createTime,
  262. UpdatedUnix: createTime,
  263. }
  264. err = models.CreateCloudbrain(task)
  265. if err != nil {
  266. log.Error("CreateCloudbrain(%s) failed:%v", displayJobName, err.Error())
  267. return err
  268. }
  269. jobResult, err := createNotebook2(models.CreateNotebook2Params{
  270. JobName: jobName,
  271. Description: description,
  272. Flavor: flavor,
  273. Duration: autoStopDurationMs,
  274. ImageID: imageId,
  275. PoolID: poolInfos.PoolInfo[0].PoolId,
  276. Feature: models.NotebookFeature,
  277. Volume: models.VolumeReq{
  278. Capacity: setting.Capacity,
  279. Category: models.EVSCategory,
  280. Ownership: models.ManagedOwnership,
  281. },
  282. WorkspaceID: "0",
  283. })
  284. if err != nil {
  285. log.Error("createNotebook2 failed: %v", err.Error())
  286. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  287. log.Info("(%s)unknown error, set temp status", displayJobName)
  288. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  289. CloudbrainID: task.ID,
  290. Status: models.JobStatusTemp,
  291. Type: task.Type,
  292. JobName: task.JobName,
  293. JobType: task.JobType,
  294. })
  295. if errTemp != nil {
  296. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  297. return errTemp
  298. }
  299. } else {
  300. task.Status = string(models.ModelArtsCreateFailed)
  301. errTemp := models.UpdateJob(task)
  302. if errTemp != nil {
  303. log.Error("UpdateJob failed: %v", errTemp.Error())
  304. }
  305. errTemp = models.DeleteJob(task)
  306. if errTemp != nil {
  307. log.Error("DeleteJob failed: %v", errTemp.Error())
  308. }
  309. return err
  310. }
  311. } else {
  312. task.Status = jobResult.Status
  313. task.JobID = jobResult.ID
  314. err = models.UpdateJob(task)
  315. if err != nil {
  316. log.Error("UpdateJob failed: %v", err.Error())
  317. return err
  318. }
  319. }
  320. stringId := strconv.FormatInt(task.ID, 10)
  321. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateDebugNPUTask)
  322. return nil
  323. }
  324. func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) {
  325. createTime := timeutil.TimeStampNow()
  326. task := &models.Cloudbrain{
  327. Status: string(models.ModelArtsTrainJobWaiting),
  328. UserID: ctx.User.ID,
  329. RepoID: ctx.Repo.Repository.ID,
  330. JobID: models.TempJobIdPrefix + req.JobName + strconv.Itoa(int(rand.New(rand.NewSource(time.Now().UnixNano())).Int31n(100000))),
  331. JobName: req.JobName,
  332. DisplayJobName: req.DisplayJobName,
  333. JobType: string(models.JobTypeTrain),
  334. Type: models.TypeCloudBrainTwo,
  335. Uuid: req.Uuid,
  336. DatasetName: req.DatasetName,
  337. CommitID: req.CommitID,
  338. IsLatestVersion: req.IsLatestVersion,
  339. ComputeResource: models.NPUResource,
  340. EngineID: req.EngineID,
  341. TrainUrl: req.TrainUrl,
  342. BranchName: req.BranchName,
  343. Parameters: req.Params,
  344. BootFile: req.BootFile,
  345. DataUrl: req.DataUrl,
  346. LogUrl: req.LogUrl,
  347. FlavorCode: req.FlavorCode,
  348. Description: req.Description,
  349. WorkServerNumber: req.WorkServerNumber,
  350. FlavorName: req.FlavorName,
  351. EngineName: req.EngineName,
  352. VersionCount: req.VersionCount,
  353. TotalVersionCount: req.TotalVersionCount,
  354. CreatedUnix: createTime,
  355. UpdatedUnix: createTime,
  356. }
  357. err = models.CreateCloudbrain(task)
  358. if err != nil {
  359. log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, err.Error())
  360. return err
  361. }
  362. var jobResult *models.CreateTrainJobResult
  363. var createErr error
  364. if req.EngineID < 0 {
  365. jobResult, createErr = createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  366. JobName: req.JobName,
  367. Description: req.Description,
  368. Config: models.UserImageConfig{
  369. WorkServerNum: req.WorkServerNumber,
  370. AppUrl: req.CodeObsPath,
  371. BootFileUrl: req.BootFileUrl,
  372. DataUrl: req.DataUrl,
  373. TrainUrl: req.TrainUrl,
  374. LogUrl: req.LogUrl,
  375. PoolID: req.PoolID,
  376. CreateVersion: true,
  377. Flavor: models.Flavor{
  378. Code: req.FlavorCode,
  379. },
  380. Parameter: req.Parameters,
  381. UserImageUrl: req.UserImageUrl,
  382. UserCommand: req.UserCommand,
  383. },
  384. })
  385. } else {
  386. jobResult, createErr = createTrainJob(models.CreateTrainJobParams{
  387. JobName: req.JobName,
  388. Description: req.Description,
  389. Config: models.Config{
  390. WorkServerNum: req.WorkServerNumber,
  391. AppUrl: req.CodeObsPath,
  392. BootFileUrl: req.BootFileUrl,
  393. DataUrl: req.DataUrl,
  394. EngineID: req.EngineID,
  395. TrainUrl: req.TrainUrl,
  396. LogUrl: req.LogUrl,
  397. PoolID: req.PoolID,
  398. CreateVersion: true,
  399. Flavor: models.Flavor{
  400. Code: req.FlavorCode,
  401. },
  402. Parameter: req.Parameters,
  403. },
  404. })
  405. }
  406. if createErr != nil {
  407. log.Error("createTrainJob failed: %v", createErr.Error())
  408. if strings.HasPrefix(createErr.Error(), UnknownErrorPrefix) {
  409. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  410. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  411. CloudbrainID: task.ID,
  412. Status: models.JobStatusTemp,
  413. Type: task.Type,
  414. JobName: task.JobName,
  415. JobType: task.JobType,
  416. })
  417. if errTemp != nil {
  418. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  419. return errTemp
  420. }
  421. } else {
  422. task.Status = string(models.ModelArtsTrainJobFailed)
  423. errTemp := models.UpdateJob(task)
  424. if errTemp != nil {
  425. log.Error("UpdateJob failed: %v", errTemp.Error())
  426. }
  427. errTemp = models.DeleteJob(task)
  428. if errTemp != nil {
  429. log.Error("DeleteJob failed: %v", errTemp.Error())
  430. }
  431. return createErr
  432. }
  433. } else {
  434. task.Status = TransTrainJobStatus(jobResult.Status)
  435. task.JobID = strconv.FormatInt(jobResult.JobID, 10)
  436. task.VersionID = jobResult.VersionID
  437. task.VersionName = jobResult.VersionName
  438. err = models.UpdateJob(task)
  439. if err != nil {
  440. log.Error("UpdateJob failed: %v", err.Error())
  441. return err
  442. }
  443. }
  444. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, task.JobID, req.DisplayJobName, models.ActionCreateTrainTask)
  445. return nil
  446. }
  447. func GenerateModelConvertTrainJob(req *GenerateTrainJobReq) (*models.CreateTrainJobResult, error) {
  448. return createTrainJobUserImage(models.CreateUserImageTrainJobParams{
  449. JobName: req.JobName,
  450. Description: req.Description,
  451. Config: models.UserImageConfig{
  452. WorkServerNum: req.WorkServerNumber,
  453. AppUrl: req.CodeObsPath,
  454. BootFileUrl: req.BootFileUrl,
  455. DataUrl: req.DataUrl,
  456. TrainUrl: req.TrainUrl,
  457. LogUrl: req.LogUrl,
  458. PoolID: req.PoolID,
  459. CreateVersion: true,
  460. Flavor: models.Flavor{
  461. Code: req.FlavorCode,
  462. },
  463. Parameter: req.Parameters,
  464. UserImageUrl: req.UserImageUrl,
  465. UserCommand: req.UserCommand,
  466. },
  467. })
  468. }
  469. func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, jobId string) (err error) {
  470. var jobTypes []string
  471. jobTypes = append(jobTypes, string(models.JobTypeTrain))
  472. repo := ctx.Repo.Repository
  473. VersionTaskList, VersionListCount, err := models.CloudbrainsVersionList(&models.CloudbrainsOptions{
  474. RepoID: repo.ID,
  475. Type: models.TypeCloudBrainTwo,
  476. JobTypes: jobTypes,
  477. JobID: jobId,
  478. })
  479. if err != nil {
  480. ctx.ServerError("Cloudbrain", err)
  481. return err
  482. }
  483. //将当前版本的isLatestVersion设置为"1"和任务数量更新,任务数量包括当前版本数VersionCount和历史创建的总版本数TotalVersionCount
  484. createTime := timeutil.TimeStampNow()
  485. task := &models.Cloudbrain{
  486. Status: models.JobStatusTemp,
  487. UserID: ctx.User.ID,
  488. RepoID: ctx.Repo.Repository.ID,
  489. JobID: jobId,
  490. JobName: req.JobName,
  491. DisplayJobName: req.DisplayJobName,
  492. JobType: string(models.JobTypeTrain),
  493. Type: models.TypeCloudBrainTwo,
  494. Uuid: req.Uuid,
  495. DatasetName: req.DatasetName,
  496. CommitID: req.CommitID,
  497. IsLatestVersion: req.IsLatestVersion,
  498. PreVersionName: req.PreVersionName,
  499. ComputeResource: models.NPUResource,
  500. EngineID: req.EngineID,
  501. TrainUrl: req.TrainUrl,
  502. BranchName: req.BranchName,
  503. Parameters: req.Params,
  504. BootFile: req.BootFile,
  505. DataUrl: req.DataUrl,
  506. LogUrl: req.LogUrl,
  507. PreVersionId: req.PreVersionId,
  508. FlavorCode: req.FlavorCode,
  509. Description: req.Description,
  510. WorkServerNumber: req.WorkServerNumber,
  511. FlavorName: req.FlavorName,
  512. EngineName: req.EngineName,
  513. TotalVersionCount: VersionTaskList[0].TotalVersionCount + 1,
  514. VersionCount: VersionListCount + 1,
  515. CreatedUnix: createTime,
  516. UpdatedUnix: createTime,
  517. }
  518. err = models.CreateCloudbrain(task)
  519. if err != nil {
  520. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
  521. return err
  522. }
  523. //将训练任务的上一版本的isLatestVersion设置为"0"
  524. err = models.SetVersionCountAndLatestVersion(req.JobName, VersionTaskList[0].VersionName, VersionListCount, NotLatestVersion, VersionTaskList[0].TotalVersionCount)
  525. if err != nil {
  526. ctx.ServerError("Update IsLatestVersion failed", err)
  527. return err
  528. }
  529. var jobResult *models.CreateTrainJobResult
  530. var createErr error
  531. if req.EngineID < 0 {
  532. jobResult, createErr = createTrainJobVersionUserImage(models.CreateTrainJobVersionUserImageParams{
  533. Description: req.Description,
  534. Config: models.TrainJobVersionUserImageConfig{
  535. WorkServerNum: req.WorkServerNumber,
  536. AppUrl: req.CodeObsPath,
  537. BootFileUrl: req.BootFileUrl,
  538. DataUrl: req.DataUrl,
  539. TrainUrl: req.TrainUrl,
  540. LogUrl: req.LogUrl,
  541. PoolID: req.PoolID,
  542. Flavor: models.Flavor{
  543. Code: req.FlavorCode,
  544. },
  545. Parameter: req.Parameters,
  546. PreVersionId: req.PreVersionId,
  547. UserImageUrl: req.UserImageUrl,
  548. UserCommand: req.UserCommand,
  549. },
  550. }, jobId)
  551. } else {
  552. jobResult, createErr = createTrainJobVersion(models.CreateTrainJobVersionParams{
  553. Description: req.Description,
  554. Config: models.TrainJobVersionConfig{
  555. WorkServerNum: req.WorkServerNumber,
  556. AppUrl: req.CodeObsPath,
  557. BootFileUrl: req.BootFileUrl,
  558. DataUrl: req.DataUrl,
  559. EngineID: req.EngineID,
  560. TrainUrl: req.TrainUrl,
  561. LogUrl: req.LogUrl,
  562. PoolID: req.PoolID,
  563. Flavor: models.Flavor{
  564. Code: req.FlavorCode,
  565. },
  566. Parameter: req.Parameters,
  567. PreVersionId: req.PreVersionId,
  568. },
  569. }, jobId)
  570. }
  571. if createErr != nil {
  572. log.Error("createTrainJobVersion failed: %v", err.Error())
  573. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  574. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  575. errTemp := models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  576. CloudbrainID: task.ID,
  577. Status: models.JobStatusTemp,
  578. Type: task.Type,
  579. JobName: task.JobName,
  580. JobType: task.JobType,
  581. })
  582. if errTemp != nil {
  583. log.Error("InsertCloudbrainTemp failed: %v", errTemp.Error())
  584. return errTemp
  585. }
  586. } else {
  587. task.Status = string(models.ModelArtsTrainJobFailed)
  588. errTemp := models.UpdateJob(task)
  589. if errTemp != nil {
  590. log.Error("UpdateJob failed: %v", errTemp.Error())
  591. }
  592. errTemp = models.DeleteJob(task)
  593. if errTemp != nil {
  594. log.Error("DeleteJob failed: %v", errTemp.Error())
  595. }
  596. return createErr
  597. }
  598. } else {
  599. task.Status = TransTrainJobStatus(jobResult.Status)
  600. task.JobID = strconv.FormatInt(jobResult.JobID, 10)
  601. task.VersionID = jobResult.VersionID
  602. task.VersionName = jobResult.VersionName
  603. err = models.UpdateJob(task)
  604. if err != nil {
  605. log.Error("UpdateJob failed: %v", err.Error())
  606. return err
  607. }
  608. }
  609. return nil
  610. }
  611. func TransTrainJobStatus(status int) string {
  612. switch status {
  613. case 0:
  614. return "UNKNOWN"
  615. case 1:
  616. return "INIT"
  617. case 2:
  618. return "IMAGE_CREATING"
  619. case 3:
  620. return "IMAGE_FAILED"
  621. case 4:
  622. return "SUBMIT_TRYING"
  623. case 5:
  624. return "SUBMIT_FAILED"
  625. case 6:
  626. return "DELETE_FAILED"
  627. case 7:
  628. return "WAITING"
  629. case 8:
  630. return "RUNNING"
  631. case 9:
  632. return "KILLING"
  633. case 10:
  634. return "COMPLETED"
  635. case 11:
  636. return "FAILED"
  637. case 12:
  638. return "KILLED"
  639. case 13:
  640. return "CANCELED"
  641. case 14:
  642. return "LOST"
  643. case 15:
  644. return "SCALING"
  645. case 16:
  646. return "SUBMIT_MODEL_FAILED"
  647. case 17:
  648. return "DEPLOY_SERVICE_FAILED"
  649. case 18:
  650. return "CHECK_INIT"
  651. case 19:
  652. return "CHECK_RUNNING"
  653. case 20:
  654. return "CHECK_RUNNING_COMPLETED"
  655. case 21:
  656. return "CHECK_FAILED"
  657. default:
  658. return strconv.Itoa(status)
  659. }
  660. }
  661. func GetOutputPathByCount(TotalVersionCount int) (VersionOutputPath string) {
  662. talVersionCountToString := fmt.Sprintf("%04d", TotalVersionCount)
  663. VersionOutputPath = "V" + talVersionCountToString
  664. return VersionOutputPath
  665. }
  666. func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (err error) {
  667. createTime := timeutil.TimeStampNow()
  668. attach, err := models.GetAttachmentByUUID(req.Uuid)
  669. if err != nil {
  670. log.Error("GetAttachmentByUUID(%s) failed:%v", req.DisplayJobName, err.Error())
  671. return err
  672. }
  673. task := &models.Cloudbrain{
  674. Status: string(models.ModelArtsTrainJobWaiting),
  675. UserID: ctx.User.ID,
  676. RepoID: ctx.Repo.Repository.ID,
  677. JobID: models.TempJobIdPrefix + req.JobName + strconv.Itoa(int(rand.New(rand.NewSource(time.Now().UnixNano())).Int31n(100000))),
  678. JobName: req.JobName,
  679. DisplayJobName: req.DisplayJobName,
  680. JobType: string(models.JobTypeInference),
  681. Type: models.TypeCloudBrainTwo,
  682. Uuid: req.Uuid,
  683. DatasetName: attach.Name,
  684. CommitID: req.CommitID,
  685. EngineID: req.EngineID,
  686. TrainUrl: req.TrainUrl,
  687. BranchName: req.BranchName,
  688. Parameters: req.Params,
  689. BootFile: req.BootFile,
  690. DataUrl: req.DataUrl,
  691. LogUrl: req.LogUrl,
  692. FlavorCode: req.FlavorCode,
  693. Description: req.Description,
  694. WorkServerNumber: req.WorkServerNumber,
  695. FlavorName: req.FlavorName,
  696. EngineName: req.EngineName,
  697. LabelName: req.LabelName,
  698. IsLatestVersion: req.IsLatestVersion,
  699. ComputeResource: models.NPUResource,
  700. VersionCount: req.VersionCount,
  701. TotalVersionCount: req.TotalVersionCount,
  702. ModelName: req.ModelName,
  703. ModelVersion: req.ModelVersion,
  704. CkptName: req.CkptName,
  705. ResultUrl: req.ResultUrl,
  706. CreatedUnix: createTime,
  707. UpdatedUnix: createTime,
  708. }
  709. err = models.CreateCloudbrain(task)
  710. if err != nil {
  711. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
  712. return err
  713. }
  714. jobResult, err := createInferenceJob(models.CreateInferenceJobParams{
  715. JobName: req.JobName,
  716. Description: req.Description,
  717. InfConfig: models.InfConfig{
  718. WorkServerNum: req.WorkServerNumber,
  719. AppUrl: req.CodeObsPath,
  720. BootFileUrl: req.BootFileUrl,
  721. DataUrl: req.DataUrl,
  722. EngineID: req.EngineID,
  723. // TrainUrl: req.TrainUrl,
  724. LogUrl: req.LogUrl,
  725. PoolID: req.PoolID,
  726. CreateVersion: true,
  727. Flavor: models.Flavor{
  728. Code: req.FlavorCode,
  729. },
  730. Parameter: req.Parameters,
  731. },
  732. })
  733. if err != nil {
  734. log.Error("createTrainJob failed: %v", err.Error())
  735. if strings.HasPrefix(err.Error(), UnknownErrorPrefix) {
  736. log.Info("(%s)unknown error, set temp status", req.DisplayJobName)
  737. err = models.InsertCloudbrainTemp(&models.CloudbrainTemp{
  738. CloudbrainID: task.ID,
  739. Status: models.JobStatusTemp,
  740. Type: task.Type,
  741. JobName: task.JobName,
  742. JobType: task.JobType,
  743. })
  744. if err != nil {
  745. log.Error("InsertCloudbrainTemp failed: %v", err.Error())
  746. return err
  747. }
  748. } else {
  749. task.Status = string(models.ModelArtsTrainJobFailed)
  750. errTemp := models.UpdateJob(task)
  751. if errTemp != nil {
  752. log.Error("UpdateJob failed: %v", errTemp.Error())
  753. }
  754. errTemp = models.DeleteJob(task)
  755. if errTemp != nil {
  756. log.Error("DeleteJob failed: %v", errTemp.Error())
  757. }
  758. return err
  759. }
  760. } else {
  761. task.Status = TransTrainJobStatus(jobResult.Status)
  762. task.JobID = strconv.FormatInt(jobResult.JobID, 10)
  763. task.VersionID = jobResult.VersionID
  764. task.VersionName = jobResult.VersionName
  765. err = models.UpdateJob(task)
  766. if err != nil {
  767. log.Error("UpdateJob failed: %v", err.Error())
  768. return err
  769. }
  770. }
  771. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, task.JobID, req.DisplayJobName, models.ActionCreateInferenceTask)
  772. return nil
  773. }
  774. func GetNotebookImageName(imageId string) (string, error) {
  775. var validImage = false
  776. var imageName = ""
  777. if ImageInfos == nil {
  778. json.Unmarshal([]byte(setting.ImageInfos), &ImageInfos)
  779. }
  780. for _, imageInfo := range ImageInfos.ImageInfo {
  781. if imageInfo.Id == imageId {
  782. validImage = true
  783. imageName = imageInfo.Value
  784. }
  785. }
  786. if !validImage {
  787. log.Error("the image id(%s) is invalid", imageId)
  788. return imageName, errors.New("the image id is invalid")
  789. }
  790. return imageName, nil
  791. }
  792. func HandleTrainJobInfo(task *models.Cloudbrain) error {
  793. if isTempJob(task.JobID, task.Status) {
  794. if task.VersionCount > VersionCountOne {
  795. //multi version
  796. result, err := GetTrainJobVersionList(1000, 1, strings.TrimPrefix(task.JobID, models.TempJobIdPrefix))
  797. if err != nil {
  798. log.Error("GetTrainJobVersionList failed:%v", err)
  799. return err
  800. }
  801. if result != nil {
  802. if strconv.FormatInt(result.JobID, 10) == task.JobID && result.JobName == task.JobName {
  803. if result.VersionCount == int64(task.VersionCount) {
  804. log.Info("find the record(%s)", task.DisplayJobName)
  805. task.Status = TransTrainJobStatus(result.JobVersionList[0].IntStatus)
  806. task.VersionName = result.JobVersionList[0].VersionName
  807. task.VersionID = result.JobVersionList[0].VersionID
  808. err = models.UpdateJob(task)
  809. if err != nil {
  810. log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
  811. return err
  812. }
  813. temp, err := models.GetCloudbrainTempByCloudbrainID(task.ID)
  814. if err != nil {
  815. log.Error("no such temp record(%s):%v", task.DisplayJobName, err.Error())
  816. } else {
  817. err = models.DeleteCloudbrainTemp(temp)
  818. if err != nil {
  819. log.Error("DeleteCloudbrainTemp(%s) failed:%v", task.DisplayJobName, err)
  820. }
  821. }
  822. return nil
  823. } else {
  824. log.Error("can not find the record(%s) until now", task.DisplayJobName)
  825. }
  826. } else {
  827. log.Error("can not find the record(%s) until now", task.DisplayJobName)
  828. }
  829. }
  830. } else {
  831. //inference or one version
  832. result, err := GetTrainJobList(1000, 1, "create_time", "desc", task.JobName)
  833. if err != nil {
  834. log.Error("GetTrainJobList failed:%v", err)
  835. return err
  836. }
  837. if result != nil {
  838. for _, job := range result.JobList {
  839. if task.JobName == job.JobName {
  840. log.Info("find the record(%s)", task.DisplayJobName)
  841. task.Status = TransTrainJobStatus(job.IntStatus)
  842. task.JobID = strconv.FormatInt(job.JobID, 10)
  843. err = models.UpdateJob(task)
  844. if err != nil {
  845. log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err)
  846. return err
  847. }
  848. temp, err := models.GetCloudbrainTempByCloudbrainID(task.ID)
  849. if err != nil {
  850. log.Error("no such temp record(%s):%v", task.DisplayJobName, err.Error())
  851. return err
  852. }
  853. err = models.DeleteCloudbrainTemp(temp)
  854. if err != nil {
  855. log.Error("DeleteCloudbrainTemp(%s) failed:%v", task.DisplayJobName, err)
  856. return err
  857. }
  858. return nil
  859. }
  860. }
  861. }
  862. }
  863. } else {
  864. //normal
  865. result, err := GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
  866. if err != nil {
  867. log.Error("GetTrainJob(%s) failed:%v", task.DisplayJobName, err)
  868. return err
  869. }
  870. if result != nil {
  871. task.Status = TransTrainJobStatus(result.IntStatus)
  872. task.Duration = result.Duration / 1000
  873. task.TrainJobDuration = result.TrainJobDuration
  874. if task.StartTime == 0 && result.StartTime > 0 {
  875. task.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
  876. }
  877. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  878. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  879. task.EndTime = task.StartTime.Add(task.Duration)
  880. }
  881. task.CorrectCreateUnix()
  882. err = models.UpdateJob(task)
  883. if err != nil {
  884. log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
  885. return err
  886. }
  887. }
  888. }
  889. return nil
  890. }
  891. func HandleNotebookInfo(task *models.Cloudbrain) error {
  892. if isTempJob(task.JobID, task.Status) {
  893. result, err := GetNotebookList(1000, 0, "createTime", "DESC", task.JobName)
  894. if err != nil {
  895. log.Error("GetNotebookList failed:%v", err)
  896. return err
  897. }
  898. if result != nil {
  899. count, err := models.GetCloudbrainCountByJobName(task.JobName, task.JobType)
  900. if err != nil {
  901. log.Error("GetCloudbrainCountByJobName failed:%v", err)
  902. return err
  903. }
  904. if len(result.NotebookList) == count {
  905. if result.NotebookList[0].JobName == task.JobName {
  906. log.Info("find the record(%s)", task.DisplayJobName)
  907. task.Status = result.NotebookList[0].Status
  908. task.JobID = result.NotebookList[0].JobID
  909. err = models.UpdateJob(task)
  910. if err != nil {
  911. log.Error("UpdateJob(%s) failed:%v", task.JobName, err)
  912. return err
  913. }
  914. temp, err := models.GetCloudbrainTempByCloudbrainID(task.ID)
  915. if err != nil {
  916. log.Error("no such temp record(%s):%v", task.DisplayJobName, err.Error())
  917. return err
  918. }
  919. err = models.DeleteCloudbrainTemp(temp)
  920. if err != nil {
  921. log.Error("DeleteCloudbrainTemp(%s) failed:%v", task.DisplayJobName, err)
  922. return err
  923. }
  924. return nil
  925. } else {
  926. log.Error("can not find the record(%s) until now", task.DisplayJobName)
  927. }
  928. } else {
  929. log.Error("can not find the record(%s) until now", task.DisplayJobName)
  930. }
  931. } else {
  932. log.Error("can not find the record(%s) until now", task.DisplayJobName)
  933. }
  934. } else {
  935. //normal
  936. result, err := GetNotebook2(task.JobID)
  937. if err != nil {
  938. log.Error("GetNotebook2(%s) failed:%v", task.DisplayJobName, err)
  939. return err
  940. }
  941. if result != nil {
  942. task.Status = result.Status
  943. if task.StartTime == 0 && result.Lease.UpdateTime > 0 {
  944. task.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000)
  945. }
  946. if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) {
  947. task.EndTime = timeutil.TimeStampNow()
  948. }
  949. task.CorrectCreateUnix()
  950. task.ComputeAndSetDuration()
  951. err = models.UpdateJob(task)
  952. if err != nil {
  953. log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err)
  954. return err
  955. }
  956. }
  957. }
  958. return nil
  959. }
  960. func isTempJob(jobID, status string) bool {
  961. if (strings.HasPrefix(jobID, models.TempJobIdPrefix) && status == string(models.ModelArtsTrainJobWaiting)) || status == models.JobStatusTemp {
  962. return true
  963. }
  964. return false
  965. }