You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

train.go 26 kB

3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794
  1. package cloudbrainTask
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "fmt"
  6. "io"
  7. "io/ioutil"
  8. "net/http"
  9. "os"
  10. "path"
  11. "regexp"
  12. "strings"
  13. "code.gitea.io/gitea/modules/timeutil"
  14. "code.gitea.io/gitea/modules/notification"
  15. "code.gitea.io/gitea/modules/obs"
  16. "code.gitea.io/gitea/modules/git"
  17. "code.gitea.io/gitea/modules/storage"
  18. "github.com/unknwon/com"
  19. "code.gitea.io/gitea/models"
  20. "code.gitea.io/gitea/modules/cloudbrain"
  21. "code.gitea.io/gitea/modules/context"
  22. "code.gitea.io/gitea/modules/grampus"
  23. "code.gitea.io/gitea/modules/log"
  24. "code.gitea.io/gitea/modules/modelarts"
  25. "code.gitea.io/gitea/modules/redis/redis_key"
  26. "code.gitea.io/gitea/modules/redis/redis_lock"
  27. "code.gitea.io/gitea/modules/setting"
  28. api "code.gitea.io/gitea/modules/structs"
  29. "code.gitea.io/gitea/modules/util"
  30. "code.gitea.io/gitea/services/cloudbrain/resource"
  31. "code.gitea.io/gitea/services/reward/point/account"
  32. )
  // jobNameExpr is the syntax a display job name must follow: 3 to 36
  // characters in total, starting with a lowercase letter or digit, continuing
  // with lowercase letters, digits, '-' or '_', and ending with a lowercase
  // letter, a digit or '-'.
  const jobNameExpr = `^[a-z0-9][a-z0-9-_]{1,34}[a-z0-9-]$`

  // jobNamePattern is the compiled form of jobNameExpr. It is built once at
  // package initialisation and used by checkParameters to validate
  // option.DisplayJobName.
  var jobNamePattern = regexp.MustCompile(jobNameExpr)
  34. func GrampusTrainJobGpuCreate(ctx *context.Context, option api.CreateTrainJobOption) {
  35. displayJobName := option.DisplayJobName
  36. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  37. uuid := option.Attachment
  38. description := option.Description
  39. bootFile := strings.TrimSpace(option.BootFile)
  40. params := option.Params
  41. repo := ctx.Repo.Repository
  42. codeLocalPath := setting.JobPath + jobName + cloudbrain.CodeMountPath + "/"
  43. codeMinioPath := setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/"
  44. branchName := option.BranchName
  45. image := strings.TrimSpace(option.Image)
  46. lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), string(models.JobTypeTrain), displayJobName))
  47. defer lock.UnLock()
  48. spec, datasetInfos, datasetNames, err := checkParameters(ctx, option, lock, repo)
  49. if err != nil {
  50. ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(err.Error()))
  51. return
  52. }
  53. //prepare code and out path
  54. _, err = ioutil.ReadDir(codeLocalPath)
  55. if err == nil {
  56. os.RemoveAll(codeLocalPath)
  57. }
  58. if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil {
  59. log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  60. ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("cloudbrain.load_code_failed")))
  61. }
  62. //todo: upload code (send to file_server todo this work?)
  63. //upload code
  64. if err := uploadCodeToMinio(codeLocalPath+"/", jobName, cloudbrain.CodeMountPath+"/"); err != nil {
  65. log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  66. ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("cloudbrain.load_code_failed")))
  67. return
  68. }
  69. modelPath := setting.JobPath + jobName + cloudbrain.ModelMountPath + "/"
  70. if err := mkModelPath(modelPath); err != nil {
  71. log.Error("Failed to mkModelPath: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  72. ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("cloudbrain.load_code_failed")))
  73. return
  74. }
  75. //init model readme
  76. if err := uploadCodeToMinio(modelPath, jobName, cloudbrain.ModelMountPath+"/"); err != nil {
  77. log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  78. ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("cloudbrain.load_code_failed")))
  79. return
  80. }
  81. var datasetRemotePath, allFileName string
  82. for _, datasetInfo := range datasetInfos {
  83. if datasetRemotePath == "" {
  84. datasetRemotePath = datasetInfo.DataLocalPath
  85. allFileName = datasetInfo.FullName
  86. } else {
  87. datasetRemotePath = datasetRemotePath + ";" + datasetInfo.DataLocalPath
  88. allFileName = allFileName + ";" + datasetInfo.FullName
  89. }
  90. }
  91. //prepare command
  92. preTrainModelPath := getPreTrainModelPath(option.PreTrainModelUrl, option.CkptName)
  93. command, err := generateCommand(repo.Name, grampus.ProcessorTypeGPU, codeMinioPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, bootFile, params, setting.CBCodePathPrefix+jobName+cloudbrain.ModelMountPath+"/", allFileName, preTrainModelPath, option.CkptName)
  94. if err != nil {
  95. log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"])
  96. ctx.JSON(http.StatusOK, models.BaseErrorMessageApi("Create task failed, internal error"))
  97. return
  98. }
  99. commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName)
  100. req := &grampus.GenerateTrainJobReq{
  101. JobName: jobName,
  102. DisplayJobName: displayJobName,
  103. ComputeResource: models.GPUResource,
  104. ProcessType: grampus.ProcessorTypeGPU,
  105. Command: command,
  106. ImageUrl: image,
  107. Description: description,
  108. BootFile: bootFile,
  109. Uuid: uuid,
  110. CommitID: commitID,
  111. BranchName: branchName,
  112. Params: option.Params,
  113. EngineName: image,
  114. DatasetNames: datasetNames,
  115. DatasetInfos: datasetInfos,
  116. IsLatestVersion: modelarts.IsLatestVersion,
  117. VersionCount: modelarts.VersionCountOne,
  118. WorkServerNumber: 1,
  119. Spec: spec,
  120. }
  121. if option.ModelName != "" { //使用预训练模型训练
  122. req.ModelName = option.ModelName
  123. req.LabelName = option.LabelName
  124. req.CkptName = option.CkptName
  125. req.ModelVersion = option.ModelVersion
  126. req.PreTrainModelUrl = option.PreTrainModelUrl
  127. }
  128. jobId, err := grampus.GenerateTrainJob(ctx, req)
  129. if err != nil {
  130. log.Error("GenerateTrainJob failed:%v", err.Error(), ctx.Data["MsgID"])
  131. ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(err.Error()))
  132. return
  133. }
  134. ctx.JSON(http.StatusOK, models.BaseMessageApi{Code: 0, Message: jobId})
  135. }
  136. func checkParameters(ctx *context.Context, option api.CreateTrainJobOption, lock *redis_lock.DistributeLock, repo *models.Repository) (*models.Specification, map[string]models.DatasetInfo, string, error) {
  137. isOk, err := lock.Lock(models.CloudbrainKeyDuration)
  138. if !isOk {
  139. log.Error("lock processed failed:%v", err, ctx.Data["MsgID"])
  140. return nil, nil, "", fmt.Errorf(ctx.Tr("repo.cloudbrain_samejob_err"))
  141. }
  142. if !jobNamePattern.MatchString(option.DisplayJobName) {
  143. return nil, nil, "", fmt.Errorf(ctx.Tr("repo.cloudbrain_jobname_err"))
  144. }
  145. bootFileExist, err := ctx.Repo.FileExists(option.BootFile, option.BranchName)
  146. if err != nil || !bootFileExist {
  147. log.Error("Get bootfile error:", err, ctx.Data["MsgID"])
  148. return nil, nil, "", fmt.Errorf(ctx.Tr("repo.cloudbrain_bootfile_err"))
  149. }
  150. computeResource := models.GPUResource
  151. if option.Type == 3 {
  152. computeResource = models.NPUResource
  153. }
  154. //check count limit
  155. count, err := GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeTrain), computeResource)
  156. if err != nil {
  157. log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"])
  158. return nil, nil, "", fmt.Errorf("system error")
  159. } else {
  160. if count >= 1 {
  161. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  162. return nil, nil, "", fmt.Errorf("you have already a running or waiting task, can not create more.")
  163. }
  164. }
  165. //check param
  166. if err := grampusParamCheckCreateTrainJob(option.BootFile, option.BranchName); err != nil {
  167. log.Error("paramCheckCreateTrainJob failed:(%v)", err, ctx.Data["MsgID"])
  168. return nil, nil, "", err
  169. }
  170. //check whether the task name in the project is duplicated
  171. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), option.DisplayJobName)
  172. if err == nil {
  173. if len(tasks) != 0 {
  174. log.Error("the job name did already exist", ctx.Data["MsgID"])
  175. return nil, nil, "", fmt.Errorf("The job name did already exist.")
  176. }
  177. } else {
  178. if !models.IsErrJobNotExist(err) {
  179. log.Error("system error, %v", err, ctx.Data["MsgID"])
  180. return nil, nil, "", fmt.Errorf("system error")
  181. }
  182. }
  183. //check specification
  184. computeType := models.GPU
  185. if option.Type == 3 {
  186. computeType = models.NPU
  187. }
  188. spec, err := resource.GetAndCheckSpec(ctx.User.ID, option.SpecId, models.FindSpecsOptions{
  189. JobType: models.JobTypeTrain,
  190. ComputeResource: computeType,
  191. Cluster: models.C2NetCluster,
  192. })
  193. if err != nil || spec == nil {
  194. return nil, nil, "", fmt.Errorf("Resource specification is not available.")
  195. }
  196. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  197. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  198. return nil, nil, "", fmt.Errorf(ctx.Tr("points.insufficient_points_balance"))
  199. }
  200. //check dataset
  201. datasetInfos, datasetNames, err := models.GetDatasetInfo(option.Attachment, computeType)
  202. if err != nil {
  203. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  204. return nil, nil, "", fmt.Errorf(ctx.Tr("cloudbrain.error.dataset_select"))
  205. }
  206. return spec, datasetInfos, datasetNames, err
  207. }
  208. func GrampusTrainJobNpuCreate(ctx *context.Context, option api.CreateTrainJobOption) {
  209. displayJobName := option.DisplayJobName
  210. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  211. uuid := option.Attachment
  212. description := option.Description
  213. bootFile := strings.TrimSpace(option.BootFile)
  214. params := option.Params
  215. repo := ctx.Repo.Repository
  216. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  217. codeObsPath := grampus.JobPath + jobName + modelarts.CodePath
  218. branchName := option.BranchName
  219. isLatestVersion := modelarts.IsLatestVersion
  220. versionCount := modelarts.VersionCountOne
  221. engineName := option.Image
  222. lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), string(models.JobTypeTrain), displayJobName))
  223. defer lock.UnLock()
  224. spec, datasetInfos, datasetNames, err := checkParameters(ctx, option, lock, repo)
  225. if err != nil {
  226. ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(err.Error()))
  227. return
  228. }
  229. //prepare code and out path
  230. _, err = ioutil.ReadDir(codeLocalPath)
  231. if err == nil {
  232. os.RemoveAll(codeLocalPath)
  233. }
  234. if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil {
  235. log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err)
  236. ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("cloudbrain.load_code_failed")))
  237. return
  238. }
  239. //todo: upload code (send to file_server todo this work?)
  240. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil {
  241. log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
  242. ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("cloudbrain.load_code_failed")))
  243. return
  244. }
  245. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  246. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  247. ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(ctx.Tr("cloudbrain.load_code_failed")))
  248. return
  249. }
  250. var datasetRemotePath, allFileName string
  251. for _, datasetInfo := range datasetInfos {
  252. if datasetRemotePath == "" {
  253. datasetRemotePath = datasetInfo.DataLocalPath + "'" + datasetInfo.FullName + "'"
  254. allFileName = datasetInfo.FullName
  255. } else {
  256. datasetRemotePath = datasetRemotePath + ";" + datasetInfo.DataLocalPath + "'" + datasetInfo.FullName + "'"
  257. allFileName = allFileName + ";" + datasetInfo.FullName
  258. }
  259. }
  260. //prepare command
  261. preTrainModelPath := getPreTrainModelPath(option.PreTrainModelUrl, option.CkptName)
  262. command, err := generateCommand(repo.Name, grampus.ProcessorTypeNPU, codeObsPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, bootFile, params, setting.CodePathPrefix+jobName+modelarts.OutputPath, allFileName, preTrainModelPath, option.CkptName)
  263. if err != nil {
  264. log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"])
  265. ctx.JSON(http.StatusOK, models.BaseErrorMessageApi("Create task failed, internal error"))
  266. return
  267. }
  268. commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName)
  269. req := &grampus.GenerateTrainJobReq{
  270. JobName: jobName,
  271. DisplayJobName: displayJobName,
  272. ComputeResource: models.NPUResource,
  273. ProcessType: grampus.ProcessorTypeNPU,
  274. Command: command,
  275. ImageId: option.ImageID,
  276. Description: description,
  277. CodeObsPath: codeObsPath,
  278. BootFileUrl: codeObsPath + bootFile,
  279. BootFile: bootFile,
  280. WorkServerNumber: option.WorkServerNumber,
  281. Uuid: uuid,
  282. CommitID: commitID,
  283. IsLatestVersion: isLatestVersion,
  284. BranchName: branchName,
  285. Params: option.Params,
  286. EngineName: engineName,
  287. VersionCount: versionCount,
  288. TotalVersionCount: modelarts.TotalVersionCount,
  289. DatasetNames: datasetNames,
  290. DatasetInfos: datasetInfos,
  291. Spec: spec,
  292. CodeName: strings.ToLower(repo.Name),
  293. }
  294. if option.ModelName != "" { //使用预训练模型训练
  295. req.ModelName = option.ModelName
  296. req.LabelName = option.LabelName
  297. req.CkptName = option.CkptName
  298. req.ModelVersion = option.ModelVersion
  299. req.PreTrainModelUrl = option.PreTrainModelUrl
  300. req.PreTrainModelPath = preTrainModelPath
  301. }
  302. jobId, err := grampus.GenerateTrainJob(ctx, req)
  303. if err != nil {
  304. log.Error("GenerateTrainJob failed:%v", err.Error())
  305. ctx.JSON(http.StatusOK, models.BaseErrorMessageApi(err.Error()))
  306. return
  307. }
  308. ctx.JSON(http.StatusOK, models.BaseMessageApi{Code: 0, Message: jobId})
  309. }
  310. func obsMkdir(dir string) error {
  311. input := &obs.PutObjectInput{}
  312. input.Bucket = setting.Bucket
  313. input.Key = dir
  314. _, err := storage.ObsCli.PutObject(input)
  315. if err != nil {
  316. log.Error("PutObject(%s) failed: %s", input.Key, err.Error())
  317. return err
  318. }
  319. return nil
  320. }
  321. func uploadCodeToObs(codePath, jobName, parentDir string) error {
  322. files, err := readDir(codePath)
  323. if err != nil {
  324. log.Error("readDir(%s) failed: %s", codePath, err.Error())
  325. return err
  326. }
  327. for _, file := range files {
  328. if file.IsDir() {
  329. input := &obs.PutObjectInput{}
  330. input.Bucket = setting.Bucket
  331. input.Key = parentDir + file.Name() + "/"
  332. _, err = storage.ObsCli.PutObject(input)
  333. if err != nil {
  334. log.Error("PutObject(%s) failed: %s", input.Key, err.Error())
  335. return err
  336. }
  337. if err = uploadCodeToObs(codePath+file.Name()+"/", jobName, parentDir+file.Name()+"/"); err != nil {
  338. log.Error("uploadCodeToObs(%s) failed: %s", file.Name(), err.Error())
  339. return err
  340. }
  341. } else {
  342. input := &obs.PutFileInput{}
  343. input.Bucket = setting.Bucket
  344. input.Key = setting.CodePathPrefix + jobName + "/code/" + parentDir + file.Name()
  345. input.SourceFile = codePath + file.Name()
  346. _, err = storage.ObsCli.PutFile(input)
  347. if err != nil {
  348. log.Error("PutFile(%s) failed: %s", input.SourceFile, err.Error())
  349. return err
  350. }
  351. }
  352. }
  353. return nil
  354. }
  355. func grampusParamCheckCreateTrainJob(bootFile string, branchName string) error {
  356. if !strings.HasSuffix(strings.TrimSpace(bootFile), ".py") {
  357. log.Error("the boot file(%s) must be a python file", bootFile)
  358. return errors.New("启动文件必须是python文件")
  359. }
  360. if branchName == "" {
  361. log.Error("the branch must not be null!", branchName)
  362. return errors.New("代码分支不能为空!")
  363. }
  364. return nil
  365. }
  366. func downloadZipCode(ctx *context.Context, codePath, branchName string) error {
  367. archiveType := git.ZIP
  368. archivePath := codePath
  369. if !com.IsDir(archivePath) {
  370. if err := os.MkdirAll(archivePath, os.ModePerm); err != nil {
  371. log.Error("MkdirAll failed:" + err.Error())
  372. return err
  373. }
  374. }
  375. // Get corresponding commit.
  376. var (
  377. commit *git.Commit
  378. err error
  379. )
  380. gitRepo := ctx.Repo.GitRepo
  381. if err != nil {
  382. log.Error("OpenRepository failed:" + err.Error())
  383. return err
  384. }
  385. if gitRepo.IsBranchExist(branchName) {
  386. commit, err = gitRepo.GetBranchCommit(branchName)
  387. if err != nil {
  388. log.Error("GetBranchCommit failed:" + err.Error())
  389. return err
  390. }
  391. } else {
  392. log.Error("the branch is not exist: " + branchName)
  393. return fmt.Errorf("The branch does not exist.")
  394. }
  395. archivePath = path.Join(archivePath, grampus.CodeArchiveName)
  396. if !com.IsFile(archivePath) {
  397. if err := commit.CreateArchive(archivePath, git.CreateArchiveOpts{
  398. Format: archiveType,
  399. Prefix: setting.Repository.PrefixArchiveFiles,
  400. }); err != nil {
  401. log.Error("CreateArchive failed:" + err.Error())
  402. return err
  403. }
  404. }
  405. return nil
  406. }
  407. func uploadCodeToMinio(codePath, jobName, parentDir string) error {
  408. files, err := readDir(codePath)
  409. if err != nil {
  410. log.Error("readDir(%s) failed: %s", codePath, err.Error())
  411. return err
  412. }
  413. for _, file := range files {
  414. if file.IsDir() {
  415. if err = uploadCodeToMinio(codePath+file.Name()+"/", jobName, parentDir+file.Name()+"/"); err != nil {
  416. log.Error("uploadCodeToMinio(%s) failed: %s", file.Name(), err.Error())
  417. return err
  418. }
  419. } else {
  420. destObject := setting.CBCodePathPrefix + jobName + parentDir + file.Name()
  421. sourceFile := codePath + file.Name()
  422. err = storage.Attachments.UploadObject(destObject, sourceFile)
  423. if err != nil {
  424. log.Error("UploadObject(%s) failed: %s", file.Name(), err.Error())
  425. return err
  426. }
  427. }
  428. }
  429. return nil
  430. }
  // readDir returns the entries of directory dirname in the order the
  // operating system reports them; the list is not sorted.
  //
  // The directory handle is closed before the result is inspected. An io.EOF
  // from the read is reported as (nil, nil); any other error is returned with
  // a nil list.
  func readDir(dirname string) ([]os.FileInfo, error) {
  	dir, openErr := os.Open(dirname)
  	if openErr != nil {
  		return nil, openErr
  	}

  	entries, readErr := dir.Readdir(0)
  	dir.Close()

  	switch {
  	case readErr == nil:
  		return entries, nil
  	case readErr == io.EOF:
  		//todo: can not upload empty folder
  		return nil, nil
  	default:
  		return nil, readErr
  	}
  }
  448. func mkModelPath(modelPath string) error {
  449. return mkPathAndReadMeFile(modelPath, "You can put the files into this directory and download the files by the web page.")
  450. }
  451. func mkPathAndReadMeFile(path string, text string) error {
  452. err := os.MkdirAll(path, os.ModePerm)
  453. if err != nil {
  454. log.Error("MkdirAll(%s) failed:%v", path, err)
  455. return err
  456. }
  457. fileName := path + "README"
  458. f, err := os.OpenFile(fileName, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, os.ModePerm)
  459. if err != nil {
  460. log.Error("OpenFile failed", err.Error())
  461. return err
  462. }
  463. defer f.Close()
  464. _, err = f.WriteString(text)
  465. if err != nil {
  466. log.Error("WriteString failed", err.Error())
  467. return err
  468. }
  469. return nil
  470. }
  // getPreTrainModelPath strips everything up to and including the first "/"
  // from pretrainModelDir and appends fileName to what is left.
  //
  // It returns "" when pretrainModelDir contains no "/" or starts with one.
  func getPreTrainModelPath(pretrainModelDir string, fileName string) string {
  	slash := strings.Index(pretrainModelDir, "/")
  	if slash <= 0 {
  		return ""
  	}
  	return pretrainModelDir[slash+1:] + fileName
  }
  480. func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bootFile, paramSrc, outputRemotePath, datasetName, pretrainModelPath, pretrainModelFileName string) (string, error) {
  481. var command string
  482. workDir := grampus.NpuWorkDir
  483. if processorType == grampus.ProcessorTypeGPU {
  484. workDir = grampus.GpuWorkDir
  485. }
  486. command += "pwd;cd " + workDir + fmt.Sprintf(grampus.CommandPrepareScript, setting.Grampus.SyncScriptProject, setting.Grampus.SyncScriptProject)
  487. //download code & dataset
  488. if processorType == grampus.ProcessorTypeNPU {
  489. //no need to download code & dataset by internet
  490. } else if processorType == grampus.ProcessorTypeGPU {
  491. commandDownload := "./downloader_for_minio " + setting.Grampus.Env + " " + codeRemotePath + " " + grampus.CodeArchiveName + " '" + dataRemotePath + "' '" + datasetName + "'"
  492. commandDownload = processPretrainModelParameter(pretrainModelPath, pretrainModelFileName, commandDownload)
  493. command += commandDownload
  494. }
  495. //unzip code & dataset
  496. if processorType == grampus.ProcessorTypeNPU {
  497. //no need to process
  498. } else if processorType == grampus.ProcessorTypeGPU {
  499. unZipDatasetCommand := generateDatasetUnzipCommand(datasetName)
  500. commandUnzip := "cd " + workDir + "code;unzip -q master.zip;echo \"start to unzip dataset\";cd " + workDir + "dataset;" + unZipDatasetCommand
  501. command += commandUnzip
  502. }
  503. command += "echo \"unzip finished;start to exec code;\";"
  504. // set export
  505. var commandExport string
  506. if processorType == grampus.ProcessorTypeNPU {
  507. commandExport = "export bucket=" + setting.Bucket + " && export remote_path=" + outputRemotePath + ";"
  508. } else if processorType == grampus.ProcessorTypeGPU {
  509. commandExport = "export env=" + setting.Grampus.Env + " && export remote_path=" + outputRemotePath + ";"
  510. }
  511. command += commandExport
  512. //exec code
  513. var parameters models.Parameters
  514. var paramCode string
  515. if len(paramSrc) != 0 {
  516. err := json.Unmarshal([]byte(paramSrc), &parameters)
  517. if err != nil {
  518. log.Error("Failed to Unmarshal params: %s (%v)", paramSrc, err)
  519. return command, err
  520. }
  521. for _, parameter := range parameters.Parameter {
  522. paramCode += " --" + parameter.Label + "=" + parameter.Value
  523. }
  524. }
  525. var commandCode string
  526. if processorType == grampus.ProcessorTypeNPU {
  527. commandCode = "/bin/bash /home/work/run_train_for_openi.sh /home/work/openi.py /tmp/log/train.log" + paramCode + ";"
  528. } else if processorType == grampus.ProcessorTypeGPU {
  529. if pretrainModelFileName != "" {
  530. paramCode += " --ckpt_url" + "=" + workDir + "pretrainmodel/" + pretrainModelFileName
  531. }
  532. commandCode = "cd " + workDir + "code/" + strings.ToLower(repoName) + ";python " + bootFile + paramCode + ";"
  533. }
  534. command += commandCode
  535. //get exec result
  536. commandGetRes := "result=$?;"
  537. command += commandGetRes
  538. //upload models
  539. if processorType == grampus.ProcessorTypeNPU {
  540. commandUpload := "cd " + workDir + setting.Grampus.SyncScriptProject + "/;./uploader_for_npu " + setting.Bucket + " " + outputRemotePath + " " + workDir + "output/;"
  541. command += commandUpload
  542. } else if processorType == grampus.ProcessorTypeGPU {
  543. commandUpload := "cd " + workDir + setting.Grampus.SyncScriptProject + "/;./uploader_for_gpu " + setting.Grampus.Env + " " + outputRemotePath + " " + workDir + "output/;"
  544. command += commandUpload
  545. }
  546. //check exec result
  547. commandCheckRes := "bash -c \"[[ $result -eq 0 ]] && exit 0 || exit -1\""
  548. command += commandCheckRes
  549. return command, nil
  550. }
  // processPretrainModelParameter finishes the download command.
  //
  // When pretrainModelPath is not empty, the single-quoted model path and the
  // single-quoted file name are appended as two extra arguments. The result
  // always ends with ";".
  func processPretrainModelParameter(pretrainModelPath string, pretrainModelFileName string, commandDownload string) string {
  	if pretrainModelPath == "" {
  		return commandDownload + ";"
  	}
  	return commandDownload + " '" + pretrainModelPath + "' '" + pretrainModelFileName + "';"
  }
  // generateDatasetUnzipCommand returns the shell commands that unpack the
  // datasets listed in datasetName, a ';'-separated list of archive file names.
  //
  // A single dataset is unpacked in place: a ".tar.gz" archive with
  // "tar --strip-components=1 -zxvf", anything else with "unzip -q".
  // With several datasets each ".tar.gz" archive is unpacked with "tar -zxvf",
  // and every other archive is unzipped into a directory named after the file
  // with its ".zip" suffix removed. Every command ends with ";".
  func generateDatasetUnzipCommand(datasetName string) string {
  	const tarSuffix = ".tar.gz"

  	names := strings.Split(datasetName, ";")

  	// Single dataset.
  	if len(names) == 1 {
  		if strings.HasSuffix(names[0], tarSuffix) {
  			return "tar --strip-components=1 -zxvf '" + datasetName + "';"
  		}
  		return "unzip -q '" + datasetName + "';"
  	}

  	// Multiple datasets.
  	var script strings.Builder
  	for _, name := range names {
  		if strings.HasSuffix(name, tarSuffix) {
  			script.WriteString("tar -zxvf '" + name + "';")
  			continue
  		}
  		script.WriteString("unzip -q '" + name + "' -d './" + strings.TrimSuffix(name, ".zip") + "';")
  	}
  	return script.String()
  }
  578. func getPoolId() string {
  579. var resourcePools modelarts.ResourcePool
  580. json.Unmarshal([]byte(setting.ResourcePools), &resourcePools)
  581. return resourcePools.Info[0].ID
  582. }
  583. func PrepareSpec4Show(task *models.Cloudbrain) {
  584. s, err := resource.GetCloudbrainSpec(task.ID)
  585. if err != nil {
  586. log.Info("error:" + err.Error())
  587. return
  588. }
  589. task.Spec = s
  590. }
  591. func IsTaskNotStop(task *models.Cloudbrain) bool {
  592. statuses := CloudbrainOneNotFinalStatuses
  593. if task.Type == models.TypeCloudBrainTwo || task.Type == models.TypeCDCenter {
  594. statuses = CloudbrainTwoNotFinalStatuses
  595. } else {
  596. statuses = GrampusNotFinalStatuses
  597. }
  598. for _, status := range statuses {
  599. if task.Status == status {
  600. return true
  601. }
  602. }
  603. return false
  604. }
  605. func SyncTaskStatus(task *models.Cloudbrain) error {
  606. if task.Type == models.TypeCloudBrainOne {
  607. result, err := cloudbrain.GetJob(task.JobID)
  608. if err != nil {
  609. log.Info("error:" + err.Error())
  610. return fmt.Errorf("repo.cloudbrain_query_fail")
  611. }
  612. if result != nil {
  613. jobRes, _ := models.ConvertToJobResultPayload(result.Payload)
  614. taskRoles := jobRes.TaskRoles
  615. taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{}))
  616. oldStatus := task.Status
  617. task.Status = taskRes.TaskStatuses[0].State
  618. task.ContainerID = taskRes.TaskStatuses[0].ContainerID
  619. models.ParseAndSetDurationFromCloudBrainOne(jobRes, task)
  620. if task.DeletedAt.IsZero() { //normal record
  621. if oldStatus != task.Status {
  622. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  623. }
  624. err = models.UpdateJob(task)
  625. if err != nil {
  626. return fmt.Errorf("repo.cloudbrain_query_fail")
  627. }
  628. }
  629. } else {
  630. log.Info("error:" + err.Error())
  631. return fmt.Errorf("repo.cloudbrain_query_fail")
  632. }
  633. } else if task.Type == models.TypeCloudBrainTwo || task.Type == models.TypeCDCenter {
  634. err := modelarts.HandleTrainJobInfo(task)
  635. if err != nil {
  636. return fmt.Errorf("repo.cloudbrain_query_fail")
  637. }
  638. } else if task.Type == models.TypeC2Net {
  639. result, err := grampus.GetJob(task.JobID)
  640. if err != nil {
  641. log.Error("GetJob failed:" + err.Error())
  642. return fmt.Errorf("repo.cloudbrain_query_fail")
  643. }
  644. if result != nil {
  645. if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 {
  646. task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0]
  647. }
  648. oldStatus := task.Status
  649. task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
  650. if task.Status != result.JobInfo.Status || result.JobInfo.Status == models.GrampusStatusRunning {
  651. task.Duration = result.JobInfo.RunSec
  652. if task.Duration < 0 {
  653. task.Duration = 0
  654. }
  655. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  656. if task.StartTime == 0 && result.JobInfo.StartedAt > 0 {
  657. task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt)
  658. }
  659. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  660. task.EndTime = task.StartTime.Add(task.Duration)
  661. }
  662. task.CorrectCreateUnix()
  663. if oldStatus != task.Status {
  664. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  665. }
  666. err = models.UpdateJob(task)
  667. if err != nil {
  668. log.Error("UpdateJob failed:" + err.Error())
  669. return fmt.Errorf("repo.cloudbrain_query_fail")
  670. }
  671. }
  672. }
  673. }
  674. return nil
  675. }