You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ai_model_convert.go 28 kB

3 years ago
3 years ago
3 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862
  1. package repo
  2. import (
  3. "bufio"
  4. "encoding/json"
  5. "errors"
  6. "fmt"
  7. "io"
  8. "io/ioutil"
  9. "net/http"
  10. "os"
  11. "path"
  12. "strings"
  13. "code.gitea.io/gitea/models"
  14. "code.gitea.io/gitea/modules/cloudbrain"
  15. "code.gitea.io/gitea/modules/context"
  16. "code.gitea.io/gitea/modules/git"
  17. "code.gitea.io/gitea/modules/log"
  18. "code.gitea.io/gitea/modules/modelarts"
  19. "code.gitea.io/gitea/modules/setting"
  20. "code.gitea.io/gitea/modules/storage"
  21. "code.gitea.io/gitea/modules/timeutil"
  22. uuid "github.com/satori/go.uuid"
  23. )
// Template paths, engine identifiers, container mount points and conversion
// options shared by the model-convert handlers in this file.
const (
	// Templates rendered by the model-convert list page and detail page.
	tplModelManageConvertIndex = "repo/modelmanage/convertIndex"
	tplModelConvertInfo = "repo/modelmanage/convertshowinfo"
	// Source-engine identifiers; compared against AiModelConvert.SrcEngine.
	PYTORCH_ENGINE = 0
	TENSORFLOW_ENGINE = 1
	MINDSPORE_ENGINE = 2
	PADDLE_ENGINE = 4
	MXNET_ENGINE = 6
	// Mount points used for the volumes of the GPU convert job.
	ModelMountPath = "/model"
	CodeMountPath = "/code"
	DataSetMountPath = "/dataset"
	// LogFile is the suffix of the file the GPU convert command redirects its output to.
	LogFile = "log.txt"
	// DefaultBranchName is the branch cloned from the convert-script repository.
	DefaultBranchName = "master"
	// SubTaskName is the task-role name used when creating the GPU job.
	SubTaskName = "task1"
	//GpuQueue = "openidgx"
	// Success is the result code compared against the CreateJob response code.
	Success = "S000"
	//GPU_PYTORCH_IMAGE = "dockerhub.pcl.ac.cn:5000/user-images/openi:tensorRT_7_zouap"
	//GPU_TENSORFLOW_IMAGE = "dockerhub.pcl.ac.cn:5000/user-images/openi:tf2onnx"
	//NPU_MINDSPORE_16_IMAGE = "swr.cn-south-222.ai.pcl.cn/openi/mindspore1.6.1_train_v1_openi:v3_ascend"
	//PytorchOnnxBootFile = "convert_pytorch.py"
	//PytorchTrTBootFile = "convert_pytorch_tensorrt.py"
	//MindsporeBootFile = "convert_mindspore.py"
	//TensorFlowNpuBootFile = "convert_tensorflow.py"
	//TensorFlowGpuBootFile = "convert_tensorflow_gpu.py"
	//ConvertRepoPath = "https://openi.pcl.ac.cn/zouap/npu_test"
	// Destination formats; compared against AiModelConvert.DestFormat.
	CONVERT_FORMAT_ONNX = 0
	CONVERT_FORMAT_TRT = 1
	// Network output precision; FP16 makes the TensorRT command pass "--fp16 True".
	NetOutputFormat_FP32 = 0
	NetOutputFormat_FP16 = 1
	// Engine IDs passed as EngineID when creating the NPU train job.
	NPU_MINDSPORE_IMAGE_ID = 35
	NPU_TENSORFLOW_IMAGE_ID = 121
	//GPU_Resource_Specs_ID = 1 //cpu 1, gpu 1
	//NPU_FlavorCode = "modelarts.bm.910.arm.public.1"
	//NPU_PoolID = "pool7908321a"
)
var (
	// TrainResourceSpecs holds the resource specs decoded from
	// setting.TrainResourceSpecs; createGpuTrainJob fills it on first use.
	TrainResourceSpecs *models.ResourceSpecs
)
  62. func SaveModelConvert(ctx *context.Context) {
  63. log.Info("save model convert start.")
  64. if !ctx.Repo.CanWrite(models.UnitTypeModelManage) {
  65. ctx.JSON(200, map[string]string{
  66. "code": "1",
  67. "msg": ctx.Tr("repo.modelconvert.manage.no_operate_right"),
  68. })
  69. return
  70. }
  71. name := ctx.Query("name")
  72. desc := ctx.Query("desc")
  73. modelId := ctx.Query("modelId")
  74. modelPath := ctx.Query("modelFile")
  75. SrcEngine := ctx.QueryInt("srcEngine")
  76. InputShape := ctx.Query("inputshape")
  77. InputDataFormat := ctx.Query("inputdataformat")
  78. DestFormat := ctx.QueryInt("destFormat")
  79. NetOutputFormat := ctx.QueryInt("netOutputFormat")
  80. task, err := models.QueryModelById(modelId)
  81. if err != nil {
  82. log.Error("no such model!", err.Error())
  83. ctx.JSON(200, map[string]string{
  84. "code": "1",
  85. "msg": ctx.Tr("repo.modelconvert.manage.model_not_exist"),
  86. })
  87. return
  88. }
  89. convertList, err := models.QueryModelConvertByRepoID(ctx.Repo.Repository.ID)
  90. if err == nil {
  91. for _, convert := range convertList {
  92. if convert.Name == name {
  93. log.Info("convert.Name=" + name + " convert.id=" + convert.ID)
  94. ctx.JSON(200, map[string]string{
  95. "code": "1",
  96. "msg": ctx.Tr("repo.modelconvert.manage.create_error1"),
  97. })
  98. return
  99. }
  100. }
  101. }
  102. convertList, err = models.QueryModelConvertByUserID(ctx.User.ID)
  103. if err == nil {
  104. for _, convert := range convertList {
  105. if isRunningTask(convert.Status) {
  106. log.Info("convert.Status=" + convert.Status + " convert.id=" + convert.ID)
  107. ctx.JSON(200, map[string]string{
  108. "code": "1",
  109. "msg": ctx.Tr("repo.modelconvert.manage.create_error2"),
  110. })
  111. return
  112. }
  113. }
  114. }
  115. uuid := uuid.NewV4()
  116. id := uuid.String()
  117. modelConvert := &models.AiModelConvert{
  118. ID: id,
  119. Name: name,
  120. Description: desc,
  121. Status: string(models.JobWaiting),
  122. SrcEngine: SrcEngine,
  123. RepoId: ctx.Repo.Repository.ID,
  124. ModelName: task.Name,
  125. ModelVersion: task.Version,
  126. ModelId: modelId,
  127. ModelPath: modelPath,
  128. DestFormat: DestFormat,
  129. NetOutputFormat: NetOutputFormat,
  130. InputShape: InputShape,
  131. InputDataFormat: InputDataFormat,
  132. UserId: ctx.User.ID,
  133. }
  134. models.SaveModelConvert(modelConvert)
  135. go goCreateTask(modelConvert, ctx, task)
  136. ctx.JSON(200, map[string]string{
  137. "id": id,
  138. "code": "0",
  139. })
  140. }
  141. func isRunningTask(status string) bool {
  142. stopStatus := []string{"COMPLETED", "STOPPED", "FAILED", "START_FAILED", "STOPPING", "SUCCEEDED"}
  143. for _, sta := range stopStatus {
  144. if sta == status {
  145. return false
  146. }
  147. }
  148. return true
  149. }
  150. func goCreateTask(modelConvert *models.AiModelConvert, ctx *context.Context, task *models.AiModelManage) error {
  151. if modelConvert.IsGpuTrainTask() {
  152. log.Info("create gpu train job.")
  153. return createGpuTrainJob(modelConvert, ctx, task)
  154. } else {
  155. //create npu job
  156. log.Info("create npu train job.")
  157. return createNpuTrainJob(modelConvert, ctx, task.Path)
  158. }
  159. }
  160. func createNpuTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context, modelRelativePath string) error {
  161. VersionOutputPath := "V0001"
  162. codeLocalPath := setting.JobPath + modelConvert.ID + modelarts.CodePath
  163. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + modelConvert.ID + modelarts.CodePath
  164. outputObsPath := "/" + setting.Bucket + modelarts.JobPath + modelConvert.ID + modelarts.OutputPath + VersionOutputPath + "/"
  165. logObsPath := "/" + setting.Bucket + modelarts.JobPath + modelConvert.ID + modelarts.LogPath + VersionOutputPath + "/"
  166. dataPath := "/" + modelRelativePath
  167. _, err := ioutil.ReadDir(codeLocalPath)
  168. if err == nil {
  169. deleteLocalDir(codeLocalPath)
  170. }
  171. if err := downloadConvertCode(setting.ModelConvert.ConvertRepoPath, codeLocalPath, DefaultBranchName); err != nil {
  172. log.Error("downloadCode failed, server timed out: %s (%v)", setting.ModelConvert.ConvertRepoPath, err)
  173. return err
  174. }
  175. if err := obsMkdir(setting.CodePathPrefix + modelConvert.ID + modelarts.OutputPath + VersionOutputPath + "/"); err != nil {
  176. log.Error("Failed to obsMkdir_output: %s (%v)", modelConvert.ID+modelarts.OutputPath, err)
  177. return err
  178. }
  179. if err := obsMkdir(setting.CodePathPrefix + modelConvert.ID + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
  180. log.Error("Failed to obsMkdir_log: %s (%v)", modelConvert.ID+modelarts.LogPath, err)
  181. return err
  182. }
  183. if err := uploadCodeToObs(codeLocalPath, modelConvert.ID, ""); err != nil {
  184. log.Error("Failed to uploadCodeToObs: %s (%v)", modelConvert.ID, err)
  185. return err
  186. }
  187. deleteLocalDir(codeLocalPath)
  188. intputshape := strings.Split(modelConvert.InputShape, ",")
  189. n := "256"
  190. c := "1"
  191. h := "28"
  192. w := "28"
  193. if len(intputshape) == 4 {
  194. n = intputshape[0]
  195. c = intputshape[1]
  196. h = intputshape[2]
  197. w = intputshape[3]
  198. }
  199. var engineId int64
  200. engineId = int64(NPU_MINDSPORE_IMAGE_ID)
  201. bootfile := setting.ModelConvert.MindsporeBootFile
  202. if modelConvert.SrcEngine == TENSORFLOW_ENGINE {
  203. engineId = int64(NPU_TENSORFLOW_IMAGE_ID)
  204. bootfile = setting.ModelConvert.TensorFlowNpuBootFile
  205. }
  206. userCommand := "/bin/bash /home/work/run_train.sh 's3://" + codeObsPath + "' 'code/" + bootfile + "' '/tmp/log/train.log' --'data_url'='s3://" + dataPath + "' --'train_url'='s3://" + outputObsPath + "'"
  207. userCommand += " --'model'='" + modelConvert.ModelPath + "'"
  208. userCommand += " --'n'='" + fmt.Sprint(n) + "'"
  209. userCommand += " --'c'='" + fmt.Sprint(c) + "'"
  210. userCommand += " --'h'='" + fmt.Sprint(h) + "'"
  211. userCommand += " --'w'='" + fmt.Sprint(w) + "'"
  212. req := &modelarts.GenerateTrainJobReq{
  213. JobName: modelConvert.ID,
  214. DisplayJobName: modelConvert.Name,
  215. DataUrl: dataPath,
  216. Description: modelConvert.Description,
  217. CodeObsPath: codeObsPath,
  218. BootFileUrl: codeObsPath + bootfile,
  219. BootFile: bootfile,
  220. TrainUrl: outputObsPath,
  221. FlavorCode: setting.ModelConvert.NPU_FlavorCode,
  222. WorkServerNumber: 1,
  223. IsLatestVersion: modelarts.IsLatestVersion,
  224. EngineID: engineId,
  225. LogUrl: logObsPath,
  226. PoolID: setting.ModelConvert.NPU_PoolID,
  227. //Parameters: param,
  228. BranchName: DefaultBranchName,
  229. UserImageUrl: setting.ModelConvert.NPU_MINDSPORE_16_IMAGE,
  230. UserCommand: userCommand,
  231. }
  232. result, err := modelarts.GenerateModelConvertTrainJob(req)
  233. if err == nil {
  234. log.Info("jobId=" + fmt.Sprint(result.JobID) + " versionid=" + fmt.Sprint(result.VersionID))
  235. models.UpdateModelConvertModelArts(modelConvert.ID, fmt.Sprint(result.JobID), fmt.Sprint(result.VersionID))
  236. } else {
  237. log.Info("create modelarts taks failed.error=" + err.Error())
  238. models.UpdateModelConvertFailed(modelConvert.ID, "FAILED", err.Error())
  239. }
  240. return err
  241. }
  242. func downloadConvertCode(repopath string, codePath, branchName string) error {
  243. //add "file:///" prefix to make the depth valid
  244. if err := git.Clone(repopath, codePath, git.CloneRepoOptions{Branch: branchName, Depth: 1}); err != nil {
  245. log.Error("Failed to clone repository: %s (%v)", repopath, err)
  246. return err
  247. }
  248. log.Info("srcPath=" + repopath + " codePath=" + codePath)
  249. configFile, err := os.OpenFile(codePath+"/.git/config", os.O_RDWR, 0666)
  250. if err != nil {
  251. log.Error("open file(%s) failed:%v", codePath+"/,git/config", err)
  252. return err
  253. }
  254. defer configFile.Close()
  255. pos := int64(0)
  256. reader := bufio.NewReader(configFile)
  257. for {
  258. line, err := reader.ReadString('\n')
  259. if err != nil {
  260. if err == io.EOF {
  261. log.Error("not find the remote-url")
  262. return nil
  263. } else {
  264. log.Error("read error: %v", err)
  265. return err
  266. }
  267. }
  268. if strings.Contains(line, "url") && strings.Contains(line, ".git") {
  269. originUrl := "\turl = " + repopath + "\n"
  270. if len(line) > len(originUrl) {
  271. originUrl += strings.Repeat(" ", len(line)-len(originUrl))
  272. }
  273. bytes := []byte(originUrl)
  274. _, err := configFile.WriteAt(bytes, pos)
  275. if err != nil {
  276. log.Error("WriteAt failed:%v", err)
  277. return err
  278. }
  279. break
  280. }
  281. pos += int64(len(line))
  282. }
  283. return nil
  284. }
  285. func downloadFromObsToLocal(task *models.AiModelManage, localPath string) error {
  286. path := Model_prefix + models.AttachmentRelativePath(task.ID) + "/"
  287. allFile, err := storage.GetAllObjectByBucketAndPrefix(setting.Bucket, path)
  288. if err == nil {
  289. _, errState := os.Stat(localPath)
  290. if errState != nil {
  291. if err = os.MkdirAll(localPath, os.ModePerm); err != nil {
  292. return err
  293. }
  294. }
  295. for _, oneFile := range allFile {
  296. if oneFile.IsDir {
  297. log.Info(" dir name:" + oneFile.FileName)
  298. } else {
  299. allFileName := localPath + "/" + oneFile.FileName
  300. index := strings.LastIndex(allFileName, "/")
  301. if index != -1 {
  302. parentDir := allFileName[0:index]
  303. if err = os.MkdirAll(parentDir, os.ModePerm); err != nil {
  304. log.Info("make dir may be error," + err.Error())
  305. }
  306. }
  307. fDest, err := os.Create(allFileName)
  308. if err != nil {
  309. log.Info("create file error, download file failed: %s\n", err.Error())
  310. return err
  311. }
  312. body, err := storage.ObsDownloadAFile(setting.Bucket, path+oneFile.FileName)
  313. if err != nil {
  314. log.Info("download file failed: %s\n", err.Error())
  315. return err
  316. } else {
  317. defer body.Close()
  318. p := make([]byte, 1024)
  319. var readErr error
  320. var readCount int
  321. // 读取对象内容
  322. for {
  323. readCount, readErr = body.Read(p)
  324. if readCount > 0 {
  325. fDest.Write(p[:readCount])
  326. }
  327. if readErr != nil {
  328. break
  329. }
  330. }
  331. }
  332. }
  333. }
  334. } else {
  335. log.Info("error,msg=" + err.Error())
  336. return err
  337. }
  338. return nil
  339. }
  340. func createGpuTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context, model *models.AiModelManage) error {
  341. modelRelativePath := model.Path
  342. command := ""
  343. IMAGE_URL := setting.ModelConvert.GPU_PYTORCH_IMAGE
  344. dataActualPath := setting.Attachment.Minio.RealPath + modelRelativePath
  345. if modelConvert.SrcEngine == PYTORCH_ENGINE {
  346. if modelConvert.DestFormat == CONVERT_FORMAT_ONNX {
  347. command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, setting.ModelConvert.PytorchOnnxBootFile)
  348. } else if modelConvert.DestFormat == CONVERT_FORMAT_TRT {
  349. command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, setting.ModelConvert.PytorchTrTBootFile)
  350. } else {
  351. return errors.New("Not support the format.")
  352. }
  353. } else if modelConvert.SrcEngine == TENSORFLOW_ENGINE {
  354. IMAGE_URL = setting.ModelConvert.GPU_TENSORFLOW_IMAGE
  355. if modelConvert.DestFormat == CONVERT_FORMAT_ONNX {
  356. command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, setting.ModelConvert.TensorFlowGpuBootFile)
  357. } else {
  358. return errors.New("Not support the format.")
  359. }
  360. //如果模型在OBS上,需要下载到本地,并上传到minio中
  361. if model.Type == models.TypeCloudBrainTwo {
  362. relatetiveModelPath := setting.JobPath + modelConvert.ID + "/dataset"
  363. log.Info("local dataset path:" + relatetiveModelPath)
  364. downloadFromObsToLocal(model, relatetiveModelPath)
  365. uploadCodeToMinio(relatetiveModelPath+"/", modelConvert.ID, "/dataset/")
  366. deleteLocalDir(relatetiveModelPath)
  367. dataActualPath = setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + modelConvert.ID + "/dataset"
  368. }
  369. } else if modelConvert.SrcEngine == PADDLE_ENGINE {
  370. IMAGE_URL = setting.ModelConvert.GPU_PADDLE_IMAGE
  371. if modelConvert.DestFormat == CONVERT_FORMAT_ONNX {
  372. command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, setting.ModelConvert.PaddleOnnxBootFile)
  373. } else {
  374. return errors.New("Not support the format.")
  375. }
  376. } else if modelConvert.SrcEngine == MXNET_ENGINE {
  377. IMAGE_URL = setting.ModelConvert.GPU_MXNET_IMAGE
  378. if modelConvert.DestFormat == CONVERT_FORMAT_ONNX {
  379. command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, setting.ModelConvert.MXnetOnnxBootFile)
  380. } else {
  381. return errors.New("Not support the format.")
  382. }
  383. }
  384. log.Info("dataActualPath=" + dataActualPath)
  385. log.Info("command=" + command)
  386. codePath := setting.JobPath + modelConvert.ID + CodeMountPath
  387. downloadConvertCode(setting.ModelConvert.ConvertRepoPath, codePath, DefaultBranchName)
  388. uploadCodeToMinio(codePath+"/", modelConvert.ID, CodeMountPath+"/")
  389. deleteLocalDir(codePath)
  390. minioCodePath := setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + modelConvert.ID + "/code"
  391. log.Info("minio codePath=" + minioCodePath)
  392. modelPath := setting.JobPath + modelConvert.ID + ModelMountPath + "/"
  393. log.Info("local modelPath=" + modelPath)
  394. mkModelPath(modelPath)
  395. uploadCodeToMinio(modelPath, modelConvert.ID, ModelMountPath+"/")
  396. deleteLocalDir(modelPath)
  397. minioModelPath := setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.CBCodePathPrefix + modelConvert.ID + "/model"
  398. log.Info("minio model path=" + minioModelPath)
  399. if TrainResourceSpecs == nil {
  400. json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs)
  401. }
  402. resourceSpec := TrainResourceSpecs.ResourceSpec[setting.ModelConvert.GPU_Resource_Specs_ID]
  403. jobResult, err := cloudbrain.CreateJob(modelConvert.ID, models.CreateJobParams{
  404. JobName: modelConvert.ID,
  405. RetryCount: 1,
  406. GpuType: setting.ModelConvert.GpuQueue,
  407. Image: IMAGE_URL,
  408. TaskRoles: []models.TaskRole{
  409. {
  410. Name: SubTaskName,
  411. TaskNumber: 1,
  412. MinSucceededTaskCount: 1,
  413. MinFailedTaskCount: 1,
  414. CPUNumber: resourceSpec.CpuNum,
  415. GPUNumber: resourceSpec.GpuNum,
  416. MemoryMB: resourceSpec.MemMiB,
  417. ShmMB: resourceSpec.ShareMemMiB,
  418. Command: command,
  419. NeedIBDevice: false,
  420. IsMainRole: false,
  421. UseNNI: false,
  422. },
  423. },
  424. Volumes: []models.Volume{
  425. {
  426. HostPath: models.StHostPath{
  427. Path: minioCodePath,
  428. MountPath: CodeMountPath,
  429. ReadOnly: false,
  430. },
  431. },
  432. {
  433. HostPath: models.StHostPath{
  434. Path: dataActualPath,
  435. MountPath: DataSetMountPath,
  436. ReadOnly: true,
  437. },
  438. },
  439. {
  440. HostPath: models.StHostPath{
  441. Path: minioModelPath,
  442. MountPath: ModelMountPath,
  443. ReadOnly: false,
  444. },
  445. },
  446. },
  447. })
  448. if err != nil {
  449. log.Error("CreateJob failed:", err.Error(), ctx.Data["MsgID"])
  450. models.UpdateModelConvertFailed(modelConvert.ID, "FAILED", err.Error())
  451. return err
  452. }
  453. if jobResult.Code != Success {
  454. log.Error("CreateJob(%s) failed:%s", modelConvert.ID, jobResult.Msg, ctx.Data["MsgID"])
  455. models.UpdateModelConvertFailed(modelConvert.ID, "FAILED", err.Error())
  456. return errors.New(jobResult.Msg)
  457. }
  458. var jobID = jobResult.Payload["jobId"].(string)
  459. log.Info("jobId=" + jobID)
  460. models.UpdateModelConvertCBTI(modelConvert.ID, jobID)
  461. return nil
  462. }
  463. func deleteLocalDir(dirpath string) {
  464. //TODO delete
  465. _err := os.RemoveAll(dirpath)
  466. if _err == nil {
  467. log.Info("Delete local file:" + dirpath)
  468. } else {
  469. log.Info("Delete local file error: path=" + dirpath)
  470. }
  471. }
  472. func getGpuModelConvertCommand(name string, modelFile string, modelConvert *models.AiModelConvert, bootfile string) string {
  473. var command string
  474. inputshape := strings.Split(modelConvert.InputShape, ",")
  475. n := "256"
  476. c := "1"
  477. h := "28"
  478. w := "28"
  479. if len(inputshape) == 4 {
  480. n = inputshape[0]
  481. c = inputshape[1]
  482. h = inputshape[2]
  483. w = inputshape[3]
  484. }
  485. command += "python3 /code/" + bootfile + " --model " + modelFile + " --n " + n + " --c " + c + " --h " + h + " --w " + w
  486. if modelConvert.DestFormat == CONVERT_FORMAT_TRT {
  487. if modelConvert.NetOutputFormat == NetOutputFormat_FP16 {
  488. command += " --fp16 True"
  489. } else {
  490. command += " --fp16 False"
  491. }
  492. }
  493. command += " > " + ModelMountPath + "/" + name + "-" + LogFile
  494. return command
  495. }
  496. func DeleteModelConvert(ctx *context.Context) {
  497. log.Info("delete model convert start.")
  498. id := ctx.Params(":id")
  499. task, err := models.QueryModelConvertById(id)
  500. if err == nil {
  501. go deleteCloudBrainTask(task)
  502. }
  503. err = models.DeleteModelConvertById(id)
  504. //TODO delete OBS文件及云脑任务
  505. if err != nil {
  506. ctx.JSON(500, err.Error())
  507. } else {
  508. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelmanage/convert_model")
  509. }
  510. }
  511. func deleteCloudBrainTask(task *models.AiModelConvert) {
  512. if task.IsGpuTrainTask() {
  513. log.Info("delete cloudbrain one resource.")
  514. dirPath := setting.CBCodePathPrefix + task.ID + "/"
  515. err := storage.Attachments.DeleteDir(dirPath)
  516. if err != nil {
  517. log.Error("DeleteDir(%s) failed:%v", dirPath, err)
  518. }
  519. } else {
  520. log.Info("delete cloudbrain two resource.")
  521. _, err := modelarts.DelTrainJob(task.CloudBrainTaskId)
  522. if err != nil {
  523. log.Error("DelTrainJob(%s) failed:%v", task.CloudBrainTaskId, err.Error())
  524. }
  525. DeleteJobStorage(task.ID)
  526. }
  527. }
  528. func stopModelConvert(id string) error {
  529. job, err := models.QueryModelConvertById(id)
  530. if err != nil {
  531. return err
  532. }
  533. if job.IsGpuTrainTask() {
  534. err = cloudbrain.StopJob(job.CloudBrainTaskId)
  535. if err != nil {
  536. log.Error("Stop cloudbrain Job(%s) failed:%v", job.CloudBrainTaskId, err)
  537. }
  538. } else {
  539. _, err = modelarts.StopTrainJob(job.CloudBrainTaskId, job.ModelArtsVersionId)
  540. if err != nil {
  541. log.Error("Stop modelarts Job(%s) failed:%v", job.CloudBrainTaskId, err)
  542. }
  543. }
  544. job.Status = string(models.JobStopped)
  545. if job.EndTime == 0 {
  546. job.EndTime = timeutil.TimeStampNow()
  547. }
  548. models.ModelConvertSetDuration(job)
  549. err = models.UpdateModelConvert(job)
  550. if err != nil {
  551. log.Error("UpdateModelConvert failed:", err)
  552. return err
  553. }
  554. return nil
  555. }
  556. func StopModelConvertApi(ctx *context.Context) {
  557. id := ctx.Query("id")
  558. log.Info("stop model convert start.id=" + id)
  559. err := stopModelConvert(id)
  560. if err == nil {
  561. ctx.JSON(200, map[string]string{
  562. "code": "0",
  563. "msg": "succeed",
  564. })
  565. } else {
  566. ctx.JSON(200, map[string]string{
  567. "code": "1",
  568. "msg": err.Error(),
  569. })
  570. }
  571. }
  572. func StopModelConvert(ctx *context.Context) {
  573. id := ctx.Params(":id")
  574. log.Info("stop model convert start.id=" + id)
  575. err := stopModelConvert(id)
  576. if err != nil {
  577. ctx.ServerError("Not found task.", err)
  578. return
  579. }
  580. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelmanage/convert_model")
  581. }
  582. func ShowModelConvertInfo(ctx *context.Context) {
  583. ctx.Data["ID"] = ctx.Query("id")
  584. ctx.Data["isModelManage"] = true
  585. ctx.Data["ModelManageAccess"] = ctx.Repo.CanWrite(models.UnitTypeModelManage)
  586. job, err := models.QueryModelConvertById(ctx.Query("id"))
  587. if err == nil {
  588. if job.TrainJobDuration == "" {
  589. job.TrainJobDuration = "00:00:00"
  590. }
  591. ctx.Data["task"] = job
  592. } else {
  593. ctx.ServerError("Not found task.", err)
  594. return
  595. }
  596. ctx.Data["Name"] = job.Name
  597. ctx.Data["canDownload"] = isOperModifyOrDelete(ctx, job.UserId)
  598. user, err := models.GetUserByID(job.UserId)
  599. if err == nil {
  600. job.UserName = user.Name
  601. job.UserRelAvatarLink = user.RelAvatarLink()
  602. }
  603. if job.IsGpuTrainTask() {
  604. ctx.Data["npu_display"] = "none"
  605. ctx.Data["gpu_display"] = "block"
  606. if job.CloudBrainTaskId == "" {
  607. ctx.Data["ExitDiagnostics"] = ""
  608. ctx.Data["AppExitDiagnostics"] = ""
  609. ctx.HTML(200, tplModelConvertInfo)
  610. return
  611. }
  612. result, err := cloudbrain.GetJob(job.CloudBrainTaskId)
  613. if err != nil {
  614. log.Info("error:" + err.Error())
  615. ctx.Data["error"] = err.Error()
  616. ctx.HTML(200, tplModelConvertInfo)
  617. return
  618. }
  619. if result != nil {
  620. jobRes, _ := models.ConvertToJobResultPayload(result.Payload)
  621. ctx.Data["result"] = jobRes
  622. taskRoles := jobRes.TaskRoles
  623. taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{}))
  624. ctx.Data["taskRes"] = taskRes
  625. ctx.Data["ExitDiagnostics"] = taskRes.TaskStatuses[0].ExitDiagnostics
  626. ctx.Data["AppExitDiagnostics"] = jobRes.JobStatus.AppExitDiagnostics
  627. job.Status = jobRes.JobStatus.State
  628. if jobRes.JobStatus.State != string(models.JobWaiting) && jobRes.JobStatus.State != string(models.JobFailed) {
  629. job.ContainerIp = taskRes.TaskStatuses[0].ContainerIP
  630. job.ContainerID = taskRes.TaskStatuses[0].ContainerID
  631. job.Status = taskRes.TaskStatuses[0].State
  632. }
  633. if jobRes.JobStatus.State != string(models.JobWaiting) {
  634. models.ModelComputeAndSetDuration(job, jobRes)
  635. err = models.UpdateModelConvert(job)
  636. if err != nil {
  637. log.Error("UpdateModelConvert failed:", err)
  638. }
  639. }
  640. }
  641. } else {
  642. if job.CloudBrainTaskId != "" {
  643. result, err := modelarts.GetTrainJob(job.CloudBrainTaskId, job.ModelArtsVersionId)
  644. if err != nil {
  645. log.Info("error:" + err.Error())
  646. ctx.Data["error"] = err.Error()
  647. return
  648. }
  649. job.Status = modelarts.TransTrainJobStatus(result.IntStatus)
  650. job.RunTime = result.Duration / 1000
  651. job.TrainJobDuration = models.ConvertDurationToStr(job.RunTime)
  652. err = models.UpdateModelConvert(job)
  653. if err != nil {
  654. log.Error("UpdateJob failed:", err)
  655. }
  656. }
  657. ctx.Data["npu_display"] = "block"
  658. ctx.Data["gpu_display"] = "none"
  659. ctx.Data["ExitDiagnostics"] = ""
  660. ctx.Data["AppExitDiagnostics"] = ""
  661. }
  662. ctx.HTML(200, tplModelConvertInfo)
  663. }
  664. func ConvertModelTemplate(ctx *context.Context) {
  665. ctx.Data["isModelManage"] = true
  666. ctx.Data["TRAIN_COUNT"] = 0
  667. SetModelCount(ctx)
  668. ctx.Data["ModelManageAccess"] = ctx.Repo.CanWrite(models.UnitTypeModelManage)
  669. ShowModelConvertPageInfo(ctx)
  670. ctx.HTML(200, tplModelManageConvertIndex)
  671. }
  672. func ShowModelConvertPageInfo(ctx *context.Context) {
  673. log.Info("ShowModelConvertInfo start.")
  674. if !isQueryRight(ctx) {
  675. log.Info("no right.")
  676. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  677. return
  678. }
  679. page := ctx.QueryInt("page")
  680. if page <= 0 {
  681. page = 1
  682. }
  683. pageSize := ctx.QueryInt("pageSize")
  684. if pageSize <= 0 {
  685. pageSize = setting.UI.IssuePagingNum
  686. }
  687. modelResult, count, err := GetModelConvertPageData(ctx)
  688. if err == nil {
  689. pager := context.NewPagination(int(count), page, pageSize, 5)
  690. ctx.Data["Page"] = pager
  691. ctx.Data["Tasks"] = modelResult
  692. ctx.Data["MODEL_CONVERT_COUNT"] = count
  693. } else {
  694. ctx.ServerError("Query data error.", err)
  695. }
  696. }
  697. func GetModelConvertById(ctx *context.Context) (*models.AiModelConvert, error) {
  698. id := ctx.Query("id")
  699. return models.QueryModelConvertById(id)
  700. }
  701. func GetModelConvertByName(ctx *context.Context) ([]*models.AiModelConvert, error) {
  702. name := ctx.Query("name")
  703. return models.QueryModelConvertByName(name, ctx.Repo.Repository.ID)
  704. }
  705. func GetModelConvertPageData(ctx *context.Context) ([]*models.AiModelConvert, int64, error) {
  706. page := ctx.QueryInt("page")
  707. if page <= 0 {
  708. page = 1
  709. }
  710. pageSize := ctx.QueryInt("pageSize")
  711. if pageSize <= 0 {
  712. pageSize = setting.UI.IssuePagingNum
  713. }
  714. repoId := ctx.Repo.Repository.ID
  715. modelResult, count, err := models.QueryModelConvert(&models.AiModelQueryOptions{
  716. ListOptions: models.ListOptions{
  717. Page: page,
  718. PageSize: pageSize,
  719. },
  720. RepoID: repoId,
  721. })
  722. if err != nil {
  723. log.Info("query db error." + err.Error())
  724. return nil, 0, err
  725. }
  726. userIds := make([]int64, len(modelResult))
  727. for i, model := range modelResult {
  728. model.IsCanOper = isOperModifyOrDelete(ctx, model.UserId)
  729. model.IsCanDelete = isCanDelete(ctx, model.UserId)
  730. userIds[i] = model.UserId
  731. }
  732. userNameMap := queryUserName(userIds)
  733. for _, model := range modelResult {
  734. value := userNameMap[model.UserId]
  735. if value != nil {
  736. model.UserName = value.Name
  737. model.UserRelAvatarLink = value.RelAvatarLink()
  738. }
  739. }
  740. return modelResult, count, nil
  741. }
  742. func ModelConvertDownloadModel(ctx *context.Context) {
  743. log.Info("enter here......")
  744. id := ctx.Params(":id")
  745. job, err := models.QueryModelConvertById(id)
  746. if err != nil {
  747. ctx.ServerError("Not found task.", err)
  748. return
  749. }
  750. AllDownload := ctx.QueryBool("allDownload")
  751. if AllDownload {
  752. if job.IsGpuTrainTask() {
  753. path := setting.CBCodePathPrefix + job.ID + "/model/"
  754. allFile, err := storage.GetAllObjectByBucketAndPrefixMinio(setting.Attachment.Minio.Bucket, path)
  755. if err == nil {
  756. returnFileName := job.Name + ".zip"
  757. MinioDownloadManyFile(path, ctx, returnFileName, allFile)
  758. } else {
  759. log.Info("error,msg=" + err.Error())
  760. ctx.ServerError("no file to download.", err)
  761. }
  762. } else {
  763. Prefix := path.Join(setting.TrainJobModelPath, job.ID, "output/", "V0001", "") + "/"
  764. log.Info("bucket=" + setting.Bucket + "prefix=" + Prefix)
  765. allFile, err := storage.GetAllObjectByBucketAndPrefix(setting.Bucket, Prefix)
  766. if err == nil {
  767. returnFileName := job.Name + ".zip"
  768. ObsDownloadManyFile(Prefix, ctx, returnFileName, allFile)
  769. } else {
  770. log.Info("error,msg=" + err.Error())
  771. ctx.ServerError("no file to download.", err)
  772. }
  773. }
  774. } else {
  775. parentDir := ctx.Query("parentDir")
  776. fileName := ctx.Query("fileName")
  777. jobName := ctx.Query("jobName")
  778. if job.IsGpuTrainTask() {
  779. filePath := "jobs/" + jobName + "/model/" + parentDir
  780. url, err := storage.Attachments.PresignedGetURL(filePath, fileName)
  781. if err != nil {
  782. log.Error("PresignedGetURL failed: %v", err.Error(), ctx.Data["msgID"])
  783. ctx.ServerError("PresignedGetURL", err)
  784. return
  785. }
  786. //ctx.JSON(200, url)
  787. http.Redirect(ctx.Resp, ctx.Req.Request, url, http.StatusTemporaryRedirect)
  788. } else {
  789. ObjectKey := path.Join(setting.TrainJobModelPath, job.ID, "output/", "V0001", parentDir, fileName)
  790. log.Info("ObjectKey=" + ObjectKey)
  791. url, err := storage.GetObsCreateSignedUrlByBucketAndKey(setting.Bucket, ObjectKey)
  792. if err != nil {
  793. log.Error("GetObsCreateSignedUrl failed: %v", err.Error(), ctx.Data["msgID"])
  794. ctx.ServerError("GetObsCreateSignedUrl", err)
  795. return
  796. }
  797. http.Redirect(ctx.Resp, ctx.Req.Request, url, http.StatusTemporaryRedirect)
  798. }
  799. }
  800. }