You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

modelarts.go 5.7 kB

4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238
  1. package modelarts
  2. import (
  3. "path"
  4. "strconv"
  5. "code.gitea.io/gitea/models"
  6. "code.gitea.io/gitea/modules/context"
  7. "code.gitea.io/gitea/modules/log"
  8. "code.gitea.io/gitea/modules/setting"
  9. )
  // Fixed settings applied to every notebook created by GenerateTask.
  const (
  	storageTypeOBS = "obs"
  	// 4 * 60 * 60 = 14400; presumably seconds, i.e. four hours — TODO confirm
  	// the unit expected by the notebook auto-stop setting.
  	autoStopDuration = 4 * 60 * 60
  	flavor           = "modelarts.kat1.xlarge"
  	// Commented-out alternative value kept from the original source:
  	// profileID = "Python3-ascend910-arm"
  	profileID = "efa847c0-7359-11eb-b34f-0255ac100057"
  	poolID    = "pool1328035d"
  	poolName  = "train-private-1"
  	poolType  = "USER_DEFINED"
  )

  // Exported notebook descriptors. They are not referenced by the functions
  // in this part of the file; presumably they are consumed by callers for
  // display — verify against usage.
  const (
  	DataSetMountPath = "/home/ma-user/work"
  	NotebookEnv      = "Python3"
  	NotebookType     = "Ascend"
  	FlavorInfo       = "Ascend: 1*Ascend 910 CPU: 24 核 96GiB (modelarts.kat1.xlarge)"
  )

  // JSON documents listing the selectable train-job options. Their top-level
  // keys ("resource_pool", "engine", "version", "flavor") are the same keys
  // used by the json tags of ResourcePool, Engine, VersionInfo and Flavor.
  const (
  	ResourcePools = `{"resource_pool":[{"id":"pool1328035d", "value":"专属资源池"}]}`

  	Engines = `{"engine":[{"id":1, "value":"Ascend-Powered-Engine"}]}`

  	EngineVersions = `{"version":[` +
  		`{"id":118,"value":"MindSpore-1.0.0-c75-python3.7-euleros2.8-aarch64"},` +
  		`{"id":119,"value":"MindSpore-1.1.1-c76-python3.7-euleros2.8-aarch64"},` +
  		`{"id":120,"value":"MindSpore-1.1.1-c76-tr5-python3.7-euleros2.8-aarch64"},` +
  		`{"id":117,"value":"TF-1.15-c75-python3.7-euleros2.8-aarch64"}` +
  		`]}`

  	FlavorInfos = `{"flavor":[` +
  		`{"code":"modelarts.bm.910.arm.public.2","value":"Ascend : 2 * Ascend 910 CPU:48 核 512GiB"},` +
  		`{"code":"modelarts.bm.910.arm.public.8","value":"Ascend : 8 * Ascend 910 CPU:192 核 2048GiB"},` +
  		`{"code":"modelarts.bm.910.arm.public.4","value":"Ascend : 4 * Ascend 910 CPU:96 核 1024GiB"},` +
  		`{"code":"modelarts.bm.910.arm.public.1","value":"Ascend : 1 * Ascend 910 CPU:24 核 256GiB"}` +
  		`]}`
  )

  // Path segments and sort-order keywords for train jobs.
  const (
  	CodePath   = "/code/"
  	OutputPath = "/output/"
  	LogPath    = "/log/"
  	JobPath    = "/job/"

  	OrderDesc = "desc"
  	OrderAsc  = "asc"
  )
  // GenerateTrainJobReq bundles the caller-supplied settings that
  // GenerateTrainJob forwards to createTrainJob.
  type GenerateTrainJobReq struct {
  	// JobName and Description are passed through unchanged; JobName is also
  	// stored on the Cloudbrain record. Uuid is not read by GenerateTrainJob.
  	JobName, Uuid, Description string

  	// Sent as Config.AppUrl, Config.BootFileUrl, Config.DataUrl and
  	// Config.TrainUrl respectively.
  	CodeObsPath, BootFile, DataUrl, TrainUrl string

  	// Sent as Config.Flavor.Code, Config.LogUrl and Config.PoolID respectively.
  	FlavorCode, LogUrl, PoolID string

  	// WorkServerNumber is sent as Config.WorkServerNum.
  	WorkServerNumber int
  	// EngineID is sent as Config.EngineID.
  	EngineID int64
  }
  // VersionInfo is the decoded form of a JSON document whose top-level
  // "version" key holds a list of {"id": <int>, "value": <string>} entries,
  // which is the layout of the EngineVersions constant.
  type VersionInfo struct {
  	Version []struct {
  		ID    int    `json:"id"`    // numeric identifier of the entry
  		Value string `json:"value"` // text associated with that identifier
  	} `json:"version"`
  }
  // Flavor is the decoded form of a JSON document whose top-level "flavor"
  // key holds a list of {"code": <string>, "value": <string>} entries, which
  // is the layout of the FlavorInfos constant.
  type Flavor struct {
  	Info []struct {
  		Code  string `json:"code"`  // flavor code of the entry
  		Value string `json:"value"` // text associated with that code
  	} `json:"flavor"`
  }
  // Engine is the decoded form of a JSON document whose top-level "engine"
  // key holds a list of {"id": <int>, "value": <string>} entries, which is
  // the layout of the Engines constant.
  type Engine struct {
  	Info []struct {
  		ID    int    `json:"id"`    // numeric identifier of the entry
  		Value string `json:"value"` // text associated with that identifier
  	} `json:"engine"`
  }
  // ResourcePool is the decoded form of a JSON document whose top-level
  // "resource_pool" key holds a list of {"id": <string>, "value": <string>}
  // entries, which is the layout of the ResourcePools constant.
  type ResourcePool struct {
  	Info []struct {
  		ID    string `json:"id"`    // identifier of the entry
  		Value string `json:"value"` // text associated with that identifier
  	} `json:"resource_pool"`
  }
  82. func GenerateTask(ctx *context.Context, jobName, uuid, description string) error {
  83. dataActualPath := setting.Bucket + "/" + setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/"
  84. jobResult, err := createNotebook(models.CreateNotebookParams{
  85. JobName: jobName,
  86. Description:description,
  87. ProfileID: profileID,
  88. Flavor: flavor,
  89. Pool: models.Pool{
  90. ID: poolID,
  91. Name: poolName,
  92. Type: poolType,
  93. },
  94. Spec: models.Spec{
  95. Storage: models.Storage{
  96. Type: storageTypeOBS,
  97. Location:models.Location{
  98. Path: dataActualPath,
  99. },
  100. },
  101. AutoStop: models.AutoStop{
  102. Enable: true,
  103. Duration: autoStopDuration,
  104. },
  105. },
  106. })
  107. if err != nil {
  108. log.Error("CreateJob failed: %v", err.Error())
  109. return err
  110. }
  111. err = models.CreateCloudbrain(&models.Cloudbrain{
  112. Status: string(models.JobWaiting),
  113. UserID: ctx.User.ID,
  114. RepoID: ctx.Repo.Repository.ID,
  115. JobID: jobResult.ID,
  116. JobName: jobName,
  117. JobType: string(models.JobTypeDebug),
  118. Type: models.TypeCloudBrainNotebook,
  119. })
  120. if err != nil {
  121. return err
  122. }
  123. return nil
  124. }
  125. func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) error {
  126. jobResult, err := createTrainJob(models.CreateTrainJobParams{
  127. JobName: req.JobName,
  128. Description: req.Description,
  129. Config: models.Config{
  130. WorkServerNum: req.WorkServerNumber,
  131. AppUrl: req.CodeObsPath,
  132. BootFileUrl: req.BootFile,
  133. DataUrl: req.DataUrl,
  134. EngineID: req.EngineID,
  135. TrainUrl: req.TrainUrl,
  136. LogUrl: req.LogUrl,
  137. PoolID: req.PoolID,
  138. CreateVersion: true,
  139. Flavor: models.Flavor{
  140. Code: req.FlavorCode,
  141. },
  142. },
  143. })
  144. if err != nil {
  145. log.Error("CreateJob failed: %v", err.Error())
  146. return err
  147. }
  148. err = models.CreateCloudbrain(&models.Cloudbrain{
  149. Status: TransTrainJobStatus(jobResult.Status),
  150. UserID: ctx.User.ID,
  151. RepoID: ctx.Repo.Repository.ID,
  152. JobID: strconv.FormatInt(jobResult.JobID, 10),
  153. JobName: req.JobName,
  154. JobType: string(models.JobTypeDebug),
  155. Type: models.TypeCloudBrainTrainJob,
  156. VersionID: jobResult.VersionID,
  157. VersionName: jobResult.VersionName,
  158. })
  159. if err != nil {
  160. log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, err.Error())
  161. return err
  162. }
  163. return nil
  164. }
  // TransTrainJobStatus translates the numeric status code of a train job
  // into the status string that GenerateTrainJob stores on the Cloudbrain
  // record.
  //
  // Codes 0 through 21 map to fixed names (for example 8 becomes "RUNNING").
  // Any other value, including negative ones, is returned in decimal form so
  // that an unrecognised code stays visible instead of being dropped.
  func TransTrainJobStatus(status int) string {
  	switch status {
  	case 0:
  		return "UNKNOWN"
  	case 1:
  		return "INIT"
  	case 2:
  		return "IMAGE_CREATING"
  	case 3:
  		return "IMAGE_FAILED"
  	case 4:
  		return "SUBMIT_TRYING"
  	case 5:
  		return "SUBMIT_FAILED"
  	case 6:
  		return "DELETE_FAILED"
  	case 7:
  		return "WAITING"
  	case 8:
  		return "RUNNING"
  	case 9:
  		return "KILLING"
  	case 10:
  		return "COMPLETED"
  	case 11:
  		return "FAILED"
  	case 12:
  		return "KILLED"
  	case 13:
  		return "CANCELED"
  	case 14:
  		return "LOST"
  	case 15:
  		return "SCALING"
  	case 16:
  		return "SUBMIT_MODEL_FAILED"
  	case 17:
  		return "DEPLOY_SERVICE_FAILED"
  	case 18:
  		return "CHECK_INIT"
  	case 19:
  		return "CHECK_RUNNING"
  	case 20:
  		return "CHECK_RUNNING_COMPLETED"
  	case 21:
  		return "CHECK_FAILED"
  	default:
  		// Every clause returns, so the switch is a terminating statement
  		// and no trailing return is needed after it.
  		return strconv.Itoa(status)
  	}
  }