You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

grampus.go 15 kB

3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482
  1. package grampus
  2. import (
  3. "fmt"
  4. "strings"
  5. "code.gitea.io/gitea/models"
  6. "code.gitea.io/gitea/modules/cloudbrain"
  7. "code.gitea.io/gitea/modules/context"
  8. "code.gitea.io/gitea/modules/log"
  9. "code.gitea.io/gitea/modules/notification"
  10. "code.gitea.io/gitea/modules/setting"
  11. "code.gitea.io/gitea/modules/timeutil"
  12. )
  13. const (
  14. JobPath = "job/"
  15. ProcessorTypeNPU = "npu.huawei.com/NPU"
  16. ProcessorTypeGPU = "nvidia.com/gpu"
  17. GpuWorkDir = "/tmp/"
  18. NpuWorkDir = "/cache/"
  19. NpuLocalLogUrl = "/tmp/train.log"
  20. CommandPrepareScriptNpu = ";mkdir -p output;mkdir -p code;mkdir -p dataset;mkdir -p pretrainmodel;"
  21. CodeArchiveName = "master.zip"
  22. BucketRemote = "grampus"
  23. RemoteModelPath = "/output/" + models.ModelSuffix
  24. autoStopDurationMs = 4 * 60 * 60 * 1000
  25. CommandGpuDebug = "mkdir -p /dataset;%s! [ -x \"$(command -v jupyter)\" ] && pip install jupyterlab==3 -i https://pypi.tuna.tsinghua.edu.cn/simple;jupyter lab --ServerApp.shutdown_no_activity_timeout=%s --TerminalManager.cull_inactive_timeout=%s --TerminalManager.cull_interval=%s --MappingKernelManager.cull_idle_timeout=%s --MappingKernelManager.cull_interval=%s --MappingKernelManager.cull_connected=True --MappingKernelManager.cull_busy=True --no-browser --ip=0.0.0.0 --allow-root --notebook-dir='/code' --port=$OCTOPUS_NOTEBOOK_PORT --LabApp.token='' --LabApp.allow_origin='*' --LabApp.base_url=$OCTOPUS_NOTEBOOK_BASE_URL;"
  26. )
  27. var (
  28. poolInfos *models.PoolInfos
  29. FlavorInfos *setting.StFlavorInfos
  30. ImageInfos *setting.StImageInfosModelArts
  31. SpecialPools *models.SpecialPools
  32. CommandPrepareScriptGpu = ";mkdir -p output;mkdir -p code;mkdir -p dataset;mkdir -p pretrainmodel;echo \"start loading script\";wget -q https://git.openi.org.cn/OpenIOSSG/%s/archive/master.zip;" +
  33. "echo \"finish loading script\";unzip -q master.zip;cd %s;chmod 777 downloader_for_obs uploader_for_npu downloader_for_minio uploader_for_gpu;"
  34. )
  35. type GenerateTrainJobReq struct {
  36. JobName string
  37. Command string
  38. ImageUrl string //与image_id二选一,都有的情况下优先image_url
  39. ImageId string
  40. DisplayJobName string
  41. Uuid string
  42. Description string
  43. CodeObsPath string
  44. BootFile string
  45. BootFileUrl string
  46. DataUrl string
  47. TrainUrl string
  48. WorkServerNumber int
  49. EngineID int64
  50. CommitID string
  51. IsLatestVersion string
  52. BranchName string
  53. PreVersionId int64
  54. PreVersionName string
  55. VersionCount int
  56. EngineName string
  57. TotalVersionCount int
  58. ComputeResource string
  59. ProcessType string
  60. DatasetNames string
  61. DatasetInfos map[string]models.DatasetInfo
  62. Params string
  63. ModelName string
  64. LabelName string
  65. CkptName string
  66. ModelVersion string
  67. PreTrainModelPath string
  68. PreTrainModelUrl string
  69. Spec *models.Specification
  70. CodeName string
  71. }
  72. type GenerateNotebookJobReq struct {
  73. JobName string
  74. Command string
  75. ImageUrl string
  76. ImageId string
  77. DisplayJobName string
  78. Uuid string
  79. Description string
  80. CodeStoragePath string
  81. CommitID string
  82. BranchName string
  83. ComputeResource string
  84. ProcessType string
  85. DatasetNames string
  86. DatasetInfos map[string]models.DatasetInfo
  87. ModelName string
  88. LabelName string
  89. CkptName string
  90. ModelVersion string
  91. PreTrainModelPath string
  92. PreTrainModelUrl string
  93. Spec *models.Specification
  94. CodeName string
  95. ModelPath string //参考启智GPU调试, 挂载/model目录用户的模型可以输出到这个目录
  96. }
  97. func getEndPoint() string {
  98. index := strings.Index(setting.Endpoint, "//")
  99. endpoint := setting.Endpoint[index+2:]
  100. return endpoint
  101. }
  102. func getDatasetGrampus(datasetInfos map[string]models.DatasetInfo) []models.GrampusDataset {
  103. var datasetGrampus []models.GrampusDataset
  104. endPoint := getEndPoint()
  105. for _, datasetInfo := range datasetInfos {
  106. datasetGrampus = append(datasetGrampus, models.GrampusDataset{
  107. Name: datasetInfo.FullName,
  108. Bucket: setting.Bucket,
  109. EndPoint: endPoint,
  110. ObjectKey: datasetInfo.DataLocalPath + datasetInfo.FullName,
  111. })
  112. }
  113. return datasetGrampus
  114. }
  115. func getDatasetGPUGrampus(datasetInfos map[string]models.DatasetInfo) ([]models.GrampusDataset, string) {
  116. var datasetGrampus []models.GrampusDataset
  117. var command = ""
  118. for uuid, datasetInfo := range datasetInfos {
  119. datasetGrampus = append(datasetGrampus, models.GrampusDataset{
  120. Name: datasetInfo.FullName,
  121. Bucket: setting.Attachment.Minio.Bucket,
  122. EndPoint: setting.Attachment.Minio.Endpoint,
  123. ObjectKey: datasetInfo.DataLocalPath,
  124. ReadOnly: true,
  125. ContainerPath: "/dataset1/" + datasetInfo.Name,
  126. })
  127. command += "cp /dataset1/'" + datasetInfo.Name + "'/" + uuid + " /dataset/'" + datasetInfo.FullName + "';"
  128. }
  129. return datasetGrampus, command
  130. }
  131. func GenerateNotebookJob(ctx *context.Context, req *GenerateNotebookJobReq) (jobId string, err error) {
  132. createTime := timeutil.TimeStampNow()
  133. var datasetGrampus []models.GrampusDataset
  134. var codeGrampus models.GrampusDataset
  135. var cpCommand string
  136. imageUrl := req.ImageUrl
  137. if ProcessorTypeNPU == req.ProcessType {
  138. datasetGrampus = getDatasetGrampus(req.DatasetInfos)
  139. if len(req.ModelName) != 0 {
  140. datasetGrampus = append(datasetGrampus, models.GrampusDataset{
  141. Name: req.ModelName,
  142. Bucket: setting.Bucket,
  143. EndPoint: getEndPoint(),
  144. ReadOnly: true,
  145. ObjectKey: req.PreTrainModelPath,
  146. })
  147. }
  148. codeGrampus = models.GrampusDataset{
  149. Name: req.CodeName,
  150. Bucket: setting.Bucket,
  151. EndPoint: getEndPoint(),
  152. ObjectKey: req.CodeStoragePath + cloudbrain.DefaultBranchName + ".zip",
  153. ReadOnly: false,
  154. }
  155. imageUrl = ""
  156. req.Command = ""
  157. } else {
  158. datasetGrampus, cpCommand = getDatasetGPUGrampus(req.DatasetInfos)
  159. if len(req.ModelName) != 0 {
  160. datasetGrampus = append(datasetGrampus, models.GrampusDataset{
  161. Name: req.ModelName,
  162. Bucket: setting.Attachment.Minio.Bucket,
  163. EndPoint: setting.Attachment.Minio.Endpoint,
  164. ObjectKey: req.PreTrainModelPath,
  165. ReadOnly: true,
  166. ContainerPath: cloudbrain.PretrainModelMountPath,
  167. })
  168. }
  169. datasetGrampus = append(datasetGrampus, models.GrampusDataset{
  170. Name: "modelOutput",
  171. Bucket: setting.Attachment.Minio.Bucket,
  172. EndPoint: setting.Attachment.Minio.Endpoint,
  173. ReadOnly: false,
  174. ObjectKey: req.ModelPath,
  175. ContainerPath: cloudbrain.ModelMountPath,
  176. })
  177. codeGrampus = models.GrampusDataset{
  178. Name: req.CodeName,
  179. Bucket: setting.Attachment.Minio.Bucket,
  180. EndPoint: setting.Attachment.Minio.Endpoint,
  181. ObjectKey: req.CodeStoragePath + cloudbrain.DefaultBranchName + ".zip",
  182. ReadOnly: false,
  183. ContainerPath: cloudbrain.CodeMountPath,
  184. }
  185. req.Command = fmt.Sprintf(CommandGpuDebug, cpCommand, setting.CullIdleTimeout, setting.CullIdleTimeout, setting.CullInterval, setting.CullIdleTimeout, setting.CullInterval)
  186. log.Info("debug command:" + req.Command)
  187. }
  188. jobResult, err := createNotebookJob(models.CreateGrampusNotebookRequest{
  189. Name: req.JobName,
  190. Tasks: []models.GrampusNotebookTask{
  191. {
  192. Name: req.JobName,
  193. ResourceSpecId: req.Spec.SourceSpecId,
  194. ImageId: req.ImageId,
  195. ImageUrl: imageUrl,
  196. Datasets: datasetGrampus,
  197. Code: codeGrampus,
  198. AutoStopDuration: autoStopDurationMs,
  199. Capacity: setting.Capacity,
  200. Command: req.Command,
  201. },
  202. },
  203. })
  204. if err != nil {
  205. log.Error("createNotebookJob failed: %v", err.Error())
  206. return "", err
  207. }
  208. jobID := jobResult.JobInfo.JobID
  209. err = models.CreateCloudbrain(&models.Cloudbrain{
  210. Status: TransTrainJobStatus(jobResult.JobInfo.Status),
  211. UserID: ctx.User.ID,
  212. RepoID: ctx.Repo.Repository.ID,
  213. JobID: jobID,
  214. JobName: req.JobName,
  215. DisplayJobName: req.DisplayJobName,
  216. JobType: string(models.JobTypeDebug),
  217. Type: models.TypeC2Net,
  218. Uuid: req.Uuid,
  219. DatasetName: req.DatasetNames,
  220. CommitID: req.CommitID,
  221. IsLatestVersion: "1",
  222. ComputeResource: req.ComputeResource,
  223. ImageID: req.ImageId,
  224. BranchName: req.BranchName,
  225. Description: req.Description,
  226. WorkServerNumber: 1,
  227. EngineName: req.ImageUrl,
  228. CreatedUnix: createTime,
  229. UpdatedUnix: createTime,
  230. Spec: req.Spec,
  231. ModelName: req.ModelName,
  232. ModelVersion: req.ModelVersion,
  233. LabelName: req.LabelName,
  234. PreTrainModelUrl: req.PreTrainModelUrl,
  235. CkptName: req.CkptName,
  236. })
  237. if err != nil {
  238. log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, err.Error())
  239. return "", err
  240. }
  241. var actionType models.ActionType
  242. if req.ComputeResource == models.NPUResource {
  243. actionType = models.ActionCreateGrampusNPUDebugTask
  244. } else if req.ComputeResource == models.GPUResource {
  245. actionType = models.ActionCreateGrampusGPUDebugTask
  246. }
  247. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, actionType)
  248. return jobID, nil
  249. }
  250. func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (jobId string, err error) {
  251. createTime := timeutil.TimeStampNow()
  252. centerID, centerName := getCentersParamter(ctx, req)
  253. var datasetGrampus, modelGrampus []models.GrampusDataset
  254. var codeGrampus models.GrampusDataset
  255. if ProcessorTypeNPU == req.ProcessType {
  256. datasetGrampus = getDatasetGrampus(req.DatasetInfos)
  257. if len(req.ModelName) != 0 {
  258. modelGrampus = []models.GrampusDataset{
  259. {
  260. Name: req.ModelName,
  261. Bucket: setting.Bucket,
  262. EndPoint: getEndPoint(),
  263. ObjectKey: req.PreTrainModelPath,
  264. },
  265. }
  266. }
  267. codeGrampus = models.GrampusDataset{
  268. Name: req.CodeName,
  269. Bucket: setting.Bucket,
  270. EndPoint: getEndPoint(),
  271. ObjectKey: req.CodeObsPath + cloudbrain.DefaultBranchName + ".zip",
  272. }
  273. }
  274. jobResult, err := createJob(models.CreateGrampusJobRequest{
  275. Name: req.JobName,
  276. Tasks: []models.GrampusTasks{
  277. {
  278. Name: req.JobName,
  279. Command: req.Command,
  280. ResourceSpecId: req.Spec.SourceSpecId,
  281. ImageId: req.ImageId,
  282. ImageUrl: req.ImageUrl,
  283. CenterID: centerID,
  284. CenterName: centerName,
  285. ReplicaNum: 1,
  286. Datasets: datasetGrampus,
  287. Models: modelGrampus,
  288. Code: codeGrampus,
  289. BootFile: req.BootFile,
  290. },
  291. },
  292. })
  293. if err != nil {
  294. log.Error("createJob failed: %v", err.Error())
  295. return "", err
  296. }
  297. jobID := jobResult.JobInfo.JobID
  298. err = models.CreateCloudbrain(&models.Cloudbrain{
  299. Status: TransTrainJobStatus(jobResult.JobInfo.Status),
  300. UserID: ctx.User.ID,
  301. RepoID: ctx.Repo.Repository.ID,
  302. JobID: jobID,
  303. JobName: req.JobName,
  304. DisplayJobName: req.DisplayJobName,
  305. JobType: string(models.JobTypeTrain),
  306. Type: models.TypeC2Net,
  307. Uuid: req.Uuid,
  308. DatasetName: req.DatasetNames,
  309. CommitID: req.CommitID,
  310. IsLatestVersion: req.IsLatestVersion,
  311. ComputeResource: req.ComputeResource,
  312. ImageID: req.ImageId,
  313. TrainUrl: req.TrainUrl,
  314. BranchName: req.BranchName,
  315. Parameters: req.Params,
  316. BootFile: req.BootFile,
  317. DataUrl: req.DataUrl,
  318. Description: req.Description,
  319. WorkServerNumber: req.WorkServerNumber,
  320. EngineName: req.EngineName,
  321. VersionCount: req.VersionCount,
  322. TotalVersionCount: req.TotalVersionCount,
  323. CreatedUnix: createTime,
  324. UpdatedUnix: createTime,
  325. Spec: req.Spec,
  326. ModelName: req.ModelName,
  327. ModelVersion: req.ModelVersion,
  328. LabelName: req.LabelName,
  329. PreTrainModelUrl: req.PreTrainModelUrl,
  330. CkptName: req.CkptName,
  331. })
  332. if err != nil {
  333. log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, err.Error())
  334. return "", err
  335. }
  336. var actionType models.ActionType
  337. if req.ComputeResource == models.NPUResource {
  338. actionType = models.ActionCreateGrampusNPUTrainTask
  339. } else if req.ComputeResource == models.GPUResource {
  340. actionType = models.ActionCreateGrampusGPUTrainTask
  341. }
  342. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, actionType)
  343. return jobID, nil
  344. }
  345. func getCentersParamter(ctx *context.Context, req *GenerateTrainJobReq) ([]string, []string) {
  346. var centerID []string
  347. var centerName []string
  348. includeCenters := make(map[string]string)
  349. excludeCenters := make(map[string]string)
  350. if SpecialPools != nil {
  351. for _, pool := range SpecialPools.Pools {
  352. if !pool.IsExclusive && strings.Contains(req.ComputeResource, pool.Type) {
  353. org, _ := models.GetOrgByName(pool.Org)
  354. if org != nil {
  355. isOrgMember, _ := models.IsOrganizationMember(org.ID, ctx.User.ID)
  356. if isOrgMember {
  357. for _, info := range pool.Pool {
  358. includeCenters[info.Queue] = info.Value
  359. }
  360. } else {
  361. for _, info := range pool.Pool {
  362. excludeCenters[info.Queue] = info.Value
  363. }
  364. }
  365. }
  366. }
  367. }
  368. }
  369. if len(includeCenters) > 0 {
  370. //如果有专属资源池,根据专属资源池指定智算中心
  371. for k, v := range includeCenters {
  372. centerID = append(centerID, k)
  373. centerName = append(centerName, v)
  374. }
  375. } else if len(excludeCenters) > 0 {
  376. //否则,有要排除的中心,先获取所有中心,删除其中的排除中心,得到指定的智算中心
  377. allCenters := make(map[string]string)
  378. specs, err := GetResourceSpecs(req.ProcessType)
  379. if err == nil {
  380. for _, info := range specs.Infos {
  381. for _, center := range info.Centers {
  382. allCenters[center.ID] = center.Name
  383. }
  384. }
  385. }
  386. for k, _ := range excludeCenters {
  387. delete(allCenters, k)
  388. }
  389. for k, v := range allCenters {
  390. centerID = append(centerID, k)
  391. centerName = append(centerName, v)
  392. }
  393. }
  394. return centerID, centerName
  395. }
  396. func TransTrainJobStatus(status string) string {
  397. if status == models.GrampusStatusPending {
  398. status = models.GrampusStatusWaiting
  399. }
  400. return strings.ToUpper(status)
  401. }
  402. func GetNpuModelRemoteObsUrl(jobName string) string {
  403. return "s3:///" + BucketRemote + "/" + GetNpuModelObjectKey(jobName)
  404. }
  405. func GetNpuModelObjectKey(jobName string) string {
  406. return setting.CodePathPrefix + jobName + RemoteModelPath
  407. }
  408. func GetRemoteEndPoint(aiCenterID string) string {
  409. var endPoint string
  410. for _, info := range setting.CenterInfos.Info {
  411. if info.CenterID == aiCenterID {
  412. endPoint = info.Endpoint
  413. break
  414. }
  415. }
  416. return endPoint
  417. }
  418. func GetCenterProxy(aiCenterID string) string {
  419. var proxy string
  420. for _, info := range setting.CenterInfos.Info {
  421. if info.CenterID == aiCenterID {
  422. proxy = info.StorageProxyServer
  423. break
  424. }
  425. }
  426. return proxy
  427. }