You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

grampus.go 5.2 kB

3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172
  1. package grampus
  2. import (
  3. "encoding/json"
  4. "strings"
  5. "code.gitea.io/gitea/modules/setting"
  6. "code.gitea.io/gitea/models"
  7. "code.gitea.io/gitea/modules/context"
  8. "code.gitea.io/gitea/modules/log"
  9. "code.gitea.io/gitea/modules/notification"
  10. "code.gitea.io/gitea/modules/timeutil"
  11. )
  // Package-level string constants used when building Grampus jobs.
  const (
  	// JobPath is the "job/" path segment.
  	JobPath = "job/"

  	// ProcessorTypeNPU and ProcessorTypeGPU are the processor type
  	// identifiers for NPU and GPU resources respectively.
  	ProcessorTypeNPU = "npu.huawei.com/NPU"
  	ProcessorTypeGPU = "nvidia.com/gpu"

  	// CommandPrepareScript is a ';'-separated shell command sequence that
  	// switches to /cache, creates the output, code and dataset directories,
  	// downloads and unpacks the script_for_grampus archive, and marks the
  	// OBS/MinIO downloader and uploader helpers as executable.
  	CommandPrepareScript = "pwd;" +
  		"cd /cache;" +
  		"mkdir -p output;" +
  		"mkdir -p code;" +
  		"mkdir -p dataset;" +
  		"echo \"start loading script\";" +
  		"wget -q https://git.openi.org.cn/OpenIOSSG/script_for_grampus/archive/master.zip;" +
  		"echo \"finish loading script\";" +
  		"unzip -q master.zip;" +
  		"cd script_for_grampus;" +
  		"chmod 777 downloader_for_obs uploader_for_obs downloader_for_minio uploader_for_minio;"

  	// Disabled alternative of CommandPrepareScript that installs wget/unzip
  	// via apt-get and works under /tmp instead of /cache:
  	//CommandPrepareScript = "bash;pwd;apt-get -y update;apt-get -y upgrade;apt-get -y install wget;apt-get -y install unzip;" +
  	// "cd /tmp;mkdir -p output;mkdir -p code;mkdir -p dataset;wget -q https://git.openi.org.cn/OpenIOSSG/script_for_grampus/archive/master.zip;" +
  	// "unzip -q master.zip;cd script_for_grampus;chmod 777 downloader_for_obs uploader_for_obs downloader_for_minio uploader_for_minio;"

  	// CodeArchiveName is the file name "master.zip", the same archive name
  	// that CommandPrepareScript downloads and unzips.
  	CodeArchiveName = "master.zip"
  )
  23. var (
  24. poolInfos *models.PoolInfos
  25. FlavorInfos *models.FlavorInfos
  26. ImageInfos *models.ImageInfosModelArts
  27. SpecialPools *models.SpecialPools
  28. )
  // GenerateTrainJobReq carries the parameters GenerateTrainJob needs to
  // submit a training job and to store the matching Cloudbrain record.
  type GenerateTrainJobReq struct {
  	// Job submission parameters: GenerateTrainJob passes these to createJob.
  	JobName        string
  	Command        string
  	ResourceSpecId string
  	// Either ImageUrl or ImageId is required; when both are set, ImageUrl
  	// takes precedence.
  	ImageUrl string
  	ImageId  string

  	// Descriptive and repository-related data.
  	DisplayJobName string
  	Uuid           string
  	Description    string
  	CodeObsPath    string
  	BootFile       string
  	BootFileUrl    string
  	DataUrl        string
  	TrainUrl       string

  	WorkServerNumber int
  	EngineID         int64

  	// Version bookkeeping.
  	CommitID        string
  	IsLatestVersion string
  	BranchName      string
  	PreVersionId    int64
  	PreVersionName  string
  	FlavorName      string
  	VersionCount    int
  	EngineName      string

  	TotalVersionCount int

  	// ComputeResource is matched against the special pool types and decides
  	// which notification action GenerateTrainJob emits.
  	ComputeResource string
  	DatasetName     string
  	Params          string
  }
  58. func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error) {
  59. createTime := timeutil.TimeStampNow()
  60. var CenterID []string
  61. var CenterName []string
  62. if SpecialPools != nil {
  63. for _, pool := range SpecialPools.Pools {
  64. if !pool.IsExclusive && strings.Contains(req.ComputeResource, pool.Type) {
  65. org, _ := models.GetOrgByName(pool.Org)
  66. if org != nil {
  67. isOrgMember, _ := models.IsOrganizationMember(org.ID, ctx.User.ID)
  68. if isOrgMember {
  69. for _, info := range pool.Pool {
  70. CenterID = append(CenterID, info.Queue)
  71. CenterName = append(CenterName, info.Value)
  72. }
  73. }
  74. }
  75. }
  76. }
  77. }
  78. jobResult, err := createJob(models.CreateGrampusJobRequest{
  79. Name: req.JobName,
  80. Tasks: []models.GrampusTasks{
  81. {
  82. Name: req.JobName,
  83. Command: req.Command,
  84. ResourceSpecId: req.ResourceSpecId,
  85. ImageId: req.ImageId,
  86. ImageUrl: req.ImageUrl,
  87. CenterID: CenterID,
  88. CenterName: CenterName,
  89. ReplicaNum: 1,
  90. },
  91. },
  92. })
  93. if err != nil {
  94. log.Error("createJob failed: %v", err.Error())
  95. return err
  96. }
  97. jobID := jobResult.JobInfo.JobID
  98. err = models.CreateCloudbrain(&models.Cloudbrain{
  99. Status: TransTrainJobStatus(jobResult.JobInfo.Status),
  100. UserID: ctx.User.ID,
  101. RepoID: ctx.Repo.Repository.ID,
  102. JobID: jobID,
  103. JobName: req.JobName,
  104. DisplayJobName: req.DisplayJobName,
  105. JobType: string(models.JobTypeTrain),
  106. Type: models.TypeC2Net,
  107. Uuid: req.Uuid,
  108. DatasetName: req.DatasetName,
  109. CommitID: req.CommitID,
  110. IsLatestVersion: req.IsLatestVersion,
  111. ComputeResource: req.ComputeResource,
  112. ImageID: req.ImageId,
  113. TrainUrl: req.TrainUrl,
  114. BranchName: req.BranchName,
  115. Parameters: req.Params,
  116. BootFile: req.BootFile,
  117. DataUrl: req.DataUrl,
  118. FlavorCode: req.ResourceSpecId,
  119. Description: req.Description,
  120. WorkServerNumber: req.WorkServerNumber,
  121. FlavorName: req.FlavorName,
  122. EngineName: req.EngineName,
  123. VersionCount: req.VersionCount,
  124. TotalVersionCount: req.TotalVersionCount,
  125. CreatedUnix: createTime,
  126. UpdatedUnix: createTime,
  127. })
  128. if err != nil {
  129. log.Error("CreateCloudbrain(%s) failed:%v", req.DisplayJobName, err.Error())
  130. return err
  131. }
  132. var actionType models.ActionType
  133. if req.ComputeResource == models.NPUResource {
  134. actionType = models.ActionCreateGrampusNPUTrainTask
  135. } else if req.ComputeResource == models.GPUResource {
  136. actionType = models.ActionCreateGrampusGPUTrainTask
  137. }
  138. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, req.DisplayJobName, actionType)
  139. return nil
  140. }
  141. func TransTrainJobStatus(status string) string {
  142. if status == models.GrampusStatusPending {
  143. status = models.GrampusStatusWaiting
  144. }
  145. return strings.ToUpper(status)
  146. }
  147. func InitSpecialPool() {
  148. if SpecialPools == nil && setting.Grampus.SpecialPools != "" {
  149. json.Unmarshal([]byte(setting.Grampus.SpecialPools), &SpecialPools)
  150. }
  151. }