You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

schedulecreatetasklogic.go 7.9 kB

11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297
  1. package schedule
  2. import (
  3. "context"
  4. "errors"
  5. "fmt"
  6. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/common"
  7. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
  8. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
  9. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/strategy"
  10. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/storeLink"
  11. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
  12. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
  13. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
  14. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
  15. "slices"
  16. "strings"
  17. "time"
  18. "github.com/zeromicro/go-zero/core/logx"
  19. )
  // Tunables used by the schedule-task creation logic in this file.
  const (
  	// QUERY_RESOURCE_RETRY bounds how many times getClusterInfosByStrategy
  	// queries cluster resources before giving up.
  	QUERY_RESOURCE_RETRY = 3

  	// TRAINNING_TASK_REPLICA is passed as the first argument to
  	// strategy.NewLeastLoadFirst when a scheduling strategy is built.
  	TRAINNING_TASK_REPLICA = 1

  	// TRAINNING_TASK_SUFFIX_LEN is the length of the random suffix appended
  	// to the "SCHEDULE_TASK_" prefix when a task name is generated.
  	TRAINNING_TASK_SUFFIX_LEN = 10
  )
  25. type ScheduleCreateTaskLogic struct {
  26. logx.Logger
  27. ctx context.Context
  28. svcCtx *svc.ServiceContext
  29. queryResource *QueryResourcesLogic
  30. }
  31. func NewScheduleCreateTaskLogic(ctx context.Context, svcCtx *svc.ServiceContext) *ScheduleCreateTaskLogic {
  32. return &ScheduleCreateTaskLogic{
  33. Logger: logx.WithContext(ctx),
  34. ctx: ctx,
  35. svcCtx: svcCtx,
  36. queryResource: NewQueryResourcesLogic(ctx, svcCtx),
  37. }
  38. }
  39. func (l *ScheduleCreateTaskLogic) ScheduleCreateTask(req *types.CreateTaskReq) (resp *types.CreateTaskResp, err error) {
  40. resp = &types.CreateTaskResp{}
  41. if req.JobResources.ScheduleStrategy == "" {
  42. return nil, fmt.Errorf("must specify ScheduleStrategy")
  43. }
  44. if len(req.JobResources.Clusters) == 0 {
  45. return nil, fmt.Errorf("must specify at least one cluster")
  46. }
  47. var clusters []string
  48. if len(req.JobResources.Clusters) == 1 {
  49. clusters = append(clusters, req.JobResources.Clusters[0].ClusterID)
  50. schedatas, err := l.generateScheduleResult(req.DataDistributes, clusters)
  51. if err != nil {
  52. return nil, err
  53. }
  54. taskId, err := l.createTask("SCHEDULE_TASK_"+utils.RandomString(TRAINNING_TASK_SUFFIX_LEN), req.JobResources.ScheduleStrategy, req.JobResources.Clusters)
  55. if err != nil {
  56. return nil, err
  57. }
  58. resp.ScheduleDatas = schedatas
  59. resp.TaskID = taskId
  60. return resp, nil
  61. } else {
  62. clusterInfos, err := l.getClusterInfosByStrategy(&req.JobResources)
  63. if err != nil {
  64. return nil, err
  65. }
  66. if len(clusterInfos) == 0 {
  67. return nil, fmt.Errorf("failed to create task, no scheduled cluster found")
  68. }
  69. for _, info := range clusterInfos {
  70. clusters = append(clusters, info.ClusterID)
  71. }
  72. schedatas, err := l.generateScheduleResult(req.DataDistributes, clusters)
  73. if err != nil {
  74. return nil, err
  75. }
  76. taskId, err := l.createTask("SCHEDULE_TASK_"+utils.RandomString(TRAINNING_TASK_SUFFIX_LEN), req.JobResources.ScheduleStrategy, clusterInfos)
  77. if err != nil {
  78. return nil, err
  79. }
  80. resp.ScheduleDatas = schedatas
  81. resp.TaskID = taskId
  82. return resp, nil
  83. }
  84. }
  85. func (l *ScheduleCreateTaskLogic) getClusterInfosByStrategy(resources *types.JobResources) ([]*types.JobClusterInfo, error) {
  86. var resSpecs []*collector.ResourceSpec
  87. var resCount int
  88. for i := 0; i < QUERY_RESOURCE_RETRY; i++ {
  89. defer time.Sleep(time.Second)
  90. qResources, err := l.queryResource.queryResources(make([]string, 0))
  91. if err != nil {
  92. continue
  93. }
  94. for _, resource := range qResources {
  95. if resource.Resources != nil {
  96. resCount++
  97. }
  98. }
  99. if resCount >= 1 {
  100. resSpecs = qResources
  101. break
  102. } else {
  103. resCount = 0
  104. continue
  105. }
  106. }
  107. if resCount == 0 {
  108. return nil, fmt.Errorf("failed to create task, resources counting fails")
  109. }
  110. var clusterInfos []*types.JobClusterInfo
  111. switch resources.ScheduleStrategy {
  112. case strategy.LEASTLOADFIRST:
  113. strtg := strategy.NewLeastLoadFirst(TRAINNING_TASK_REPLICA, resSpecs)
  114. clusters, err := strtg.Schedule()
  115. if err != nil {
  116. return nil, err
  117. }
  118. clusterInfos = filterClusterInfos(clusters, resources.Clusters)
  119. }
  120. return clusterInfos, nil
  121. }
  122. func filterClusterInfos(clusters []*strategy.AssignedCluster, clusterInfos []*types.JobClusterInfo) []*types.JobClusterInfo {
  123. var result []*types.JobClusterInfo
  124. for _, cinfo := range clusterInfos {
  125. for _, c := range clusters {
  126. if cinfo.ClusterID == c.ClusterId {
  127. result = append(result, cinfo)
  128. }
  129. }
  130. }
  131. return result
  132. }
  133. func (l *ScheduleCreateTaskLogic) createTask(taskName string, strategyName string, jobClusterInfo []*types.JobClusterInfo) (int64, error) {
  134. var synergyStatus int64
  135. if len(jobClusterInfo) > 1 {
  136. synergyStatus = 1
  137. }
  138. strategyCode, err := l.svcCtx.Scheduler.AiStorages.GetStrategyCode(strategyName)
  139. if err != nil {
  140. return 0, err
  141. }
  142. taskId, err := l.svcCtx.Scheduler.AiStorages.SaveTask(taskName, strategyCode, synergyStatus, "10")
  143. if err != nil {
  144. return 0, err
  145. }
  146. adapterName, err := l.svcCtx.Scheduler.AiStorages.GetAdapterNameById(ADAPTERID)
  147. if err != nil {
  148. return 0, err
  149. }
  150. for _, i := range jobClusterInfo {
  151. clusterName, _ := l.svcCtx.Scheduler.AiStorages.GetClusterNameById(i.ClusterID)
  152. opt := &option.AiOption{}
  153. err := l.svcCtx.Scheduler.AiStorages.SaveAiTask(taskId, opt, adapterName, i.ClusterID, clusterName, "", constants.Saved, "")
  154. if err != nil {
  155. return 0, errors.New("database add failed: " + err.Error())
  156. }
  157. }
  158. return taskId, nil
  159. }
  160. func (l *ScheduleCreateTaskLogic) generateScheduleResult(distribute types.DataDistribute, clusters []string) ([]*types.ScheduleData, error) {
  161. var schedatas []*types.ScheduleData
  162. for _, d := range distribute.Dataset {
  163. data := &types.ScheduleData{
  164. DataType: "dataset",
  165. PackageID: d.PackageID,
  166. ClusterIDs: make([]string, 0),
  167. }
  168. for _, cluster := range clusters {
  169. if !slices.Contains(d.Clusters, cluster) {
  170. data.ClusterIDs = append(data.ClusterIDs, cluster)
  171. } else {
  172. continue
  173. }
  174. }
  175. if len(data.ClusterIDs) != 0 {
  176. schedatas = append(schedatas, data)
  177. }
  178. }
  179. for _, d := range distribute.Code {
  180. data := &types.ScheduleData{
  181. DataType: "code",
  182. PackageID: d.PackageID,
  183. ClusterIDs: make([]string, 0),
  184. }
  185. for _, cluster := range clusters {
  186. if !slices.Contains(d.Clusters, cluster) {
  187. data.ClusterIDs = append(data.ClusterIDs, cluster)
  188. } else {
  189. continue
  190. }
  191. }
  192. if len(data.ClusterIDs) != 0 {
  193. schedatas = append(schedatas, data)
  194. }
  195. }
  196. for _, d := range distribute.Image {
  197. data := &types.ScheduleData{
  198. DataType: "image",
  199. PackageID: d.PackageID,
  200. ClusterIDs: make([]string, 0),
  201. }
  202. for _, cluster := range clusters {
  203. if !slices.Contains(d.Clusters, cluster) {
  204. data.ClusterIDs = append(data.ClusterIDs, cluster)
  205. } else {
  206. continue
  207. }
  208. }
  209. if len(data.ClusterIDs) != 0 {
  210. schedatas = append(schedatas, data)
  211. }
  212. }
  213. for _, d := range distribute.Model {
  214. data := &types.ScheduleData{
  215. DataType: "model",
  216. PackageID: d.PackageID,
  217. ClusterIDs: make([]string, 0),
  218. }
  219. for _, cluster := range clusters {
  220. if !slices.Contains(d.Clusters, cluster) {
  221. data.ClusterIDs = append(data.ClusterIDs, cluster)
  222. } else {
  223. continue
  224. }
  225. }
  226. if len(data.ClusterIDs) != 0 {
  227. schedatas = append(schedatas, data)
  228. }
  229. }
  230. if len(schedatas) != 0 {
  231. err := l.updateStorageType(&schedatas)
  232. if err != nil {
  233. return nil, err
  234. }
  235. }
  236. return schedatas, nil
  237. }
  238. func (l *ScheduleCreateTaskLogic) updateStorageType(schedatas *[]*types.ScheduleData) error {
  239. for _, s := range *schedatas {
  240. var storageType string
  241. var sTypes []string
  242. for _, id := range s.ClusterIDs {
  243. cluster, err := l.svcCtx.Scheduler.AiStorages.GetClustersById(id)
  244. if err != nil {
  245. return err
  246. }
  247. stype, ok := storeLink.StorageTypeMap[strings.Title(cluster.Name)]
  248. if ok {
  249. sTypes = append(sTypes, stype)
  250. }
  251. }
  252. sTypes = common.Unique(sTypes)
  253. for _, st := range sTypes {
  254. storageType += st + storeLink.COMMA
  255. }
  256. storageType = strings.TrimSuffix(storageType, storeLink.COMMA)
  257. s.StorageType = storageType
  258. }
  259. return nil
  260. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive, autonomous, peer-to-peer manner.