You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

schedulecreatetasklogic.go 9.5 kB

11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
10 months ago
11 months ago
10 months ago
11 months ago
10 months ago
11 months ago
10 months ago
11 months ago
11 months ago
11 months ago
10 months ago
10 months ago
10 months ago
11 months ago
10 months ago
11 months ago
10 months ago
11 months ago
10 months ago
11 months ago
10 months ago
10 months ago
11 months ago
11 months ago
10 months ago
11 months ago
10 months ago
10 months ago
10 months ago
11 months ago
11 months ago
10 months ago
11 months ago
10 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
11 months ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365
  1. package schedule
  2. import (
  3. "context"
  4. "fmt"
  5. "github.com/pkg/errors"
  6. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/common"
  7. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
  8. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/strategy"
  9. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/storeLink"
  10. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
  11. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
  12. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
  13. "gopkg.in/yaml.v3"
  14. "slices"
  15. "strings"
  16. "time"
  17. "github.com/zeromicro/go-zero/core/logx"
  18. )
  // Fixed settings used while scheduling and creating a task.
  const (
  	// TRAINNING_TASK_REPLICA is passed as the first argument to the strategy
  	// constructors (presumably the replica count — confirm in package strategy).
  	TRAINNING_TASK_REPLICA = 1

  	// TRAINNING_TASK_SUFFIX_LEN is the argument given to utils.RandomString
  	// when building the random suffix of a generated task name.
  	TRAINNING_TASK_SUFFIX_LEN = 10

  	// QUERY_RESOURCE_RETRY bounds the number of attempts made when querying
  	// cluster resources for the least-load-first strategy.
  	QUERY_RESOURCE_RETRY = 3
  )
  24. type ScheduleCreateTaskLogic struct {
  25. logx.Logger
  26. ctx context.Context
  27. svcCtx *svc.ServiceContext
  28. queryResource *QueryResourcesLogic
  29. }
  30. func NewScheduleCreateTaskLogic(ctx context.Context, svcCtx *svc.ServiceContext) *ScheduleCreateTaskLogic {
  31. return &ScheduleCreateTaskLogic{
  32. Logger: logx.WithContext(ctx),
  33. ctx: ctx,
  34. svcCtx: svcCtx,
  35. queryResource: NewQueryResourcesLogic(ctx, svcCtx),
  36. }
  37. }
  38. func (l *ScheduleCreateTaskLogic) ScheduleCreateTask(req *types.CreateTaskReq) (resp *types.CreateTaskResp, err error) {
  39. resp = &types.CreateTaskResp{}
  40. err = validateJobResources(req.JobResources)
  41. if err != nil {
  42. return nil, err
  43. }
  44. var clusters []string
  45. if len(req.JobResources.Clusters) == 1 {
  46. clusters = append(clusters, req.JobResources.Clusters[0].ClusterID)
  47. schedatas, err := l.generateScheduleResult(req.DataDistributes, clusters)
  48. if err != nil {
  49. return nil, err
  50. }
  51. assignedClusters := copyParams([]*strategy.AssignedCluster{{
  52. ClusterId: req.JobResources.Clusters[0].ClusterID,
  53. }}, req.JobResources.Clusters)
  54. taskId, err := l.createTask("SCHEDULE_TASK_"+utils.RandomString(TRAINNING_TASK_SUFFIX_LEN), req.JobResources.ScheduleStrategy, assignedClusters, req.Token)
  55. if err != nil {
  56. return nil, err
  57. }
  58. resp.ScheduleDatas = schedatas
  59. resp.TaskID = taskId
  60. return resp, nil
  61. } else {
  62. assignedClusters, err := l.getAssignedClustersByStrategy(&req.JobResources, &req.DataDistributes)
  63. if err != nil {
  64. return nil, err
  65. }
  66. if len(assignedClusters) == 0 {
  67. return nil, fmt.Errorf("failed to create task, no scheduled cluster found")
  68. }
  69. for _, c := range assignedClusters {
  70. clusters = append(clusters, c.ClusterId)
  71. }
  72. schedatas, err := l.generateScheduleResult(req.DataDistributes, clusters)
  73. if err != nil {
  74. return nil, err
  75. }
  76. taskId, err := l.createTask("SCHEDULE_TASK_"+utils.RandomString(TRAINNING_TASK_SUFFIX_LEN), req.JobResources.ScheduleStrategy, assignedClusters, req.Token)
  77. if err != nil {
  78. return nil, err
  79. }
  80. resp.ScheduleDatas = schedatas
  81. resp.TaskID = taskId
  82. return resp, nil
  83. }
  84. }
  85. func validateJobResources(resources types.JobResources) error {
  86. if resources.ScheduleStrategy == "" {
  87. return fmt.Errorf("must specify ScheduleStrategy")
  88. }
  89. if len(resources.Clusters) == 0 {
  90. return fmt.Errorf("must specify at least one cluster")
  91. }
  92. for _, c := range resources.Clusters {
  93. if c.ClusterID == "" {
  94. return fmt.Errorf("must specify clusterID")
  95. }
  96. if len(c.Resources) == 0 {
  97. return fmt.Errorf("cluster: %s must specify at least one compute resource", c.ClusterID)
  98. //return errors.Wrapf(xerr.NewErrCodeMsg(1234, fmt.Sprintf("cluster: %s must specify at least one compute resource", c.ClusterID)), "")
  99. }
  100. }
  101. return nil
  102. }
  103. func (l *ScheduleCreateTaskLogic) getAssignedClustersByStrategy(resources *types.JobResources, dataDistribute *types.DataDistribute) ([]*strategy.AssignedCluster, error) {
  104. var assignedClusters []*strategy.AssignedCluster
  105. switch resources.ScheduleStrategy {
  106. case strategy.LEASTLOADFIRST:
  107. var resSpecs []*collector.ResourceSpec
  108. var resCount int
  109. for i := 0; i < QUERY_RESOURCE_RETRY; i++ {
  110. defer time.Sleep(time.Second)
  111. qResources, err := l.queryResource.QueryResourcesByClusterId(nil)
  112. if err != nil {
  113. continue
  114. }
  115. for _, resource := range qResources {
  116. if resource.Resources != nil {
  117. resCount++
  118. }
  119. }
  120. if resCount >= 1 {
  121. resSpecs = qResources
  122. break
  123. } else {
  124. resCount = 0
  125. continue
  126. }
  127. }
  128. if resCount == 0 {
  129. return nil, fmt.Errorf("failed to create task, resources counting fails")
  130. }
  131. strtg := strategy.NewLeastLoadFirst(TRAINNING_TASK_REPLICA, resSpecs)
  132. clusters, err := strtg.Schedule()
  133. if err != nil {
  134. return nil, err
  135. }
  136. assignedClusters = copyParams(clusters, resources.Clusters)
  137. case strategy.DATA_LOCALITY:
  138. strtg := strategy.NewDataLocality(TRAINNING_TASK_REPLICA, dataDistribute)
  139. clusters, err := strtg.Schedule()
  140. if err != nil {
  141. return nil, err
  142. }
  143. assignedClusters = copyParams(clusters, resources.Clusters)
  144. default:
  145. return nil, errors.New("no strategy has been chosen")
  146. }
  147. return assignedClusters, nil
  148. }
  149. func copyParams(clusters []*strategy.AssignedCluster, clusterInfos []*types.JobClusterInfo) []*strategy.AssignedCluster {
  150. var result []*strategy.AssignedCluster
  151. for _, c := range clusters {
  152. for _, info := range clusterInfos {
  153. if c.ClusterId == info.ClusterID {
  154. var envs []string
  155. var params []string
  156. for k, v := range info.Runtime.Envs {
  157. val := common.ConvertTypeToString(v)
  158. if val != "" {
  159. env := k + storeLink.COMMA + val
  160. envs = append(envs, env)
  161. }
  162. }
  163. for k, v := range info.Runtime.Params {
  164. val := common.ConvertTypeToString(v)
  165. if val != "" {
  166. p := k + storeLink.COMMA + val
  167. params = append(params, p)
  168. }
  169. }
  170. cluster := &strategy.AssignedCluster{
  171. ClusterId: c.ClusterId,
  172. ClusterName: c.ClusterName,
  173. Replicas: c.Replicas,
  174. ResourcesRequired: info.Resources,
  175. Cmd: info.Runtime.Command,
  176. Envs: envs,
  177. Params: params,
  178. }
  179. result = append(result, cluster)
  180. }
  181. }
  182. }
  183. return result
  184. }
  185. func (l *ScheduleCreateTaskLogic) createTask(taskName string, strategyName string, clusters []*strategy.AssignedCluster, token string) (int64, error) {
  186. var synergyStatus int64
  187. if len(clusters) > 1 {
  188. synergyStatus = 1
  189. }
  190. y, err := yaml.Marshal(clusters)
  191. if err != nil {
  192. fmt.Printf("Error while Marshaling. %v", err)
  193. }
  194. taskId, err := l.svcCtx.Scheduler.CreateTask(taskName, synergyStatus, strategyName, string(y), token, &l.svcCtx.Config)
  195. if err != nil {
  196. return 0, err
  197. }
  198. return taskId, nil
  199. }
  200. func (l *ScheduleCreateTaskLogic) generateScheduleResult(distribute types.DataDistribute, clusters []string) ([]*types.ScheduleData, error) {
  201. var schedatas []*types.ScheduleData
  202. for _, d := range distribute.Dataset {
  203. data := &types.ScheduleData{
  204. DataType: "dataset",
  205. PackageID: d.PackageID,
  206. ClusterIDs: make([]string, 0),
  207. }
  208. var cSlc []string
  209. for _, cluster := range d.Clusters {
  210. cSlc = append(cSlc, cluster.ClusterID)
  211. }
  212. for _, cluster := range clusters {
  213. if !slices.Contains(cSlc, cluster) {
  214. data.ClusterIDs = append(data.ClusterIDs, cluster)
  215. } else {
  216. continue
  217. }
  218. }
  219. if len(data.ClusterIDs) != 0 {
  220. schedatas = append(schedatas, data)
  221. }
  222. }
  223. for _, d := range distribute.Code {
  224. data := &types.ScheduleData{
  225. DataType: "code",
  226. PackageID: d.PackageID,
  227. ClusterIDs: make([]string, 0),
  228. }
  229. var cSlc []string
  230. for _, cluster := range d.Clusters {
  231. cSlc = append(cSlc, cluster.ClusterID)
  232. }
  233. for _, cluster := range clusters {
  234. if !slices.Contains(cSlc, cluster) {
  235. data.ClusterIDs = append(data.ClusterIDs, cluster)
  236. } else {
  237. continue
  238. }
  239. }
  240. if len(data.ClusterIDs) != 0 {
  241. schedatas = append(schedatas, data)
  242. }
  243. }
  244. for _, d := range distribute.Image {
  245. data := &types.ScheduleData{
  246. DataType: "image",
  247. PackageID: d.PackageID,
  248. ClusterIDs: make([]string, 0),
  249. }
  250. var cSlc []string
  251. for _, cluster := range d.Clusters {
  252. cSlc = append(cSlc, cluster.ClusterID)
  253. }
  254. for _, cluster := range clusters {
  255. if !slices.Contains(cSlc, cluster) {
  256. data.ClusterIDs = append(data.ClusterIDs, cluster)
  257. } else {
  258. continue
  259. }
  260. }
  261. if len(data.ClusterIDs) != 0 {
  262. schedatas = append(schedatas, data)
  263. }
  264. }
  265. for _, d := range distribute.Model {
  266. data := &types.ScheduleData{
  267. DataType: "model",
  268. PackageID: d.PackageID,
  269. ClusterIDs: make([]string, 0),
  270. }
  271. var cSlc []string
  272. for _, cluster := range d.Clusters {
  273. cSlc = append(cSlc, cluster.ClusterID)
  274. }
  275. for _, cluster := range clusters {
  276. if !slices.Contains(cSlc, cluster) {
  277. data.ClusterIDs = append(data.ClusterIDs, cluster)
  278. } else {
  279. continue
  280. }
  281. }
  282. if len(data.ClusterIDs) != 0 {
  283. schedatas = append(schedatas, data)
  284. }
  285. }
  286. if len(schedatas) != 0 {
  287. err := l.updateStorageType(&schedatas)
  288. if err != nil {
  289. return nil, err
  290. }
  291. }
  292. return schedatas, nil
  293. }
  294. func (l *ScheduleCreateTaskLogic) updateStorageType(schedatas *[]*types.ScheduleData) error {
  295. for _, s := range *schedatas {
  296. var storageType string
  297. var sTypes []string
  298. for _, id := range s.ClusterIDs {
  299. cluster, err := l.svcCtx.Scheduler.AiStorages.GetClustersById(id)
  300. if err != nil {
  301. return err
  302. }
  303. stype, ok := storeLink.StorageTypeMap[strings.Title(cluster.Name)]
  304. if ok {
  305. sTypes = append(sTypes, stype)
  306. }
  307. }
  308. sTypes = common.Unique(sTypes)
  309. for _, st := range sTypes {
  310. storageType += st + storeLink.COMMA
  311. }
  312. storageType = strings.TrimSuffix(storageType, storeLink.COMMA)
  313. s.StorageType = storageType
  314. }
  315. return nil
  316. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive, autonomous, peer-to-peer manner.