You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

aiScheduler.go 12 kB

10 months ago
10 months ago
11 months ago
10 months ago
10 months ago
10 months ago
11 months ago
11 months ago
11 months ago
11 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472
  1. /*
  2. Copyright (c) [2023] [pcm]
  3. [pcm-coordinator] is licensed under Mulan PSL v2.
  4. You can use this software according to the terms and conditions of the Mulan PSL v2.
  5. You may obtain a copy of Mulan PSL v2 at:
  6. http://license.coscl.org.cn/MulanPSL2
  7. THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
  8. EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
  9. MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
  10. See the Mulan PSL v2 for more details.
  11. */
  12. package schedulers
  13. import (
  14. "context"
  15. "encoding/json"
  16. "errors"
  17. "fmt"
  18. "github.com/zeromicro/go-zero/core/logx"
  19. "gitlink.org.cn/JointCloud/pcm-ac/hpcAC"
  20. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler"
  21. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/common"
  22. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
  23. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
  24. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/executor"
  25. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/utils/jcs"
  26. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/strategy"
  27. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/strategy/param"
  28. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
  29. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
  30. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/response"
  31. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
  32. "gitlink.org.cn/JointCloud/pcm-modelarts/client/modelartsservice"
  33. "gitlink.org.cn/JointCloud/pcm-octopus/octopus"
  34. "gitlink.org.cn/JointCloud/pcm-openi/model"
  35. "strconv"
  36. "sync"
  37. )
  38. type AiScheduler struct {
  39. yamlString string
  40. task *response.TaskInfo
  41. *scheduler.Scheduler
  42. option *option.AiOption
  43. ctx context.Context
  44. }
  45. type AiResult struct {
  46. AdapterId string
  47. TaskName string
  48. JobId string
  49. ClusterId string
  50. Strategy string
  51. Replica int32
  52. Card string
  53. Msg string
  54. }
  55. func NewAiScheduler(ctx context.Context, val string, scheduler *scheduler.Scheduler, option *option.AiOption) (*AiScheduler, error) {
  56. return &AiScheduler{ctx: ctx, yamlString: val, Scheduler: scheduler, option: option}, nil
  57. }
  58. func (as *AiScheduler) GetNewStructForDb(task *response.TaskInfo, resource string, participantId int64) (interface{}, error) {
  59. ai := models.Ai{
  60. AdapterId: participantId,
  61. TaskId: task.TaskId,
  62. Status: "Saved",
  63. YamlString: as.yamlString,
  64. }
  65. utils.Convert(task.Metadata, &ai)
  66. return ai, nil
  67. }
  68. func (as *AiScheduler) PickOptimalStrategy() (strategy.Strategy, error) {
  69. if as.option.ComputeCard != "" {
  70. m, ok := as.AiService.AiCollectorAdapterMap[as.option.AdapterId]
  71. if ok {
  72. for _, id := range as.option.ClusterIds {
  73. cm, ok := m[id]
  74. if ok {
  75. cards, err := cm.GetComputeCards(as.ctx)
  76. if err != nil {
  77. return nil, err
  78. }
  79. if common.Contains(cards, as.option.ComputeCard) {
  80. return &strategy.SingleAssignment{Cluster: &strategy.AssignedCluster{ClusterId: id, Replicas: 1}}, nil
  81. }
  82. }
  83. }
  84. }
  85. }
  86. if len(as.option.ClusterIds) == 1 {
  87. return &strategy.SingleAssignment{Cluster: &strategy.AssignedCluster{ClusterId: as.option.ClusterIds[0], Replicas: 1}}, nil
  88. }
  89. resources, err := as.findClustersWithResources()
  90. if err != nil {
  91. return nil, err
  92. }
  93. if len(resources) == 0 {
  94. return nil, errors.New("no cluster has resources")
  95. }
  96. if len(resources) == 1 {
  97. var cluster strategy.AssignedCluster
  98. cluster.ClusterId = resources[0].ClusterId
  99. cluster.Replicas = 1
  100. return &strategy.SingleAssignment{Cluster: &cluster}, nil
  101. }
  102. params := &param.Params{Resources: resources}
  103. switch as.option.StrategyName {
  104. case strategy.REPLICATION:
  105. var clusterIds []string
  106. for _, resource := range resources {
  107. if resource == nil {
  108. continue
  109. }
  110. clusterIds = append(clusterIds, resource.ClusterId)
  111. }
  112. strategy := strategy.NewReplicationStrategy(clusterIds, 1)
  113. return strategy, nil
  114. case strategy.RESOURCES_PRICING:
  115. strategy := strategy.NewPricingStrategy(&param.ResourcePricingParams{Params: params, Replicas: 1})
  116. return strategy, nil
  117. case strategy.DYNAMIC_RESOURCES:
  118. strategy := strategy.NewDynamicResourcesStrategy(params.Resources, as.option, 1)
  119. return strategy, nil
  120. case strategy.STATIC_WEIGHT:
  121. //todo resources should match cluster StaticWeightMap
  122. strategy := strategy.NewStaticWeightStrategy(as.option.ClusterToStaticWeight, as.option.Replica)
  123. return strategy, nil
  124. case strategy.RANDOM:
  125. strategy := strategy.NewRandomStrategy(as.option.ClusterIds, as.option.Replica)
  126. return strategy, nil
  127. }
  128. return nil, errors.New("no strategy has been chosen")
  129. }
  130. func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster, mode int) (interface{}, error) {
  131. if clusters == nil {
  132. return nil, errors.New("clusters is nil")
  133. }
  134. for i := len(clusters) - 1; i >= 0; i-- {
  135. if clusters[i].Replicas == 0 {
  136. clusters = append(clusters[:i], clusters[i+1:]...)
  137. }
  138. }
  139. if len(clusters) == 0 {
  140. return nil, errors.New("clusters is nil")
  141. }
  142. var wg sync.WaitGroup
  143. var results []*AiResult
  144. var mu sync.Mutex
  145. var errs []interface{}
  146. var taskNum int32
  147. for _, cluster := range clusters {
  148. taskNum += cluster.Replicas
  149. }
  150. var ch = make(chan *AiResult, taskNum)
  151. var errCh = make(chan interface{}, taskNum)
  152. executorMap := as.AiService.AiExecutorAdapterMap[as.option.AdapterId]
  153. for _, cluster := range clusters {
  154. c := cluster
  155. for i := 0; i < int(c.Replicas); i++ {
  156. wg.Add(1)
  157. go func() {
  158. opt, _ := cloneAiOption(as.option)
  159. // decide opt params by mode
  160. updateAiOptionByMode(c, opt, mode)
  161. resp, err := executorMap[c.ClusterId].Execute(as.ctx, opt, mode)
  162. if err != nil {
  163. e := struct {
  164. err error
  165. clusterId string
  166. }{
  167. err: err,
  168. clusterId: c.ClusterId,
  169. }
  170. errCh <- e
  171. wg.Done()
  172. return
  173. }
  174. result := &AiResult{}
  175. mu.Lock()
  176. result, _ = convertType(resp)
  177. mu.Unlock()
  178. result.AdapterId = opt.AdapterId
  179. result.TaskName = opt.TaskName
  180. result.Replica = c.Replicas
  181. result.ClusterId = c.ClusterId
  182. result.Strategy = as.option.StrategyName
  183. result.Card = opt.ComputeCard
  184. ch <- result
  185. wg.Done()
  186. }()
  187. }
  188. }
  189. wg.Wait()
  190. close(ch)
  191. close(errCh)
  192. for e := range errCh {
  193. errs = append(errs, e)
  194. }
  195. for s := range ch {
  196. results = append(results, s)
  197. }
  198. err := as.handleErrors(errs, clusters, results, mode)
  199. if err != nil {
  200. return nil, err
  201. }
  202. return results, nil
  203. }
  204. func (as *AiScheduler) handleErrors(errs []interface{}, clusters []*strategy.AssignedCluster, results []*AiResult, mode int) error {
  205. if len(errs) != 0 {
  206. var synergystatus int64
  207. if len(clusters) > 1 {
  208. synergystatus = 1
  209. }
  210. var taskId int64
  211. switch mode {
  212. case executor.SUBMIT_MODE_JOINT_CLOUD:
  213. tid, err := as.CreateTask(as.option.TaskName, "", synergystatus, as.option.StrategyName, "", "", "", nil)
  214. if err != nil {
  215. return err
  216. }
  217. taskId = tid
  218. case executor.SUBMIT_MODE_STORAGE_SCHEDULE:
  219. taskId = as.option.TaskId
  220. }
  221. // aiTasks
  222. adapterName, err := as.AiStorages.GetAdapterNameById(as.option.AdapterId)
  223. if err != nil {
  224. return err
  225. }
  226. //report msg
  227. report := &jcs.JobStatusReportReq{
  228. TaskName: "",
  229. TaskID: strconv.FormatInt(taskId, 10),
  230. Messages: make([]*jcs.ReportMessage, 0),
  231. }
  232. var errmsg string
  233. for _, err := range errs {
  234. e := (err).(struct {
  235. err error
  236. clusterId string
  237. })
  238. msg := fmt.Sprintf("clusterId: %v , error: %v \n", e.clusterId, e.err.Error())
  239. errmsg += msg
  240. clusterName, _ := as.AiStorages.GetClusterNameById(e.clusterId)
  241. err := as.AiStorages.SaveAiTask(taskId, as.option, adapterName, e.clusterId, clusterName, "", constants.Failed, msg)
  242. if err != nil {
  243. return errors.New("database add failed: " + err.Error())
  244. }
  245. //add report msg
  246. jobMsg := &jcs.ReportMessage{
  247. Status: false,
  248. Message: msg,
  249. ClusterID: e.clusterId,
  250. Output: "",
  251. }
  252. report.Messages = append(report.Messages, jobMsg)
  253. }
  254. for _, s := range results {
  255. as.option.ComputeCard = s.Card //execute card
  256. clusterName, _ := as.AiStorages.GetClusterNameById(s.ClusterId)
  257. if s.Msg != "" {
  258. msg := fmt.Sprintf("clusterId: %v , error: %v \n", s.ClusterId, s.Msg)
  259. errmsg += msg
  260. err := as.AiStorages.SaveAiTask(taskId, as.option, adapterName, s.ClusterId, clusterName, "", constants.Failed, msg)
  261. if err != nil {
  262. return errors.New("database add failed: " + err.Error())
  263. }
  264. } else {
  265. msg := fmt.Sprintf("clusterId: %v , submitted successfully, jobId: %v \n", s.ClusterId, s.JobId)
  266. errmsg += msg
  267. err := as.AiStorages.SaveAiTask(taskId, as.option, adapterName, s.ClusterId, clusterName, s.JobId, constants.Saved, msg)
  268. if err != nil {
  269. return errors.New("database add failed: " + err.Error())
  270. }
  271. }
  272. //add report msg
  273. jobMsg := &jcs.ReportMessage{
  274. Status: false,
  275. Message: s.Msg,
  276. ClusterID: s.ClusterId,
  277. Output: "",
  278. }
  279. report.Messages = append(report.Messages, jobMsg)
  280. }
  281. //report status
  282. if mode == executor.SUBMIT_MODE_STORAGE_SCHEDULE {
  283. _ = jcs.StatusReport(as.AiService.Conf.JcsMiddleware.JobStatusReportUrl, report)
  284. }
  285. logx.Errorf(errors.New(errmsg).Error())
  286. return errors.New(errmsg)
  287. }
  288. return nil
  289. }
  290. func updateAiOptionByMode(cluster *strategy.AssignedCluster, opt *option.AiOption, mode int) {
  291. switch mode {
  292. case executor.SUBMIT_MODE_STORAGE_SCHEDULE:
  293. opt.Cmd = cluster.Cmd
  294. opt.Envs = cluster.Envs
  295. opt.Params = cluster.Params
  296. opt.ImageId = cluster.ImageId
  297. opt.AlgorithmId = cluster.CodeId
  298. opt.DatasetsId = cluster.DatasetId
  299. opt.ResourcesRequired = cluster.ResourcesRequired
  300. default:
  301. }
  302. }
  303. func (as *AiScheduler) findClustersWithResources() ([]*collector.ResourceStats, error) {
  304. var wg sync.WaitGroup
  305. var clustersNum = len(as.AiService.AiCollectorAdapterMap[as.option.AdapterId])
  306. var ch = make(chan *collector.ResourceStats, clustersNum)
  307. var errCh = make(chan interface{}, clustersNum)
  308. var resourceSpecs []*collector.ResourceStats
  309. var errs []interface{}
  310. for s, resourceCollector := range as.AiService.AiCollectorAdapterMap[as.option.AdapterId] {
  311. wg.Add(1)
  312. rc := resourceCollector
  313. id := s
  314. go func() {
  315. spec, err := rc.GetResourceStats(as.ctx)
  316. if err != nil {
  317. e := struct {
  318. err error
  319. clusterId string
  320. }{
  321. err: err,
  322. clusterId: id,
  323. }
  324. errCh <- e
  325. wg.Done()
  326. return
  327. }
  328. ch <- spec
  329. wg.Done()
  330. }()
  331. }
  332. wg.Wait()
  333. close(ch)
  334. close(errCh)
  335. for s := range ch {
  336. resourceSpecs = append(resourceSpecs, s)
  337. }
  338. for e := range errCh {
  339. errs = append(errs, e)
  340. }
  341. if len(errs) == clustersNum {
  342. return nil, errors.New("get resources failed")
  343. }
  344. if len(errs) != 0 {
  345. var msg string
  346. for _, err := range errs {
  347. e := (err).(struct {
  348. err error
  349. clusterId string
  350. })
  351. msg += fmt.Sprintf("clusterId: %v , error: %v \n", e.clusterId, e.err.Error())
  352. }
  353. //return nil, errors.New(msg)
  354. }
  355. return resourceSpecs, nil
  356. }
  357. func convertType(in interface{}) (*AiResult, error) {
  358. var result AiResult
  359. switch (in).(type) {
  360. case *hpcAC.SubmitTaskAiResp:
  361. resp := (in).(*hpcAC.SubmitTaskAiResp)
  362. if resp.Code == "0" {
  363. result.JobId = resp.Data
  364. } else {
  365. result.Msg = resp.Msg
  366. }
  367. return &result, nil
  368. case *octopus.CreateTrainJobResp:
  369. resp := (in).(*octopus.CreateTrainJobResp)
  370. if resp.Success {
  371. result.JobId = resp.Payload.JobId
  372. } else {
  373. result.Msg = resp.Error.Message
  374. }
  375. return &result, nil
  376. case *modelartsservice.CreateTrainingJobResp:
  377. resp := (in).(*modelartsservice.CreateTrainingJobResp)
  378. if resp.ErrorMsg != "" {
  379. result.Msg = resp.ErrorMsg
  380. } else {
  381. result.JobId = resp.Metadata.Id
  382. }
  383. return &result, nil
  384. case model.CreateTask:
  385. resp := (in).(model.CreateTask)
  386. if resp.Code != 0 {
  387. result.Msg = resp.Msg
  388. } else {
  389. result.JobId = strconv.Itoa(resp.Data.Id)
  390. }
  391. return &result, nil
  392. default:
  393. return nil, errors.New("ai task response failed")
  394. }
  395. }
  396. func cloneAiOption(opt *option.AiOption) (*option.AiOption, error) {
  397. origJSON, err := json.Marshal(opt)
  398. if err != nil {
  399. return nil, err
  400. }
  401. clone := option.AiOption{}
  402. if err = json.Unmarshal(origJSON, &clone); err != nil {
  403. return nil, err
  404. }
  405. return &clone, nil
  406. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive, autonomous, peer-to-peer manner.