You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

aiScheduler.go 12 kB

10 months ago
10 months ago
11 months ago
10 months ago
10 months ago
10 months ago
11 months ago
11 months ago
11 months ago
11 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
10 months ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473
  1. /*
  2. Copyright (c) [2023] [pcm]
  3. [pcm-coordinator] is licensed under Mulan PSL v2.
  4. You can use this software according to the terms and conditions of the Mulan PSL v2.
  5. You may obtain a copy of Mulan PSL v2 at:
  6. http://license.coscl.org.cn/MulanPSL2
  7. THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
  8. EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
  9. MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
  10. See the Mulan PSL v2 for more details.
  11. */
  12. package schedulers
  13. import (
  14. "context"
  15. "encoding/json"
  16. "errors"
  17. "fmt"
  18. "github.com/zeromicro/go-zero/core/logx"
  19. "gitlink.org.cn/JointCloud/pcm-ac/hpcAC"
  20. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler"
  21. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/common"
  22. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
  23. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
  24. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/executor"
  25. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/utils/jcs"
  26. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/strategy"
  27. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/strategy/param"
  28. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
  29. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
  30. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/response"
  31. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
  32. "gitlink.org.cn/JointCloud/pcm-modelarts/client/modelartsservice"
  33. "gitlink.org.cn/JointCloud/pcm-octopus/octopus"
  34. "gitlink.org.cn/JointCloud/pcm-openi/model"
  35. "strconv"
  36. "sync"
  37. )
  38. type AiScheduler struct {
  39. yamlString string
  40. task *response.TaskInfo
  41. *scheduler.Scheduler
  42. option *option.AiOption
  43. ctx context.Context
  44. }
  45. type AiResult struct {
  46. AdapterId string
  47. TaskName string
  48. JobId string
  49. ClusterId string
  50. Strategy string
  51. Replica int32
  52. Card string
  53. Msg string
  54. Output string
  55. }
  56. func NewAiScheduler(ctx context.Context, val string, scheduler *scheduler.Scheduler, option *option.AiOption) (*AiScheduler, error) {
  57. return &AiScheduler{ctx: ctx, yamlString: val, Scheduler: scheduler, option: option}, nil
  58. }
  59. func (as *AiScheduler) GetNewStructForDb(task *response.TaskInfo, resource string, participantId int64) (interface{}, error) {
  60. ai := models.Ai{
  61. AdapterId: participantId,
  62. TaskId: task.TaskId,
  63. Status: "Saved",
  64. YamlString: as.yamlString,
  65. }
  66. utils.Convert(task.Metadata, &ai)
  67. return ai, nil
  68. }
  69. func (as *AiScheduler) PickOptimalStrategy() (strategy.Strategy, error) {
  70. if as.option.ComputeCard != "" {
  71. m, ok := as.AiService.AiCollectorAdapterMap[as.option.AdapterId]
  72. if ok {
  73. for _, id := range as.option.ClusterIds {
  74. cm, ok := m[id]
  75. if ok {
  76. cards, err := cm.GetComputeCards(as.ctx)
  77. if err != nil {
  78. return nil, err
  79. }
  80. if common.Contains(cards, as.option.ComputeCard) {
  81. return &strategy.SingleAssignment{Cluster: &strategy.AssignedCluster{ClusterId: id, Replicas: 1}}, nil
  82. }
  83. }
  84. }
  85. }
  86. }
  87. if len(as.option.ClusterIds) == 1 {
  88. return &strategy.SingleAssignment{Cluster: &strategy.AssignedCluster{ClusterId: as.option.ClusterIds[0], Replicas: 1}}, nil
  89. }
  90. resources, err := as.findClustersWithResources()
  91. if err != nil {
  92. return nil, err
  93. }
  94. if len(resources) == 0 {
  95. return nil, errors.New("no cluster has resources")
  96. }
  97. if len(resources) == 1 {
  98. var cluster strategy.AssignedCluster
  99. cluster.ClusterId = resources[0].ClusterId
  100. cluster.Replicas = 1
  101. return &strategy.SingleAssignment{Cluster: &cluster}, nil
  102. }
  103. params := &param.Params{Resources: resources}
  104. switch as.option.StrategyName {
  105. case strategy.REPLICATION:
  106. var clusterIds []string
  107. for _, resource := range resources {
  108. if resource == nil {
  109. continue
  110. }
  111. clusterIds = append(clusterIds, resource.ClusterId)
  112. }
  113. strategy := strategy.NewReplicationStrategy(clusterIds, 1)
  114. return strategy, nil
  115. case strategy.RESOURCES_PRICING:
  116. strategy := strategy.NewPricingStrategy(&param.ResourcePricingParams{Params: params, Replicas: 1})
  117. return strategy, nil
  118. case strategy.DYNAMIC_RESOURCES:
  119. strategy := strategy.NewDynamicResourcesStrategy(params.Resources, as.option, 1)
  120. return strategy, nil
  121. case strategy.STATIC_WEIGHT:
  122. //todo resources should match cluster StaticWeightMap
  123. strategy := strategy.NewStaticWeightStrategy(as.option.ClusterToStaticWeight, as.option.Replica)
  124. return strategy, nil
  125. case strategy.RANDOM:
  126. strategy := strategy.NewRandomStrategy(as.option.ClusterIds, as.option.Replica)
  127. return strategy, nil
  128. }
  129. return nil, errors.New("no strategy has been chosen")
  130. }
  131. func (as *AiScheduler) AssignTask(clusters []*strategy.AssignedCluster, mode int) (interface{}, error) {
  132. if clusters == nil {
  133. return nil, errors.New("clusters is nil")
  134. }
  135. for i := len(clusters) - 1; i >= 0; i-- {
  136. if clusters[i].Replicas == 0 {
  137. clusters = append(clusters[:i], clusters[i+1:]...)
  138. }
  139. }
  140. if len(clusters) == 0 {
  141. return nil, errors.New("clusters is nil")
  142. }
  143. var wg sync.WaitGroup
  144. var results []*AiResult
  145. var mu sync.Mutex
  146. var errs []interface{}
  147. var taskNum int32
  148. for _, cluster := range clusters {
  149. taskNum += cluster.Replicas
  150. }
  151. var ch = make(chan *AiResult, taskNum)
  152. var errCh = make(chan interface{}, taskNum)
  153. executorMap := as.AiService.AiExecutorAdapterMap[as.option.AdapterId]
  154. for _, cluster := range clusters {
  155. c := cluster
  156. for i := 0; i < int(c.Replicas); i++ {
  157. wg.Add(1)
  158. go func() {
  159. opt, _ := cloneAiOption(as.option)
  160. // decide opt params by mode
  161. updateAiOptionByMode(c, opt, mode)
  162. resp, err := executorMap[c.ClusterId].Execute(as.ctx, opt, mode)
  163. if err != nil {
  164. e := struct {
  165. err error
  166. clusterId string
  167. }{
  168. err: err,
  169. clusterId: c.ClusterId,
  170. }
  171. errCh <- e
  172. wg.Done()
  173. return
  174. }
  175. result := &AiResult{}
  176. mu.Lock()
  177. result, _ = convertType(resp)
  178. mu.Unlock()
  179. result.AdapterId = opt.AdapterId
  180. result.TaskName = opt.TaskName
  181. result.Replica = c.Replicas
  182. result.ClusterId = c.ClusterId
  183. result.Strategy = as.option.StrategyName
  184. result.Card = opt.ComputeCard
  185. result.Output = opt.Output
  186. ch <- result
  187. wg.Done()
  188. }()
  189. }
  190. }
  191. wg.Wait()
  192. close(ch)
  193. close(errCh)
  194. for e := range errCh {
  195. errs = append(errs, e)
  196. }
  197. for s := range ch {
  198. results = append(results, s)
  199. }
  200. err := as.handleErrors(errs, clusters, results, mode)
  201. if err != nil {
  202. return nil, err
  203. }
  204. return results, nil
  205. }
  206. func (as *AiScheduler) handleErrors(errs []interface{}, clusters []*strategy.AssignedCluster, results []*AiResult, mode int) error {
  207. if len(errs) != 0 {
  208. var synergystatus int64
  209. if len(clusters) > 1 {
  210. synergystatus = 1
  211. }
  212. var taskId int64
  213. switch mode {
  214. case executor.SUBMIT_MODE_JOINT_CLOUD:
  215. tid, err := as.CreateTask(as.option.TaskName, "", synergystatus, as.option.StrategyName, "", "", "", nil)
  216. if err != nil {
  217. return err
  218. }
  219. taskId = tid
  220. case executor.SUBMIT_MODE_STORAGE_SCHEDULE:
  221. taskId = as.option.TaskId
  222. }
  223. // aiTasks
  224. adapterName, err := as.AiStorages.GetAdapterNameById(as.option.AdapterId)
  225. if err != nil {
  226. return err
  227. }
  228. //report msg
  229. report := &jcs.JobStatusReportReq{
  230. TaskName: "",
  231. TaskID: strconv.FormatInt(taskId, 10),
  232. Messages: make([]*jcs.ReportMessage, 0),
  233. }
  234. var errmsg string
  235. for _, err := range errs {
  236. e := (err).(struct {
  237. err error
  238. clusterId string
  239. })
  240. msg := fmt.Sprintf("clusterId: %v , error: %v \n", e.clusterId, e.err.Error())
  241. errmsg += msg
  242. clusterName, _ := as.AiStorages.GetClusterNameById(e.clusterId)
  243. err := as.AiStorages.SaveAiTask(taskId, as.option, adapterName, e.clusterId, clusterName, "", constants.Failed, msg)
  244. if err != nil {
  245. return errors.New("database add failed: " + err.Error())
  246. }
  247. //add report msg
  248. jobMsg := &jcs.ReportMessage{
  249. Status: false,
  250. Message: msg,
  251. ClusterID: e.clusterId,
  252. Output: "",
  253. }
  254. report.Messages = append(report.Messages, jobMsg)
  255. }
  256. for _, s := range results {
  257. as.option.ComputeCard = s.Card //execute card
  258. clusterName, _ := as.AiStorages.GetClusterNameById(s.ClusterId)
  259. if s.Msg != "" {
  260. msg := fmt.Sprintf("clusterId: %v , error: %v \n", s.ClusterId, s.Msg)
  261. errmsg += msg
  262. err := as.AiStorages.SaveAiTask(taskId, as.option, adapterName, s.ClusterId, clusterName, "", constants.Failed, msg)
  263. if err != nil {
  264. return errors.New("database add failed: " + err.Error())
  265. }
  266. } else {
  267. msg := fmt.Sprintf("clusterId: %v , submitted successfully, jobId: %v \n", s.ClusterId, s.JobId)
  268. errmsg += msg
  269. err := as.AiStorages.SaveAiTask(taskId, as.option, adapterName, s.ClusterId, clusterName, s.JobId, constants.Saved, msg)
  270. if err != nil {
  271. return errors.New("database add failed: " + err.Error())
  272. }
  273. }
  274. //add report msg
  275. jobMsg := &jcs.ReportMessage{
  276. Status: false,
  277. Message: s.Msg,
  278. ClusterID: s.ClusterId,
  279. Output: "",
  280. }
  281. report.Messages = append(report.Messages, jobMsg)
  282. }
  283. //report status
  284. _ = jcs.StatusReport(as.AiService.Conf.JcsMiddleware.JobStatusReportUrl, report)
  285. logx.Errorf(errors.New(errmsg).Error())
  286. return errors.New(errmsg)
  287. }
  288. return nil
  289. }
  290. func updateAiOptionByMode(cluster *strategy.AssignedCluster, opt *option.AiOption, mode int) {
  291. switch mode {
  292. case executor.SUBMIT_MODE_STORAGE_SCHEDULE:
  293. opt.Cmd = cluster.Cmd
  294. opt.Envs = cluster.Envs
  295. opt.Params = cluster.Params
  296. opt.ImageId = cluster.ImageId
  297. opt.AlgorithmId = cluster.CodeId
  298. opt.DatasetsId = cluster.DatasetId
  299. opt.ResourcesRequired = cluster.ResourcesRequired
  300. opt.Output = cluster.Output
  301. default:
  302. }
  303. }
  304. func (as *AiScheduler) findClustersWithResources() ([]*collector.ResourceStats, error) {
  305. var wg sync.WaitGroup
  306. var clustersNum = len(as.AiService.AiCollectorAdapterMap[as.option.AdapterId])
  307. var ch = make(chan *collector.ResourceStats, clustersNum)
  308. var errCh = make(chan interface{}, clustersNum)
  309. var resourceSpecs []*collector.ResourceStats
  310. var errs []interface{}
  311. for s, resourceCollector := range as.AiService.AiCollectorAdapterMap[as.option.AdapterId] {
  312. wg.Add(1)
  313. rc := resourceCollector
  314. id := s
  315. go func() {
  316. spec, err := rc.GetResourceStats(as.ctx)
  317. if err != nil {
  318. e := struct {
  319. err error
  320. clusterId string
  321. }{
  322. err: err,
  323. clusterId: id,
  324. }
  325. errCh <- e
  326. wg.Done()
  327. return
  328. }
  329. ch <- spec
  330. wg.Done()
  331. }()
  332. }
  333. wg.Wait()
  334. close(ch)
  335. close(errCh)
  336. for s := range ch {
  337. resourceSpecs = append(resourceSpecs, s)
  338. }
  339. for e := range errCh {
  340. errs = append(errs, e)
  341. }
  342. if len(errs) == clustersNum {
  343. return nil, errors.New("get resources failed")
  344. }
  345. if len(errs) != 0 {
  346. var msg string
  347. for _, err := range errs {
  348. e := (err).(struct {
  349. err error
  350. clusterId string
  351. })
  352. msg += fmt.Sprintf("clusterId: %v , error: %v \n", e.clusterId, e.err.Error())
  353. }
  354. //return nil, errors.New(msg)
  355. }
  356. return resourceSpecs, nil
  357. }
  358. func convertType(in interface{}) (*AiResult, error) {
  359. var result AiResult
  360. switch (in).(type) {
  361. case *hpcAC.SubmitTaskAiResp:
  362. resp := (in).(*hpcAC.SubmitTaskAiResp)
  363. if resp.Code == "0" {
  364. result.JobId = resp.Data
  365. } else {
  366. result.Msg = resp.Msg
  367. }
  368. return &result, nil
  369. case *octopus.CreateTrainJobResp:
  370. resp := (in).(*octopus.CreateTrainJobResp)
  371. if resp.Success {
  372. result.JobId = resp.Payload.JobId
  373. } else {
  374. result.Msg = resp.Error.Message
  375. }
  376. return &result, nil
  377. case *modelartsservice.CreateTrainingJobResp:
  378. resp := (in).(*modelartsservice.CreateTrainingJobResp)
  379. if resp.ErrorMsg != "" {
  380. result.Msg = resp.ErrorMsg
  381. } else {
  382. result.JobId = resp.Metadata.Id
  383. }
  384. return &result, nil
  385. case model.CreateTask:
  386. resp := (in).(model.CreateTask)
  387. if resp.Code != 0 {
  388. result.Msg = resp.Msg
  389. } else {
  390. result.JobId = strconv.Itoa(resp.Data.Id)
  391. }
  392. return &result, nil
  393. default:
  394. return nil, errors.New("ai task response failed")
  395. }
  396. }
  397. func cloneAiOption(opt *option.AiOption) (*option.AiOption, error) {
  398. origJSON, err := json.Marshal(opt)
  399. if err != nil {
  400. return nil, err
  401. }
  402. clone := option.AiOption{}
  403. if err = json.Unmarshal(origJSON, &clone); err != nil {
  404. return nil, err
  405. }
  406. return &clone, nil
  407. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive and autonomous peer-to-peer manner.