You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

queryresourceslogic.go 5.1 kB

11 months ago
11 months ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226
  1. package schedule
  2. import (
  3. "context"
  4. "errors"
  5. "github.com/zeromicro/go-zero/core/logx"
  6. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
  7. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/storeLink"
  8. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
  9. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
  10. "strconv"
  11. "strings"
  12. "sync"
  13. "time"
  14. )
  15. const (
  16. ADAPTERID = "1777144940459986944" // 异构适配器id
  17. QUERY_TRAIN_RESOURCES = "train_resources"
  18. QUERY_INFERENCE_RESOURCES = "inference_resources"
  19. )
  20. type QueryResourcesLogic struct {
  21. logx.Logger
  22. ctx context.Context
  23. svcCtx *svc.ServiceContext
  24. }
  25. func NewQueryResourcesLogic(ctx context.Context, svcCtx *svc.ServiceContext) *QueryResourcesLogic {
  26. return &QueryResourcesLogic{
  27. Logger: logx.WithContext(ctx),
  28. ctx: ctx,
  29. svcCtx: svcCtx,
  30. }
  31. }
  32. func (l *QueryResourcesLogic) QueryResources(req *types.QueryResourcesReq) (resp *types.QueryResourcesResp, err error) {
  33. resp = &types.QueryResourcesResp{}
  34. if len(req.ClusterIDs) == 0 {
  35. cs, err := l.svcCtx.Scheduler.AiStorages.GetClustersByAdapterId(ADAPTERID)
  36. if err != nil {
  37. return nil, err
  38. }
  39. var resources interface{}
  40. switch req.Type {
  41. case "Train":
  42. resources, _ = l.svcCtx.Scheduler.AiService.LocalCache[QUERY_TRAIN_RESOURCES]
  43. case "Inference":
  44. resources, _ = l.svcCtx.Scheduler.AiService.LocalCache[QUERY_INFERENCE_RESOURCES]
  45. default:
  46. resources, _ = l.svcCtx.Scheduler.AiService.LocalCache[QUERY_TRAIN_RESOURCES]
  47. }
  48. specs, ok := resources.([]*collector.ResourceSpec)
  49. if ok {
  50. results := handleEmptyResourceUsage(cs.List, specs)
  51. resp.Data = results
  52. return resp, nil
  53. }
  54. rus, err := l.QueryResourcesByClusterId(cs.List, req.Type)
  55. if err != nil {
  56. return nil, err
  57. }
  58. results := handleEmptyResourceUsage(cs.List, rus)
  59. resp.Data = results
  60. } else {
  61. var clusters []types.ClusterInfo
  62. for _, id := range req.ClusterIDs {
  63. cluster, err := l.svcCtx.Scheduler.AiStorages.GetClustersById(id)
  64. if err != nil {
  65. return nil, err
  66. }
  67. clusters = append(clusters, *cluster)
  68. }
  69. if len(clusters) == 0 {
  70. return nil, errors.New("no clusters found ")
  71. }
  72. rus, err := l.QueryResourcesByClusterId(clusters, req.Type)
  73. if err != nil {
  74. return nil, err
  75. }
  76. results := handleEmptyResourceUsage(clusters, rus)
  77. resp.Data = results
  78. }
  79. return resp, nil
  80. }
  81. func (l *QueryResourcesLogic) QueryResourcesByClusterId(clusterinfos []types.ClusterInfo, resrcType string) ([]*collector.ResourceSpec, error) {
  82. var clusters []types.ClusterInfo
  83. if len(clusterinfos) == 0 {
  84. cs, err := l.svcCtx.Scheduler.AiStorages.GetClustersByAdapterId(ADAPTERID)
  85. if err != nil {
  86. return nil, err
  87. }
  88. clusters = cs.List
  89. } else {
  90. clusters = clusterinfos
  91. }
  92. var ulist []*collector.ResourceSpec
  93. var ch = make(chan *collector.ResourceSpec, len(clusters))
  94. var wg sync.WaitGroup
  95. for _, cluster := range clusters {
  96. wg.Add(1)
  97. c := cluster
  98. go func() {
  99. defer wg.Done()
  100. done := make(chan bool)
  101. var u *collector.ResourceSpec
  102. var err error
  103. go func() {
  104. col, found := l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[strconv.FormatInt(c.AdapterId, 10)][c.Id]
  105. if !found {
  106. done <- true
  107. return
  108. }
  109. u, err = col.GetResourceSpecs(l.ctx, resrcType)
  110. if err != nil {
  111. logx.Error(err)
  112. done <- true
  113. return
  114. }
  115. done <- true
  116. }()
  117. select {
  118. case <-done:
  119. if u != nil {
  120. ch <- u
  121. }
  122. case <-time.After(10 * time.Second):
  123. return
  124. }
  125. }()
  126. }
  127. wg.Wait()
  128. close(ch)
  129. for v := range ch {
  130. ulist = append(ulist, v)
  131. }
  132. return ulist, nil
  133. }
  134. func handleEmptyResourceUsage(list []types.ClusterInfo, ulist []*collector.ResourceSpec) []*collector.ResourceSpec {
  135. var rus []*collector.ResourceSpec
  136. m := make(map[string]interface{})
  137. for _, u := range ulist {
  138. if u == nil {
  139. continue
  140. }
  141. m[u.ClusterId] = u
  142. }
  143. for _, l := range list {
  144. s, ok := m[l.Id]
  145. if !ok {
  146. ru := &collector.ResourceSpec{
  147. ClusterId: l.Id,
  148. Resources: nil,
  149. Msg: "resources unavailable, please retry later",
  150. }
  151. rus = append(rus, ru)
  152. } else {
  153. if s == nil {
  154. ru := &collector.ResourceSpec{
  155. ClusterId: l.Id,
  156. Resources: nil,
  157. Msg: "resources unavailable, please retry later",
  158. }
  159. rus = append(rus, ru)
  160. } else {
  161. r, ok := s.(*collector.ResourceSpec)
  162. if ok {
  163. if r.Resources == nil || len(r.Resources) == 0 {
  164. ru := &collector.ResourceSpec{
  165. ClusterId: r.ClusterId,
  166. Resources: nil,
  167. Msg: "resources unavailable, please retry later",
  168. }
  169. rus = append(rus, ru)
  170. } else {
  171. // add cluster type
  172. t, ok := storeLink.ClusterTypeMap[strings.Title(l.Name)]
  173. if ok {
  174. r.ClusterType = t
  175. }
  176. rus = append(rus, r)
  177. }
  178. }
  179. }
  180. }
  181. }
  182. return rus
  183. }
  184. func checkCachingCondition(clusters []types.ClusterInfo, specs []*collector.ResourceSpec) bool {
  185. var count int
  186. for _, spec := range specs {
  187. if spec.Resources != nil {
  188. count++
  189. }
  190. }
  191. if count == len(clusters) {
  192. return true
  193. }
  194. return false
  195. }

PCM is positioned as a Software stack over Cloud, aiming to build the standards and ecology of heterogeneous cloud collaboration for JCC in a non-intrusive and autonomous peer-to-peer manner.