You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

startallbydeploytaskidlogic.go 3.6 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146
  1. package inference
  2. import (
  3. "context"
  4. "errors"
  5. "fmt"
  6. "github.com/zeromicro/go-zero/core/logx"
  7. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/utils/status"
  8. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
  9. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
  10. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
  11. "strconv"
  12. "sync"
  13. )
  14. type StartAllByDeployTaskIdLogic struct {
  15. logx.Logger
  16. ctx context.Context
  17. svcCtx *svc.ServiceContext
  18. }
  19. func NewStartAllByDeployTaskIdLogic(ctx context.Context, svcCtx *svc.ServiceContext) *StartAllByDeployTaskIdLogic {
  20. return &StartAllByDeployTaskIdLogic{
  21. Logger: logx.WithContext(ctx),
  22. ctx: ctx,
  23. svcCtx: svcCtx,
  24. }
  25. }
  26. func (l *StartAllByDeployTaskIdLogic) StartAllByDeployTaskId(req *types.StartAllByDeployTaskIdReq) (resp *types.StartAllByDeployTaskIdResp, err error) {
  27. resp = &types.StartAllByDeployTaskIdResp{}
  28. id, err := strconv.ParseInt(req.Id, 10, 64)
  29. list, err := l.svcCtx.Scheduler.AiStorages.GetInstanceListByDeployTaskId(id)
  30. if err != nil {
  31. return nil, err
  32. }
  33. if len(list) == 0 {
  34. return nil, errors.New("instances are empty")
  35. }
  36. err = l.svcCtx.Scheduler.AiStorages.UpdateDeployTaskById(id)
  37. if err != nil {
  38. return nil, err
  39. }
  40. err = l.startAll(list)
  41. if err != nil {
  42. return nil, err
  43. }
  44. return resp, nil
  45. }
  46. func (l *StartAllByDeployTaskIdLogic) startAll(list []*models.AiInferDeployInstance) error {
  47. var wg sync.WaitGroup
  48. var errCh = make(chan interface{}, len(list))
  49. var errs []interface{}
  50. buf := make(chan bool, 2)
  51. for _, instance := range list {
  52. wg.Add(1)
  53. ins := instance
  54. buf <- true
  55. go func() {
  56. in, err := l.svcCtx.Scheduler.AiService.InferenceAdapterMap[strconv.FormatInt(ins.AdapterId, 10)][strconv.FormatInt(ins.ClusterId, 10)].GetInferDeployInstance(l.ctx, ins.InstanceId)
  57. if err != nil {
  58. e := struct {
  59. errTyp uint8
  60. err error
  61. instanceName string
  62. clusterName string
  63. }{
  64. errTyp: 1,
  65. err: err,
  66. instanceName: ins.InstanceName,
  67. clusterName: ins.ClusterName,
  68. }
  69. errCh <- e
  70. wg.Done()
  71. <-buf
  72. return
  73. }
  74. if status.CheckStopStatus(in) {
  75. success := l.svcCtx.Scheduler.AiService.InferenceAdapterMap[strconv.FormatInt(ins.AdapterId, 10)][strconv.FormatInt(ins.ClusterId, 10)].StartInferDeployInstance(l.ctx, ins.InstanceId)
  76. if !success {
  77. e := struct {
  78. errTyp uint8
  79. err error
  80. instanceName string
  81. clusterName string
  82. }{
  83. errTyp: 2,
  84. err: err,
  85. instanceName: ins.InstanceName,
  86. clusterName: ins.ClusterName,
  87. }
  88. errCh <- e
  89. wg.Done()
  90. <-buf
  91. return
  92. }
  93. }
  94. ins.Status = "Updating"
  95. err = l.svcCtx.Scheduler.AiStorages.UpdateInferDeployInstance(ins, true)
  96. if err != nil {
  97. wg.Done()
  98. <-buf
  99. return
  100. }
  101. wg.Done()
  102. <-buf
  103. }()
  104. }
  105. wg.Wait()
  106. close(errCh)
  107. for e := range errCh {
  108. errs = append(errs, e)
  109. }
  110. if len(errs) != 0 {
  111. var msg string
  112. for _, err := range errs {
  113. e := (err).(struct {
  114. errTyp uint8
  115. err error
  116. instanceName string
  117. clusterName string
  118. })
  119. switch e.errTyp {
  120. case 1:
  121. msg += fmt.Sprintf("GetInstance Failed # clusterName: %v , instanceName: %v , error: %v \n", e.clusterName, e.instanceName, e.err.Error())
  122. case 2:
  123. msg += fmt.Sprintf("StartInstance Failed # clusterName: %v , instanceName: %v , error: %v \n", e.clusterName, e.instanceName, e.err.Error())
  124. }
  125. }
  126. return errors.New(msg)
  127. }
  128. return nil
  129. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive, autonomous, peer-to-peer manner.