You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

stopallbydeploytaskidlogic.go 3.5 kB

4 months ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146
  1. package inference
  2. import (
  3. "context"
  4. "errors"
  5. "fmt"
  6. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
  7. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
  8. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
  9. "strconv"
  10. "sync"
  11. "github.com/zeromicro/go-zero/core/logx"
  12. )
  13. type StopAllByDeployTaskIdLogic struct {
  14. logx.Logger
  15. ctx context.Context
  16. svcCtx *svc.ServiceContext
  17. }
  18. func NewStopAllByDeployTaskIdLogic(ctx context.Context, svcCtx *svc.ServiceContext) *StopAllByDeployTaskIdLogic {
  19. return &StopAllByDeployTaskIdLogic{
  20. Logger: logx.WithContext(ctx),
  21. ctx: ctx,
  22. svcCtx: svcCtx,
  23. }
  24. }
  25. func (l *StopAllByDeployTaskIdLogic) StopAllByDeployTaskId(req *types.StopAllByDeployTaskIdReq) (resp *types.StopAllByDeployTaskIdResp, err error) {
  26. resp = &types.StopAllByDeployTaskIdResp{}
  27. id, err := strconv.ParseInt(req.Id, 10, 64)
  28. list, err := l.svcCtx.Scheduler.AiStorages.GetInstanceListByDeployTaskId(id)
  29. if err != nil {
  30. return nil, err
  31. }
  32. if len(list) == 0 {
  33. return nil, errors.New("instances are empty")
  34. }
  35. err = l.svcCtx.Scheduler.AiStorages.UpdateDeployTaskById(id)
  36. if err != nil {
  37. return nil, err
  38. }
  39. err = l.stopAll(list)
  40. if err != nil {
  41. return nil, err
  42. }
  43. return resp, nil
  44. }
  45. func (l *StopAllByDeployTaskIdLogic) stopAll(list []*models.AiInferDeployInstance) error {
  46. var wg sync.WaitGroup
  47. var errCh = make(chan interface{}, len(list))
  48. var errs []interface{}
  49. buf := make(chan bool, 2)
  50. for _, instance := range list {
  51. wg.Add(1)
  52. ins := instance
  53. buf <- true
  54. go func() {
  55. in, err := l.svcCtx.Scheduler.AiService.InferenceAdapterMap[strconv.FormatInt(ins.AdapterId, 10)][strconv.FormatInt(ins.ClusterId, 10)].GetInferDeployInstance(l.ctx, ins.InstanceId)
  56. if err != nil {
  57. e := struct {
  58. errTyp uint8
  59. err error
  60. instanceName string
  61. clusterName string
  62. }{
  63. errTyp: 1,
  64. err: err,
  65. instanceName: ins.InstanceName,
  66. clusterName: ins.ClusterName,
  67. }
  68. errCh <- e
  69. wg.Done()
  70. <-buf
  71. return
  72. }
  73. if l.svcCtx.Scheduler.AiService.Si.CheckRunningStatus(in) {
  74. success := l.svcCtx.Scheduler.AiService.InferenceAdapterMap[strconv.FormatInt(ins.AdapterId, 10)][strconv.FormatInt(ins.ClusterId, 10)].StopInferDeployInstance(l.ctx, ins.InstanceId)
  75. if !success {
  76. e := struct {
  77. errTyp uint8
  78. err error
  79. instanceName string
  80. clusterName string
  81. }{
  82. errTyp: 2,
  83. err: err,
  84. instanceName: ins.InstanceName,
  85. clusterName: ins.ClusterName,
  86. }
  87. errCh <- e
  88. wg.Done()
  89. <-buf
  90. return
  91. }
  92. }
  93. ins.Status = "Updating"
  94. err = l.svcCtx.Scheduler.AiStorages.UpdateInferDeployInstance(ins, true)
  95. if err != nil {
  96. wg.Done()
  97. <-buf
  98. return
  99. }
  100. wg.Done()
  101. <-buf
  102. }()
  103. }
  104. wg.Wait()
  105. close(errCh)
  106. for e := range errCh {
  107. errs = append(errs, e)
  108. }
  109. if len(errs) != 0 {
  110. var msg string
  111. for _, err := range errs {
  112. e := (err).(struct {
  113. errTyp uint8
  114. err error
  115. instanceName string
  116. clusterName string
  117. })
  118. switch e.errTyp {
  119. case 1:
  120. msg += fmt.Sprintf("GetInstance Failed # clusterName: %v , instanceName: %v , error: %v \n", e.clusterName, e.instanceName, e.err.Error())
  121. case 2:
  122. msg += fmt.Sprintf("StopInstance Failed # clusterName: %v , instanceName: %v , error: %v \n", e.clusterName, e.instanceName, e.err.Error())
  123. }
  124. }
  125. return errors.New(msg)
  126. }
  127. return nil
  128. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive, autonomous, peer-to-peer manner.