You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

createinferencetasklogic.go 8.7 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341
  1. package inference
  2. import (
  3. "context"
  4. "encoding/json"
  5. "errors"
  6. "fmt"
  7. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/entity"
  8. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
  9. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/utils/task"
  10. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/strategy"
  11. "strconv"
  12. "strings"
  13. "sync"
  14. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc"
  15. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
  16. "github.com/zeromicro/go-zero/core/logx"
  17. )
  // CreateInferenceTaskLogic holds the per-request state used to create an
  // inference task: a context-bound logger, the request context, and the
  // service context.
  18. type CreateInferenceTaskLogic struct {
  // Embedded logger; the constructor derives it from the request context.
  19. logx.Logger
  // ctx is the request context; it is forwarded to the cluster deploy calls.
  20. ctx context.Context
  // svcCtx exposes the scheduler's AiService and AiStorages used by the methods.
  21. svcCtx *svc.ServiceContext
  22. }
  23. func NewCreateInferenceTaskLogic(ctx context.Context, svcCtx *svc.ServiceContext) *CreateInferenceTaskLogic {
  24. return &CreateInferenceTaskLogic{
  25. Logger: logx.WithContext(ctx),
  26. ctx: ctx,
  27. svcCtx: svcCtx,
  28. }
  29. }
  // AdapterId is the fixed adapter ID under which CreateInferenceTask groups
  // every assigned cluster.
  // NOTE(review): hard-coded identifier; presumably it matches an adapter record
  // in the deployment's storage — confirm before reusing it elsewhere.
  30. const AdapterId = "1777144940459986944"
  31. func (l *CreateInferenceTaskLogic) CreateInferenceTask(req *types.CreateInferenceTaskReq) (resp *types.CreateInferenceTaskResp, err error) {
  32. err = task.ValidateJobResources(req.JobResources, "inference")
  33. if err != nil {
  34. return nil, err
  35. }
  36. clusters, err := generateClustersForTaskCreation(req.DataDistributes, req.Name)
  37. if err != nil {
  38. return nil, err
  39. }
  40. modelName, err := generateModelName(clusters)
  41. if err != nil {
  42. return nil, err
  43. }
  44. taskName, err := l.svcCtx.Scheduler.AiService.HandleDuplicateTaskName(req.Name)
  45. if err != nil {
  46. return nil, err
  47. }
  48. assignedClusters := task.CopyParams(clusters, req.JobResources.Clusters, "inference")
  49. opt := &option.InferOption{
  50. TaskName: taskName,
  51. TaskDesc: req.Description,
  52. ModelType: "",
  53. ModelName: modelName,
  54. Cmd: "",
  55. }
  56. taskId, err := l.svcCtx.Scheduler.AiStorages.SaveInferDeployTask(taskName, modelName, "", req.Description)
  57. if err != nil {
  58. return nil, err
  59. }
  60. adapterClusterMap := make(map[string][]*strategy.AssignedCluster)
  61. adapterClusterMap[AdapterId] = assignedClusters
  62. err = l.createInferenceTask(taskId, adapterClusterMap, opt)
  63. if err != nil {
  64. return nil, err
  65. }
  66. return
  67. }
  68. func (l *CreateInferenceTaskLogic) createInferenceTask(taskId int64, adapterClusterMap map[string][]*strategy.AssignedCluster, option *option.InferOption) error {
  69. var clusterlen int
  70. for _, c := range adapterClusterMap {
  71. clusterlen += len(c)
  72. }
  73. var errCh = make(chan interface{}, clusterlen)
  74. var errs []interface{}
  75. buf := make(chan bool, 2)
  76. var wg sync.WaitGroup
  77. for aid, v := range adapterClusterMap {
  78. for _, c := range v {
  79. wg.Add(1)
  80. cluster := c
  81. buf <- true
  82. go func() {
  83. opt, _ := cloneOption(option)
  84. updateInferOption(cluster, opt)
  85. err := l.createDeployInstance(taskId, aid, cluster.ClusterId, opt)
  86. if err != nil {
  87. e := struct {
  88. err error
  89. clusterId string
  90. }{
  91. err: err,
  92. clusterId: cluster.ClusterId,
  93. }
  94. errCh <- e
  95. wg.Done()
  96. <-buf
  97. return
  98. }
  99. wg.Done()
  100. <-buf
  101. }()
  102. }
  103. }
  104. wg.Wait()
  105. close(errCh)
  106. for e := range errCh {
  107. errs = append(errs, e)
  108. }
  109. if len(errs) != 0 {
  110. var msg string
  111. for _, err := range errs {
  112. e := (err).(struct {
  113. err error
  114. clusterId string
  115. })
  116. clusterName, err := l.svcCtx.Scheduler.AiStorages.GetClusterNameById(e.clusterId)
  117. if err != nil {
  118. clusterName = e.clusterId
  119. }
  120. msg += fmt.Sprintf("CreateInstance Failed # clusterName: %v, error: %v \n", clusterName, e.err.Error())
  121. }
  122. return errors.New(msg)
  123. }
  124. return nil
  125. }
  126. func updateInferOption(cluster *strategy.AssignedCluster, opt *option.InferOption) {
  127. opt.Cmd = cluster.Cmd
  128. opt.Envs = cluster.Envs
  129. opt.Params = cluster.Params
  130. opt.ImageId = cluster.ImageId
  131. opt.AlgorithmId = cluster.CodeId
  132. opt.ModelID = cluster.ModelId
  133. opt.ResourcesRequired = cluster.ResourcesRequired
  134. opt.Output = cluster.Output
  135. }
  136. func generateClustersForTaskCreation(distributes types.DataDistribute, taskName string) ([]*strategy.AssignedCluster, error) {
  137. var assignedClusters []*strategy.AssignedCluster
  138. clusterMap := make(map[string]*strategy.AssignedCluster)
  139. for _, distribute := range distributes.Model {
  140. if len(distribute.Clusters) == 0 {
  141. return nil, fmt.Errorf("Model distribute: must specify at least one cluster")
  142. }
  143. for _, c := range distribute.Clusters {
  144. if c.ClusterID == "" {
  145. return nil, fmt.Errorf("Model distribute: clusterId can not be empty")
  146. }
  147. cluster := &strategy.AssignedCluster{}
  148. cluster.ClusterId = c.ClusterID
  149. jsonData := entity.JsonData{}
  150. err := json.Unmarshal([]byte(c.JsonData), &jsonData)
  151. if err != nil {
  152. return nil, fmt.Errorf("jsonData convert failed, task %d, cluster %s, datatype: %s", taskName, c.ClusterID, "Model")
  153. }
  154. if jsonData.Id == "" {
  155. continue
  156. }
  157. cluster.ModelId = jsonData.Id
  158. cluster.ModelName = jsonData.Name
  159. clusterMap[c.ClusterID] = cluster
  160. }
  161. }
  162. for _, distribute := range distributes.Code {
  163. if len(distribute.Clusters) == 0 {
  164. return nil, fmt.Errorf("Code distribute: must specify at least one cluster")
  165. }
  166. for _, c := range distribute.Clusters {
  167. if c.ClusterID == "" {
  168. return nil, fmt.Errorf("Code distribute: clusterId can not be empty")
  169. }
  170. jsonData := entity.JsonData{}
  171. err := json.Unmarshal([]byte(c.JsonData), &jsonData)
  172. if err != nil {
  173. return nil, fmt.Errorf("jsonData convert failed, task %d, cluster %s, datatype: %s", taskName, c.ClusterID, "Code")
  174. }
  175. if jsonData.Id == "" {
  176. continue
  177. }
  178. cluster, ok := clusterMap[c.ClusterID]
  179. if ok {
  180. cluster.CodeId = jsonData.Id
  181. }
  182. }
  183. }
  184. for _, distribute := range distributes.Image {
  185. if len(distribute.Clusters) == 0 {
  186. return nil, fmt.Errorf("Image distribute: must specify at least one cluster")
  187. }
  188. for _, c := range distribute.Clusters {
  189. if c.ClusterID == "" {
  190. return nil, fmt.Errorf("Image distribute: clusterId can not be empty")
  191. }
  192. jsonData := entity.JsonData{}
  193. err := json.Unmarshal([]byte(c.JsonData), &jsonData)
  194. if err != nil {
  195. return nil, fmt.Errorf("jsonData convert failed, task %d, cluster %s, datatype: %s", taskName, c.ClusterID, "Image")
  196. }
  197. cluster, ok := clusterMap[c.ClusterID]
  198. if ok {
  199. cluster.ImageId = jsonData.Id
  200. }
  201. }
  202. }
  203. for _, c := range clusterMap {
  204. if c.ModelId == "" {
  205. return nil, fmt.Errorf("create inference task failed, cluster %s, empty data : %s", c.ClusterId, "ModelId")
  206. }
  207. if c.CodeId == "" {
  208. return nil, fmt.Errorf("create inference task failed, cluster %s, empty data : %s", c.ClusterId, "CodeId")
  209. }
  210. if c.ImageId == "" {
  211. return nil, fmt.Errorf("create inference task failed, cluster %s, empty data : %s", c.ClusterId, "ImageId")
  212. }
  213. assignedClusters = append(assignedClusters, c)
  214. }
  215. if len(assignedClusters) == 0 {
  216. return nil, fmt.Errorf("no model provided")
  217. }
  218. return assignedClusters, nil
  219. }
  220. func generateModelName(clusters []*strategy.AssignedCluster) (string, error) {
  221. if len(clusters) == 1 {
  222. return clusters[0].ModelName, nil
  223. }
  224. var modelName string
  225. for _, c := range clusters {
  226. modelName += c.ModelName + ","
  227. }
  228. modelName = strings.TrimSuffix(modelName, ",")
  229. return modelName, nil
  230. }
  231. func (l *CreateInferenceTaskLogic) createDeployInstance(taskId int64, adapterId string, clusterId string, opt *option.InferOption) error {
  232. cmap, found := l.svcCtx.Scheduler.AiService.InferenceAdapterMap[adapterId]
  233. if !found {
  234. return errors.New("adapterId not exist: " + adapterId)
  235. }
  236. iCluster, found := cmap[clusterId]
  237. if !found {
  238. return errors.New("clusterId not exist: " + clusterId)
  239. }
  240. insId, err := iCluster.CreateInferDeployInstance(l.ctx, opt)
  241. if err != nil {
  242. return err
  243. }
  244. aid, err := strconv.ParseInt(adapterId, 10, 64)
  245. if err != nil {
  246. return err
  247. }
  248. cid, err := strconv.ParseInt(clusterId, 10, 64)
  249. if err != nil {
  250. return err
  251. }
  252. adapterName, err := l.svcCtx.Scheduler.AiStorages.GetAdapterNameById(adapterId)
  253. if err != nil {
  254. return err
  255. }
  256. clusterName, err := l.svcCtx.Scheduler.AiStorages.GetClusterNameById(clusterId)
  257. if err != nil {
  258. return err
  259. }
  260. ins, err := iCluster.GetInferDeployInstance(l.ctx, insId)
  261. if err != nil {
  262. return err
  263. }
  264. _, err = l.svcCtx.Scheduler.AiStorages.SaveInferDeployInstance(taskId, ins.InstanceId, ins.InstanceName, aid, adapterName, cid, clusterName, ins.ModelName, ins.ModelType, ins.InferCard, ins.ClusterType)
  265. if err != nil {
  266. return err
  267. }
  268. return nil
  269. }
  270. func cloneOption(opt *option.InferOption) (*option.InferOption, error) {
  271. origJSON, err := json.Marshal(opt)
  272. if err != nil {
  273. return nil, err
  274. }
  275. clone := option.InferOption{}
  276. if err = json.Unmarshal(origJSON, &clone); err != nil {
  277. return nil, err
  278. }
  279. return &clone, nil
  280. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive, autonomous, peer-to-peer manner.