You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

imageinferencelogic.go 10 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408
  1. package inference
  2. import (
  3. "context"
  4. "errors"
  5. "github.com/go-resty/resty/v2"
  6. "github.com/zeromicro/go-zero/core/logx"
  7. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers/option"
  8. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/collector"
  9. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/strategy"
  10. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
  11. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
  12. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
  13. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
  14. "math/rand"
  15. "mime/multipart"
  16. "net/http"
  17. "sort"
  18. "strconv"
  19. "sync"
  20. "time"
  21. )
  22. type ImageInferenceLogic struct {
  23. logx.Logger
  24. ctx context.Context
  25. svcCtx *svc.ServiceContext
  26. }
  27. func NewImageInferenceLogic(ctx context.Context, svcCtx *svc.ServiceContext) *ImageInferenceLogic {
  28. return &ImageInferenceLogic{
  29. Logger: logx.WithContext(ctx),
  30. ctx: ctx,
  31. svcCtx: svcCtx,
  32. }
  33. }
  34. func (l *ImageInferenceLogic) ImageInference(req *types.ImageInferenceReq) (resp *types.ImageInferenceResp, err error) {
  35. return nil, nil
  36. }
  37. func (l *ImageInferenceLogic) ImageInfer(r *http.Request, req *types.ImageInferenceReq) (resp *types.ImageInferenceResp, err error) {
  38. resp = &types.ImageInferenceResp{}
  39. opt := &option.InferOption{
  40. TaskName: req.TaskName,
  41. TaskDesc: req.TaskDesc,
  42. AdapterId: req.AdapterId,
  43. AiClusterIds: req.AiClusterIds,
  44. ModelName: req.ModelName,
  45. ModelType: req.ModelType,
  46. Strategy: req.Strategy,
  47. StaticWeightMap: req.StaticWeightMap,
  48. }
  49. var ts []struct {
  50. imageResult *types.ImageResult
  51. file multipart.File
  52. }
  53. uploadedFiles := r.MultipartForm.File
  54. if len(uploadedFiles) == 0 {
  55. return nil, errors.New("Images does not exist")
  56. }
  57. if len(uploadedFiles["images"]) == 0 {
  58. return nil, errors.New("Images does not exist")
  59. }
  60. for _, header := range uploadedFiles["images"] {
  61. file, err := header.Open()
  62. if err != nil {
  63. return nil, err
  64. }
  65. defer file.Close()
  66. var ir types.ImageResult
  67. ir.ImageName = header.Filename
  68. t := struct {
  69. imageResult *types.ImageResult
  70. file multipart.File
  71. }{
  72. imageResult: &ir,
  73. file: file,
  74. }
  75. ts = append(ts, t)
  76. }
  77. _, ok := l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[opt.AdapterId]
  78. if !ok {
  79. return nil, errors.New("AdapterId does not exist")
  80. }
  81. var strat strategy.Strategy
  82. switch opt.Strategy {
  83. case strategy.STATIC_WEIGHT:
  84. strat = strategy.NewStaticWeightStrategy(opt.StaticWeightMap, int32(len(ts)))
  85. if err != nil {
  86. return nil, err
  87. }
  88. default:
  89. return nil, errors.New("no strategy has been chosen")
  90. }
  91. clusters, err := strat.Schedule()
  92. if err != nil {
  93. return nil, err
  94. }
  95. results, err := infer(opt, clusters, ts, l.svcCtx, l.ctx)
  96. if err != nil {
  97. return nil, err
  98. }
  99. resp.InferResults = results
  100. return resp, nil
  101. }
  102. var acs []*strategy.AssignedCluster
  103. var aiTaskList []*models.TaskAi
  104. func infer(opt *option.InferOption, clusters []*strategy.AssignedCluster, ts []struct {
  105. imageResult *types.ImageResult
  106. file multipart.File
  107. }, svcCtx *svc.ServiceContext, ctx context.Context) ([]*types.ImageResult, error) {
  108. if clusters == nil || len(clusters) == 0 {
  109. return nil, errors.New("clusters is nil")
  110. }
  111. for i := len(clusters) - 1; i >= 0; i-- {
  112. if clusters[i].Replicas == 0 {
  113. clusters = append(clusters[:i], clusters[i+1:]...)
  114. }
  115. }
  116. var wg sync.WaitGroup
  117. var cluster_ch = make(chan struct {
  118. urls []*collector.ImageInferUrl
  119. clusterId string
  120. clusterName string
  121. imageNum int32
  122. }, len(clusters))
  123. var cs []struct {
  124. urls []*collector.ImageInferUrl
  125. clusterId string
  126. clusterName string
  127. imageNum int32
  128. }
  129. collectorMap := svcCtx.Scheduler.AiService.AiCollectorAdapterMap[opt.AdapterId]
  130. //save task
  131. var synergystatus int64
  132. if len(clusters) > 1 {
  133. synergystatus = 1
  134. }
  135. strategyCode, err := svcCtx.Scheduler.AiStorages.GetStrategyCode(opt.Strategy)
  136. if err != nil {
  137. return nil, err
  138. }
  139. adapterName, err := svcCtx.Scheduler.AiStorages.GetAdapterNameById(opt.AdapterId)
  140. if err != nil {
  141. return nil, err
  142. }
  143. id, err := svcCtx.Scheduler.AiStorages.SaveTask(opt.TaskName, strategyCode, synergystatus, "11")
  144. if err != nil {
  145. return nil, err
  146. }
  147. svcCtx.Scheduler.AiStorages.AddNoticeInfo(opt.AdapterId, adapterName, "", "", opt.TaskName, "create", "任务创建中")
  148. //save taskai
  149. for _, c := range clusters {
  150. clusterName, _ := svcCtx.Scheduler.AiStorages.GetClusterNameById(c.ClusterId)
  151. opt.Replica = c.Replicas
  152. err := svcCtx.Scheduler.AiStorages.SaveAiTask(id, opt, adapterName, c.ClusterId, clusterName, "", constants.Saved, "")
  153. if err != nil {
  154. return nil, err
  155. }
  156. }
  157. for _, cluster := range clusters {
  158. wg.Add(1)
  159. c := cluster
  160. go func() {
  161. imageUrls, err := collectorMap[c.ClusterId].GetImageInferUrl(ctx, opt)
  162. if err != nil {
  163. wg.Done()
  164. return
  165. }
  166. clusterName, _ := svcCtx.Scheduler.AiStorages.GetClusterNameById(c.ClusterId)
  167. s := struct {
  168. urls []*collector.ImageInferUrl
  169. clusterId string
  170. clusterName string
  171. imageNum int32
  172. }{
  173. urls: imageUrls,
  174. clusterId: c.ClusterId,
  175. clusterName: clusterName,
  176. imageNum: c.Replicas,
  177. }
  178. cluster_ch <- s
  179. wg.Done()
  180. return
  181. }()
  182. }
  183. wg.Wait()
  184. close(cluster_ch)
  185. for s := range cluster_ch {
  186. cs = append(cs, s)
  187. }
  188. tx := svcCtx.DbEngin.Raw("select * from task_ai where `task_id` = ? ", id).Scan(&aiTaskList)
  189. if tx.Error != nil {
  190. return nil, tx.Error
  191. }
  192. //change cluster status
  193. if len(clusters) != len(cs) {
  194. for _, cluster := range clusters {
  195. if contains(cs, cluster.ClusterId) {
  196. continue
  197. } else {
  198. var ac *strategy.AssignedCluster
  199. ac = cluster
  200. acs = append(acs, ac)
  201. }
  202. }
  203. // update failed cluster status
  204. for _, ac := range acs {
  205. for _, t := range aiTaskList {
  206. if ac.ClusterId == strconv.Itoa(int(t.ClusterId)) {
  207. t.Status = constants.Failed
  208. err := svcCtx.Scheduler.AiStorages.UpdateAiTask(t)
  209. if err != nil {
  210. logx.Errorf(tx.Error.Error())
  211. }
  212. }
  213. }
  214. }
  215. }
  216. var result_ch = make(chan *types.ImageResult, len(ts))
  217. var results []*types.ImageResult
  218. var imageNumIdx int32 = 0
  219. var imageNumIdxEnd int32 = 0
  220. for _, c := range cs {
  221. new_images := make([]struct {
  222. imageResult *types.ImageResult
  223. file multipart.File
  224. }, len(ts))
  225. copy(new_images, ts)
  226. imageNumIdxEnd = imageNumIdxEnd + c.imageNum
  227. new_images = new_images[imageNumIdx:imageNumIdxEnd]
  228. imageNumIdx = imageNumIdx + c.imageNum
  229. wg.Add(len(new_images))
  230. go sendInferReq(new_images, c, &wg, *svcCtx, result_ch)
  231. }
  232. wg.Wait()
  233. close(result_ch)
  234. for s := range result_ch {
  235. results = append(results, s)
  236. }
  237. sort.Slice(results, func(p, q int) bool {
  238. return results[p].ClusterName < results[q].ClusterName
  239. })
  240. // update succeeded cluster status
  241. for _, c := range cs {
  242. for _, t := range aiTaskList {
  243. if c.clusterId == strconv.Itoa(int(t.ClusterId)) {
  244. t.Status = constants.Completed
  245. err := svcCtx.Scheduler.AiStorages.UpdateAiTask(t)
  246. if err != nil {
  247. logx.Errorf(tx.Error.Error())
  248. }
  249. }
  250. }
  251. }
  252. return results, nil
  253. }
  254. func sendInferReq(images []struct {
  255. imageResult *types.ImageResult
  256. file multipart.File
  257. }, cluster struct {
  258. urls []*collector.ImageInferUrl
  259. clusterId string
  260. clusterName string
  261. imageNum int32
  262. }, wg *sync.WaitGroup, svcCtx svc.ServiceContext, ch chan<- *types.ImageResult) {
  263. for _, image := range images {
  264. go func(t struct {
  265. imageResult *types.ImageResult
  266. file multipart.File
  267. }, c struct {
  268. urls []*collector.ImageInferUrl
  269. clusterId string
  270. clusterName string
  271. imageNum int32
  272. }) {
  273. if len(c.urls) == 1 {
  274. r, err := getInferResult(c.urls[0].Url, t.file, t.imageResult.ImageName)
  275. if err != nil {
  276. t.imageResult.ImageResult = err.Error()
  277. t.imageResult.ClusterName = c.clusterName
  278. t.imageResult.Card = c.urls[0].Card
  279. ch <- t.imageResult
  280. wg.Done()
  281. return
  282. }
  283. t.imageResult.ImageResult = r
  284. t.imageResult.ClusterName = c.clusterName
  285. t.imageResult.Card = c.urls[0].Card
  286. ch <- t.imageResult
  287. wg.Done()
  288. return
  289. } else {
  290. idx := rand.Intn(len(c.urls))
  291. r, err := getInferResult(c.urls[idx].Url, t.file, t.imageResult.ImageName)
  292. if err != nil {
  293. t.imageResult.ImageResult = err.Error()
  294. t.imageResult.ClusterName = c.clusterName
  295. t.imageResult.Card = c.urls[idx].Card
  296. ch <- t.imageResult
  297. wg.Done()
  298. return
  299. }
  300. t.imageResult.ImageResult = r
  301. t.imageResult.ClusterName = c.clusterName
  302. t.imageResult.Card = c.urls[idx].Card
  303. for _, ac := range acs {
  304. for _, task := range aiTaskList {
  305. if ac.ClusterId == strconv.Itoa(int(task.ClusterId)) && ac.ClusterId == t.imageResult.ClusterId {
  306. taskAiSub := &models.TaskAiSub{
  307. Id: task.Id,
  308. ImageName: t.imageResult.ImageName,
  309. Result: t.imageResult.ImageResult,
  310. Card: t.imageResult.Card,
  311. ClusterId: task.ClusterId,
  312. ClusterName: t.imageResult.ClusterName,
  313. }
  314. tx := svcCtx.DbEngin.Save(&taskAiSub)
  315. if tx.Error != nil {
  316. logx.Errorf(err.Error())
  317. }
  318. }
  319. continue
  320. }
  321. continue
  322. }
  323. ch <- t.imageResult
  324. wg.Done()
  325. return
  326. }
  327. }(image, cluster)
  328. }
  329. }
  330. func getInferResult(url string, file multipart.File, fileName string) (string, error) {
  331. var res Res
  332. req := GetRestyRequest(10)
  333. _, err := req.
  334. SetFileReader("file", fileName, file).
  335. SetResult(&res).
  336. Post(url)
  337. if err != nil {
  338. return "", err
  339. }
  340. return res.Result, nil
  341. }
  342. func GetRestyRequest(timeoutSeconds int64) *resty.Request {
  343. client := resty.New().SetTimeout(time.Duration(timeoutSeconds) * time.Second)
  344. request := client.R()
  345. return request
  346. }
  // Res is the decode target for the JSON body returned by an inference
  // request.
  type Res struct {
  	// Result is read from the "result" key.
  	Result string `json:"result"`
  }
  350. func contains(cs []struct {
  351. urls []*collector.ImageInferUrl
  352. clusterId string
  353. clusterName string
  354. imageNum int32
  355. }, e string) bool {
  356. for _, c := range cs {
  357. if c.clusterId == e {
  358. return true
  359. }
  360. }
  361. return false
  362. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive, autonomous, peer-to-peer manner.