You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

imageinferencelogic.go 13 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508
  1. package inference
  2. import (
  3. "context"
  4. "errors"
  5. "github.com/go-resty/resty/v2"
  6. "github.com/zeromicro/go-zero/core/logx"
  7. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers/option"
  8. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/collector"
  9. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/strategy"
  10. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/svc"
  11. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
  12. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
  13. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
  14. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
  15. "k8s.io/apimachinery/pkg/util/json"
  16. "log"
  17. "math/rand"
  18. "mime/multipart"
  19. "net/http"
  20. "strconv"
  21. "sync"
  22. "time"
  23. )
  24. type ImageInferenceLogic struct {
  25. logx.Logger
  26. ctx context.Context
  27. svcCtx *svc.ServiceContext
  28. }
  29. func NewImageInferenceLogic(ctx context.Context, svcCtx *svc.ServiceContext) *ImageInferenceLogic {
  30. return &ImageInferenceLogic{
  31. Logger: logx.WithContext(ctx),
  32. ctx: ctx,
  33. svcCtx: svcCtx,
  34. }
  35. }
  36. func (l *ImageInferenceLogic) ImageInference(req *types.ImageInferenceReq) (resp *types.ImageInferenceResp, err error) {
  37. return nil, nil
  38. }
  39. func (l *ImageInferenceLogic) ImageInfer(r *http.Request, req *types.ImageInferenceReq) (resp *types.ImageInferenceResp, err error) {
  40. resp = &types.ImageInferenceResp{}
  41. opt := &option.InferOption{
  42. TaskName: req.TaskName,
  43. TaskDesc: req.TaskDesc,
  44. AdapterId: req.AdapterId,
  45. AiClusterIds: req.AiClusterIds,
  46. ModelName: req.ModelName,
  47. ModelType: req.ModelType,
  48. Strategy: req.Strategy,
  49. StaticWeightMap: req.StaticWeightMap,
  50. }
  51. var ts []struct {
  52. imageResult *types.ImageResult
  53. file multipart.File
  54. }
  55. uploadedFiles := r.MultipartForm.File
  56. if len(uploadedFiles) == 0 {
  57. return nil, errors.New("Images does not exist")
  58. }
  59. if len(uploadedFiles["images"]) == 0 {
  60. return nil, errors.New("Images does not exist")
  61. }
  62. for _, header := range uploadedFiles["images"] {
  63. file, err := header.Open()
  64. if err != nil {
  65. return nil, err
  66. }
  67. defer file.Close()
  68. var ir types.ImageResult
  69. ir.ImageName = header.Filename
  70. t := struct {
  71. imageResult *types.ImageResult
  72. file multipart.File
  73. }{
  74. imageResult: &ir,
  75. file: file,
  76. }
  77. ts = append(ts, t)
  78. }
  79. _, ok := l.svcCtx.Scheduler.AiService.AiCollectorAdapterMap[opt.AdapterId]
  80. if !ok {
  81. return nil, errors.New("AdapterId does not exist")
  82. }
  83. var strat strategy.Strategy
  84. switch opt.Strategy {
  85. case strategy.STATIC_WEIGHT:
  86. strat = strategy.NewStaticWeightStrategy(opt.StaticWeightMap, int32(len(ts)))
  87. if err != nil {
  88. return nil, err
  89. }
  90. default:
  91. return nil, errors.New("no strategy has been chosen")
  92. }
  93. clusters, err := strat.Schedule()
  94. if err != nil {
  95. return nil, err
  96. }
  97. results, err := infer(opt, clusters, ts, l.svcCtx, l.ctx)
  98. if err != nil {
  99. return nil, err
  100. }
  101. resp.InferResults = results
  102. return resp, nil
  103. }
  104. func infer(opt *option.InferOption, clusters []*strategy.AssignedCluster, ts []struct {
  105. imageResult *types.ImageResult
  106. file multipart.File
  107. }, svcCtx *svc.ServiceContext, ctx context.Context) ([]*types.ImageResult, error) {
  108. if clusters == nil || len(clusters) == 0 {
  109. return nil, errors.New("clusters is nil")
  110. }
  111. for i := len(clusters) - 1; i >= 0; i-- {
  112. if clusters[i].Replicas == 0 {
  113. clusters = append(clusters[:i], clusters[i+1:]...)
  114. }
  115. }
  116. var wg sync.WaitGroup
  117. var cluster_ch = make(chan struct {
  118. urls []*collector.ImageInferUrl
  119. clusterId string
  120. clusterName string
  121. imageNum int32
  122. }, len(clusters))
  123. var cs []struct {
  124. urls []*collector.ImageInferUrl
  125. clusterId string
  126. clusterName string
  127. imageNum int32
  128. }
  129. collectorMap := svcCtx.Scheduler.AiService.AiCollectorAdapterMap[opt.AdapterId]
  130. //save task
  131. var synergystatus int64
  132. if len(clusters) > 1 {
  133. synergystatus = 1
  134. }
  135. strategyCode, err := svcCtx.Scheduler.AiStorages.GetStrategyCode(opt.Strategy)
  136. if err != nil {
  137. return nil, err
  138. }
  139. adapterName, err := svcCtx.Scheduler.AiStorages.GetAdapterNameById(opt.AdapterId)
  140. if err != nil {
  141. return nil, err
  142. }
  143. id, err := svcCtx.Scheduler.AiStorages.SaveTask(opt.TaskName, strategyCode, synergystatus, "11")
  144. if err != nil {
  145. return nil, err
  146. }
  147. svcCtx.Scheduler.AiStorages.AddNoticeInfo(opt.AdapterId, adapterName, "", "", opt.TaskName, "create", "任务创建中")
  148. //save taskai
  149. for _, c := range clusters {
  150. clusterName, _ := svcCtx.Scheduler.AiStorages.GetClusterNameById(c.ClusterId)
  151. opt.Replica = c.Replicas
  152. err := svcCtx.Scheduler.AiStorages.SaveAiTask(id, opt, adapterName, c.ClusterId, clusterName, "", constants.Saved, "")
  153. if err != nil {
  154. return nil, err
  155. }
  156. }
  157. for _, cluster := range clusters {
  158. wg.Add(1)
  159. c := cluster
  160. go func() {
  161. imageUrls, err := collectorMap[c.ClusterId].GetImageInferUrl(ctx, opt)
  162. if err != nil {
  163. wg.Done()
  164. return
  165. }
  166. clusterName, _ := svcCtx.Scheduler.AiStorages.GetClusterNameById(c.ClusterId)
  167. s := struct {
  168. urls []*collector.ImageInferUrl
  169. clusterId string
  170. clusterName string
  171. imageNum int32
  172. }{
  173. urls: imageUrls,
  174. clusterId: c.ClusterId,
  175. clusterName: clusterName,
  176. imageNum: c.Replicas,
  177. }
  178. cluster_ch <- s
  179. wg.Done()
  180. return
  181. }()
  182. }
  183. wg.Wait()
  184. close(cluster_ch)
  185. for s := range cluster_ch {
  186. cs = append(cs, s)
  187. }
  188. var aiTaskList []*models.TaskAi
  189. tx := svcCtx.DbEngin.Raw("select * from task_ai where `task_id` = ? ", id).Scan(&aiTaskList)
  190. if tx.Error != nil {
  191. return nil, tx.Error
  192. }
  193. //no cluster available
  194. if len(cs) == 0 {
  195. for _, t := range aiTaskList {
  196. t.Status = constants.Failed
  197. t.EndTime = time.Now().Format(time.RFC3339)
  198. err := svcCtx.Scheduler.AiStorages.UpdateAiTask(t)
  199. if err != nil {
  200. logx.Errorf(tx.Error.Error())
  201. }
  202. }
  203. svcCtx.Scheduler.AiStorages.AddNoticeInfo(opt.AdapterId, adapterName, "", "", opt.TaskName, "failed", "任务失败")
  204. return nil, errors.New("image infer task failed")
  205. }
  206. //change cluster status
  207. if len(clusters) != len(cs) {
  208. var acs []*strategy.AssignedCluster
  209. var rcs []*strategy.AssignedCluster
  210. for _, cluster := range clusters {
  211. if contains(cs, cluster.ClusterId) {
  212. var ac *strategy.AssignedCluster
  213. ac = cluster
  214. rcs = append(rcs, ac)
  215. } else {
  216. var ac *strategy.AssignedCluster
  217. ac = cluster
  218. acs = append(acs, ac)
  219. }
  220. }
  221. // update failed cluster status
  222. for _, ac := range acs {
  223. for _, t := range aiTaskList {
  224. if ac.ClusterId == strconv.Itoa(int(t.ClusterId)) {
  225. t.Status = constants.Failed
  226. t.EndTime = time.Now().Format(time.RFC3339)
  227. err := svcCtx.Scheduler.AiStorages.UpdateAiTask(t)
  228. if err != nil {
  229. logx.Errorf(tx.Error.Error())
  230. }
  231. }
  232. }
  233. }
  234. // update running cluster status
  235. for _, ac := range rcs {
  236. for _, t := range aiTaskList {
  237. if ac.ClusterId == strconv.Itoa(int(t.ClusterId)) {
  238. t.Status = constants.Running
  239. err := svcCtx.Scheduler.AiStorages.UpdateAiTask(t)
  240. if err != nil {
  241. logx.Errorf(tx.Error.Error())
  242. }
  243. }
  244. }
  245. }
  246. svcCtx.Scheduler.AiStorages.AddNoticeInfo(opt.AdapterId, adapterName, "", "", opt.TaskName, "failed", "任务失败")
  247. } else {
  248. for _, t := range aiTaskList {
  249. t.Status = constants.Running
  250. err := svcCtx.Scheduler.AiStorages.UpdateAiTask(t)
  251. if err != nil {
  252. logx.Errorf(tx.Error.Error())
  253. }
  254. }
  255. svcCtx.Scheduler.AiStorages.AddNoticeInfo(opt.AdapterId, adapterName, "", "", opt.TaskName, "running", "任务运行中")
  256. }
  257. var result_ch = make(chan *types.ImageResult, len(ts))
  258. var results []*types.ImageResult
  259. limit := make(chan bool, 7)
  260. var imageNumIdx int32 = 0
  261. var imageNumIdxEnd int32 = 0
  262. for _, c := range cs {
  263. new_images := make([]struct {
  264. imageResult *types.ImageResult
  265. file multipart.File
  266. }, len(ts))
  267. copy(new_images, ts)
  268. imageNumIdxEnd = imageNumIdxEnd + c.imageNum
  269. new_images = new_images[imageNumIdx:imageNumIdxEnd]
  270. imageNumIdx = imageNumIdx + c.imageNum
  271. wg.Add(len(new_images))
  272. go sendInferReq(new_images, c, &wg, result_ch, limit)
  273. }
  274. wg.Wait()
  275. close(result_ch)
  276. for s := range result_ch {
  277. results = append(results, s)
  278. }
  279. //sort.Slice(results, func(p, q int) bool {
  280. // return results[p].ClusterName < results[q].ClusterName
  281. //})
  282. //save ai sub tasks
  283. for _, r := range results {
  284. for _, task := range aiTaskList {
  285. if r.ClusterId == strconv.Itoa(int(task.ClusterId)) {
  286. taskAiSub := models.TaskAiSub{
  287. TaskId: id,
  288. TaskName: task.Name,
  289. TaskAiId: task.TaskId,
  290. TaskAiName: task.Name,
  291. ImageName: r.ImageName,
  292. Result: r.ImageResult,
  293. Card: r.Card,
  294. ClusterId: task.ClusterId,
  295. ClusterName: r.ClusterName,
  296. }
  297. tx := svcCtx.DbEngin.Table("task_ai_sub").Create(&taskAiSub)
  298. if tx.Error != nil {
  299. logx.Errorf(err.Error())
  300. }
  301. }
  302. }
  303. }
  304. // update succeeded cluster status
  305. var successStatusCount int
  306. for _, c := range cs {
  307. for _, t := range aiTaskList {
  308. if c.clusterId == strconv.Itoa(int(t.ClusterId)) {
  309. t.Status = constants.Completed
  310. t.EndTime = time.Now().Format(time.RFC3339)
  311. err := svcCtx.Scheduler.AiStorages.UpdateAiTask(t)
  312. if err != nil {
  313. logx.Errorf(tx.Error.Error())
  314. }
  315. successStatusCount++
  316. } else {
  317. continue
  318. }
  319. }
  320. }
  321. if len(cs) == successStatusCount {
  322. svcCtx.Scheduler.AiStorages.AddNoticeInfo(opt.AdapterId, adapterName, "", "", opt.TaskName, "completed", "任务完成")
  323. } else {
  324. svcCtx.Scheduler.AiStorages.AddNoticeInfo(opt.AdapterId, adapterName, "", "", opt.TaskName, "failed", "任务失败")
  325. }
  326. return results, nil
  327. }
  328. func sendInferReq(images []struct {
  329. imageResult *types.ImageResult
  330. file multipart.File
  331. }, cluster struct {
  332. urls []*collector.ImageInferUrl
  333. clusterId string
  334. clusterName string
  335. imageNum int32
  336. }, wg *sync.WaitGroup, ch chan<- *types.ImageResult, limit chan bool) {
  337. for _, image := range images {
  338. limit <- true
  339. go func(t struct {
  340. imageResult *types.ImageResult
  341. file multipart.File
  342. }, c struct {
  343. urls []*collector.ImageInferUrl
  344. clusterId string
  345. clusterName string
  346. imageNum int32
  347. }) {
  348. if len(c.urls) == 1 {
  349. r, err := getInferResult(c.urls[0].Url, t.file, t.imageResult.ImageName, c.clusterName)
  350. if err != nil {
  351. t.imageResult.ImageResult = err.Error()
  352. t.imageResult.ClusterId = c.clusterId
  353. t.imageResult.ClusterName = c.clusterName
  354. t.imageResult.Card = c.urls[0].Card
  355. ch <- t.imageResult
  356. wg.Done()
  357. <-limit
  358. return
  359. }
  360. t.imageResult.ImageResult = r
  361. t.imageResult.ClusterId = c.clusterId
  362. t.imageResult.ClusterName = c.clusterName
  363. t.imageResult.Card = c.urls[0].Card
  364. ch <- t.imageResult
  365. wg.Done()
  366. <-limit
  367. return
  368. } else {
  369. idx := rand.Intn(len(c.urls))
  370. r, err := getInferResult(c.urls[idx].Url, t.file, t.imageResult.ImageName, c.clusterName)
  371. if err != nil {
  372. t.imageResult.ImageResult = err.Error()
  373. t.imageResult.ClusterId = c.clusterId
  374. t.imageResult.ClusterName = c.clusterName
  375. t.imageResult.Card = c.urls[idx].Card
  376. ch <- t.imageResult
  377. wg.Done()
  378. <-limit
  379. return
  380. }
  381. t.imageResult.ImageResult = r
  382. t.imageResult.ClusterId = c.clusterId
  383. t.imageResult.ClusterName = c.clusterName
  384. t.imageResult.Card = c.urls[idx].Card
  385. ch <- t.imageResult
  386. wg.Done()
  387. <-limit
  388. return
  389. }
  390. }(image, cluster)
  391. <-limit
  392. }
  393. }
  394. func getInferResult(url string, file multipart.File, fileName string, clusterName string) (string, error) {
  395. if clusterName == "鹏城云脑II-modelarts" {
  396. r, err := getInferResultModelarts(url, file, fileName)
  397. if err != nil {
  398. return "", err
  399. }
  400. return r, nil
  401. }
  402. var res Res
  403. req := GetRestyRequest(20)
  404. _, err := req.
  405. SetFileReader("file", fileName, file).
  406. SetResult(&res).
  407. Post(url)
  408. if err != nil {
  409. return "", err
  410. }
  411. return res.Result, nil
  412. }
  413. func getInferResultModelarts(url string, file multipart.File, fileName string) (string, error) {
  414. var res Res
  415. /* req := GetRestyRequest(20)
  416. _, err := req.
  417. SetFileReader("file", fileName, file).
  418. SetHeaders(map[string]string{
  419. "ak": "UNEHPHO4Z7YSNPKRXFE4",
  420. "sk": "JWXCE9qcYbc7RjpSRIWt4WgG3ZKF6Q4lPzkJReX9",
  421. }).
  422. SetResult(&res).
  423. Post(url)
  424. if err != nil {
  425. return "", err
  426. }*/
  427. body, err := utils.SendRequest("POST", url, file, fileName)
  428. if err != nil {
  429. return "", err
  430. }
  431. errjson := json.Unmarshal([]byte(body), &res)
  432. if errjson != nil {
  433. log.Fatalf("Error parsing JSON: %s", errjson)
  434. }
  435. return res.Result, nil
  436. }
  437. func GetRestyRequest(timeoutSeconds int64) *resty.Request {
  438. client := resty.New().SetTimeout(time.Duration(timeoutSeconds) * time.Second)
  439. request := client.R()
  440. return request
  441. }
  // Res is the JSON payload decoded from an inference endpoint response.
  type Res struct {
      // Result is read from the "result" key of the response body.
      Result string `json:"result"`
  }
  445. func contains(cs []struct {
  446. urls []*collector.ImageInferUrl
  447. clusterId string
  448. clusterName string
  449. imageNum int32
  450. }, e string) bool {
  451. for _, c := range cs {
  452. if c.clusterId == e {
  453. return true
  454. }
  455. }
  456. return false
  457. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive and autonomous peer-to-peer manner.