You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

imageInference.go 14 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534
  1. package imageInference
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "github.com/go-resty/resty/v2"
  6. "github.com/zeromicro/go-zero/core/logx"
  7. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/database"
  8. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
  9. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference"
  10. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/strategy"
  11. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/storeLink"
  12. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
  13. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
  14. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
  15. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
  16. "log"
  17. "math/rand"
  18. "mime/multipart"
  19. "net/http"
  20. "sort"
  21. "strconv"
  22. "sync"
  23. "time"
  24. )
  25. type IImageInference interface {
  26. AppendRoute(urls []*inference.InferUrl) error
  27. GetAiType() string
  28. }
  29. type ImageFile struct {
  30. ImageResult *types.ImageResult
  31. File multipart.File
  32. }
  33. type FilteredCluster struct {
  34. urls []*inference.InferUrl
  35. clusterId string
  36. clusterName string
  37. clusterType string
  38. imageNum int32
  39. }
  40. type ImageInference struct {
  41. inference IImageInference
  42. files []*ImageFile
  43. clusters []*strategy.AssignedCluster
  44. instances []*models.AiInferDeployInstance
  45. opt *option.InferOption
  46. storage *database.AiStorage
  47. inferAdapter map[string]map[string]inference.ICluster
  48. errMap map[string]string
  49. adapterName string
  50. }
  51. func New(
  52. inference IImageInference,
  53. files []*ImageFile,
  54. clusters []*strategy.AssignedCluster,
  55. instances []*models.AiInferDeployInstance,
  56. opt *option.InferOption,
  57. storage *database.AiStorage,
  58. inferAdapter map[string]map[string]inference.ICluster,
  59. adapterName string) (*ImageInference, error) {
  60. return &ImageInference{
  61. inference: inference,
  62. files: files,
  63. clusters: clusters,
  64. instances: instances,
  65. opt: opt,
  66. storage: storage,
  67. inferAdapter: inferAdapter,
  68. adapterName: adapterName,
  69. errMap: make(map[string]string),
  70. }, nil
  71. }
  72. func (i *ImageInference) CreateTask() (int64, error) {
  73. id, err := i.saveTask()
  74. if err != nil {
  75. return 0, err
  76. }
  77. err = i.saveAiTask(id)
  78. if err != nil {
  79. return 0, err
  80. }
  81. return id, nil
  82. }
  83. func (i *ImageInference) InferTask(id int64) error {
  84. clusters, err := i.filterClusters()
  85. if err != nil {
  86. return err
  87. }
  88. aiTaskList, err := i.storage.GetAiTaskListById(id)
  89. if err != nil || len(aiTaskList) == 0 {
  90. return err
  91. }
  92. err = i.updateStatus(aiTaskList, clusters)
  93. if err != nil {
  94. return err
  95. }
  96. results, err := i.inferImages(clusters)
  97. if err != nil {
  98. return err
  99. }
  100. err = i.saveAiSubTasks(id, aiTaskList, clusters, results)
  101. if err != nil {
  102. return err
  103. }
  104. return nil
  105. }
  106. func (i *ImageInference) saveTask() (int64, error) {
  107. var synergystatus int64
  108. if len(i.clusters) > 1 {
  109. synergystatus = 1
  110. }
  111. strategyCode, err := i.storage.GetStrategyCode(i.opt.Strategy)
  112. if err != nil {
  113. return 0, err
  114. }
  115. id, err := i.storage.SaveTask(i.opt.TaskName, strategyCode, synergystatus, i.inference.GetAiType())
  116. if err != nil {
  117. return 0, err
  118. }
  119. i.storage.AddNoticeInfo("", "", "", "", i.opt.TaskName, "create", "任务创建中")
  120. return id, nil
  121. }
  122. func (i *ImageInference) saveAiTask(id int64) error {
  123. for _, c := range i.clusters {
  124. clusterName, _ := i.storage.GetClusterNameById(c.ClusterId)
  125. i.opt.Replica = c.Replicas
  126. err := i.storage.SaveAiTask(id, i.opt, i.adapterName, c.ClusterId, clusterName, "", constants.Saved, "")
  127. if err != nil {
  128. return err
  129. }
  130. }
  131. return nil
  132. }
  133. func (i *ImageInference) filterClustersTemp() ([]*FilteredCluster, error) {
  134. var wg sync.WaitGroup
  135. var ch = make(chan *FilteredCluster, len(i.clusters))
  136. var cs []*FilteredCluster
  137. var mutex sync.Mutex
  138. inferMap := i.inferAdapter[i.opt.AdapterId]
  139. for _, cluster := range i.clusters {
  140. wg.Add(1)
  141. c := cluster
  142. go func() {
  143. r := http.Request{}
  144. clusterInferUrl, err := inferMap[c.ClusterId].GetClusterInferUrl(r.Context(), i.opt)
  145. if err != nil {
  146. mutex.Lock()
  147. i.errMap[c.ClusterId] = err.Error()
  148. mutex.Unlock()
  149. wg.Done()
  150. return
  151. }
  152. i.inference.AppendRoute(clusterInferUrl.InferUrls)
  153. var f FilteredCluster
  154. f.urls = clusterInferUrl.InferUrls
  155. f.clusterId = c.ClusterId
  156. f.clusterName = clusterInferUrl.ClusterName
  157. f.clusterType = clusterInferUrl.ClusterType
  158. f.imageNum = c.Replicas
  159. ch <- &f
  160. wg.Done()
  161. return
  162. }()
  163. }
  164. wg.Wait()
  165. close(ch)
  166. for s := range ch {
  167. cs = append(cs, s)
  168. }
  169. return cs, nil
  170. }
  171. func (i *ImageInference) filterClusters() ([]*FilteredCluster, error) {
  172. var cs []*FilteredCluster
  173. for _, cluster := range i.clusters {
  174. var inferurls []*inference.InferUrl
  175. var clustertype string
  176. var clusterName string
  177. for _, instance := range i.instances {
  178. clusterId := strconv.FormatInt(instance.ClusterId, 10)
  179. adapterId := strconv.FormatInt(instance.AdapterId, 10)
  180. if cluster.ClusterId == clusterId {
  181. r := http.Request{}
  182. deployInstance, err := i.inferAdapter[adapterId][clusterId].GetInferDeployInstance(r.Context(), instance.InstanceId)
  183. if err != nil {
  184. continue
  185. }
  186. var url inference.InferUrl
  187. url.Url = deployInstance.InferUrl
  188. url.Card = deployInstance.InferCard
  189. inferurls = append(inferurls, &url)
  190. clustertype = deployInstance.ClusterType
  191. clusterName = deployInstance.ClusterName
  192. }
  193. }
  194. if len(inferurls) == 0 {
  195. continue
  196. }
  197. i.inference.AppendRoute(inferurls)
  198. var f FilteredCluster
  199. f.urls = inferurls
  200. f.clusterId = cluster.ClusterId
  201. f.clusterName = clusterName
  202. f.clusterType = clustertype
  203. f.imageNum = cluster.Replicas
  204. cs = append(cs, &f)
  205. }
  206. return cs, nil
  207. }
  208. func (i *ImageInference) inferImages(cs []*FilteredCluster) ([]*types.ImageResult, error) {
  209. var wg sync.WaitGroup
  210. var ch = make(chan *types.ImageResult, len(i.files))
  211. var results []*types.ImageResult
  212. limit := make(chan bool, 7)
  213. var imageNumIdx int32 = 0
  214. var imageNumIdxEnd int32 = 0
  215. for _, c := range cs {
  216. new_images := make([]*ImageFile, len(i.files))
  217. copy(new_images, i.files)
  218. imageNumIdxEnd = imageNumIdxEnd + c.imageNum
  219. new_images = new_images[imageNumIdx:imageNumIdxEnd]
  220. imageNumIdx = imageNumIdx + c.imageNum
  221. wg.Add(len(new_images))
  222. go i.sendInferReq(new_images, c, &wg, ch, limit)
  223. }
  224. wg.Wait()
  225. close(ch)
  226. for s := range ch {
  227. results = append(results, s)
  228. }
  229. sort.Slice(results, func(p, q int) bool {
  230. return results[p].ClusterName < results[q].ClusterName
  231. })
  232. return results, nil
  233. }
  234. func (i *ImageInference) updateStatus(aiTaskList []*models.TaskAi, cs []*FilteredCluster) error {
  235. //no cluster available
  236. if len(cs) == 0 {
  237. for _, t := range aiTaskList {
  238. t.Status = constants.Failed
  239. t.EndTime = time.Now().Format(time.RFC3339)
  240. if _, ok := i.errMap[strconv.Itoa(int(t.ClusterId))]; ok {
  241. t.Msg = i.errMap[strconv.Itoa(int(t.ClusterId))]
  242. }
  243. err := i.storage.UpdateAiTask(t)
  244. if err != nil {
  245. logx.Errorf(err.Error())
  246. }
  247. }
  248. i.storage.AddNoticeInfo(i.opt.AdapterId, i.adapterName, "", "", i.opt.TaskName, "failed", "任务失败")
  249. return errors.New("available clusters' empty, image infer task failed")
  250. }
  251. //change cluster status
  252. if len(i.clusters) != len(cs) {
  253. var acs []*strategy.AssignedCluster
  254. var rcs []*strategy.AssignedCluster
  255. for _, cluster := range i.clusters {
  256. if contains(cs, cluster.ClusterId) {
  257. var ac *strategy.AssignedCluster
  258. ac = cluster
  259. rcs = append(rcs, ac)
  260. } else {
  261. var ac *strategy.AssignedCluster
  262. ac = cluster
  263. acs = append(acs, ac)
  264. }
  265. }
  266. // update failed cluster status
  267. for _, ac := range acs {
  268. for _, t := range aiTaskList {
  269. if ac.ClusterId == strconv.Itoa(int(t.ClusterId)) {
  270. t.Status = constants.Failed
  271. t.EndTime = time.Now().Format(time.RFC3339)
  272. if _, ok := i.errMap[strconv.Itoa(int(t.ClusterId))]; ok {
  273. t.Msg = i.errMap[strconv.Itoa(int(t.ClusterId))]
  274. }
  275. err := i.storage.UpdateAiTask(t)
  276. if err != nil {
  277. logx.Errorf(err.Error())
  278. }
  279. }
  280. }
  281. }
  282. // update running cluster status
  283. for _, ac := range rcs {
  284. for _, t := range aiTaskList {
  285. if ac.ClusterId == strconv.Itoa(int(t.ClusterId)) {
  286. t.Status = constants.Running
  287. err := i.storage.UpdateAiTask(t)
  288. if err != nil {
  289. logx.Errorf(err.Error())
  290. }
  291. }
  292. }
  293. }
  294. i.storage.AddNoticeInfo(i.opt.AdapterId, i.adapterName, "", "", i.opt.TaskName, "failed", "任务失败")
  295. } else {
  296. for _, t := range aiTaskList {
  297. t.Status = constants.Running
  298. err := i.storage.UpdateAiTask(t)
  299. if err != nil {
  300. logx.Errorf(err.Error())
  301. }
  302. }
  303. i.storage.AddNoticeInfo(i.opt.AdapterId, i.adapterName, "", "", i.opt.TaskName, "running", "任务运行中")
  304. }
  305. return nil
  306. }
  307. func (i *ImageInference) sendInferReq(images []*ImageFile, cluster *FilteredCluster, wg *sync.WaitGroup, ch chan<- *types.ImageResult, limit chan bool) {
  308. for _, image := range images {
  309. limit <- true
  310. go func(t *ImageFile, c *FilteredCluster) {
  311. if len(c.urls) == 1 {
  312. r, err := getInferResult(c.urls[0].Url, t.File, t.ImageResult.ImageName, c.clusterId, c.clusterType, i.inferAdapter, i.opt.AdapterId)
  313. if err != nil {
  314. t.ImageResult.ImageResult = err.Error()
  315. t.ImageResult.ClusterId = c.clusterId
  316. t.ImageResult.ClusterName = c.clusterName
  317. t.ImageResult.Card = c.urls[0].Card
  318. ch <- t.ImageResult
  319. wg.Done()
  320. <-limit
  321. return
  322. }
  323. t.ImageResult.ImageResult = r
  324. t.ImageResult.ClusterId = c.clusterId
  325. t.ImageResult.ClusterName = c.clusterName
  326. t.ImageResult.Card = c.urls[0].Card
  327. ch <- t.ImageResult
  328. wg.Done()
  329. <-limit
  330. return
  331. } else {
  332. idx := rand.Intn(len(c.urls))
  333. r, err := getInferResult(c.urls[idx].Url, t.File, t.ImageResult.ImageName, c.clusterId, c.clusterType, i.inferAdapter, i.opt.AdapterId)
  334. if err != nil {
  335. t.ImageResult.ImageResult = err.Error()
  336. t.ImageResult.ClusterId = c.clusterId
  337. t.ImageResult.ClusterName = c.clusterName
  338. t.ImageResult.Card = c.urls[idx].Card
  339. ch <- t.ImageResult
  340. wg.Done()
  341. <-limit
  342. return
  343. }
  344. t.ImageResult.ImageResult = r
  345. t.ImageResult.ClusterId = c.clusterId
  346. t.ImageResult.ClusterName = c.clusterName
  347. t.ImageResult.Card = c.urls[idx].Card
  348. ch <- t.ImageResult
  349. wg.Done()
  350. <-limit
  351. return
  352. }
  353. }(image, cluster)
  354. <-limit
  355. }
  356. }
  357. func (i *ImageInference) saveAiSubTasks(id int64, aiTaskList []*models.TaskAi, cs []*FilteredCluster, results []*types.ImageResult) error {
  358. //save ai sub tasks
  359. for _, r := range results {
  360. for _, task := range aiTaskList {
  361. if r.ClusterId == strconv.Itoa(int(task.ClusterId)) {
  362. taskAiSub := models.TaskAiSub{
  363. TaskId: id,
  364. TaskName: task.Name,
  365. TaskAiId: task.TaskId,
  366. TaskAiName: task.Name,
  367. ImageName: r.ImageName,
  368. Result: r.ImageResult,
  369. Card: r.Card,
  370. ClusterId: task.ClusterId,
  371. ClusterName: r.ClusterName,
  372. }
  373. err := i.storage.SaveAiTaskImageSubTask(&taskAiSub)
  374. if err != nil {
  375. panic(err)
  376. }
  377. }
  378. }
  379. }
  380. // update succeeded cluster status
  381. var successStatusCount int
  382. for _, c := range cs {
  383. for _, t := range aiTaskList {
  384. if c.clusterId == strconv.Itoa(int(t.ClusterId)) {
  385. t.Status = constants.Completed
  386. t.EndTime = time.Now().Format(time.RFC3339)
  387. err := i.storage.UpdateAiTask(t)
  388. if err != nil {
  389. logx.Errorf(err.Error())
  390. }
  391. successStatusCount++
  392. } else {
  393. continue
  394. }
  395. }
  396. }
  397. if len(cs) == successStatusCount {
  398. i.storage.AddNoticeInfo(i.opt.AdapterId, i.adapterName, "", "", i.opt.TaskName, "completed", "任务完成")
  399. } else {
  400. i.storage.AddNoticeInfo(i.opt.AdapterId, i.adapterName, "", "", i.opt.TaskName, "failed", "任务失败")
  401. }
  402. return nil
  403. }
  404. func getInferResult(url string, file multipart.File, fileName string, clusterId string, clusterType string, inferAdapter map[string]map[string]inference.ICluster, adapterId string) (string, error) {
  405. adapter, found := inferAdapter[adapterId]
  406. if !found {
  407. return "", errors.New("adapterId not found")
  408. }
  409. iCluster, found := adapter[clusterId]
  410. if !found {
  411. return "", errors.New("clusterId not found")
  412. }
  413. switch clusterType {
  414. case storeLink.TYPE_OCTOPUS:
  415. r := http.Request{}
  416. result, err := iCluster.GetInferResult(r.Context(), url, file, fileName)
  417. if err != nil {
  418. return "", err
  419. }
  420. return result, nil
  421. case storeLink.TYPE_MODELARTS:
  422. r, err := getInferResultModelarts(url, file, fileName)
  423. if err != nil {
  424. return "", err
  425. }
  426. return r, nil
  427. default:
  428. var res Res
  429. req := GetRestyRequest(20)
  430. _, err := req.
  431. SetFileReader("file", fileName, file).
  432. SetResult(&res).
  433. Post(url)
  434. if err != nil {
  435. return "", err
  436. }
  437. return res.Result, nil
  438. }
  439. }
  440. func getInferResultModelarts(url string, file multipart.File, fileName string) (string, error) {
  441. var res Res
  442. /* req := GetRestyRequest(20)
  443. _, err := req.
  444. SetFileReader("file", fileName, file).
  445. SetHeaders(map[string]string{
  446. "ak": "UNEHPHO4Z7YSNPKRXFE4",
  447. "sk": "JWXCE9qcYbc7RjpSRIWt4WgG3ZKF6Q4lPzkJReX9",
  448. }).
  449. SetResult(&res).
  450. Post(url)
  451. if err != nil {
  452. return "", err
  453. }*/
  454. body, err := utils.SendRequest("POST", url, file, fileName)
  455. if err != nil {
  456. return "", err
  457. }
  458. errjson := json.Unmarshal([]byte(body), &res)
  459. if errjson != nil {
  460. log.Fatalf("Error parsing JSON: %s", errjson)
  461. }
  462. return res.Result, nil
  463. }
  464. func GetRestyRequest(timeoutSeconds int64) *resty.Request {
  465. client := resty.New().SetTimeout(time.Duration(timeoutSeconds) * time.Second)
  466. request := client.R()
  467. return request
  468. }
  469. type Res struct {
  470. Result string `json:"result"`
  471. }
  472. func contains(cs []*FilteredCluster, e string) bool {
  473. for _, c := range cs {
  474. if c.clusterId == e {
  475. return true
  476. }
  477. }
  478. return false
  479. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecology of heterogeneous cloud collaboration for JCC in a non-intrusive and autonomous peer-to-peer manner.