You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

imageInference.go 12 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469
  1. package imageInference
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "github.com/go-resty/resty/v2"
  6. "github.com/zeromicro/go-zero/core/logx"
  7. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/database"
  8. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
  9. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference"
  10. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/strategy"
  11. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
  12. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
  13. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
  14. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
  15. "log"
  16. "math/rand"
  17. "mime/multipart"
  18. "net/http"
  19. "sort"
  20. "strconv"
  21. "sync"
  22. "time"
  23. )
  24. type IImageInference interface {
  25. AppendRoute(urls []*inference.InferUrl) error
  26. GetAiType() string
  27. }
  28. type ImageFile struct {
  29. ImageResult *types.ImageResult
  30. File multipart.File
  31. }
  32. type FilteredCluster struct {
  33. urls []*inference.InferUrl
  34. clusterId string
  35. clusterName string
  36. imageNum int32
  37. }
  38. type ImageInference struct {
  39. inference IImageInference
  40. files []*ImageFile
  41. clusters []*strategy.AssignedCluster
  42. opt *option.InferOption
  43. storage *database.AiStorage
  44. inferAdapter map[string]map[string]inference.ICluster
  45. errMap map[string]string
  46. adapterName string
  47. }
  48. func New(
  49. inference IImageInference,
  50. files []*ImageFile,
  51. clusters []*strategy.AssignedCluster,
  52. opt *option.InferOption,
  53. storage *database.AiStorage,
  54. inferAdapter map[string]map[string]inference.ICluster,
  55. adapterName string) (*ImageInference, error) {
  56. return &ImageInference{
  57. inference: inference,
  58. files: files,
  59. clusters: clusters,
  60. opt: opt,
  61. storage: storage,
  62. inferAdapter: inferAdapter,
  63. adapterName: adapterName,
  64. errMap: make(map[string]string),
  65. }, nil
  66. }
  67. func (i *ImageInference) CreateTask() (int64, error) {
  68. id, err := i.saveTask()
  69. if err != nil {
  70. return 0, err
  71. }
  72. err = i.saveAiTask(id)
  73. if err != nil {
  74. return 0, err
  75. }
  76. return id, nil
  77. }
  78. func (i *ImageInference) InferTask(id int64) error {
  79. clusters, err := i.filterClusters()
  80. if err != nil {
  81. return err
  82. }
  83. aiTaskList, err := i.storage.GetAiTaskListById(id)
  84. if err != nil || len(aiTaskList) == 0 {
  85. return err
  86. }
  87. err = i.updateStatus(aiTaskList, clusters)
  88. if err != nil {
  89. return err
  90. }
  91. results, err := i.inferImages(clusters)
  92. if err != nil {
  93. return err
  94. }
  95. err = i.saveAiSubTasks(id, aiTaskList, clusters, results)
  96. if err != nil {
  97. return err
  98. }
  99. return nil
  100. }
  101. func (i *ImageInference) saveTask() (int64, error) {
  102. var synergystatus int64
  103. if len(i.clusters) > 1 {
  104. synergystatus = 1
  105. }
  106. strategyCode, err := i.storage.GetStrategyCode(i.opt.Strategy)
  107. if err != nil {
  108. return 0, err
  109. }
  110. id, err := i.storage.SaveTask(i.opt.TaskName, strategyCode, synergystatus, i.inference.GetAiType())
  111. if err != nil {
  112. return 0, err
  113. }
  114. i.storage.AddNoticeInfo(i.opt.AdapterId, i.adapterName, "", "", i.opt.TaskName, "create", "任务创建中")
  115. return id, nil
  116. }
  117. func (i *ImageInference) saveAiTask(id int64) error {
  118. for _, c := range i.clusters {
  119. clusterName, _ := i.storage.GetClusterNameById(c.ClusterId)
  120. i.opt.Replica = c.Replicas
  121. err := i.storage.SaveAiTask(id, i.opt, i.adapterName, c.ClusterId, clusterName, "", constants.Saved, "")
  122. if err != nil {
  123. return err
  124. }
  125. }
  126. return nil
  127. }
  128. func (i *ImageInference) filterClusters() ([]*FilteredCluster, error) {
  129. var wg sync.WaitGroup
  130. var ch = make(chan *FilteredCluster, len(i.clusters))
  131. var cs []*FilteredCluster
  132. var mutex sync.Mutex
  133. inferMap := i.inferAdapter[i.opt.AdapterId]
  134. for _, cluster := range i.clusters {
  135. wg.Add(1)
  136. c := cluster
  137. go func() {
  138. r := http.Request{}
  139. imageUrls, err := inferMap[c.ClusterId].GetInferUrl(r.Context(), i.opt)
  140. if err != nil {
  141. mutex.Lock()
  142. i.errMap[c.ClusterId] = err.Error()
  143. mutex.Unlock()
  144. wg.Done()
  145. return
  146. }
  147. i.inference.AppendRoute(imageUrls)
  148. clusterName, _ := i.storage.GetClusterNameById(c.ClusterId)
  149. var f FilteredCluster
  150. f.urls = imageUrls
  151. f.clusterId = c.ClusterId
  152. f.clusterName = clusterName
  153. f.imageNum = c.Replicas
  154. ch <- &f
  155. wg.Done()
  156. return
  157. }()
  158. }
  159. wg.Wait()
  160. close(ch)
  161. for s := range ch {
  162. cs = append(cs, s)
  163. }
  164. return cs, nil
  165. }
  166. func (i *ImageInference) inferImages(cs []*FilteredCluster) ([]*types.ImageResult, error) {
  167. var wg sync.WaitGroup
  168. var ch = make(chan *types.ImageResult, len(i.files))
  169. var results []*types.ImageResult
  170. limit := make(chan bool, 7)
  171. var imageNumIdx int32 = 0
  172. var imageNumIdxEnd int32 = 0
  173. for _, c := range cs {
  174. new_images := make([]*ImageFile, len(i.files))
  175. copy(new_images, i.files)
  176. imageNumIdxEnd = imageNumIdxEnd + c.imageNum
  177. new_images = new_images[imageNumIdx:imageNumIdxEnd]
  178. imageNumIdx = imageNumIdx + c.imageNum
  179. wg.Add(len(new_images))
  180. go sendInferReq(new_images, c, &wg, ch, limit)
  181. }
  182. wg.Wait()
  183. close(ch)
  184. for s := range ch {
  185. results = append(results, s)
  186. }
  187. sort.Slice(results, func(p, q int) bool {
  188. return results[p].ClusterName < results[q].ClusterName
  189. })
  190. return results, nil
  191. }
  192. func (i *ImageInference) updateStatus(aiTaskList []*models.TaskAi, cs []*FilteredCluster) error {
  193. //no cluster available
  194. if len(cs) == 0 {
  195. for _, t := range aiTaskList {
  196. t.Status = constants.Failed
  197. t.EndTime = time.Now().Format(time.RFC3339)
  198. if _, ok := i.errMap[strconv.Itoa(int(t.ClusterId))]; ok {
  199. t.Msg = i.errMap[strconv.Itoa(int(t.ClusterId))]
  200. }
  201. err := i.storage.UpdateAiTask(t)
  202. if err != nil {
  203. logx.Errorf(err.Error())
  204. }
  205. }
  206. i.storage.AddNoticeInfo(i.opt.AdapterId, i.adapterName, "", "", i.opt.TaskName, "failed", "任务失败")
  207. return errors.New("available clusters' empty, image infer task failed")
  208. }
  209. //change cluster status
  210. if len(i.clusters) != len(cs) {
  211. var acs []*strategy.AssignedCluster
  212. var rcs []*strategy.AssignedCluster
  213. for _, cluster := range i.clusters {
  214. if contains(cs, cluster.ClusterId) {
  215. var ac *strategy.AssignedCluster
  216. ac = cluster
  217. rcs = append(rcs, ac)
  218. } else {
  219. var ac *strategy.AssignedCluster
  220. ac = cluster
  221. acs = append(acs, ac)
  222. }
  223. }
  224. // update failed cluster status
  225. for _, ac := range acs {
  226. for _, t := range aiTaskList {
  227. if ac.ClusterId == strconv.Itoa(int(t.ClusterId)) {
  228. t.Status = constants.Failed
  229. t.EndTime = time.Now().Format(time.RFC3339)
  230. if _, ok := i.errMap[strconv.Itoa(int(t.ClusterId))]; ok {
  231. t.Msg = i.errMap[strconv.Itoa(int(t.ClusterId))]
  232. }
  233. err := i.storage.UpdateAiTask(t)
  234. if err != nil {
  235. logx.Errorf(err.Error())
  236. }
  237. }
  238. }
  239. }
  240. // update running cluster status
  241. for _, ac := range rcs {
  242. for _, t := range aiTaskList {
  243. if ac.ClusterId == strconv.Itoa(int(t.ClusterId)) {
  244. t.Status = constants.Running
  245. err := i.storage.UpdateAiTask(t)
  246. if err != nil {
  247. logx.Errorf(err.Error())
  248. }
  249. }
  250. }
  251. }
  252. i.storage.AddNoticeInfo(i.opt.AdapterId, i.adapterName, "", "", i.opt.TaskName, "failed", "任务失败")
  253. } else {
  254. for _, t := range aiTaskList {
  255. t.Status = constants.Running
  256. err := i.storage.UpdateAiTask(t)
  257. if err != nil {
  258. logx.Errorf(err.Error())
  259. }
  260. }
  261. i.storage.AddNoticeInfo(i.opt.AdapterId, i.adapterName, "", "", i.opt.TaskName, "running", "任务运行中")
  262. }
  263. return nil
  264. }
  265. func sendInferReq(images []*ImageFile, cluster *FilteredCluster, wg *sync.WaitGroup, ch chan<- *types.ImageResult, limit chan bool) {
  266. for _, image := range images {
  267. limit <- true
  268. go func(t *ImageFile, c *FilteredCluster) {
  269. if len(c.urls) == 1 {
  270. r, err := getInferResult(c.urls[0].Url, t.File, t.ImageResult.ImageName, c.clusterName)
  271. if err != nil {
  272. t.ImageResult.ImageResult = err.Error()
  273. t.ImageResult.ClusterId = c.clusterId
  274. t.ImageResult.ClusterName = c.clusterName
  275. t.ImageResult.Card = c.urls[0].Card
  276. ch <- t.ImageResult
  277. wg.Done()
  278. <-limit
  279. return
  280. }
  281. t.ImageResult.ImageResult = r
  282. t.ImageResult.ClusterId = c.clusterId
  283. t.ImageResult.ClusterName = c.clusterName
  284. t.ImageResult.Card = c.urls[0].Card
  285. ch <- t.ImageResult
  286. wg.Done()
  287. <-limit
  288. return
  289. } else {
  290. idx := rand.Intn(len(c.urls))
  291. r, err := getInferResult(c.urls[idx].Url, t.File, t.ImageResult.ImageName, c.clusterName)
  292. if err != nil {
  293. t.ImageResult.ImageResult = err.Error()
  294. t.ImageResult.ClusterId = c.clusterId
  295. t.ImageResult.ClusterName = c.clusterName
  296. t.ImageResult.Card = c.urls[idx].Card
  297. ch <- t.ImageResult
  298. wg.Done()
  299. <-limit
  300. return
  301. }
  302. t.ImageResult.ImageResult = r
  303. t.ImageResult.ClusterId = c.clusterId
  304. t.ImageResult.ClusterName = c.clusterName
  305. t.ImageResult.Card = c.urls[idx].Card
  306. ch <- t.ImageResult
  307. wg.Done()
  308. <-limit
  309. return
  310. }
  311. }(image, cluster)
  312. <-limit
  313. }
  314. }
  315. func (i *ImageInference) saveAiSubTasks(id int64, aiTaskList []*models.TaskAi, cs []*FilteredCluster, results []*types.ImageResult) error {
  316. //save ai sub tasks
  317. for _, r := range results {
  318. for _, task := range aiTaskList {
  319. if r.ClusterId == strconv.Itoa(int(task.ClusterId)) {
  320. taskAiSub := models.TaskAiSub{
  321. TaskId: id,
  322. TaskName: task.Name,
  323. TaskAiId: task.TaskId,
  324. TaskAiName: task.Name,
  325. ImageName: r.ImageName,
  326. Result: r.ImageResult,
  327. Card: r.Card,
  328. ClusterId: task.ClusterId,
  329. ClusterName: r.ClusterName,
  330. }
  331. err := i.storage.SaveAiTaskImageSubTask(&taskAiSub)
  332. if err != nil {
  333. panic(err)
  334. }
  335. }
  336. }
  337. }
  338. // update succeeded cluster status
  339. var successStatusCount int
  340. for _, c := range cs {
  341. for _, t := range aiTaskList {
  342. if c.clusterId == strconv.Itoa(int(t.ClusterId)) {
  343. t.Status = constants.Completed
  344. t.EndTime = time.Now().Format(time.RFC3339)
  345. err := i.storage.UpdateAiTask(t)
  346. if err != nil {
  347. logx.Errorf(err.Error())
  348. }
  349. successStatusCount++
  350. } else {
  351. continue
  352. }
  353. }
  354. }
  355. if len(cs) == successStatusCount {
  356. i.storage.AddNoticeInfo(i.opt.AdapterId, i.adapterName, "", "", i.opt.TaskName, "completed", "任务完成")
  357. } else {
  358. i.storage.AddNoticeInfo(i.opt.AdapterId, i.adapterName, "", "", i.opt.TaskName, "failed", "任务失败")
  359. }
  360. return nil
  361. }
  362. func getInferResult(url string, file multipart.File, fileName string, clusterName string) (string, error) {
  363. if clusterName == "鹏城云脑II-modelarts" {
  364. r, err := getInferResultModelarts(url, file, fileName)
  365. if err != nil {
  366. return "", err
  367. }
  368. return r, nil
  369. }
  370. var res Res
  371. req := GetRestyRequest(20)
  372. _, err := req.
  373. SetFileReader("file", fileName, file).
  374. SetResult(&res).
  375. Post(url)
  376. if err != nil {
  377. return "", err
  378. }
  379. return res.Result, nil
  380. }
  381. func getInferResultModelarts(url string, file multipart.File, fileName string) (string, error) {
  382. var res Res
  383. /* req := GetRestyRequest(20)
  384. _, err := req.
  385. SetFileReader("file", fileName, file).
  386. SetHeaders(map[string]string{
  387. "ak": "UNEHPHO4Z7YSNPKRXFE4",
  388. "sk": "JWXCE9qcYbc7RjpSRIWt4WgG3ZKF6Q4lPzkJReX9",
  389. }).
  390. SetResult(&res).
  391. Post(url)
  392. if err != nil {
  393. return "", err
  394. }*/
  395. body, err := utils.SendRequest("POST", url, file, fileName)
  396. if err != nil {
  397. return "", err
  398. }
  399. errjson := json.Unmarshal([]byte(body), &res)
  400. if errjson != nil {
  401. log.Fatalf("Error parsing JSON: %s", errjson)
  402. }
  403. return res.Result, nil
  404. }
  405. func GetRestyRequest(timeoutSeconds int64) *resty.Request {
  406. client := resty.New().SetTimeout(time.Duration(timeoutSeconds) * time.Second)
  407. request := client.R()
  408. return request
  409. }
  410. type Res struct {
  411. Result string `json:"result"`
  412. }
  413. func contains(cs []*FilteredCluster, e string) bool {
  414. for _, c := range cs {
  415. if c.clusterId == e {
  416. return true
  417. }
  418. }
  419. return false
  420. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive and autonomous peer-to-peer manner.