You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

imageInfer.go 10 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396
  1. package inference
  2. import (
  3. "context"
  4. "encoding/json"
  5. "errors"
  6. "github.com/go-resty/resty/v2"
  7. "github.com/zeromicro/go-zero/core/logx"
  8. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/database"
  9. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers/option"
  10. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/collector"
  11. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/strategy"
  12. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/storeLink"
  13. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
  14. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
  15. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
  16. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
  17. "log"
  18. "math/rand"
  19. "mime/multipart"
  20. "sort"
  21. "strconv"
  22. "sync"
  23. "time"
  24. )
  25. type ImageFile struct {
  26. ImageResult *types.ImageResult
  27. File multipart.File
  28. }
  29. func Infer(opt *option.InferOption, id int64, adapterName string, clusters []*strategy.AssignedCluster, ts []*ImageFile, aiCollectorAdapterMap map[string]map[string]collector.AiCollector, storage *database.AiStorage, ctx context.Context) ([]*types.ImageResult, error) {
  30. //for i := len(clusters) - 1; i >= 0; i-- {
  31. // if clusters[i].Replicas == 0 {
  32. // clusters = append(clusters[:i], clusters[i+1:]...)
  33. // }
  34. //}
  35. var wg sync.WaitGroup
  36. var cluster_ch = make(chan struct {
  37. urls []*collector.InferUrl
  38. clusterId string
  39. clusterName string
  40. imageNum int32
  41. }, len(clusters))
  42. var cs []struct {
  43. urls []*collector.InferUrl
  44. clusterId string
  45. clusterName string
  46. imageNum int32
  47. }
  48. collectorMap := aiCollectorAdapterMap[opt.AdapterId]
  49. ////save taskai
  50. //for _, c := range clusters {
  51. // clusterName, _ := storage.GetClusterNameById(c.ClusterId)
  52. // opt.Replica = c.Replicas
  53. // err := storage.SaveAiTask(id, opt, adapterName, c.ClusterId, clusterName, "", constants.Saved, "")
  54. // if err != nil {
  55. // return nil, err
  56. // }
  57. //}
  58. var mutex sync.Mutex
  59. errMap := make(map[string]string)
  60. for _, cluster := range clusters {
  61. wg.Add(1)
  62. c := cluster
  63. go func() {
  64. imageUrls, err := collectorMap[c.ClusterId].GetInferUrl(ctx, opt)
  65. if err != nil {
  66. mutex.Lock()
  67. errMap[c.ClusterId] = err.Error()
  68. mutex.Unlock()
  69. wg.Done()
  70. return
  71. }
  72. for i, _ := range imageUrls {
  73. imageUrls[i].Url = imageUrls[i].Url + storeLink.FORWARD_SLASH + "image"
  74. }
  75. clusterName, _ := storage.GetClusterNameById(c.ClusterId)
  76. s := struct {
  77. urls []*collector.InferUrl
  78. clusterId string
  79. clusterName string
  80. imageNum int32
  81. }{
  82. urls: imageUrls,
  83. clusterId: c.ClusterId,
  84. clusterName: clusterName,
  85. imageNum: c.Replicas,
  86. }
  87. cluster_ch <- s
  88. wg.Done()
  89. return
  90. }()
  91. }
  92. wg.Wait()
  93. close(cluster_ch)
  94. for s := range cluster_ch {
  95. cs = append(cs, s)
  96. }
  97. aiTaskList, err := storage.GetAiTaskListById(id)
  98. if err != nil {
  99. return nil, err
  100. }
  101. //no cluster available
  102. if len(cs) == 0 {
  103. for _, t := range aiTaskList {
  104. t.Status = constants.Failed
  105. t.EndTime = time.Now().Format(time.RFC3339)
  106. if _, ok := errMap[strconv.Itoa(int(t.ClusterId))]; ok {
  107. t.Msg = errMap[strconv.Itoa(int(t.ClusterId))]
  108. }
  109. err := storage.UpdateAiTask(t)
  110. if err != nil {
  111. logx.Errorf(err.Error())
  112. }
  113. }
  114. storage.AddNoticeInfo(opt.AdapterId, adapterName, "", "", opt.TaskName, "failed", "任务失败")
  115. return nil, errors.New("image infer task failed")
  116. }
  117. //change cluster status
  118. if len(clusters) != len(cs) {
  119. var acs []*strategy.AssignedCluster
  120. var rcs []*strategy.AssignedCluster
  121. for _, cluster := range clusters {
  122. if contains(cs, cluster.ClusterId) {
  123. var ac *strategy.AssignedCluster
  124. ac = cluster
  125. rcs = append(rcs, ac)
  126. } else {
  127. var ac *strategy.AssignedCluster
  128. ac = cluster
  129. acs = append(acs, ac)
  130. }
  131. }
  132. // update failed cluster status
  133. for _, ac := range acs {
  134. for _, t := range aiTaskList {
  135. if ac.ClusterId == strconv.Itoa(int(t.ClusterId)) {
  136. t.Status = constants.Failed
  137. t.EndTime = time.Now().Format(time.RFC3339)
  138. if _, ok := errMap[strconv.Itoa(int(t.ClusterId))]; ok {
  139. t.Msg = errMap[strconv.Itoa(int(t.ClusterId))]
  140. }
  141. err := storage.UpdateAiTask(t)
  142. if err != nil {
  143. logx.Errorf(err.Error())
  144. }
  145. }
  146. }
  147. }
  148. // update running cluster status
  149. for _, ac := range rcs {
  150. for _, t := range aiTaskList {
  151. if ac.ClusterId == strconv.Itoa(int(t.ClusterId)) {
  152. t.Status = constants.Running
  153. err := storage.UpdateAiTask(t)
  154. if err != nil {
  155. logx.Errorf(err.Error())
  156. }
  157. }
  158. }
  159. }
  160. storage.AddNoticeInfo(opt.AdapterId, adapterName, "", "", opt.TaskName, "failed", "任务失败")
  161. } else {
  162. for _, t := range aiTaskList {
  163. t.Status = constants.Running
  164. err := storage.UpdateAiTask(t)
  165. if err != nil {
  166. logx.Errorf(err.Error())
  167. }
  168. }
  169. storage.AddNoticeInfo(opt.AdapterId, adapterName, "", "", opt.TaskName, "running", "任务运行中")
  170. }
  171. var result_ch = make(chan *types.ImageResult, len(ts))
  172. var results []*types.ImageResult
  173. limit := make(chan bool, 7)
  174. var imageNumIdx int32 = 0
  175. var imageNumIdxEnd int32 = 0
  176. for _, c := range cs {
  177. new_images := make([]*ImageFile, len(ts))
  178. copy(new_images, ts)
  179. imageNumIdxEnd = imageNumIdxEnd + c.imageNum
  180. new_images = new_images[imageNumIdx:imageNumIdxEnd]
  181. imageNumIdx = imageNumIdx + c.imageNum
  182. wg.Add(len(new_images))
  183. go sendInferReq(new_images, c, &wg, result_ch, limit)
  184. }
  185. wg.Wait()
  186. close(result_ch)
  187. for s := range result_ch {
  188. results = append(results, s)
  189. }
  190. sort.Slice(results, func(p, q int) bool {
  191. return results[p].ClusterName < results[q].ClusterName
  192. })
  193. //save ai sub tasks
  194. for _, r := range results {
  195. for _, task := range aiTaskList {
  196. if r.ClusterId == strconv.Itoa(int(task.ClusterId)) {
  197. taskAiSub := models.TaskAiSub{
  198. TaskId: id,
  199. TaskName: task.Name,
  200. TaskAiId: task.TaskId,
  201. TaskAiName: task.Name,
  202. ImageName: r.ImageName,
  203. Result: r.ImageResult,
  204. Card: r.Card,
  205. ClusterId: task.ClusterId,
  206. ClusterName: r.ClusterName,
  207. }
  208. err := storage.SaveAiTaskImageSubTask(&taskAiSub)
  209. if err != nil {
  210. panic(err)
  211. }
  212. }
  213. }
  214. }
  215. // update succeeded cluster status
  216. var successStatusCount int
  217. for _, c := range cs {
  218. for _, t := range aiTaskList {
  219. if c.clusterId == strconv.Itoa(int(t.ClusterId)) {
  220. t.Status = constants.Completed
  221. t.EndTime = time.Now().Format(time.RFC3339)
  222. err := storage.UpdateAiTask(t)
  223. if err != nil {
  224. logx.Errorf(err.Error())
  225. }
  226. successStatusCount++
  227. } else {
  228. continue
  229. }
  230. }
  231. }
  232. if len(cs) == successStatusCount {
  233. storage.AddNoticeInfo(opt.AdapterId, adapterName, "", "", opt.TaskName, "completed", "任务完成")
  234. } else {
  235. storage.AddNoticeInfo(opt.AdapterId, adapterName, "", "", opt.TaskName, "failed", "任务失败")
  236. }
  237. return results, nil
  238. }
  239. func sendInferReq(images []*ImageFile, cluster struct {
  240. urls []*collector.InferUrl
  241. clusterId string
  242. clusterName string
  243. imageNum int32
  244. }, wg *sync.WaitGroup, ch chan<- *types.ImageResult, limit chan bool) {
  245. for _, image := range images {
  246. limit <- true
  247. go func(t *ImageFile, c struct {
  248. urls []*collector.InferUrl
  249. clusterId string
  250. clusterName string
  251. imageNum int32
  252. }) {
  253. if len(c.urls) == 1 {
  254. r, err := getInferResult(c.urls[0].Url, t.File, t.ImageResult.ImageName, c.clusterName)
  255. if err != nil {
  256. t.ImageResult.ImageResult = err.Error()
  257. t.ImageResult.ClusterId = c.clusterId
  258. t.ImageResult.ClusterName = c.clusterName
  259. t.ImageResult.Card = c.urls[0].Card
  260. ch <- t.ImageResult
  261. wg.Done()
  262. <-limit
  263. return
  264. }
  265. t.ImageResult.ImageResult = r
  266. t.ImageResult.ClusterId = c.clusterId
  267. t.ImageResult.ClusterName = c.clusterName
  268. t.ImageResult.Card = c.urls[0].Card
  269. ch <- t.ImageResult
  270. wg.Done()
  271. <-limit
  272. return
  273. } else {
  274. idx := rand.Intn(len(c.urls))
  275. r, err := getInferResult(c.urls[idx].Url, t.File, t.ImageResult.ImageName, c.clusterName)
  276. if err != nil {
  277. t.ImageResult.ImageResult = err.Error()
  278. t.ImageResult.ClusterId = c.clusterId
  279. t.ImageResult.ClusterName = c.clusterName
  280. t.ImageResult.Card = c.urls[idx].Card
  281. ch <- t.ImageResult
  282. wg.Done()
  283. <-limit
  284. return
  285. }
  286. t.ImageResult.ImageResult = r
  287. t.ImageResult.ClusterId = c.clusterId
  288. t.ImageResult.ClusterName = c.clusterName
  289. t.ImageResult.Card = c.urls[idx].Card
  290. ch <- t.ImageResult
  291. wg.Done()
  292. <-limit
  293. return
  294. }
  295. }(image, cluster)
  296. <-limit
  297. }
  298. }
  299. func getInferResult(url string, file multipart.File, fileName string, clusterName string) (string, error) {
  300. if clusterName == "鹏城云脑II-modelarts" {
  301. r, err := getInferResultModelarts(url, file, fileName)
  302. if err != nil {
  303. return "", err
  304. }
  305. return r, nil
  306. }
  307. var res Res
  308. req := GetRestyRequest(20)
  309. _, err := req.
  310. SetFileReader("file", fileName, file).
  311. SetResult(&res).
  312. Post(url)
  313. if err != nil {
  314. return "", err
  315. }
  316. return res.Result, nil
  317. }
  318. func getInferResultModelarts(url string, file multipart.File, fileName string) (string, error) {
  319. var res Res
  320. /* req := GetRestyRequest(20)
  321. _, err := req.
  322. SetFileReader("file", fileName, file).
  323. SetHeaders(map[string]string{
  324. "ak": "UNEHPHO4Z7YSNPKRXFE4",
  325. "sk": "JWXCE9qcYbc7RjpSRIWt4WgG3ZKF6Q4lPzkJReX9",
  326. }).
  327. SetResult(&res).
  328. Post(url)
  329. if err != nil {
  330. return "", err
  331. }*/
  332. body, err := utils.SendRequest("POST", url, file, fileName)
  333. if err != nil {
  334. return "", err
  335. }
  336. errjson := json.Unmarshal([]byte(body), &res)
  337. if errjson != nil {
  338. log.Fatalf("Error parsing JSON: %s", errjson)
  339. }
  340. return res.Result, nil
  341. }
  342. func GetRestyRequest(timeoutSeconds int64) *resty.Request {
  343. client := resty.New().SetTimeout(time.Duration(timeoutSeconds) * time.Second)
  344. request := client.R()
  345. return request
  346. }
  // Res is the JSON payload returned by an inference endpoint.
  type Res struct {
  	// Result holds the value of the "result" key of the response.
  	Result string `json:"result"`
  }
  350. func contains(cs []struct {
  351. urls []*collector.InferUrl
  352. clusterId string
  353. clusterName string
  354. imageNum int32
  355. }, e string) bool {
  356. for _, c := range cs {
  357. if c.clusterId == e {
  358. return true
  359. }
  360. }
  361. return false
  362. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecology of heterogeneous cloud collaboration for JCC in a non-intrusive and autonomous peer-to-peer manner.