
shuguangai.go 36 kB

  1. /*
  2. Copyright (c) [2023] [pcm]
  3. [pcm-coordinator] is licensed under Mulan PSL v2.
  4. You can use this software according to the terms and conditions of the Mulan PSL v2.
  5. You may obtain a copy of Mulan PSL v2 at:
  6. http://license.coscl.org.cn/MulanPSL2
  7. THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
  8. EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
  9. MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
  10. See the Mulan PSL v2 for more details.
  11. */
  12. package storeLink
  13. import (
  14. "context"
  15. "errors"
  16. "gitlink.org.cn/JointCloud/pcm-ac/hpcAC"
  17. hpcacclient "gitlink.org.cn/JointCloud/pcm-ac/hpcacclient"
  18. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/common"
  19. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
  20. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
  21. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/executor"
  22. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference"
  23. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
  24. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
  25. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
  26. "mime/multipart"
  27. "strconv"
  28. "strings"
  29. "sync"
  30. "time"
  31. )
  32. const (
  33. RAM_SIZE_1G = 1024 // 1G
  34. WORKER_NUMBER = 1
  35. DCU = "DCU"
  36. DCU_TOPS = 24.5
  37. PYTORCH = "Pytorch"
  38. TASK_PYTORCH_PREFIX = "PytorchTask"
  39. TENSORFLOW = "Tensorflow"
  40. RESOURCE_GROUP = "kshdtest"
  41. WorkPath = "/work/home/acgnnmfbwo/pcmv1/"
  42. TimeoutLimit = "10:00:00"
  43. PythonCodePath = "/work/home/acgnnmfbwo/111111/py/test.py"
  44. DATASETS_DIR = KUNSHAN_DIR + "/dataset"
  45. ALGORITHM_DIR = KUNSHAN_DIR + "/algorithm"
  46. KUNSHAN_DIR = "/public/home/acgnnmfbwo/pcmv1"
  47. TRAIN_FILE = "train.py"
  48. CPUCOREPRICEPERHOUR = 0.09
  49. DCUPRICEPERHOUR = 2.0
  50. KB = 1024
  51. TIMEOUT = 20
  52. DEPLOY_INSTANCE_LIMIT = 100
  53. ProtocolType = "HTTP"
  54. ContainerPort = 8881
  55. JUPYTER = "jupyter"
  56. Z100L = "Z100L"
  57. )
  58. var (
  59. RESOURCESGAIMAP = map[string]ResourceSpecSGAI{
  60. "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi": {
  61. CPU: 1,
  62. GPU: 1,
  63. RAM: 2 * RAM_SIZE_1G,
  64. },
  65. "6d41v1XV53MQPmQOJ5kNatIck9yl8nWZ": {
  66. CPU: 1,
  67. GPU: 2,
  68. RAM: 2 * RAM_SIZE_1G,
  69. },
  70. "OBtVaaXAv9n9FbLR7pWAoa3yR13jXwNc": {
  71. CPU: 2,
  72. GPU: 3,
  73. RAM: 4 * RAM_SIZE_1G,
  74. },
  75. "sBWfpkntUzsWYly11kdwEHZOYYIsFmve": {
  76. CPU: 4,
  77. GPU: 4,
  78. RAM: 8 * RAM_SIZE_1G,
  79. },
  80. "jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": {
  81. CPU: 5,
  82. GPU: 5,
  83. RAM: 10 * RAM_SIZE_1G,
  84. },
  85. }
  86. RESOURCESPECSAI = map[string]string{
  87. "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi": "CPU:1, DCU:1, RAM:2G",
  88. "6d41v1XV53MQPmQOJ5kNatIck9yl8nWZ": "CPU:1, DCU:2, RAM:2G",
  89. "OBtVaaXAv9n9FbLR7pWAoa3yR13jXwNc": "CPU:2, DCU:3, RAM:4G",
  90. "sBWfpkntUzsWYly11kdwEHZOYYIsFmve": "CPU:4, DCU:4, RAM:8G",
  91. "jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2": "CPU:5, DCU:5, RAM:10G",
  92. }
  93. ModelNameCmdMap = map[string]string{
  94. "blip-image-captioning-base": "sudo pip install transformers python-multipart fastapi uvicorn[standard]; sudo python /public/home/acgnnmfbwo/pcmv1/inference/pytorch/blip_image_captioning_base/infer.py",
  95. "imagenet_resnet50": "sudo pip install fastapi uvicorn[standard] python-multipart; sudo python /public/home/acgnnmfbwo/pcmv1/inference/pytorch/imagenet_resnet50/infer.py",
  96. }
  97. )
  98. type ResourceSpecSGAI struct {
  99. CPU int64
  100. GPU int64
  101. RAM int64
  102. }
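// ShuguangAi is the storelink adapter for the Shuguang (Sugon) AI platform.
// It wraps the hpcAC RPC client and exposes image, training-task, resource
// and inference operations on DCU resources.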
  103. type ShuguangAi struct {
  104. aCRpc hpcacclient.HpcAC
  105. platform string
  106. participantId int64
  107. }
  108. func NewShuguangAi(aCRpc hpcAC.HpcACClient, name string, id int64) *ShuguangAi {
  109. return &ShuguangAi{aCRpc: aCRpc, platform: name, participantId: id}
  110. }
  111. func (s *ShuguangAi) UploadImage(ctx context.Context, path string) (interface{}, error) {
  112. return nil, nil
  113. }
  114. func (s *ShuguangAi) DeleteImage(ctx context.Context, imageId string) (interface{}, error) {
  115. return nil, nil
  116. }
  117. func (s *ShuguangAi) QueryImageList(ctx context.Context) (interface{}, error) {
  118. // shuguangAi: get the image list
  119. req := &hpcAC.GetImageListAiReq{
  120. AcceleratorType: DCU,
  121. TaskType: PYTORCH,
  122. }
  123. resp, err := s.aCRpc.GetImageListAi(ctx, req)
  124. if err != nil {
  125. return nil, err
  126. }
  127. return resp, nil
  128. }
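// SubmitPytorchTask resolves the image by id, builds the python-argument and
// environment strings, and derives work/code paths from algorithmId: a
// dash-separated "taskType-dataset-model" id maps to a train.py under
// ALGORITHM_DIR, otherwise (storage schedule mode) algorithmId is treated as
// the python file path itself.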
  129. func (s *ShuguangAi) SubmitPytorchTask(ctx context.Context, imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string) (interface{}, error) {
  130. // check whether resourceId matches one of the custom resource Ids
  131. _, isMapContainsKey := RESOURCESPECSAI[resourceId]
  132. if !isMapContainsKey {
  133. return nil, errors.New("shuguangAi resource Id does not exist")
  134. }
  135. // get imagePath and version by imageId
  136. imageReq := &hpcAC.GetImageAiByIdReq{ImageId: imageId}
  137. imageResp, err := s.aCRpc.GetImageAiById(ctx, imageReq)
  138. if err != nil {
  139. return nil, err
  140. }
  141. // python arguments
  142. var pythonArg string
  143. for _, param := range params {
  144. s := strings.Split(param, COMMA)
  145. pythonArg += PY_PARAM_PREFIX + s[0] + "=" + s[1] + SPACE
  146. }
  147. // environment variables
  148. var env string
  149. for _, e := range envs {
  150. s := strings.Split(e, COMMA)
  151. env += s[0] + "=" + s[1] + SPACE
  152. }
  153. //set paths
  154. var workPath string
  155. var codePath string
  156. paths := strings.Split(algorithmId, DASH)
  157. if len(paths) == 3 {
  158. workPath = ALGORITHM_DIR + FORWARD_SLASH + paths[0] + FORWARD_SLASH + paths[1] + DASH + paths[2]
  159. codePath = workPath + FORWARD_SLASH + TRAIN_FILE
  160. } else {
  161. // storage schedule submit mode
  162. codePath = algorithmId
  163. paths = strings.Split(algorithmId, FORWARD_SLASH)
  164. last := paths[len(paths)-1]
  165. workPath = strings.TrimSuffix(algorithmId, FORWARD_SLASH+last)
  166. }
  167. req := &hpcAC.SubmitPytorchTaskReq{
  168. Params: &hpcAC.SubmitPytorchTaskParams{
  169. TaskName: TASK_PYTORCH_PREFIX + UNDERSCORE + utils.RandomString(10),
  170. WorkPath: workPath,
  171. IsDistributed: false,
  172. IsHvd: false,
  173. Env: env,
  174. AcceleratorType: DCU,
  175. Version: imageResp.Image.Version,
  176. ImagePath: imageResp.Image.Path,
  177. WorkerNumber: WORKER_NUMBER,
  178. ResourceGroup: RESOURCE_GROUP,
  179. TimeoutLimit: TimeoutLimit,
  180. PythonCodePath: codePath,
  181. PythonArg: pythonArg,
  182. },
  183. }
  184. updateSGAIRequestByResourceId(resourceId, req)
  185. resp, err := s.aCRpc.SubmitPytorchTask(ctx, req)
  186. if err != nil {
  187. return nil, err
  188. }
  189. return resp, nil
  190. }
  191. func updateSGAIRequestByResourceId(resourceId string, req *hpcAC.SubmitPytorchTaskReq) {
  192. spec := RESOURCESGAIMAP[resourceId]
  193. req.Params.WorkerCpuNumber = spec.CPU
  194. req.Params.WorkerGpuNumber = spec.GPU
  195. req.Params.WorkerRamSize = spec.RAM
  196. }
  197. func (s *ShuguangAi) SubmitTensorflowTask(ctx context.Context, imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string) (interface{}, error) {
  198. //req := &hpcAC.SubmitTensorflowTaskReq{
  199. // Params: &hpcAC.SubmitTensorflowTaskParams{
  200. //
  201. // }
  202. //}
  203. return nil, nil
  204. }
  205. func (s *ShuguangAi) SubmitTask(ctx context.Context, imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string, aiType string) (interface{}, error) {
  206. // set algorithmId temporarily for storelink submit
  207. if algorithmId == "" {
  208. algorithmId = "pytorch-mnist-fcn"
  209. }
  210. // shuguangAi: submit the task
  211. switch aiType {
  212. case PYTORCH_TASK:
  213. task, err := s.SubmitPytorchTask(ctx, imageId, cmd, envs, params, resourceId, datasetsId, algorithmId)
  214. if err != nil {
  215. return nil, err
  216. }
  217. return task, nil
  218. case TENSORFLOW_TASK:
  219. task, err := s.SubmitTensorflowTask(ctx, imageId, cmd, envs, params, resourceId, datasetsId, algorithmId)
  220. if err != nil {
  221. return nil, err
  222. }
  223. return task, nil
  224. default:
  225. task, err := s.SubmitPytorchTask(ctx, imageId, cmd, envs, params, resourceId, datasetsId, algorithmId)
  226. if err != nil {
  227. return nil, err
  228. }
  229. return task, nil
  230. }
  231. }
  232. func (s *ShuguangAi) QueryTask(ctx context.Context, taskId string) (interface{}, error) {
  233. // shuguangAi: get the task
  234. req := &hpcAC.GetPytorchTaskReq{
  235. Id: taskId,
  236. }
  237. resp, err := s.aCRpc.GetPytorchTask(ctx, req)
  238. if err != nil {
  239. return nil, err
  240. }
  241. return resp, nil
  242. }
  243. func (s *ShuguangAi) DeleteTask(ctx context.Context, taskId string) (interface{}, error) {
  244. // shuguangAi: delete the task
  245. req := &hpcAC.DeleteTaskAiReq{
  246. Ids: taskId,
  247. }
  248. resp, err := s.aCRpc.DeleteTaskAi(ctx, req)
  249. if err != nil {
  250. return nil, err
  251. }
  252. return resp, nil
  253. }
  254. func (s *ShuguangAi) QuerySpecs(ctx context.Context) (interface{}, error) {
  255. resp := &types.GetResourceSpecsResp{}
  256. for k, v := range RESOURCESPECSAI {
  257. var respec types.ResourceSpecSl
  258. respec.SpecId = k
  259. respec.SpecName = v
  260. respec.ParticipantId = s.participantId
  261. respec.ParticipantName = s.platform
  262. resp.ResourceSpecs = append(resp.ResourceSpecs, &respec)
  263. }
  264. resp.Success = true
  265. return resp, nil
  266. }
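// GetResourceStats fans out goroutines for history jobs, balance, quota
// limits, disk, memory and occupied resources, then aggregates the results.
// The WaitGroup counts five of the six goroutines; the balance goroutine
// reports through the cBalance channel and is guarded by a timeout in the
// caller instead.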
  267. func (s *ShuguangAi) GetResourceStats(ctx context.Context) (*collector.ResourceStats, error) {
  268. var wg sync.WaitGroup
  269. wg.Add(5)
  270. var cBalance = make(chan float64)
  271. var cMemTotal = make(chan float64)
  272. var cTotalCpu = make(chan int64)
  273. resourceStats := &collector.ResourceStats{
  274. ClusterId: strconv.FormatInt(s.participantId, 10),
  275. Name: s.platform,
  276. }
  277. dcu := &collector.Card{
  278. Platform: SHUGUANGAI,
  279. Type: CARD,
  280. Name: DCU,
  281. TOpsAtFp16: DCU_TOPS,
  282. }
  283. //history jobs
  284. go func() {
  285. hReq := &hpcAC.ListHistoryJobReq{}
  286. hReq.Start = 0
  287. hReq.Limit = 1
  288. hReq.IsQueryByQueueTime = "false"
  289. hReq.TimeType = "CUSTOM"
  290. hReq.StartTime = "2024-01-01 01:01:01"
  291. endTime := time.Now().Format("2006-01-02 15:04:05")
  292. hReq.EndTime = endTime
  293. hResp, err := s.aCRpc.ListHistoryJob(ctx, hReq)
  294. if err != nil || hResp.Code != "0" {
  295. wg.Done()
  296. return
  297. }
  298. if hResp.Data == nil {
  299. wg.Done()
  300. return
  301. }
  302. resourceStats.TaskCompleted = int64(hResp.Data.Total)
  303. wg.Done()
  304. }()
  305. //balance
  306. go func() {
  307. userReq := &hpcAC.GetUserInfoReq{}
  308. userinfo, err := s.aCRpc.GetUserInfo(ctx, userReq)
  309. if err != nil {
  310. return
  311. }
  312. if userinfo.Data == nil {
  313. // not counted in the WaitGroup: the caller waits on cBalance with a timeout instead
  314. return
  315. }
  316. balance, _ := strconv.ParseFloat(userinfo.Data.AccountBalance, 64)
  317. resourceStats.Balance = balance
  318. cBalance <- balance
  319. }()
  320. //resource limit
  321. go func() {
  322. limitReq := &hpcAC.QueueReq{}
  323. limitResp, err := s.aCRpc.QueryUserQuotasLimit(ctx, limitReq)
  324. if err != nil || limitResp.Code != "0" {
  325. wg.Done()
  326. return
  327. }
  328. if limitResp.Data == nil {
  329. wg.Done()
  330. return
  331. }
  332. totalCpu := limitResp.Data.AccountMaxCpu
  333. totalDcu := limitResp.Data.AccountMaxDcu
  334. dcu.CardNum = int32(totalDcu)
  335. resourceStats.CpuCoreTotal = totalCpu
  336. cTotalCpu <- totalCpu
  337. wg.Done()
  338. }()
  339. //disk
  340. go func() {
  341. diskReq := &hpcAC.ParaStorQuotaReq{}
  342. diskResp, err := s.aCRpc.ParaStorQuota(ctx, diskReq)
  343. if err != nil {
  344. wg.Done()
  345. return
  346. }
  347. if diskResp.Data == nil {
  348. wg.Done()
  349. return
  350. }
  351. totalDisk := common.RoundFloat(diskResp.Data[0].Threshold*KB*KB*KB, 3)
  352. availDisk := common.RoundFloat((diskResp.Data[0].Threshold-diskResp.Data[0].Usage)*KB*KB*KB, 3)
  353. resourceStats.DiskTotal = totalDisk
  354. resourceStats.DiskAvail = availDisk
  355. wg.Done()
  356. }()
  357. //memory
  358. go func() {
  359. nodeResp, err := s.aCRpc.GetNodeResources(ctx, nil)
  360. if err != nil {
  361. wg.Done()
  362. return
  363. }
  364. if nodeResp.Data == nil {
  365. wg.Done()
  366. return
  367. }
  368. memSize := common.RoundFloat(float64(nodeResp.Data.MemorySize)*KB*KB, 3) // MB to BYTES
  369. resourceStats.MemTotal = memSize
  370. cMemTotal <- memSize
  371. wg.Done()
  372. }()
  373. //resources being occupied
  374. go func() {
  375. var memSize float64
  376. var totalCpu int64
  377. select {
  378. case v := <-cMemTotal:
  379. memSize = v
  380. case <-time.After(TIMEOUT * time.Second):
  381. wg.Done()
  382. return
  383. }
  384. select {
  385. case v := <-cTotalCpu:
  386. totalCpu = v
  387. case <-time.After(TIMEOUT * time.Second):
  388. wg.Done()
  389. return
  390. }
  391. memberJobResp, err := s.aCRpc.GetMemberJobs(ctx, nil)
  392. if err != nil {
  393. wg.Done()
  394. return
  395. }
  396. if memberJobResp.Data == nil {
  397. wg.Done()
  398. return
  399. }
  400. var cpuCoreAvail int64
  401. var memAvail float64
  402. if len(memberJobResp.Data) == 0 {
  403. cpuCoreAvail = totalCpu
  404. memAvail = memSize
  405. } else {
  406. var cpuCoreUsed int64
  407. var memUsed float64
  408. for _, datum := range memberJobResp.Data {
  409. cpuCoreUsed += datum.CpuCore
  410. }
  411. memUsed = float64(cpuCoreUsed * 2 * KB * KB * KB) // 2 GB per cpu core
  412. if cpuCoreUsed > totalCpu {
  413. cpuCoreAvail = 0
  414. } else {
  415. cpuCoreAvail = totalCpu - cpuCoreUsed
  416. }
  417. if memUsed > memSize {
  418. memAvail = 0
  419. } else {
  420. memAvail = memSize - memUsed
  421. }
  422. }
  423. resourceStats.CpuCoreAvail = cpuCoreAvail
  424. resourceStats.MemAvail = memAvail
  425. wg.Done()
  426. }()
  427. //usable hours
  428. var balance float64
  429. select {
  430. case v := <-cBalance:
  431. balance = v
  432. case <-time.After(TIMEOUT * time.Second):
  433. return nil, errors.New("get balance rpc call failed")
  434. }
  435. var cards []*collector.Card
  436. cardHours := common.RoundFloat(balance/DCUPRICEPERHOUR, 3)
  437. cpuHours := common.RoundFloat(balance/CPUCOREPRICEPERHOUR, 3)
  438. dcu.CardHours = cardHours
  439. resourceStats.CpuCoreHours = cpuHours
  440. resourceStats.Balance = balance
  441. wg.Wait()
  442. cards = append(cards, dcu)
  443. resourceStats.CardsAvail = cards
  444. return resourceStats, nil
  445. }
  446. func (s *ShuguangAi) GetDatasetsSpecs(ctx context.Context) ([]*collector.DatasetsSpecs, error) {
  447. req := &hpcAC.GetFileListReq{Limit: 100, Path: DATASETS_DIR, Start: 0}
  448. list, err := s.aCRpc.GetFileList(ctx, req)
  449. if err != nil {
  450. return nil, err
  451. }
  452. if list.Code != "0" {
  453. return nil, errors.New(list.Msg)
  454. }
  455. specs := []*collector.DatasetsSpecs{}
  456. for _, file := range list.Data.FileList {
  457. spec := &collector.DatasetsSpecs{Name: file.Name, Size: strconv.FormatInt(file.Size, 10)}
  458. specs = append(specs, spec)
  459. }
  460. return specs, nil
  461. }
  462. func (s *ShuguangAi) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm, error) {
  463. var algorithms []*collector.Algorithm
  464. for _, t := range GetTaskTypes() {
  465. taskType := t
  466. req := &hpcAC.GetFileListReq{Limit: 100, Path: ALGORITHM_DIR + FORWARD_SLASH + taskType, Start: 0, Order: "asc", OrderBy: "name", KeyWord: ""}
  467. list, err := s.aCRpc.GetFileList(ctx, req)
  468. if err != nil {
  469. return nil, err
  470. }
  471. if list.Code != "0" {
  472. return nil, errors.New(list.Msg)
  473. }
  474. for _, file := range list.Data.FileList {
  475. algorithm := &collector.Algorithm{Name: file.Name, Platform: SHUGUANGAI, TaskType: taskType}
  476. algorithms = append(algorithms, algorithm)
  477. }
  478. }
  479. return algorithms, nil
  480. }
  481. func (s *ShuguangAi) GetComputeCards(ctx context.Context) ([]string, error) {
  482. var cards []string
  483. cards = append(cards, DCU)
  484. return cards, nil
  485. }
  486. func (s *ShuguangAi) GetUserBalance(ctx context.Context) (float64, error) {
  487. userReq := &hpcAC.GetUserInfoReq{}
  488. userinfo, err := s.aCRpc.GetUserInfo(ctx, userReq)
  489. if err != nil {
  490. return 0, err
  491. }
  492. balance, _ := strconv.ParseFloat(userinfo.Data.AccountBalance, 64)
  493. return balance, nil
  494. }
  495. func (s *ShuguangAi) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) {
  496. algoName := dataset + DASH + algorithm
  497. req := &hpcAC.GetFileReq{
  498. Path: ALGORITHM_DIR + FORWARD_SLASH + taskType + FORWARD_SLASH + algoName + FORWARD_SLASH + TRAIN_FILE,
  499. }
  500. resp, err := s.aCRpc.GetFile(ctx, req)
  501. if err != nil {
  502. return "", err
  503. }
  504. return resp.Content, nil
  505. }
  506. func (s *ShuguangAi) UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error {
  507. algoName := dataset + DASH + algorithm
  508. req := &hpcAC.UploadFileReq{
  509. Path: ALGORITHM_DIR + FORWARD_SLASH + taskType + FORWARD_SLASH + algoName + FORWARD_SLASH,
  510. Cover: "cover",
  511. File: code,
  512. }
  513. _, err := s.aCRpc.UploadFile(ctx, req)
  514. if err != nil {
  515. return err
  516. }
  517. return nil
  518. }
  519. func (s *ShuguangAi) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) {
  520. req := &hpcAC.GetInstanceLogReq{
  521. TaskId: taskId,
  522. InstanceNum: instanceNum,
  523. LineCount: 1000,
  524. StartLineNum: -1,
  525. }
  526. resp, err := s.aCRpc.GetInstanceLog(ctx, req)
  527. if err != nil {
  528. return "", err
  529. }
  530. if resp.Code != "0" || resp.Data == nil {
  531. return "waiting for logs...", nil
  532. }
  533. return resp.Data.Content, nil
  534. }
  535. func (s *ShuguangAi) GetTrainingTask(ctx context.Context, taskId string) (*collector.Task, error) {
  536. resp, err := s.QueryTask(ctx, taskId)
  537. if err != nil {
  538. return nil, err
  539. }
  540. jobresp := (resp).(*hpcAC.GetPytorchTaskResp)
  541. if jobresp.Code != "0" {
  542. return nil, errors.New(jobresp.Msg)
  543. }
  544. var task collector.Task
  545. task.Id = jobresp.Data.Id
  546. if jobresp.Data.StartTime != "" {
  547. task.Start = jobresp.Data.StartTime
  548. }
  549. if jobresp.Data.EndTime != "" {
  550. task.End = jobresp.Data.EndTime
  551. }
  552. task.Status = jobresp.Data.Status
  553. return &task, nil
  554. }
  555. func (s *ShuguangAi) Stop(ctx context.Context, id string) error {
  556. req := &hpcAC.StopTaskAiReq{
  557. Id: id,
  558. }
  559. resp, err := s.aCRpc.StopTaskAi(ctx, req)
  560. if err != nil {
  561. return err
  562. }
  563. if resp.Code != "0" {
  564. return errors.New(resp.Msg)
  565. }
  566. return nil
  567. }
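// Execute prepares submit parameters according to the submit mode:
// SUBMIT_MODE_JOINT_CLOUD derives all parameters via GenerateSubmitParams,
// while SUBMIT_MODE_STORAGE_SCHEDULE maps the requested DCU count onto one of
// the predefined specs in RESOURCESGAIMAP before submitting the task.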
  568. func (s *ShuguangAi) Execute(ctx context.Context, option *option.AiOption, mode int) (interface{}, error) {
  569. switch mode {
  570. case executor.SUBMIT_MODE_JOINT_CLOUD:
  571. err := s.GenerateSubmitParams(ctx, option)
  572. if err != nil {
  573. return nil, err
  574. }
  575. case executor.SUBMIT_MODE_STORAGE_SCHEDULE:
  576. var dcuNum int64
  577. for _, res := range option.ResourcesRequired {
  578. typeName, ok := res["type"]
  579. if !ok {
  580. continue
  581. }
  582. switch typeName {
  583. case DCU:
  584. num, ok := res["number"]
  585. if !ok {
  586. continue
  587. }
  588. n := common.ConvertTypeToString(num)
  589. val, err := strconv.ParseInt(n, 10, 64)
  590. if err != nil {
  591. return nil, err
  592. }
  593. dcuNum = val
  594. }
  595. }
  596. for k, v := range RESOURCESGAIMAP {
  597. if dcuNum == v.GPU {
  598. option.ResourceId = k
  599. break
  600. }
  601. if dcuNum == 0 && v.GPU == 1 {
  602. option.ResourceId = k
  603. break
  604. }
  605. if dcuNum >= 5 && v.GPU == 5 {
  606. option.ResourceId = k
  607. break
  608. }
  609. }
  610. option.ComputeCard = DCU
  611. default:
  612. return nil, errors.New("failed to choose submit mode")
  613. }
  614. task, err := s.SubmitTask(ctx, option.ImageId, option.Cmd, option.Envs, option.Params, option.ResourceId, option.DatasetsId, option.AlgorithmId, option.TaskType)
  615. if err != nil {
  616. return nil, err
  617. }
  618. return task, nil
  619. }
  620. func (s *ShuguangAi) GenerateSubmitParams(ctx context.Context, option *option.AiOption) error {
  621. err := s.generateResourceId(option)
  622. if err != nil {
  623. return err
  624. }
  625. err = s.generateImageId(ctx, option)
  626. if err != nil {
  627. return err
  628. }
  629. err = s.generateAlgorithmId(ctx, option)
  630. if err != nil {
  631. return err
  632. }
  633. err = s.generateCmd(option)
  634. if err != nil {
  635. return err
  636. }
  637. err = s.generateEnv(option)
  638. if err != nil {
  639. return err
  640. }
  641. err = s.generateParams(option)
  642. if err != nil {
  643. return err
  644. }
  645. return nil
  646. }
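// generateResourceId picks a resource spec id: CPU requests get the smallest
// spec, while CARD (DCU) requests are matched against the requested TOPS at
// DCU_TOPS (24.5) per card, falling back to the largest 5-card spec when the
// request exceeds it.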
  647. func (s *ShuguangAi) generateResourceId(option *option.AiOption) error {
  648. if option.ResourceType == "" {
  649. return errors.New("ResourceType not set")
  650. }
  651. if option.ResourceType == CPU {
  652. option.ResourceId = "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi"
  653. option.ComputeCard = CPU
  654. return nil
  655. }
  656. if option.ResourceType == CARD {
  657. if option.ComputeCard == "" {
  658. option.ComputeCard = DCU
  659. }
  660. if strings.ToUpper(option.ComputeCard) != DCU {
  661. return errors.New("computeCard not found")
  662. }
  663. option.ComputeCard = DCU
  664. if 0 <= option.Tops && option.Tops <= DCU_TOPS {
  665. option.ResourceId = "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi"
  666. return nil
  667. }
  668. cardNum := 5
  669. for k, v := range RESOURCESGAIMAP {
  670. for i := 1; i <= cardNum; i++ {
  671. if float64(i)*DCU_TOPS <= option.Tops && option.Tops <= float64(v.GPU)*DCU_TOPS {
  672. option.ResourceId = k
  673. return nil
  674. }
  675. }
  676. }
  677. if option.Tops > float64(cardNum)*DCU_TOPS {
  678. option.ResourceId = "jeYBVPwyIALjVYNzHvysh2o5CsBpBLp2"
  679. return nil
  680. }
  681. }
  682. return errors.New("failed to get ResourceId")
  683. }
  684. func (s *ShuguangAi) generateImageId(ctx context.Context, option *option.AiOption) error {
  685. if option.TaskType == "" {
  686. return errors.New("TaskType not set")
  687. }
  688. taskType := strings.Title(option.TaskType)
  689. req := &hpcAC.GetImageListAiReq{
  690. AcceleratorType: DCU,
  691. TaskType: taskType,
  692. }
  693. resp, err := s.aCRpc.GetImageListAi(ctx, req)
  694. if err != nil {
  695. return errors.New("generateImageId / GetImageListAi: " + err.Error())
  696. }
  697. if resp.Code != "0" {
  698. return errors.New("failed to get imageId")
  699. }
  700. for _, datum := range resp.Data {
  701. ns := strings.Split(datum.Version, COLON)
  702. if ns[0] == "jupyterlab-pytorch" {
  703. option.ImageId = datum.ImageId
  704. return nil
  705. }
  706. }
  707. return errors.New("failed to get ImageId")
  708. }
  709. func (s *ShuguangAi) generateAlgorithmId(ctx context.Context, option *option.AiOption) error {
  710. if option.DatasetsName == "" {
  711. return errors.New("DatasetsName not set")
  712. }
  713. req := &hpcAC.GetFileListReq{Limit: 100, Path: ALGORITHM_DIR + FORWARD_SLASH + option.TaskType, Start: 0}
  714. list, err := s.aCRpc.GetFileList(ctx, req)
  715. if err != nil {
  716. return errors.New("generateAlgorithmId / GetFileListReq: " + err.Error())
  717. }
  718. if list.Code != "0" {
  719. return errors.New(list.Msg)
  720. }
  721. var algorithmId string
  722. for _, file := range list.Data.FileList {
  723. ns := strings.Split(file.Name, DASH)
  724. if ns[0] == option.DatasetsName {
  725. algoName := ns[1]
  726. if option.AlgorithmName == "" {
  727. switch option.DatasetsName {
  728. case "cifar10":
  729. algorithmId = option.TaskType + DASH + option.DatasetsName + DASH + "cnn"
  730. option.AlgorithmId = algorithmId
  731. option.AlgorithmName = algoName
  732. return nil
  733. case "mnist":
  734. algorithmId = option.TaskType + DASH + option.DatasetsName + DASH + "fcn"
  735. option.AlgorithmId = algorithmId
  736. option.AlgorithmName = algoName
  737. return nil
  738. }
  739. } else {
  740. if algoName == option.AlgorithmName {
  741. algorithmId = option.TaskType + DASH + option.DatasetsName + DASH + algoName
  742. option.AlgorithmId = algorithmId
  743. return nil
  744. }
  745. }
  746. }
  747. }
  748. if algorithmId == "" {
  749. return errors.New("Algorithm does not exist")
  750. }
  751. return errors.New("failed to get AlgorithmId")
  752. }
  753. func (s *ShuguangAi) generateCmd(option *option.AiOption) error {
  754. return nil
  755. }
  756. func (s *ShuguangAi) generateEnv(option *option.AiOption) error {
  757. return nil
  758. }
  759. func (s *ShuguangAi) generateParams(option *option.AiOption) error {
  760. if option.ResourceType == "" {
  761. return errors.New("ResourceType not set")
  762. }
  763. if len(option.Params) == 0 {
  764. epoch := "epoch" + COMMA + "1"
  765. option.Params = append(option.Params, epoch)
  766. }
  767. switch option.ResourceType {
  768. case CPU:
  769. card := "card" + COMMA + CPU
  770. option.Params = append(option.Params, card)
  771. return nil
  772. case CARD:
  773. card := "card" + COMMA + "cuda:0"
  774. option.Params = append(option.Params, card)
  775. return nil
  776. }
  777. return errors.New("failed to set params")
  778. }
  779. func (s *ShuguangAi) GetClusterInferUrl(ctx context.Context, option *option.InferOption) (*inference.ClusterInferUrl, error) {
  780. var imageUrls []*inference.InferUrl
  781. urlReq := &hpcAC.GetInferUrlReq{
  782. ModelName: option.ModelName,
  783. Type: option.ModelType,
  784. Card: "dcu",
  785. }
  786. urlResp, err := s.aCRpc.GetInferUrl(ctx, urlReq)
  787. if err != nil {
  788. return nil, err
  789. }
  790. imageUrl := &inference.InferUrl{
  791. Url: urlResp.Url,
  792. Card: "dcu",
  793. }
  794. imageUrls = append(imageUrls, imageUrl)
  795. clusterWithUrl := &inference.ClusterInferUrl{
  796. ClusterName: s.platform,
  797. ClusterType: TYPE_SHUGUANGAI,
  798. InferUrls: imageUrls,
  799. }
  800. return clusterWithUrl, nil
  801. }
  802. func (s *ShuguangAi) GetInferDeployInstanceList(ctx context.Context) ([]*inference.DeployInstance, error) {
  803. var insList []*inference.DeployInstance
  804. params := &hpcAC.GetInstanceServiceListReqParam{
  805. InstanceServiceName: DEPLOY_INSTANCE_PREFIEX,
  806. Status: "",
  807. TaskType: "",
  808. Start: 0,
  809. Limit: DEPLOY_INSTANCE_LIMIT,
  810. Sort: "desc",
  811. }
  812. req := &hpcacclient.GetInstanceServiceListReq{
  813. Param: params,
  814. }
  815. list, err := s.aCRpc.GetInstanceServiceList(ctx, req)
  816. if err != nil {
  817. return nil, err
  818. }
  819. if list.Code != "0" {
  820. return nil, errors.New(list.Msg)
  821. }
  822. for _, datum := range list.Data {
  823. ins := &inference.DeployInstance{}
  824. ins.InstanceName = datum.InstanceServiceName
  825. ins.InstanceId = datum.Id
  826. ins.ClusterName = s.platform
  827. ins.Status = datum.Status
  828. ins.InferCard = DCU
  829. ins.CreatedTime = datum.CreateTime
  830. ins.ClusterType = TYPE_SHUGUANGAI
  831. insList = append(insList, ins)
  832. }
  833. return insList, nil
  834. }
  835. func (s *ShuguangAi) StartInferDeployInstance(ctx context.Context, id string) bool {
  836. req := &hpcAC.StartInstanceServiceReq{
  837. InstanceServiceId: id,
  838. }
  839. resp, err := s.aCRpc.StartInstanceService(ctx, req)
  840. if err != nil || resp.Code != "0" {
  841. return false
  842. }
  843. if resp.Data == id && resp.Code == "0" {
  844. return true
  845. }
  846. return false
  847. }
  848. func (s *ShuguangAi) StopInferDeployInstance(ctx context.Context, id string) bool {
  849. ids := []string{id}
  850. req := &hpcAC.StopInstanceServiceReq{
  851. Ids: ids,
  852. }
  853. resp, err := s.aCRpc.StopInstanceService(ctx, req)
  854. if err != nil || resp.Code != "0" {
  855. return false
  856. }
  857. if resp.Code == "0" {
  858. return true
  859. }
  860. return false
  861. }
  862. func (s *ShuguangAi) GetInferDeployInstance(ctx context.Context, id string) (*inference.DeployInstance, error) {
  863. ins := &inference.DeployInstance{}
  864. req := &hpcAC.GetInstanceServiceDetailReq{
  865. Id: id,
  866. }
  867. resp, err := s.aCRpc.GetInstanceServiceDetail(ctx, req)
  868. if err != nil {
  869. return nil, err
  870. }
  871. if resp.Code != "0" || resp.Data == nil {
  872. return nil, errors.New("GetInferDeployInstance empty")
  873. }
  874. var url string
  875. if resp.Data.Status == constants.Running {
  876. url = resp.Data.ContainerPortInfoList[0].AccessUrl
  877. }
  878. var modelType string
  879. var modelName string
  880. var card string
  881. if resp.Data.Description != "" {
  882. str := strings.Split(resp.Data.Description, FORWARD_SLASH)
  883. if len(str) == 3 {
  884. modelType = str[0]
  885. modelName = str[1]
  886. card = str[2]
  887. }
  888. }
  889. ins.InstanceName = resp.Data.InstanceServiceName
  890. ins.InstanceId = resp.Data.Id
  891. ins.ClusterName = s.platform
  892. ins.Status = resp.Data.Status
  893. ins.InferCard = DCU
  894. ins.CreatedTime = resp.Data.CreateTime
  895. ins.ClusterType = TYPE_SHUGUANGAI
  896. ins.ModelType = modelType
  897. ins.ModelName = modelName
  898. ins.InferUrl = url
  899. ins.InferCard = card
  900. return ins, nil
  901. }
  902. func (s *ShuguangAi) GetImageInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) {
  903. return "", nil
  904. }
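// CreateInferDeployInstance launches a jupyter-type instance service whose
// start script installs and runs the inference server for the requested model
// (see ModelNameCmdMap). The Description field encodes
// "modelType/modelName/card" so that GetInferDeployInstance can parse these
// values back out later.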
  905. func (s *ShuguangAi) CreateInferDeployInstance(ctx context.Context, option *option.InferOption) (string, error) {
  906. containerPortInfoList := []*hpcAC.ContainerPortInfoList{
  907. {
  908. ProtocolType: ProtocolType,
  909. ContainerPort: ContainerPort,
  910. },
  911. }
  912. desc := option.ModelType + FORWARD_SLASH + option.ModelName + FORWARD_SLASH + strings.ToLower(DCU)
  913. instanceServiceName := "infer_instance" + UNDERSCORE + utils.TimeString()
  914. resourceGroup := "kshdtest"
  915. script, ok := ModelNameCmdMap[option.ModelName]
  916. if !ok {
  917. return "", errors.New("failed to set cmd, ModelName not exist")
  918. }
  919. param := &hpcAC.CreateParams{
  920. AcceleratorType: strings.ToLower(DCU),
  921. ContainerPortInfoList: containerPortInfoList,
  922. CpuNumber: 8,
  923. Description: desc,
  924. //env
  925. GpuNumber: 1,
  926. ImagePath: "11.11.100.6:5000/dcu/admin/base/jupyterlab-pytorch:1.13.1-py3.7-dtk23.04-centos7.6",
  927. InstanceServiceName: instanceServiceName,
  928. MountInfoList: make([]*hpcAC.MountInfoList, 0),
  929. //originalVersion
  930. RamSize: 10 * RAM_SIZE_1G,
  931. //rdma
  932. ResourceGroup: resourceGroup,
  933. StartScriptActionScope: "all",
  934. StartScriptContent: script,
  935. //startServiceCommand
  936. //taskClassification: "interactive"
  937. TaskNumber: 1,
  938. TaskType: JUPYTER,
  939. TimeoutLimit: "01:00:00",
  940. UseStartScript: true,
  941. //useStartServiceCommand: false
  942. Version: "jupyterlab-pytorch:1.13.1-py3.7-dtk23.04-centos7.6",
  943. }
  944. req := &hpcacclient.CreateInstanceServiceReq{
  945. Data: param,
  946. }
  947. resp, err := s.aCRpc.CreateInstanceService(ctx, req)
  948. if err != nil {
  949. return "", err
  950. }
  951. if resp.Code != "0" {
  952. return "", errors.New(resp.Msg)
  953. }
  954. return resp.Data, nil
  955. }
  956. func (s *ShuguangAi) CheckModelExistence(ctx context.Context, name string, mtype string) bool {
  957. modelPath := "model" + FORWARD_SLASH + name
  958. req := &hpcAC.IsExistFileReq{
  959. Path: KUNSHAN_DIR + FORWARD_SLASH + modelPath,
  960. }
  961. resp, err := s.aCRpc.IsExistFile(ctx, req)
  962. if err != nil {
  963. return false
  964. }
  965. if resp.Code != "0" || resp.Data == nil {
  966. return false
  967. }
  968. return resp.Data.Exist
  969. }
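// GetResourceSpecs is currently a stub (returns nil, nil); the commented-out
// body below sketches a channel-based aggregation of queue, job, storage and
// user-info queries that has been disabled.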
  970. func (s *ShuguangAi) GetResourceSpecs(ctx context.Context, resrcType string) (*collector.ResourceSpec, error) {
  971. return nil, nil
  972. //var timeout = 5
  973. //var wg sync.WaitGroup
  974. //var uwg sync.WaitGroup
  975. //wg.Add(3)
  976. //uwg.Add(3)
  977. //var ch = make(chan *collector.Usage, 2)
  978. //var qCh = make(chan *collector.Usage, 2)
  979. //var sch = make(chan *collector.Usage, 1)
  980. //var cresCh = make(chan *collector.ClusterResource)
  981. //
  982. //resUsage := &collector.ResourceSpec{
  983. // ClusterId: strconv.FormatInt(s.participantId, 10),
  984. //}
  985. //
  986. //var resources []interface{}
  987. //
  988. //// query the queues accessible to the user
  989. //go func() {
  990. // defer wg.Done()
  991. // defer close(ch)
  992. // done := make(chan bool)
  993. // go func() {
  994. // defer uwg.Done()
  995. // queueResp, err := s.aCRpc.SelectQueueByUser(ctx, nil)
  996. // if err != nil {
  997. // done <- true
  998. // return
  999. // }
  1000. //
  1001. // if len(queueResp.Data) == 0 {
  1002. // done <- true
  1003. // return
  1004. // }
  1005. //
  1006. // var data *hpcAC.QueueData
  1007. // for _, datum := range queueResp.Data {
  1008. // if datum.QueueName == RESOURCE_GROUP {
  1009. // data = datum
  1010. // break
  1011. // }
  1012. // }
  1013. //
  1014. // //rate
  1015. // queChargeRate, _ := strconv.ParseFloat(data.QueChargeRate, 64)
  1016. // rate := &collector.Usage{
  1017. // Type: strings.ToUpper(RATE),
  1018. // Total: &collector.UnitValue{Unit: PERHOUR, Value: queChargeRate},
  1019. // }
  1020. //
  1021. // cresCh <- &collector.ClusterResource{Resource: rate}
  1022. //
  1023. // var freeNodes int64
  1024. // var cpuPerNode int64
  1025. // var dcuPerNode int64
  1026. // freeNodes, _ = strconv.ParseInt(data.QueFreeNodes, 10, 10)
  1027. // cpuPerNode, _ = strconv.ParseInt(data.QueMaxPPN, 10, 10)
  1028. // dcuPerNode, _ = strconv.ParseInt(data.QueMaxDcuPN, 10, 10)
  1029. //
  1030. // cpu := &collector.Usage{
  1031. // Type: strings.ToUpper(CPU),
  1032. // Total: &collector.UnitValue{Unit: CPUCORE, Value: freeNodes * cpuPerNode},
  1033. // }
  1034. //
  1035. // ch <- cpu
  1036. //
  1037. // dcu := &collector.Usage{
  1038. // Type: DCU,
  1039. // Name: Z100L,
  1040. // Total: &collector.UnitValue{Unit: NUMBER, Value: freeNodes * dcuPerNode},
  1041. // }
  1042. //
  1043. // ch <- dcu
  1044. //
  1045. // done <- true
  1046. // }()
  1047. //
  1048. // select {
  1049. // case <-done:
  1050. // return
  1051. // case <-time.After(time.Duration(timeout) * time.Second):
  1052. // return
  1053. // }
  1054. //
  1055. //}()
  1056. //
  1057. //// query the real-time job list
  1058. //go func() {
  1059. // defer wg.Done()
  1060. // defer close(qCh)
  1061. // done := make(chan bool)
  1062. // go func() {
  1063. // defer uwg.Done()
  1064. // jobList, err := s.aCRpc.ListJob(ctx, nil)
  1065. // if err != nil {
  1066. // done <- true
  1067. // return
  1068. // }
  1069. //
  1070. // // running task num
  1071. // run := &collector.Usage{}
  1072. // run.Type = strings.ToUpper(RUNNINGTASK)
  1073. //
  1074. // if len(jobList.Jobs) == 0 {
  1075. // var v int64
  1076. // run.Total = &collector.UnitValue{
  1077. // Unit: NUMBER,
  1078. // Value: v,
  1079. // }
  1080. //
  1081. // cresCh <- &collector.ClusterResource{Resource: run}
  1082. //
  1083. // done <- true
  1084. // return
  1085. // } else {
  1086. // var v int64
  1087. // v = int64(len(jobList.Jobs))
  1088. // run.Total = &collector.UnitValue{
  1089. // Unit: NUMBER,
  1090. // Value: v,
  1091. // }
  1092. //
  1093. // cresCh <- &collector.ClusterResource{Resource: run}
  1094. // }
  1095. //
  1096. // var cpureqed atomic.Int64
  1097. // var dcureqed atomic.Int64
  1098. // //var jwg sync.WaitGroup
  1099. // //for _, j := range jobList.Jobs {
  1100. // // jwg.Add(1)
  1101. // // job := j
  1102. // // go func() {
  1103. // // defer jwg.Done()
  1104. // // h := http.Request{}
  1105. // // jreq := &hpcAC.JobDetailReq{
  1106. // // JobId: job.JobId,
  1107. // // }
  1108. // // detail, err := s.aCRpc.GetJobDetail(h.Context(), jreq)
  1109. // // if err != nil || detail.Data == nil {
  1110. // // return
  1111. // // }
  1112. // //
  1113. // // cpureqed.Add(int64(detail.Data.ProcNumReq))
  1114. // // dcureqed.Add(int64(detail.Data.DcuNumReq))
  1115. // // }()
  1116. // //}
  1117. // //jwg.Wait()
  1118. //
  1119. // for v := range ch {
  1120. // switch v.Type {
  1121. // case strings.ToUpper(CPU):
  1122. // t, _ := v.Total.Value.(int64)
  1123. // avail := t - cpureqed.Load()
  1124. // cpu := &collector.Usage{
  1125. // Type: strings.ToUpper(CPU),
  1126. // Name: v.Name,
  1127. // Total: v.Total,
  1128. // Available: &collector.UnitValue{Unit: CPUCORE, Value: avail},
  1129. // }
  1130. //
  1131. // qCh <- cpu
  1132. //
  1133. // case DCU:
  1134. // t, _ := v.Total.Value.(int64)
  1135. // avail := t - dcureqed.Load()
  1136. // dcu := &collector.Usage{
  1137. // Type: DCU,
  1138. // Name: v.Name,
  1139. // Total: v.Total,
  1140. // Available: &collector.UnitValue{Unit: CPUCORE, Value: avail},
  1141. // }
  1142. //
  1143. // qCh <- dcu
  1144. // }
  1145. // }
  1146. // done <- true
  1147. // }()
  1148. //
  1149. // select {
  1150. // case <-done:
  1151. // return
  1152. // case <-time.After(time.Duration(timeout) * time.Second):
  1153. // return
  1154. // }
  1155. //}()
  1156. //
  1157. //// query the user's shared-storage quota and usage
  1158. //go func() {
  1159. // defer wg.Done()
  1160. // defer close(sch)
  1161. // done := make(chan bool)
  1162. // storage := &collector.Usage{}
  1163. // go func() {
  1164. //
  1165. // diskReq := &hpcAC.ParaStorQuotaReq{}
  1166. // diskResp, err := s.aCRpc.ParaStorQuota(ctx, diskReq)
  1167. // if err != nil || diskResp.Data == nil {
  1168. // done <- true
  1169. // return
  1170. // }
  1171. //
  1172. // totalStorage := common.RoundFloat(diskResp.Data[0].Threshold, 0)
  1173. // availStorage := common.RoundFloat((diskResp.Data[0].Threshold - diskResp.Data[0].Usage), 0)
  1174. //
  1175. // storage.Type = STORAGE
  1176. // storage.Name = DISK
  1177. // storage.Total = &collector.UnitValue{
  1178. // Unit: GIGABYTE,
  1179. // Value: totalStorage,
  1180. // }
  1181. // storage.Available = &collector.UnitValue{
  1182. // Unit: GIGABYTE,
  1183. // Value: availStorage,
  1184. // }
  1185. //
  1186. // done <- true
  1187. // }()
  1188. //
  1189. // select {
  1190. // case <-done:
  1191. // sch <- storage
  1192. // case <-time.After(time.Duration(timeout) * time.Second):
  1193. // return
  1194. // }
  1195. //}()
  1196. //
  1197. //// query user info
  1198. //go func() {
  1199. // defer uwg.Done()
  1200. // done := make(chan bool)
  1201. // cres := &collector.ClusterResource{}
  1202. // go func() {
  1203. // userReq := &hpcAC.GetUserInfoReq{}
  1204. // userinfo, err := s.aCRpc.GetUserInfo(ctx, userReq)
  1205. // if err != nil || userinfo.Data == nil {
  1206. // done <- true
  1207. // return
  1208. // }
  1209. // balance, _ := strconv.ParseFloat(userinfo.Data.AccountBalance, 64)
  1210. // bal := &collector.Usage{}
  1211. // bal.Type = strings.ToUpper(BALANCE)
  1212. // bal.Total = &collector.UnitValue{
  1213. // Unit: RMB,
  1214. // Value: balance,
  1215. // }
  1216. // cres.Resource = bal
  1217. //
  1218. // done <- true
  1219. // }()
  1220. //
  1221. // select {
  1222. // case <-done:
  1223. // cresCh <- cres
  1224. // case <-time.After(time.Duration(timeout) * time.Second):
  1225. // return
  1226. // }
  1227. //}()
  1228. //
  1229. //go func() {
  1230. // uwg.Wait()
  1231. // close(cresCh)
  1232. //}()
  1233. //
  1234. //for v := range cresCh {
  1235. // resources = append(resources, v)
  1236. //}
  1237. //
  1238. //wg.Wait()
  1239. //
  1240. //cres := &collector.ClusterResource{}
  1241. //bres := make([]*collector.Usage, 0)
  1242. //if len(qCh) == 0 {
  1243. // for v := range ch {
  1244. // v.Available = v.Total
  1245. // switch v.Type {
  1246. // case DCU:
  1247. // cres.Resource = v
  1248. // case strings.ToUpper(CPU):
  1249. // bres = append(bres, v)
  1250. // }
  1251. // }
  1252. //} else {
  1253. // for v := range qCh {
  1254. // switch v.Type {
  1255. // case DCU:
  1256. // cres.Resource = v
  1257. // case strings.ToUpper(CPU):
  1258. // bres = append(bres, v)
  1259. // }
  1260. // }
  1261. //}
  1262. //
  1263. //// temporarily set memory usage
  1264. ////var dcuNum int
  1265. ////
  1266. ////mem := &collector.Usage{
  1267. //// Type: strings.ToUpper(MEMORY),
  1268. //// Name: strings.ToUpper(RAM),
  1269. //// Total: &collector.UnitValue{Unit: GIGABYTE, Value: 2 * RAM_SIZE_1G},
  1270. //// Available: &collector.UnitValue{Unit: GIGABYTE, Value: 2 * RAM_SIZE_1G},
  1271. ////}
  1272. ////vmem := &collector.Usage{
  1273. //// Type: strings.ToUpper(MEMORY),
  1274. //// Name: strings.ToUpper(VRAM),
  1275. //// Total: &collector.UnitValue{Unit: GIGABYTE, Value: 2 * RAM_SIZE_1G},
  1276. //// Available: &collector.UnitValue{Unit: GIGABYTE, Value: 2 * RAM_SIZE_1G},
  1277. ////}
  1278. ////bres = append(bres, mem)
  1279. ////bres = append(bres, vmem)
  1280. //
  1281. //for v := range sch {
  1282. // bres = append(bres, v)
  1283. //}
  1284. //
  1285. //cres.BaseResources = bres
  1286. //resources = append(resources, cres)
  1287. //resUsage.Resources = resources
  1288. //
  1289. //return resUsage, nil
  1290. }
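
For orientation, here is a minimal caller-side sketch (not part of the file above) of how the adapter might be used to submit a PyTorch training task. It assumes the hpcAC RPC client is constructed and authenticated elsewhere, that the snippet lives in the same storeLink package (so PYTORCH_TASK and the other package-level constants are in scope), that COMMA is the literal comma used in "name,value" parameter strings, and that the image id is a placeholder obtained from QueryImageList.

// Hypothetical usage sketch, same package as shuguangai.go.
func submitPytorchExample(ctx context.Context, rpc hpcAC.HpcACClient) (interface{}, error) {
	// Wrap the RPC client in the ShuguangAi adapter (platform name and
	// participant id are illustrative values).
	sg := NewShuguangAi(rpc, "shuguangAi", 1)

	// resourceId must be a key of RESOURCESPECSAI; this one is the smallest
	// spec ("CPU:1, DCU:1, RAM:2G").
	return sg.SubmitTask(ctx,
		"image-id-from-QueryImageList",     // imageId (placeholder)
		"",                                 // cmd (unused by SubmitPytorchTask)
		nil,                                // envs, "KEY,VALUE" pairs
		[]string{"epoch,1"},                // params, "name,value" pairs
		"WodTB2rJ8SobMgQ1nrtR245jxOrsovFi", // resourceId
		"",                                 // datasetsId
		"pytorch-mnist-fcn",                // algorithmId (taskType-dataset-model)
		PYTORCH_TASK)                       // aiType
}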

PCM is positioned as a software stack over cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive, autonomous, peer-to-peer manner.