You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

octopusHttp.go 23 kB

4 months ago
4 months ago
4 months ago
4 months ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856
  1. package octopusHttp
  2. import (
  3. "bytes"
  4. "context"
  5. "encoding/json"
  6. "errors"
  7. "fmt"
  8. "github.com/zeromicro/go-zero/core/logx"
  9. common2 "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/common"
  10. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/entity"
  11. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
  12. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
  13. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/executor"
  14. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference"
  15. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
  16. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
  17. omodel "gitlink.org.cn/JointCloud/pcm-octopus/http/model"
  18. "gitlink.org.cn/JointCloud/pcm-openi/common"
  19. "mime/multipart"
  20. "net/http"
  21. "strconv"
  22. "strings"
  23. "time"
  24. )
  25. const (
  26. RESOURCE_POOL = "grampus-pool"
  27. Param_Token = "token"
  28. Param_Addr = "addr"
  29. Forward_Slash = "/"
  30. COMMA = ","
  31. UNDERSCORE = "_"
  32. TASK_NAME_PREFIX = "trainJob"
  33. Python = "python "
  34. SemiColon = ";"
  35. BALANCE = "balance"
  36. RATE = "rate"
  37. PERHOUR = "per-hour"
  38. NUMBER = "number"
  39. KILOBYTE = "kb"
  40. GIGABYTE = "gb"
  41. CPUCORE = "core"
  42. STORAGE = "STORAGE"
  43. DISK = "disk"
  44. MEMORY = "memory"
  45. RAM = "ram"
  46. VRAM = "vram"
  47. RMB = "rmb"
  48. POINT = "point"
  49. RUNNINGTASK = "RUNNING_TASK"
  50. RUNNING = "RUNNING"
  51. CPU = "cpu"
  52. Gi = "Gi"
  53. AlgorithmRecordOnlyVersion = "V1"
  54. )
  55. const (
  56. NotImplementError = "not implemented"
  57. )
  58. const (
  59. MyAlgorithmListUrl = "api/v1/algorithm/myAlgorithmList"
  60. CreateAlgorithm = "api/v1/algorithm/create"
  61. ResourcespecsUrl = "api/v1/resource/specs"
  62. CreateTrainJobUrl = "api/v1/job/create"
  63. TrainJobDetail = "api/v1/job/detail"
  64. TrainJobLog = "api/v1/job/log"
  65. )
  66. // compute source
  67. var (
  68. ComputeSourceToCardType = map[string]string{
  69. "nvidia-a100": "GPU",
  70. "nvidia-a100-80g": "GPU",
  71. "mr-v100": "ILUVATAR-GPGPU",
  72. "bi-v100": "ILUVATAR-GPGPU",
  73. "MR-V50": "ILUVATAR-GPGPU",
  74. "BI-V100": "ILUVATAR-GPGPU",
  75. "BI-V150": "ILUVATAR-GPGPU",
  76. "MR-V100": "ILUVATAR-GPGPU",
  77. "cambricon.com/mlu": "MLU",
  78. "hygon.com/dcu": "DCU",
  79. "huawei.com/Ascend910": "NPU",
  80. "enflame.com/gcu": "GCU",
  81. "ILUVATAR-GPGPU": "ILUVATAR-GPGPU",
  82. "MXN260": "METAX-GPGPU",
  83. }
  84. )
  85. type OctopusHttp struct {
  86. server string
  87. host string
  88. platform string
  89. participantId int64
  90. token *Token
  91. }
  92. func NewOctopusHttp(id int64, name, server, host string, user string, pwd string) *OctopusHttp {
  93. token, err := NewToken(server, host, user, pwd)
  94. if err != nil {
  95. logx.Infof("Init OctopusHttp, id: %d, host: %s, token error: %s \n", id, host, err)
  96. }
  97. return &OctopusHttp{platform: name, participantId: id, server: server, host: host, token: token}
  98. }
  99. // executor
  100. func (o *OctopusHttp) Execute(ctx context.Context, option *option.AiOption, mode int) (interface{}, error) {
  101. switch mode {
  102. case executor.SUBMIT_MODE_JOINT_CLOUD:
  103. case executor.SUBMIT_MODE_STORAGE_SCHEDULE:
  104. // cmd
  105. if option.AlgorithmId == "" {
  106. return nil, errors.New("algorithmId is empty")
  107. }
  108. if option.Cmd != "" {
  109. option.Cmd = option.Cmd + SemiColon + Python + option.AlgorithmId
  110. } else {
  111. option.Cmd = Python + option.AlgorithmId
  112. }
  113. // algorithm
  114. param := &omodel.CreateMyAlgorithmParam{
  115. AlgorithmName: option.AlgorithmId,
  116. ModelName: option.AlgorithmId,
  117. }
  118. algorithm, err := o.createAlgorithm(ctx, param)
  119. if err != nil {
  120. return nil, err
  121. }
  122. if algorithm.Code != http.StatusOK {
  123. if algorithm.Data != nil {
  124. marshal, err := json.Marshal(algorithm.Data)
  125. if err != nil {
  126. return nil, err
  127. }
  128. errormdl := &omodel.Error{}
  129. err = json.Unmarshal(marshal, errormdl)
  130. if err != nil {
  131. return nil, err
  132. }
  133. return nil, errors.New(errormdl.Message)
  134. }
  135. } else {
  136. if algorithm.Data != nil {
  137. result := &entity.OctCreateAlgorithm{}
  138. marshal, err := json.Marshal(algorithm.Data)
  139. if err != nil {
  140. return nil, err
  141. }
  142. err = json.Unmarshal(marshal, result)
  143. if err != nil {
  144. return nil, err
  145. }
  146. if result.AlgorithmId == "" {
  147. return nil, errors.New("createAlgorithm failed")
  148. }
  149. option.AlgorithmId = result.AlgorithmId
  150. } else {
  151. return nil, errors.New("createAlgorithm failed")
  152. }
  153. }
  154. // resource
  155. option.ResourceId = "964fdee2db544928bfea74dac12a924f"
  156. // submit
  157. task, err := o.SubmitTask(ctx, option.ImageId, option.Cmd, option.Envs, option.Params, option.ResourceId, option.DatasetsId, option.AlgorithmId, option.TaskType)
  158. if err != nil {
  159. return nil, err
  160. }
  161. return task, nil
  162. }
  163. return nil, nil
  164. }
  165. func (o *OctopusHttp) Stop(ctx context.Context, id string) error {
  166. return nil
  167. }
  168. func (o *OctopusHttp) SubmitTask(ctx context.Context, imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string, aiType string) (interface{}, error) {
  169. // octopus提交任务
  170. reqUrl := o.server + CreateTrainJobUrl
  171. token, err := o.token.Get()
  172. if err != nil {
  173. return nil, err
  174. }
  175. // python参数
  176. var prms []struct {
  177. Key string `json:"key"`
  178. Value string `json:"value"`
  179. }
  180. for _, param := range params {
  181. var p struct {
  182. Key string `json:"key"`
  183. Value string `json:"value"`
  184. }
  185. s := strings.Split(param, COMMA)
  186. p.Key = s[0]
  187. p.Value = s[1]
  188. prms = append(prms, p)
  189. }
  190. //环境变量
  191. envMap := make(map[string]string)
  192. for _, env := range envs {
  193. s := strings.Split(env, COMMA)
  194. envMap[s[0]] = s[1]
  195. }
  196. param := &omodel.CreateTrainJobParam{
  197. //DataSetId: datasetsId,
  198. //DataSetVersion: VERSION,
  199. AlgorithmId: algorithmId,
  200. AlgorithmVersion: AlgorithmRecordOnlyVersion,
  201. Name: TASK_NAME_PREFIX + UNDERSCORE + utils.RandomString(10),
  202. ImageId: imageId,
  203. IsDistributed: false,
  204. ResourcePool: RESOURCE_POOL,
  205. Config: []*omodel.CreateTrainJobConf{
  206. {
  207. Command: cmd,
  208. ResourceSpecId: resourceId,
  209. MinFailedTaskCount: 1,
  210. MinSucceededTaskCount: 1,
  211. TaskNumber: 1,
  212. Parameters: prms,
  213. Envs: envMap,
  214. },
  215. },
  216. }
  217. resp := &entity.OctResp{}
  218. req := common.GetRestyRequest(common.TIMEOUT)
  219. _, err = req.
  220. SetHeader("Authorization", "Bearer "+token).
  221. SetQueryString("token=" + token).
  222. SetQueryString("addr=" + o.host).
  223. SetBody(param).
  224. SetResult(resp).
  225. Post(reqUrl)
  226. if err != nil {
  227. return nil, err
  228. }
  229. return resp, nil
  230. }
  231. func (o *OctopusHttp) createAlgorithm(ctx context.Context, param *omodel.CreateMyAlgorithmParam) (*entity.OctResp, error) {
  232. createAlgorithmUrl := o.server + CreateAlgorithm
  233. token, err := o.token.Get()
  234. if err != nil {
  235. return nil, err
  236. }
  237. resp := &entity.OctResp{}
  238. req := common.GetRestyRequest(common.TIMEOUT)
  239. _, err = req.
  240. SetHeader("Authorization", "Bearer "+token).
  241. SetBody(param).
  242. SetResult(resp).
  243. Post(createAlgorithmUrl)
  244. if err != nil {
  245. return nil, err
  246. }
  247. return resp, nil
  248. }
  249. // collector
  250. func (o *OctopusHttp) resourceSpecs(ctx context.Context) (*entity.OctResp, error) {
  251. resourcespecsUrl := o.server + ResourcespecsUrl
  252. token, err := o.token.Get()
  253. if err != nil {
  254. return nil, err
  255. }
  256. param := omodel.ResourceSpecParam{
  257. ResourcePool: RESOURCE_POOL,
  258. }
  259. b, _ := json.Marshal(param)
  260. byt := bytes.NewBuffer(b)
  261. resp := &entity.OctResp{}
  262. req := common.GetRestyRequest(common.TIMEOUT)
  263. r, _ := http.NewRequest("GET", resourcespecsUrl, byt)
  264. req.RawRequest = r
  265. req.URL = resourcespecsUrl
  266. _, err = req.
  267. SetHeader("Content-Type", "application/json").
  268. SetQueryParam(Param_Token, token).
  269. SetQueryParam(Param_Addr, o.host).
  270. SetBody(byt).
  271. SetResult(resp).
  272. Send()
  273. if err != nil {
  274. return nil, err
  275. }
  276. return resp, nil
  277. }
  278. func (o *OctopusHttp) GetResourceStats(ctx context.Context) (*collector.ResourceStats, error) {
  279. resp, err := o.resourceSpecs(ctx)
  280. if err != nil {
  281. return nil, err
  282. }
  283. if resp.Code != http.StatusOK {
  284. if resp.Data != nil {
  285. marshal, err := json.Marshal(resp.Data)
  286. if err != nil {
  287. return nil, err
  288. }
  289. errormdl := &omodel.Error{}
  290. err = json.Unmarshal(marshal, errormdl)
  291. if err != nil {
  292. return nil, err
  293. }
  294. return nil, errors.New(errormdl.Message)
  295. }
  296. } else {
  297. if resp.Data != nil {
  298. spec := &entity.OctResourceSpecs{}
  299. marshal, err := json.Marshal(resp.Data)
  300. if err != nil {
  301. return nil, err
  302. }
  303. err = json.Unmarshal(marshal, spec)
  304. if err != nil {
  305. return nil, err
  306. }
  307. }
  308. }
  309. return nil, nil
  310. }
  311. func (o *OctopusHttp) GetDatasetsSpecs(ctx context.Context) ([]*collector.DatasetsSpecs, error) {
  312. return nil, nil
  313. }
  314. func (o *OctopusHttp) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm, error) {
  315. return nil, errors.New(NotImplementError)
  316. }
  317. func (o *OctopusHttp) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) {
  318. taskDetailsUrl := o.server + TrainJobLog
  319. token, err := o.token.Get()
  320. if err != nil {
  321. return "", err
  322. }
  323. param := omodel.TrainJobLog{
  324. JobId: taskId,
  325. }
  326. b, _ := json.Marshal(param)
  327. byt := bytes.NewBuffer(b)
  328. resp := &entity.OctResp{}
  329. req := common.GetRestyRequest(common.TIMEOUT)
  330. r, _ := http.NewRequest("GET", taskDetailsUrl, byt)
  331. req.RawRequest = r
  332. req.URL = taskDetailsUrl
  333. _, err = req.
  334. SetHeader("Content-Type", "application/json").
  335. SetQueryParam(Param_Token, token).
  336. SetQueryParam(Param_Addr, o.host).
  337. SetBody(byt).
  338. SetResult(resp).
  339. Send()
  340. if err != nil {
  341. return "", errors.New("failed to invoke taskDetails")
  342. }
  343. if resp.Code != http.StatusOK {
  344. return "", errors.New("failed to invoke taskDetails")
  345. }
  346. var log string
  347. marshal, err := json.Marshal(resp.Data)
  348. if err != nil {
  349. return "", err
  350. }
  351. log = string(marshal)
  352. if strings.Contains(log, "404 Not Found") || log == "" {
  353. log = "waiting for logs..."
  354. }
  355. return log, nil
  356. }
  357. func (o *OctopusHttp) GetTrainingTask(ctx context.Context, taskId string) (*collector.Task, error) {
  358. if taskId == "" {
  359. return nil, errors.New("empty taskId")
  360. }
  361. resp, err := o.getTrainingTask(ctx, taskId)
  362. if err != nil {
  363. return nil, err
  364. }
  365. if resp.Code != http.StatusOK {
  366. if resp.Data != nil {
  367. marshal, err := json.Marshal(resp.Data)
  368. if err != nil {
  369. return nil, err
  370. }
  371. errormdl := &omodel.Error{}
  372. err = json.Unmarshal(marshal, errormdl)
  373. if err != nil {
  374. return nil, err
  375. }
  376. return nil, errors.New(errormdl.Message)
  377. }
  378. } else {
  379. if resp.Data != nil {
  380. job := &entity.OctTrainJob{}
  381. marshal, err := json.Marshal(resp.Data)
  382. if err != nil {
  383. return nil, err
  384. }
  385. err = json.Unmarshal(marshal, job)
  386. if err != nil {
  387. return nil, err
  388. }
  389. var task collector.Task
  390. task.Id = job.TrainJob.Id
  391. if job.TrainJob.StartedAt != 0 {
  392. task.Start = time.Unix(int64(job.TrainJob.StartedAt), 0).Format(constants.Layout)
  393. }
  394. if job.TrainJob.CompletedAt != 0 {
  395. task.End = time.Unix(int64(job.TrainJob.CompletedAt), 0).Format(constants.Layout)
  396. }
  397. switch job.TrainJob.Status {
  398. case "succeeded":
  399. task.Status = constants.Completed
  400. case "failed":
  401. task.Status = constants.Failed
  402. case "running":
  403. task.Status = constants.Running
  404. case "stopped":
  405. task.Status = constants.Stopped
  406. case "pending":
  407. task.Status = constants.Pending
  408. default:
  409. task.Status = "undefined"
  410. }
  411. return &task, nil
  412. }
  413. }
  414. return nil, errors.New("failed to get trainjob")
  415. }
  416. func (o *OctopusHttp) getTrainingTask(ctx context.Context, taskId string) (*entity.OctResp, error) {
  417. taskDetailsUrl := o.server + TrainJobDetail
  418. token, err := o.token.Get()
  419. if err != nil {
  420. return nil, err
  421. }
  422. param := omodel.TrainJobDetailParam{
  423. JobId: taskId,
  424. }
  425. b, _ := json.Marshal(param)
  426. byt := bytes.NewBuffer(b)
  427. resp := &entity.OctResp{}
  428. req := common.GetRestyRequest(common.TIMEOUT)
  429. r, _ := http.NewRequest("GET", taskDetailsUrl, byt)
  430. req.RawRequest = r
  431. req.URL = taskDetailsUrl
  432. _, err = req.
  433. SetHeader("Content-Type", "application/json").
  434. SetQueryParam(Param_Token, token).
  435. SetQueryParam(Param_Addr, o.host).
  436. SetBody(byt).
  437. SetResult(resp).
  438. Send()
  439. if err != nil {
  440. return nil, errors.New("failed to invoke taskDetails")
  441. }
  442. return resp, nil
  443. }
  444. func (o *OctopusHttp) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) {
  445. return "", errors.New(NotImplementError)
  446. }
  447. func (o *OctopusHttp) UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error {
  448. return nil
  449. }
  450. func (o *OctopusHttp) GetComputeCards(ctx context.Context) ([]string, error) {
  451. return nil, errors.New(NotImplementError)
  452. }
  453. func (o *OctopusHttp) GetUserBalance(ctx context.Context) (float64, error) {
  454. return 0, errors.New(NotImplementError)
  455. }
  456. func (o *OctopusHttp) GetResourceSpecs(ctx context.Context, resrcType string) (*collector.ResourceSpec, error) {
  457. resp, err := o.resourceSpecs(ctx)
  458. if err != nil {
  459. return nil, err
  460. }
  461. res := &collector.ResourceSpec{
  462. ClusterId: strconv.FormatInt(o.participantId, 10),
  463. Tag: resrcType,
  464. }
  465. if resp.Code != http.StatusOK {
  466. if resp.Data != nil {
  467. marshal, err := json.Marshal(resp.Data)
  468. if err != nil {
  469. return nil, err
  470. }
  471. errormdl := &omodel.Error{}
  472. err = json.Unmarshal(marshal, errormdl)
  473. if err != nil {
  474. return nil, err
  475. }
  476. return nil, errors.New(errormdl.Message)
  477. }
  478. } else {
  479. if resp.Data != nil {
  480. specs := &entity.OctResourceSpecs{}
  481. marshal, err := json.Marshal(resp.Data)
  482. if err != nil {
  483. return nil, err
  484. }
  485. err = json.Unmarshal(marshal, specs)
  486. if err != nil {
  487. return nil, err
  488. }
  489. clusterResources, err := genSpecs(specs, resrcType)
  490. if err != nil {
  491. return nil, err
  492. }
  493. res.Resources = clusterResources
  494. }
  495. }
  496. return res, nil
  497. }
  498. func genSpecs(specs *entity.OctResourceSpecs, resrcType string) ([]interface{}, error) {
  499. res := make([]interface{}, 0)
  500. if resrcType == "Inference" {
  501. return res, nil
  502. } else if resrcType == "Train" {
  503. if specs.MapResourceSpecIdList.Train.ResourceSpecs == nil {
  504. return res, nil
  505. } else {
  506. for _, s := range specs.MapResourceSpecIdList.Train.ResourceSpecs {
  507. spec := &omodel.Spec{}
  508. marshal, err := json.Marshal(s)
  509. if err != nil {
  510. return nil, err
  511. }
  512. err = json.Unmarshal(marshal, spec)
  513. if err != nil {
  514. return nil, err
  515. }
  516. resType, err := chooseResourceType(spec)
  517. if err != nil {
  518. return nil, err
  519. }
  520. if resType == nil {
  521. continue
  522. }
  523. res = append(res, resType)
  524. }
  525. }
  526. }
  527. return res, nil
  528. }
  529. func chooseResourceType(spec *omodel.Spec) (*collector.ClusterResource, error) {
  530. if spec.ResourceQuantity.NvidiaA100 != "" {
  531. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "NvidiaA100")
  532. if err != nil {
  533. return nil, err
  534. }
  535. cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec)
  536. if err != nil {
  537. return nil, err
  538. }
  539. return cres, nil
  540. } else if spec.ResourceQuantity.NvidiaA10080G != "" {
  541. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "NvidiaA10080G")
  542. if err != nil {
  543. return nil, err
  544. }
  545. cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA10080G, spec)
  546. if err != nil {
  547. return nil, err
  548. }
  549. return cres, nil
  550. } else if spec.ResourceQuantity.MrV100 != "" {
  551. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "MrV100")
  552. if err != nil {
  553. return nil, err
  554. }
  555. cres, err := genClusterResources(tag, spec.ResourceQuantity.MrV100, spec)
  556. if err != nil {
  557. return nil, err
  558. }
  559. return cres, nil
  560. } else if spec.ResourceQuantity.BiV100 != "" {
  561. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "BiV100")
  562. if err != nil {
  563. return nil, err
  564. }
  565. cres, err := genClusterResources(tag, spec.ResourceQuantity.BiV100, spec)
  566. if err != nil {
  567. return nil, err
  568. }
  569. return cres, nil
  570. } else if spec.ResourceQuantity.MRV50 != "" {
  571. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "MRV50")
  572. if err != nil {
  573. return nil, err
  574. }
  575. cres, err := genClusterResources(tag, spec.ResourceQuantity.MRV50, spec)
  576. if err != nil {
  577. return nil, err
  578. }
  579. return cres, nil
  580. } else if spec.ResourceQuantity.BIV100 != "" {
  581. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "NvidiaA100")
  582. if err != nil {
  583. return nil, err
  584. }
  585. cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec)
  586. if err != nil {
  587. return nil, err
  588. }
  589. return cres, nil
  590. } else if spec.ResourceQuantity.BIV150 != "" {
  591. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "BIV150")
  592. if err != nil {
  593. return nil, err
  594. }
  595. cres, err := genClusterResources(tag, spec.ResourceQuantity.BIV150, spec)
  596. if err != nil {
  597. return nil, err
  598. }
  599. return cres, nil
  600. } else if spec.ResourceQuantity.MRV100 != "" {
  601. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "MRV100")
  602. if err != nil {
  603. return nil, err
  604. }
  605. cres, err := genClusterResources(tag, spec.ResourceQuantity.MRV100, spec)
  606. if err != nil {
  607. return nil, err
  608. }
  609. return cres, nil
  610. } else if spec.ResourceQuantity.CambriconComMlu != "" {
  611. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "CambriconComMlu")
  612. if err != nil {
  613. return nil, err
  614. }
  615. cres, err := genClusterResources(tag, spec.ResourceQuantity.CambriconComMlu, spec)
  616. if err != nil {
  617. return nil, err
  618. }
  619. return cres, nil
  620. } else if spec.ResourceQuantity.HygonComDcu != "" {
  621. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "HygonComDcu")
  622. if err != nil {
  623. return nil, err
  624. }
  625. cres, err := genClusterResources(tag, spec.ResourceQuantity.HygonComDcu, spec)
  626. if err != nil {
  627. return nil, err
  628. }
  629. return cres, nil
  630. } else if spec.ResourceQuantity.HuaweiComAscend910 != "" {
  631. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "HuaweiComAscend910")
  632. if err != nil {
  633. return nil, err
  634. }
  635. cres, err := genClusterResources(tag, spec.ResourceQuantity.HuaweiComAscend910, spec)
  636. if err != nil {
  637. return nil, err
  638. }
  639. return cres, nil
  640. } else if spec.ResourceQuantity.EnflameComGcu != "" {
  641. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "EnflameComGcu")
  642. if err != nil {
  643. return nil, err
  644. }
  645. cres, err := genClusterResources(tag, spec.ResourceQuantity.EnflameComGcu, spec)
  646. if err != nil {
  647. return nil, err
  648. }
  649. return cres, nil
  650. } else if spec.ResourceQuantity.MXN260 != "" {
  651. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "MXN260")
  652. if err != nil {
  653. return nil, err
  654. }
  655. cres, err := genClusterResources(tag, spec.ResourceQuantity.MXN260, spec)
  656. if err != nil {
  657. return nil, err
  658. }
  659. return cres, nil
  660. } else if spec.ResourceQuantity.NvidiaV100 != "" {
  661. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "NvidiaV100")
  662. if err != nil {
  663. return nil, err
  664. }
  665. cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaV100, spec)
  666. if err != nil {
  667. return nil, err
  668. }
  669. return cres, nil
  670. } else if spec.ResourceQuantity.MetaxTechComGpu != "" {
  671. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "MetaxTechComGpu")
  672. if err != nil {
  673. return nil, err
  674. }
  675. cres, err := genClusterResources(tag, spec.ResourceQuantity.MetaxTechComGpu, spec)
  676. if err != nil {
  677. return nil, err
  678. }
  679. return cres, nil
  680. }
  681. return nil, nil
  682. }
  683. func genClusterResources(cType string, cNum string, s *omodel.Spec) (*collector.ClusterResource, error) {
  684. cres := &collector.ClusterResource{}
  685. bres := make([]*collector.Usage, 0)
  686. var cardNum int64
  687. var cpuCore int64
  688. var memGi int64
  689. cardNum, err := strconv.ParseInt(cNum, 10, 64)
  690. if err != nil {
  691. cardNum = 0
  692. }
  693. cpuCore, err = strconv.ParseInt(s.ResourceQuantity.Cpu, 10, 64)
  694. if err != nil {
  695. cpuCore = 0
  696. }
  697. if s.ResourceQuantity.Memory != "" {
  698. gi := strings.Split(s.ResourceQuantity.Memory, Gi)
  699. if len(gi) != 2 {
  700. return nil, fmt.Errorf("s.ResourceQuantity.Memory convert error: %s", s.ResourceQuantity.Memory)
  701. }
  702. mGi, err := strconv.ParseInt(gi[0], 10, 64)
  703. if err != nil {
  704. memGi = 0
  705. } else {
  706. memGi = mGi
  707. }
  708. } else {
  709. memGi = 0
  710. }
  711. card := &collector.Usage{
  712. Type: ComputeSourceToCardType[cType],
  713. Name: strings.ToUpper(cType),
  714. Total: &collector.UnitValue{Unit: NUMBER, Value: cardNum},
  715. Available: &collector.UnitValue{Unit: NUMBER, Value: cardNum},
  716. }
  717. cpu := &collector.Usage{
  718. Type: strings.ToUpper(CPU),
  719. Name: strings.ToUpper(CPU),
  720. Total: &collector.UnitValue{Unit: CPUCORE, Value: cpuCore},
  721. Available: &collector.UnitValue{Unit: CPUCORE, Value: cpuCore},
  722. }
  723. mem := &collector.Usage{
  724. Type: strings.ToUpper(MEMORY),
  725. Name: strings.ToUpper(RAM),
  726. Total: &collector.UnitValue{Unit: GIGABYTE, Value: memGi},
  727. Available: &collector.UnitValue{Unit: GIGABYTE, Value: memGi},
  728. }
  729. bres = append(bres, cpu)
  730. bres = append(bres, mem)
  731. cres.Resource = card
  732. cres.BaseResources = bres
  733. return cres, nil
  734. }
  735. // inference
  736. func (o *OctopusHttp) GetClusterInferUrl(ctx context.Context, option *option.InferOption) (*inference.ClusterInferUrl, error) {
  737. return nil, errors.New(NotImplementError)
  738. }
  739. func (o *OctopusHttp) GetInferDeployInstanceList(ctx context.Context) ([]*inference.DeployInstance, error) {
  740. return nil, errors.New(NotImplementError)
  741. }
  742. func (o *OctopusHttp) StartInferDeployInstance(ctx context.Context, id string) bool {
  743. return false
  744. }
  745. func (o *OctopusHttp) StopInferDeployInstance(ctx context.Context, id string) bool {
  746. return false
  747. }
  748. func (o *OctopusHttp) GetInferDeployInstance(ctx context.Context, id string) (*inference.DeployInstance, error) {
  749. return nil, errors.New(NotImplementError)
  750. }
  751. func (o *OctopusHttp) CreateInferDeployInstance(ctx context.Context, option *option.InferOption) (string, error) {
  752. return "", errors.New(NotImplementError)
  753. }
  754. func (o *OctopusHttp) CheckModelExistence(ctx context.Context, modelName string, modelType string) bool {
  755. return false
  756. }
  757. func (o *OctopusHttp) GetImageInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) {
  758. return "", errors.New(NotImplementError)
  759. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive, autonomous, peer-to-peer manner.