You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

octopusHttp.go 23 kB

4 months ago
4 months ago
4 months ago
4 months ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860
  1. package octopusHttp
  2. import (
  3. "bytes"
  4. "context"
  5. "encoding/json"
  6. "errors"
  7. "fmt"
  8. "github.com/zeromicro/go-zero/core/logx"
  9. common2 "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/common"
  10. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/entity"
  11. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
  12. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
  13. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/executor"
  14. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference"
  15. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
  16. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
  17. omodel "gitlink.org.cn/JointCloud/pcm-octopus/http/model"
  18. "gitlink.org.cn/JointCloud/pcm-openi/common"
  19. "mime/multipart"
  20. "net/http"
  21. "strconv"
  22. "strings"
  23. "time"
  24. )
  25. const (
  26. RESOURCE_POOL = "grampus-pool"
  27. Param_Token = "token"
  28. Param_Addr = "addr"
  29. Forward_Slash = "/"
  30. COMMA = ","
  31. UNDERSCORE = "_"
  32. TASK_NAME_PREFIX = "trainJob"
  33. Python = "python "
  34. SemiColon = ";"
  35. BALANCE = "balance"
  36. RATE = "rate"
  37. PERHOUR = "per-hour"
  38. NUMBER = "number"
  39. KILOBYTE = "kb"
  40. GIGABYTE = "gb"
  41. CPUCORE = "core"
  42. STORAGE = "STORAGE"
  43. DISK = "disk"
  44. MEMORY = "memory"
  45. RAM = "ram"
  46. VRAM = "vram"
  47. RMB = "rmb"
  48. POINT = "point"
  49. RUNNINGTASK = "RUNNING_TASK"
  50. RUNNING = "RUNNING"
  51. CPU = "cpu"
  52. Gi = "Gi"
  53. AlgorithmRecordOnlyVersion = "V1"
  54. )
  55. const (
  56. NotImplementError = "not implemented"
  57. )
  58. const (
  59. MyAlgorithmListUrl = "api/v1/algorithm/myAlgorithmList"
  60. CreateAlgorithm = "api/v1/algorithm/create"
  61. ResourcespecsUrl = "api/v1/resource/specs"
  62. CreateTrainJobUrl = "api/v1/job/create"
  63. TrainJobDetail = "api/v1/job/detail"
  64. TrainJobLog = "api/v1/job/log"
  65. )
  66. // compute source
  67. var (
  68. ComputeSourceToCardType = map[string]string{
  69. "nvidia-a100": "GPU",
  70. "nvidia-a100-80g": "GPU",
  71. "mr-v100": "ILUVATAR-GPGPU",
  72. "bi-v100": "ILUVATAR-GPGPU",
  73. "MR-V50": "ILUVATAR-GPGPU",
  74. "BI-V100": "ILUVATAR-GPGPU",
  75. "BI-V150": "ILUVATAR-GPGPU",
  76. "MR-V100": "ILUVATAR-GPGPU",
  77. "cambricon.com/mlu": "MLU",
  78. "hygon.com/dcu": "DCU",
  79. "huawei.com/Ascend910": "NPU",
  80. "enflame.com/gcu": "GCU",
  81. "ILUVATAR-GPGPU": "ILUVATAR-GPGPU",
  82. "MXN260": "METAX-GPGPU",
  83. }
  84. )
  85. type OctopusHttp struct {
  86. server string
  87. host string
  88. platform string
  89. participantId int64
  90. token *Token
  91. }
  92. func NewOctopusHttp(id int64, name, server, host string, user string, pwd string) *OctopusHttp {
  93. token, err := NewToken(server, host, user, pwd)
  94. if err != nil {
  95. logx.Infof("Init OctopusHttp, id: %d, host: %s, token error: %s \n", id, host, err)
  96. }
  97. return &OctopusHttp{platform: name, participantId: id, server: server, host: host, token: token}
  98. }
  99. // executor
  100. func (o *OctopusHttp) Execute(ctx context.Context, option *option.AiOption, mode int) (interface{}, error) {
  101. switch mode {
  102. case executor.SUBMIT_MODE_JOINT_CLOUD:
  103. case executor.SUBMIT_MODE_STORAGE_SCHEDULE:
  104. // cmd
  105. if option.AlgorithmId == "" {
  106. return nil, errors.New("algorithmId is empty")
  107. }
  108. if option.Cmd != "" {
  109. option.Cmd = option.Cmd + SemiColon + Python + option.AlgorithmId
  110. } else {
  111. option.Cmd = Python + option.AlgorithmId
  112. }
  113. // algorithm
  114. //param := &omodel.CreateMyAlgorithmParam{
  115. // AlgorithmName: option.AlgorithmId,
  116. // ModelName: option.AlgorithmId,
  117. //}
  118. //algorithm, err := o.createAlgorithm(ctx, param)
  119. //if err != nil {
  120. // return nil, err
  121. //}
  122. //if algorithm.Code != http.StatusOK {
  123. // if algorithm.Data != nil {
  124. // marshal, err := json.Marshal(algorithm.Data)
  125. // if err != nil {
  126. // return nil, err
  127. // }
  128. //
  129. // errormdl := &omodel.Error{}
  130. // err = json.Unmarshal(marshal, errormdl)
  131. // if err != nil {
  132. // return nil, err
  133. // }
  134. // return nil, errors.New(errormdl.Message)
  135. // } else {
  136. // return nil, errors.New(algorithm.Msg)
  137. // }
  138. //} else {
  139. // if algorithm.Data != nil {
  140. // result := &entity.OctCreateAlgorithm{}
  141. // marshal, err := json.Marshal(algorithm.Data)
  142. // if err != nil {
  143. // return nil, err
  144. // }
  145. // err = json.Unmarshal(marshal, result)
  146. // if err != nil {
  147. // return nil, err
  148. // }
  149. // if result.AlgorithmId == "" {
  150. // return nil, errors.New("createAlgorithm failed")
  151. // }
  152. // option.AlgorithmId = result.AlgorithmId
  153. // } else {
  154. // return nil, errors.New("createAlgorithm failed")
  155. // }
  156. //}
  157. // resource
  158. option.ResourceId = "964fdee2db544928bfea74dac12a924f"
  159. // submit
  160. task, err := o.SubmitTask(ctx, option.ImageId, option.Cmd, option.Envs, option.Params, option.ResourceId, option.DatasetsId, option.AlgorithmId, option.TaskType)
  161. if err != nil {
  162. return nil, err
  163. }
  164. return task, nil
  165. }
  166. return nil, nil
  167. }
  168. func (o *OctopusHttp) Stop(ctx context.Context, id string) error {
  169. return nil
  170. }
  171. func (o *OctopusHttp) SubmitTask(ctx context.Context, imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string, aiType string) (interface{}, error) {
  172. // octopus提交任务
  173. reqUrl := o.server + CreateTrainJobUrl
  174. token, err := o.token.Get()
  175. if err != nil {
  176. return nil, err
  177. }
  178. // python参数
  179. var prms []struct {
  180. Key string `json:"key"`
  181. Value string `json:"value"`
  182. }
  183. for _, param := range params {
  184. var p struct {
  185. Key string `json:"key"`
  186. Value string `json:"value"`
  187. }
  188. s := strings.Split(param, COMMA)
  189. p.Key = s[0]
  190. p.Value = s[1]
  191. prms = append(prms, p)
  192. }
  193. //环境变量
  194. envMap := make(map[string]string)
  195. for _, env := range envs {
  196. s := strings.Split(env, COMMA)
  197. envMap[s[0]] = s[1]
  198. }
  199. param := &omodel.CreateTrainJobParam{
  200. //DataSetId: datasetsId,
  201. //DataSetVersion: VERSION,
  202. //AlgorithmId: algorithmId,
  203. //AlgorithmVersion: AlgorithmRecordOnlyVersion,
  204. Name: TASK_NAME_PREFIX + UNDERSCORE + utils.RandomString(10),
  205. ImageId: imageId,
  206. IsDistributed: false,
  207. ResourcePool: RESOURCE_POOL,
  208. Config: []*omodel.CreateTrainJobConf{
  209. {
  210. Command: cmd,
  211. ResourceSpecId: resourceId,
  212. MinFailedTaskCount: 1,
  213. MinSucceededTaskCount: 1,
  214. TaskNumber: 1,
  215. //Parameters: prms,
  216. Envs: envMap,
  217. },
  218. },
  219. }
  220. resp := &entity.OctResp{}
  221. req := common.GetRestyRequest(common.TIMEOUT)
  222. _, err = req.
  223. SetHeader("Authorization", "Bearer "+token).
  224. SetQueryString("token=" + token).
  225. SetQueryString("addr=" + o.host).
  226. SetBody(param).
  227. SetResult(resp).
  228. Post(reqUrl)
  229. if err != nil {
  230. return nil, err
  231. }
  232. return resp, nil
  233. }
  234. func (o *OctopusHttp) createAlgorithm(ctx context.Context, param *omodel.CreateMyAlgorithmParam) (*entity.OctResp, error) {
  235. createAlgorithmUrl := o.server + CreateAlgorithm
  236. token, err := o.token.Get()
  237. if err != nil {
  238. return nil, err
  239. }
  240. resp := &entity.OctResp{}
  241. req := common.GetRestyRequest(common.TIMEOUT)
  242. _, err = req.
  243. SetHeader("Authorization", "Bearer "+token).
  244. SetQueryString("token=" + token).
  245. SetQueryString("addr=" + o.host).
  246. SetBody(param).
  247. SetResult(resp).
  248. Post(createAlgorithmUrl)
  249. if err != nil {
  250. return nil, err
  251. }
  252. return resp, nil
  253. }
  254. // collector
  255. func (o *OctopusHttp) resourceSpecs(ctx context.Context) (*entity.OctResp, error) {
  256. resourcespecsUrl := o.server + ResourcespecsUrl
  257. token, err := o.token.Get()
  258. if err != nil {
  259. return nil, err
  260. }
  261. param := omodel.ResourceSpecParam{
  262. ResourcePool: RESOURCE_POOL,
  263. }
  264. b, _ := json.Marshal(param)
  265. byt := bytes.NewBuffer(b)
  266. resp := &entity.OctResp{}
  267. req := common.GetRestyRequest(common.TIMEOUT)
  268. r, _ := http.NewRequest("GET", resourcespecsUrl, byt)
  269. req.RawRequest = r
  270. req.URL = resourcespecsUrl
  271. _, err = req.
  272. SetHeader("Content-Type", "application/json").
  273. SetQueryParam(Param_Token, token).
  274. SetQueryParam(Param_Addr, o.host).
  275. SetBody(byt).
  276. SetResult(resp).
  277. Send()
  278. if err != nil {
  279. return nil, err
  280. }
  281. return resp, nil
  282. }
  283. func (o *OctopusHttp) GetResourceStats(ctx context.Context) (*collector.ResourceStats, error) {
  284. resp, err := o.resourceSpecs(ctx)
  285. if err != nil {
  286. return nil, err
  287. }
  288. if resp.Code != http.StatusOK {
  289. if resp.Data != nil {
  290. marshal, err := json.Marshal(resp.Data)
  291. if err != nil {
  292. return nil, err
  293. }
  294. errormdl := &omodel.Error{}
  295. err = json.Unmarshal(marshal, errormdl)
  296. if err != nil {
  297. return nil, err
  298. }
  299. return nil, errors.New(errormdl.Message)
  300. }
  301. } else {
  302. if resp.Data != nil {
  303. spec := &entity.OctResourceSpecs{}
  304. marshal, err := json.Marshal(resp.Data)
  305. if err != nil {
  306. return nil, err
  307. }
  308. err = json.Unmarshal(marshal, spec)
  309. if err != nil {
  310. return nil, err
  311. }
  312. }
  313. }
  314. return nil, nil
  315. }
  316. func (o *OctopusHttp) GetDatasetsSpecs(ctx context.Context) ([]*collector.DatasetsSpecs, error) {
  317. return nil, nil
  318. }
  319. func (o *OctopusHttp) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm, error) {
  320. return nil, errors.New(NotImplementError)
  321. }
  322. func (o *OctopusHttp) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) {
  323. taskDetailsUrl := o.server + TrainJobLog
  324. token, err := o.token.Get()
  325. if err != nil {
  326. return "", err
  327. }
  328. param := omodel.TrainJobLog{
  329. JobId: taskId,
  330. }
  331. b, _ := json.Marshal(param)
  332. byt := bytes.NewBuffer(b)
  333. resp := &entity.OctResp{}
  334. req := common.GetRestyRequest(common.TIMEOUT)
  335. r, _ := http.NewRequest("GET", taskDetailsUrl, byt)
  336. req.RawRequest = r
  337. req.URL = taskDetailsUrl
  338. _, err = req.
  339. SetHeader("Content-Type", "application/json").
  340. SetQueryParam(Param_Token, token).
  341. SetQueryParam(Param_Addr, o.host).
  342. SetBody(byt).
  343. SetResult(resp).
  344. Send()
  345. if err != nil {
  346. return "", errors.New("failed to invoke taskDetails")
  347. }
  348. if resp.Code != http.StatusOK {
  349. return "", errors.New("failed to invoke taskDetails")
  350. }
  351. var log string
  352. marshal, err := json.Marshal(resp.Data)
  353. if err != nil {
  354. return "", err
  355. }
  356. log = string(marshal)
  357. if strings.Contains(log, "404 Not Found") || log == "" {
  358. log = "waiting for logs..."
  359. }
  360. return log, nil
  361. }
  362. func (o *OctopusHttp) GetTrainingTask(ctx context.Context, taskId string) (*collector.Task, error) {
  363. if taskId == "" {
  364. return nil, errors.New("empty taskId")
  365. }
  366. resp, err := o.getTrainingTask(ctx, taskId)
  367. if err != nil {
  368. return nil, err
  369. }
  370. if resp.Code != http.StatusOK {
  371. if resp.Data != nil {
  372. marshal, err := json.Marshal(resp.Data)
  373. if err != nil {
  374. return nil, err
  375. }
  376. errormdl := &omodel.Error{}
  377. err = json.Unmarshal(marshal, errormdl)
  378. if err != nil {
  379. return nil, err
  380. }
  381. return nil, errors.New(errormdl.Message)
  382. }
  383. } else {
  384. if resp.Data != nil {
  385. job := &entity.OctTrainJob{}
  386. marshal, err := json.Marshal(resp.Data)
  387. if err != nil {
  388. return nil, err
  389. }
  390. err = json.Unmarshal(marshal, job)
  391. if err != nil {
  392. return nil, err
  393. }
  394. var task collector.Task
  395. task.Id = job.TrainJob.Id
  396. if job.TrainJob.StartedAt != 0 {
  397. task.Start = time.Unix(int64(job.TrainJob.StartedAt), 0).Format(constants.Layout)
  398. }
  399. if job.TrainJob.CompletedAt != 0 {
  400. task.End = time.Unix(int64(job.TrainJob.CompletedAt), 0).Format(constants.Layout)
  401. }
  402. switch job.TrainJob.Status {
  403. case "succeeded":
  404. task.Status = constants.Completed
  405. case "failed":
  406. task.Status = constants.Failed
  407. case "running":
  408. task.Status = constants.Running
  409. case "stopped":
  410. task.Status = constants.Stopped
  411. case "pending":
  412. task.Status = constants.Pending
  413. default:
  414. task.Status = "undefined"
  415. }
  416. return &task, nil
  417. }
  418. }
  419. return nil, errors.New("failed to get trainjob")
  420. }
  421. func (o *OctopusHttp) getTrainingTask(ctx context.Context, taskId string) (*entity.OctResp, error) {
  422. taskDetailsUrl := o.server + TrainJobDetail
  423. token, err := o.token.Get()
  424. if err != nil {
  425. return nil, err
  426. }
  427. param := omodel.TrainJobDetailParam{
  428. JobId: taskId,
  429. }
  430. b, _ := json.Marshal(param)
  431. byt := bytes.NewBuffer(b)
  432. resp := &entity.OctResp{}
  433. req := common.GetRestyRequest(common.TIMEOUT)
  434. r, _ := http.NewRequest("GET", taskDetailsUrl, byt)
  435. req.RawRequest = r
  436. req.URL = taskDetailsUrl
  437. _, err = req.
  438. SetHeader("Content-Type", "application/json").
  439. SetQueryParam(Param_Token, token).
  440. SetQueryParam(Param_Addr, o.host).
  441. SetBody(byt).
  442. SetResult(resp).
  443. Send()
  444. if err != nil {
  445. return nil, errors.New("failed to invoke taskDetails")
  446. }
  447. return resp, nil
  448. }
  449. func (o *OctopusHttp) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) {
  450. return "", errors.New(NotImplementError)
  451. }
  452. func (o *OctopusHttp) UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error {
  453. return nil
  454. }
  455. func (o *OctopusHttp) GetComputeCards(ctx context.Context) ([]string, error) {
  456. return nil, errors.New(NotImplementError)
  457. }
  458. func (o *OctopusHttp) GetUserBalance(ctx context.Context) (float64, error) {
  459. return 0, errors.New(NotImplementError)
  460. }
  461. func (o *OctopusHttp) GetResourceSpecs(ctx context.Context, resrcType string) (*collector.ResourceSpec, error) {
  462. resp, err := o.resourceSpecs(ctx)
  463. if err != nil {
  464. return nil, err
  465. }
  466. res := &collector.ResourceSpec{
  467. ClusterId: strconv.FormatInt(o.participantId, 10),
  468. Tag: resrcType,
  469. }
  470. if resp.Code != http.StatusOK {
  471. if resp.Data != nil {
  472. marshal, err := json.Marshal(resp.Data)
  473. if err != nil {
  474. return nil, err
  475. }
  476. errormdl := &omodel.Error{}
  477. err = json.Unmarshal(marshal, errormdl)
  478. if err != nil {
  479. return nil, err
  480. }
  481. return nil, errors.New(errormdl.Message)
  482. }
  483. } else {
  484. if resp.Data != nil {
  485. specs := &entity.OctResourceSpecs{}
  486. marshal, err := json.Marshal(resp.Data)
  487. if err != nil {
  488. return nil, err
  489. }
  490. err = json.Unmarshal(marshal, specs)
  491. if err != nil {
  492. return nil, err
  493. }
  494. clusterResources, err := genSpecs(specs, resrcType)
  495. if err != nil {
  496. return nil, err
  497. }
  498. res.Resources = clusterResources
  499. }
  500. }
  501. return res, nil
  502. }
  503. func genSpecs(specs *entity.OctResourceSpecs, resrcType string) ([]interface{}, error) {
  504. res := make([]interface{}, 0)
  505. if resrcType == "Inference" {
  506. return res, nil
  507. } else if resrcType == "Train" {
  508. if specs.MapResourceSpecIdList.Train.ResourceSpecs == nil {
  509. return res, nil
  510. } else {
  511. for _, s := range specs.MapResourceSpecIdList.Train.ResourceSpecs {
  512. spec := &omodel.Spec{}
  513. marshal, err := json.Marshal(s)
  514. if err != nil {
  515. return nil, err
  516. }
  517. err = json.Unmarshal(marshal, spec)
  518. if err != nil {
  519. return nil, err
  520. }
  521. resType, err := chooseResourceType(spec)
  522. if err != nil {
  523. return nil, err
  524. }
  525. if resType == nil {
  526. continue
  527. }
  528. res = append(res, resType)
  529. }
  530. }
  531. }
  532. return res, nil
  533. }
  534. func chooseResourceType(spec *omodel.Spec) (*collector.ClusterResource, error) {
  535. if spec.ResourceQuantity.NvidiaA100 != "" {
  536. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "NvidiaA100")
  537. if err != nil {
  538. return nil, err
  539. }
  540. cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec)
  541. if err != nil {
  542. return nil, err
  543. }
  544. return cres, nil
  545. } else if spec.ResourceQuantity.NvidiaA10080G != "" {
  546. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "NvidiaA10080G")
  547. if err != nil {
  548. return nil, err
  549. }
  550. cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA10080G, spec)
  551. if err != nil {
  552. return nil, err
  553. }
  554. return cres, nil
  555. } else if spec.ResourceQuantity.MrV100 != "" {
  556. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "MrV100")
  557. if err != nil {
  558. return nil, err
  559. }
  560. cres, err := genClusterResources(tag, spec.ResourceQuantity.MrV100, spec)
  561. if err != nil {
  562. return nil, err
  563. }
  564. return cres, nil
  565. } else if spec.ResourceQuantity.BiV100 != "" {
  566. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "BiV100")
  567. if err != nil {
  568. return nil, err
  569. }
  570. cres, err := genClusterResources(tag, spec.ResourceQuantity.BiV100, spec)
  571. if err != nil {
  572. return nil, err
  573. }
  574. return cres, nil
  575. } else if spec.ResourceQuantity.MRV50 != "" {
  576. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "MRV50")
  577. if err != nil {
  578. return nil, err
  579. }
  580. cres, err := genClusterResources(tag, spec.ResourceQuantity.MRV50, spec)
  581. if err != nil {
  582. return nil, err
  583. }
  584. return cres, nil
  585. } else if spec.ResourceQuantity.BIV100 != "" {
  586. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "NvidiaA100")
  587. if err != nil {
  588. return nil, err
  589. }
  590. cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec)
  591. if err != nil {
  592. return nil, err
  593. }
  594. return cres, nil
  595. } else if spec.ResourceQuantity.BIV150 != "" {
  596. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "BIV150")
  597. if err != nil {
  598. return nil, err
  599. }
  600. cres, err := genClusterResources(tag, spec.ResourceQuantity.BIV150, spec)
  601. if err != nil {
  602. return nil, err
  603. }
  604. return cres, nil
  605. } else if spec.ResourceQuantity.MRV100 != "" {
  606. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "MRV100")
  607. if err != nil {
  608. return nil, err
  609. }
  610. cres, err := genClusterResources(tag, spec.ResourceQuantity.MRV100, spec)
  611. if err != nil {
  612. return nil, err
  613. }
  614. return cres, nil
  615. } else if spec.ResourceQuantity.CambriconComMlu != "" {
  616. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "CambriconComMlu")
  617. if err != nil {
  618. return nil, err
  619. }
  620. cres, err := genClusterResources(tag, spec.ResourceQuantity.CambriconComMlu, spec)
  621. if err != nil {
  622. return nil, err
  623. }
  624. return cres, nil
  625. } else if spec.ResourceQuantity.HygonComDcu != "" {
  626. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "HygonComDcu")
  627. if err != nil {
  628. return nil, err
  629. }
  630. cres, err := genClusterResources(tag, spec.ResourceQuantity.HygonComDcu, spec)
  631. if err != nil {
  632. return nil, err
  633. }
  634. return cres, nil
  635. } else if spec.ResourceQuantity.HuaweiComAscend910 != "" {
  636. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "HuaweiComAscend910")
  637. if err != nil {
  638. return nil, err
  639. }
  640. cres, err := genClusterResources(tag, spec.ResourceQuantity.HuaweiComAscend910, spec)
  641. if err != nil {
  642. return nil, err
  643. }
  644. return cres, nil
  645. } else if spec.ResourceQuantity.EnflameComGcu != "" {
  646. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "EnflameComGcu")
  647. if err != nil {
  648. return nil, err
  649. }
  650. cres, err := genClusterResources(tag, spec.ResourceQuantity.EnflameComGcu, spec)
  651. if err != nil {
  652. return nil, err
  653. }
  654. return cres, nil
  655. } else if spec.ResourceQuantity.MXN260 != "" {
  656. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "MXN260")
  657. if err != nil {
  658. return nil, err
  659. }
  660. cres, err := genClusterResources(tag, spec.ResourceQuantity.MXN260, spec)
  661. if err != nil {
  662. return nil, err
  663. }
  664. return cres, nil
  665. } else if spec.ResourceQuantity.NvidiaV100 != "" {
  666. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "NvidiaV100")
  667. if err != nil {
  668. return nil, err
  669. }
  670. cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaV100, spec)
  671. if err != nil {
  672. return nil, err
  673. }
  674. return cres, nil
  675. } else if spec.ResourceQuantity.MetaxTechComGpu != "" {
  676. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "MetaxTechComGpu")
  677. if err != nil {
  678. return nil, err
  679. }
  680. cres, err := genClusterResources(tag, spec.ResourceQuantity.MetaxTechComGpu, spec)
  681. if err != nil {
  682. return nil, err
  683. }
  684. return cres, nil
  685. }
  686. return nil, nil
  687. }
  688. func genClusterResources(cType string, cNum string, s *omodel.Spec) (*collector.ClusterResource, error) {
  689. cres := &collector.ClusterResource{}
  690. bres := make([]*collector.Usage, 0)
  691. var cardNum int64
  692. var cpuCore int64
  693. var memGi int64
  694. cardNum, err := strconv.ParseInt(cNum, 10, 64)
  695. if err != nil {
  696. cardNum = 0
  697. }
  698. cpuCore, err = strconv.ParseInt(s.ResourceQuantity.Cpu, 10, 64)
  699. if err != nil {
  700. cpuCore = 0
  701. }
  702. if s.ResourceQuantity.Memory != "" {
  703. gi := strings.Split(s.ResourceQuantity.Memory, Gi)
  704. if len(gi) != 2 {
  705. return nil, fmt.Errorf("s.ResourceQuantity.Memory convert error: %s", s.ResourceQuantity.Memory)
  706. }
  707. mGi, err := strconv.ParseInt(gi[0], 10, 64)
  708. if err != nil {
  709. memGi = 0
  710. } else {
  711. memGi = mGi
  712. }
  713. } else {
  714. memGi = 0
  715. }
  716. card := &collector.Usage{
  717. Type: ComputeSourceToCardType[cType],
  718. Name: strings.ToUpper(cType),
  719. Total: &collector.UnitValue{Unit: NUMBER, Value: cardNum},
  720. Available: &collector.UnitValue{Unit: NUMBER, Value: cardNum},
  721. }
  722. cpu := &collector.Usage{
  723. Type: strings.ToUpper(CPU),
  724. Name: strings.ToUpper(CPU),
  725. Total: &collector.UnitValue{Unit: CPUCORE, Value: cpuCore},
  726. Available: &collector.UnitValue{Unit: CPUCORE, Value: cpuCore},
  727. }
  728. mem := &collector.Usage{
  729. Type: strings.ToUpper(MEMORY),
  730. Name: strings.ToUpper(RAM),
  731. Total: &collector.UnitValue{Unit: GIGABYTE, Value: memGi},
  732. Available: &collector.UnitValue{Unit: GIGABYTE, Value: memGi},
  733. }
  734. bres = append(bres, cpu)
  735. bres = append(bres, mem)
  736. cres.Resource = card
  737. cres.BaseResources = bres
  738. return cres, nil
  739. }
  740. // inference
  741. func (o *OctopusHttp) GetClusterInferUrl(ctx context.Context, option *option.InferOption) (*inference.ClusterInferUrl, error) {
  742. return nil, errors.New(NotImplementError)
  743. }
  744. func (o *OctopusHttp) GetInferDeployInstanceList(ctx context.Context) ([]*inference.DeployInstance, error) {
  745. return nil, errors.New(NotImplementError)
  746. }
  747. func (o *OctopusHttp) StartInferDeployInstance(ctx context.Context, id string) bool {
  748. return false
  749. }
  750. func (o *OctopusHttp) StopInferDeployInstance(ctx context.Context, id string) bool {
  751. return false
  752. }
  753. func (o *OctopusHttp) GetInferDeployInstance(ctx context.Context, id string) (*inference.DeployInstance, error) {
  754. return nil, errors.New(NotImplementError)
  755. }
  756. func (o *OctopusHttp) CreateInferDeployInstance(ctx context.Context, option *option.InferOption) (string, error) {
  757. return "", errors.New(NotImplementError)
  758. }
  759. func (o *OctopusHttp) CheckModelExistence(ctx context.Context, modelName string, modelType string) bool {
  760. return false
  761. }
  762. func (o *OctopusHttp) GetImageInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) {
  763. return "", errors.New(NotImplementError)
  764. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive, autonomous, peer-to-peer manner.