You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

octopusHttp.go 21 kB

4 months ago
4 months ago
4 months ago
4 months ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786
  1. package octopusHttp
  2. import (
  3. "bytes"
  4. "context"
  5. "encoding/json"
  6. "errors"
  7. "fmt"
  8. "github.com/zeromicro/go-zero/core/logx"
  9. common2 "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/common"
  10. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/entity"
  11. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
  12. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
  13. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/executor"
  14. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference"
  15. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
  16. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
  17. omodel "gitlink.org.cn/JointCloud/pcm-octopus/http/model"
  18. "gitlink.org.cn/JointCloud/pcm-openi/common"
  19. "mime/multipart"
  20. "net/http"
  21. "strconv"
  22. "strings"
  23. "time"
  24. )
  25. const (
  26. RESOURCE_POOL = "grampus-pool"
  27. Param_Token = "token"
  28. Param_Addr = "addr"
  29. Forward_Slash = "/"
  30. COMMA = ","
  31. UNDERSCORE = "_"
  32. TASK_NAME_PREFIX = "trainJob"
  33. Python = "python "
  34. SemiColon = ";"
  35. BALANCE = "balance"
  36. RATE = "rate"
  37. PERHOUR = "per-hour"
  38. NUMBER = "number"
  39. KILOBYTE = "kb"
  40. GIGABYTE = "gb"
  41. CPUCORE = "core"
  42. STORAGE = "STORAGE"
  43. DISK = "disk"
  44. MEMORY = "memory"
  45. RAM = "ram"
  46. VRAM = "vram"
  47. RMB = "rmb"
  48. POINT = "point"
  49. RUNNINGTASK = "RUNNING_TASK"
  50. RUNNING = "RUNNING"
  51. CPU = "cpu"
  52. Gi = "Gi"
  53. )
  54. const (
  55. NotImplementError = "not implemented"
  56. )
  57. const (
  58. MyAlgorithmListUrl = "api/v1/algorithm/myAlgorithmList"
  59. ResourcespecsUrl = "api/v1/resource/specs"
  60. CreateTrainJobUrl = "api/v1/job/create"
  61. TrainJobDetail = "api/v1/job/detail"
  62. TrainJobLog = "api/v1/job/log"
  63. )
  64. // compute source
  65. var (
  66. ComputeSourceToCardType = map[string]string{
  67. "nvidia-a100": "GPU",
  68. "nvidia-a100-80g": "GPU",
  69. "mr-v100": "ILUVATAR-GPGPU",
  70. "bi-v100": "ILUVATAR-GPGPU",
  71. "MR-V50": "ILUVATAR-GPGPU",
  72. "BI-V100": "ILUVATAR-GPGPU",
  73. "BI-V150": "ILUVATAR-GPGPU",
  74. "MR-V100": "ILUVATAR-GPGPU",
  75. "cambricon.com/mlu": "MLU",
  76. "hygon.com/dcu": "DCU",
  77. "huawei.com/Ascend910": "NPU",
  78. "enflame.com/gcu": "GCU",
  79. "ILUVATAR-GPGPU": "ILUVATAR-GPGPU",
  80. "MXN260": "METAX-GPGPU",
  81. }
  82. )
  83. type OctopusHttp struct {
  84. server string
  85. host string
  86. platform string
  87. participantId int64
  88. token *Token
  89. }
  90. func NewOctopusHttp(id int64, name, server, host string, user string, pwd string) *OctopusHttp {
  91. token, err := NewToken(server, host, user, pwd)
  92. if err != nil {
  93. logx.Infof("Init OctopusHttp, id: %d, host: %s, token error: %s \n", id, host, err)
  94. }
  95. return &OctopusHttp{platform: name, participantId: id, server: server, host: host, token: token}
  96. }
  97. // executor
  98. func (o *OctopusHttp) Execute(ctx context.Context, option *option.AiOption, mode int) (interface{}, error) {
  99. switch mode {
  100. case executor.SUBMIT_MODE_JOINT_CLOUD:
  101. case executor.SUBMIT_MODE_STORAGE_SCHEDULE:
  102. // cmd
  103. if option.AlgorithmId == "" {
  104. return nil, errors.New("algorithmId is empty")
  105. }
  106. if option.Cmd != "" {
  107. option.Cmd = option.Cmd + SemiColon + Python + option.AlgorithmId
  108. } else {
  109. option.Cmd = Python + option.AlgorithmId
  110. }
  111. option.ResourceId = "964fdee2db544928bfea74dac12a924f"
  112. task, err := o.SubmitTask(ctx, option.ImageId, option.Cmd, option.Envs, option.Params, option.ResourceId, option.DatasetsId, option.AlgorithmId, option.TaskType)
  113. if err != nil {
  114. return nil, err
  115. }
  116. return task, nil
  117. }
  118. return nil, nil
  119. }
  120. func (o *OctopusHttp) Stop(ctx context.Context, id string) error {
  121. return nil
  122. }
  123. func (o *OctopusHttp) SubmitTask(ctx context.Context, imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string, aiType string) (interface{}, error) {
  124. // octopus提交任务
  125. reqUrl := o.server + CreateTrainJobUrl
  126. token, err := o.token.Get()
  127. if err != nil {
  128. return nil, err
  129. }
  130. // python参数
  131. var prms []struct {
  132. Key string `json:"key"`
  133. Value string `json:"value"`
  134. }
  135. for _, param := range params {
  136. var p struct {
  137. Key string `json:"key"`
  138. Value string `json:"value"`
  139. }
  140. s := strings.Split(param, COMMA)
  141. p.Key = s[0]
  142. p.Value = s[1]
  143. prms = append(prms, p)
  144. }
  145. //环境变量
  146. envMap := make(map[string]string)
  147. for _, env := range envs {
  148. s := strings.Split(env, COMMA)
  149. envMap[s[0]] = s[1]
  150. }
  151. param := &omodel.CreateTrainJobParam{
  152. //DataSetId: datasetsId,
  153. //DataSetVersion: VERSION,
  154. //AlgorithmId: algorithmId,
  155. //AlgorithmVersion: VERSION,
  156. Name: TASK_NAME_PREFIX + UNDERSCORE + utils.RandomString(10),
  157. ImageId: imageId,
  158. IsDistributed: false,
  159. ResourcePool: RESOURCE_POOL,
  160. Config: []*omodel.CreateTrainJobConf{
  161. {
  162. Command: cmd,
  163. ResourceSpecId: resourceId,
  164. MinFailedTaskCount: 1,
  165. MinSucceededTaskCount: 1,
  166. TaskNumber: 1,
  167. //Parameters: prms,
  168. Envs: envMap,
  169. },
  170. },
  171. }
  172. resp := &entity.OctResp{}
  173. req := common.GetRestyRequest(common.TIMEOUT)
  174. _, err = req.
  175. SetHeader("Authorization", "Bearer "+token).
  176. SetQueryString("token=" + token).
  177. SetQueryString("addr=" + o.host).
  178. SetBody(param).
  179. SetResult(resp).
  180. Post(reqUrl)
  181. if err != nil {
  182. return nil, err
  183. }
  184. return resp, nil
  185. }
  186. // collector
  187. func (o *OctopusHttp) resourceSpecs(ctx context.Context) (*entity.OctResp, error) {
  188. resourcespecsUrl := o.server + ResourcespecsUrl
  189. token, err := o.token.Get()
  190. if err != nil {
  191. return nil, err
  192. }
  193. param := omodel.ResourceSpecParam{
  194. ResourcePool: RESOURCE_POOL,
  195. }
  196. b, _ := json.Marshal(param)
  197. byt := bytes.NewBuffer(b)
  198. resp := &entity.OctResp{}
  199. req := common.GetRestyRequest(common.TIMEOUT)
  200. r, _ := http.NewRequest("GET", resourcespecsUrl, byt)
  201. req.RawRequest = r
  202. req.URL = resourcespecsUrl
  203. _, err = req.
  204. SetHeader("Content-Type", "application/json").
  205. SetQueryParam(Param_Token, token).
  206. SetQueryParam(Param_Addr, o.host).
  207. SetBody(byt).
  208. SetResult(resp).
  209. Send()
  210. if err != nil {
  211. return nil, err
  212. }
  213. return resp, nil
  214. }
  215. func (o *OctopusHttp) GetResourceStats(ctx context.Context) (*collector.ResourceStats, error) {
  216. resp, err := o.resourceSpecs(ctx)
  217. if err != nil {
  218. return nil, err
  219. }
  220. if resp.Code != http.StatusOK {
  221. if resp.Data != nil {
  222. marshal, err := json.Marshal(resp.Data)
  223. if err != nil {
  224. return nil, err
  225. }
  226. errormdl := &omodel.Error{}
  227. err = json.Unmarshal(marshal, errormdl)
  228. if err != nil {
  229. return nil, err
  230. }
  231. return nil, errors.New(errormdl.Message)
  232. }
  233. } else {
  234. if resp.Data != nil {
  235. spec := &entity.OctResourceSpecs{}
  236. marshal, err := json.Marshal(resp.Data)
  237. if err != nil {
  238. return nil, err
  239. }
  240. err = json.Unmarshal(marshal, spec)
  241. if err != nil {
  242. return nil, err
  243. }
  244. }
  245. }
  246. return nil, nil
  247. }
  248. func (o *OctopusHttp) GetDatasetsSpecs(ctx context.Context) ([]*collector.DatasetsSpecs, error) {
  249. return nil, nil
  250. }
  251. func (o *OctopusHttp) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm, error) {
  252. return nil, errors.New(NotImplementError)
  253. }
  254. func (o *OctopusHttp) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) {
  255. taskDetailsUrl := o.server + TrainJobLog
  256. token, err := o.token.Get()
  257. if err != nil {
  258. return "", err
  259. }
  260. param := omodel.TrainJobLog{
  261. JobId: taskId,
  262. }
  263. b, _ := json.Marshal(param)
  264. byt := bytes.NewBuffer(b)
  265. resp := &entity.OctResp{}
  266. req := common.GetRestyRequest(common.TIMEOUT)
  267. r, _ := http.NewRequest("GET", taskDetailsUrl, byt)
  268. req.RawRequest = r
  269. req.URL = taskDetailsUrl
  270. _, err = req.
  271. SetHeader("Content-Type", "application/json").
  272. SetQueryParam(Param_Token, token).
  273. SetQueryParam(Param_Addr, o.host).
  274. SetBody(byt).
  275. SetResult(resp).
  276. Send()
  277. if err != nil {
  278. return "", errors.New("failed to invoke taskDetails")
  279. }
  280. if resp.Code != http.StatusOK {
  281. return "", errors.New("failed to invoke taskDetails")
  282. }
  283. var log string
  284. marshal, err := json.Marshal(resp.Data)
  285. if err != nil {
  286. return "", err
  287. }
  288. log = string(marshal)
  289. if strings.Contains(log, "404 Not Found") || log == "" {
  290. log = "waiting for logs..."
  291. }
  292. return log, nil
  293. }
  294. func (o *OctopusHttp) GetTrainingTask(ctx context.Context, taskId string) (*collector.Task, error) {
  295. if taskId == "" {
  296. return nil, errors.New("empty taskId")
  297. }
  298. resp, err := o.getTrainingTask(ctx, taskId)
  299. if err != nil {
  300. return nil, err
  301. }
  302. if resp.Code != http.StatusOK {
  303. if resp.Data != nil {
  304. marshal, err := json.Marshal(resp.Data)
  305. if err != nil {
  306. return nil, err
  307. }
  308. errormdl := &omodel.Error{}
  309. err = json.Unmarshal(marshal, errormdl)
  310. if err != nil {
  311. return nil, err
  312. }
  313. return nil, errors.New(errormdl.Message)
  314. }
  315. } else {
  316. if resp.Data != nil {
  317. job := &entity.OctTrainJob{}
  318. marshal, err := json.Marshal(resp.Data)
  319. if err != nil {
  320. return nil, err
  321. }
  322. err = json.Unmarshal(marshal, job)
  323. if err != nil {
  324. return nil, err
  325. }
  326. var task collector.Task
  327. task.Id = job.TrainJob.Id
  328. if job.TrainJob.StartedAt != 0 {
  329. task.Start = time.Unix(int64(job.TrainJob.StartedAt), 0).Format(constants.Layout)
  330. }
  331. if job.TrainJob.CompletedAt != 0 {
  332. task.End = time.Unix(int64(job.TrainJob.CompletedAt), 0).Format(constants.Layout)
  333. }
  334. switch job.TrainJob.Status {
  335. case "succeeded":
  336. task.Status = constants.Completed
  337. case "failed":
  338. task.Status = constants.Failed
  339. case "running":
  340. task.Status = constants.Running
  341. case "stopped":
  342. task.Status = constants.Stopped
  343. case "pending":
  344. task.Status = constants.Pending
  345. default:
  346. task.Status = "undefined"
  347. }
  348. return &task, nil
  349. }
  350. }
  351. return nil, errors.New("failed to get trainjob")
  352. }
  353. func (o *OctopusHttp) getTrainingTask(ctx context.Context, taskId string) (*entity.OctResp, error) {
  354. taskDetailsUrl := o.server + TrainJobDetail
  355. token, err := o.token.Get()
  356. if err != nil {
  357. return nil, err
  358. }
  359. param := omodel.TrainJobDetailParam{
  360. JobId: taskId,
  361. }
  362. b, _ := json.Marshal(param)
  363. byt := bytes.NewBuffer(b)
  364. resp := &entity.OctResp{}
  365. req := common.GetRestyRequest(common.TIMEOUT)
  366. r, _ := http.NewRequest("GET", taskDetailsUrl, byt)
  367. req.RawRequest = r
  368. req.URL = taskDetailsUrl
  369. _, err = req.
  370. SetHeader("Content-Type", "application/json").
  371. SetQueryParam(Param_Token, token).
  372. SetQueryParam(Param_Addr, o.host).
  373. SetBody(byt).
  374. SetResult(resp).
  375. Send()
  376. if err != nil {
  377. return nil, errors.New("failed to invoke taskDetails")
  378. }
  379. return resp, nil
  380. }
  381. func (o *OctopusHttp) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) {
  382. return "", errors.New(NotImplementError)
  383. }
  384. func (o *OctopusHttp) UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error {
  385. return nil
  386. }
  387. func (o OctopusHttp) GetComputeCards(ctx context.Context) ([]string, error) {
  388. return nil, errors.New(NotImplementError)
  389. }
  390. func (o *OctopusHttp) GetUserBalance(ctx context.Context) (float64, error) {
  391. return 0, errors.New(NotImplementError)
  392. }
  393. func (o *OctopusHttp) GetResourceSpecs(ctx context.Context, resrcType string) (*collector.ResourceSpec, error) {
  394. resp, err := o.resourceSpecs(ctx)
  395. if err != nil {
  396. return nil, err
  397. }
  398. res := &collector.ResourceSpec{
  399. ClusterId: strconv.FormatInt(o.participantId, 10),
  400. Tag: resrcType,
  401. }
  402. if resp.Code != http.StatusOK {
  403. if resp.Data != nil {
  404. marshal, err := json.Marshal(resp.Data)
  405. if err != nil {
  406. return nil, err
  407. }
  408. errormdl := &omodel.Error{}
  409. err = json.Unmarshal(marshal, errormdl)
  410. if err != nil {
  411. return nil, err
  412. }
  413. return nil, errors.New(errormdl.Message)
  414. }
  415. } else {
  416. if resp.Data != nil {
  417. specs := &entity.OctResourceSpecs{}
  418. marshal, err := json.Marshal(resp.Data)
  419. if err != nil {
  420. return nil, err
  421. }
  422. err = json.Unmarshal(marshal, specs)
  423. if err != nil {
  424. return nil, err
  425. }
  426. clusterResources, err := genSpecs(specs, resrcType)
  427. if err != nil {
  428. return nil, err
  429. }
  430. res.Resources = clusterResources
  431. }
  432. }
  433. return res, nil
  434. }
  435. func genSpecs(specs *entity.OctResourceSpecs, resrcType string) ([]interface{}, error) {
  436. res := make([]interface{}, 0)
  437. if resrcType == "Inference" {
  438. return res, nil
  439. } else if resrcType == "Train" {
  440. if specs.MapResourceSpecIdList.Train.ResourceSpecs == nil {
  441. return res, nil
  442. } else {
  443. for _, s := range specs.MapResourceSpecIdList.Train.ResourceSpecs {
  444. spec := &omodel.Spec{}
  445. marshal, err := json.Marshal(s)
  446. if err != nil {
  447. return nil, err
  448. }
  449. err = json.Unmarshal(marshal, spec)
  450. if err != nil {
  451. return nil, err
  452. }
  453. resType, err := chooseResourceType(spec)
  454. if err != nil {
  455. return nil, err
  456. }
  457. if resType == nil {
  458. continue
  459. }
  460. res = append(res, resType)
  461. }
  462. }
  463. }
  464. return res, nil
  465. }
  466. func chooseResourceType(spec *omodel.Spec) (*collector.ClusterResource, error) {
  467. if spec.ResourceQuantity.NvidiaA100 != "" {
  468. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "NvidiaA100")
  469. if err != nil {
  470. return nil, err
  471. }
  472. cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec)
  473. if err != nil {
  474. return nil, err
  475. }
  476. return cres, nil
  477. } else if spec.ResourceQuantity.NvidiaA10080G != "" {
  478. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "NvidiaA10080G")
  479. if err != nil {
  480. return nil, err
  481. }
  482. cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA10080G, spec)
  483. if err != nil {
  484. return nil, err
  485. }
  486. return cres, nil
  487. } else if spec.ResourceQuantity.MrV100 != "" {
  488. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "MrV100")
  489. if err != nil {
  490. return nil, err
  491. }
  492. cres, err := genClusterResources(tag, spec.ResourceQuantity.MrV100, spec)
  493. if err != nil {
  494. return nil, err
  495. }
  496. return cres, nil
  497. } else if spec.ResourceQuantity.BiV100 != "" {
  498. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "BiV100")
  499. if err != nil {
  500. return nil, err
  501. }
  502. cres, err := genClusterResources(tag, spec.ResourceQuantity.BiV100, spec)
  503. if err != nil {
  504. return nil, err
  505. }
  506. return cres, nil
  507. } else if spec.ResourceQuantity.MRV50 != "" {
  508. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "MRV50")
  509. if err != nil {
  510. return nil, err
  511. }
  512. cres, err := genClusterResources(tag, spec.ResourceQuantity.MRV50, spec)
  513. if err != nil {
  514. return nil, err
  515. }
  516. return cres, nil
  517. } else if spec.ResourceQuantity.BIV100 != "" {
  518. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "NvidiaA100")
  519. if err != nil {
  520. return nil, err
  521. }
  522. cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec)
  523. if err != nil {
  524. return nil, err
  525. }
  526. return cres, nil
  527. } else if spec.ResourceQuantity.BIV150 != "" {
  528. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "BIV150")
  529. if err != nil {
  530. return nil, err
  531. }
  532. cres, err := genClusterResources(tag, spec.ResourceQuantity.BIV150, spec)
  533. if err != nil {
  534. return nil, err
  535. }
  536. return cres, nil
  537. } else if spec.ResourceQuantity.MRV100 != "" {
  538. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "MRV100")
  539. if err != nil {
  540. return nil, err
  541. }
  542. cres, err := genClusterResources(tag, spec.ResourceQuantity.MRV100, spec)
  543. if err != nil {
  544. return nil, err
  545. }
  546. return cres, nil
  547. } else if spec.ResourceQuantity.CambriconComMlu != "" {
  548. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "CambriconComMlu")
  549. if err != nil {
  550. return nil, err
  551. }
  552. cres, err := genClusterResources(tag, spec.ResourceQuantity.CambriconComMlu, spec)
  553. if err != nil {
  554. return nil, err
  555. }
  556. return cres, nil
  557. } else if spec.ResourceQuantity.HygonComDcu != "" {
  558. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "HygonComDcu")
  559. if err != nil {
  560. return nil, err
  561. }
  562. cres, err := genClusterResources(tag, spec.ResourceQuantity.HygonComDcu, spec)
  563. if err != nil {
  564. return nil, err
  565. }
  566. return cres, nil
  567. } else if spec.ResourceQuantity.HuaweiComAscend910 != "" {
  568. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "HuaweiComAscend910")
  569. if err != nil {
  570. return nil, err
  571. }
  572. cres, err := genClusterResources(tag, spec.ResourceQuantity.HuaweiComAscend910, spec)
  573. if err != nil {
  574. return nil, err
  575. }
  576. return cres, nil
  577. } else if spec.ResourceQuantity.EnflameComGcu != "" {
  578. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "EnflameComGcu")
  579. if err != nil {
  580. return nil, err
  581. }
  582. cres, err := genClusterResources(tag, spec.ResourceQuantity.EnflameComGcu, spec)
  583. if err != nil {
  584. return nil, err
  585. }
  586. return cres, nil
  587. } else if spec.ResourceQuantity.MXN260 != "" {
  588. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "MXN260")
  589. if err != nil {
  590. return nil, err
  591. }
  592. cres, err := genClusterResources(tag, spec.ResourceQuantity.MXN260, spec)
  593. if err != nil {
  594. return nil, err
  595. }
  596. return cres, nil
  597. } else if spec.ResourceQuantity.NvidiaV100 != "" {
  598. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "NvidiaV100")
  599. if err != nil {
  600. return nil, err
  601. }
  602. cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaV100, spec)
  603. if err != nil {
  604. return nil, err
  605. }
  606. return cres, nil
  607. } else if spec.ResourceQuantity.MetaxTechComGpu != "" {
  608. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "MetaxTechComGpu")
  609. if err != nil {
  610. return nil, err
  611. }
  612. cres, err := genClusterResources(tag, spec.ResourceQuantity.MetaxTechComGpu, spec)
  613. if err != nil {
  614. return nil, err
  615. }
  616. return cres, nil
  617. }
  618. return nil, nil
  619. }
  620. func genClusterResources(cType string, cNum string, s *omodel.Spec) (*collector.ClusterResource, error) {
  621. cres := &collector.ClusterResource{}
  622. bres := make([]*collector.Usage, 0)
  623. var cardNum int64
  624. var cpuCore int64
  625. var memGi int64
  626. cardNum, err := strconv.ParseInt(cNum, 10, 64)
  627. if err != nil {
  628. cardNum = 0
  629. }
  630. cpuCore, err = strconv.ParseInt(s.ResourceQuantity.Cpu, 10, 64)
  631. if err != nil {
  632. cpuCore = 0
  633. }
  634. if s.ResourceQuantity.Memory != "" {
  635. gi := strings.Split(s.ResourceQuantity.Memory, Gi)
  636. if len(gi) != 2 {
  637. return nil, fmt.Errorf("s.ResourceQuantity.Memory convert error: %s", s.ResourceQuantity.Memory)
  638. }
  639. mGi, err := strconv.ParseInt(gi[0], 10, 64)
  640. if err != nil {
  641. memGi = 0
  642. } else {
  643. memGi = mGi
  644. }
  645. } else {
  646. memGi = 0
  647. }
  648. card := &collector.Usage{
  649. Type: ComputeSourceToCardType[cType],
  650. Name: strings.ToUpper(cType),
  651. Total: &collector.UnitValue{Unit: NUMBER, Value: cardNum},
  652. Available: &collector.UnitValue{Unit: NUMBER, Value: cardNum},
  653. }
  654. cpu := &collector.Usage{
  655. Type: strings.ToUpper(CPU),
  656. Name: strings.ToUpper(CPU),
  657. Total: &collector.UnitValue{Unit: CPUCORE, Value: cpuCore},
  658. Available: &collector.UnitValue{Unit: CPUCORE, Value: cpuCore},
  659. }
  660. mem := &collector.Usage{
  661. Type: strings.ToUpper(MEMORY),
  662. Name: strings.ToUpper(RAM),
  663. Total: &collector.UnitValue{Unit: GIGABYTE, Value: memGi},
  664. Available: &collector.UnitValue{Unit: GIGABYTE, Value: memGi},
  665. }
  666. bres = append(bres, cpu)
  667. bres = append(bres, mem)
  668. cres.Resource = card
  669. cres.BaseResources = bres
  670. return cres, nil
  671. }
  672. // inference
  673. func (o *OctopusHttp) GetClusterInferUrl(ctx context.Context, option *option.InferOption) (*inference.ClusterInferUrl, error) {
  674. return nil, errors.New(NotImplementError)
  675. }
  676. func (o *OctopusHttp) GetInferDeployInstanceList(ctx context.Context) ([]*inference.DeployInstance, error) {
  677. return nil, errors.New(NotImplementError)
  678. }
  679. func (o *OctopusHttp) StartInferDeployInstance(ctx context.Context, id string) bool {
  680. return false
  681. }
  682. func (o *OctopusHttp) StopInferDeployInstance(ctx context.Context, id string) bool {
  683. return false
  684. }
  685. func (o *OctopusHttp) GetInferDeployInstance(ctx context.Context, id string) (*inference.DeployInstance, error) {
  686. return nil, errors.New(NotImplementError)
  687. }
  688. func (o *OctopusHttp) CreateInferDeployInstance(ctx context.Context, option *option.InferOption) (string, error) {
  689. return "", errors.New(NotImplementError)
  690. }
  691. func (o *OctopusHttp) CheckModelExistence(ctx context.Context, modelName string, modelType string) bool {
  692. return false
  693. }
  694. func (o *OctopusHttp) GetImageInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) {
  695. return "", errors.New(NotImplementError)
  696. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive and autonomous peer-to-peer manner.