You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

octopusHttp.go 30 kB

4 months ago
4 months ago
4 months ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197
  1. package octopusHttp
  2. import (
  3. "bytes"
  4. "context"
  5. "encoding/json"
  6. "errors"
  7. "fmt"
  8. common2 "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/common"
  9. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/entity"
  10. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
  11. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
  12. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/executor"
  13. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference"
  14. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
  15. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
  16. omodel "gitlink.org.cn/JointCloud/pcm-octopus/http/model"
  17. "gitlink.org.cn/JointCloud/pcm-openi/common"
  18. "mime/multipart"
  19. "net/http"
  20. "strconv"
  21. "strings"
  22. "time"
  23. )
  // String constants shared by the Octopus HTTP adapter.
  const (
  	// Query-parameter names used when calling the Octopus API.
  	Param_Token = "token"
  	Param_Addr  = "addr"

  	// Separators and fragments used to build commands and job names.
  	Forward_Slash    = "/"
  	COMMA            = ","
  	UNDERSCORE       = "_"
  	SemiColon        = ";"
  	Python           = "python "
  	TASK_NAME_PREFIX = "trainJob"

  	// Labels for balances, units, resource kinds and task states. Their
  	// consumers are outside the portion of the file reviewed here.
  	BALANCE     = "balance"
  	RATE        = "rate"
  	PERHOUR     = "per-hour"
  	NUMBER      = "number"
  	KILOBYTE    = "kb"
  	GIGABYTE    = "gb"
  	CPUCORE     = "core"
  	STORAGE     = "STORAGE"
  	DISK        = "disk"
  	MEMORY      = "memory"
  	RAM         = "ram"
  	VRAM        = "vram"
  	RMB         = "rmb"
  	POINT       = "point"
  	RUNNINGTASK = "RUNNING_TASK"
  	RUNNING     = "RUNNING"
  	CPU         = "cpu"
  	Gi          = "Gi"

  	// Algorithm version label; in the reviewed part of the file it appears
  	// only in commented-out code.
  	AlgorithmRecordOnlyVersion = "V1"
  )
  // NotImplementError is the message carried by the errors that the
  // unimplemented adapter methods return.
  const NotImplementError = "not implemented"
  // Octopus API paths. Each one is appended directly to the configured server
  // address, so none of them starts with a slash.
  const (
  	// Algorithm endpoints.
  	MyAlgorithmListUrl = "api/v1/algorithm/myAlgorithmList"
  	CreateAlgorithm    = "api/v1/algorithm/create"

  	// Resource specification endpoint.
  	ResourcespecsUrl = "api/v1/resource/specs"

  	// Training job endpoints.
  	CreateTrainJobUrl = "api/v1/job/create"
  	TrainJobDetail    = "api/v1/job/detail"
  	TrainJobLog       = "api/v1/job/log"
  )
  // ComputeSourceToCardType maps a compute-source name, as it appears in a
  // resource specification, to the card type it belongs to. Keys are
  // case-sensitive; several models are listed in more than one spelling.
  var ComputeSourceToCardType = map[string]string{
  	// Sources reported as plain GPU.
  	"nvidia-a100":     "GPU",
  	"nvidia-a100-80g": "GPU",

  	// Sources reported as ILUVATAR-GPGPU.
  	"mr-v100":        "ILUVATAR-GPGPU",
  	"bi-v100":        "ILUVATAR-GPGPU",
  	"MR-V50":         "ILUVATAR-GPGPU",
  	"BI-V100":        "ILUVATAR-GPGPU",
  	"BI-V150":        "ILUVATAR-GPGPU",
  	"MR-V100":        "ILUVATAR-GPGPU",
  	"ILUVATAR-GPGPU": "ILUVATAR-GPGPU",

  	// Remaining accelerator families.
  	"cambricon.com/mlu":    "MLU",
  	"hygon.com/dcu":        "DCU",
  	"huawei.com/Ascend910": "NPU",
  	"enflame.com/gcu":      "GCU",
  	"MXN260":               "METAX-GPGPU",
  }
  83. type OctopusHttp struct {
  84. server string
  85. host string
  86. platform string
  87. participantId int64
  88. token *Token
  89. resourcePool string
  90. }
  91. func NewOctopusHttp(id int64, resourcePool, name, server, host, user, pwd string) *OctopusHttp {
  92. token := &Token{
  93. user: user,
  94. pwd: pwd,
  95. server: server,
  96. host: host,
  97. }
  98. return &OctopusHttp{resourcePool: resourcePool, platform: name, participantId: id, server: server, host: host, token: token}
  99. }
  100. // executor
  101. func (o *OctopusHttp) Execute(ctx context.Context, option *option.AiOption, mode int) (interface{}, error) {
  102. switch mode {
  103. case executor.SUBMIT_MODE_JOINT_CLOUD:
  104. case executor.SUBMIT_MODE_STORAGE_SCHEDULE:
  105. // cmd
  106. if option.AlgorithmId == "" {
  107. return nil, errors.New("algorithmId is empty")
  108. }
  109. if option.Cmd != "" {
  110. option.Cmd = option.Cmd + SemiColon + Python + option.AlgorithmId
  111. } else {
  112. option.Cmd = Python + option.AlgorithmId
  113. }
  114. // algorithm
  115. //param := &omodel.CreateMyAlgorithmParam{
  116. // AlgorithmName: option.AlgorithmId,
  117. // ModelName: option.AlgorithmId,
  118. //}
  119. //algorithm, err := o.createAlgorithm(ctx, param)
  120. //if err != nil {
  121. // return nil, err
  122. //}
  123. //if algorithm.Code != http.StatusOK {
  124. // if algorithm.Data != nil {
  125. // marshal, err := json.Marshal(algorithm.Data)
  126. // if err != nil {
  127. // return nil, err
  128. // }
  129. //
  130. // errormdl := &omodel.Error{}
  131. // err = json.Unmarshal(marshal, errormdl)
  132. // if err != nil {
  133. // return nil, err
  134. // }
  135. // return nil, errors.New(errormdl.Message)
  136. // } else {
  137. // return nil, errors.New(algorithm.Msg)
  138. // }
  139. //} else {
  140. // if algorithm.Data != nil {
  141. // result := &entity.OctCreateAlgorithm{}
  142. // marshal, err := json.Marshal(algorithm.Data)
  143. // if err != nil {
  144. // return nil, err
  145. // }
  146. // err = json.Unmarshal(marshal, result)
  147. // if err != nil {
  148. // return nil, err
  149. // }
  150. // if result.AlgorithmId == "" {
  151. // return nil, errors.New("createAlgorithm failed")
  152. // }
  153. // option.AlgorithmId = result.AlgorithmId
  154. // } else {
  155. // return nil, errors.New("createAlgorithm failed")
  156. // }
  157. //}
  158. // resource
  159. resp, err := o.resourceSpecs(ctx)
  160. if err != nil {
  161. return nil, err
  162. }
  163. id, err := matchResource(resp, option.ResourcesRequired)
  164. if err != nil {
  165. return nil, err
  166. }
  167. if id == nil {
  168. return nil, errors.New("resource id is nil")
  169. }
  170. option.ResourceId = *id
  171. // submit
  172. task, err := o.SubmitTask(ctx, option.ImageId, option.Cmd, option.Envs, option.Params, option.ResourceId, option.DatasetsId, option.AlgorithmId, option.TaskType)
  173. if err != nil {
  174. return nil, err
  175. }
  176. return task, nil
  177. }
  178. return nil, nil
  179. }
  180. func (o *OctopusHttp) Stop(ctx context.Context, id string) error {
  181. return nil
  182. }
  183. func (o *OctopusHttp) SubmitTask(ctx context.Context, imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string, aiType string) (interface{}, error) {
  184. // octopus提交任务
  185. reqUrl := o.server + CreateTrainJobUrl
  186. token, err := o.token.Get()
  187. if err != nil {
  188. return nil, err
  189. }
  190. // python参数
  191. var prms []struct {
  192. Key string `json:"key"`
  193. Value string `json:"value"`
  194. }
  195. for _, param := range params {
  196. var p struct {
  197. Key string `json:"key"`
  198. Value string `json:"value"`
  199. }
  200. s := strings.Split(param, COMMA)
  201. p.Key = s[0]
  202. p.Value = s[1]
  203. prms = append(prms, p)
  204. }
  205. //环境变量
  206. envMap := make(map[string]string)
  207. for _, env := range envs {
  208. s := strings.Split(env, COMMA)
  209. envMap[s[0]] = s[1]
  210. }
  211. param := &omodel.CreateTrainJobParam{
  212. //DataSetId: datasetsId,
  213. //DataSetVersion: VERSION,
  214. //AlgorithmId: algorithmId,
  215. //AlgorithmVersion: AlgorithmRecordOnlyVersion,
  216. Name: TASK_NAME_PREFIX + UNDERSCORE + utils.RandomString(10),
  217. ImageId: imageId,
  218. IsDistributed: false,
  219. ResourcePool: o.resourcePool,
  220. Config: []*omodel.CreateTrainJobConf{
  221. {
  222. Command: cmd,
  223. ResourceSpecId: resourceId,
  224. MinFailedTaskCount: 1,
  225. MinSucceededTaskCount: 1,
  226. TaskNumber: 1,
  227. //Parameters: prms,
  228. Envs: envMap,
  229. },
  230. },
  231. }
  232. resp := &entity.OctResp{}
  233. req := common.GetRestyRequest(common.TIMEOUT)
  234. _, err = req.
  235. SetHeader("Authorization", "Bearer "+token).
  236. SetQueryString("token=" + token).
  237. SetQueryString("addr=" + o.host).
  238. SetBody(param).
  239. SetResult(resp).
  240. Post(reqUrl)
  241. if err != nil {
  242. return nil, err
  243. }
  244. return resp, nil
  245. }
  246. func (o *OctopusHttp) createAlgorithm(ctx context.Context, param *omodel.CreateMyAlgorithmParam) (*entity.OctResp, error) {
  247. createAlgorithmUrl := o.server + CreateAlgorithm
  248. token, err := o.token.Get()
  249. if err != nil {
  250. return nil, err
  251. }
  252. resp := &entity.OctResp{}
  253. req := common.GetRestyRequest(common.TIMEOUT)
  254. _, err = req.
  255. SetHeader("Authorization", "Bearer "+token).
  256. SetQueryString("token=" + token).
  257. SetQueryString("addr=" + o.host).
  258. SetBody(param).
  259. SetResult(resp).
  260. Post(createAlgorithmUrl)
  261. if err != nil {
  262. return nil, err
  263. }
  264. return resp, nil
  265. }
  266. // collector
  267. func (o *OctopusHttp) resourceSpecs(ctx context.Context) (*entity.OctResp, error) {
  268. resourcespecsUrl := o.server + ResourcespecsUrl
  269. token, err := o.token.Get()
  270. if err != nil {
  271. return nil, err
  272. }
  273. param := omodel.ResourceSpecParam{
  274. ResourcePool: o.resourcePool,
  275. }
  276. b, _ := json.Marshal(param)
  277. byt := bytes.NewBuffer(b)
  278. resp := &entity.OctResp{}
  279. req := common.GetRestyRequest(common.TIMEOUT)
  280. r, _ := http.NewRequest("GET", resourcespecsUrl, byt)
  281. req.RawRequest = r
  282. req.URL = resourcespecsUrl
  283. _, err = req.
  284. SetHeader("Content-Type", "application/json").
  285. SetQueryParam(Param_Token, token).
  286. SetQueryParam(Param_Addr, o.host).
  287. SetBody(byt).
  288. SetResult(resp).
  289. Send()
  290. if err != nil {
  291. return nil, err
  292. }
  293. return resp, nil
  294. }
  295. func (o *OctopusHttp) GetResourceStats(ctx context.Context) (*collector.ResourceStats, error) {
  296. resp, err := o.resourceSpecs(ctx)
  297. if err != nil {
  298. return nil, err
  299. }
  300. if resp.Code != http.StatusOK {
  301. if resp.Data != nil {
  302. marshal, err := json.Marshal(resp.Data)
  303. if err != nil {
  304. return nil, err
  305. }
  306. errormdl := &omodel.Error{}
  307. err = json.Unmarshal(marshal, errormdl)
  308. if err != nil {
  309. return nil, err
  310. }
  311. return nil, errors.New(errormdl.Message)
  312. }
  313. } else {
  314. if resp.Data != nil {
  315. spec := &entity.OctResourceSpecs{}
  316. marshal, err := json.Marshal(resp.Data)
  317. if err != nil {
  318. return nil, err
  319. }
  320. err = json.Unmarshal(marshal, spec)
  321. if err != nil {
  322. return nil, err
  323. }
  324. }
  325. }
  326. return nil, nil
  327. }
  328. func (o *OctopusHttp) GetDatasetsSpecs(ctx context.Context) ([]*collector.DatasetsSpecs, error) {
  329. return nil, nil
  330. }
  331. func (o *OctopusHttp) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm, error) {
  332. return nil, errors.New(NotImplementError)
  333. }
  334. func (o *OctopusHttp) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) {
  335. taskDetailsUrl := o.server + TrainJobLog
  336. token, err := o.token.Get()
  337. if err != nil {
  338. return "", err
  339. }
  340. param := omodel.TrainJobLog{
  341. JobId: taskId,
  342. }
  343. b, _ := json.Marshal(param)
  344. byt := bytes.NewBuffer(b)
  345. resp := &entity.OctResp{}
  346. req := common.GetRestyRequest(common.TIMEOUT)
  347. r, _ := http.NewRequest("GET", taskDetailsUrl, byt)
  348. req.RawRequest = r
  349. req.URL = taskDetailsUrl
  350. _, err = req.
  351. SetHeader("Content-Type", "application/json").
  352. SetQueryParam(Param_Token, token).
  353. SetQueryParam(Param_Addr, o.host).
  354. SetBody(byt).
  355. SetResult(resp).
  356. Send()
  357. if err != nil {
  358. return "", errors.New("failed to invoke taskDetails")
  359. }
  360. if resp.Code != http.StatusOK {
  361. return "", errors.New("failed to invoke taskDetails")
  362. }
  363. var log string
  364. marshal, err := json.Marshal(resp.Data)
  365. if err != nil {
  366. return "", err
  367. }
  368. log = string(marshal)
  369. if strings.Contains(log, "404 Not Found") || log == "" {
  370. log = "waiting for logs..."
  371. }
  372. return log, nil
  373. }
  374. func (o *OctopusHttp) GetTrainingTask(ctx context.Context, taskId string) (*collector.Task, error) {
  375. if taskId == "" {
  376. return nil, errors.New("empty taskId")
  377. }
  378. resp, err := o.getTrainingTask(ctx, taskId)
  379. if err != nil {
  380. return nil, err
  381. }
  382. if resp.Code != http.StatusOK {
  383. if resp.Data != nil {
  384. marshal, err := json.Marshal(resp.Data)
  385. if err != nil {
  386. return nil, err
  387. }
  388. errormdl := &omodel.Error{}
  389. err = json.Unmarshal(marshal, errormdl)
  390. if err != nil {
  391. return nil, err
  392. }
  393. return nil, errors.New(errormdl.Message)
  394. }
  395. } else {
  396. if resp.Data != nil {
  397. job := &entity.OctTrainJob{}
  398. marshal, err := json.Marshal(resp.Data)
  399. if err != nil {
  400. return nil, err
  401. }
  402. err = json.Unmarshal(marshal, job)
  403. if err != nil {
  404. return nil, err
  405. }
  406. var task collector.Task
  407. task.Id = job.TrainJob.Id
  408. if job.TrainJob.StartedAt != 0 {
  409. task.Start = time.Unix(int64(job.TrainJob.StartedAt), 0).Format(constants.Layout)
  410. }
  411. if job.TrainJob.CompletedAt != 0 {
  412. task.End = time.Unix(int64(job.TrainJob.CompletedAt), 0).Format(constants.Layout)
  413. }
  414. switch job.TrainJob.Status {
  415. case "succeeded":
  416. task.Status = constants.Completed
  417. case "failed":
  418. task.Status = constants.Failed
  419. case "running":
  420. task.Status = constants.Running
  421. case "stopped":
  422. task.Status = constants.Stopped
  423. case "pending":
  424. task.Status = constants.Pending
  425. default:
  426. task.Status = "undefined"
  427. }
  428. return &task, nil
  429. }
  430. }
  431. return nil, errors.New("failed to get trainjob")
  432. }
  433. func (o *OctopusHttp) getTrainingTask(ctx context.Context, taskId string) (*entity.OctResp, error) {
  434. taskDetailsUrl := o.server + TrainJobDetail
  435. token, err := o.token.Get()
  436. if err != nil {
  437. return nil, err
  438. }
  439. param := omodel.TrainJobDetailParam{
  440. JobId: taskId,
  441. }
  442. b, _ := json.Marshal(param)
  443. byt := bytes.NewBuffer(b)
  444. resp := &entity.OctResp{}
  445. req := common.GetRestyRequest(common.TIMEOUT)
  446. r, _ := http.NewRequest("GET", taskDetailsUrl, byt)
  447. req.RawRequest = r
  448. req.URL = taskDetailsUrl
  449. _, err = req.
  450. SetHeader("Content-Type", "application/json").
  451. SetQueryParam(Param_Token, token).
  452. SetQueryParam(Param_Addr, o.host).
  453. SetBody(byt).
  454. SetResult(resp).
  455. Send()
  456. if err != nil {
  457. return nil, errors.New("failed to invoke taskDetails")
  458. }
  459. return resp, nil
  460. }
  461. func (o *OctopusHttp) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) {
  462. return "", errors.New(NotImplementError)
  463. }
  464. func (o *OctopusHttp) UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error {
  465. return nil
  466. }
  467. func (o *OctopusHttp) GetComputeCards(ctx context.Context) ([]string, error) {
  468. return nil, errors.New(NotImplementError)
  469. }
  470. func (o *OctopusHttp) GetUserBalance(ctx context.Context) (float64, error) {
  471. return 0, errors.New(NotImplementError)
  472. }
  473. func (o *OctopusHttp) GetResourceSpecs(ctx context.Context, resrcType string) (*collector.ResourceSpec, error) {
  474. resp, err := o.resourceSpecs(ctx)
  475. if err != nil {
  476. return nil, err
  477. }
  478. res := &collector.ResourceSpec{
  479. ClusterId: strconv.FormatInt(o.participantId, 10),
  480. Tag: resrcType,
  481. }
  482. if resp.Code != http.StatusOK {
  483. if resp.Data != nil {
  484. marshal, err := json.Marshal(resp.Data)
  485. if err != nil {
  486. return nil, err
  487. }
  488. errormdl := &omodel.Error{}
  489. err = json.Unmarshal(marshal, errormdl)
  490. if err != nil {
  491. return nil, err
  492. }
  493. return nil, errors.New(errormdl.Message)
  494. }
  495. } else {
  496. if resp.Data != nil {
  497. specs := &entity.OctResourceSpecs{}
  498. marshal, err := json.Marshal(resp.Data)
  499. if err != nil {
  500. return nil, err
  501. }
  502. err = json.Unmarshal(marshal, specs)
  503. if err != nil {
  504. return nil, err
  505. }
  506. clusterResources, err := genSpecs(specs, resrcType, nil)
  507. if err != nil {
  508. return nil, err
  509. }
  510. res.Resources = clusterResources
  511. }
  512. }
  513. return res, nil
  514. }
  515. func findSpecId(cType string, cNum string, s *omodel.Spec, resourcesRequired []map[string]interface{}) (*string, error) {
  516. var id string
  517. for _, res := range resourcesRequired {
  518. //typeName, ok := res["type"]
  519. //if !ok {
  520. // continue
  521. //}
  522. name, ok := res["name"]
  523. if !ok {
  524. continue
  525. }
  526. if str, ok := name.(string); ok {
  527. name = strings.ToLower(str)
  528. } else {
  529. continue
  530. }
  531. num, ok := res["number"]
  532. if !ok {
  533. continue
  534. }
  535. if str, ok := num.(string); ok {
  536. num = strings.ToLower(str)
  537. } else {
  538. continue
  539. }
  540. if cType == name && cNum == num {
  541. id = s.Id
  542. return &id, nil
  543. }
  544. }
  545. return nil, nil
  546. }
  547. func matchResource(resp *entity.OctResp, resourcesRequired []map[string]interface{}) (*string, error) {
  548. if resp.Code != http.StatusOK {
  549. if resp.Data != nil {
  550. marshal, err := json.Marshal(resp.Data)
  551. if err != nil {
  552. return nil, err
  553. }
  554. errormdl := &omodel.Error{}
  555. err = json.Unmarshal(marshal, errormdl)
  556. if err != nil {
  557. return nil, err
  558. }
  559. return nil, errors.New(errormdl.Message)
  560. }
  561. } else {
  562. if resp.Data != nil {
  563. spec := &entity.OctResourceSpecs{}
  564. marshal, err := json.Marshal(resp.Data)
  565. if err != nil {
  566. return nil, err
  567. }
  568. err = json.Unmarshal(marshal, spec)
  569. if err != nil {
  570. return nil, err
  571. }
  572. res, err := genSpecs(spec, "Train", resourcesRequired)
  573. if err != nil {
  574. return nil, err
  575. }
  576. if len(res) != 1 {
  577. return nil, errors.New("resource not found")
  578. }
  579. if str, ok := res[0].(*string); ok {
  580. return str, nil
  581. }
  582. }
  583. }
  584. return nil, errors.New("matchResource failed")
  585. }
  586. func genSpecs(specs *entity.OctResourceSpecs, resrcType string, resourcesRequired []map[string]interface{}) ([]interface{}, error) {
  587. res := make([]interface{}, 0)
  588. if resrcType == "Inference" {
  589. return res, nil
  590. } else if resrcType == "Train" {
  591. if specs.MapResourceSpecIdList.Train.ResourceSpecs == nil {
  592. return res, nil
  593. } else {
  594. for _, s := range specs.MapResourceSpecIdList.Train.ResourceSpecs {
  595. spec := &omodel.Spec{}
  596. marshal, err := json.Marshal(s)
  597. if err != nil {
  598. return nil, err
  599. }
  600. err = json.Unmarshal(marshal, spec)
  601. if err != nil {
  602. return nil, err
  603. }
  604. resType, err := chooseResourceType(spec, resourcesRequired)
  605. if err != nil {
  606. return nil, err
  607. }
  608. if resType == nil {
  609. continue
  610. }
  611. res = append(res, resType)
  612. }
  613. }
  614. }
  615. return res, nil
  616. }
  617. func chooseResourceType(spec *omodel.Spec, resourcesRequired []map[string]interface{}) (interface{}, error) {
  618. if spec.ResourceQuantity.NvidiaA100 != "" {
  619. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "NvidiaA100")
  620. if err != nil {
  621. return nil, err
  622. }
  623. var cres interface{}
  624. if resourcesRequired != nil {
  625. id, err := findSpecId(tag, spec.ResourceQuantity.NvidiaA100, spec, resourcesRequired)
  626. if err != nil {
  627. return nil, err
  628. }
  629. if id != nil {
  630. cres = id
  631. }
  632. } else {
  633. res, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec)
  634. if err != nil {
  635. return nil, err
  636. }
  637. cres = res
  638. }
  639. return cres, nil
  640. } else if spec.ResourceQuantity.NvidiaA10080G != "" {
  641. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "NvidiaA10080G")
  642. if err != nil {
  643. return nil, err
  644. }
  645. var cres interface{}
  646. if resourcesRequired != nil {
  647. id, err := findSpecId(tag, spec.ResourceQuantity.NvidiaA10080G, spec, resourcesRequired)
  648. if err != nil {
  649. return nil, err
  650. }
  651. if id != nil {
  652. cres = id
  653. }
  654. } else {
  655. res, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA10080G, spec)
  656. if err != nil {
  657. return nil, err
  658. }
  659. cres = res
  660. }
  661. //cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA10080G, spec)
  662. //if err != nil {
  663. // return nil, err
  664. //}
  665. return cres, nil
  666. } else if spec.ResourceQuantity.MrV100 != "" {
  667. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "MrV100")
  668. if err != nil {
  669. return nil, err
  670. }
  671. var cres interface{}
  672. if resourcesRequired != nil {
  673. id, err := findSpecId(tag, spec.ResourceQuantity.MrV100, spec, resourcesRequired)
  674. if err != nil {
  675. return nil, err
  676. }
  677. if id != nil {
  678. cres = id
  679. }
  680. } else {
  681. res, err := genClusterResources(tag, spec.ResourceQuantity.MrV100, spec)
  682. if err != nil {
  683. return nil, err
  684. }
  685. cres = res
  686. }
  687. //cres, err := genClusterResources(tag, spec.ResourceQuantity.MrV100, spec)
  688. //if err != nil {
  689. // return nil, err
  690. //}
  691. return cres, nil
  692. } else if spec.ResourceQuantity.BiV100 != "" {
  693. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "BiV100")
  694. if err != nil {
  695. return nil, err
  696. }
  697. var cres interface{}
  698. if resourcesRequired != nil {
  699. id, err := findSpecId(tag, spec.ResourceQuantity.BiV100, spec, resourcesRequired)
  700. if err != nil {
  701. return nil, err
  702. }
  703. if id != nil {
  704. cres = id
  705. }
  706. } else {
  707. res, err := genClusterResources(tag, spec.ResourceQuantity.BiV100, spec)
  708. if err != nil {
  709. return nil, err
  710. }
  711. cres = res
  712. }
  713. //cres, err := genClusterResources(tag, spec.ResourceQuantity.BiV100, spec)
  714. //if err != nil {
  715. // return nil, err
  716. //}
  717. return cres, nil
  718. } else if spec.ResourceQuantity.MRV50 != "" {
  719. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "MRV50")
  720. if err != nil {
  721. return nil, err
  722. }
  723. var cres interface{}
  724. if resourcesRequired != nil {
  725. id, err := findSpecId(tag, spec.ResourceQuantity.MRV50, spec, resourcesRequired)
  726. if err != nil {
  727. return nil, err
  728. }
  729. if id != nil {
  730. cres = id
  731. }
  732. } else {
  733. res, err := genClusterResources(tag, spec.ResourceQuantity.MRV50, spec)
  734. if err != nil {
  735. return nil, err
  736. }
  737. cres = res
  738. }
  739. //cres, err := genClusterResources(tag, spec.ResourceQuantity.MRV50, spec)
  740. //if err != nil {
  741. // return nil, err
  742. //}
  743. return cres, nil
  744. } else if spec.ResourceQuantity.BIV100 != "" {
  745. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "NvidiaA100")
  746. if err != nil {
  747. return nil, err
  748. }
  749. var cres interface{}
  750. if resourcesRequired != nil {
  751. id, err := findSpecId(tag, spec.ResourceQuantity.NvidiaA100, spec, resourcesRequired)
  752. if err != nil {
  753. return nil, err
  754. }
  755. if id != nil {
  756. cres = id
  757. }
  758. } else {
  759. res, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec)
  760. if err != nil {
  761. return nil, err
  762. }
  763. cres = res
  764. }
  765. //cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec)
  766. //if err != nil {
  767. // return nil, err
  768. //}
  769. return cres, nil
  770. } else if spec.ResourceQuantity.BIV150 != "" {
  771. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "BIV150")
  772. if err != nil {
  773. return nil, err
  774. }
  775. var cres interface{}
  776. if resourcesRequired != nil {
  777. id, err := findSpecId(tag, spec.ResourceQuantity.BIV150, spec, resourcesRequired)
  778. if err != nil {
  779. return nil, err
  780. }
  781. if id != nil {
  782. cres = id
  783. }
  784. } else {
  785. res, err := genClusterResources(tag, spec.ResourceQuantity.BIV150, spec)
  786. if err != nil {
  787. return nil, err
  788. }
  789. cres = res
  790. }
  791. //cres, err := genClusterResources(tag, spec.ResourceQuantity.BIV150, spec)
  792. //if err != nil {
  793. // return nil, err
  794. //}
  795. return cres, nil
  796. } else if spec.ResourceQuantity.MRV100 != "" {
  797. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "MRV100")
  798. if err != nil {
  799. return nil, err
  800. }
  801. var cres interface{}
  802. if resourcesRequired != nil {
  803. id, err := findSpecId(tag, spec.ResourceQuantity.MRV100, spec, resourcesRequired)
  804. if err != nil {
  805. return nil, err
  806. }
  807. if id != nil {
  808. cres = id
  809. }
  810. } else {
  811. res, err := genClusterResources(tag, spec.ResourceQuantity.MRV100, spec)
  812. if err != nil {
  813. return nil, err
  814. }
  815. cres = res
  816. }
  817. //cres, err := genClusterResources(tag, spec.ResourceQuantity.MRV100, spec)
  818. //if err != nil {
  819. // return nil, err
  820. //}
  821. return cres, nil
  822. } else if spec.ResourceQuantity.CambriconComMlu != "" {
  823. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "CambriconComMlu")
  824. if err != nil {
  825. return nil, err
  826. }
  827. var cres interface{}
  828. if resourcesRequired != nil {
  829. id, err := findSpecId(tag, spec.ResourceQuantity.CambriconComMlu, spec, resourcesRequired)
  830. if err != nil {
  831. return nil, err
  832. }
  833. if id != nil {
  834. cres = id
  835. }
  836. } else {
  837. res, err := genClusterResources(tag, spec.ResourceQuantity.CambriconComMlu, spec)
  838. if err != nil {
  839. return nil, err
  840. }
  841. cres = res
  842. }
  843. //cres, err := genClusterResources(tag, spec.ResourceQuantity.CambriconComMlu, spec)
  844. //if err != nil {
  845. // return nil, err
  846. //}
  847. return cres, nil
  848. } else if spec.ResourceQuantity.HygonComDcu != "" {
  849. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "HygonComDcu")
  850. if err != nil {
  851. return nil, err
  852. }
  853. var cres interface{}
  854. if resourcesRequired != nil {
  855. id, err := findSpecId(tag, spec.ResourceQuantity.HygonComDcu, spec, resourcesRequired)
  856. if err != nil {
  857. return nil, err
  858. }
  859. if id != nil {
  860. cres = id
  861. }
  862. } else {
  863. res, err := genClusterResources(tag, spec.ResourceQuantity.HygonComDcu, spec)
  864. if err != nil {
  865. return nil, err
  866. }
  867. cres = res
  868. }
  869. //cres, err := genClusterResources(tag, spec.ResourceQuantity.HygonComDcu, spec)
  870. //if err != nil {
  871. // return nil, err
  872. //}
  873. return cres, nil
  874. } else if spec.ResourceQuantity.HuaweiComAscend910 != "" {
  875. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "HuaweiComAscend910")
  876. if err != nil {
  877. return nil, err
  878. }
  879. var cres interface{}
  880. if resourcesRequired != nil {
  881. id, err := findSpecId(tag, spec.ResourceQuantity.HuaweiComAscend910, spec, resourcesRequired)
  882. if err != nil {
  883. return nil, err
  884. }
  885. if id != nil {
  886. cres = id
  887. }
  888. } else {
  889. res, err := genClusterResources(tag, spec.ResourceQuantity.HuaweiComAscend910, spec)
  890. if err != nil {
  891. return nil, err
  892. }
  893. cres = res
  894. }
  895. //cres, err := genClusterResources(tag, spec.ResourceQuantity.HuaweiComAscend910, spec)
  896. //if err != nil {
  897. // return nil, err
  898. //}
  899. return cres, nil
  900. } else if spec.ResourceQuantity.EnflameComGcu != "" {
  901. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "EnflameComGcu")
  902. if err != nil {
  903. return nil, err
  904. }
  905. var cres interface{}
  906. if resourcesRequired != nil {
  907. id, err := findSpecId(tag, spec.ResourceQuantity.EnflameComGcu, spec, resourcesRequired)
  908. if err != nil {
  909. return nil, err
  910. }
  911. if id != nil {
  912. cres = id
  913. }
  914. } else {
  915. res, err := genClusterResources(tag, spec.ResourceQuantity.EnflameComGcu, spec)
  916. if err != nil {
  917. return nil, err
  918. }
  919. cres = res
  920. }
  921. //cres, err := genClusterResources(tag, spec.ResourceQuantity.EnflameComGcu, spec)
  922. //if err != nil {
  923. // return nil, err
  924. //}
  925. return cres, nil
  926. } else if spec.ResourceQuantity.MXN260 != "" {
  927. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "MXN260")
  928. if err != nil {
  929. return nil, err
  930. }
  931. var cres interface{}
  932. if resourcesRequired != nil {
  933. id, err := findSpecId(tag, spec.ResourceQuantity.MXN260, spec, resourcesRequired)
  934. if err != nil {
  935. return nil, err
  936. }
  937. if id != nil {
  938. cres = id
  939. }
  940. } else {
  941. res, err := genClusterResources(tag, spec.ResourceQuantity.MXN260, spec)
  942. if err != nil {
  943. return nil, err
  944. }
  945. cres = res
  946. }
  947. //cres, err := genClusterResources(tag, spec.ResourceQuantity.MXN260, spec)
  948. //if err != nil {
  949. // return nil, err
  950. //}
  951. return cres, nil
  952. } else if spec.ResourceQuantity.NvidiaV100 != "" {
  953. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "NvidiaV100")
  954. if err != nil {
  955. return nil, err
  956. }
  957. var cres interface{}
  958. if resourcesRequired != nil {
  959. id, err := findSpecId(tag, spec.ResourceQuantity.NvidiaV100, spec, resourcesRequired)
  960. if err != nil {
  961. return nil, err
  962. }
  963. if id != nil {
  964. cres = id
  965. }
  966. } else {
  967. res, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaV100, spec)
  968. if err != nil {
  969. return nil, err
  970. }
  971. cres = res
  972. }
  973. //cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaV100, spec)
  974. //if err != nil {
  975. // return nil, err
  976. //}
  977. return cres, nil
  978. } else if spec.ResourceQuantity.MetaxTechComGpu != "" {
  979. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "MetaxTechComGpu")
  980. if err != nil {
  981. return nil, err
  982. }
  983. var cres interface{}
  984. if resourcesRequired != nil {
  985. id, err := findSpecId(tag, spec.ResourceQuantity.MetaxTechComGpu, spec, resourcesRequired)
  986. if err != nil {
  987. return nil, err
  988. }
  989. if id != nil {
  990. cres = id
  991. }
  992. } else {
  993. res, err := genClusterResources(tag, spec.ResourceQuantity.MetaxTechComGpu, spec)
  994. if err != nil {
  995. return nil, err
  996. }
  997. cres = res
  998. }
  999. //cres, err := genClusterResources(tag, spec.ResourceQuantity.MetaxTechComGpu, spec)
  1000. //if err != nil {
  1001. // return nil, err
  1002. //}
  1003. return cres, nil
  1004. }
  1005. return nil, nil
  1006. }
  1007. func genClusterResources(cType string, cNum string, s *omodel.Spec) (*collector.ClusterResource, error) {
  1008. cres := &collector.ClusterResource{}
  1009. bres := make([]*collector.Usage, 0)
  1010. var cardNum int64
  1011. var cpuCore int64
  1012. var memGi int64
  1013. cardNum, err := strconv.ParseInt(cNum, 10, 64)
  1014. if err != nil {
  1015. cardNum = 0
  1016. }
  1017. cpuCore, err = strconv.ParseInt(s.ResourceQuantity.Cpu, 10, 64)
  1018. if err != nil {
  1019. cpuCore = 0
  1020. }
  1021. if s.ResourceQuantity.Memory != "" {
  1022. gi := strings.Split(s.ResourceQuantity.Memory, Gi)
  1023. if len(gi) != 2 {
  1024. return nil, fmt.Errorf("s.ResourceQuantity.Memory convert error: %s", s.ResourceQuantity.Memory)
  1025. }
  1026. mGi, err := strconv.ParseInt(gi[0], 10, 64)
  1027. if err != nil {
  1028. memGi = 0
  1029. } else {
  1030. memGi = mGi
  1031. }
  1032. } else {
  1033. memGi = 0
  1034. }
  1035. card := &collector.Usage{
  1036. Type: ComputeSourceToCardType[cType],
  1037. Name: strings.ToUpper(cType),
  1038. Total: &collector.UnitValue{Unit: NUMBER, Value: cardNum},
  1039. Available: &collector.UnitValue{Unit: NUMBER, Value: cardNum},
  1040. }
  1041. cpu := &collector.Usage{
  1042. Type: strings.ToUpper(CPU),
  1043. Name: strings.ToUpper(CPU),
  1044. Total: &collector.UnitValue{Unit: CPUCORE, Value: cpuCore},
  1045. Available: &collector.UnitValue{Unit: CPUCORE, Value: cpuCore},
  1046. }
  1047. mem := &collector.Usage{
  1048. Type: strings.ToUpper(MEMORY),
  1049. Name: strings.ToUpper(RAM),
  1050. Total: &collector.UnitValue{Unit: GIGABYTE, Value: memGi},
  1051. Available: &collector.UnitValue{Unit: GIGABYTE, Value: memGi},
  1052. }
  1053. bres = append(bres, cpu)
  1054. bres = append(bres, mem)
  1055. cres.Resource = card
  1056. cres.BaseResources = bres
  1057. return cres, nil
  1058. }
  1059. // inference
  1060. func (o *OctopusHttp) GetClusterInferUrl(ctx context.Context, option *option.InferOption) (*inference.ClusterInferUrl, error) {
  1061. return nil, errors.New(NotImplementError)
  1062. }
  1063. func (o *OctopusHttp) GetInferDeployInstanceList(ctx context.Context) ([]*inference.DeployInstance, error) {
  1064. return nil, errors.New(NotImplementError)
  1065. }
  1066. func (o *OctopusHttp) StartInferDeployInstance(ctx context.Context, id string) bool {
  1067. return false
  1068. }
  1069. func (o *OctopusHttp) StopInferDeployInstance(ctx context.Context, id string) bool {
  1070. return false
  1071. }
  1072. func (o *OctopusHttp) GetInferDeployInstance(ctx context.Context, id string) (*inference.DeployInstance, error) {
  1073. return nil, errors.New(NotImplementError)
  1074. }
  1075. func (o *OctopusHttp) CreateInferDeployInstance(ctx context.Context, option *option.InferOption) (string, error) {
  1076. return "", errors.New(NotImplementError)
  1077. }
  1078. func (o *OctopusHttp) CheckModelExistence(ctx context.Context, modelName string, modelType string) bool {
  1079. return false
  1080. }
  1081. func (o *OctopusHttp) GetImageInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) {
  1082. return "", errors.New(NotImplementError)
  1083. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive, autonomous, peer-to-peer manner.