You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

octopusHttp.go 17 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615
  1. package octopusHttp
  2. import (
  3. "bytes"
  4. "context"
  5. "encoding/json"
  6. "errors"
  7. "fmt"
  8. common2 "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/common"
  9. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/entity"
  10. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
  11. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
  12. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/executor"
  13. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference"
  14. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
  15. omodel "gitlink.org.cn/JointCloud/pcm-octopus/http/model"
  16. "gitlink.org.cn/JointCloud/pcm-openi/common"
  17. "mime/multipart"
  18. "net/http"
  19. "strconv"
  20. "strings"
  21. )
  // String constants shared by the Octopus HTTP adapter.
  const (
      // RESOURCE_POOL is sent as the resource pool name in job-creation and
      // resource-spec requests.
      RESOURCE_POOL = "common-pool"

      // Query-parameter names used when calling the Octopus proxy.
      Param_Token = "token"
      Param_Addr  = "addr"

      // Separators and fragments used to build names and commands.
      Forward_Slash    = "/"
      COMMA            = ","
      UNDERSCORE       = "_"
      TASK_NAME_PREFIX = "trainJob"
      Python           = "python "
      SemiColon        = ";"

      // Labels for billing, units and resource kinds.
      BALANCE  = "balance"
      RATE     = "rate"
      PERHOUR  = "per-hour"
      NUMBER   = "number"
      KILOBYTE = "kb"
      GIGABYTE = "gb"
      CPUCORE  = "core"
      STORAGE  = "STORAGE"
      DISK     = "disk"
      MEMORY   = "memory"
      RAM      = "ram"
      VRAM     = "vram"
      RMB      = "rmb"
      POINT    = "point"

      // Task state labels.
      RUNNINGTASK = "RUNNING_TASK"
      RUNNING     = "RUNNING"

      CPU = "cpu"
      // Gi is the suffix split off memory quantities such as "128Gi".
      Gi = "Gi"
  )
  // NotImplementError is the message carried by the error that unimplemented
  // adapter methods return.
  const NotImplementError = "not implemented"
  // Octopus API paths. They are appended directly to the configured server
  // address, so they carry no leading slash.
  const (
      MyAlgorithmListUrl = "api/v1/algorithm/myAlgorithmList"
      ResourcespecsUrl   = "api/v1/resource/specs"
      CreateTrainJobUrl  = "api/v1/job/create"
      TrainJobDetail     = "api/v1/job/detail"
  )
  // ComputeSourceToCardType maps a compute-source key (the identifier of an
  // accelerator in a resource spec) to the card type reported for it.
  // Keys are case-sensitive; both lower- and upper-case spellings of some
  // Iluvatar models are listed.
  var ComputeSourceToCardType = map[string]string{
      // NVIDIA
      "nvidia-a100":     "GPU",
      "nvidia-a100-80g": "GPU",

      // Iluvatar
      "mr-v100":        "ILUVATAR-GPGPU",
      "bi-v100":        "ILUVATAR-GPGPU",
      "MR-V50":         "ILUVATAR-GPGPU",
      "BI-V100":        "ILUVATAR-GPGPU",
      "BI-V150":        "ILUVATAR-GPGPU",
      "MR-V100":        "ILUVATAR-GPGPU",
      "ILUVATAR-GPGPU": "ILUVATAR-GPGPU",

      // Other vendors
      "cambricon.com/mlu":    "MLU",
      "hygon.com/dcu":        "DCU",
      "huawei.com/Ascend910": "NPU",
      "enflame.com/gcu":      "GCU",
      "MXN260":               "METAX-GPGPU",
  }
  79. type OctopusHttp struct {
  80. server string
  81. host string
  82. platform string
  83. participantId int64
  84. token *Token
  85. }
  86. func NewOctopusHttp(id int64, name, server, host string, user string, pwd string) *OctopusHttp {
  87. token, _ := NewToken(server, host, user, pwd)
  88. return &OctopusHttp{platform: name, participantId: id, server: server, host: host, token: token}
  89. }
  90. // executor
  91. func (o *OctopusHttp) Execute(ctx context.Context, option *option.AiOption, mode int) (interface{}, error) {
  92. switch mode {
  93. case executor.SUBMIT_MODE_JOINT_CLOUD:
  94. case executor.SUBMIT_MODE_STORAGE_SCHEDULE:
  95. // cmd
  96. if option.AlgorithmId == "" {
  97. return nil, errors.New("algorithmId is empty")
  98. }
  99. if option.Cmd != "" {
  100. option.Cmd = option.Cmd + SemiColon + Python + option.AlgorithmId
  101. } else {
  102. option.Cmd = Python + option.AlgorithmId
  103. }
  104. option.ResourceId = "9e2feeae30e04492a4298755179f2ae0"
  105. task, err := o.SubmitTask(ctx, option.ImageId, option.Cmd, option.Envs, option.Params, option.ResourceId, option.DatasetsId, option.AlgorithmId, option.TaskType)
  106. if err != nil {
  107. return nil, err
  108. }
  109. return task, nil
  110. }
  111. return nil, nil
  112. }
  113. func (o *OctopusHttp) Stop(ctx context.Context, id string) error {
  114. //TODO implement me
  115. panic("implement me")
  116. }
  117. func (o *OctopusHttp) SubmitTask(ctx context.Context, imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string, aiType string) (interface{}, error) {
  118. // octopus提交任务
  119. reqUrl := o.server + CreateTrainJobUrl
  120. token, err := o.token.Get()
  121. if err != nil {
  122. return nil, err
  123. }
  124. // python参数
  125. var prms []struct {
  126. Key string `json:"key"`
  127. Value string `json:"value"`
  128. }
  129. for _, param := range params {
  130. var p struct {
  131. Key string `json:"key"`
  132. Value string `json:"value"`
  133. }
  134. s := strings.Split(param, COMMA)
  135. p.Key = s[0]
  136. p.Value = s[1]
  137. prms = append(prms, p)
  138. }
  139. //环境变量
  140. envMap := make(map[string]string)
  141. for _, env := range envs {
  142. s := strings.Split(env, COMMA)
  143. envMap[s[0]] = s[1]
  144. }
  145. param := &omodel.CreateTrainJobParam{
  146. //DataSetId: datasetsId,
  147. //DataSetVersion: VERSION,
  148. //AlgorithmId: algorithmId,
  149. //AlgorithmVersion: VERSION,
  150. Name: TASK_NAME_PREFIX + UNDERSCORE + utils.RandomString(10),
  151. ImageId: imageId,
  152. IsDistributed: false,
  153. ResourcePool: RESOURCE_POOL,
  154. Config: []*omodel.CreateTrainJobConf{
  155. {
  156. Command: cmd,
  157. ResourceSpecId: resourceId,
  158. MinFailedTaskCount: 1,
  159. MinSucceededTaskCount: 1,
  160. TaskNumber: 1,
  161. Parameters: prms,
  162. Envs: envMap,
  163. },
  164. },
  165. }
  166. resp := &entity.OctCreateJobResp{}
  167. req := common.GetRestyRequest(common.TIMEOUT)
  168. _, err = req.
  169. SetHeader("Authorization", "Bearer "+token).
  170. SetQueryString("token=" + token).
  171. SetQueryString("addr=" + o.host).
  172. SetBody(param).
  173. SetResult(resp).
  174. Post(reqUrl)
  175. if err != nil {
  176. return nil, err
  177. }
  178. return resp, nil
  179. }
  180. // collector
  181. func (o *OctopusHttp) resourceSpecs(ctx context.Context) (*entity.OctResourceSpecsResp, error) {
  182. resourcespecsUrl := o.server + ResourcespecsUrl
  183. token, err := o.token.Get()
  184. if err != nil {
  185. return nil, err
  186. }
  187. param := omodel.ResourceSpecParam{
  188. ResourcePool: RESOURCE_POOL,
  189. }
  190. b, _ := json.Marshal(param)
  191. byt := bytes.NewBuffer(b)
  192. resp := &entity.OctResourceSpecsResp{}
  193. req := common.GetRestyRequest(common.TIMEOUT)
  194. r, _ := http.NewRequest("GET", resourcespecsUrl, byt)
  195. req.RawRequest = r
  196. req.URL = resourcespecsUrl
  197. _, err = req.
  198. SetHeader("Content-Type", "application/json").
  199. SetQueryParam(Param_Token, token).
  200. SetQueryParam(Param_Addr, o.host).
  201. SetBody(byt).
  202. SetResult(resp).
  203. Send()
  204. if err != nil {
  205. return nil, err
  206. }
  207. return resp, nil
  208. }
  209. func (o *OctopusHttp) GetResourceStats(ctx context.Context) (*collector.ResourceStats, error) {
  210. resp, err := o.resourceSpecs(ctx)
  211. if err != nil {
  212. return nil, err
  213. }
  214. if resp.Code != http.StatusOK {
  215. if resp.Data != nil {
  216. marshal, err := json.Marshal(resp.Data)
  217. if err != nil {
  218. return nil, err
  219. }
  220. errormdl := &omodel.Error{}
  221. err = json.Unmarshal(marshal, errormdl)
  222. if err != nil {
  223. return nil, err
  224. }
  225. return nil, errors.New(errormdl.Message)
  226. }
  227. } else {
  228. if resp.Data != nil {
  229. spec := &entity.OctResourceSpecs{}
  230. marshal, err := json.Marshal(resp.Data)
  231. if err != nil {
  232. return nil, err
  233. }
  234. err = json.Unmarshal(marshal, spec)
  235. if err != nil {
  236. return nil, err
  237. }
  238. }
  239. }
  240. return nil, nil
  241. }
  242. func (o *OctopusHttp) GetDatasetsSpecs(ctx context.Context) ([]*collector.DatasetsSpecs, error) {
  243. return nil, nil
  244. }
  245. func (o *OctopusHttp) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm, error) {
  246. //TODO implement me
  247. panic("implement me")
  248. }
  249. func (o *OctopusHttp) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) {
  250. //TODO implement me
  251. panic("implement me")
  252. }
  253. func (o *OctopusHttp) GetTrainingTask(ctx context.Context, taskId string) (*collector.Task, error) {
  254. //TODO implement me
  255. panic("implement me")
  256. }
  257. func (o *OctopusHttp) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) {
  258. //TODO implement me
  259. panic("implement me")
  260. }
  261. func (o *OctopusHttp) UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error {
  262. //TODO implement me
  263. panic("implement me")
  264. }
  265. func (o OctopusHttp) GetComputeCards(ctx context.Context) ([]string, error) {
  266. //TODO implement me
  267. panic("implement me")
  268. }
  269. func (o *OctopusHttp) GetUserBalance(ctx context.Context) (float64, error) {
  270. //TODO implement me
  271. panic("implement me")
  272. }
  273. func (o *OctopusHttp) GetResourceSpecs(ctx context.Context, resrcType string) (*collector.ResourceSpec, error) {
  274. resp, err := o.resourceSpecs(ctx)
  275. if err != nil {
  276. return nil, err
  277. }
  278. res := &collector.ResourceSpec{
  279. ClusterId: strconv.FormatInt(o.participantId, 10),
  280. Tag: resrcType,
  281. }
  282. if resp.Code != http.StatusOK {
  283. if resp.Data != nil {
  284. marshal, err := json.Marshal(resp.Data)
  285. if err != nil {
  286. return nil, err
  287. }
  288. errormdl := &omodel.Error{}
  289. err = json.Unmarshal(marshal, errormdl)
  290. if err != nil {
  291. return nil, err
  292. }
  293. return nil, errors.New(errormdl.Message)
  294. }
  295. } else {
  296. if resp.Data != nil {
  297. specs := &entity.OctResourceSpecs{}
  298. marshal, err := json.Marshal(resp.Data)
  299. if err != nil {
  300. return nil, err
  301. }
  302. err = json.Unmarshal(marshal, specs)
  303. if err != nil {
  304. return nil, err
  305. }
  306. clusterResources, err := genSpecs(specs, resrcType)
  307. if err != nil {
  308. return nil, err
  309. }
  310. res.Resources = clusterResources
  311. }
  312. }
  313. return res, nil
  314. }
  315. func genSpecs(specs *entity.OctResourceSpecs, resrcType string) ([]interface{}, error) {
  316. res := make([]interface{}, 0)
  317. if resrcType == "Inference" {
  318. return res, nil
  319. } else if resrcType == "Train" {
  320. if specs.MapResourceSpecIdList.Train.ResourceSpecs == nil {
  321. return res, nil
  322. } else {
  323. for _, s := range specs.MapResourceSpecIdList.Train.ResourceSpecs {
  324. spec := &omodel.Spec{}
  325. marshal, err := json.Marshal(s)
  326. if err != nil {
  327. return nil, err
  328. }
  329. err = json.Unmarshal(marshal, spec)
  330. if err != nil {
  331. return nil, err
  332. }
  333. resType, err := chooseResourceType(spec)
  334. if err != nil {
  335. return nil, err
  336. }
  337. if resType == nil {
  338. continue
  339. }
  340. res = append(res, resType)
  341. }
  342. }
  343. }
  344. return res, nil
  345. }
  346. func chooseResourceType(spec *omodel.Spec) (*collector.ClusterResource, error) {
  347. if spec.ResourceQuantity.NvidiaA100 != "" {
  348. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "NvidiaA100")
  349. if err != nil {
  350. return nil, err
  351. }
  352. cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec)
  353. if err != nil {
  354. return nil, err
  355. }
  356. return cres, nil
  357. } else if spec.ResourceQuantity.NvidiaA10080G != "" {
  358. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "NvidiaA10080G")
  359. if err != nil {
  360. return nil, err
  361. }
  362. cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA10080G, spec)
  363. if err != nil {
  364. return nil, err
  365. }
  366. return cres, nil
  367. } else if spec.ResourceQuantity.MrV100 != "" {
  368. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "MrV100")
  369. if err != nil {
  370. return nil, err
  371. }
  372. cres, err := genClusterResources(tag, spec.ResourceQuantity.MrV100, spec)
  373. if err != nil {
  374. return nil, err
  375. }
  376. return cres, nil
  377. } else if spec.ResourceQuantity.BiV100 != "" {
  378. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "BiV100")
  379. if err != nil {
  380. return nil, err
  381. }
  382. cres, err := genClusterResources(tag, spec.ResourceQuantity.BiV100, spec)
  383. if err != nil {
  384. return nil, err
  385. }
  386. return cres, nil
  387. } else if spec.ResourceQuantity.MRV50 != "" {
  388. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "MRV50")
  389. if err != nil {
  390. return nil, err
  391. }
  392. cres, err := genClusterResources(tag, spec.ResourceQuantity.MRV50, spec)
  393. if err != nil {
  394. return nil, err
  395. }
  396. return cres, nil
  397. } else if spec.ResourceQuantity.BIV100 != "" {
  398. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "NvidiaA100")
  399. if err != nil {
  400. return nil, err
  401. }
  402. cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec)
  403. if err != nil {
  404. return nil, err
  405. }
  406. return cres, nil
  407. } else if spec.ResourceQuantity.BIV150 != "" {
  408. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "BIV150")
  409. if err != nil {
  410. return nil, err
  411. }
  412. cres, err := genClusterResources(tag, spec.ResourceQuantity.BIV150, spec)
  413. if err != nil {
  414. return nil, err
  415. }
  416. return cres, nil
  417. } else if spec.ResourceQuantity.MRV100 != "" {
  418. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "MRV100")
  419. if err != nil {
  420. return nil, err
  421. }
  422. cres, err := genClusterResources(tag, spec.ResourceQuantity.MRV100, spec)
  423. if err != nil {
  424. return nil, err
  425. }
  426. return cres, nil
  427. } else if spec.ResourceQuantity.CambriconComMlu != "" {
  428. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "CambriconComMlu")
  429. if err != nil {
  430. return nil, err
  431. }
  432. cres, err := genClusterResources(tag, spec.ResourceQuantity.CambriconComMlu, spec)
  433. if err != nil {
  434. return nil, err
  435. }
  436. return cres, nil
  437. } else if spec.ResourceQuantity.HygonComDcu != "" {
  438. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "HygonComDcu")
  439. if err != nil {
  440. return nil, err
  441. }
  442. cres, err := genClusterResources(tag, spec.ResourceQuantity.HygonComDcu, spec)
  443. if err != nil {
  444. return nil, err
  445. }
  446. return cres, nil
  447. } else if spec.ResourceQuantity.HuaweiComAscend910 != "" {
  448. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "HuaweiComAscend910")
  449. if err != nil {
  450. return nil, err
  451. }
  452. cres, err := genClusterResources(tag, spec.ResourceQuantity.HuaweiComAscend910, spec)
  453. if err != nil {
  454. return nil, err
  455. }
  456. return cres, nil
  457. } else if spec.ResourceQuantity.EnflameComGcu != "" {
  458. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "EnflameComGcu")
  459. if err != nil {
  460. return nil, err
  461. }
  462. cres, err := genClusterResources(tag, spec.ResourceQuantity.EnflameComGcu, spec)
  463. if err != nil {
  464. return nil, err
  465. }
  466. return cres, nil
  467. } else if spec.ResourceQuantity.MXN260 != "" {
  468. tag, err := common2.GetJSONTag(spec.ResourceQuantity, "MXN260")
  469. if err != nil {
  470. return nil, err
  471. }
  472. cres, err := genClusterResources(tag, spec.ResourceQuantity.MXN260, spec)
  473. if err != nil {
  474. return nil, err
  475. }
  476. return cres, nil
  477. }
  478. return nil, nil
  479. }
  480. func genClusterResources(cType string, cNum string, s *omodel.Spec) (*collector.ClusterResource, error) {
  481. cres := &collector.ClusterResource{}
  482. bres := make([]*collector.Usage, 0)
  483. var cardNum int64
  484. var cpuCore int64
  485. cardNum, err := strconv.ParseInt(cNum, 10, 64)
  486. if err != nil {
  487. cardNum = 0
  488. }
  489. cpuCore, err = strconv.ParseInt(s.ResourceQuantity.Cpu, 10, 64)
  490. if err != nil {
  491. cpuCore = 0
  492. }
  493. gi := strings.Split(s.ResourceQuantity.Memory, Gi)
  494. if len(gi) != 2 {
  495. return nil, fmt.Errorf("s.ResourceQuantity.Memory convert error: %s", s.ResourceQuantity.Memory)
  496. }
  497. memGi, err := strconv.ParseInt(gi[0], 10, 64)
  498. if err != nil {
  499. memGi = 0
  500. }
  501. card := &collector.Usage{
  502. Type: ComputeSourceToCardType[cType],
  503. Name: strings.ToUpper(cType),
  504. Total: &collector.UnitValue{Unit: NUMBER, Value: cardNum},
  505. Available: &collector.UnitValue{Unit: NUMBER, Value: cardNum},
  506. }
  507. cpu := &collector.Usage{
  508. Type: strings.ToUpper(CPU),
  509. Name: strings.ToUpper(CPU),
  510. Total: &collector.UnitValue{Unit: CPUCORE, Value: cpuCore},
  511. Available: &collector.UnitValue{Unit: CPUCORE, Value: cpuCore},
  512. }
  513. mem := &collector.Usage{
  514. Type: strings.ToUpper(MEMORY),
  515. Name: strings.ToUpper(RAM),
  516. Total: &collector.UnitValue{Unit: GIGABYTE, Value: memGi},
  517. Available: &collector.UnitValue{Unit: GIGABYTE, Value: memGi},
  518. }
  519. bres = append(bres, cpu)
  520. bres = append(bres, mem)
  521. cres.Resource = card
  522. cres.BaseResources = bres
  523. return cres, nil
  524. }
  525. // inference
  526. func (o *OctopusHttp) GetClusterInferUrl(ctx context.Context, option *option.InferOption) (*inference.ClusterInferUrl, error) {
  527. return nil, errors.New(NotImplementError)
  528. }
  529. func (o *OctopusHttp) GetInferDeployInstanceList(ctx context.Context) ([]*inference.DeployInstance, error) {
  530. return nil, errors.New(NotImplementError)
  531. }
  532. func (o *OctopusHttp) StartInferDeployInstance(ctx context.Context, id string) bool {
  533. return false
  534. }
  535. func (o *OctopusHttp) StopInferDeployInstance(ctx context.Context, id string) bool {
  536. return false
  537. }
  538. func (o *OctopusHttp) GetInferDeployInstance(ctx context.Context, id string) (*inference.DeployInstance, error) {
  539. return nil, errors.New(NotImplementError)
  540. }
  541. func (o *OctopusHttp) CreateInferDeployInstance(ctx context.Context, option *option.InferOption) (string, error) {
  542. return "", errors.New(NotImplementError)
  543. }
  544. func (o *OctopusHttp) CheckModelExistence(ctx context.Context, modelName string, modelType string) bool {
  545. return false
  546. }
  547. func (o *OctopusHttp) GetImageInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) {
  548. return "", errors.New(NotImplementError)
  549. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive, autonomous, peer-to-peer manner.