You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

octopusHttp.go 16 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611
  1. package octopusHttp
  2. import (
  3. "bytes"
  4. "context"
  5. "encoding/json"
  6. "errors"
  7. "fmt"
  8. common2 "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/common"
  9. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/entity"
  10. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
  11. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
  12. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/executor"
  13. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference"
  14. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
  15. omodel "gitlink.org.cn/JointCloud/pcm-octopus/http/model"
  16. "gitlink.org.cn/JointCloud/pcm-openi/common"
  17. "mime/multipart"
  18. "net/http"
  19. "strconv"
  20. "strings"
  21. )
  22. const (
  23. RESOURCE_POOL = "common-pool"
  24. Param_Token = "token"
  25. Param_Addr = "addr"
  26. Forward_Slash = "/"
  27. COMMA = ","
  28. UNDERSCORE = "_"
  29. TASK_NAME_PREFIX = "trainJob"
  30. Python = "python "
  31. SemiColon = ";"
  32. BALANCE = "balance"
  33. RATE = "rate"
  34. PERHOUR = "per-hour"
  35. NUMBER = "number"
  36. KILOBYTE = "kb"
  37. GIGABYTE = "gb"
  38. CPUCORE = "core"
  39. STORAGE = "STORAGE"
  40. DISK = "disk"
  41. MEMORY = "memory"
  42. RAM = "ram"
  43. VRAM = "vram"
  44. RMB = "rmb"
  45. POINT = "point"
  46. RUNNINGTASK = "RUNNING_TASK"
  47. RUNNING = "RUNNING"
  48. CPU = "cpu"
  49. Gi = "Gi"
  50. )
  51. const (
  52. NotImplementError = "not implemented"
  53. )
  54. const (
  55. MyAlgorithmListUrl = "api/v1/algorithm/myAlgorithmList"
  56. ResourcespecsUrl = "api/v1/resource/specs"
  57. CreateTrainJobUrl = "api/v1/job/create"
  58. TrainJobDetail = "api/v1/job/detail"
  59. )
  60. // compute source
  61. var (
  62. ComputeSourceToCardType = map[string]string{
  63. "nvidia-a100": "GPU",
  64. "nvidia-a100-80g": "GPU",
  65. "mr-v100": "ILUVATAR-GPGPU",
  66. "bi-v100": "ILUVATAR-GPGPU",
  67. "MR-V50": "ILUVATAR-GPGPU",
  68. "BI-V100": "ILUVATAR-GPGPU",
  69. "BI-V150": "ILUVATAR-GPGPU",
  70. "MR-V100": "ILUVATAR-GPGPU",
  71. "cambricon.com/mlu": "MLU",
  72. "hygon.com/dcu": "DCU",
  73. "huawei.com/Ascend910": "NPU",
  74. "enflame.com/gcu": "GCU",
  75. "ILUVATAR-GPGPU": "ILUVATAR-GPGPU",
  76. "MXN260": "METAX-GPGPU",
  77. }
  78. )
  79. type OctopusHttp struct {
  80. server string
  81. host string
  82. platform string
  83. participantId int64
  84. token *Token
  85. }
  86. func NewOctopusHttp(id int64, name, server, host string, user string, pwd string) *OctopusHttp {
  87. token, _ := NewToken(server, host, user, pwd)
  88. return &OctopusHttp{platform: name, participantId: id, server: server, host: host, token: token}
  89. }
  90. // executor
  91. func (o *OctopusHttp) Execute(ctx context.Context, option *option.AiOption, mode int) (interface{}, error) {
  92. switch mode {
  93. case executor.SUBMIT_MODE_JOINT_CLOUD:
  94. case executor.SUBMIT_MODE_STORAGE_SCHEDULE:
  95. // cmd
  96. if option.AlgorithmId == "" {
  97. return nil, errors.New("algorithmId is empty")
  98. }
  99. if option.Cmd != "" {
  100. option.Cmd = option.Cmd + SemiColon + Python + option.AlgorithmId
  101. } else {
  102. option.Cmd = Python + option.AlgorithmId
  103. }
  104. option.ResourceId = "9e2feeae30e04492a4298755179f2ae0"
  105. task, err := o.SubmitTask(ctx, option.ImageId, option.Cmd, option.Envs, option.Params, option.ResourceId, option.DatasetsId, option.AlgorithmId, option.TaskType)
  106. if err != nil {
  107. return nil, err
  108. }
  109. return task, nil
  110. }
  111. return nil, nil
  112. }
  113. func (o *OctopusHttp) Stop(ctx context.Context, id string) error {
  114. //TODO implement me
  115. panic("implement me")
  116. }
  117. func (o *OctopusHttp) SubmitTask(ctx context.Context, imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string, aiType string) (interface{}, error) {
  118. // octopus提交任务
  119. reqUrl := o.server + CreateTrainJobUrl
  120. token, err := o.token.Get()
  121. if err != nil {
  122. return nil, err
  123. }
  124. // python参数
  125. var prms []struct {
  126. Key string `json:"key"`
  127. Value string `json:"value"`
  128. }
  129. for _, param := range params {
  130. var p struct {
  131. Key string `json:"key"`
  132. Value string `json:"value"`
  133. }
  134. s := strings.Split(param, COMMA)
  135. p.Key = s[0]
  136. p.Value = s[1]
  137. prms = append(prms, p)
  138. }
  139. //环境变量
  140. envMap := make(map[string]string)
  141. for _, env := range envs {
  142. s := strings.Split(env, COMMA)
  143. envMap[s[0]] = s[1]
  144. }
  145. param := &omodel.CreateTrainJobParam{
  146. //DataSetId: datasetsId,
  147. //DataSetVersion: VERSION,
  148. //AlgorithmId: algorithmId,
  149. //AlgorithmVersion: VERSION,
  150. Name: TASK_NAME_PREFIX + UNDERSCORE + utils.RandomString(10),
  151. ImageId: imageId,
  152. IsDistributed: false,
  153. ResourcePool: RESOURCE_POOL,
  154. Config: []*omodel.CreateTrainJobConf{
  155. {
  156. Command: cmd,
  157. ResourceSpecId: resourceId,
  158. MinFailedTaskCount: 1,
  159. MinSucceededTaskCount: 1,
  160. TaskNumber: 1,
  161. Parameters: prms,
  162. Envs: envMap,
  163. },
  164. },
  165. }
  166. resp := &entity.OctCreateJobResp{}
  167. req := common.GetRestyRequest(common.TIMEOUT)
  168. _, err = req.
  169. SetHeader("Authorization", "Bearer "+token).
  170. SetBody(param).
  171. SetResult(resp).
  172. Post(reqUrl)
  173. if err != nil {
  174. return nil, err
  175. }
  176. return resp, nil
  177. }
  178. // collector
  179. func (o *OctopusHttp) resourceSpecs(ctx context.Context) (*entity.OctResourceSpecsResp, error) {
  180. resourcespecsUrl := o.server + ResourcespecsUrl
  181. token, err := o.token.Get()
  182. if err != nil {
  183. return nil, err
  184. }
  185. param := omodel.ResourceSpecParam{
  186. ResourcePool: RESOURCE_POOL,
  187. }
  188. b, _ := json.Marshal(param)
  189. byt := bytes.NewBuffer(b)
  190. resp := &entity.OctResourceSpecsResp{}
  191. req := common.GetRestyRequest(common.TIMEOUT)
  192. r, _ := http.NewRequest("GET", resourcespecsUrl, byt)
  193. req.RawRequest = r
  194. req.URL = resourcespecsUrl
  195. _, err = req.
  196. SetHeader("Content-Type", "application/json").
  197. SetQueryParam(Param_Token, token).
  198. SetQueryParam(Param_Addr, o.host).
  199. SetBody(byt).
  200. SetResult(resp).
  201. Send()
  202. if err != nil {
  203. return nil, err
  204. }
  205. return resp, nil
  206. }
  207. func (o *OctopusHttp) GetResourceStats(ctx context.Context) (*collector.ResourceStats, error) {
  208. resp, err := o.resourceSpecs(ctx)
  209. if err != nil {
  210. return nil, err
  211. }
  212. if resp.Code != http.StatusOK {
  213. if resp.Data != nil {
  214. marshal, err := json.Marshal(resp.Data)
  215. if err != nil {
  216. return nil, err
  217. }
  218. errormdl := &omodel.Error{}
  219. err = json.Unmarshal(marshal, errormdl)
  220. if err != nil {
  221. return nil, err
  222. }
  223. return nil, errors.New(errormdl.Message)
  224. }
  225. } else {
  226. if resp.Data != nil {
  227. spec := &entity.OctResourceSpecs{}
  228. marshal, err := json.Marshal(resp.Data)
  229. if err != nil {
  230. return nil, err
  231. }
  232. err = json.Unmarshal(marshal, spec)
  233. if err != nil {
  234. return nil, err
  235. }
  236. }
  237. }
  238. return nil, nil
  239. }
  240. func (o *OctopusHttp) GetDatasetsSpecs(ctx context.Context) ([]*collector.DatasetsSpecs, error) {
  241. return nil, nil
  242. }
  243. func (o *OctopusHttp) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm, error) {
  244. //TODO implement me
  245. panic("implement me")
  246. }
  247. func (o *OctopusHttp) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) {
  248. //TODO implement me
  249. panic("implement me")
  250. }
  251. func (o *OctopusHttp) GetTrainingTask(ctx context.Context, taskId string) (*collector.Task, error) {
  252. //TODO implement me
  253. panic("implement me")
  254. }
  255. func (o *OctopusHttp) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) {
  256. //TODO implement me
  257. panic("implement me")
  258. }
  259. func (o *OctopusHttp) UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error {
  260. //TODO implement me
  261. panic("implement me")
  262. }
  263. func (o OctopusHttp) GetComputeCards(ctx context.Context) ([]string, error) {
  264. //TODO implement me
  265. panic("implement me")
  266. }
  267. func (o *OctopusHttp) GetUserBalance(ctx context.Context) (float64, error) {
  268. //TODO implement me
  269. panic("implement me")
  270. }
  271. func (o *OctopusHttp) GetResourceSpecs(ctx context.Context, resrcType string) (*collector.ResourceSpec, error) {
  272. resp, err := o.resourceSpecs(ctx)
  273. if err != nil {
  274. return nil, err
  275. }
  276. res := &collector.ResourceSpec{
  277. ClusterId: strconv.FormatInt(o.participantId, 10),
  278. Tag: resrcType,
  279. }
  280. if resp.Code != http.StatusOK {
  281. if resp.Data != nil {
  282. marshal, err := json.Marshal(resp.Data)
  283. if err != nil {
  284. return nil, err
  285. }
  286. errormdl := &omodel.Error{}
  287. err = json.Unmarshal(marshal, errormdl)
  288. if err != nil {
  289. return nil, err
  290. }
  291. return nil, errors.New(errormdl.Message)
  292. }
  293. } else {
  294. if resp.Data != nil {
  295. specs := &entity.OctResourceSpecs{}
  296. marshal, err := json.Marshal(resp.Data)
  297. if err != nil {
  298. return nil, err
  299. }
  300. err = json.Unmarshal(marshal, specs)
  301. if err != nil {
  302. return nil, err
  303. }
  304. clusterResources, err := genSpecs(specs, resrcType)
  305. if err != nil {
  306. return nil, err
  307. }
  308. res.Resources = clusterResources
  309. }
  310. }
  311. return res, nil
  312. }
  313. func genSpecs(specs *entity.OctResourceSpecs, resrcType string) ([]interface{}, error) {
  314. res := make([]interface{}, 0)
  315. if resrcType == "Inference" {
  316. return res, nil
  317. } else if resrcType == "Train" {
  318. if specs.MapResourceSpecIdList.Train.ResourceSpecs == nil {
  319. return res, nil
  320. } else {
  321. for _, s := range specs.MapResourceSpecIdList.Train.ResourceSpecs {
  322. spec := &omodel.Spec{}
  323. marshal, err := json.Marshal(s)
  324. if err != nil {
  325. return nil, err
  326. }
  327. err = json.Unmarshal(marshal, specs)
  328. if err != nil {
  329. return nil, err
  330. }
  331. resType, err := chooseResourceType(spec)
  332. if err != nil {
  333. return nil, err
  334. }
  335. if resType == nil {
  336. continue
  337. }
  338. res = append(res, resType)
  339. }
  340. }
  341. }
  342. return res, nil
  343. }
  344. func chooseResourceType(spec *omodel.Spec) (*collector.ClusterResource, error) {
  345. if spec.ResourceQuantity.NvidiaA100 != "" {
  346. tag, err := common2.GetJSONTag(spec, "NvidiaA100")
  347. if err != nil {
  348. return nil, err
  349. }
  350. cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec)
  351. if err != nil {
  352. return nil, err
  353. }
  354. return cres, nil
  355. } else if spec.ResourceQuantity.NvidiaA10080G != "" {
  356. tag, err := common2.GetJSONTag(spec, "NvidiaA100")
  357. if err != nil {
  358. return nil, err
  359. }
  360. cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec)
  361. if err != nil {
  362. return nil, err
  363. }
  364. return cres, nil
  365. } else if spec.ResourceQuantity.MrV100 != "" {
  366. tag, err := common2.GetJSONTag(spec, "NvidiaA100")
  367. if err != nil {
  368. return nil, err
  369. }
  370. cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec)
  371. if err != nil {
  372. return nil, err
  373. }
  374. return cres, nil
  375. } else if spec.ResourceQuantity.BiV100 != "" {
  376. tag, err := common2.GetJSONTag(spec, "NvidiaA100")
  377. if err != nil {
  378. return nil, err
  379. }
  380. cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec)
  381. if err != nil {
  382. return nil, err
  383. }
  384. return cres, nil
  385. } else if spec.ResourceQuantity.MRV50 != "" {
  386. tag, err := common2.GetJSONTag(spec, "NvidiaA100")
  387. if err != nil {
  388. return nil, err
  389. }
  390. cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec)
  391. if err != nil {
  392. return nil, err
  393. }
  394. return cres, nil
  395. } else if spec.ResourceQuantity.BIV100 != "" {
  396. tag, err := common2.GetJSONTag(spec, "NvidiaA100")
  397. if err != nil {
  398. return nil, err
  399. }
  400. cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec)
  401. if err != nil {
  402. return nil, err
  403. }
  404. return cres, nil
  405. } else if spec.ResourceQuantity.BIV150 != "" {
  406. tag, err := common2.GetJSONTag(spec, "NvidiaA100")
  407. if err != nil {
  408. return nil, err
  409. }
  410. cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec)
  411. if err != nil {
  412. return nil, err
  413. }
  414. return cres, nil
  415. } else if spec.ResourceQuantity.MRV100 != "" {
  416. tag, err := common2.GetJSONTag(spec, "NvidiaA100")
  417. if err != nil {
  418. return nil, err
  419. }
  420. cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec)
  421. if err != nil {
  422. return nil, err
  423. }
  424. return cres, nil
  425. } else if spec.ResourceQuantity.CambriconComMlu != "" {
  426. tag, err := common2.GetJSONTag(spec, "NvidiaA100")
  427. if err != nil {
  428. return nil, err
  429. }
  430. cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec)
  431. if err != nil {
  432. return nil, err
  433. }
  434. return cres, nil
  435. } else if spec.ResourceQuantity.HygonComDcu != "" {
  436. tag, err := common2.GetJSONTag(spec, "NvidiaA100")
  437. if err != nil {
  438. return nil, err
  439. }
  440. cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec)
  441. if err != nil {
  442. return nil, err
  443. }
  444. return cres, nil
  445. } else if spec.ResourceQuantity.HuaweiComAscend910 != "" {
  446. tag, err := common2.GetJSONTag(spec, "NvidiaA100")
  447. if err != nil {
  448. return nil, err
  449. }
  450. cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec)
  451. if err != nil {
  452. return nil, err
  453. }
  454. return cres, nil
  455. } else if spec.ResourceQuantity.EnflameComGcu != "" {
  456. tag, err := common2.GetJSONTag(spec, "NvidiaA100")
  457. if err != nil {
  458. return nil, err
  459. }
  460. cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec)
  461. if err != nil {
  462. return nil, err
  463. }
  464. return cres, nil
  465. } else if spec.ResourceQuantity.MXN260 != "" {
  466. tag, err := common2.GetJSONTag(spec, "NvidiaA100")
  467. if err != nil {
  468. return nil, err
  469. }
  470. cres, err := genClusterResources(tag, spec.ResourceQuantity.NvidiaA100, spec)
  471. if err != nil {
  472. return nil, err
  473. }
  474. return cres, nil
  475. }
  476. return nil, nil
  477. }
  478. func genClusterResources(cType string, cNum string, s *omodel.Spec) (*collector.ClusterResource, error) {
  479. cres := &collector.ClusterResource{}
  480. bres := make([]*collector.Usage, 0)
  481. cardNum, err := strconv.ParseInt(cNum, 10, 64)
  482. if err != nil {
  483. return nil, err
  484. }
  485. cpuCore, err := strconv.ParseInt(s.ResourceQuantity.Cpu, 10, 64)
  486. if err != nil {
  487. return nil, err
  488. }
  489. gi := strings.Split(s.ResourceQuantity.Memory, Gi)
  490. if len(gi) != 1 {
  491. return nil, fmt.Errorf("s.ResourceQuantity.Memory convert error: %s", s.ResourceQuantity.Memory)
  492. }
  493. memGi, err := strconv.ParseInt(gi[0], 10, 64)
  494. if err != nil {
  495. return nil, err
  496. }
  497. card := &collector.Usage{
  498. Type: ComputeSourceToCardType[cType],
  499. Name: strings.ToUpper(cType),
  500. Total: &collector.UnitValue{Unit: NUMBER, Value: cardNum},
  501. Available: &collector.UnitValue{Unit: NUMBER, Value: cardNum},
  502. }
  503. cpu := &collector.Usage{
  504. Type: strings.ToUpper(CPU),
  505. Name: strings.ToUpper(CPU),
  506. Total: &collector.UnitValue{Unit: CPUCORE, Value: cpuCore},
  507. Available: &collector.UnitValue{Unit: CPUCORE, Value: cpuCore},
  508. }
  509. mem := &collector.Usage{
  510. Type: strings.ToUpper(MEMORY),
  511. Name: strings.ToUpper(RAM),
  512. Total: &collector.UnitValue{Unit: GIGABYTE, Value: memGi},
  513. Available: &collector.UnitValue{Unit: GIGABYTE, Value: memGi},
  514. }
  515. bres = append(bres, cpu)
  516. bres = append(bres, mem)
  517. cres.Resource = card
  518. cres.BaseResources = bres
  519. return cres, nil
  520. }
  521. // inference
  522. func (o *OctopusHttp) GetClusterInferUrl(ctx context.Context, option *option.InferOption) (*inference.ClusterInferUrl, error) {
  523. return nil, errors.New(NotImplementError)
  524. }
  525. func (o *OctopusHttp) GetInferDeployInstanceList(ctx context.Context) ([]*inference.DeployInstance, error) {
  526. return nil, errors.New(NotImplementError)
  527. }
  528. func (o *OctopusHttp) StartInferDeployInstance(ctx context.Context, id string) bool {
  529. return false
  530. }
  531. func (o *OctopusHttp) StopInferDeployInstance(ctx context.Context, id string) bool {
  532. return false
  533. }
  534. func (o *OctopusHttp) GetInferDeployInstance(ctx context.Context, id string) (*inference.DeployInstance, error) {
  535. return nil, errors.New(NotImplementError)
  536. }
  537. func (o *OctopusHttp) CreateInferDeployInstance(ctx context.Context, option *option.InferOption) (string, error) {
  538. return "", errors.New(NotImplementError)
  539. }
  540. func (o *OctopusHttp) CheckModelExistence(ctx context.Context, modelName string, modelType string) bool {
  541. return false
  542. }
  543. func (o *OctopusHttp) GetImageInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) {
  544. return "", errors.New(NotImplementError)
  545. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive, autonomous, peer-to-peer manner.