You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

octopus.go 17 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680
  1. /*
  2. Copyright (c) [2023] [pcm]
  3. [pcm-coordinator] is licensed under Mulan PSL v2.
  4. You can use this software according to the terms and conditions of the Mulan PSL v2.
  5. You may obtain a copy of Mulan PSL v2 at:
  6. http://license.coscl.org.cn/MulanPSL2
  7. THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
  8. EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
  9. MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
  10. See the Mulan PSL v2 for more details.
  11. */
  12. package storeLink
  13. import (
  14. "context"
  15. "errors"
  16. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers/option"
  17. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/collector"
  18. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
  19. "gitlink.org.cn/JointCloud/pcm-octopus/octopus"
  20. "gitlink.org.cn/JointCloud/pcm-octopus/octopusclient"
  21. "math"
  22. "strconv"
  23. "strings"
  24. )
  25. type OctopusLink struct {
  26. octopusRpc octopusclient.Octopus
  27. pageIndex int32
  28. pageSize int32
  29. platform string
  30. participantId int64
  31. }
  // Constants used by the Octopus adapter.
  const (
  	// Prefixes for the randomly generated image name/version and for
  	// training job names.
  	IMG_NAME_PREFIX    = "oct_"
  	IMG_VERSION_PREFIX = "version_"
  	TASK_NAME_PREFIX   = "trainJob"

  	// RESOURCE_POOL is the resource pool named in spec queries and job
  	// submissions.
  	RESOURCE_POOL = "common-pool"

  	// Platform names. NOTE(review): not referenced in this file; presumably
  	// used elsewhere in the package.
  	HANWUJI   = "hanwuji"
  	SUIYUAN   = "suiyuan"
  	SAILINGSI = "sailingsi"

  	// Compute card identifiers as they appear in resource spec names.
  	MLU = "MLU"
  	GCU = "GCU"

  	// Vendor aliases looked up in preset image names.
  	CAMBRICON = "cambricon"
  	ENFLAME   = "enflame"

  	// Per-card TOPS figures reported as TOpsAtFp16, and the base TOPS unit
  	// used to turn a TOPS request into a card count.
  	CAMBRICONMLU290 = 256
  	EnflameT20      = 128
  	BASE_TOPS       = 128

  	// TRAIN_CMD is the default training command; VERSION is the dataset and
  	// algorithm version sent with every training job.
  	TRAIN_CMD = "cd /code; python train.py"
  	VERSION   = "V1"
  )
  50. var (
  51. cardAliasMap = map[string]string{
  52. MLU: CAMBRICON,
  53. GCU: ENFLAME,
  54. }
  55. cardTopsMap = map[string]float64{
  56. MLU: CAMBRICONMLU290,
  57. GCU: EnflameT20,
  58. }
  59. )
  60. func NewOctopusLink(octopusRpc octopusclient.Octopus, name string, id int64) *OctopusLink {
  61. return &OctopusLink{octopusRpc: octopusRpc, platform: name, participantId: id, pageIndex: 1, pageSize: 100}
  62. }
  63. func (o *OctopusLink) UploadImage(ctx context.Context, path string) (interface{}, error) {
  64. // octopus创建镜像
  65. createReq := &octopus.CreateImageReq{
  66. Platform: o.platform,
  67. CreateImage: &octopus.CreateImage{
  68. SourceType: 1,
  69. ImageName: IMG_NAME_PREFIX + utils.RandomString(7),
  70. ImageVersion: IMG_VERSION_PREFIX + utils.RandomString(7),
  71. },
  72. }
  73. createResp, err := o.octopusRpc.CreateImage(ctx, createReq)
  74. if err != nil {
  75. return nil, err
  76. }
  77. // octopus上传镜像
  78. uploadReq := &octopus.UploadImageReq{
  79. Platform: o.platform,
  80. ImageId: createResp.Payload.ImageId,
  81. Params: &octopus.UploadImageParam{
  82. Domain: "",
  83. FileName: "",
  84. },
  85. }
  86. uploadResp, err := o.octopusRpc.UploadImage(ctx, uploadReq)
  87. if err != nil {
  88. return nil, err
  89. }
  90. // Todo 实际上传
  91. return uploadResp, nil
  92. }
  93. func (o *OctopusLink) DeleteImage(ctx context.Context, imageId string) (interface{}, error) {
  94. // octopus删除镜像
  95. req := &octopus.DeleteImageReq{
  96. Platform: o.platform,
  97. ImageId: imageId,
  98. }
  99. resp, err := o.octopusRpc.DeleteImage(ctx, req)
  100. if err != nil {
  101. return nil, err
  102. }
  103. return resp, nil
  104. }
  105. func (o *OctopusLink) QueryImageList(ctx context.Context) (interface{}, error) {
  106. // octopus获取镜像列表
  107. req := &octopus.GetUserImageListReq{
  108. Platform: o.platform,
  109. PageIndex: o.pageIndex,
  110. PageSize: o.pageSize,
  111. }
  112. resp, err := o.octopusRpc.GetUserImageList(ctx, req)
  113. if err != nil {
  114. return nil, err
  115. }
  116. return resp, nil
  117. }
  118. func (o *OctopusLink) SubmitTask(ctx context.Context, imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string, aiType string) (interface{}, error) {
  119. // octopus提交任务
  120. // python参数
  121. var prms []*octopus.Parameters
  122. for _, param := range params {
  123. var p octopus.Parameters
  124. s := strings.Split(param, COMMA)
  125. p.Key = s[0]
  126. p.Value = s[1]
  127. prms = append(prms, &p)
  128. }
  129. //环境变量
  130. envMap := make(map[string]string)
  131. for _, env := range envs {
  132. s := strings.Split(env, COMMA)
  133. envMap[s[0]] = s[1]
  134. }
  135. req := &octopus.CreateTrainJobReq{
  136. Platform: o.platform,
  137. Params: &octopus.CreateTrainJobParam{
  138. ImageId: imageId,
  139. Name: TASK_NAME_PREFIX + UNDERSCORE + utils.RandomString(10),
  140. ResourcePool: RESOURCE_POOL,
  141. Config: []*octopus.Config{
  142. {
  143. Command: cmd,
  144. ResourceSpecId: resourceId,
  145. MinFailedTaskCount: 1,
  146. MinSucceededTaskCount: 1,
  147. TaskNumber: 1,
  148. Parameters: prms,
  149. Envs: envMap,
  150. },
  151. },
  152. DataSetId: datasetsId,
  153. DataSetVersion: VERSION,
  154. AlgorithmId: algorithmId,
  155. AlgorithmVersion: VERSION,
  156. },
  157. }
  158. resp, err := o.octopusRpc.CreateTrainJob(ctx, req)
  159. if err != nil {
  160. return nil, err
  161. }
  162. return resp, nil
  163. }
  164. func (o *OctopusLink) QueryTask(ctx context.Context, taskId string) (interface{}, error) {
  165. // octopus获取任务
  166. req := &octopus.GetTrainJobReq{
  167. Platform: o.platform,
  168. Id: taskId,
  169. }
  170. resp, err := o.octopusRpc.GetTrainJob(ctx, req)
  171. if err != nil {
  172. return nil, err
  173. }
  174. return resp, nil
  175. }
  176. func (o *OctopusLink) DeleteTask(ctx context.Context, taskId string) (interface{}, error) {
  177. // octopus删除任务
  178. req := &octopus.DeleteTrainJobReq{
  179. Platform: o.platform,
  180. JobIds: []string{taskId},
  181. }
  182. resp, err := o.octopusRpc.DeleteTrainJob(ctx, req)
  183. if err != nil {
  184. return nil, err
  185. }
  186. return resp, nil
  187. }
  188. func (o *OctopusLink) QuerySpecs(ctx context.Context) (interface{}, error) {
  189. // octopus查询资源规格
  190. req := &octopus.GetResourceSpecsReq{
  191. Platform: o.platform,
  192. ResourcePool: RESOURCE_POOL,
  193. }
  194. resp, err := o.octopusRpc.GetResourceSpecs(ctx, req)
  195. if err != nil {
  196. return nil, err
  197. }
  198. return resp, nil
  199. }
  200. func (o *OctopusLink) GetResourceStats(ctx context.Context) (*collector.ResourceStats, error) {
  201. req := &octopus.GetResourceSpecsReq{
  202. Platform: o.platform,
  203. ResourcePool: RESOURCE_POOL,
  204. }
  205. specResp, err := o.octopusRpc.GetResourceSpecs(ctx, req)
  206. if err != nil {
  207. return nil, err
  208. }
  209. if !specResp.Success {
  210. return nil, errors.New(specResp.Error.Message)
  211. }
  212. balanceReq := &octopus.GetUserBalanceReq{
  213. Platform: o.platform,
  214. }
  215. balanceResp, err := o.octopusRpc.GetUserBalance(ctx, balanceReq)
  216. if err != nil {
  217. return nil, err
  218. }
  219. if !balanceResp.Success {
  220. return nil, errors.New(balanceResp.Error.Message)
  221. }
  222. var cards []*collector.Card
  223. balance := float64(balanceResp.Payload.BillingUser.Amount)
  224. var cpuHours float64
  225. for _, spec := range specResp.TrainResourceSpecs {
  226. if spec.Price == 0 {
  227. ns := strings.Split(spec.Name, COMMA)
  228. if len(ns) == 2 {
  229. nss := strings.Split(ns[0], COLON)
  230. if nss[0] == CPU {
  231. cpuHours = -1
  232. }
  233. }
  234. }
  235. if spec.Price == 1 {
  236. ns := strings.Split(spec.Name, COMMA)
  237. cardSpecs := strings.Split(ns[0], STAR)
  238. cardTops, isMapContainsKey := cardTopsMap[cardSpecs[1]]
  239. if !isMapContainsKey {
  240. continue
  241. }
  242. card := &collector.Card{
  243. Platform: OCTOPUS,
  244. Type: CARD,
  245. Name: cardSpecs[1],
  246. TOpsAtFp16: cardTops,
  247. CardHours: balance / spec.Price,
  248. }
  249. cards = append(cards, card)
  250. }
  251. }
  252. resourceStats := &collector.ResourceStats{
  253. ClusterId: strconv.FormatInt(o.participantId, 10),
  254. Name: o.platform,
  255. Balance: balance,
  256. CardsAvail: cards,
  257. CpuCoreHours: cpuHours,
  258. }
  259. return resourceStats, nil
  260. }
  261. func (o *OctopusLink) GetDatasetsSpecs(ctx context.Context) ([]*collector.DatasetsSpecs, error) {
  262. req := &octopus.GetMyDatasetListReq{
  263. Platform: o.platform,
  264. PageIndex: o.pageIndex,
  265. PageSize: o.pageSize,
  266. }
  267. resp, err := o.octopusRpc.GetMyDatasetList(ctx, req)
  268. if err != nil {
  269. return nil, err
  270. }
  271. if !resp.Success {
  272. return nil, errors.New(resp.Error.Message)
  273. }
  274. specs := []*collector.DatasetsSpecs{}
  275. for _, dataset := range resp.Payload.Datasets {
  276. spec := &collector.DatasetsSpecs{Name: dataset.Name}
  277. specs = append(specs, spec)
  278. }
  279. return specs, nil
  280. }
  281. func (o *OctopusLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm, error) {
  282. var algorithms []*collector.Algorithm
  283. req := &octopus.GetMyAlgorithmListReq{
  284. Platform: o.platform,
  285. PageIndex: o.pageIndex,
  286. PageSize: o.pageSize,
  287. }
  288. resp, err := o.octopusRpc.GetMyAlgorithmList(ctx, req)
  289. if err != nil {
  290. return nil, err
  291. }
  292. if !resp.Success {
  293. return nil, errors.New("failed to get algorithms")
  294. }
  295. for _, a := range resp.Payload.Algorithms {
  296. algorithm := &collector.Algorithm{Name: a.AlgorithmName, Platform: OCTOPUS, TaskType: strings.ToLower(a.FrameworkName)}
  297. algorithms = append(algorithms, algorithm)
  298. }
  299. return algorithms, nil
  300. }
  301. func (o *OctopusLink) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) {
  302. instance, err := strconv.ParseInt(instanceNum, 10, 32)
  303. if err != nil {
  304. return "", err
  305. }
  306. req := &octopus.GetTrainJobLogReq{
  307. Platform: o.platform,
  308. TaskId: taskId,
  309. TaskNum: "task0",
  310. Num: int32(instance),
  311. }
  312. resp, err := o.octopusRpc.GetTrainJobLog(ctx, req)
  313. if err != nil {
  314. return "", err
  315. }
  316. return resp.Content, nil
  317. }
  318. func (o *OctopusLink) Execute(ctx context.Context, option *option.AiOption) (interface{}, error) {
  319. err := o.GenerateSubmitParams(ctx, option)
  320. if err != nil {
  321. return nil, err
  322. }
  323. task, err := o.SubmitTask(ctx, option.ImageId, option.Cmd, option.Envs, option.Params, option.ResourceId, option.DatasetsId, option.AlgorithmId, option.TaskType)
  324. if err != nil {
  325. return nil, err
  326. }
  327. return task, nil
  328. }
  329. func (o *OctopusLink) GenerateSubmitParams(ctx context.Context, option *option.AiOption) error {
  330. err := o.generateResourceId(ctx, option)
  331. if err != nil {
  332. return err
  333. }
  334. err = o.generateDatasetsId(ctx, option)
  335. if err != nil {
  336. return err
  337. }
  338. err = o.generateImageId(ctx, option)
  339. if err != nil {
  340. return err
  341. }
  342. err = o.generateAlgorithmId(ctx, option)
  343. if err != nil {
  344. return err
  345. }
  346. err = o.generateCmd(option)
  347. if err != nil {
  348. return err
  349. }
  350. err = o.generateEnv(option)
  351. if err != nil {
  352. return err
  353. }
  354. err = o.generateParams(option)
  355. if err != nil {
  356. return err
  357. }
  358. return nil
  359. }
  360. func (o *OctopusLink) generateResourceId(ctx context.Context, option *option.AiOption) error {
  361. if option.ResourceType == "" {
  362. return errors.New("ResourceType not set")
  363. }
  364. req := &octopus.GetResourceSpecsReq{
  365. Platform: o.platform,
  366. ResourcePool: RESOURCE_POOL,
  367. }
  368. specResp, err := o.octopusRpc.GetResourceSpecs(ctx, req)
  369. if err != nil {
  370. return err
  371. }
  372. if !specResp.Success {
  373. return errors.New(specResp.Error.Message)
  374. }
  375. if option.ResourceType == CPU {
  376. for _, spec := range specResp.TrainResourceSpecs {
  377. if spec.Price == 0 {
  378. option.ResourceId = spec.Id
  379. return nil
  380. }
  381. }
  382. }
  383. if option.ResourceType == CARD {
  384. err = setResourceIdByCard(option, specResp, GCU)
  385. if err != nil {
  386. return err
  387. }
  388. return nil
  389. }
  390. return errors.New("failed to get ResourceId")
  391. }
  392. func (o *OctopusLink) generateDatasetsId(ctx context.Context, option *option.AiOption) error {
  393. if option.DatasetsName == "" {
  394. return errors.New("DatasetsName not set")
  395. }
  396. req := &octopus.GetMyDatasetListReq{
  397. Platform: o.platform,
  398. PageIndex: o.pageIndex,
  399. PageSize: o.pageSize,
  400. }
  401. resp, err := o.octopusRpc.GetMyDatasetList(ctx, req)
  402. if err != nil {
  403. return err
  404. }
  405. if !resp.Success {
  406. return errors.New("failed to get DatasetsId")
  407. }
  408. for _, dataset := range resp.Payload.Datasets {
  409. if dataset.Name == option.DatasetsName {
  410. option.DatasetsId = dataset.Id
  411. return nil
  412. }
  413. }
  414. return errors.New("failed to get DatasetsId")
  415. }
  416. func (o *OctopusLink) generateImageId(ctx context.Context, option *option.AiOption) error {
  417. if option.TaskType == "" {
  418. return errors.New("TaskType not set")
  419. }
  420. req := &octopus.GetUserImageListReq{
  421. Platform: o.platform,
  422. PageIndex: o.pageIndex,
  423. PageSize: o.pageSize,
  424. }
  425. resp, err := o.octopusRpc.GetUserImageList(ctx, req)
  426. if err != nil {
  427. return err
  428. }
  429. if !resp.Success {
  430. return errors.New("failed to get imageId")
  431. }
  432. if option.ResourceType == CPU {
  433. for _, img := range resp.Payload.Images {
  434. if img.Image.ImageName == "test-image" {
  435. option.ImageId = img.Image.Id
  436. return nil
  437. }
  438. }
  439. }
  440. preImgReq := &octopus.GetPresetImageListReq{
  441. Platform: o.platform,
  442. PageIndex: o.pageIndex,
  443. PageSize: o.pageSize,
  444. }
  445. preImgResp, err := o.octopusRpc.GetPresetImageList(ctx, preImgReq)
  446. if err != nil {
  447. return err
  448. }
  449. if !preImgResp.Success {
  450. return errors.New("failed to get PresetImages")
  451. }
  452. if option.ResourceType == CARD {
  453. for _, image := range preImgResp.Payload.Images {
  454. if strings.Contains(image.ImageName, cardAliasMap[option.ComputeCard]) {
  455. option.ImageId = image.Id
  456. return nil
  457. }
  458. }
  459. }
  460. return errors.New("failed to get ImageId")
  461. }
  462. func (o *OctopusLink) generateAlgorithmId(ctx context.Context, option *option.AiOption) error {
  463. // temporarily set algorithm to cnn
  464. if option.AlgorithmName == "" {
  465. switch option.DatasetsName {
  466. case "cifar10":
  467. option.AlgorithmName = "cnn"
  468. case "mnist":
  469. option.AlgorithmName = "fcn"
  470. }
  471. }
  472. req := &octopus.GetMyAlgorithmListReq{
  473. Platform: o.platform,
  474. PageIndex: o.pageIndex,
  475. PageSize: o.pageSize,
  476. }
  477. resp, err := o.octopusRpc.GetMyAlgorithmList(ctx, req)
  478. if err != nil {
  479. return err
  480. }
  481. if !resp.Success {
  482. return errors.New("failed to get algorithmId")
  483. }
  484. for _, algorithm := range resp.Payload.Algorithms {
  485. if algorithm.FrameworkName == strings.Title(option.TaskType) {
  486. ns := strings.Split(algorithm.AlgorithmName, UNDERSCORE)
  487. if ns[0] != option.DatasetsName {
  488. continue
  489. }
  490. if ns[1] != option.AlgorithmName {
  491. continue
  492. }
  493. switch option.ResourceType {
  494. case CPU:
  495. if ns[2] != CPU {
  496. continue
  497. }
  498. case CARD:
  499. if ns[2] != strings.ToLower(option.ComputeCard) {
  500. continue
  501. }
  502. }
  503. option.AlgorithmId = algorithm.AlgorithmId
  504. return nil
  505. }
  506. }
  507. if option.AlgorithmId == "" {
  508. return errors.New("Algorithm does not exist")
  509. }
  510. return errors.New("failed to get AlgorithmId")
  511. }
  512. func (o *OctopusLink) generateCmd(option *option.AiOption) error {
  513. if option.Cmd == "" {
  514. switch option.ComputeCard {
  515. case GCU:
  516. option.Cmd = "cd /code; python3 train.py"
  517. default:
  518. option.Cmd = TRAIN_CMD
  519. }
  520. }
  521. return nil
  522. }
  523. func (o *OctopusLink) generateEnv(option *option.AiOption) error {
  524. return nil
  525. }
  526. func (o *OctopusLink) generateParams(option *option.AiOption) error {
  527. if len(option.Params) == 0 {
  528. epoch := "epoch" + COMMA + "1"
  529. option.Params = append(option.Params, epoch)
  530. }
  531. return nil
  532. }
  533. func setResourceIdByCard(option *option.AiOption, specs *octopus.GetResourceSpecsResp, computeCard string) error {
  534. if option.Tops == 0 {
  535. for _, spec := range specs.TrainResourceSpecs {
  536. if spec.Price == 1 {
  537. ns := strings.Split(spec.Name, COMMA)
  538. cardSpecs := strings.Split(ns[0], STAR)
  539. if cardSpecs[1] == computeCard {
  540. option.ResourceId = spec.Id
  541. option.ComputeCard = computeCard
  542. return nil
  543. }
  544. } else {
  545. continue
  546. }
  547. }
  548. } else {
  549. cardNum := math.Ceil(option.Tops / float64(BASE_TOPS))
  550. for _, spec := range specs.TrainResourceSpecs {
  551. if option.Tops < BASE_TOPS {
  552. if spec.Price == 1 {
  553. ns := strings.Split(spec.Name, COMMA)
  554. cardSpecs := strings.Split(ns[0], STAR)
  555. if cardSpecs[1] == computeCard {
  556. option.ResourceId = spec.Id
  557. option.ComputeCard = computeCard
  558. return nil
  559. }
  560. } else {
  561. continue
  562. }
  563. } else {
  564. ns := strings.Split(spec.Name, COMMA)
  565. if len(ns) != 4 {
  566. continue
  567. }
  568. cardSpecs := strings.Split(ns[0], STAR)
  569. if cardSpecs[1] != computeCard {
  570. continue
  571. }
  572. s, err := strconv.ParseFloat(cardSpecs[0], 64)
  573. if err != nil {
  574. return err
  575. }
  576. switch computeCard {
  577. case GCU:
  578. if cardNum == s { // 1, 4, 8
  579. option.ResourceId = spec.Id
  580. option.ComputeCard = computeCard
  581. return nil
  582. }
  583. if 1 < cardNum && cardNum <= 4 && s == 4 {
  584. option.ResourceId = spec.Id
  585. option.ComputeCard = computeCard
  586. return nil
  587. }
  588. if 4 < cardNum && s == 8 {
  589. option.ResourceId = spec.Id
  590. option.ComputeCard = computeCard
  591. return nil
  592. }
  593. case MLU: // 1, 2, 4
  594. if cardNum/2 == s {
  595. option.ResourceId = spec.Id
  596. option.ComputeCard = computeCard
  597. return nil
  598. }
  599. if 1 < cardNum/2 && cardNum/2 <= 2 && s == 2 {
  600. option.ResourceId = spec.Id
  601. option.ComputeCard = computeCard
  602. return nil
  603. }
  604. if 2 < cardNum/2 && s == 4 {
  605. option.ResourceId = spec.Id
  606. option.ComputeCard = computeCard
  607. return nil
  608. }
  609. }
  610. }
  611. }
  612. }
  613. return errors.New("set ResourceId error")
  614. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive and autonomous peer-to-peer manner.