You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

octopus.go 16 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661
  1. /*
  2. Copyright (c) [2023] [pcm]
  3. [pcm-coordinator] is licensed under Mulan PSL v2.
  4. You can use this software according to the terms and conditions of the Mulan PSL v2.
  5. You may obtain a copy of Mulan PSL v2 at:
  6. http://license.coscl.org.cn/MulanPSL2
  7. THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
  8. EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
  9. MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
  10. See the Mulan PSL v2 for more details.
  11. */
  12. package storeLink
  13. import (
  14. "context"
  15. "errors"
  16. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers/option"
  17. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/collector"
  18. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
  19. "gitlink.org.cn/jcce-pcm/pcm-participant-octopus/octopus"
  20. "gitlink.org.cn/jcce-pcm/pcm-participant-octopus/octopusclient"
  21. "math"
  22. "strconv"
  23. "strings"
  24. )
  25. type OctopusLink struct {
  26. octopusRpc octopusclient.Octopus
  27. pageIndex int32
  28. pageSize int32
  29. platform string
  30. participantId int64
  31. }
  // Naming and submission defaults used when creating images and train jobs.
  const (
  	IMG_NAME_PREFIX    = "oct_"     // prefix of generated image names
  	IMG_VERSION_PREFIX = "version_" // prefix of generated image versions
  	TASK_NAME_PREFIX   = "trainJob" // prefix of generated train job names
  	RESOURCE_POOL      = "common-pool"
  	TRAIN_CMD          = "cd /code; python train.py" // default training command
  	VERSION            = "V1"                        // dataset and algorithm version sent on submit
  )

  // Accelerator-related identifiers.
  const (
  	HANWUJI   = "hanwuji"
  	SUIYUAN   = "suiyuan"
  	SAILINGSI = "sailingsi"
  	MLU       = "MLU"
  	GCU       = "GCU"
  	CAMBRICON = "cambricon" // alias registered for MLU in cardAliasMap
  	ENFLAME   = "enflame"   // alias registered for GCU in cardAliasMap
  )

  // TOPS figures; kept untyped so they can be used as float64 or integer values.
  const (
  	CAMBRICONMLU290 = 256 // value registered for MLU in cardTopsMap
  	EnflameT20      = 128 // value registered for GCU in cardTopsMap
  	BASE_TOPS       = 128 // divisor used to turn requested TOPS into a card count
  )
  50. var (
  51. cardAliasMap = map[string]string{
  52. MLU: CAMBRICON,
  53. GCU: ENFLAME,
  54. }
  55. cardTopsMap = map[string]float64{
  56. MLU: CAMBRICONMLU290,
  57. GCU: EnflameT20,
  58. }
  59. )
  60. func NewOctopusLink(octopusRpc octopusclient.Octopus, name string, id int64) *OctopusLink {
  61. return &OctopusLink{octopusRpc: octopusRpc, platform: name, participantId: id, pageIndex: 1, pageSize: 100}
  62. }
  63. func (o *OctopusLink) UploadImage(ctx context.Context, path string) (interface{}, error) {
  64. // octopus创建镜像
  65. createReq := &octopus.CreateImageReq{
  66. Platform: o.platform,
  67. CreateImage: &octopus.CreateImage{
  68. SourceType: 1,
  69. ImageName: IMG_NAME_PREFIX + utils.RandomString(7),
  70. ImageVersion: IMG_VERSION_PREFIX + utils.RandomString(7),
  71. },
  72. }
  73. createResp, err := o.octopusRpc.CreateImage(ctx, createReq)
  74. if err != nil {
  75. return nil, err
  76. }
  77. // octopus上传镜像
  78. uploadReq := &octopus.UploadImageReq{
  79. Platform: o.platform,
  80. ImageId: createResp.Payload.ImageId,
  81. Params: &octopus.UploadImageParam{
  82. Domain: "",
  83. FileName: "",
  84. },
  85. }
  86. uploadResp, err := o.octopusRpc.UploadImage(ctx, uploadReq)
  87. if err != nil {
  88. return nil, err
  89. }
  90. // Todo 实际上传
  91. return uploadResp, nil
  92. }
  93. func (o *OctopusLink) DeleteImage(ctx context.Context, imageId string) (interface{}, error) {
  94. // octopus删除镜像
  95. req := &octopus.DeleteImageReq{
  96. Platform: o.platform,
  97. ImageId: imageId,
  98. }
  99. resp, err := o.octopusRpc.DeleteImage(ctx, req)
  100. if err != nil {
  101. return nil, err
  102. }
  103. return resp, nil
  104. }
  105. func (o *OctopusLink) QueryImageList(ctx context.Context) (interface{}, error) {
  106. // octopus获取镜像列表
  107. req := &octopus.GetUserImageListReq{
  108. Platform: o.platform,
  109. PageIndex: o.pageIndex,
  110. PageSize: o.pageSize,
  111. }
  112. resp, err := o.octopusRpc.GetUserImageList(ctx, req)
  113. if err != nil {
  114. return nil, err
  115. }
  116. return resp, nil
  117. }
  118. func (o *OctopusLink) SubmitTask(ctx context.Context, imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string, aiType string) (interface{}, error) {
  119. // octopus提交任务
  120. // python参数
  121. var prms []*octopus.Parameters
  122. for _, param := range params {
  123. var p octopus.Parameters
  124. s := strings.Split(param, COMMA)
  125. p.Key = s[0]
  126. p.Value = s[1]
  127. prms = append(prms, &p)
  128. }
  129. //环境变量
  130. envMap := make(map[string]string)
  131. for _, env := range envs {
  132. s := strings.Split(env, COMMA)
  133. envMap[s[0]] = s[1]
  134. }
  135. req := &octopus.CreateTrainJobReq{
  136. Platform: o.platform,
  137. Params: &octopus.CreateTrainJobParam{
  138. ImageId: imageId,
  139. Name: TASK_NAME_PREFIX + UNDERSCORE + utils.RandomString(10),
  140. ResourcePool: RESOURCE_POOL,
  141. Config: []*octopus.Config{
  142. {
  143. Command: cmd,
  144. ResourceSpecId: resourceId,
  145. MinFailedTaskCount: 1,
  146. MinSucceededTaskCount: 1,
  147. TaskNumber: 1,
  148. Parameters: prms,
  149. Envs: envMap,
  150. },
  151. },
  152. DataSetId: datasetsId,
  153. DataSetVersion: VERSION,
  154. AlgorithmId: algorithmId,
  155. AlgorithmVersion: VERSION,
  156. },
  157. }
  158. resp, err := o.octopusRpc.CreateTrainJob(ctx, req)
  159. if err != nil {
  160. return nil, err
  161. }
  162. return resp, nil
  163. }
  164. func (o *OctopusLink) QueryTask(ctx context.Context, taskId string) (interface{}, error) {
  165. // octopus获取任务
  166. req := &octopus.GetTrainJobReq{
  167. Platform: o.platform,
  168. Id: taskId,
  169. }
  170. resp, err := o.octopusRpc.GetTrainJob(ctx, req)
  171. if err != nil {
  172. return nil, err
  173. }
  174. return resp, nil
  175. }
  176. func (o *OctopusLink) DeleteTask(ctx context.Context, taskId string) (interface{}, error) {
  177. // octopus删除任务
  178. req := &octopus.DeleteTrainJobReq{
  179. Platform: o.platform,
  180. JobIds: []string{taskId},
  181. }
  182. resp, err := o.octopusRpc.DeleteTrainJob(ctx, req)
  183. if err != nil {
  184. return nil, err
  185. }
  186. return resp, nil
  187. }
  188. func (o *OctopusLink) QuerySpecs(ctx context.Context) (interface{}, error) {
  189. // octopus查询资源规格
  190. req := &octopus.GetResourceSpecsReq{
  191. Platform: o.platform,
  192. ResourcePool: RESOURCE_POOL,
  193. }
  194. resp, err := o.octopusRpc.GetResourceSpecs(ctx, req)
  195. if err != nil {
  196. return nil, err
  197. }
  198. return resp, nil
  199. }
  200. func (o *OctopusLink) GetResourceStats(ctx context.Context) (*collector.ResourceStats, error) {
  201. req := &octopus.GetResourceSpecsReq{
  202. Platform: o.platform,
  203. ResourcePool: RESOURCE_POOL,
  204. }
  205. specResp, err := o.octopusRpc.GetResourceSpecs(ctx, req)
  206. if err != nil {
  207. return nil, err
  208. }
  209. if !specResp.Success {
  210. return nil, errors.New(specResp.Error.Message)
  211. }
  212. balanceReq := &octopus.GetUserBalanceReq{
  213. Platform: o.platform,
  214. }
  215. balanceResp, err := o.octopusRpc.GetUserBalance(ctx, balanceReq)
  216. if err != nil {
  217. return nil, err
  218. }
  219. if !balanceResp.Success {
  220. return nil, errors.New(balanceResp.Error.Message)
  221. }
  222. var cards []*collector.Card
  223. balance := float64(balanceResp.Payload.BillingUser.Amount)
  224. var cpuHours float64
  225. for _, spec := range specResp.TrainResourceSpecs {
  226. if spec.Price == 0 {
  227. ns := strings.Split(spec.Name, COMMA)
  228. if len(ns) == 2 {
  229. nss := strings.Split(ns[0], COLON)
  230. if nss[0] == CPU {
  231. cpuHours = -1
  232. }
  233. }
  234. }
  235. if spec.Price == 1 {
  236. ns := strings.Split(spec.Name, COMMA)
  237. cardSpecs := strings.Split(ns[0], STAR)
  238. cardTops, isMapContainsKey := cardTopsMap[cardSpecs[1]]
  239. if !isMapContainsKey {
  240. continue
  241. }
  242. card := &collector.Card{
  243. Platform: OCTOPUS,
  244. Type: CARD,
  245. Name: cardSpecs[1],
  246. TOpsAtFp16: cardTops,
  247. CardHours: balance / spec.Price,
  248. }
  249. cards = append(cards, card)
  250. }
  251. }
  252. resourceStats := &collector.ResourceStats{
  253. ParticipantId: o.participantId,
  254. Name: o.platform,
  255. Balance: balance,
  256. CardsAvail: cards,
  257. CpuCoreHours: cpuHours,
  258. }
  259. return resourceStats, nil
  260. }
  261. func (o *OctopusLink) GetDatasetsSpecs(ctx context.Context) ([]*collector.DatasetsSpecs, error) {
  262. req := &octopus.GetMyDatasetListReq{
  263. Platform: o.platform,
  264. PageIndex: o.pageIndex,
  265. PageSize: o.pageSize,
  266. }
  267. resp, err := o.octopusRpc.GetMyDatasetList(ctx, req)
  268. if err != nil {
  269. return nil, err
  270. }
  271. if !resp.Success {
  272. return nil, errors.New(resp.Error.Message)
  273. }
  274. specs := []*collector.DatasetsSpecs{}
  275. for _, dataset := range resp.Payload.Datasets {
  276. spec := &collector.DatasetsSpecs{Name: dataset.Name}
  277. specs = append(specs, spec)
  278. }
  279. return specs, nil
  280. }
  281. func (o *OctopusLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm, error) {
  282. var algorithms []*collector.Algorithm
  283. req := &octopus.GetMyAlgorithmListReq{
  284. Platform: o.platform,
  285. PageIndex: o.pageIndex,
  286. PageSize: o.pageSize,
  287. }
  288. resp, err := o.octopusRpc.GetMyAlgorithmList(ctx, req)
  289. if err != nil {
  290. return nil, err
  291. }
  292. if !resp.Success {
  293. return nil, errors.New("failed to get algorithms")
  294. }
  295. for _, a := range resp.Payload.Algorithms {
  296. algorithm := &collector.Algorithm{Name: a.AlgorithmName, Platform: OCTOPUS, TaskType: strings.ToLower(a.FrameworkName)}
  297. algorithms = append(algorithms, algorithm)
  298. }
  299. return algorithms, nil
  300. }
  301. func (o *OctopusLink) Execute(ctx context.Context, option *option.AiOption) (interface{}, error) {
  302. err := o.GenerateSubmitParams(ctx, option)
  303. if err != nil {
  304. return nil, err
  305. }
  306. task, err := o.SubmitTask(ctx, option.ImageId, option.Cmd, option.Envs, option.Params, option.ResourceId, option.DatasetsId, option.AlgorithmId, option.TaskType)
  307. if err != nil {
  308. return nil, err
  309. }
  310. return task, nil
  311. }
  312. func (o *OctopusLink) GenerateSubmitParams(ctx context.Context, option *option.AiOption) error {
  313. err := o.generateResourceId(ctx, option)
  314. if err != nil {
  315. return err
  316. }
  317. err = o.generateDatasetsId(ctx, option)
  318. if err != nil {
  319. return err
  320. }
  321. err = o.generateImageId(ctx, option)
  322. if err != nil {
  323. return err
  324. }
  325. err = o.generateAlgorithmId(ctx, option)
  326. if err != nil {
  327. return err
  328. }
  329. err = o.generateCmd(option)
  330. if err != nil {
  331. return err
  332. }
  333. err = o.generateEnv(option)
  334. if err != nil {
  335. return err
  336. }
  337. err = o.generateParams(option)
  338. if err != nil {
  339. return err
  340. }
  341. return nil
  342. }
  343. func (o *OctopusLink) generateResourceId(ctx context.Context, option *option.AiOption) error {
  344. if option.ResourceType == "" {
  345. return errors.New("ResourceType not set")
  346. }
  347. req := &octopus.GetResourceSpecsReq{
  348. Platform: o.platform,
  349. ResourcePool: RESOURCE_POOL,
  350. }
  351. specResp, err := o.octopusRpc.GetResourceSpecs(ctx, req)
  352. if err != nil {
  353. return err
  354. }
  355. if !specResp.Success {
  356. return errors.New(specResp.Error.Message)
  357. }
  358. if option.ResourceType == CPU {
  359. for _, spec := range specResp.TrainResourceSpecs {
  360. if spec.Price == 0 {
  361. option.ResourceId = spec.Id
  362. return nil
  363. }
  364. }
  365. }
  366. if option.ResourceType == CARD {
  367. err = setResourceIdByCard(option, specResp, GCU)
  368. if err != nil {
  369. return err
  370. }
  371. return nil
  372. }
  373. return errors.New("failed to get ResourceId")
  374. }
  375. func (o *OctopusLink) generateDatasetsId(ctx context.Context, option *option.AiOption) error {
  376. if option.DatasetsName == "" {
  377. return errors.New("DatasetsName not set")
  378. }
  379. req := &octopus.GetMyDatasetListReq{
  380. Platform: o.platform,
  381. PageIndex: o.pageIndex,
  382. PageSize: o.pageSize,
  383. }
  384. resp, err := o.octopusRpc.GetMyDatasetList(ctx, req)
  385. if err != nil {
  386. return err
  387. }
  388. if !resp.Success {
  389. return errors.New("failed to get DatasetsId")
  390. }
  391. for _, dataset := range resp.Payload.Datasets {
  392. if dataset.Name == option.DatasetsName {
  393. option.DatasetsId = dataset.Id
  394. return nil
  395. }
  396. }
  397. return errors.New("failed to get DatasetsId")
  398. }
  399. func (o *OctopusLink) generateImageId(ctx context.Context, option *option.AiOption) error {
  400. if option.TaskType == "" {
  401. return errors.New("TaskType not set")
  402. }
  403. req := &octopus.GetUserImageListReq{
  404. Platform: o.platform,
  405. PageIndex: o.pageIndex,
  406. PageSize: o.pageSize,
  407. }
  408. resp, err := o.octopusRpc.GetUserImageList(ctx, req)
  409. if err != nil {
  410. return err
  411. }
  412. if !resp.Success {
  413. return errors.New("failed to get imageId")
  414. }
  415. if option.ResourceType == CPU {
  416. for _, img := range resp.Payload.Images {
  417. if img.Image.ImageName == "test-image" {
  418. option.ImageId = img.Image.Id
  419. return nil
  420. }
  421. }
  422. }
  423. preImgReq := &octopus.GetPresetImageListReq{
  424. Platform: o.platform,
  425. PageIndex: o.pageIndex,
  426. PageSize: o.pageSize,
  427. }
  428. preImgResp, err := o.octopusRpc.GetPresetImageList(ctx, preImgReq)
  429. if err != nil {
  430. return err
  431. }
  432. if !preImgResp.Success {
  433. return errors.New("failed to get PresetImages")
  434. }
  435. if option.ResourceType == CARD {
  436. for _, image := range preImgResp.Payload.Images {
  437. if strings.Contains(image.ImageName, cardAliasMap[option.ComputeCard]) {
  438. option.ImageId = image.Id
  439. return nil
  440. }
  441. }
  442. }
  443. return errors.New("failed to get ImageId")
  444. }
  445. func (o *OctopusLink) generateAlgorithmId(ctx context.Context, option *option.AiOption) error {
  446. // temporarily set algorithm to cnn
  447. if option.AlgorithmName == "" {
  448. switch option.DatasetsName {
  449. case "cifar10":
  450. option.AlgorithmName = "cnn"
  451. case "mnist":
  452. option.AlgorithmName = "fcn"
  453. }
  454. }
  455. req := &octopus.GetMyAlgorithmListReq{
  456. Platform: o.platform,
  457. PageIndex: o.pageIndex,
  458. PageSize: o.pageSize,
  459. }
  460. resp, err := o.octopusRpc.GetMyAlgorithmList(ctx, req)
  461. if err != nil {
  462. return err
  463. }
  464. if !resp.Success {
  465. return errors.New("failed to get algorithmId")
  466. }
  467. for _, algorithm := range resp.Payload.Algorithms {
  468. if algorithm.FrameworkName == strings.Title(option.TaskType) {
  469. ns := strings.Split(algorithm.AlgorithmName, UNDERSCORE)
  470. if ns[0] != option.DatasetsName {
  471. continue
  472. }
  473. if ns[1] != option.AlgorithmName {
  474. continue
  475. }
  476. switch option.ResourceType {
  477. case CPU:
  478. if ns[2] != CPU {
  479. continue
  480. }
  481. case CARD:
  482. if ns[2] != strings.ToLower(option.ComputeCard) {
  483. continue
  484. }
  485. }
  486. option.AlgorithmId = algorithm.AlgorithmId
  487. return nil
  488. }
  489. }
  490. if option.AlgorithmId == "" {
  491. return errors.New("Algorithm does not exist")
  492. }
  493. return errors.New("failed to get AlgorithmId")
  494. }
  495. func (o *OctopusLink) generateCmd(option *option.AiOption) error {
  496. if option.Cmd == "" {
  497. switch option.ComputeCard {
  498. case GCU:
  499. option.Cmd = "cd /code; python3 train.py"
  500. default:
  501. option.Cmd = TRAIN_CMD
  502. }
  503. }
  504. return nil
  505. }
  506. func (o *OctopusLink) generateEnv(option *option.AiOption) error {
  507. return nil
  508. }
  509. func (o *OctopusLink) generateParams(option *option.AiOption) error {
  510. if len(option.Params) == 0 {
  511. epoch := "epoch" + COMMA + "1"
  512. option.Params = append(option.Params, epoch)
  513. }
  514. return nil
  515. }
  516. func setResourceIdByCard(option *option.AiOption, specs *octopus.GetResourceSpecsResp, computeCard string) error {
  517. if option.Tops == 0 {
  518. for _, spec := range specs.TrainResourceSpecs {
  519. if spec.Price == 1 {
  520. ns := strings.Split(spec.Name, COMMA)
  521. cardSpecs := strings.Split(ns[0], STAR)
  522. if cardSpecs[1] == computeCard {
  523. option.ResourceId = spec.Id
  524. option.ComputeCard = computeCard
  525. return nil
  526. }
  527. } else {
  528. continue
  529. }
  530. }
  531. } else {
  532. cardNum := math.Ceil(option.Tops / float64(BASE_TOPS))
  533. for _, spec := range specs.TrainResourceSpecs {
  534. if option.Tops < BASE_TOPS {
  535. if spec.Price == 1 {
  536. ns := strings.Split(spec.Name, COMMA)
  537. cardSpecs := strings.Split(ns[0], STAR)
  538. if cardSpecs[1] == computeCard {
  539. option.ResourceId = spec.Id
  540. option.ComputeCard = computeCard
  541. return nil
  542. }
  543. } else {
  544. continue
  545. }
  546. } else {
  547. ns := strings.Split(spec.Name, COMMA)
  548. if len(ns) != 4 {
  549. continue
  550. }
  551. cardSpecs := strings.Split(ns[0], STAR)
  552. if cardSpecs[1] != computeCard {
  553. continue
  554. }
  555. s, err := strconv.ParseFloat(cardSpecs[0], 64)
  556. if err != nil {
  557. return err
  558. }
  559. switch computeCard {
  560. case GCU:
  561. if cardNum == s { // 1, 4, 8
  562. option.ResourceId = spec.Id
  563. option.ComputeCard = computeCard
  564. return nil
  565. }
  566. if 1 < cardNum && cardNum <= 4 && s == 4 {
  567. option.ResourceId = spec.Id
  568. option.ComputeCard = computeCard
  569. return nil
  570. }
  571. if 4 < cardNum && s == 8 {
  572. option.ResourceId = spec.Id
  573. option.ComputeCard = computeCard
  574. return nil
  575. }
  576. case MLU: // 1, 2, 4
  577. if cardNum/2 == s {
  578. option.ResourceId = spec.Id
  579. option.ComputeCard = computeCard
  580. return nil
  581. }
  582. if 1 < cardNum/2 && cardNum/2 <= 2 && s == 2 {
  583. option.ResourceId = spec.Id
  584. option.ComputeCard = computeCard
  585. return nil
  586. }
  587. if 2 < cardNum/2 && s == 4 {
  588. option.ResourceId = spec.Id
  589. option.ComputeCard = computeCard
  590. return nil
  591. }
  592. }
  593. }
  594. }
  595. }
  596. return errors.New("set ResourceId error")
  597. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem for heterogeneous cloud collaboration for JCC in a non-intrusive, autonomous, peer-to-peer manner.