You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

octopus.go 21 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849
  1. /*
  2. Copyright (c) [2023] [pcm]
  3. [pcm-coordinator] is licensed under Mulan PSL v2.
  4. You can use this software according to the terms and conditions of the Mulan PSL v2.
  5. You may obtain a copy of Mulan PSL v2 at:
  6. http://license.coscl.org.cn/MulanPSL2
  7. THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
  8. EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
  9. MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
  10. See the Mulan PSL v2 for more details.
  11. */
  12. package storeLink
  13. import (
  14. "context"
  15. "errors"
  16. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers/option"
  17. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/collector"
  18. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
  19. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
  20. "gitlink.org.cn/JointCloud/pcm-octopus/octopus"
  21. "gitlink.org.cn/JointCloud/pcm-octopus/octopusclient"
  22. "math"
  23. "strconv"
  24. "strings"
  25. "time"
  26. )
  27. type OctopusLink struct {
  28. octopusRpc octopusclient.Octopus
  29. pageIndex int32
  30. pageSize int32
  31. platform string
  32. participantId int64
  33. }
  34. const (
  35. IMG_NAME_PREFIX = "oct_"
  36. IMG_VERSION_PREFIX = "version_"
  37. TASK_NAME_PREFIX = "trainJob"
  38. RESOURCE_POOL = "common-pool"
  39. HANWUJI = "hanwuji"
  40. SUIYUAN = "suiyuan"
  41. SAILINGSI = "sailingsi"
  42. MLU = "MLU"
  43. BIV100 = "BI-V100"
  44. CAMBRICONMLU290 = 256
  45. GCU = "GCU"
  46. ENFLAME = "enflame"
  47. EnflameT20 = 128
  48. BASE_TOPS = 128
  49. CAMBRICON = "cambricon"
  50. ILUVATAR = "iluvatar"
  51. TRAIN_CMD = "cd /code; python train.py"
  52. VERSION = "V1"
  53. DOMAIN = "http://192.168.242.41:8001/"
  54. CAMBRICON_CN = "寒武纪290"
  55. ENFLAME_CN = "燧原T20"
  56. ILUVATAR_CN = "天数BI-V100"
  57. )
  58. var (
  59. cardAliasMap = map[string]string{
  60. MLU: CAMBRICON,
  61. GCU: ENFLAME,
  62. BIV100: ILUVATAR,
  63. }
  64. cardCnMap = map[string]string{
  65. MLU: CAMBRICON_CN,
  66. GCU: ENFLAME_CN,
  67. BIV100: ILUVATAR_CN,
  68. }
  69. cardTopsMap = map[string]float64{
  70. MLU: CAMBRICONMLU290,
  71. GCU: EnflameT20,
  72. }
  73. )
  74. func NewOctopusLink(octopusRpc octopusclient.Octopus, name string, id int64) *OctopusLink {
  75. return &OctopusLink{octopusRpc: octopusRpc, platform: name, participantId: id, pageIndex: 1, pageSize: 100}
  76. }
  77. func (o *OctopusLink) UploadImage(ctx context.Context, path string) (interface{}, error) {
  78. // octopus创建镜像
  79. createReq := &octopus.CreateImageReq{
  80. Platform: o.platform,
  81. CreateImage: &octopus.CreateImage{
  82. SourceType: 1,
  83. ImageName: IMG_NAME_PREFIX + utils.RandomString(7),
  84. ImageVersion: IMG_VERSION_PREFIX + utils.RandomString(7),
  85. },
  86. }
  87. createResp, err := o.octopusRpc.CreateImage(ctx, createReq)
  88. if err != nil {
  89. return nil, err
  90. }
  91. // octopus上传镜像
  92. uploadReq := &octopus.UploadImageReq{
  93. Platform: o.platform,
  94. ImageId: createResp.Payload.ImageId,
  95. Params: &octopus.UploadImageParam{
  96. Domain: "",
  97. FileName: "",
  98. },
  99. }
  100. uploadResp, err := o.octopusRpc.UploadImage(ctx, uploadReq)
  101. if err != nil {
  102. return nil, err
  103. }
  104. // Todo 实际上传
  105. return uploadResp, nil
  106. }
  107. func (o *OctopusLink) DeleteImage(ctx context.Context, imageId string) (interface{}, error) {
  108. // octopus删除镜像
  109. req := &octopus.DeleteImageReq{
  110. Platform: o.platform,
  111. ImageId: imageId,
  112. }
  113. resp, err := o.octopusRpc.DeleteImage(ctx, req)
  114. if err != nil {
  115. return nil, err
  116. }
  117. return resp, nil
  118. }
  119. func (o *OctopusLink) QueryImageList(ctx context.Context) (interface{}, error) {
  120. // octopus获取镜像列表
  121. req := &octopus.GetUserImageListReq{
  122. Platform: o.platform,
  123. PageIndex: o.pageIndex,
  124. PageSize: o.pageSize,
  125. }
  126. resp, err := o.octopusRpc.GetUserImageList(ctx, req)
  127. if err != nil {
  128. return nil, err
  129. }
  130. return resp, nil
  131. }
  132. func (o *OctopusLink) SubmitTask(ctx context.Context, imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string, aiType string) (interface{}, error) {
  133. // octopus提交任务
  134. // python参数
  135. var prms []*octopus.Parameters
  136. for _, param := range params {
  137. var p octopus.Parameters
  138. s := strings.Split(param, COMMA)
  139. p.Key = s[0]
  140. p.Value = s[1]
  141. prms = append(prms, &p)
  142. }
  143. //环境变量
  144. envMap := make(map[string]string)
  145. for _, env := range envs {
  146. s := strings.Split(env, COMMA)
  147. envMap[s[0]] = s[1]
  148. }
  149. req := &octopus.CreateTrainJobReq{
  150. Platform: o.platform,
  151. Params: &octopus.CreateTrainJobParam{
  152. ImageId: imageId,
  153. Name: TASK_NAME_PREFIX + UNDERSCORE + utils.RandomString(10),
  154. ResourcePool: RESOURCE_POOL,
  155. Config: []*octopus.Config{
  156. {
  157. Command: cmd,
  158. ResourceSpecId: resourceId,
  159. MinFailedTaskCount: 1,
  160. MinSucceededTaskCount: 1,
  161. TaskNumber: 1,
  162. Parameters: prms,
  163. Envs: envMap,
  164. },
  165. },
  166. DataSetId: datasetsId,
  167. DataSetVersion: VERSION,
  168. AlgorithmId: algorithmId,
  169. AlgorithmVersion: VERSION,
  170. },
  171. }
  172. resp, err := o.octopusRpc.CreateTrainJob(ctx, req)
  173. if err != nil {
  174. return nil, err
  175. }
  176. return resp, nil
  177. }
  178. func (o *OctopusLink) QueryTask(ctx context.Context, taskId string) (interface{}, error) {
  179. // octopus获取任务
  180. req := &octopus.GetTrainJobReq{
  181. Platform: o.platform,
  182. Id: taskId,
  183. }
  184. resp, err := o.octopusRpc.GetTrainJob(ctx, req)
  185. if err != nil {
  186. return nil, err
  187. }
  188. return resp, nil
  189. }
  190. func (o *OctopusLink) DeleteTask(ctx context.Context, taskId string) (interface{}, error) {
  191. // octopus删除任务
  192. req := &octopus.DeleteTrainJobReq{
  193. Platform: o.platform,
  194. JobIds: []string{taskId},
  195. }
  196. resp, err := o.octopusRpc.DeleteTrainJob(ctx, req)
  197. if err != nil {
  198. return nil, err
  199. }
  200. return resp, nil
  201. }
  202. func (o *OctopusLink) QuerySpecs(ctx context.Context) (interface{}, error) {
  203. // octopus查询资源规格
  204. req := &octopus.GetResourceSpecsReq{
  205. Platform: o.platform,
  206. ResourcePool: RESOURCE_POOL,
  207. }
  208. resp, err := o.octopusRpc.GetResourceSpecs(ctx, req)
  209. if err != nil {
  210. return nil, err
  211. }
  212. return resp, nil
  213. }
  214. func (o *OctopusLink) GetResourceStats(ctx context.Context) (*collector.ResourceStats, error) {
  215. req := &octopus.GetResourceSpecsReq{
  216. Platform: o.platform,
  217. ResourcePool: RESOURCE_POOL,
  218. }
  219. specResp, err := o.octopusRpc.GetResourceSpecs(ctx, req)
  220. if err != nil {
  221. return nil, err
  222. }
  223. if !specResp.Success {
  224. return nil, errors.New(specResp.Error.Message)
  225. }
  226. balanceReq := &octopus.GetUserBalanceReq{
  227. Platform: o.platform,
  228. }
  229. balanceResp, err := o.octopusRpc.GetUserBalance(ctx, balanceReq)
  230. if err != nil {
  231. return nil, err
  232. }
  233. if !balanceResp.Success {
  234. return nil, errors.New(balanceResp.Error.Message)
  235. }
  236. var cards []*collector.Card
  237. balance := float64(balanceResp.Payload.BillingUser.Amount)
  238. var cpuHours float64
  239. for _, spec := range specResp.TrainResourceSpecs {
  240. if spec.Price == 0 {
  241. ns := strings.Split(spec.Name, COMMA)
  242. if len(ns) == 2 {
  243. nss := strings.Split(ns[0], COLON)
  244. if nss[0] == CPU {
  245. cpuHours = -1
  246. }
  247. }
  248. }
  249. if spec.Price == 1 {
  250. ns := strings.Split(spec.Name, COMMA)
  251. cardSpecs := strings.Split(ns[0], STAR)
  252. cardTops, isMapContainsKey := cardTopsMap[cardSpecs[1]]
  253. if !isMapContainsKey {
  254. continue
  255. }
  256. card := &collector.Card{
  257. Platform: OCTOPUS,
  258. Type: CARD,
  259. Name: cardSpecs[1],
  260. TOpsAtFp16: cardTops,
  261. CardHours: balance / spec.Price,
  262. }
  263. cards = append(cards, card)
  264. }
  265. }
  266. resourceStats := &collector.ResourceStats{
  267. ClusterId: strconv.FormatInt(o.participantId, 10),
  268. Name: o.platform,
  269. Balance: balance,
  270. CardsAvail: cards,
  271. CpuCoreHours: cpuHours,
  272. }
  273. return resourceStats, nil
  274. }
  275. func (o *OctopusLink) GetDatasetsSpecs(ctx context.Context) ([]*collector.DatasetsSpecs, error) {
  276. req := &octopus.GetMyDatasetListReq{
  277. Platform: o.platform,
  278. PageIndex: o.pageIndex,
  279. PageSize: o.pageSize,
  280. }
  281. resp, err := o.octopusRpc.GetMyDatasetList(ctx, req)
  282. if err != nil {
  283. return nil, err
  284. }
  285. if !resp.Success {
  286. return nil, errors.New(resp.Error.Message)
  287. }
  288. specs := []*collector.DatasetsSpecs{}
  289. for _, dataset := range resp.Payload.Datasets {
  290. spec := &collector.DatasetsSpecs{Name: dataset.Name}
  291. specs = append(specs, spec)
  292. }
  293. return specs, nil
  294. }
  295. func (o *OctopusLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm, error) {
  296. var algorithms []*collector.Algorithm
  297. req := &octopus.GetMyAlgorithmListReq{
  298. Platform: o.platform,
  299. PageIndex: o.pageIndex,
  300. PageSize: o.pageSize,
  301. }
  302. resp, err := o.octopusRpc.GetMyAlgorithmList(ctx, req)
  303. if err != nil {
  304. return nil, err
  305. }
  306. if !resp.Success {
  307. return nil, errors.New("failed to get algorithms")
  308. }
  309. for _, a := range resp.Payload.Algorithms {
  310. algorithm := &collector.Algorithm{Name: a.AlgorithmName, Platform: OCTOPUS, TaskType: strings.ToLower(a.FrameworkName)}
  311. algorithms = append(algorithms, algorithm)
  312. }
  313. return algorithms, nil
  314. }
  315. func (o *OctopusLink) GetComputeCards(ctx context.Context) ([]string, error) {
  316. var cards []string
  317. for s, _ := range cardAliasMap {
  318. cards = append(cards, s)
  319. }
  320. return cards, nil
  321. }
  322. func (o *OctopusLink) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) {
  323. var name string
  324. if resourceType == CARD {
  325. name = dataset + UNDERSCORE + algorithm + UNDERSCORE + card
  326. } else {
  327. name = dataset + UNDERSCORE + algorithm + UNDERSCORE + CPU
  328. }
  329. req := &octopus.GetMyAlgorithmListReq{
  330. Platform: o.platform,
  331. PageIndex: o.pageIndex,
  332. PageSize: o.pageSize,
  333. }
  334. resp, err := o.octopusRpc.GetMyAlgorithmList(ctx, req)
  335. if err != nil {
  336. return "", err
  337. }
  338. if !resp.Success {
  339. return "", errors.New("failed to get algorithmList")
  340. }
  341. var algorithmId string
  342. var algorithms []*octopus.Algorithms
  343. for _, a := range resp.Payload.Algorithms {
  344. if strings.ToLower(a.FrameworkName) != taskType {
  345. continue
  346. }
  347. if a.AlgorithmDescript == name {
  348. algorithms = append(algorithms, a)
  349. }
  350. }
  351. if len(algorithms) == 0 {
  352. return "", errors.New("algorithmId not found")
  353. }
  354. if len(algorithms) == 1 {
  355. algorithmId = algorithms[0].AlgorithmId
  356. }
  357. aLatest := &octopus.Algorithms{}
  358. for i, _ := range algorithms {
  359. if time.Unix(algorithms[i].CreatedAt, 0).After(time.Unix(aLatest.CreatedAt, 0)) {
  360. aLatest = algorithms[i]
  361. }
  362. }
  363. if aLatest.AlgorithmId == "" {
  364. return "", errors.New("algorithmId not found")
  365. }
  366. algorithmId = aLatest.AlgorithmId
  367. dcReq := &octopus.DownloadCompressReq{
  368. Platform: o.platform,
  369. Version: VERSION,
  370. AlgorithmId: algorithmId,
  371. }
  372. dcResp, err := o.octopusRpc.DownloadCompress(ctx, dcReq)
  373. if err != nil {
  374. return "", err
  375. }
  376. if !dcResp.Success {
  377. return "", errors.New(dcResp.Error.Message)
  378. }
  379. daReq := &octopus.DownloadAlgorithmReq{
  380. Platform: o.platform,
  381. Version: VERSION,
  382. AlgorithmId: algorithmId,
  383. CompressAt: dcResp.Payload.CompressAt,
  384. Domain: DOMAIN,
  385. }
  386. daResp, err := o.octopusRpc.DownloadAlgorithm(ctx, daReq)
  387. if err != nil {
  388. return "", err
  389. }
  390. if !daResp.Success {
  391. return "", errors.New(dcResp.Error.Message)
  392. }
  393. urlReq := &octopus.AlgorithmUrlReq{
  394. Platform: o.platform,
  395. Url: daResp.Payload.DownloadUrl,
  396. }
  397. urlResp, err := o.octopusRpc.DownloadAlgorithmUrl(ctx, urlReq)
  398. if err != nil {
  399. return "", err
  400. }
  401. return urlResp.Algorithm, nil
  402. }
  403. func (o *OctopusLink) UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error {
  404. //var name string
  405. //if resourceType == CARD {
  406. // name = dataset + UNDERSCORE + algorithm + UNDERSCORE + card
  407. //} else {
  408. // name = dataset + UNDERSCORE + algorithm + UNDERSCORE + CPU
  409. //}
  410. //uploadReq := &octopus.UploadAlgorithmReq{}
  411. return nil
  412. }
  413. func (o *OctopusLink) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) {
  414. instance, err := strconv.ParseInt(instanceNum, 10, 32)
  415. if err != nil {
  416. return "", err
  417. }
  418. req := &octopus.GetTrainJobLogReq{
  419. Platform: o.platform,
  420. TaskId: taskId,
  421. TaskNum: "task0",
  422. Num: int32(instance),
  423. }
  424. resp, err := o.octopusRpc.GetTrainJobLog(ctx, req)
  425. if err != nil {
  426. return "", err
  427. }
  428. if strings.Contains(resp.Content, "404 Not Found") {
  429. resp.Content = "waiting for logs..."
  430. }
  431. return resp.Content, nil
  432. }
  433. func (o *OctopusLink) GetTrainingTask(ctx context.Context, taskId string) (*collector.Task, error) {
  434. resp, err := o.QueryTask(ctx, taskId)
  435. if err != nil {
  436. return nil, err
  437. }
  438. jobresp, ok := (resp).(*octopus.GetTrainJobResp)
  439. if !jobresp.Success || !ok {
  440. if jobresp.Error != nil {
  441. return nil, errors.New(jobresp.Error.Message)
  442. } else {
  443. return nil, errors.New("get training task failed, empty error returned")
  444. }
  445. }
  446. var task collector.Task
  447. task.Id = jobresp.Payload.TrainJob.Id
  448. task.Start = time.Unix(jobresp.Payload.TrainJob.StartedAt, 0).Format(constants.Layout)
  449. task.End = time.Unix(jobresp.Payload.TrainJob.CompletedAt, 0).Format(constants.Layout)
  450. switch jobresp.Payload.TrainJob.Status {
  451. case "succeeded":
  452. task.Status = constants.Completed
  453. case "failed":
  454. task.Status = constants.Failed
  455. case "running":
  456. task.Status = constants.Running
  457. case "stopped":
  458. task.Status = constants.Stopped
  459. case "pending":
  460. task.Status = constants.Pending
  461. default:
  462. task.Status = "undefined"
  463. }
  464. return &task, nil
  465. }
  466. func (o *OctopusLink) Execute(ctx context.Context, option *option.AiOption) (interface{}, error) {
  467. err := o.GenerateSubmitParams(ctx, option)
  468. if err != nil {
  469. return nil, err
  470. }
  471. task, err := o.SubmitTask(ctx, option.ImageId, option.Cmd, option.Envs, option.Params, option.ResourceId, option.DatasetsId, option.AlgorithmId, option.TaskType)
  472. if err != nil {
  473. return nil, err
  474. }
  475. return task, nil
  476. }
  477. func (o *OctopusLink) GenerateSubmitParams(ctx context.Context, option *option.AiOption) error {
  478. err := o.generateResourceId(ctx, option)
  479. if err != nil {
  480. return err
  481. }
  482. err = o.generateDatasetsId(ctx, option)
  483. if err != nil {
  484. return err
  485. }
  486. err = o.generateImageId(ctx, option)
  487. if err != nil {
  488. return err
  489. }
  490. err = o.generateAlgorithmId(ctx, option)
  491. if err != nil {
  492. return err
  493. }
  494. err = o.generateCmd(option)
  495. if err != nil {
  496. return err
  497. }
  498. err = o.generateEnv(option)
  499. if err != nil {
  500. return err
  501. }
  502. err = o.generateParams(option)
  503. if err != nil {
  504. return err
  505. }
  506. return nil
  507. }
  508. func (o *OctopusLink) generateResourceId(ctx context.Context, option *option.AiOption) error {
  509. if option.ResourceType == "" {
  510. return errors.New("ResourceType not set")
  511. }
  512. req := &octopus.GetResourceSpecsReq{
  513. Platform: o.platform,
  514. ResourcePool: RESOURCE_POOL,
  515. }
  516. specResp, err := o.octopusRpc.GetResourceSpecs(ctx, req)
  517. if err != nil {
  518. return err
  519. }
  520. if !specResp.Success {
  521. return errors.New(specResp.Error.Message)
  522. }
  523. if option.ResourceType == CPU {
  524. for _, spec := range specResp.TrainResourceSpecs {
  525. if spec.Price == 0 {
  526. option.ResourceId = spec.Id
  527. return nil
  528. }
  529. }
  530. }
  531. if option.ResourceType == CARD {
  532. if option.ComputeCard == "" {
  533. option.ComputeCard = GCU
  534. }
  535. err = setResourceIdByCard(option, specResp, option.ComputeCard)
  536. if err != nil {
  537. return err
  538. }
  539. return nil
  540. }
  541. return errors.New("failed to get ResourceId")
  542. }
  543. func (o *OctopusLink) generateDatasetsId(ctx context.Context, option *option.AiOption) error {
  544. if option.DatasetsName == "" {
  545. return errors.New("DatasetsName not set")
  546. }
  547. req := &octopus.GetMyDatasetListReq{
  548. Platform: o.platform,
  549. PageIndex: o.pageIndex,
  550. PageSize: o.pageSize,
  551. }
  552. resp, err := o.octopusRpc.GetMyDatasetList(ctx, req)
  553. if err != nil {
  554. return err
  555. }
  556. if !resp.Success {
  557. return errors.New("failed to get DatasetsId")
  558. }
  559. for _, dataset := range resp.Payload.Datasets {
  560. if dataset.Name == option.DatasetsName {
  561. option.DatasetsId = dataset.Id
  562. return nil
  563. }
  564. }
  565. return errors.New("failed to get DatasetsId")
  566. }
  567. func (o *OctopusLink) generateImageId(ctx context.Context, option *option.AiOption) error {
  568. if option.TaskType == "" {
  569. return errors.New("TaskType not set")
  570. }
  571. req := &octopus.GetUserImageListReq{
  572. Platform: o.platform,
  573. PageIndex: o.pageIndex,
  574. PageSize: o.pageSize,
  575. }
  576. resp, err := o.octopusRpc.GetUserImageList(ctx, req)
  577. if err != nil {
  578. return err
  579. }
  580. if !resp.Success {
  581. return errors.New("failed to get imageId")
  582. }
  583. if option.ResourceType == CPU {
  584. for _, img := range resp.Payload.Images {
  585. if img.Image.ImageName == "test-image" {
  586. option.ImageId = img.Image.Id
  587. return nil
  588. }
  589. }
  590. }
  591. preImgReq := &octopus.GetPresetImageListReq{
  592. Platform: o.platform,
  593. PageIndex: o.pageIndex,
  594. PageSize: o.pageSize,
  595. }
  596. preImgResp, err := o.octopusRpc.GetPresetImageList(ctx, preImgReq)
  597. if err != nil {
  598. return err
  599. }
  600. if !preImgResp.Success {
  601. return errors.New("failed to get PresetImages")
  602. }
  603. if option.ResourceType == CARD {
  604. for _, image := range preImgResp.Payload.Images {
  605. if strings.Contains(image.ImageName, cardAliasMap[strings.ToUpper(option.ComputeCard)]) {
  606. switch strings.ToUpper(option.ComputeCard) {
  607. case GCU:
  608. if strings.HasPrefix(image.ImageVersion, "t20_") {
  609. option.ImageId = image.Id
  610. return nil
  611. }
  612. case BIV100:
  613. if strings.HasPrefix(image.ImageVersion, "bi_") {
  614. option.ImageId = image.Id
  615. return nil
  616. }
  617. case MLU:
  618. option.ImageId = image.Id
  619. return nil
  620. }
  621. }
  622. }
  623. }
  624. return errors.New("failed to get ImageId")
  625. }
  626. func (o *OctopusLink) generateAlgorithmId(ctx context.Context, option *option.AiOption) error {
  627. req := &octopus.GetMyAlgorithmListReq{
  628. Platform: o.platform,
  629. PageIndex: o.pageIndex,
  630. PageSize: o.pageSize,
  631. }
  632. resp, err := o.octopusRpc.GetMyAlgorithmList(ctx, req)
  633. if err != nil {
  634. return err
  635. }
  636. if !resp.Success {
  637. return errors.New("failed to get algorithmId")
  638. }
  639. for _, algorithm := range resp.Payload.Algorithms {
  640. if algorithm.FrameworkName == strings.Title(option.TaskType) {
  641. ns := strings.Split(algorithm.AlgorithmName, UNDERSCORE)
  642. if ns[0] != option.DatasetsName {
  643. continue
  644. }
  645. if ns[1] != option.AlgorithmName {
  646. continue
  647. }
  648. switch option.ResourceType {
  649. case CPU:
  650. if ns[2] != CPU {
  651. continue
  652. }
  653. case CARD:
  654. if ns[2] != strings.ToLower(option.ComputeCard) {
  655. continue
  656. }
  657. }
  658. option.AlgorithmId = algorithm.AlgorithmId
  659. return nil
  660. }
  661. }
  662. if option.AlgorithmId == "" {
  663. return errors.New("Algorithm does not exist")
  664. }
  665. return errors.New("failed to get AlgorithmId")
  666. }
  667. func (o *OctopusLink) generateCmd(option *option.AiOption) error {
  668. if option.Cmd == "" {
  669. switch option.ComputeCard {
  670. case GCU:
  671. option.Cmd = "cd /code; python3 train.py"
  672. case MLU:
  673. option.Cmd = "su root; cd /torch/venv3/pytorch/bin; source activate; cd /code; python train.py"
  674. default:
  675. option.Cmd = TRAIN_CMD
  676. }
  677. }
  678. return nil
  679. }
  680. func (o *OctopusLink) generateEnv(option *option.AiOption) error {
  681. return nil
  682. }
  683. func (o *OctopusLink) generateParams(option *option.AiOption) error {
  684. if len(option.Params) == 0 {
  685. epoch := "epoch" + COMMA + "1"
  686. option.Params = append(option.Params, epoch)
  687. }
  688. return nil
  689. }
  690. func setResourceIdByCard(option *option.AiOption, specs *octopus.GetResourceSpecsResp, computeCard string) error {
  691. if option.Tops == 0 {
  692. for _, spec := range specs.TrainResourceSpecs {
  693. if spec.Price == 1 {
  694. ns := strings.Split(spec.Name, COMMA)
  695. cardSpecs := strings.Split(ns[0], STAR)
  696. if cardSpecs[1] == cardCnMap[strings.ToUpper(computeCard)] {
  697. option.ResourceId = spec.Id
  698. option.ComputeCard = computeCard
  699. return nil
  700. }
  701. } else {
  702. continue
  703. }
  704. }
  705. } else {
  706. cardNum := math.Ceil(option.Tops / float64(BASE_TOPS))
  707. for _, spec := range specs.TrainResourceSpecs {
  708. if option.Tops < BASE_TOPS {
  709. if spec.Price == 1 {
  710. ns := strings.Split(spec.Name, COMMA)
  711. cardSpecs := strings.Split(ns[0], STAR)
  712. if cardSpecs[1] == cardCnMap[strings.ToUpper(computeCard)] {
  713. option.ResourceId = spec.Id
  714. option.ComputeCard = computeCard
  715. return nil
  716. }
  717. } else {
  718. continue
  719. }
  720. } else {
  721. ns := strings.Split(spec.Name, COMMA)
  722. if len(ns) != 4 {
  723. continue
  724. }
  725. cardSpecs := strings.Split(ns[0], STAR)
  726. if cardSpecs[1] != cardCnMap[strings.ToUpper(computeCard)] {
  727. continue
  728. }
  729. s, err := strconv.ParseFloat(cardSpecs[0], 64)
  730. if err != nil {
  731. return err
  732. }
  733. switch computeCard {
  734. case GCU:
  735. option.ComputeCard = computeCard
  736. if cardNum == s { // 1, 4, 8
  737. option.ResourceId = spec.Id
  738. return nil
  739. }
  740. if 1 < cardNum && cardNum <= 4 && s == 4 {
  741. option.ResourceId = spec.Id
  742. return nil
  743. }
  744. if 4 < cardNum && s == 8 {
  745. option.ResourceId = spec.Id
  746. return nil
  747. }
  748. case MLU: // 1, 2, 4
  749. option.ComputeCard = computeCard
  750. if cardNum/2 == s {
  751. option.ResourceId = spec.Id
  752. return nil
  753. }
  754. if 1 < cardNum/2 && cardNum/2 <= 2 && s == 2 {
  755. option.ResourceId = spec.Id
  756. return nil
  757. }
  758. if 2 < cardNum/2 && s == 4 {
  759. option.ResourceId = spec.Id
  760. return nil
  761. }
  762. }
  763. }
  764. }
  765. }
  766. return errors.New("set ResourceId error")
  767. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive and autonomous peer-to-peer manner.