You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

octopus.go 20 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821
  1. /*
  2. Copyright (c) [2023] [pcm]
  3. [pcm-coordinator] is licensed under Mulan PSL v2.
  4. You can use this software according to the terms and conditions of the Mulan PSL v2.
  5. You may obtain a copy of Mulan PSL v2 at:
  6. http://license.coscl.org.cn/MulanPSL2
  7. THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
  8. EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
  9. MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
  10. See the Mulan PSL v2 for more details.
  11. */
  12. package storeLink
  13. import (
  14. "context"
  15. "errors"
  16. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers/option"
  17. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/service/collector"
  18. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
  19. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
  20. "gitlink.org.cn/JointCloud/pcm-octopus/octopus"
  21. "gitlink.org.cn/JointCloud/pcm-octopus/octopusclient"
  22. "math"
  23. "strconv"
  24. "strings"
  25. "time"
  26. )
  27. type OctopusLink struct {
  28. octopusRpc octopusclient.Octopus
  29. pageIndex int32
  30. pageSize int32
  31. platform string
  32. participantId int64
  33. }
  34. const (
  35. IMG_NAME_PREFIX = "oct_"
  36. IMG_VERSION_PREFIX = "version_"
  37. TASK_NAME_PREFIX = "trainJob"
  38. RESOURCE_POOL = "common-pool"
  39. HANWUJI = "hanwuji"
  40. SUIYUAN = "suiyuan"
  41. SAILINGSI = "sailingsi"
  42. MLU = "MLU"
  43. BIV100 = "BI-V100"
  44. CAMBRICONMLU290 = 256
  45. GCU = "GCU"
  46. ENFLAME = "enflame"
  47. EnflameT20 = 128
  48. BASE_TOPS = 128
  49. CAMBRICON = "cambricon"
  50. ILUVATAR = "iluvatar"
  51. TRAIN_CMD = "cd /code; python train.py"
  52. VERSION = "V1"
  53. DOMAIN = "http://192.168.242.41:8001/"
  54. )
  55. var (
  56. cardAliasMap = map[string]string{
  57. MLU: CAMBRICON,
  58. GCU: ENFLAME,
  59. BIV100: ILUVATAR,
  60. }
  61. cardTopsMap = map[string]float64{
  62. MLU: CAMBRICONMLU290,
  63. GCU: EnflameT20,
  64. }
  65. )
  66. func NewOctopusLink(octopusRpc octopusclient.Octopus, name string, id int64) *OctopusLink {
  67. return &OctopusLink{octopusRpc: octopusRpc, platform: name, participantId: id, pageIndex: 1, pageSize: 100}
  68. }
  69. func (o *OctopusLink) UploadImage(ctx context.Context, path string) (interface{}, error) {
  70. // octopus创建镜像
  71. createReq := &octopus.CreateImageReq{
  72. Platform: o.platform,
  73. CreateImage: &octopus.CreateImage{
  74. SourceType: 1,
  75. ImageName: IMG_NAME_PREFIX + utils.RandomString(7),
  76. ImageVersion: IMG_VERSION_PREFIX + utils.RandomString(7),
  77. },
  78. }
  79. createResp, err := o.octopusRpc.CreateImage(ctx, createReq)
  80. if err != nil {
  81. return nil, err
  82. }
  83. // octopus上传镜像
  84. uploadReq := &octopus.UploadImageReq{
  85. Platform: o.platform,
  86. ImageId: createResp.Payload.ImageId,
  87. Params: &octopus.UploadImageParam{
  88. Domain: "",
  89. FileName: "",
  90. },
  91. }
  92. uploadResp, err := o.octopusRpc.UploadImage(ctx, uploadReq)
  93. if err != nil {
  94. return nil, err
  95. }
  96. // Todo 实际上传
  97. return uploadResp, nil
  98. }
  99. func (o *OctopusLink) DeleteImage(ctx context.Context, imageId string) (interface{}, error) {
  100. // octopus删除镜像
  101. req := &octopus.DeleteImageReq{
  102. Platform: o.platform,
  103. ImageId: imageId,
  104. }
  105. resp, err := o.octopusRpc.DeleteImage(ctx, req)
  106. if err != nil {
  107. return nil, err
  108. }
  109. return resp, nil
  110. }
  111. func (o *OctopusLink) QueryImageList(ctx context.Context) (interface{}, error) {
  112. // octopus获取镜像列表
  113. req := &octopus.GetUserImageListReq{
  114. Platform: o.platform,
  115. PageIndex: o.pageIndex,
  116. PageSize: o.pageSize,
  117. }
  118. resp, err := o.octopusRpc.GetUserImageList(ctx, req)
  119. if err != nil {
  120. return nil, err
  121. }
  122. return resp, nil
  123. }
  124. func (o *OctopusLink) SubmitTask(ctx context.Context, imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string, aiType string) (interface{}, error) {
  125. // octopus提交任务
  126. // python参数
  127. var prms []*octopus.Parameters
  128. for _, param := range params {
  129. var p octopus.Parameters
  130. s := strings.Split(param, COMMA)
  131. p.Key = s[0]
  132. p.Value = s[1]
  133. prms = append(prms, &p)
  134. }
  135. //环境变量
  136. envMap := make(map[string]string)
  137. for _, env := range envs {
  138. s := strings.Split(env, COMMA)
  139. envMap[s[0]] = s[1]
  140. }
  141. req := &octopus.CreateTrainJobReq{
  142. Platform: o.platform,
  143. Params: &octopus.CreateTrainJobParam{
  144. ImageId: imageId,
  145. Name: TASK_NAME_PREFIX + UNDERSCORE + utils.RandomString(10),
  146. ResourcePool: RESOURCE_POOL,
  147. Config: []*octopus.Config{
  148. {
  149. Command: cmd,
  150. ResourceSpecId: resourceId,
  151. MinFailedTaskCount: 1,
  152. MinSucceededTaskCount: 1,
  153. TaskNumber: 1,
  154. Parameters: prms,
  155. Envs: envMap,
  156. },
  157. },
  158. DataSetId: datasetsId,
  159. DataSetVersion: VERSION,
  160. AlgorithmId: algorithmId,
  161. AlgorithmVersion: VERSION,
  162. },
  163. }
  164. resp, err := o.octopusRpc.CreateTrainJob(ctx, req)
  165. if err != nil {
  166. return nil, err
  167. }
  168. return resp, nil
  169. }
  170. func (o *OctopusLink) QueryTask(ctx context.Context, taskId string) (interface{}, error) {
  171. // octopus获取任务
  172. req := &octopus.GetTrainJobReq{
  173. Platform: o.platform,
  174. Id: taskId,
  175. }
  176. resp, err := o.octopusRpc.GetTrainJob(ctx, req)
  177. if err != nil {
  178. return nil, err
  179. }
  180. return resp, nil
  181. }
  182. func (o *OctopusLink) DeleteTask(ctx context.Context, taskId string) (interface{}, error) {
  183. // octopus删除任务
  184. req := &octopus.DeleteTrainJobReq{
  185. Platform: o.platform,
  186. JobIds: []string{taskId},
  187. }
  188. resp, err := o.octopusRpc.DeleteTrainJob(ctx, req)
  189. if err != nil {
  190. return nil, err
  191. }
  192. return resp, nil
  193. }
  194. func (o *OctopusLink) QuerySpecs(ctx context.Context) (interface{}, error) {
  195. // octopus查询资源规格
  196. req := &octopus.GetResourceSpecsReq{
  197. Platform: o.platform,
  198. ResourcePool: RESOURCE_POOL,
  199. }
  200. resp, err := o.octopusRpc.GetResourceSpecs(ctx, req)
  201. if err != nil {
  202. return nil, err
  203. }
  204. return resp, nil
  205. }
  206. func (o *OctopusLink) GetResourceStats(ctx context.Context) (*collector.ResourceStats, error) {
  207. req := &octopus.GetResourceSpecsReq{
  208. Platform: o.platform,
  209. ResourcePool: RESOURCE_POOL,
  210. }
  211. specResp, err := o.octopusRpc.GetResourceSpecs(ctx, req)
  212. if err != nil {
  213. return nil, err
  214. }
  215. if !specResp.Success {
  216. return nil, errors.New(specResp.Error.Message)
  217. }
  218. balanceReq := &octopus.GetUserBalanceReq{
  219. Platform: o.platform,
  220. }
  221. balanceResp, err := o.octopusRpc.GetUserBalance(ctx, balanceReq)
  222. if err != nil {
  223. return nil, err
  224. }
  225. if !balanceResp.Success {
  226. return nil, errors.New(balanceResp.Error.Message)
  227. }
  228. var cards []*collector.Card
  229. balance := float64(balanceResp.Payload.BillingUser.Amount)
  230. var cpuHours float64
  231. for _, spec := range specResp.TrainResourceSpecs {
  232. if spec.Price == 0 {
  233. ns := strings.Split(spec.Name, COMMA)
  234. if len(ns) == 2 {
  235. nss := strings.Split(ns[0], COLON)
  236. if nss[0] == CPU {
  237. cpuHours = -1
  238. }
  239. }
  240. }
  241. if spec.Price == 1 {
  242. ns := strings.Split(spec.Name, COMMA)
  243. cardSpecs := strings.Split(ns[0], STAR)
  244. cardTops, isMapContainsKey := cardTopsMap[cardSpecs[1]]
  245. if !isMapContainsKey {
  246. continue
  247. }
  248. card := &collector.Card{
  249. Platform: OCTOPUS,
  250. Type: CARD,
  251. Name: cardSpecs[1],
  252. TOpsAtFp16: cardTops,
  253. CardHours: balance / spec.Price,
  254. }
  255. cards = append(cards, card)
  256. }
  257. }
  258. resourceStats := &collector.ResourceStats{
  259. ClusterId: strconv.FormatInt(o.participantId, 10),
  260. Name: o.platform,
  261. Balance: balance,
  262. CardsAvail: cards,
  263. CpuCoreHours: cpuHours,
  264. }
  265. return resourceStats, nil
  266. }
  267. func (o *OctopusLink) GetDatasetsSpecs(ctx context.Context) ([]*collector.DatasetsSpecs, error) {
  268. req := &octopus.GetMyDatasetListReq{
  269. Platform: o.platform,
  270. PageIndex: o.pageIndex,
  271. PageSize: o.pageSize,
  272. }
  273. resp, err := o.octopusRpc.GetMyDatasetList(ctx, req)
  274. if err != nil {
  275. return nil, err
  276. }
  277. if !resp.Success {
  278. return nil, errors.New(resp.Error.Message)
  279. }
  280. specs := []*collector.DatasetsSpecs{}
  281. for _, dataset := range resp.Payload.Datasets {
  282. spec := &collector.DatasetsSpecs{Name: dataset.Name}
  283. specs = append(specs, spec)
  284. }
  285. return specs, nil
  286. }
  287. func (o *OctopusLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm, error) {
  288. var algorithms []*collector.Algorithm
  289. req := &octopus.GetMyAlgorithmListReq{
  290. Platform: o.platform,
  291. PageIndex: o.pageIndex,
  292. PageSize: o.pageSize,
  293. }
  294. resp, err := o.octopusRpc.GetMyAlgorithmList(ctx, req)
  295. if err != nil {
  296. return nil, err
  297. }
  298. if !resp.Success {
  299. return nil, errors.New("failed to get algorithms")
  300. }
  301. for _, a := range resp.Payload.Algorithms {
  302. algorithm := &collector.Algorithm{Name: a.AlgorithmName, Platform: OCTOPUS, TaskType: strings.ToLower(a.FrameworkName)}
  303. algorithms = append(algorithms, algorithm)
  304. }
  305. return algorithms, nil
  306. }
  307. func (o *OctopusLink) GetComputeCards(ctx context.Context) ([]string, error) {
  308. var cards []string
  309. for s, _ := range cardAliasMap {
  310. cards = append(cards, s)
  311. }
  312. return cards, nil
  313. }
  314. func (o *OctopusLink) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) {
  315. var name string
  316. if resourceType == CARD {
  317. name = dataset + UNDERSCORE + algorithm + UNDERSCORE + card
  318. } else {
  319. name = dataset + UNDERSCORE + algorithm + UNDERSCORE + CPU
  320. }
  321. req := &octopus.GetMyAlgorithmListReq{
  322. Platform: o.platform,
  323. PageIndex: o.pageIndex,
  324. PageSize: o.pageSize,
  325. }
  326. resp, err := o.octopusRpc.GetMyAlgorithmList(ctx, req)
  327. if err != nil {
  328. return "", err
  329. }
  330. if !resp.Success {
  331. return "", errors.New("failed to get algorithmList")
  332. }
  333. var algorithmId string
  334. var algorithms []*octopus.Algorithms
  335. for _, a := range resp.Payload.Algorithms {
  336. if strings.ToLower(a.FrameworkName) != taskType {
  337. continue
  338. }
  339. if a.AlgorithmDescript == name {
  340. algorithms = append(algorithms, a)
  341. }
  342. }
  343. if len(algorithms) == 0 {
  344. return "", errors.New("algorithmId not found")
  345. }
  346. if len(algorithms) == 1 {
  347. algorithmId = algorithms[0].AlgorithmId
  348. }
  349. aLatest := &octopus.Algorithms{}
  350. for i, _ := range algorithms {
  351. if time.Unix(aLatest.CreatedAt, 0).After(time.Unix(algorithms[i].CreatedAt, 0)) {
  352. aLatest = algorithms[i]
  353. }
  354. }
  355. if aLatest.AlgorithmId == "" {
  356. return "", errors.New("algorithmId not found")
  357. }
  358. algorithmId = aLatest.AlgorithmId
  359. dcReq := &octopus.DownloadCompressReq{
  360. Platform: o.platform,
  361. Version: VERSION,
  362. AlgorithmId: algorithmId,
  363. }
  364. dcResp, err := o.octopusRpc.DownloadCompress(ctx, dcReq)
  365. if err != nil {
  366. return "", err
  367. }
  368. if !dcResp.Success {
  369. return "", errors.New(dcResp.Error.Message)
  370. }
  371. daReq := &octopus.DownloadAlgorithmReq{
  372. Platform: o.platform,
  373. Version: VERSION,
  374. AlgorithmId: algorithmId,
  375. CompressAt: dcResp.Payload.CompressAt,
  376. Domain: DOMAIN,
  377. }
  378. daResp, err := o.octopusRpc.DownloadAlgorithm(ctx, daReq)
  379. if err != nil {
  380. return "", err
  381. }
  382. if !daResp.Success {
  383. return "", errors.New(dcResp.Error.Message)
  384. }
  385. urlReq := &octopus.AlgorithmUrlReq{
  386. Platform: o.platform,
  387. Url: daResp.Payload.DownloadUrl,
  388. }
  389. urlResp, err := o.octopusRpc.DownloadAlgorithmUrl(ctx, urlReq)
  390. if err != nil {
  391. return "", err
  392. }
  393. return urlResp.Algorithm, nil
  394. }
  395. func (o *OctopusLink) UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error {
  396. //var name string
  397. //if resourceType == CARD {
  398. // name = dataset + UNDERSCORE + algorithm + UNDERSCORE + card
  399. //} else {
  400. // name = dataset + UNDERSCORE + algorithm + UNDERSCORE + CPU
  401. //}
  402. //uploadReq := &octopus.UploadAlgorithmReq{}
  403. return nil
  404. }
  405. func (o *OctopusLink) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) {
  406. instance, err := strconv.ParseInt(instanceNum, 10, 32)
  407. if err != nil {
  408. return "", err
  409. }
  410. req := &octopus.GetTrainJobLogReq{
  411. Platform: o.platform,
  412. TaskId: taskId,
  413. TaskNum: "task0",
  414. Num: int32(instance),
  415. }
  416. resp, err := o.octopusRpc.GetTrainJobLog(ctx, req)
  417. if err != nil {
  418. return "", err
  419. }
  420. if strings.Contains(resp.Content, "404 Not Found") {
  421. resp.Content = "waiting for logs..."
  422. }
  423. return resp.Content, nil
  424. }
  425. func (o *OctopusLink) GetTrainingTask(ctx context.Context, taskId string) (*collector.Task, error) {
  426. resp, err := o.QueryTask(ctx, taskId)
  427. if err != nil {
  428. return nil, err
  429. }
  430. jobresp, ok := (resp).(*octopus.GetTrainJobResp)
  431. if !jobresp.Success || !ok {
  432. return nil, errors.New("get training task failed")
  433. }
  434. var task collector.Task
  435. task.Id = jobresp.Payload.TrainJob.Id
  436. task.Start = time.Unix(jobresp.Payload.TrainJob.StartedAt, 0).Format(constants.Layout)
  437. task.End = time.Unix(jobresp.Payload.TrainJob.CompletedAt, 0).Format(constants.Layout)
  438. switch jobresp.Payload.TrainJob.Status {
  439. case "succeeded":
  440. task.Status = constants.Completed
  441. case "failed":
  442. task.Status = constants.Failed
  443. case "running":
  444. task.Status = constants.Running
  445. case "stopped":
  446. task.Status = constants.Stopped
  447. default:
  448. task.Status = "undefined"
  449. }
  450. return &task, nil
  451. }
  452. func (o *OctopusLink) Execute(ctx context.Context, option *option.AiOption) (interface{}, error) {
  453. err := o.GenerateSubmitParams(ctx, option)
  454. if err != nil {
  455. return nil, err
  456. }
  457. task, err := o.SubmitTask(ctx, option.ImageId, option.Cmd, option.Envs, option.Params, option.ResourceId, option.DatasetsId, option.AlgorithmId, option.TaskType)
  458. if err != nil {
  459. return nil, err
  460. }
  461. return task, nil
  462. }
  463. func (o *OctopusLink) GenerateSubmitParams(ctx context.Context, option *option.AiOption) error {
  464. err := o.generateResourceId(ctx, option)
  465. if err != nil {
  466. return err
  467. }
  468. err = o.generateDatasetsId(ctx, option)
  469. if err != nil {
  470. return err
  471. }
  472. err = o.generateImageId(ctx, option)
  473. if err != nil {
  474. return err
  475. }
  476. err = o.generateAlgorithmId(ctx, option)
  477. if err != nil {
  478. return err
  479. }
  480. err = o.generateCmd(option)
  481. if err != nil {
  482. return err
  483. }
  484. err = o.generateEnv(option)
  485. if err != nil {
  486. return err
  487. }
  488. err = o.generateParams(option)
  489. if err != nil {
  490. return err
  491. }
  492. return nil
  493. }
  494. func (o *OctopusLink) generateResourceId(ctx context.Context, option *option.AiOption) error {
  495. if option.ResourceType == "" {
  496. return errors.New("ResourceType not set")
  497. }
  498. req := &octopus.GetResourceSpecsReq{
  499. Platform: o.platform,
  500. ResourcePool: RESOURCE_POOL,
  501. }
  502. specResp, err := o.octopusRpc.GetResourceSpecs(ctx, req)
  503. if err != nil {
  504. return err
  505. }
  506. if !specResp.Success {
  507. return errors.New(specResp.Error.Message)
  508. }
  509. if option.ResourceType == CPU {
  510. for _, spec := range specResp.TrainResourceSpecs {
  511. if spec.Price == 0 {
  512. option.ResourceId = spec.Id
  513. return nil
  514. }
  515. }
  516. }
  517. if option.ResourceType == CARD {
  518. err = setResourceIdByCard(option, specResp, GCU)
  519. if err != nil {
  520. return err
  521. }
  522. return nil
  523. }
  524. return errors.New("failed to get ResourceId")
  525. }
  526. func (o *OctopusLink) generateDatasetsId(ctx context.Context, option *option.AiOption) error {
  527. if option.DatasetsName == "" {
  528. return errors.New("DatasetsName not set")
  529. }
  530. req := &octopus.GetMyDatasetListReq{
  531. Platform: o.platform,
  532. PageIndex: o.pageIndex,
  533. PageSize: o.pageSize,
  534. }
  535. resp, err := o.octopusRpc.GetMyDatasetList(ctx, req)
  536. if err != nil {
  537. return err
  538. }
  539. if !resp.Success {
  540. return errors.New("failed to get DatasetsId")
  541. }
  542. for _, dataset := range resp.Payload.Datasets {
  543. if dataset.Name == option.DatasetsName {
  544. option.DatasetsId = dataset.Id
  545. return nil
  546. }
  547. }
  548. return errors.New("failed to get DatasetsId")
  549. }
  550. func (o *OctopusLink) generateImageId(ctx context.Context, option *option.AiOption) error {
  551. if option.TaskType == "" {
  552. return errors.New("TaskType not set")
  553. }
  554. req := &octopus.GetUserImageListReq{
  555. Platform: o.platform,
  556. PageIndex: o.pageIndex,
  557. PageSize: o.pageSize,
  558. }
  559. resp, err := o.octopusRpc.GetUserImageList(ctx, req)
  560. if err != nil {
  561. return err
  562. }
  563. if !resp.Success {
  564. return errors.New("failed to get imageId")
  565. }
  566. if option.ResourceType == CPU {
  567. for _, img := range resp.Payload.Images {
  568. if img.Image.ImageName == "test-image" {
  569. option.ImageId = img.Image.Id
  570. return nil
  571. }
  572. }
  573. }
  574. preImgReq := &octopus.GetPresetImageListReq{
  575. Platform: o.platform,
  576. PageIndex: o.pageIndex,
  577. PageSize: o.pageSize,
  578. }
  579. preImgResp, err := o.octopusRpc.GetPresetImageList(ctx, preImgReq)
  580. if err != nil {
  581. return err
  582. }
  583. if !preImgResp.Success {
  584. return errors.New("failed to get PresetImages")
  585. }
  586. if option.ResourceType == CARD {
  587. for _, image := range preImgResp.Payload.Images {
  588. if strings.Contains(image.ImageName, cardAliasMap[option.ComputeCard]) {
  589. option.ImageId = image.Id
  590. return nil
  591. }
  592. }
  593. }
  594. return errors.New("failed to get ImageId")
  595. }
  596. func (o *OctopusLink) generateAlgorithmId(ctx context.Context, option *option.AiOption) error {
  597. req := &octopus.GetMyAlgorithmListReq{
  598. Platform: o.platform,
  599. PageIndex: o.pageIndex,
  600. PageSize: o.pageSize,
  601. }
  602. resp, err := o.octopusRpc.GetMyAlgorithmList(ctx, req)
  603. if err != nil {
  604. return err
  605. }
  606. if !resp.Success {
  607. return errors.New("failed to get algorithmId")
  608. }
  609. for _, algorithm := range resp.Payload.Algorithms {
  610. if algorithm.FrameworkName == strings.Title(option.TaskType) {
  611. ns := strings.Split(algorithm.AlgorithmName, UNDERSCORE)
  612. if ns[0] != option.DatasetsName {
  613. continue
  614. }
  615. if ns[1] != option.AlgorithmName {
  616. continue
  617. }
  618. switch option.ResourceType {
  619. case CPU:
  620. if ns[2] != CPU {
  621. continue
  622. }
  623. case CARD:
  624. if ns[2] != strings.ToLower(option.ComputeCard) {
  625. continue
  626. }
  627. }
  628. option.AlgorithmId = algorithm.AlgorithmId
  629. return nil
  630. }
  631. }
  632. if option.AlgorithmId == "" {
  633. return errors.New("Algorithm does not exist")
  634. }
  635. return errors.New("failed to get AlgorithmId")
  636. }
  637. func (o *OctopusLink) generateCmd(option *option.AiOption) error {
  638. if option.Cmd == "" {
  639. switch option.ComputeCard {
  640. case GCU:
  641. option.Cmd = "cd /code; python3 train.py"
  642. default:
  643. option.Cmd = TRAIN_CMD
  644. }
  645. }
  646. return nil
  647. }
  648. func (o *OctopusLink) generateEnv(option *option.AiOption) error {
  649. return nil
  650. }
  651. func (o *OctopusLink) generateParams(option *option.AiOption) error {
  652. if len(option.Params) == 0 {
  653. epoch := "epoch" + COMMA + "1"
  654. option.Params = append(option.Params, epoch)
  655. }
  656. return nil
  657. }
  658. func setResourceIdByCard(option *option.AiOption, specs *octopus.GetResourceSpecsResp, computeCard string) error {
  659. if option.Tops == 0 {
  660. for _, spec := range specs.TrainResourceSpecs {
  661. if spec.Price == 1 {
  662. ns := strings.Split(spec.Name, COMMA)
  663. cardSpecs := strings.Split(ns[0], STAR)
  664. if cardSpecs[1] == computeCard {
  665. option.ResourceId = spec.Id
  666. option.ComputeCard = computeCard
  667. return nil
  668. }
  669. } else {
  670. continue
  671. }
  672. }
  673. } else {
  674. cardNum := math.Ceil(option.Tops / float64(BASE_TOPS))
  675. for _, spec := range specs.TrainResourceSpecs {
  676. if option.Tops < BASE_TOPS {
  677. if spec.Price == 1 {
  678. ns := strings.Split(spec.Name, COMMA)
  679. cardSpecs := strings.Split(ns[0], STAR)
  680. if cardSpecs[1] == computeCard {
  681. option.ResourceId = spec.Id
  682. option.ComputeCard = computeCard
  683. return nil
  684. }
  685. } else {
  686. continue
  687. }
  688. } else {
  689. ns := strings.Split(spec.Name, COMMA)
  690. if len(ns) != 4 {
  691. continue
  692. }
  693. cardSpecs := strings.Split(ns[0], STAR)
  694. if cardSpecs[1] != computeCard {
  695. continue
  696. }
  697. s, err := strconv.ParseFloat(cardSpecs[0], 64)
  698. if err != nil {
  699. return err
  700. }
  701. switch computeCard {
  702. case GCU:
  703. if cardNum == s { // 1, 4, 8
  704. option.ResourceId = spec.Id
  705. option.ComputeCard = computeCard
  706. return nil
  707. }
  708. if 1 < cardNum && cardNum <= 4 && s == 4 {
  709. option.ResourceId = spec.Id
  710. option.ComputeCard = computeCard
  711. return nil
  712. }
  713. if 4 < cardNum && s == 8 {
  714. option.ResourceId = spec.Id
  715. option.ComputeCard = computeCard
  716. return nil
  717. }
  718. case MLU: // 1, 2, 4
  719. if cardNum/2 == s {
  720. option.ResourceId = spec.Id
  721. option.ComputeCard = computeCard
  722. return nil
  723. }
  724. if 1 < cardNum/2 && cardNum/2 <= 2 && s == 2 {
  725. option.ResourceId = spec.Id
  726. option.ComputeCard = computeCard
  727. return nil
  728. }
  729. if 2 < cardNum/2 && s == 4 {
  730. option.ResourceId = spec.Id
  731. option.ComputeCard = computeCard
  732. return nil
  733. }
  734. }
  735. }
  736. }
  737. }
  738. return errors.New("set ResourceId error")
  739. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive, autonomous, peer-to-peer manner.