You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

octopus.go 32 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281
  1. /*
  2. Copyright (c) [2023] [pcm]
  3. [pcm-coordinator] is licensed under Mulan PSL v2.
  4. You can use this software according to the terms and conditions of the Mulan PSL v2.
  5. You may obtain a copy of Mulan PSL v2 at:
  6. http://license.coscl.org.cn/MulanPSL2
  7. THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
  8. EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
  9. MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
  10. See the Mulan PSL v2 for more details.
  11. */
  12. package storeLink
  13. import (
  14. "bufio"
  15. "context"
  16. "errors"
  17. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
  18. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
  19. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference"
  20. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
  21. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
  22. "gitlink.org.cn/JointCloud/pcm-octopus/octopus"
  23. "gitlink.org.cn/JointCloud/pcm-octopus/octopusclient"
  24. "io"
  25. "math"
  26. "mime/multipart"
  27. "strconv"
  28. "strings"
  29. "time"
  30. )
  31. type OctopusLink struct {
  32. octopusRpc octopusclient.Octopus
  33. pageIndex int32
  34. pageSize int32
  35. platform string
  36. participantId int64
  37. }
  // Constants shared by the Octopus link implementation.
  const (
  	// Prefixes for the random image name/version generated in UploadImage and
  	// for the random training job name generated in SubmitTask.
  	IMG_NAME_PREFIX    = "oct_"
  	IMG_VERSION_PREFIX = "version_"
  	TASK_NAME_PREFIX   = "trainJob"

  	// RESOURCE_POOL is the resource pool name sent with spec queries and
  	// training jobs.
  	RESOURCE_POOL = "common-pool"

  	// VERSION is used as both the dataset version and the algorithm version.
  	VERSION = "V1"

  	// DOMAIN is the domain passed to the DownloadAlgorithm request.
  	DOMAIN = "http://192.168.242.41:8001/"

  	// TRAIN_CMD is the fallback training command (see generateCmdForTraining).
  	TRAIN_CMD = "cd /code; python train.py"

  	// Card identifiers; they are the keys of the card lookup maps.
  	MLU    = "MLU"
  	GCU    = "GCU"
  	BIV100 = "BI-V100"

  	// Vendor aliases that cardAliasMap associates with the card identifiers.
  	CAMBRICON = "cambricon"
  	ENFLAME   = "enflame"
  	ILUVATAR  = "iluvatar"

  	// Chinese display names that cardCnMap associates with the card identifiers.
  	CAMBRICON_CN = "寒武纪290"
  	ENFLAME_CN   = "燧原T20"
  	ILUVATAR_CN  = "天数BI-V100"

  	// Per-card values stored in cardTopsMap, and the base value used to derive
  	// a card count from a requested Tops figure.
  	CAMBRICONMLU290 = 256
  	EnflameT20      = 128
  	BASE_TOPS       = 128

  	// Additional vendor name spellings; not referenced in the visible part of
  	// this file.
  	HANWUJI   = "hanwuji"
  	SUIYUAN   = "suiyuan"
  	SAILINGSI = "sailingsi"
  )
  62. var (
  63. cardAliasMap = map[string]string{
  64. MLU: CAMBRICON,
  65. GCU: ENFLAME,
  66. BIV100: ILUVATAR,
  67. }
  68. cardCnMap = map[string]string{
  69. MLU: CAMBRICON_CN,
  70. GCU: ENFLAME_CN,
  71. BIV100: ILUVATAR_CN,
  72. }
  73. cardTopsMap = map[string]float64{
  74. MLU: CAMBRICONMLU290,
  75. GCU: EnflameT20,
  76. }
  77. CardModelNameCmdMap = map[string]map[string]string{
  78. BIV100: {"blip-image-captioning-base": "pip install -U transformers; pip install fastapi uvicorn[standard]; pip install python-multipart; cd /code; python infer_biv100.py",
  79. "imagenet_resnet50": "pip install -U transformers; pip install fastapi uvicorn[standard]; pip install python-multipart; cd /code/infer; python infer_biv100.py",
  80. "chatGLM_6B": "su root; pip install transformers==4.33.2; pip install fastapi uvicorn[standard]; cd /code; python infer_biv100.py"},
  81. MLU: {"blip-image-captioning-base": "",
  82. "imagenet_resnet50": "su root; . /torch/venv3/pytorch/bin/activate; pip install fastapi uvicorn[standard]; pip install python-multipart; cd /code/infer; python infer_mlu.py",
  83. "chatGLM_6B": ""},
  84. }
  85. )
  86. func NewOctopusLink(octopusRpc octopusclient.Octopus, name string, id int64) *OctopusLink {
  87. return &OctopusLink{octopusRpc: octopusRpc, platform: name, participantId: id, pageIndex: 1, pageSize: 100}
  88. }
  89. func (o *OctopusLink) UploadImage(ctx context.Context, path string) (interface{}, error) {
  90. // octopus创建镜像
  91. createReq := &octopus.CreateImageReq{
  92. Platform: o.platform,
  93. CreateImage: &octopus.CreateImage{
  94. SourceType: 1,
  95. ImageName: IMG_NAME_PREFIX + utils.RandomString(7),
  96. ImageVersion: IMG_VERSION_PREFIX + utils.RandomString(7),
  97. },
  98. }
  99. createResp, err := o.octopusRpc.CreateImage(ctx, createReq)
  100. if err != nil {
  101. return nil, err
  102. }
  103. // octopus上传镜像
  104. uploadReq := &octopus.UploadImageReq{
  105. Platform: o.platform,
  106. ImageId: createResp.Payload.ImageId,
  107. Params: &octopus.UploadImageParam{
  108. Domain: "",
  109. FileName: "",
  110. },
  111. }
  112. uploadResp, err := o.octopusRpc.UploadImage(ctx, uploadReq)
  113. if err != nil {
  114. return nil, err
  115. }
  116. // Todo 实际上传
  117. return uploadResp, nil
  118. }
  119. func (o *OctopusLink) DeleteImage(ctx context.Context, imageId string) (interface{}, error) {
  120. // octopus删除镜像
  121. req := &octopus.DeleteImageReq{
  122. Platform: o.platform,
  123. ImageId: imageId,
  124. }
  125. resp, err := o.octopusRpc.DeleteImage(ctx, req)
  126. if err != nil {
  127. return nil, err
  128. }
  129. return resp, nil
  130. }
  131. func (o *OctopusLink) QueryImageList(ctx context.Context) (interface{}, error) {
  132. // octopus获取镜像列表
  133. req := &octopus.GetUserImageListReq{
  134. Platform: o.platform,
  135. PageIndex: o.pageIndex,
  136. PageSize: o.pageSize,
  137. }
  138. resp, err := o.octopusRpc.GetUserImageList(ctx, req)
  139. if err != nil {
  140. return nil, err
  141. }
  142. return resp, nil
  143. }
  144. func (o *OctopusLink) SubmitTask(ctx context.Context, imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string, aiType string) (interface{}, error) {
  145. // octopus提交任务
  146. // python参数
  147. var prms []*octopus.Parameters
  148. for _, param := range params {
  149. var p octopus.Parameters
  150. s := strings.Split(param, COMMA)
  151. p.Key = s[0]
  152. p.Value = s[1]
  153. prms = append(prms, &p)
  154. }
  155. //环境变量
  156. envMap := make(map[string]string)
  157. for _, env := range envs {
  158. s := strings.Split(env, COMMA)
  159. envMap[s[0]] = s[1]
  160. }
  161. req := &octopus.CreateTrainJobReq{
  162. Platform: o.platform,
  163. Params: &octopus.CreateTrainJobParam{
  164. ImageId: imageId,
  165. Name: TASK_NAME_PREFIX + UNDERSCORE + utils.RandomString(10),
  166. ResourcePool: RESOURCE_POOL,
  167. Config: []*octopus.Config{
  168. {
  169. Command: cmd,
  170. ResourceSpecId: resourceId,
  171. MinFailedTaskCount: 1,
  172. MinSucceededTaskCount: 1,
  173. TaskNumber: 1,
  174. Parameters: prms,
  175. Envs: envMap,
  176. },
  177. },
  178. DataSetId: datasetsId,
  179. DataSetVersion: VERSION,
  180. AlgorithmId: algorithmId,
  181. AlgorithmVersion: VERSION,
  182. },
  183. }
  184. resp, err := o.octopusRpc.CreateTrainJob(ctx, req)
  185. if err != nil {
  186. return nil, err
  187. }
  188. return resp, nil
  189. }
  190. func (o *OctopusLink) QueryTask(ctx context.Context, taskId string) (interface{}, error) {
  191. // octopus获取任务
  192. req := &octopus.GetTrainJobReq{
  193. Platform: o.platform,
  194. Id: taskId,
  195. }
  196. resp, err := o.octopusRpc.GetTrainJob(ctx, req)
  197. if err != nil {
  198. return nil, err
  199. }
  200. return resp, nil
  201. }
  202. func (o *OctopusLink) DeleteTask(ctx context.Context, taskId string) (interface{}, error) {
  203. // octopus删除任务
  204. req := &octopus.DeleteTrainJobReq{
  205. Platform: o.platform,
  206. JobIds: []string{taskId},
  207. }
  208. resp, err := o.octopusRpc.DeleteTrainJob(ctx, req)
  209. if err != nil {
  210. return nil, err
  211. }
  212. return resp, nil
  213. }
  214. func (o *OctopusLink) QuerySpecs(ctx context.Context) (interface{}, error) {
  215. // octopus查询资源规格
  216. req := &octopus.GetResourceSpecsReq{
  217. Platform: o.platform,
  218. ResourcePool: RESOURCE_POOL,
  219. }
  220. resp, err := o.octopusRpc.GetResourceSpecs(ctx, req)
  221. if err != nil {
  222. return nil, err
  223. }
  224. return resp, nil
  225. }
  226. func (o *OctopusLink) GetResourceStats(ctx context.Context) (*collector.ResourceStats, error) {
  227. req := &octopus.GetResourceSpecsReq{
  228. Platform: o.platform,
  229. ResourcePool: RESOURCE_POOL,
  230. }
  231. specResp, err := o.octopusRpc.GetResourceSpecs(ctx, req)
  232. if err != nil {
  233. return nil, err
  234. }
  235. if !specResp.Success {
  236. return nil, errors.New(specResp.Error.Message)
  237. }
  238. balanceReq := &octopus.GetUserBalanceReq{
  239. Platform: o.platform,
  240. }
  241. balanceResp, err := o.octopusRpc.GetUserBalance(ctx, balanceReq)
  242. if err != nil {
  243. return nil, err
  244. }
  245. if !balanceResp.Success {
  246. return nil, errors.New(balanceResp.Error.Message)
  247. }
  248. var cards []*collector.Card
  249. balance := float64(balanceResp.Payload.BillingUser.Amount)
  250. var cpuHours float64
  251. for _, spec := range specResp.TrainResourceSpecs {
  252. if spec.Price == 0 {
  253. ns := strings.Split(spec.Name, COMMA)
  254. if len(ns) == 2 {
  255. nss := strings.Split(ns[0], COLON)
  256. if nss[0] == CPU {
  257. cpuHours = -1
  258. }
  259. }
  260. }
  261. if spec.Price == 1 {
  262. ns := strings.Split(spec.Name, COMMA)
  263. cardSpecs := strings.Split(ns[0], STAR)
  264. cardTops, isMapContainsKey := cardTopsMap[cardSpecs[1]]
  265. if !isMapContainsKey {
  266. continue
  267. }
  268. card := &collector.Card{
  269. Platform: OCTOPUS,
  270. Type: CARD,
  271. Name: cardSpecs[1],
  272. TOpsAtFp16: cardTops,
  273. CardHours: balance / spec.Price,
  274. }
  275. cards = append(cards, card)
  276. }
  277. }
  278. resourceStats := &collector.ResourceStats{
  279. ClusterId: strconv.FormatInt(o.participantId, 10),
  280. Name: o.platform,
  281. Balance: balance,
  282. CardsAvail: cards,
  283. CpuCoreHours: cpuHours,
  284. }
  285. return resourceStats, nil
  286. }
  287. func (o *OctopusLink) GetDatasetsSpecs(ctx context.Context) ([]*collector.DatasetsSpecs, error) {
  288. req := &octopus.GetMyDatasetListReq{
  289. Platform: o.platform,
  290. PageIndex: o.pageIndex,
  291. PageSize: o.pageSize,
  292. }
  293. resp, err := o.octopusRpc.GetMyDatasetList(ctx, req)
  294. if err != nil {
  295. return nil, err
  296. }
  297. if !resp.Success {
  298. return nil, errors.New(resp.Error.Message)
  299. }
  300. specs := []*collector.DatasetsSpecs{}
  301. for _, dataset := range resp.Payload.Datasets {
  302. spec := &collector.DatasetsSpecs{Name: dataset.Name}
  303. specs = append(specs, spec)
  304. }
  305. return specs, nil
  306. }
  307. func (o *OctopusLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm, error) {
  308. var algorithms []*collector.Algorithm
  309. req := &octopus.GetMyAlgorithmListReq{
  310. Platform: o.platform,
  311. PageIndex: o.pageIndex,
  312. PageSize: o.pageSize,
  313. }
  314. resp, err := o.octopusRpc.GetMyAlgorithmList(ctx, req)
  315. if err != nil {
  316. return nil, err
  317. }
  318. if !resp.Success {
  319. return nil, errors.New("failed to get algorithms")
  320. }
  321. for _, a := range resp.Payload.Algorithms {
  322. algorithm := &collector.Algorithm{Name: a.AlgorithmName, Platform: OCTOPUS, TaskType: strings.ToLower(a.FrameworkName)}
  323. algorithms = append(algorithms, algorithm)
  324. }
  325. return algorithms, nil
  326. }
  327. func (o *OctopusLink) GetComputeCards(ctx context.Context) ([]string, error) {
  328. var cards []string
  329. for s, _ := range cardAliasMap {
  330. cards = append(cards, s)
  331. }
  332. return cards, nil
  333. }
  334. func (o *OctopusLink) GetUserBalance(ctx context.Context) (float64, error) {
  335. balanceReq := &octopus.GetUserBalanceReq{
  336. Platform: o.platform,
  337. }
  338. balanceResp, err := o.octopusRpc.GetUserBalance(ctx, balanceReq)
  339. if err != nil {
  340. return 0, err
  341. }
  342. if !balanceResp.Success {
  343. if balanceResp.Error != nil {
  344. return 0, errors.New(balanceResp.Error.Message)
  345. } else {
  346. return 0, errors.New("failed to get user balance")
  347. }
  348. }
  349. balance := float64(balanceResp.Payload.BillingUser.Amount)
  350. return balance, nil
  351. }
  352. func (o *OctopusLink) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) {
  353. var name string
  354. if resourceType == CARD {
  355. name = dataset + UNDERSCORE + algorithm + UNDERSCORE + card
  356. } else {
  357. name = dataset + UNDERSCORE + algorithm + UNDERSCORE + CPU
  358. }
  359. req := &octopus.GetMyAlgorithmListReq{
  360. Platform: o.platform,
  361. PageIndex: o.pageIndex,
  362. PageSize: o.pageSize,
  363. }
  364. resp, err := o.octopusRpc.GetMyAlgorithmList(ctx, req)
  365. if err != nil {
  366. return "", err
  367. }
  368. if !resp.Success {
  369. return "", errors.New("failed to get algorithmList")
  370. }
  371. var algorithmId string
  372. var algorithms []*octopus.Algorithms
  373. for _, a := range resp.Payload.Algorithms {
  374. if strings.ToLower(a.FrameworkName) != taskType {
  375. continue
  376. }
  377. if a.AlgorithmDescript == name {
  378. algorithms = append(algorithms, a)
  379. }
  380. }
  381. if len(algorithms) == 0 {
  382. return "", errors.New("algorithmId not found")
  383. }
  384. if len(algorithms) == 1 {
  385. algorithmId = algorithms[0].AlgorithmId
  386. }
  387. aLatest := &octopus.Algorithms{}
  388. for i, _ := range algorithms {
  389. if time.Unix(algorithms[i].CreatedAt, 0).After(time.Unix(aLatest.CreatedAt, 0)) {
  390. aLatest = algorithms[i]
  391. }
  392. }
  393. if aLatest.AlgorithmId == "" {
  394. return "", errors.New("algorithmId not found")
  395. }
  396. algorithmId = aLatest.AlgorithmId
  397. dcReq := &octopus.DownloadCompressReq{
  398. Platform: o.platform,
  399. Version: VERSION,
  400. AlgorithmId: algorithmId,
  401. }
  402. dcResp, err := o.octopusRpc.DownloadCompress(ctx, dcReq)
  403. if err != nil {
  404. return "", err
  405. }
  406. if !dcResp.Success {
  407. return "", errors.New(dcResp.Error.Message)
  408. }
  409. daReq := &octopus.DownloadAlgorithmReq{
  410. Platform: o.platform,
  411. Version: VERSION,
  412. AlgorithmId: algorithmId,
  413. CompressAt: dcResp.Payload.CompressAt,
  414. Domain: DOMAIN,
  415. }
  416. daResp, err := o.octopusRpc.DownloadAlgorithm(ctx, daReq)
  417. if err != nil {
  418. return "", err
  419. }
  420. if !daResp.Success {
  421. return "", errors.New(dcResp.Error.Message)
  422. }
  423. urlReq := &octopus.AlgorithmUrlReq{
  424. Platform: o.platform,
  425. Url: daResp.Payload.DownloadUrl,
  426. }
  427. urlResp, err := o.octopusRpc.DownloadAlgorithmUrl(ctx, urlReq)
  428. if err != nil {
  429. return "", err
  430. }
  431. return urlResp.Algorithm, nil
  432. }
  433. func (o *OctopusLink) UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error {
  434. //var name string
  435. //if resourceType == CARD {
  436. // name = dataset + UNDERSCORE + algorithm + UNDERSCORE + card
  437. //} else {
  438. // name = dataset + UNDERSCORE + algorithm + UNDERSCORE + CPU
  439. //}
  440. //uploadReq := &octopus.UploadAlgorithmReq{}
  441. return nil
  442. }
  443. func (o *OctopusLink) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) {
  444. instance, err := strconv.ParseInt(instanceNum, 10, 32)
  445. if err != nil {
  446. return "", err
  447. }
  448. req := &octopus.GetTrainJobLogReq{
  449. Platform: o.platform,
  450. TaskId: taskId,
  451. TaskNum: "task0",
  452. Num: int32(instance),
  453. }
  454. resp, err := o.octopusRpc.GetTrainJobLog(ctx, req)
  455. if err != nil {
  456. return "", err
  457. }
  458. if strings.Contains(resp.Content, "404 Not Found") {
  459. resp.Content = "waiting for logs..."
  460. }
  461. return resp.Content, nil
  462. }
  463. func (o *OctopusLink) GetTrainingTask(ctx context.Context, taskId string) (*collector.Task, error) {
  464. resp, err := o.QueryTask(ctx, taskId)
  465. if err != nil {
  466. return nil, err
  467. }
  468. jobresp, ok := (resp).(*octopus.GetTrainJobResp)
  469. if !jobresp.Success || !ok {
  470. if jobresp.Error != nil {
  471. return nil, errors.New(jobresp.Error.Message)
  472. } else {
  473. return nil, errors.New("get training task failed, empty error returned")
  474. }
  475. }
  476. var task collector.Task
  477. task.Id = jobresp.Payload.TrainJob.Id
  478. if jobresp.Payload.TrainJob.StartedAt != 0 {
  479. task.Start = time.Unix(jobresp.Payload.TrainJob.StartedAt, 0).Format(constants.Layout)
  480. }
  481. if jobresp.Payload.TrainJob.CompletedAt != 0 {
  482. task.End = time.Unix(jobresp.Payload.TrainJob.CompletedAt, 0).Format(constants.Layout)
  483. }
  484. switch jobresp.Payload.TrainJob.Status {
  485. case "succeeded":
  486. task.Status = constants.Completed
  487. case "failed":
  488. task.Status = constants.Failed
  489. case "running":
  490. task.Status = constants.Running
  491. case "stopped":
  492. task.Status = constants.Stopped
  493. case "pending":
  494. task.Status = constants.Pending
  495. default:
  496. task.Status = "undefined"
  497. }
  498. return &task, nil
  499. }
  500. func (o *OctopusLink) Execute(ctx context.Context, option *option.AiOption) (interface{}, error) {
  501. err := o.GenerateSubmitParams(ctx, option)
  502. if err != nil {
  503. return nil, err
  504. }
  505. task, err := o.SubmitTask(ctx, option.ImageId, option.Cmd, option.Envs, option.Params, option.ResourceId, option.DatasetsId, option.AlgorithmId, option.TaskType)
  506. if err != nil {
  507. return nil, err
  508. }
  509. return task, nil
  510. }
  511. func (o *OctopusLink) GenerateSubmitParams(ctx context.Context, option *option.AiOption) error {
  512. err := o.generateResourceId(ctx, option, nil)
  513. if err != nil {
  514. return err
  515. }
  516. err = o.generateDatasetsId(ctx, option)
  517. if err != nil {
  518. return err
  519. }
  520. err = o.generateImageId(ctx, option, nil)
  521. if err != nil {
  522. return err
  523. }
  524. err = o.generateAlgorithmId(ctx, option, nil)
  525. if err != nil {
  526. return err
  527. }
  528. err = o.generateCmd(option, nil)
  529. if err != nil {
  530. return err
  531. }
  532. err = o.generateEnv(option)
  533. if err != nil {
  534. return err
  535. }
  536. err = o.generateParams(option)
  537. if err != nil {
  538. return err
  539. }
  540. return nil
  541. }
  542. func (o *OctopusLink) generateResourceId(ctx context.Context, option *option.AiOption, ifoption *option.InferOption) error {
  543. req := &octopus.GetResourceSpecsReq{
  544. Platform: o.platform,
  545. ResourcePool: RESOURCE_POOL,
  546. }
  547. specResp, err := o.octopusRpc.GetResourceSpecs(ctx, req)
  548. if err != nil {
  549. return err
  550. }
  551. if !specResp.Success {
  552. return errors.New(specResp.Error.Message)
  553. }
  554. if option != nil {
  555. err = generateResourceIdForTraining(option, specResp)
  556. if err != nil {
  557. return err
  558. }
  559. return nil
  560. }
  561. if ifoption != nil {
  562. err = generateResourceIdForInferDeployInstance(ifoption, specResp)
  563. if err != nil {
  564. return err
  565. }
  566. return nil
  567. }
  568. return errors.New("failed to set ResourceId")
  569. }
  570. func generateResourceIdForTraining(option *option.AiOption, specResp *octopus.GetResourceSpecsResp) error {
  571. if option.ResourceType == "" {
  572. return errors.New("ResourceType not set")
  573. }
  574. if option.ResourceType == CPU {
  575. for _, spec := range specResp.TrainResourceSpecs {
  576. if spec.Price == 0 {
  577. option.ResourceId = spec.Id
  578. return nil
  579. }
  580. }
  581. }
  582. if option.ResourceType == CARD {
  583. if option.ComputeCard == "" {
  584. option.ComputeCard = GCU
  585. }
  586. err := setResourceIdByCard(option, specResp, option.ComputeCard)
  587. if err != nil {
  588. return err
  589. }
  590. return nil
  591. }
  592. return errors.New("ResourceType not set")
  593. }
  594. func generateResourceIdForInferDeployInstance(option *option.InferOption, specResp *octopus.GetResourceSpecsResp) error {
  595. // temporarily use bi-v100
  596. cardName, ok := cardCnMap[BIV100]
  597. if !ok {
  598. errors.New("computeCard not set")
  599. }
  600. // set computeCard
  601. option.ComputeCard = BIV100
  602. for _, spec := range specResp.TrainResourceSpecs {
  603. names := strings.Split(spec.Name, COMMA)
  604. if len(names) != 4 {
  605. continue
  606. }
  607. ns := strings.Split(names[0], STAR)
  608. if len(ns) != 2 {
  609. continue
  610. }
  611. if ns[0] == "1" && ns[1] == cardName {
  612. option.ResourceId = spec.Id
  613. return nil
  614. }
  615. }
  616. return errors.New("failed to set ResourceId")
  617. }
  618. func (o *OctopusLink) generateDatasetsId(ctx context.Context, option *option.AiOption) error {
  619. if option.DatasetsName == "" {
  620. return errors.New("DatasetsName not set")
  621. }
  622. req := &octopus.GetMyDatasetListReq{
  623. Platform: o.platform,
  624. PageIndex: o.pageIndex,
  625. PageSize: o.pageSize,
  626. }
  627. resp, err := o.octopusRpc.GetMyDatasetList(ctx, req)
  628. if err != nil {
  629. return err
  630. }
  631. if !resp.Success {
  632. return errors.New("failed to get DatasetsId")
  633. }
  634. for _, dataset := range resp.Payload.Datasets {
  635. if dataset.Name == option.DatasetsName {
  636. option.DatasetsId = dataset.Id
  637. return nil
  638. }
  639. }
  640. return errors.New("failed to get DatasetsId")
  641. }
  642. func (o *OctopusLink) generateImageId(ctx context.Context, option *option.AiOption, ifoption *option.InferOption) error {
  643. preImgReq := &octopus.GetPresetImageListReq{
  644. Platform: o.platform,
  645. PageIndex: o.pageIndex,
  646. PageSize: o.pageSize,
  647. }
  648. preImgResp, err := o.octopusRpc.GetPresetImageList(ctx, preImgReq)
  649. if err != nil {
  650. return err
  651. }
  652. if !preImgResp.Success {
  653. return errors.New("failed to get PresetImages")
  654. }
  655. if option != nil {
  656. if option.TaskType == "" {
  657. return errors.New("TaskType not set")
  658. }
  659. req := &octopus.GetUserImageListReq{
  660. Platform: o.platform,
  661. PageIndex: o.pageIndex,
  662. PageSize: o.pageSize,
  663. }
  664. resp, err := o.octopusRpc.GetUserImageList(ctx, req)
  665. if err != nil {
  666. return err
  667. }
  668. if !resp.Success {
  669. return errors.New("failed to get imageId")
  670. }
  671. if option.ResourceType == CPU {
  672. for _, img := range resp.Payload.Images {
  673. if img.Image.ImageName == "test-image" {
  674. option.ImageId = img.Image.Id
  675. return nil
  676. }
  677. }
  678. }
  679. err = generateImageIdForTraining(option, preImgResp)
  680. if err != nil {
  681. return err
  682. }
  683. return nil
  684. }
  685. if ifoption != nil {
  686. err = generateImageIdForInferDeployInstance(ifoption, preImgResp)
  687. if err != nil {
  688. return err
  689. }
  690. return nil
  691. }
  692. return errors.New("failed to get ImageId")
  693. }
  694. func generateImageIdForTraining(option *option.AiOption, preImgResp *octopus.GetPresetImageListResp) error {
  695. if option.ResourceType == CARD {
  696. for _, image := range preImgResp.Payload.Images {
  697. if strings.Contains(image.ImageName, cardAliasMap[strings.ToUpper(option.ComputeCard)]) {
  698. switch strings.ToUpper(option.ComputeCard) {
  699. case GCU:
  700. if strings.HasPrefix(image.ImageVersion, "t20_") {
  701. option.ImageId = image.Id
  702. return nil
  703. }
  704. case BIV100:
  705. if strings.HasPrefix(image.ImageVersion, "bi_") {
  706. option.ImageId = image.Id
  707. return nil
  708. }
  709. case MLU:
  710. option.ImageId = image.Id
  711. return nil
  712. }
  713. }
  714. }
  715. }
  716. return errors.New("failed to set ImageId")
  717. }
  718. func generateImageIdForInferDeployInstance(option *option.InferOption, preImgResp *octopus.GetPresetImageListResp) error {
  719. for _, image := range preImgResp.Payload.Images {
  720. // temporarily use bi-v100
  721. if strings.Contains(image.ImageName, cardAliasMap[strings.ToUpper(BIV100)]) {
  722. switch strings.ToUpper(BIV100) {
  723. case GCU:
  724. if strings.HasPrefix(image.ImageVersion, "t20_") {
  725. option.ImageId = image.Id
  726. return nil
  727. }
  728. case BIV100:
  729. if strings.HasPrefix(image.ImageVersion, "bi_") {
  730. option.ImageId = image.Id
  731. return nil
  732. }
  733. case MLU:
  734. option.ImageId = image.Id
  735. return nil
  736. }
  737. }
  738. }
  739. return errors.New("failed to set ImageId")
  740. }
  741. func (o *OctopusLink) generateAlgorithmId(ctx context.Context, option *option.AiOption, ifoption *option.InferOption) error {
  742. req := &octopus.GetMyAlgorithmListReq{
  743. Platform: o.platform,
  744. PageIndex: o.pageIndex,
  745. PageSize: o.pageSize,
  746. }
  747. resp, err := o.octopusRpc.GetMyAlgorithmList(ctx, req)
  748. if err != nil {
  749. return err
  750. }
  751. if !resp.Success {
  752. return errors.New("failed to get algorithmId")
  753. }
  754. if option != nil {
  755. err = generateAlgorithmIdForTraining(option, resp)
  756. if err != nil {
  757. return err
  758. }
  759. return nil
  760. }
  761. if ifoption != nil {
  762. err = generateAlgorithmIdForInferDeployInstance(ifoption, resp)
  763. if err != nil {
  764. return err
  765. }
  766. return nil
  767. }
  768. return errors.New("failed to set AlgorithmId")
  769. }
  770. func generateAlgorithmIdForTraining(option *option.AiOption, resp *octopus.GetMyAlgorithmListResp) error {
  771. for _, algorithm := range resp.Payload.Algorithms {
  772. if algorithm.FrameworkName == strings.Title(option.TaskType) {
  773. ns := strings.Split(algorithm.AlgorithmName, UNDERSCORE)
  774. if ns[0] != option.DatasetsName {
  775. continue
  776. }
  777. if ns[1] != option.AlgorithmName {
  778. continue
  779. }
  780. switch option.ResourceType {
  781. case CPU:
  782. if ns[2] != CPU {
  783. continue
  784. }
  785. case CARD:
  786. if ns[2] != strings.ToLower(option.ComputeCard) {
  787. continue
  788. }
  789. }
  790. option.AlgorithmId = algorithm.AlgorithmId
  791. return nil
  792. }
  793. }
  794. return errors.New("Algorithm does not exist")
  795. }
  796. func generateAlgorithmIdForInferDeployInstance(option *option.InferOption, resp *octopus.GetMyAlgorithmListResp) error {
  797. if option.ModelType == "" {
  798. return errors.New("ModelType not set")
  799. }
  800. if option.ModelName == "" {
  801. return errors.New("ModelName not set")
  802. }
  803. for _, algorithm := range resp.Payload.Algorithms {
  804. if strings.Contains(algorithm.AlgorithmName, option.ModelName) {
  805. option.AlgorithmId = algorithm.AlgorithmId
  806. return nil
  807. }
  808. }
  809. return errors.New("ModelName does not exist")
  810. }
  811. func (o *OctopusLink) generateCmd(option *option.AiOption, ifoption *option.InferOption) error {
  812. if option != nil {
  813. err := generateCmdForTraining(option)
  814. if err != nil {
  815. return err
  816. }
  817. return nil
  818. }
  819. if ifoption != nil {
  820. err := generateCmdForInferDeployInstance(ifoption)
  821. if err != nil {
  822. return err
  823. }
  824. return nil
  825. }
  826. return errors.New("failed to set cmd")
  827. }
  828. func generateCmdForTraining(option *option.AiOption) error {
  829. if option.Cmd == "" {
  830. switch option.ComputeCard {
  831. case GCU:
  832. option.Cmd = "cd /code; python3 train.py"
  833. case MLU:
  834. option.Cmd = ". /torch/venv3/pytorch/bin/activate; cd /code; python train.py"
  835. default:
  836. option.Cmd = TRAIN_CMD
  837. }
  838. }
  839. return nil
  840. }
  841. func generateCmdForInferDeployInstance(option *option.InferOption) error {
  842. if option.Cmd == "" {
  843. nameCmd, ok := CardModelNameCmdMap[option.ComputeCard]
  844. if !ok {
  845. return errors.New("failed to set cmd, ComputeCard not exist")
  846. }
  847. cmd, ok := nameCmd[option.ModelName]
  848. if !ok {
  849. return errors.New("failed to set cmd, ModelName not exist")
  850. }
  851. option.Cmd = cmd
  852. return nil
  853. }
  854. return nil
  855. }
  856. func (o *OctopusLink) generateEnv(option *option.AiOption) error {
  857. return nil
  858. }
  859. func (o *OctopusLink) generateParams(option *option.AiOption) error {
  860. if len(option.Params) == 0 {
  861. epoch := "epoch" + COMMA + "1"
  862. option.Params = append(option.Params, epoch)
  863. }
  864. return nil
  865. }
  866. func setResourceIdByCard(option *option.AiOption, specs *octopus.GetResourceSpecsResp, computeCard string) error {
  867. if option.Tops == 0 {
  868. for _, spec := range specs.TrainResourceSpecs {
  869. if spec.Price == 1 {
  870. ns := strings.Split(spec.Name, COMMA)
  871. cardSpecs := strings.Split(ns[0], STAR)
  872. if cardSpecs[1] == cardCnMap[strings.ToUpper(computeCard)] {
  873. option.ResourceId = spec.Id
  874. option.ComputeCard = computeCard
  875. return nil
  876. }
  877. } else {
  878. continue
  879. }
  880. }
  881. } else {
  882. cardNum := math.Ceil(option.Tops / float64(BASE_TOPS))
  883. for _, spec := range specs.TrainResourceSpecs {
  884. if option.Tops < BASE_TOPS {
  885. if spec.Price == 1 {
  886. ns := strings.Split(spec.Name, COMMA)
  887. cardSpecs := strings.Split(ns[0], STAR)
  888. if cardSpecs[1] == cardCnMap[strings.ToUpper(computeCard)] {
  889. option.ResourceId = spec.Id
  890. option.ComputeCard = computeCard
  891. return nil
  892. }
  893. } else {
  894. continue
  895. }
  896. } else {
  897. ns := strings.Split(spec.Name, COMMA)
  898. if len(ns) != 4 {
  899. continue
  900. }
  901. cardSpecs := strings.Split(ns[0], STAR)
  902. if cardSpecs[1] != cardCnMap[strings.ToUpper(computeCard)] {
  903. continue
  904. }
  905. s, err := strconv.ParseFloat(cardSpecs[0], 64)
  906. if err != nil {
  907. return err
  908. }
  909. switch computeCard {
  910. case GCU:
  911. option.ComputeCard = computeCard
  912. if cardNum == s { // 1, 4, 8
  913. option.ResourceId = spec.Id
  914. return nil
  915. }
  916. if 1 < cardNum && cardNum <= 4 && s == 4 {
  917. option.ResourceId = spec.Id
  918. return nil
  919. }
  920. if 4 < cardNum && s == 8 {
  921. option.ResourceId = spec.Id
  922. return nil
  923. }
  924. case MLU: // 1, 2, 4
  925. option.ComputeCard = computeCard
  926. if cardNum/2 == s {
  927. option.ResourceId = spec.Id
  928. return nil
  929. }
  930. if 1 < cardNum/2 && cardNum/2 <= 2 && s == 2 {
  931. option.ResourceId = spec.Id
  932. return nil
  933. }
  934. if 2 < cardNum/2 && s == 4 {
  935. option.ResourceId = spec.Id
  936. return nil
  937. }
  938. }
  939. }
  940. }
  941. }
  942. return errors.New("set ResourceId error")
  943. }
  944. func (o *OctopusLink) GetClusterInferUrl(ctx context.Context, option *option.InferOption) (*inference.ClusterInferUrl, error) {
  945. req := &octopus.GetNotebookListReq{
  946. Platform: o.platform,
  947. PageIndex: o.pageIndex,
  948. PageSize: o.pageSize,
  949. SearchKey: DEPLOY_INSTANCE_PREFIEX,
  950. }
  951. list, err := o.octopusRpc.GetNotebookList(ctx, req)
  952. if err != nil {
  953. return nil, err
  954. }
  955. var imageUrls []*inference.InferUrl
  956. for _, notebook := range list.Payload.GetNotebooks() {
  957. if strings.Contains(notebook.Desc, option.ModelName) && notebook.Status == "running" {
  958. url := strings.Replace(notebook.Tasks[0].Url, FORWARD_SLASH, "", -1)
  959. names := strings.Split(notebook.Desc, FORWARD_SLASH)
  960. imageUrl := &inference.InferUrl{
  961. Url: DOMAIN + url,
  962. Card: names[2],
  963. }
  964. imageUrls = append(imageUrls, imageUrl)
  965. } else {
  966. continue
  967. }
  968. }
  969. if len(imageUrls) == 0 {
  970. return nil, errors.New("no infer url available")
  971. }
  972. clusterWithUrl := &inference.ClusterInferUrl{
  973. ClusterName: o.platform,
  974. ClusterType: TYPE_OCTOPUS,
  975. InferUrls: imageUrls,
  976. }
  977. return clusterWithUrl, nil
  978. }
  979. func (o *OctopusLink) GetInferDeployInstanceList(ctx context.Context) ([]*inference.DeployInstance, error) {
  980. var insList []*inference.DeployInstance
  981. req := &octopus.GetNotebookListReq{
  982. Platform: o.platform,
  983. PageIndex: o.pageIndex,
  984. PageSize: o.pageSize,
  985. SearchKey: DEPLOY_INSTANCE_PREFIEX,
  986. }
  987. list, err := o.octopusRpc.GetNotebookList(ctx, req)
  988. if err != nil {
  989. return nil, err
  990. }
  991. if list.Error != nil {
  992. return nil, errors.New(list.Error.Message)
  993. }
  994. for _, notebook := range list.Payload.Notebooks {
  995. ins := &inference.DeployInstance{}
  996. ins.InstanceName = notebook.Name
  997. ins.InstanceId = notebook.Id
  998. ins.ClusterName = o.platform
  999. ins.Status = notebook.Status
  1000. ins.ClusterType = TYPE_OCTOPUS
  1001. insList = append(insList, ins)
  1002. }
  1003. return insList, nil
  1004. }
  1005. func (o *OctopusLink) StartInferDeployInstance(ctx context.Context, id string) bool {
  1006. req := &octopus.StartNotebookReq{
  1007. Platform: o.platform,
  1008. Id: id,
  1009. }
  1010. resp, err := o.octopusRpc.StartNotebook(ctx, req)
  1011. if err != nil || !resp.Success {
  1012. return false
  1013. }
  1014. return resp.Success
  1015. }
  1016. func (o *OctopusLink) StopInferDeployInstance(ctx context.Context, id string) bool {
  1017. req := &octopus.StopNotebookReq{
  1018. Platform: o.platform,
  1019. Id: id,
  1020. }
  1021. resp, err := o.octopusRpc.StopNotebook(ctx, req)
  1022. if err != nil || !resp.Success {
  1023. return false
  1024. }
  1025. return resp.Success
  1026. }
  1027. func (o *OctopusLink) GetInferDeployInstance(ctx context.Context, id string) (*inference.DeployInstance, error) {
  1028. ins := &inference.DeployInstance{}
  1029. req := &octopus.GetNotebookReq{
  1030. Platform: o.platform,
  1031. Id: id,
  1032. }
  1033. resp, err := o.octopusRpc.GetNotebook(ctx, req)
  1034. if err != nil {
  1035. return nil, err
  1036. }
  1037. if resp.Payload == nil {
  1038. return nil, errors.New("instance does not exist")
  1039. }
  1040. url := strings.Replace(resp.Payload.Notebook.Tasks[0].Url, FORWARD_SLASH, "", -1)
  1041. inferUrl := DOMAIN + url
  1042. var modelType string
  1043. var modelName string
  1044. var card string
  1045. if resp.Payload.Notebook.Desc != "" {
  1046. str := strings.Split(resp.Payload.Notebook.Desc, FORWARD_SLASH)
  1047. if len(str) == 3 {
  1048. modelType = str[0]
  1049. modelName = str[1]
  1050. card = str[2]
  1051. }
  1052. }
  1053. ins.InstanceName = resp.Payload.Notebook.Name
  1054. ins.InstanceId = resp.Payload.Notebook.Id
  1055. ins.ClusterName = o.platform
  1056. ins.Status = resp.Payload.Notebook.Status
  1057. ins.ClusterType = TYPE_OCTOPUS
  1058. ins.ModelType = modelType
  1059. ins.ModelName = modelName
  1060. ins.InferUrl = inferUrl
  1061. ins.InferCard = card
  1062. return ins, nil
  1063. }
  1064. func (o *OctopusLink) GetInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) {
  1065. stream, err := o.octopusRpc.GetInferResult(ctx)
  1066. if err != nil {
  1067. return "", err
  1068. }
  1069. buffer := make([]byte, 2048)
  1070. bufferedReader := bufio.NewReader(file)
  1071. for {
  1072. _, err = bufferedReader.Read(buffer)
  1073. if err != nil {
  1074. if err != io.EOF {
  1075. return "", err
  1076. }
  1077. break
  1078. }
  1079. err = stream.Send(&octopus.InferResultReq{
  1080. Platform: o.platform,
  1081. InferUrl: url,
  1082. FileName: fileName,
  1083. FileBytes: buffer,
  1084. })
  1085. }
  1086. recv, err := stream.CloseAndRecv()
  1087. if err != nil {
  1088. return "", err
  1089. }
  1090. return recv.Result, nil
  1091. }
  1092. func (o *OctopusLink) CreateInferDeployInstance(ctx context.Context, option *option.InferOption) (string, error) {
  1093. err := o.generateResourceId(ctx, nil, option)
  1094. if err != nil {
  1095. return "", err
  1096. }
  1097. err = o.generateAlgorithmId(ctx, nil, option)
  1098. if err != nil {
  1099. return "", err
  1100. }
  1101. err = o.generateImageId(ctx, nil, option)
  1102. if err != nil {
  1103. return "", err
  1104. }
  1105. err = o.generateCmd(nil, option)
  1106. if err != nil {
  1107. return "", err
  1108. }
  1109. desc := option.ModelType + FORWARD_SLASH + option.ModelName + FORWARD_SLASH + strings.ToLower(BIV100)
  1110. param := &octopus.CreateNotebookParam{
  1111. Name: option.TaskName,
  1112. ResourcePool: RESOURCE_POOL,
  1113. ResourceSpecId: option.ResourceId,
  1114. AlgorithmId: option.AlgorithmId,
  1115. AlgorithmVersion: VERSION,
  1116. ImageId: option.ImageId,
  1117. DatasetId: "",
  1118. DatasetVersion: "",
  1119. Command: option.Cmd,
  1120. Desc: desc,
  1121. TaskNumber: 1,
  1122. }
  1123. req := &octopus.CreateNotebookReq{
  1124. Platform: o.platform,
  1125. Params: param,
  1126. }
  1127. resp, err := o.octopusRpc.CreateNotebook(ctx, req)
  1128. if err != nil {
  1129. return "", err
  1130. }
  1131. if !resp.Success {
  1132. return "", errors.New(resp.Error.Message)
  1133. }
  1134. return resp.Payload.Id, nil
  1135. }
  1136. func (o *OctopusLink) CheckModelExistence(ctx context.Context, name string, mtype string) bool {
  1137. ifoption := &option.InferOption{
  1138. ModelName: name,
  1139. ModelType: mtype,
  1140. }
  1141. err := o.generateAlgorithmId(ctx, nil, ifoption)
  1142. if err != nil {
  1143. return false
  1144. }
  1145. return true
  1146. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive, autonomous, peer-to-peer manner.