You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

modelarts.go 35 kB

11 months ago
10 months ago
10 months ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180
  1. /*
  2. Copyright (c) [2023] [pcm]
  3. [pcm-coordinator] is licensed under Mulan PSL v2.
  4. You can use this software according to the terms and conditions of the Mulan PSL v2.
  5. You may obtain a copy of Mulan PSL v2 at:
  6. http://license.coscl.org.cn/MulanPSL2
  7. THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
  8. EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
  9. MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
  10. See the Mulan PSL v2 for more details.
  11. */
  12. package storeLink
  13. import (
  14. "context"
  15. "fmt"
  16. "github.com/pkg/errors"
  17. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/common"
  18. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
  19. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
  20. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/executor"
  21. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference"
  22. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
  23. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
  24. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils/timeutils"
  25. "gitlink.org.cn/JointCloud/pcm-modelarts/client/imagesservice"
  26. "gitlink.org.cn/JointCloud/pcm-modelarts/client/modelartsservice"
  27. "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts"
  28. modelartsclient "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts"
  29. "io"
  30. "k8s.io/apimachinery/pkg/util/json"
  31. "log"
  32. "mime/multipart"
  33. "regexp"
  34. "strconv"
  35. "strings"
  36. "sync"
  37. "time"
  38. )
  // Card identifiers and container start commands used by the ModelArts link.
  const (
  	// Ascend is the compute card name reported by GetComputeCards.
  	Ascend = "Ascend"
  	// Npu is the card label used in card stats and service names.
  	Npu = "npu"
  	// ImageNetResnet50Cmd starts the imagenet_resnet50 inference script.
  	// The steps are chained with "&&" so python only runs after the
  	// directory change succeeded; a single "&" would background the cd and
  	// leave python running in the original working directory.
  	ImageNetResnet50Cmd = "cd /home/ma-user && python ./inference_ascend.py"
  	// ChatGLM6BCmd downloads the model and then starts the ChatGLM inference script.
  	ChatGLM6BCmd = "cd /home/ma-user && python ./download_model.py && python ./inference_chatGLM.py"
  	// ASCEND is the "ASCEND910" identifier; it is not referenced in the visible part of this file.
  	ASCEND = "ASCEND910"
  )
  46. type ModelArtsLink struct {
  47. modelArtsRpc modelartsservice.ModelArtsService
  48. modelArtsImgRpc imagesservice.ImagesService
  49. platform string
  50. participantId int64
  51. pageIndex int32
  52. pageSize int32
  53. SourceLocation string
  54. Version string
  55. ModelId string
  56. ModelType string
  57. }
  // MoUsage groups size and availability counters for CPU, NPU, memory and
  // "VMemory" (presumably video memory — confirm). Units are not defined here.
  type MoUsage struct {
  	// Sizes.
  	CpuSize, NpuSize, MemorySize int64
  	VMemorySize, VMemoryNumber   int64

  	// Availability counters.
  	CpuAvailable, NpuAvailable, MemoryAvailable, VMemoryAvailable int64
  }
  // Version is a three-part version number.
  type Version struct {
  	Major int
  	Minor int
  	Patch int
  }

  // ParseVersion parses a "major.minor.patch" string.
  //
  // It returns an error when the string does not have exactly three
  // dot-separated parts, or the strconv error of the first part that is not
  // an integer.
  func ParseVersion(versionStr string) (*Version, error) {
  	fields := strings.Split(versionStr, ".")
  	if len(fields) != 3 {
  		return nil, fmt.Errorf("invalid version format: %s", versionStr)
  	}
  	var nums [3]int
  	for i, field := range fields {
  		n, err := strconv.Atoi(field)
  		if err != nil {
  			return nil, err
  		}
  		nums[i] = n
  	}
  	return &Version{Major: nums[0], Minor: nums[1], Patch: nums[2]}, nil
  }

  // Increment bumps the version by one step. Patch and Minor roll over to 0
  // once they have reached 9, carrying into the next higher component.
  func (v *Version) Increment() {
  	switch {
  	case v.Patch < 9:
  		v.Patch++
  	case v.Minor < 9:
  		v.Patch = 0
  		v.Minor++
  	default:
  		v.Patch, v.Minor = 0, 0
  		v.Major++
  	}
  }

  // String renders the version as "major.minor.patch".
  func (v *Version) String() string {
  	return strconv.Itoa(v.Major) + "." + strconv.Itoa(v.Minor) + "." + strconv.Itoa(v.Patch)
  }
  111. func NewModelArtsLink(modelArtsRpc modelartsservice.ModelArtsService, modelArtsImgRpc imagesservice.ImagesService, name string, id int64, nickname string) *ModelArtsLink {
  112. return &ModelArtsLink{modelArtsRpc: modelArtsRpc, modelArtsImgRpc: modelArtsImgRpc, platform: nickname, participantId: id, pageIndex: 0, pageSize: 50}
  113. }
  114. func (m *ModelArtsLink) UploadImage(ctx context.Context, path string) (interface{}, error) {
  115. //TODO modelArts上传镜像
  116. return nil, nil
  117. }
  118. func (m *ModelArtsLink) DeleteImage(ctx context.Context, imageId string) (interface{}, error) {
  119. // TODO modelArts删除镜像
  120. return nil, nil
  121. }
  122. func (m *ModelArtsLink) QueryImageList(ctx context.Context) (interface{}, error) {
  123. // modelArts获取镜像列表
  124. req := &modelarts.ListRepoReq{
  125. Offset: "0",
  126. Limit: strconv.Itoa(int(m.pageSize)),
  127. Platform: m.platform,
  128. }
  129. resp, err := m.modelArtsImgRpc.ListReposDetails(ctx, req)
  130. if err != nil {
  131. return nil, err
  132. }
  133. return resp, nil
  134. }
  135. func (m *ModelArtsLink) SubmitTask(ctx context.Context, imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string, aiType string) (interface{}, error) {
  136. // modelArts提交任务
  137. environments := make(map[string]string)
  138. parameters := make([]*modelarts.ParametersTrainJob, 0)
  139. //parameters2 := make([]*modelarts.ParametersTrainJob, 0)
  140. inputs := make([]*modelarts.InputTraining, 0)
  141. outputs := make([]*modelarts.OutputTraining, 0)
  142. outputValue := ""
  143. for _, env := range envs {
  144. // 找到第一个逗号位置
  145. idx := strings.Index(env, COMMA)
  146. if idx == -1 {
  147. continue
  148. }
  149. key := strings.TrimSpace(env[:idx])
  150. value := strings.TrimSpace(env[idx+1:])
  151. environments[key] = value
  152. }
  153. for _, param := range params {
  154. s := strings.Split(param, COMMA)
  155. parameters = append(parameters, &modelarts.ParametersTrainJob{
  156. Name: s[0],
  157. Value: s[1],
  158. })
  159. if s[0] == "output" {
  160. outputValue = s[1]
  161. }
  162. }
  163. if len(datasetsId) != 0 {
  164. inputs = append(inputs, &modelarts.InputTraining{
  165. Name: "input",
  166. AccessMethod: "parameter",
  167. Remote: &modelarts.RemoteTra{
  168. Obs: &modelarts.ObsTra{
  169. ObsUrl: datasetsId + "/",
  170. },
  171. }})
  172. }
  173. if len(outputValue) != 0 {
  174. outputs = append(outputs, &modelarts.OutputTraining{
  175. Name: "output",
  176. Remote: &modelarts.RemoteOut{
  177. Obs: &modelarts.ObsTra{
  178. ObsUrl: "obs:/" + outputValue + "/",
  179. },
  180. }})
  181. }
  182. req := &modelarts.CreateTrainingJobReq{
  183. Kind: "job",
  184. Metadata: &modelarts.MetadataS{
  185. Name: TASK_NAME_PREFIX + utils.RandomString(10),
  186. WorkspaceId: "0",
  187. },
  188. Algorithm: &modelarts.Algorithms{
  189. Id: algorithmId,
  190. Engine: &modelarts.EngineCreateTraining{
  191. ImageUrl: imageId,
  192. },
  193. Command: cmd,
  194. Environments: environments,
  195. Parameters: parameters,
  196. Inputs: inputs,
  197. Outputs: outputs,
  198. },
  199. Spec: &modelarts.SpecsC{
  200. Resource: &modelarts.ResourceCreateTraining{
  201. FlavorId: resourceId,
  202. NodeCount: 1,
  203. },
  204. },
  205. Platform: m.platform,
  206. }
  207. marshal, err2 := json.Marshal(req)
  208. if err2 != nil {
  209. }
  210. println(string(marshal))
  211. resp, err := m.modelArtsRpc.CreateTrainingJob(ctx, req)
  212. if err != nil {
  213. return nil, err
  214. }
  215. if resp.ErrorMsg != "" {
  216. return nil, errors.New(resp.ErrorMsg)
  217. }
  218. return resp, nil
  219. }
  220. func (m *ModelArtsLink) QueryTask(ctx context.Context, taskId string) (interface{}, error) {
  221. // 获取任务
  222. req := &modelarts.DetailTrainingJobsReq{
  223. TrainingJobId: taskId,
  224. Platform: m.platform,
  225. }
  226. resp, err := m.modelArtsRpc.GetTrainingJobs(ctx, req)
  227. if err != nil {
  228. return nil, err
  229. }
  230. return resp, nil
  231. }
  232. func (m *ModelArtsLink) DeleteTask(ctx context.Context, taskId string) (interface{}, error) {
  233. // 删除任务
  234. req := &modelarts.DeleteTrainingJobReq{
  235. TrainingJobId: taskId,
  236. Platform: m.platform,
  237. }
  238. resp, err := m.modelArtsRpc.DeleteTrainingJob(ctx, req)
  239. if err != nil {
  240. return nil, err
  241. }
  242. return resp, nil
  243. }
  244. func (m *ModelArtsLink) QuerySpecs(ctx context.Context) (interface{}, error) {
  245. // modelarts查询资源规格
  246. req := &modelarts.TrainingJobFlavorsReq{
  247. Platform: m.platform,
  248. }
  249. resp, err := m.modelArtsRpc.GetTrainingJobFlavors(ctx, req)
  250. if err != nil {
  251. return nil, err
  252. }
  253. return resp, nil
  254. }
  255. func (m *ModelArtsLink) GetResourceStats(ctx context.Context) (*collector.ResourceStats, error) {
  256. req := &modelarts.GetPoolsRuntimeMetricsReq{}
  257. resp, err := m.modelArtsRpc.GetPoolsRuntimeMetrics(ctx, req)
  258. if err != nil {
  259. return nil, err
  260. }
  261. if resp.ErrorMsg != "" {
  262. return nil, errors.New("failed to get algorithms")
  263. }
  264. resourceStats := &collector.ResourceStats{}
  265. CpuCoreTotalSum := int64(0)
  266. CpuCoreAvailSum := int64(0)
  267. MemTotalSum := float64(0)
  268. MemAvailSum := float64(0)
  269. var CpuCoreTotal int64
  270. var CpuCoreAvail int64
  271. var MemTotal float64
  272. var MemAvail float64
  273. for _, items := range resp.Items {
  274. //TODO The value of taskType is temporarily fixed to "pytorch"
  275. CpuCoreTotal, err = strconv.ParseInt(items.Table.Capacity.Value.Cpu, 10, 64)
  276. CpuCoreTotalSum += CpuCoreTotal
  277. CpuCoreAvail, err = strconv.ParseInt(items.Table.Allocated.Value.Cpu, 10, 64)
  278. CpuCoreAvailSum += CpuCoreAvail
  279. MemTotal, err = strconv.ParseFloat(items.Table.Capacity.Value.Memory, 64)
  280. MemTotalSum += MemTotal
  281. MemAvail, err = strconv.ParseFloat(items.Table.Allocated.Value.Memory, 64)
  282. MemAvailSum += MemAvail
  283. }
  284. resourceStats.CpuCoreTotal = CpuCoreTotalSum
  285. resourceStats.CpuCoreAvail = CpuCoreAvailSum
  286. resourceStats.MemTotal = MemTotalSum
  287. resourceStats.MemAvail = MemAvailSum
  288. req1 := &modelarts.GetResourceFlavorsReq{}
  289. resp1, err := m.modelArtsRpc.GetResourceFlavors(ctx, req1)
  290. num32, _ := strconv.Atoi(resp1.Items[0].Spec.Npu.Size)
  291. var cards []*collector.Card
  292. card := &collector.Card{
  293. Platform: MODELARTS,
  294. Type: CARD,
  295. Name: Npu,
  296. CardNum: int32(num32),
  297. TOpsAtFp16: float64(num32 * 320),
  298. }
  299. cards = append(cards, card)
  300. resourceStats.CardsAvail = cards
  301. return resourceStats, nil
  302. }
  303. func (m *ModelArtsLink) GetDatasetsSpecs(ctx context.Context) ([]*collector.DatasetsSpecs, error) {
  304. return nil, nil
  305. }
  306. func (m *ModelArtsLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm, error) {
  307. var algorithms []*collector.Algorithm
  308. req := &modelarts.ListAlgorithmsReq{
  309. Platform: m.platform,
  310. Offset: m.pageIndex,
  311. Limit: m.pageSize,
  312. }
  313. resp, err := m.modelArtsRpc.ListAlgorithms(ctx, req)
  314. if err != nil {
  315. return nil, err
  316. }
  317. if resp.ErrorMsg != "" {
  318. return nil, errors.New("failed to get algorithms")
  319. }
  320. for _, a := range resp.Items {
  321. //TODO The value of taskType is temporarily fixed to "pytorch"
  322. algorithm := &collector.Algorithm{Name: a.Metadata.Name, Platform: MODELARTS, TaskType: "pytorch"}
  323. algorithms = append(algorithms, algorithm)
  324. }
  325. return algorithms, nil
  326. }
  327. func (m *ModelArtsLink) GetComputeCards(ctx context.Context) ([]string, error) {
  328. var cards []string
  329. cards = append(cards, Ascend)
  330. return cards, nil
  331. }
  332. func (m *ModelArtsLink) GetUserBalance(ctx context.Context) (float64, error) {
  333. return 0, nil
  334. }
  335. func (m *ModelArtsLink) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) {
  336. algoName := dataset + DASH + algorithm
  337. req := &modelarts.GetFileReq{
  338. Path: algoName + FORWARD_SLASH + TRAIN_FILE,
  339. }
  340. resp, err := m.modelArtsRpc.GetFile(ctx, req)
  341. if err != nil {
  342. return "", err
  343. }
  344. return string(resp.Content), nil
  345. }
  346. func (m *ModelArtsLink) UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error {
  347. return nil
  348. }
  349. // Determine whether there is a necessary image in image management and query the image name based on the image name
  350. func (m *ModelArtsLink) getSourceLocationFromImages(ctx context.Context, option *option.InferOption) error {
  351. req := &modelarts.ListImagesReq{
  352. //Platform: m.platform,
  353. Limit: 50,
  354. Offset: 0,
  355. }
  356. ListImagesResp, err := m.modelArtsRpc.ListImages(ctx, req)
  357. if err != nil {
  358. return err
  359. }
  360. if ListImagesResp.Code != 200 {
  361. return errors.New("failed to get ListImages")
  362. }
  363. for _, ListImages := range ListImagesResp.Data {
  364. if option.ModelName == "ChatGLM-6B" {
  365. if ListImages.Name == "chatglm-6b" {
  366. m.SourceLocation = ListImages.SwrPath
  367. return nil
  368. }
  369. } else {
  370. if ListImages.Name == option.ModelName {
  371. m.SourceLocation = ListImages.SwrPath
  372. return nil
  373. }
  374. }
  375. }
  376. return errors.New("SourceLocation not set")
  377. }
  378. // Get AI Application List
  379. func (m *ModelArtsLink) GetModelId(ctx context.Context, option *option.InferOption) error {
  380. req := &modelarts.ListModelReq{
  381. Platform: m.platform,
  382. ModelName: option.ModelName,
  383. //ModelType: "Image",
  384. Limit: int64(m.pageIndex),
  385. Offset: int64(m.pageSize),
  386. }
  387. ListModelResp, err := m.modelArtsRpc.ListModels(ctx, req)
  388. if err != nil {
  389. return err
  390. }
  391. if ListModelResp.Code == 200 {
  392. //return errors.New("failed to get ModelId")
  393. for _, ListModel := range ListModelResp.Models {
  394. if ListModel.ModelName == option.ModelName {
  395. option.ModelId = ListModel.ModelId
  396. m.Version = ListModel.ModelVersion
  397. return nil
  398. }
  399. }
  400. }
  401. err = m.CreateModel(ctx, option)
  402. if err != nil {
  403. return err
  404. }
  405. return nil
  406. }
  407. func (m *ModelArtsLink) GetModel(ctx context.Context, option *option.InferOption) string {
  408. req := &modelarts.ShowModelReq{
  409. Platform: m.platform,
  410. ModelId: option.ModelID,
  411. }
  412. ctx, cancel := context.WithTimeout(context.Background(), 50*time.Second)
  413. defer cancel()
  414. ShowModelsResp, err := m.modelArtsRpc.ShowModels(ctx, req)
  415. if err != nil {
  416. if err == context.DeadlineExceeded {
  417. log.Println("Request timed out")
  418. // 重试请求或其他处理
  419. } else {
  420. log.Fatalf("could not call method: %v", err)
  421. }
  422. }
  423. if ShowModelsResp.Code != 200 {
  424. errors.New("failed to get findModelsStatus")
  425. }
  426. m.ModelType = ShowModelsResp.ShowModelDetail.ModelAlgorithm
  427. return ShowModelsResp.ShowModelDetail.ModelStatus
  428. }
  429. // Get AI Application List
  430. func (m *ModelArtsLink) GetModelStatus(ctx context.Context, option *option.InferOption) error {
  431. var wg sync.WaitGroup
  432. wg.Add(1)
  433. // 使用goroutine进行轮询
  434. //defer wg.Done()
  435. for {
  436. status := m.GetModel(ctx, option)
  437. if status == "published" {
  438. fmt.Println("Model is now published.")
  439. break // 一旦状态变为published,就退出循环
  440. }
  441. fmt.Println("Waiting for model to be published...")
  442. time.Sleep(5 * time.Second) // 等待一段时间后再次检查
  443. }
  444. // 在这里执行模型状态为published后需要进行的操作
  445. fmt.Println("Continuing with the program...")
  446. return nil
  447. }
  448. // Create an AI application
  449. func (m *ModelArtsLink) CreateModel(ctx context.Context, option *option.InferOption) error {
  450. //Before creating an AI application, check if there are any images that can be created
  451. err := m.getSourceLocationFromImages(ctx, option)
  452. if err != nil { //
  453. return errors.New("No image available for creationd")
  454. }
  455. //
  456. var CMD string
  457. if option.ModelName == "imagenet_resnet50" {
  458. CMD = ImageNetResnet50Cmd
  459. } else if option.ModelName == "ChatGLM-6B" {
  460. CMD = ChatGLM6BCmd
  461. }
  462. if m.Version == "" {
  463. m.Version = "0.0.1"
  464. }
  465. version, err := ParseVersion(m.Version)
  466. version.Increment()
  467. req := &modelarts.CreateModelReq{
  468. Platform: m.platform,
  469. ModelName: option.ModelName,
  470. ModelType: "Image",
  471. ModelVersion: version.String(),
  472. SourceLocation: m.SourceLocation,
  473. InstallType: []string{"real-time"},
  474. Cmd: CMD,
  475. ModelAlgorithm: option.ModelType,
  476. }
  477. ModelResp, err := m.modelArtsRpc.CreateModel(ctx, req)
  478. if err != nil {
  479. return err
  480. }
  481. if ModelResp.Code != 200 {
  482. return errors.New("failed to get ModelId")
  483. }
  484. option.ModelId = ModelResp.ModelId
  485. return nil
  486. }
  487. func (m *ModelArtsLink) GetSpecifications(ctx context.Context, option *option.AiOption, ifoption *option.InferOption) error {
  488. req := &modelarts.ListSpecificationsReq{
  489. //Platform: m.platform,
  490. IsPersonalCluster: false,
  491. InferType: "real-time",
  492. Limit: m.pageIndex,
  493. OffSet: m.pageSize,
  494. }
  495. ListSpecificationsResp, err := m.modelArtsRpc.ListSpecifications(ctx, req)
  496. if err != nil {
  497. return err
  498. }
  499. for _, ListSpecifications := range ListSpecificationsResp.Specifications {
  500. if ListSpecifications.Specification == "modelarts.kat1.xlarge" {
  501. ifoption.Specification = ListSpecifications.Specification
  502. return nil
  503. }
  504. }
  505. return nil
  506. }
  507. func (m *ModelArtsLink) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) {
  508. req := &modelartsservice.GetTrainingJobLogsPreviewReq{
  509. Platform: m.platform,
  510. TaskId: "worker-0",
  511. TrainingJobId: taskId,
  512. }
  513. //resp, err := m.modelArtsRpc.GetTrainingJobLogsPreview(ctx, req)
  514. stream, err := m.modelArtsRpc.GetTrainingJobLogStream(ctx, req)
  515. if err != nil {
  516. log.Fatalf("error calling StreamLogs: %v", err)
  517. }
  518. var fullLog string
  519. for {
  520. // 接收服务端发送的日志块
  521. logEntry, err := stream.Recv()
  522. if err == io.EOF {
  523. // 流结束
  524. break
  525. }
  526. if err != nil {
  527. log.Fatalf("接收日志块失败: %v", err)
  528. }
  529. // 拼接日志块
  530. fullLog += logEntry.Message
  531. }
  532. return fullLog, nil
  533. /* if strings.Contains(resp.Content, "404 Not Found") {
  534. = "waiting for logs..."
  535. }*/
  536. //return resp.Content, nil
  537. }
  538. func (m *ModelArtsLink) GetTrainingTask(ctx context.Context, taskId string) (*collector.Task, error) {
  539. resp, err := m.QueryTask(ctx, taskId)
  540. if err != nil {
  541. return nil, err
  542. }
  543. jobresp, ok := (resp).(*modelartsservice.JobResponse)
  544. if jobresp.ErrorMsg != "" || !ok {
  545. if jobresp.ErrorMsg != "" {
  546. return nil, errors.New(jobresp.ErrorMsg)
  547. } else {
  548. return nil, errors.New("get training task failed, empty error returned")
  549. }
  550. }
  551. var task collector.Task
  552. task.Id = jobresp.Metadata.Id
  553. switch strings.ToLower(jobresp.Status.Phase) {
  554. case "completed":
  555. milliTimestamp := int64(jobresp.Status.StartTime)
  556. task.Start = timeutils.MillisecondsToUTCString(milliTimestamp, time.DateTime)
  557. duration := int64(jobresp.Status.Duration)
  558. task.End = timeutils.MillisecondsToAddDurationToUTCString(milliTimestamp, duration, time.DateTime)
  559. task.Status = constants.Completed
  560. case "failed":
  561. milliTimestamp := int64(jobresp.Status.StartTime)
  562. task.Start = timeutils.MillisecondsToUTCString(milliTimestamp, time.DateTime)
  563. duration := int64(jobresp.Status.Duration)
  564. task.End = timeutils.MillisecondsToAddDurationToUTCString(milliTimestamp, duration, time.DateTime)
  565. task.Status = constants.Failed
  566. case "running":
  567. milliTimestamp := int64(jobresp.Status.StartTime)
  568. task.Start = timeutils.MillisecondsToUTCString(milliTimestamp, time.DateTime)
  569. task.Status = constants.Running
  570. case "stopped":
  571. task.Status = constants.Stopped
  572. case "pending":
  573. task.Status = constants.Pending
  574. case "terminated":
  575. //TODO Failed
  576. task.Status = constants.Failed
  577. default:
  578. task.Status = "undefined"
  579. }
  580. return &task, nil
  581. }
  582. func (m *ModelArtsLink) Execute(ctx context.Context, option *option.AiOption, mode int) (interface{}, error) {
  583. switch mode {
  584. case executor.SUBMIT_MODE_JOINT_CLOUD:
  585. err := m.GenerateSubmitParams(ctx, option)
  586. if err != nil {
  587. return nil, err
  588. }
  589. case executor.SUBMIT_MODE_STORAGE_SCHEDULE:
  590. var ascendNum int32
  591. for _, res := range option.ResourcesRequired {
  592. typeName, ok := res["type"]
  593. if !ok {
  594. continue
  595. }
  596. switch typeName {
  597. case "NPU":
  598. num, ok := res["number"]
  599. if !ok {
  600. continue
  601. }
  602. n := common.ConvertTypeToString(num)
  603. val, err := strconv.ParseInt(n, 10, 32)
  604. if err != nil {
  605. return nil, err
  606. }
  607. ascendNum = int32(val)
  608. }
  609. }
  610. req := &modelarts.TrainingJobFlavorsReq{
  611. Platform: "modelarts-CloudBrain2",
  612. FlavorType: "",
  613. }
  614. resp, err := m.modelArtsRpc.GetTrainingJobFlavors(ctx, req)
  615. for _, v := range resp.Flavors {
  616. if ascendNum == v.FlavorInfo.Npu.UnitNum {
  617. option.ResourceId = v.FlavorId
  618. break
  619. } else if ascendNum <= 1 {
  620. option.ResourceId = "modelarts.kat1.xlarge"
  621. break
  622. } else if ascendNum == 2 {
  623. option.ResourceId = "modelarts.kat1.2xlarge"
  624. break
  625. } else if ascendNum > 2 && ascendNum <= 4 {
  626. option.ResourceId = "modelarts.kat1.4xlarge"
  627. break
  628. } else if ascendNum >= 5 && ascendNum <= 8 {
  629. option.ResourceId = "modelarts.kat1.8xlarge"
  630. break
  631. } else if ascendNum > 8 {
  632. option.ResourceId = "modelarts.kat1.8xlarge"
  633. break
  634. }
  635. }
  636. if err != nil {
  637. return nil, err
  638. }
  639. option.ComputeCard = NPU
  640. default:
  641. return nil, errors.New("failed to choose submit mode")
  642. }
  643. task, err := m.SubmitTask(ctx, option.ImageId, option.Cmd, option.Envs, option.Params, option.ResourceId, option.DatasetsId, option.AlgorithmId, option.TaskType)
  644. if err != nil {
  645. return nil, err
  646. }
  647. return task, nil
  648. }
  649. func (m *ModelArtsLink) GenerateSubmitParams(ctx context.Context, option *option.AiOption) error {
  650. err := m.generateResourceId(ctx, option, nil)
  651. if err != nil {
  652. return err
  653. }
  654. err = m.generateAlgorithmId(ctx, option)
  655. if err != nil {
  656. return err
  657. }
  658. err = m.generateImageId(option)
  659. if err != nil {
  660. return err
  661. }
  662. err = m.generateCmd(option)
  663. if err != nil {
  664. return err
  665. }
  666. err = m.generateEnv(option)
  667. if err != nil {
  668. return err
  669. }
  670. err = m.generateParams(option)
  671. if err != nil {
  672. return err
  673. }
  674. return nil
  675. }
  676. func (m *ModelArtsLink) generateResourceId(ctx context.Context, option *option.AiOption, ifoption *option.InferOption) error {
  677. option.ResourceId = "modelarts.kat1.xlarge"
  678. return nil
  679. }
  680. func (m *ModelArtsLink) generateImageId(option *option.AiOption) error {
  681. return nil
  682. }
  683. func (m *ModelArtsLink) generateCmd(option *option.AiOption) error {
  684. return nil
  685. }
  686. func (m *ModelArtsLink) generateEnv(option *option.AiOption) error {
  687. return nil
  688. }
  689. func (m *ModelArtsLink) generateParams(option *option.AiOption) error {
  690. return nil
  691. }
  692. func (m *ModelArtsLink) generateAlgorithmId(ctx context.Context, option *option.AiOption) error {
  693. req := &modelarts.ListAlgorithmsReq{
  694. Platform: m.platform,
  695. Offset: m.pageIndex,
  696. Limit: m.pageSize,
  697. }
  698. resp, err := m.modelArtsRpc.ListAlgorithms(ctx, req)
  699. if err != nil {
  700. return err
  701. }
  702. if resp.ErrorMsg != "" {
  703. return errors.New("failed to get algorithmId")
  704. }
  705. for _, algorithm := range resp.Items {
  706. engVersion := algorithm.JobConfig.Engine.EngineVersion
  707. if strings.Contains(engVersion, option.TaskType) {
  708. ns := strings.Split(algorithm.Metadata.Name, DASH)
  709. if ns[0] != option.TaskType {
  710. continue
  711. }
  712. if ns[1] != option.DatasetsName {
  713. continue
  714. }
  715. if ns[2] != option.AlgorithmName {
  716. continue
  717. }
  718. option.AlgorithmId = algorithm.Metadata.Id
  719. return nil
  720. }
  721. }
  722. if option.AlgorithmId == "" {
  723. return errors.New("Algorithm does not exist")
  724. }
  725. return errors.New("failed to get AlgorithmId")
  726. }
  727. func (m *ModelArtsLink) GetClusterInferUrl(ctx context.Context, option *option.InferOption) (*inference.ClusterInferUrl, error) {
  728. var imageUrls []*inference.InferUrl
  729. urlReq := &modelartsclient.ImageReasoningUrlReq{
  730. ServiceName: option.ModelName,
  731. Type: option.ModelType,
  732. Card: "npu",
  733. }
  734. urlResp, err := m.modelArtsRpc.ImageReasoningUrl(ctx, urlReq)
  735. if err != nil {
  736. return nil, err
  737. }
  738. imageUrl := &inference.InferUrl{
  739. Url: urlResp.Url,
  740. Card: "npu",
  741. }
  742. imageUrls = append(imageUrls, imageUrl)
  743. clusterWithUrl := &inference.ClusterInferUrl{
  744. ClusterName: m.platform,
  745. ClusterType: TYPE_MODELARTS,
  746. InferUrls: imageUrls,
  747. }
  748. return clusterWithUrl, nil
  749. }
  750. func (m *ModelArtsLink) GetInferDeployInstanceList(ctx context.Context) ([]*inference.DeployInstance, error) {
  751. var insList []*inference.DeployInstance
  752. req := &modelarts.ListServicesReq{
  753. Platform: m.platform,
  754. OffSet: m.pageIndex,
  755. Limit: m.pageSize,
  756. }
  757. //list, err := m.modelArtsRpc.ListServices(ctx, req)
  758. resp, err := m.modelArtsRpc.ListServices(ctx, req)
  759. if err != nil {
  760. return nil, err
  761. }
  762. if resp.ErrorMsg != "" {
  763. return nil, errors.New(resp.Msg)
  764. }
  765. for _, services := range resp.Services {
  766. ins := &inference.DeployInstance{}
  767. ins.InstanceName = services.ServiceName
  768. ins.InstanceId = services.ServiceId
  769. ins.Status = services.Status
  770. ins.InferCard = "NPU"
  771. ins.ClusterName = m.platform
  772. ins.CreatedTime = string(services.StartTime)
  773. ins.ClusterType = TYPE_MODELARTS
  774. insList = append(insList, ins)
  775. }
  776. return insList, nil
  777. }
  778. func (m *ModelArtsLink) StartInferDeployInstance(ctx context.Context, id string) bool {
  779. req := &modelartsclient.UpdateServiceReq{
  780. ServiceId: id,
  781. Status: "running",
  782. }
  783. resp, err := m.modelArtsRpc.UpdateService(ctx, req)
  784. if err != nil || resp.Code != 0 {
  785. return false
  786. }
  787. if resp.Code == 0 {
  788. return true
  789. }
  790. return false
  791. }
  792. func (m *ModelArtsLink) StopInferDeployInstance(ctx context.Context, id string) bool {
  793. req := &modelartsclient.UpdateServiceReq{
  794. ServiceId: id,
  795. Status: "stopped",
  796. }
  797. resp, err := m.modelArtsRpc.UpdateService(ctx, req)
  798. if err != nil || resp.Code != 0 {
  799. return false
  800. }
  801. if resp.Code == 0 {
  802. return true
  803. }
  804. return false
  805. }
  806. func (m *ModelArtsLink) GetInferDeployInstance(ctx context.Context, id string) (*inference.DeployInstance, error) {
  807. req := &modelarts.ShowServiceReq{
  808. ServiceId: id,
  809. }
  810. resp, err := m.modelArtsRpc.ShowService(ctx, req)
  811. if err != nil {
  812. return nil, err
  813. }
  814. /* if resp.ErrorMsg != "" {
  815. return nil, errors.New(resp.Msg)
  816. }*/
  817. ins := &inference.DeployInstance{}
  818. ins.InstanceName = resp.ServiceName
  819. ins.InstanceId = resp.ServiceId
  820. ins.Status = resp.Status
  821. ins.InferCard = "NPU"
  822. ins.ClusterName = m.platform
  823. ins.CreatedTime = string(resp.StartTime)
  824. ins.ClusterType = TYPE_MODELARTS
  825. if resp.Config != nil {
  826. ins.ModelName = resp.Config[0].ModelName
  827. }
  828. if m.ModelType != "" {
  829. ins.ModelType = m.ModelType
  830. }
  831. ins.InferUrl = resp.AccessAddress
  832. return ins, nil
  833. }
  834. func (m *ModelArtsLink) GetImageInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) {
  835. return "", nil
  836. }
  837. func (m *ModelArtsLink) CreateInferDeployInstance(ctx context.Context, option *option.InferOption) (string, error) {
  838. /* err := m.GetModelId(ctx, option)
  839. if err != nil {
  840. return "", err
  841. }*/
  842. err := m.GetModelStatus(ctx, option)
  843. if err != nil {
  844. return "模型状态查询错误", err
  845. }
  846. configParam := &modelarts.ServiceConfig{
  847. Specification: "modelarts.kat1.xlarge",
  848. Weight: 100,
  849. ModelId: option.ModelID,
  850. InstanceCount: 1,
  851. }
  852. var configItems []*modelarts.ServiceConfig
  853. configItems = append(configItems, configParam)
  854. now := time.Now()
  855. timestampSec := now.Unix()
  856. str := strconv.FormatInt(timestampSec, 10)
  857. req := &modelarts.CreateServiceReq{
  858. Platform: m.platform,
  859. Config: configItems,
  860. InferType: "real-time",
  861. ServiceName: option.ModelName + "_" + option.ModelType + "_" + Npu + "_" + str,
  862. }
  863. ctx, cancel := context.WithTimeout(context.Background(), 150*time.Second)
  864. defer cancel()
  865. resp, err := m.modelArtsRpc.CreateService(ctx, req)
  866. if err != nil {
  867. return "", err
  868. }
  869. return resp.ServiceId, nil
  870. }
  871. func (m *ModelArtsLink) CheckModelExistence(ctx context.Context, name string, mtype string) bool {
  872. ifoption := &option.InferOption{
  873. ModelName: name,
  874. ModelType: mtype,
  875. }
  876. err := m.CheckImageExist(ctx, ifoption)
  877. if err != nil {
  878. return false
  879. }
  880. return true
  881. }
  882. func (m *ModelArtsLink) CheckImageExist(ctx context.Context, option *option.InferOption) error {
  883. req := &modelarts.ListImagesReq{
  884. Limit: m.pageSize,
  885. Offset: m.pageIndex,
  886. }
  887. ListImageResp, err := m.modelArtsRpc.ListImages(ctx, req)
  888. if err != nil {
  889. return err
  890. }
  891. var modelName string
  892. if ListImageResp.Code == 200 {
  893. //return errors.New("failed to get ModelId")
  894. for _, ListImage := range ListImageResp.Data {
  895. if option.ModelName == "ChatGLM-6B" {
  896. modelName = "chatglm-6b"
  897. } else {
  898. modelName = option.ModelName
  899. }
  900. if ListImage.Name == modelName {
  901. return nil
  902. }
  903. }
  904. }
  905. return errors.New("failed to find Image ")
  906. }
  907. func (m *ModelArtsLink) GetResourceSpecs(ctx context.Context, resrcType string) (*collector.ResourceSpec, error) {
  908. MoUsage := MoUsage{}
  909. var cpusum int64 = 0
  910. var npusum int64 = 0
  911. var memorysum int64 = 0
  912. var VMemorysum int64 = 0
  913. var RunningTaskNum int64 = 0
  914. var BalanceValue float64 = -1
  915. var RateValue = 0.930000
  916. var StorageValue int64 = 1024
  917. var AvailableValue int64 = 886
  918. resUsage := &collector.ResourceSpec{
  919. ClusterId: strconv.FormatInt(m.participantId, 10),
  920. }
  921. switch resrcType {
  922. case "Train":
  923. //查询获取训练作业支持的公共规格(包括1,2,4,8卡的选择和显存的数值)
  924. reqJobFlavors := &modelarts.TrainingJobFlavorsReq{
  925. Platform: m.platform,
  926. }
  927. respJobFlavors, err := m.modelArtsRpc.GetTrainingJobFlavors(ctx, reqJobFlavors)
  928. if err != nil {
  929. return nil, err
  930. }
  931. respJobFlavorsMarshal, err2 := json.Marshal(respJobFlavors)
  932. if err2 != nil {
  933. }
  934. println(string(respJobFlavorsMarshal))
  935. for _, TrainLists := range respJobFlavors.Flavors {
  936. re := regexp.MustCompile(`\d+`)
  937. VMemorynumberStr := re.FindString(string(TrainLists.FlavorInfo.Npu.Memory)) //显存的值,正则表达式去单位
  938. MoUsage.VMemorySize, err = strconv.ParseInt(VMemorynumberStr, 10, 64) //显存的值
  939. MoUsage.NpuSize = int64(TrainLists.FlavorInfo.Npu.UnitNum) //npu数量,张数
  940. MoUsage.CpuAvailable = int64(TrainLists.FlavorInfo.Cpu.CoreNum) //cpu核数
  941. MoUsage.MemoryAvailable = int64(TrainLists.FlavorInfo.Memory.Size) //内存大小
  942. npusum = MoUsage.NpuSize
  943. MoUsage.NpuAvailable = MoUsage.NpuSize
  944. cpusum = MoUsage.CpuAvailable
  945. memorysum = MoUsage.MemoryAvailable
  946. VMemorysum = MoUsage.VMemorySize
  947. MoUsage.VMemoryAvailable = MoUsage.VMemorySize
  948. str := fmt.Sprintf("%d", MoUsage.NpuSize) // 使用%d格式化占位符
  949. ASCENDName := str + "*ASCEND910"
  950. UsageCPU := &collector.Usage{Type: strings.ToUpper(CPU), Name: strings.ToUpper("ARM"), Total: &collector.UnitValue{Unit: CPUCORE, Value: cpusum}, Available: &collector.UnitValue{Unit: CPUCORE, Value: MoUsage.CpuAvailable}}
  951. UsageNPU := &collector.Usage{Type: strings.ToUpper(NPU), Name: ASCENDName, Total: &collector.UnitValue{Unit: NUMBER, Value: npusum}, Available: &collector.UnitValue{Unit: NUMBER, Value: MoUsage.NpuAvailable}}
  952. UsageMEMORY := &collector.Usage{Type: strings.ToUpper(MEMORY), Name: strings.ToUpper(RAM), Total: &collector.UnitValue{Unit: GIGABYTE, Value: memorysum}, Available: &collector.UnitValue{Unit: GIGABYTE, Value: MoUsage.MemoryAvailable}}
  953. UsageVMEMORY := &collector.Usage{Type: strings.ToUpper(MEMORY), Name: strings.ToUpper(VRAM), Total: &collector.UnitValue{Unit: GIGABYTE, Value: VMemorysum}, Available: &collector.UnitValue{Unit: GIGABYTE, Value: MoUsage.VMemoryAvailable}}
  954. Storage := &collector.Usage{Type: strings.ToUpper(STORAGE), Total: &collector.UnitValue{Unit: GIGABYTE, Value: StorageValue}, Name: strings.ToUpper("disk"), Available: &collector.UnitValue{Unit: GIGABYTE, Value: AvailableValue}}
  955. cres := &collector.ClusterResource{}
  956. cres.Resource = UsageNPU
  957. cres.BaseResources = append(cres.BaseResources, UsageCPU)
  958. cres.BaseResources = append(cres.BaseResources, UsageMEMORY)
  959. cres.BaseResources = append(cres.BaseResources, UsageVMEMORY)
  960. cres.BaseResources = append(cres.BaseResources, Storage)
  961. resUsage.Resources = append(resUsage.Resources, cres)
  962. }
  963. RunningTask := &collector.Usage{Type: strings.ToUpper(RUNNINGTASK), Total: &collector.UnitValue{Unit: NUMBER, Value: RunningTaskNum}}
  964. Balance := &collector.Usage{Type: strings.ToUpper(BALANCE), Total: &collector.UnitValue{Unit: RMB, Value: BalanceValue}}
  965. Rate := &collector.Usage{Type: strings.ToUpper(RATE), Total: &collector.UnitValue{Unit: PERHOUR, Value: RateValue}}
  966. RunningTaskRes := &collector.ClusterResource{}
  967. RunningTaskRes.Resource = RunningTask
  968. BalanceRes := &collector.ClusterResource{}
  969. BalanceRes.Resource = Balance
  970. RateRes := &collector.ClusterResource{}
  971. RateRes.Resource = Rate
  972. resUsage.Resources = append(resUsage.Resources, RunningTaskRes)
  973. resUsage.Resources = append(resUsage.Resources, BalanceRes)
  974. resUsage.Resources = append(resUsage.Resources, RateRes)
  975. resUsage.Tag = "Train"
  976. case "Inference":
  977. req := &modelarts.ListSpecificationsReq{
  978. //Platform: m.platform,
  979. IsPersonalCluster: true,
  980. InferType: "real-time",
  981. Limit: m.pageIndex,
  982. OffSet: m.pageSize,
  983. }
  984. ListSpecificationsResp, err := m.modelArtsRpc.ListSpecifications(ctx, req)
  985. if err != nil {
  986. return nil, err
  987. }
  988. respJobSpecificationsMarshal, err2 := json.Marshal(ListSpecificationsResp)
  989. if err2 != nil {
  990. }
  991. println(string(respJobSpecificationsMarshal))
  992. for _, Specifications := range ListSpecificationsResp.Specifications {
  993. if Specifications.SpecStatus == "normal" {
  994. MoUsage.VMemorySize = int64(Specifications.NpuInfo.Memory) //显存的值
  995. MoUsage.NpuSize = int64(Specifications.NpuInfo.Npu) //npu数量,张数
  996. MoUsage.CpuAvailable = int64(Specifications.CpuInfo.Cpu) //cpu核数
  997. MoUsage.MemoryAvailable = int64(Specifications.MemoryInfo.Memory) //内存大小
  998. npusum = MoUsage.NpuSize
  999. MoUsage.NpuAvailable = MoUsage.NpuSize
  1000. cpusum = MoUsage.CpuAvailable
  1001. memorysum = MoUsage.MemoryAvailable
  1002. VMemorysum = MoUsage.VMemorySize
  1003. MoUsage.VMemoryAvailable = MoUsage.VMemorySize
  1004. ASCENDName := Specifications.DisplayCn + Specifications.Specification
  1005. UsageCPU := &collector.Usage{Type: strings.ToUpper(CPU), Name: strings.ToUpper("ARM"), Total: &collector.UnitValue{Unit: CPUCORE, Value: cpusum}, Available: &collector.UnitValue{Unit: CPUCORE, Value: MoUsage.CpuAvailable}}
  1006. UsageNPU := &collector.Usage{Type: strings.ToUpper(NPU), Name: ASCENDName, Total: &collector.UnitValue{Unit: NUMBER, Value: npusum}, Available: &collector.UnitValue{Unit: NUMBER, Value: MoUsage.NpuAvailable}}
  1007. UsageMEMORY := &collector.Usage{Type: strings.ToUpper(MEMORY), Name: strings.ToUpper(RAM), Total: &collector.UnitValue{Unit: GIGABYTE, Value: memorysum}, Available: &collector.UnitValue{Unit: GIGABYTE, Value: MoUsage.MemoryAvailable}}
  1008. UsageVMEMORY := &collector.Usage{Type: strings.ToUpper(MEMORY), Name: strings.ToUpper(VRAM), Total: &collector.UnitValue{Unit: GIGABYTE, Value: VMemorysum}, Available: &collector.UnitValue{Unit: GIGABYTE, Value: MoUsage.VMemoryAvailable}}
  1009. Storage := &collector.Usage{Type: strings.ToUpper(STORAGE), Total: &collector.UnitValue{Unit: GIGABYTE, Value: StorageValue}, Name: strings.ToUpper("disk"), Available: &collector.UnitValue{Unit: GIGABYTE, Value: AvailableValue}}
  1010. cres := &collector.ClusterResource{}
  1011. cres.Resource = UsageNPU
  1012. cres.BaseResources = append(cres.BaseResources, UsageCPU)
  1013. cres.BaseResources = append(cres.BaseResources, UsageMEMORY)
  1014. cres.BaseResources = append(cres.BaseResources, UsageVMEMORY)
  1015. cres.BaseResources = append(cres.BaseResources, Storage)
  1016. resUsage.Tag = "Inference"
  1017. resUsage.Resources = append(resUsage.Resources, cres)
  1018. }
  1019. }
  1020. RunningTask := &collector.Usage{Type: strings.ToUpper(RUNNINGTASK), Total: &collector.UnitValue{Unit: NUMBER, Value: RunningTaskNum}}
  1021. Balance := &collector.Usage{Type: strings.ToUpper(BALANCE), Total: &collector.UnitValue{Unit: RMB, Value: BalanceValue}}
  1022. Rate := &collector.Usage{Type: strings.ToUpper(RATE), Total: &collector.UnitValue{Unit: PERHOUR, Value: RateValue}}
  1023. RunningTaskRes := &collector.ClusterResource{}
  1024. RunningTaskRes.Resource = RunningTask
  1025. BalanceRes := &collector.ClusterResource{}
  1026. BalanceRes.Resource = Balance
  1027. RateRes := &collector.ClusterResource{}
  1028. RateRes.Resource = Rate
  1029. resUsage.Resources = append(resUsage.Resources, RunningTaskRes)
  1030. resUsage.Resources = append(resUsage.Resources, BalanceRes)
  1031. resUsage.Resources = append(resUsage.Resources, RateRes)
  1032. }
  1033. return resUsage, nil
  1034. }
  1035. func (m *ModelArtsLink) Stop(ctx context.Context, id string) error {
  1036. req := &modelarts.StopTrainingJobReq{
  1037. TrainingJobId: id,
  1038. ActionType: "terminate",
  1039. }
  1040. resp, err := m.modelArtsRpc.StopTrainingJob(ctx, req)
  1041. if err != nil {
  1042. return err
  1043. }
  1044. if resp.Code != 0 {
  1045. return errors.New(resp.ErrorMsg)
  1046. }
  1047. return nil
  1048. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem for heterogeneous cloud collaboration for JCC in a non-intrusive, autonomous, peer-to-peer manner.