You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

modelarts.go 24 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863
  1. /*
  2. Copyright (c) [2023] [pcm]
  3. [pcm-coordinator] is licensed under Mulan PSL v2.
  4. You can use this software according to the terms and conditions of the Mulan PSL v2.
  5. You may obtain a copy of Mulan PSL v2 at:
  6. http://license.coscl.org.cn/MulanPSL2
  7. THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
  8. EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
  9. MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
  10. See the Mulan PSL v2 for more details.
  11. */
  12. package storeLink
  13. import (
  14. "context"
  15. "fmt"
  16. "github.com/pkg/errors"
  17. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
  18. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
  19. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference"
  20. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
  21. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
  22. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils/timeutils"
  23. "gitlink.org.cn/JointCloud/pcm-modelarts/client/imagesservice"
  24. "gitlink.org.cn/JointCloud/pcm-modelarts/client/modelartsservice"
  25. "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts"
  26. modelartsclient "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts"
  27. "gorm.io/gorm"
  28. "log"
  29. "mime/multipart"
  30. "strconv"
  31. "strings"
  32. "sync"
  33. "time"
  34. )
  // Identifiers and container start commands used by the ModelArts adapter.
  const (
  	// Ascend is the compute-card family name returned by GetComputeCards.
  	Ascend = "Ascend"
  	// Npu is the card label used in card statistics and in generated
  	// inference service names.
  	Npu = "npu"
  	// ImageNetResnet50Cmd is the start command passed to CreateModel for the
  	// "imagenet_resnet50" model. The steps are chained with "&&" so that the
  	// script only runs after the directory change succeeded; a single "&"
  	// would background the cd and run python from the original directory.
  	ImageNetResnet50Cmd = "cd /home/ma-user && python ./inference_ascend.py"
  	// ChatGLM6BCmd is the start command passed to CreateModel for the
  	// "ChatGLM-6B" model.
  	ChatGLM6BCmd = "cd /home/ma-user && python ./download_model.py && python ./inference_chatGLM.py"
  )
  41. type ModelArtsLink struct {
  42. modelArtsRpc modelartsservice.ModelArtsService
  43. modelArtsImgRpc imagesservice.ImagesService
  44. platform string
  45. participantId int64
  46. pageIndex int32
  47. pageSize int32
  48. SourceLocation string
  49. Version string
  50. ModelId string
  51. ModelType string
  52. DbEngin *gorm.DB
  53. }
  // Version is a three-part version number (major.minor.patch).
  type Version struct {
  	Major int
  	Minor int
  	Patch int
  }
  58. type AiStorage struct {
  59. DbEngin *gorm.DB
  60. }
  61. // ParseVersion 从字符串解析版本号
  62. func ParseVersion(versionStr string) (*Version, error) {
  63. parts := strings.Split(versionStr, ".")
  64. if len(parts) != 3 {
  65. return nil, fmt.Errorf("invalid version format: %s", versionStr)
  66. }
  67. major, err := strconv.Atoi(parts[0])
  68. if err != nil {
  69. return nil, err
  70. }
  71. minor, err := strconv.Atoi(parts[1])
  72. if err != nil {
  73. return nil, err
  74. }
  75. patch, err := strconv.Atoi(parts[2])
  76. if err != nil {
  77. return nil, err
  78. }
  79. return &Version{Major: major, Minor: minor, Patch: patch}, nil
  80. }
  81. // Increment 根据给定规则递增版本号
  82. func (v *Version) Increment() {
  83. if v.Patch < 9 {
  84. v.Patch++
  85. } else {
  86. v.Patch = 0
  87. if v.Minor < 9 {
  88. v.Minor++
  89. } else {
  90. v.Minor = 0
  91. v.Major++
  92. }
  93. }
  94. }
  95. // String 将版本号转换回字符串格式
  96. func (v *Version) String() string {
  97. return fmt.Sprintf("%d.%d.%d", v.Major, v.Minor, v.Patch)
  98. }
  99. func NewModelArtsLink(modelArtsRpc modelartsservice.ModelArtsService, modelArtsImgRpc imagesservice.ImagesService, name string, id int64, nickname string) *ModelArtsLink {
  100. return &ModelArtsLink{modelArtsRpc: modelArtsRpc, modelArtsImgRpc: modelArtsImgRpc, platform: nickname, participantId: id, pageIndex: 0, pageSize: 50}
  101. }
  102. func (m *ModelArtsLink) UploadImage(ctx context.Context, path string) (interface{}, error) {
  103. //TODO modelArts上传镜像
  104. return nil, nil
  105. }
  106. func (m *ModelArtsLink) DeleteImage(ctx context.Context, imageId string) (interface{}, error) {
  107. // TODO modelArts删除镜像
  108. return nil, nil
  109. }
  110. func (m *ModelArtsLink) QueryImageList(ctx context.Context) (interface{}, error) {
  111. // modelArts获取镜像列表
  112. req := &modelarts.ListRepoReq{
  113. Offset: "0",
  114. Limit: strconv.Itoa(int(m.pageSize)),
  115. Platform: m.platform,
  116. }
  117. resp, err := m.modelArtsImgRpc.ListReposDetails(ctx, req)
  118. if err != nil {
  119. return nil, err
  120. }
  121. return resp, nil
  122. }
  123. func (m *ModelArtsLink) SubmitTask(ctx context.Context, imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string, aiType string) (interface{}, error) {
  124. // modelArts提交任务
  125. environments := make(map[string]string)
  126. parameters := make([]*modelarts.ParametersTrainJob, 0)
  127. for _, env := range envs {
  128. s := strings.Split(env, COMMA)
  129. environments[s[0]] = s[1]
  130. }
  131. for _, param := range params {
  132. s := strings.Split(param, COMMA)
  133. parameters = append(parameters, &modelarts.ParametersTrainJob{
  134. Name: s[0],
  135. Value: s[1],
  136. })
  137. }
  138. req := &modelarts.CreateTrainingJobReq{
  139. Kind: "job",
  140. Metadata: &modelarts.MetadataS{
  141. Name: TASK_NAME_PREFIX + utils.RandomString(10),
  142. WorkspaceId: "0",
  143. },
  144. Algorithm: &modelarts.Algorithms{
  145. Id: algorithmId,
  146. Engine: &modelarts.EngineCreateTraining{
  147. ImageUrl: imageId,
  148. },
  149. Command: cmd,
  150. Environments: environments,
  151. Parameters: parameters,
  152. },
  153. Spec: &modelarts.SpecsC{
  154. Resource: &modelarts.ResourceCreateTraining{
  155. FlavorId: resourceId,
  156. NodeCount: 1,
  157. },
  158. },
  159. Platform: m.platform,
  160. }
  161. resp, err := m.modelArtsRpc.CreateTrainingJob(ctx, req)
  162. //tx := m.DbEngin.Create(adapterId)
  163. /*if tx.Error != nil {
  164. return tx.Error, nil
  165. }*/
  166. if err != nil {
  167. return nil, err
  168. }
  169. return resp, nil
  170. }
  171. func (m *ModelArtsLink) QueryTask(ctx context.Context, taskId string) (interface{}, error) {
  172. // 获取任务
  173. req := &modelarts.DetailTrainingJobsReq{
  174. TrainingJobId: taskId,
  175. Platform: m.platform,
  176. }
  177. resp, err := m.modelArtsRpc.GetTrainingJobs(ctx, req)
  178. if err != nil {
  179. return nil, err
  180. }
  181. return resp, nil
  182. }
  183. func (m *ModelArtsLink) DeleteTask(ctx context.Context, taskId string) (interface{}, error) {
  184. // 删除任务
  185. req := &modelarts.DeleteTrainingJobReq{
  186. TrainingJobId: taskId,
  187. Platform: m.platform,
  188. }
  189. resp, err := m.modelArtsRpc.DeleteTrainingJob(ctx, req)
  190. if err != nil {
  191. return nil, err
  192. }
  193. return resp, nil
  194. }
  195. func (m *ModelArtsLink) QuerySpecs(ctx context.Context) (interface{}, error) {
  196. // octopus查询资源规格
  197. req := &modelarts.TrainingJobFlavorsReq{
  198. Platform: m.platform,
  199. }
  200. resp, err := m.modelArtsRpc.GetTrainingJobFlavors(ctx, req)
  201. if err != nil {
  202. return nil, err
  203. }
  204. return resp, nil
  205. }
  206. func (m *ModelArtsLink) GetResourceStats(ctx context.Context) (*collector.ResourceStats, error) {
  207. req := &modelarts.GetPoolsRuntimeMetricsReq{}
  208. resp, err := m.modelArtsRpc.GetPoolsRuntimeMetrics(ctx, req)
  209. if err != nil {
  210. return nil, err
  211. }
  212. if resp.ErrorMsg != "" {
  213. return nil, errors.New("failed to get algorithms")
  214. }
  215. resourceStats := &collector.ResourceStats{}
  216. CpuCoreTotalSum := int64(0)
  217. CpuCoreAvailSum := int64(0)
  218. MemTotalSum := float64(0)
  219. MemAvailSum := float64(0)
  220. var CpuCoreTotal int64
  221. var CpuCoreAvail int64
  222. var MemTotal float64
  223. var MemAvail float64
  224. for _, items := range resp.Items {
  225. //TODO The value of taskType is temporarily fixed to "pytorch"
  226. CpuCoreTotal, err = strconv.ParseInt(items.Table.Capacity.Value.Cpu, 10, 64)
  227. CpuCoreTotalSum += CpuCoreTotal
  228. CpuCoreAvail, err = strconv.ParseInt(items.Table.Allocated.Value.Cpu, 10, 64)
  229. CpuCoreAvailSum += CpuCoreAvail
  230. MemTotal, err = strconv.ParseFloat(items.Table.Capacity.Value.Memory, 64)
  231. MemTotalSum += MemTotal
  232. MemAvail, err = strconv.ParseFloat(items.Table.Allocated.Value.Memory, 64)
  233. MemAvailSum += MemAvail
  234. }
  235. resourceStats.CpuCoreTotal = CpuCoreTotalSum
  236. resourceStats.CpuCoreAvail = CpuCoreAvailSum
  237. resourceStats.MemTotal = MemTotalSum
  238. resourceStats.MemAvail = MemAvailSum
  239. req1 := &modelarts.GetResourceFlavorsReq{}
  240. resp1, err := m.modelArtsRpc.GetResourceFlavors(ctx, req1)
  241. num32, _ := strconv.Atoi(resp1.Items[0].Spec.Npu.Size)
  242. var cards []*collector.Card
  243. card := &collector.Card{
  244. Platform: MODELARTS,
  245. Type: CARD,
  246. Name: Npu,
  247. CardNum: int32(num32),
  248. TOpsAtFp16: float64(num32 * 320),
  249. }
  250. cards = append(cards, card)
  251. resourceStats.CardsAvail = cards
  252. return resourceStats, nil
  253. }
  254. func (m *ModelArtsLink) GetDatasetsSpecs(ctx context.Context) ([]*collector.DatasetsSpecs, error) {
  255. return nil, nil
  256. }
  257. func (m *ModelArtsLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm, error) {
  258. var algorithms []*collector.Algorithm
  259. req := &modelarts.ListAlgorithmsReq{
  260. Platform: m.platform,
  261. Offset: m.pageIndex,
  262. Limit: m.pageSize,
  263. }
  264. resp, err := m.modelArtsRpc.ListAlgorithms(ctx, req)
  265. if err != nil {
  266. return nil, err
  267. }
  268. if resp.ErrorMsg != "" {
  269. return nil, errors.New("failed to get algorithms")
  270. }
  271. for _, a := range resp.Items {
  272. //TODO The value of taskType is temporarily fixed to "pytorch"
  273. algorithm := &collector.Algorithm{Name: a.Metadata.Name, Platform: MODELARTS, TaskType: "pytorch"}
  274. algorithms = append(algorithms, algorithm)
  275. }
  276. return algorithms, nil
  277. }
  278. func (m *ModelArtsLink) GetComputeCards(ctx context.Context) ([]string, error) {
  279. var cards []string
  280. cards = append(cards, Ascend)
  281. return cards, nil
  282. }
  283. func (m *ModelArtsLink) GetUserBalance(ctx context.Context) (float64, error) {
  284. return 0, nil
  285. }
  286. func (m *ModelArtsLink) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) {
  287. return "", nil
  288. }
  289. func (m *ModelArtsLink) UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error {
  290. return nil
  291. }
  292. // Determine whether there is a necessary image in image management and query the image name based on the image name
  293. func (m *ModelArtsLink) getSourceLocationFromImages(ctx context.Context, option *option.InferOption) error {
  294. req := &modelarts.ListImagesReq{
  295. //Platform: m.platform,
  296. Limit: 50,
  297. Offset: 0,
  298. }
  299. ListImagesResp, err := m.modelArtsRpc.ListImages(ctx, req)
  300. if err != nil {
  301. return err
  302. }
  303. if ListImagesResp.Code != 200 {
  304. return errors.New("failed to get ListImages")
  305. }
  306. for _, ListImages := range ListImagesResp.Data {
  307. if option.ModelName == "ChatGLM-6B" {
  308. if ListImages.Name == "chatglm-6b" {
  309. m.SourceLocation = ListImages.SwrPath
  310. return nil
  311. }
  312. } else {
  313. if ListImages.Name == option.ModelName {
  314. m.SourceLocation = ListImages.SwrPath
  315. return nil
  316. }
  317. }
  318. }
  319. return errors.New("SourceLocation not set")
  320. }
  321. // Get AI Application List
  322. func (m *ModelArtsLink) GetModelId(ctx context.Context, option *option.InferOption) error {
  323. req := &modelarts.ListModelReq{
  324. Platform: m.platform,
  325. ModelName: option.ModelName,
  326. //ModelType: "Image",
  327. Limit: int64(m.pageIndex),
  328. Offset: int64(m.pageSize),
  329. }
  330. ListModelResp, err := m.modelArtsRpc.ListModels(ctx, req)
  331. if err != nil {
  332. return err
  333. }
  334. if ListModelResp.Code == 200 {
  335. //return errors.New("failed to get ModelId")
  336. for _, ListModel := range ListModelResp.Models {
  337. if ListModel.ModelName == option.ModelName {
  338. option.ModelId = ListModel.ModelId
  339. m.Version = ListModel.ModelVersion
  340. return nil
  341. }
  342. }
  343. }
  344. err = m.CreateModel(ctx, option)
  345. if err != nil {
  346. return err
  347. }
  348. return nil
  349. }
  350. func (m *ModelArtsLink) GetModel(ctx context.Context, option *option.InferOption) string {
  351. req := &modelarts.ShowModelReq{
  352. Platform: m.platform,
  353. ModelId: option.ModelId,
  354. }
  355. ctx, cancel := context.WithTimeout(context.Background(), 50*time.Second)
  356. defer cancel()
  357. ShowModelsResp, err := m.modelArtsRpc.ShowModels(ctx, req)
  358. if err != nil {
  359. if err == context.DeadlineExceeded {
  360. log.Println("Request timed out")
  361. // 重试请求或其他处理
  362. } else {
  363. log.Fatalf("could not call method: %v", err)
  364. }
  365. }
  366. if ShowModelsResp.Code != 200 {
  367. errors.New("failed to get findModelsStatus")
  368. }
  369. m.ModelType = ShowModelsResp.ShowModelDetail.ModelAlgorithm
  370. return ShowModelsResp.ShowModelDetail.ModelStatus
  371. }
  372. // Get AI Application List
  373. func (m *ModelArtsLink) GetModelStatus(ctx context.Context, option *option.InferOption) error {
  374. var wg sync.WaitGroup
  375. wg.Add(1)
  376. // 使用goroutine进行轮询
  377. //defer wg.Done()
  378. for {
  379. status := m.GetModel(ctx, option)
  380. if status == "published" {
  381. fmt.Println("Model is now published.")
  382. break // 一旦状态变为published,就退出循环
  383. }
  384. fmt.Println("Waiting for model to be published...")
  385. time.Sleep(5 * time.Second) // 等待一段时间后再次检查
  386. }
  387. // 在这里执行模型状态为published后需要进行的操作
  388. fmt.Println("Continuing with the program...")
  389. return nil
  390. }
  391. // Create an AI application
  392. func (m *ModelArtsLink) CreateModel(ctx context.Context, option *option.InferOption) error {
  393. //Before creating an AI application, check if there are any images that can be created
  394. err := m.getSourceLocationFromImages(ctx, option)
  395. if err != nil { //
  396. return errors.New("No image available for creationd")
  397. }
  398. //
  399. var CMD string
  400. if option.ModelName == "imagenet_resnet50" {
  401. CMD = ImageNetResnet50Cmd
  402. } else if option.ModelName == "ChatGLM-6B" {
  403. CMD = ChatGLM6BCmd
  404. }
  405. if m.Version == "" {
  406. m.Version = "0.0.1"
  407. }
  408. version, err := ParseVersion(m.Version)
  409. version.Increment()
  410. req := &modelarts.CreateModelReq{
  411. Platform: m.platform,
  412. ModelName: option.ModelName,
  413. ModelType: "Image",
  414. ModelVersion: version.String(),
  415. SourceLocation: m.SourceLocation,
  416. InstallType: []string{"real-time"},
  417. Cmd: CMD,
  418. ModelAlgorithm: option.ModelType,
  419. }
  420. ModelResp, err := m.modelArtsRpc.CreateModel(ctx, req)
  421. if err != nil {
  422. return err
  423. }
  424. if ModelResp.Code != 200 {
  425. return errors.New("failed to get ModelId")
  426. }
  427. option.ModelId = ModelResp.ModelId
  428. return nil
  429. }
  430. func (m *ModelArtsLink) GetSpecifications(ctx context.Context, option *option.AiOption, ifoption *option.InferOption) error {
  431. req := &modelarts.ListSpecificationsReq{
  432. //Platform: m.platform,
  433. IsPersonalCluster: false,
  434. InferType: "real-time",
  435. Limit: m.pageIndex,
  436. OffSet: m.pageSize,
  437. }
  438. ListSpecificationsResp, err := m.modelArtsRpc.ListSpecifications(ctx, req)
  439. if err != nil {
  440. return err
  441. }
  442. for _, ListSpecifications := range ListSpecificationsResp.Specifications {
  443. if ListSpecifications.Specification == "modelarts.kat1.xlarge" {
  444. ifoption.Specification = ListSpecifications.Specification
  445. return nil
  446. }
  447. }
  448. return nil
  449. }
  450. func (m *ModelArtsLink) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) {
  451. req := &modelartsservice.GetTrainingJobLogsPreviewReq{
  452. Platform: m.platform,
  453. TaskId: "worker-0",
  454. TrainingJobId: taskId,
  455. }
  456. resp, err := m.modelArtsRpc.GetTrainingJobLogsPreview(ctx, req)
  457. if err != nil {
  458. return "", err
  459. }
  460. if strings.Contains(resp.Content, "404 Not Found") {
  461. resp.Content = "waiting for logs..."
  462. }
  463. return resp.Content, nil
  464. }
  465. func (m *ModelArtsLink) GetTrainingTask(ctx context.Context, taskId string) (*collector.Task, error) {
  466. resp, err := m.QueryTask(ctx, taskId)
  467. if err != nil {
  468. return nil, err
  469. }
  470. jobresp, ok := (resp).(*modelartsservice.JobResponse)
  471. if jobresp.ErrorMsg != "" || !ok {
  472. if jobresp.ErrorMsg != "" {
  473. return nil, errors.New(jobresp.ErrorMsg)
  474. } else {
  475. return nil, errors.New("get training task failed, empty error returned")
  476. }
  477. }
  478. var task collector.Task
  479. task.Id = jobresp.Metadata.Id
  480. switch strings.ToLower(jobresp.Status.Phase) {
  481. case "completed":
  482. milliTimestamp := int64(jobresp.Status.StartTime)
  483. task.Start = timeutils.MillisecondsToUTCString(milliTimestamp, time.DateTime)
  484. duration := int64(jobresp.Status.Duration)
  485. task.End = timeutils.MillisecondsToAddDurationToUTCString(milliTimestamp, duration, time.DateTime)
  486. task.Status = constants.Completed
  487. case "failed":
  488. milliTimestamp := int64(jobresp.Status.StartTime)
  489. task.Start = timeutils.MillisecondsToUTCString(milliTimestamp, time.DateTime)
  490. duration := int64(jobresp.Status.Duration)
  491. task.End = timeutils.MillisecondsToAddDurationToUTCString(milliTimestamp, duration, time.DateTime)
  492. task.Status = constants.Failed
  493. case "running":
  494. milliTimestamp := int64(jobresp.Status.StartTime)
  495. task.Start = timeutils.MillisecondsToUTCString(milliTimestamp, time.DateTime)
  496. task.Status = constants.Running
  497. case "stopped":
  498. task.Status = constants.Stopped
  499. case "pending":
  500. task.Status = constants.Pending
  501. case "terminated":
  502. //TODO Failed
  503. task.Status = constants.Failed
  504. default:
  505. task.Status = "undefined"
  506. }
  507. return &task, nil
  508. }
  509. func (m *ModelArtsLink) Execute(ctx context.Context, option *option.AiOption) (interface{}, error) {
  510. err := m.GenerateSubmitParams(ctx, option)
  511. if err != nil {
  512. return nil, err
  513. }
  514. task, err := m.SubmitTask(ctx, option.ImageId, option.Cmd, option.Envs, option.Params, option.ResourceId, option.DatasetsId, option.AlgorithmId, option.TaskType)
  515. if err != nil {
  516. return nil, err
  517. }
  518. return task, nil
  519. }
  520. func (m *ModelArtsLink) GenerateSubmitParams(ctx context.Context, option *option.AiOption) error {
  521. err := m.generateResourceId(ctx, option, nil)
  522. if err != nil {
  523. return err
  524. }
  525. err = m.generateAlgorithmId(ctx, option)
  526. if err != nil {
  527. return err
  528. }
  529. err = m.generateImageId(option)
  530. if err != nil {
  531. return err
  532. }
  533. err = m.generateCmd(option)
  534. if err != nil {
  535. return err
  536. }
  537. err = m.generateEnv(option)
  538. if err != nil {
  539. return err
  540. }
  541. err = m.generateParams(option)
  542. if err != nil {
  543. return err
  544. }
  545. return nil
  546. }
  547. func (m *ModelArtsLink) generateResourceId(ctx context.Context, option *option.AiOption, ifoption *option.InferOption) error {
  548. option.ResourceId = "modelarts.kat1.xlarge"
  549. return nil
  550. }
  551. func (m *ModelArtsLink) generateImageId(option *option.AiOption) error {
  552. return nil
  553. }
  554. func (m *ModelArtsLink) generateCmd(option *option.AiOption) error {
  555. return nil
  556. }
  557. func (m *ModelArtsLink) generateEnv(option *option.AiOption) error {
  558. return nil
  559. }
  560. func (m *ModelArtsLink) generateParams(option *option.AiOption) error {
  561. return nil
  562. }
  563. func (m *ModelArtsLink) generateAlgorithmId(ctx context.Context, option *option.AiOption) error {
  564. req := &modelarts.ListAlgorithmsReq{
  565. Platform: m.platform,
  566. Offset: m.pageIndex,
  567. Limit: m.pageSize,
  568. }
  569. resp, err := m.modelArtsRpc.ListAlgorithms(ctx, req)
  570. if err != nil {
  571. return err
  572. }
  573. if resp.ErrorMsg != "" {
  574. return errors.New("failed to get algorithmId")
  575. }
  576. for _, algorithm := range resp.Items {
  577. engVersion := algorithm.JobConfig.Engine.EngineVersion
  578. if strings.Contains(engVersion, option.TaskType) {
  579. ns := strings.Split(algorithm.Metadata.Name, DASH)
  580. if ns[0] != option.TaskType {
  581. continue
  582. }
  583. if ns[1] != option.DatasetsName {
  584. continue
  585. }
  586. if ns[2] != option.AlgorithmName {
  587. continue
  588. }
  589. option.AlgorithmId = algorithm.Metadata.Id
  590. return nil
  591. }
  592. }
  593. if option.AlgorithmId == "" {
  594. return errors.New("Algorithm does not exist")
  595. }
  596. return errors.New("failed to get AlgorithmId")
  597. }
  598. func (m *ModelArtsLink) GetClusterInferUrl(ctx context.Context, option *option.InferOption) (*inference.ClusterInferUrl, error) {
  599. var imageUrls []*inference.InferUrl
  600. urlReq := &modelartsclient.ImageReasoningUrlReq{
  601. ServiceName: option.ModelName,
  602. Type: option.ModelType,
  603. Card: "npu",
  604. }
  605. urlResp, err := m.modelArtsRpc.ImageReasoningUrl(ctx, urlReq)
  606. if err != nil {
  607. return nil, err
  608. }
  609. imageUrl := &inference.InferUrl{
  610. Url: urlResp.Url,
  611. Card: "npu",
  612. }
  613. imageUrls = append(imageUrls, imageUrl)
  614. clusterWithUrl := &inference.ClusterInferUrl{
  615. ClusterName: m.platform,
  616. ClusterType: TYPE_MODELARTS,
  617. InferUrls: imageUrls,
  618. }
  619. return clusterWithUrl, nil
  620. }
  621. func (m *ModelArtsLink) GetInferDeployInstanceList(ctx context.Context) ([]*inference.DeployInstance, error) {
  622. var insList []*inference.DeployInstance
  623. req := &modelarts.ListServicesReq{
  624. Platform: m.platform,
  625. OffSet: m.pageIndex,
  626. Limit: m.pageSize,
  627. }
  628. //list, err := m.modelArtsRpc.ListServices(ctx, req)
  629. resp, err := m.modelArtsRpc.ListServices(ctx, req)
  630. if err != nil {
  631. return nil, err
  632. }
  633. if resp.ErrorMsg != "" {
  634. return nil, errors.New(resp.Msg)
  635. }
  636. for _, services := range resp.Services {
  637. ins := &inference.DeployInstance{}
  638. ins.InstanceName = services.ServiceName
  639. ins.InstanceId = services.ServiceId
  640. ins.Status = services.Status
  641. ins.InferCard = "NPU"
  642. ins.ClusterName = m.platform
  643. ins.CreatedTime = string(services.StartTime)
  644. ins.ClusterType = TYPE_MODELARTS
  645. insList = append(insList, ins)
  646. }
  647. return insList, nil
  648. }
  649. func (m *ModelArtsLink) StartInferDeployInstance(ctx context.Context, id string) bool {
  650. req := &modelartsclient.UpdateServiceReq{
  651. ServiceId: id,
  652. Status: "running",
  653. }
  654. resp, err := m.modelArtsRpc.UpdateService(ctx, req)
  655. if err != nil || resp.Code != 0 {
  656. return false
  657. }
  658. if resp.Code == 0 {
  659. return true
  660. }
  661. return false
  662. }
  663. func (m *ModelArtsLink) StopInferDeployInstance(ctx context.Context, id string) bool {
  664. req := &modelartsclient.UpdateServiceReq{
  665. ServiceId: id,
  666. Status: "stopped",
  667. }
  668. resp, err := m.modelArtsRpc.UpdateService(ctx, req)
  669. if err != nil || resp.Code != 0 {
  670. return false
  671. }
  672. if resp.Code == 0 {
  673. return true
  674. }
  675. return false
  676. }
  677. func (m *ModelArtsLink) GetInferDeployInstance(ctx context.Context, id string) (*inference.DeployInstance, error) {
  678. req := &modelarts.ShowServiceReq{
  679. ServiceId: id,
  680. }
  681. resp, err := m.modelArtsRpc.ShowService(ctx, req)
  682. if err != nil {
  683. return nil, err
  684. }
  685. if resp.ErrorMsg != "" {
  686. return nil, errors.New(resp.Msg)
  687. }
  688. ins := &inference.DeployInstance{}
  689. ins.InstanceName = resp.ServiceName
  690. ins.InstanceId = resp.ServiceId
  691. ins.Status = resp.Status
  692. ins.InferCard = "NPU"
  693. ins.ClusterName = m.platform
  694. ins.CreatedTime = string(resp.StartTime)
  695. ins.ClusterType = TYPE_MODELARTS
  696. ins.ModelName = resp.Config[0].ModelName
  697. ins.ModelType = m.ModelType
  698. ins.InferUrl = resp.AccessAddress
  699. return ins, nil
  700. }
  701. func (m *ModelArtsLink) GetImageInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) {
  702. return "", nil
  703. }
  704. func (m *ModelArtsLink) CreateInferDeployInstance(ctx context.Context, option *option.InferOption) (string, error) {
  705. err := m.GetModelId(ctx, option)
  706. if err != nil {
  707. return "", err
  708. }
  709. err = m.GetModelStatus(ctx, option)
  710. if err != nil {
  711. return "", err
  712. }
  713. configParam := &modelarts.ServiceConfig{
  714. Specification: "modelarts.kat1.xlarge",
  715. Weight: 100,
  716. ModelId: option.ModelId,
  717. InstanceCount: 1,
  718. }
  719. var configItems []*modelarts.ServiceConfig
  720. configItems = append(configItems, configParam)
  721. now := time.Now()
  722. timestampSec := now.Unix()
  723. str := strconv.FormatInt(timestampSec, 10)
  724. req := &modelarts.CreateServiceReq{
  725. Platform: m.platform,
  726. Config: configItems,
  727. InferType: "real-time",
  728. ServiceName: option.ModelName + "_" + option.ModelType + "_" + Npu + "_" + str,
  729. }
  730. ctx, cancel := context.WithTimeout(context.Background(), 150*time.Second)
  731. defer cancel()
  732. resp, err := m.modelArtsRpc.CreateService(ctx, req)
  733. if err != nil {
  734. return "", err
  735. }
  736. return resp.ServiceId, nil
  737. }
  738. func (m *ModelArtsLink) CheckModelExistence(ctx context.Context, name string, mtype string) bool {
  739. ifoption := &option.InferOption{
  740. ModelName: name,
  741. ModelType: mtype,
  742. }
  743. err := m.CheckImageExist(ctx, ifoption)
  744. if err != nil {
  745. return false
  746. }
  747. return true
  748. }
  749. func (m *ModelArtsLink) CheckImageExist(ctx context.Context, option *option.InferOption) error {
  750. req := &modelarts.ListImagesReq{
  751. Limit: m.pageSize,
  752. Offset: m.pageIndex,
  753. }
  754. ListImageResp, err := m.modelArtsRpc.ListImages(ctx, req)
  755. if err != nil {
  756. return err
  757. }
  758. var modelName string
  759. if ListImageResp.Code == 200 {
  760. //return errors.New("failed to get ModelId")
  761. for _, ListImage := range ListImageResp.Data {
  762. if option.ModelName == "ChatGLM-6B" {
  763. modelName = "chatglm-6b"
  764. } else {
  765. modelName = option.ModelName
  766. }
  767. if ListImage.Name == modelName {
  768. return nil
  769. }
  770. }
  771. }
  772. return errors.New("failed to find Image ")
  773. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive and autonomous peer-to-peer manner.