You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

modelarts.go 24 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866
  1. /*
  2. Copyright (c) [2023] [pcm]
  3. [pcm-coordinator] is licensed under Mulan PSL v2.
  4. You can use this software according to the terms and conditions of the Mulan PSL v2.
  5. You may obtain a copy of Mulan PSL v2 at:
  6. http://license.coscl.org.cn/MulanPSL2
  7. THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
  8. EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
  9. MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
  10. See the Mulan PSL v2 for more details.
  11. */
  12. package storeLink
  13. import (
  14. "context"
  15. "fmt"
  16. "github.com/pkg/errors"
  17. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/schedulers/option"
  18. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
  19. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/inference"
  20. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
  21. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
  22. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
  23. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils/timeutils"
  24. "gitlink.org.cn/JointCloud/pcm-modelarts/client/imagesservice"
  25. "gitlink.org.cn/JointCloud/pcm-modelarts/client/modelartsservice"
  26. "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts"
  27. modelartsclient "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts"
  28. "gorm.io/gorm"
  29. "log"
  30. "mime/multipart"
  31. "strconv"
  32. "strings"
  33. "sync"
  34. "time"
  35. )
  // Identifiers and container start-up commands used by the ModelArts adapter.
  const (
  	// Ascend is the compute-card family name reported by GetComputeCards.
  	Ascend = "Ascend"
  	// Npu is the card name used in resource statistics and service names.
  	Npu = "npu"
  	// ImageNetResnet50Cmd is the command passed to CreateModel for the
  	// "imagenet_resnet50" model. The steps are chained with "&&" so python
  	// only runs after the cd succeeded; a single "&" would background the cd
  	// and start python from the original working directory.
  	ImageNetResnet50Cmd = "cd /home/ma-user && python ./inference_ascend.py"
  	// ChatGLM6BCmd is the command passed to CreateModel for the "ChatGLM-6B"
  	// model: it runs download_model.py and then inference_chatGLM.py.
  	ChatGLM6BCmd = "cd /home/ma-user && python ./download_model.py && python ./inference_chatGLM.py"
  )
  42. type ModelArtsLink struct {
  43. modelArtsRpc modelartsservice.ModelArtsService
  44. modelArtsImgRpc imagesservice.ImagesService
  45. platform string
  46. participantId int64
  47. pageIndex int32
  48. pageSize int32
  49. SourceLocation string
  50. Version string
  51. ModelId string
  52. ModelType string
  53. DbEngin *gorm.DB
  54. }
  // Version is a three-part numeric version of the form Major.Minor.Patch.
  type Version struct {
  	Major, Minor, Patch int
  }

  // ParseVersion parses a "major.minor.patch" string into a Version.
  //
  // It returns an error when versionStr does not consist of exactly three
  // dot-separated parts, or the strconv error of the first part that is not a
  // valid integer.
  func ParseVersion(versionStr string) (*Version, error) {
  	fields := strings.Split(versionStr, ".")
  	if len(fields) != 3 {
  		return nil, fmt.Errorf("invalid version format: %s", versionStr)
  	}

  	var nums [3]int
  	for i, field := range fields {
  		n, err := strconv.Atoi(field)
  		if err != nil {
  			return nil, err
  		}
  		nums[i] = n
  	}
  	return &Version{Major: nums[0], Minor: nums[1], Patch: nums[2]}, nil
  }

  // Increment advances the version by one step. Patch is bumped while it is
  // below 9; otherwise it resets to 0 and Minor is bumped while that is below
  // 9; otherwise Minor also resets to 0 and Major is bumped.
  func (v *Version) Increment() {
  	switch {
  	case v.Patch < 9:
  		v.Patch++
  	case v.Minor < 9:
  		v.Patch = 0
  		v.Minor++
  	default:
  		v.Patch = 0
  		v.Minor = 0
  		v.Major++
  	}
  }

  // String renders the version back into "major.minor.patch" form.
  func (v *Version) String() string {
  	return strconv.Itoa(v.Major) + "." + strconv.Itoa(v.Minor) + "." + strconv.Itoa(v.Patch)
  }
  97. func NewModelArtsLink(modelArtsRpc modelartsservice.ModelArtsService, modelArtsImgRpc imagesservice.ImagesService, name string, id int64, nickname string) *ModelArtsLink {
  98. return &ModelArtsLink{modelArtsRpc: modelArtsRpc, modelArtsImgRpc: modelArtsImgRpc, platform: nickname, participantId: id, pageIndex: 0, pageSize: 50}
  99. }
  100. func (m *ModelArtsLink) UploadImage(ctx context.Context, path string) (interface{}, error) {
  101. //TODO modelArts上传镜像
  102. return nil, nil
  103. }
  104. func (m *ModelArtsLink) DeleteImage(ctx context.Context, imageId string) (interface{}, error) {
  105. // TODO modelArts删除镜像
  106. return nil, nil
  107. }
  108. func (m *ModelArtsLink) QueryImageList(ctx context.Context) (interface{}, error) {
  109. // modelArts获取镜像列表
  110. req := &modelarts.ListRepoReq{
  111. Offset: "0",
  112. Limit: strconv.Itoa(int(m.pageSize)),
  113. Platform: m.platform,
  114. }
  115. resp, err := m.modelArtsImgRpc.ListReposDetails(ctx, req)
  116. if err != nil {
  117. return nil, err
  118. }
  119. return resp, nil
  120. }
  121. func (m *ModelArtsLink) SubmitTask(ctx context.Context, imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string, aiType string) (interface{}, error) {
  122. // modelArts提交任务
  123. environments := make(map[string]string)
  124. parameters := make([]*modelarts.ParametersTrainJob, 0)
  125. for _, env := range envs {
  126. s := strings.Split(env, COMMA)
  127. environments[s[0]] = s[1]
  128. }
  129. for _, param := range params {
  130. s := strings.Split(param, COMMA)
  131. parameters = append(parameters, &modelarts.ParametersTrainJob{
  132. Name: s[0],
  133. Value: s[1],
  134. })
  135. }
  136. req := &modelarts.CreateTrainingJobReq{
  137. Kind: "job",
  138. Metadata: &modelarts.MetadataS{
  139. Name: TASK_NAME_PREFIX + utils.RandomString(10),
  140. WorkspaceId: "0",
  141. },
  142. Algorithm: &modelarts.Algorithms{
  143. Id: algorithmId,
  144. Engine: &modelarts.EngineCreateTraining{
  145. ImageUrl: imageId,
  146. },
  147. Command: cmd,
  148. Environments: environments,
  149. Parameters: parameters,
  150. },
  151. Spec: &modelarts.SpecsC{
  152. Resource: &modelarts.ResourceCreateTraining{
  153. FlavorId: resourceId,
  154. NodeCount: 1,
  155. },
  156. },
  157. //Platform: m.platform,
  158. Platform: "modelarts-CloudBrain2",
  159. }
  160. resp, err := m.modelArtsRpc.CreateTrainingJob(ctx, req)
  161. aiModelarts := models.TaskAiModelarts{}
  162. aiModelarts.ImageId = imageId
  163. aiModelarts.FlavorId = resourceId
  164. aiModelarts.Cmd = cmd
  165. //aiModelarts.TaskId =
  166. tx := m.DbEngin.Table("task_ai_modelarts").Create(&aiModelarts)
  167. if tx.Error != nil {
  168. return tx.Error, nil
  169. }
  170. if err != nil {
  171. return nil, err
  172. }
  173. return resp, nil
  174. }
  175. func (m *ModelArtsLink) QueryTask(ctx context.Context, taskId string) (interface{}, error) {
  176. // 获取任务
  177. req := &modelarts.DetailTrainingJobsReq{
  178. TrainingJobId: taskId,
  179. Platform: m.platform,
  180. }
  181. resp, err := m.modelArtsRpc.GetTrainingJobs(ctx, req)
  182. if err != nil {
  183. return nil, err
  184. }
  185. return resp, nil
  186. }
  187. func (m *ModelArtsLink) DeleteTask(ctx context.Context, taskId string) (interface{}, error) {
  188. // 删除任务
  189. req := &modelarts.DeleteTrainingJobReq{
  190. TrainingJobId: taskId,
  191. Platform: m.platform,
  192. }
  193. resp, err := m.modelArtsRpc.DeleteTrainingJob(ctx, req)
  194. if err != nil {
  195. return nil, err
  196. }
  197. return resp, nil
  198. }
  199. func (m *ModelArtsLink) QuerySpecs(ctx context.Context) (interface{}, error) {
  200. // octopus查询资源规格
  201. req := &modelarts.TrainingJobFlavorsReq{
  202. Platform: m.platform,
  203. }
  204. resp, err := m.modelArtsRpc.GetTrainingJobFlavors(ctx, req)
  205. if err != nil {
  206. return nil, err
  207. }
  208. return resp, nil
  209. }
  210. func (m *ModelArtsLink) GetResourceStats(ctx context.Context) (*collector.ResourceStats, error) {
  211. req := &modelarts.GetPoolsRuntimeMetricsReq{}
  212. resp, err := m.modelArtsRpc.GetPoolsRuntimeMetrics(ctx, req)
  213. if err != nil {
  214. return nil, err
  215. }
  216. if resp.ErrorMsg != "" {
  217. return nil, errors.New("failed to get algorithms")
  218. }
  219. resourceStats := &collector.ResourceStats{}
  220. CpuCoreTotalSum := int64(0)
  221. CpuCoreAvailSum := int64(0)
  222. MemTotalSum := float64(0)
  223. MemAvailSum := float64(0)
  224. var CpuCoreTotal int64
  225. var CpuCoreAvail int64
  226. var MemTotal float64
  227. var MemAvail float64
  228. for _, items := range resp.Items {
  229. //TODO The value of taskType is temporarily fixed to "pytorch"
  230. CpuCoreTotal, err = strconv.ParseInt(items.Table.Capacity.Value.Cpu, 10, 64)
  231. CpuCoreTotalSum += CpuCoreTotal
  232. CpuCoreAvail, err = strconv.ParseInt(items.Table.Allocated.Value.Cpu, 10, 64)
  233. CpuCoreAvailSum += CpuCoreAvail
  234. MemTotal, err = strconv.ParseFloat(items.Table.Capacity.Value.Memory, 64)
  235. MemTotalSum += MemTotal
  236. MemAvail, err = strconv.ParseFloat(items.Table.Allocated.Value.Memory, 64)
  237. MemAvailSum += MemAvail
  238. }
  239. resourceStats.CpuCoreTotal = CpuCoreTotalSum
  240. resourceStats.CpuCoreAvail = CpuCoreAvailSum
  241. resourceStats.MemTotal = MemTotalSum
  242. resourceStats.MemAvail = MemAvailSum
  243. req1 := &modelarts.GetResourceFlavorsReq{}
  244. resp1, err := m.modelArtsRpc.GetResourceFlavors(ctx, req1)
  245. num32, _ := strconv.Atoi(resp1.Items[0].Spec.Npu.Size)
  246. var cards []*collector.Card
  247. card := &collector.Card{
  248. Platform: MODELARTS,
  249. Type: CARD,
  250. Name: Npu,
  251. CardNum: int32(num32),
  252. TOpsAtFp16: float64(num32 * 320),
  253. }
  254. cards = append(cards, card)
  255. resourceStats.CardsAvail = cards
  256. return resourceStats, nil
  257. }
  258. func (m *ModelArtsLink) GetDatasetsSpecs(ctx context.Context) ([]*collector.DatasetsSpecs, error) {
  259. return nil, nil
  260. }
  261. func (m *ModelArtsLink) GetAlgorithms(ctx context.Context) ([]*collector.Algorithm, error) {
  262. var algorithms []*collector.Algorithm
  263. req := &modelarts.ListAlgorithmsReq{
  264. Platform: m.platform,
  265. Offset: m.pageIndex,
  266. Limit: m.pageSize,
  267. }
  268. resp, err := m.modelArtsRpc.ListAlgorithms(ctx, req)
  269. if err != nil {
  270. return nil, err
  271. }
  272. if resp.ErrorMsg != "" {
  273. return nil, errors.New("failed to get algorithms")
  274. }
  275. for _, a := range resp.Items {
  276. //TODO The value of taskType is temporarily fixed to "pytorch"
  277. algorithm := &collector.Algorithm{Name: a.Metadata.Name, Platform: MODELARTS, TaskType: "pytorch"}
  278. algorithms = append(algorithms, algorithm)
  279. }
  280. return algorithms, nil
  281. }
  282. func (m *ModelArtsLink) GetComputeCards(ctx context.Context) ([]string, error) {
  283. var cards []string
  284. cards = append(cards, Ascend)
  285. return cards, nil
  286. }
  287. func (m *ModelArtsLink) GetUserBalance(ctx context.Context) (float64, error) {
  288. return 0, nil
  289. }
  290. func (m *ModelArtsLink) DownloadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string) (string, error) {
  291. return "", nil
  292. }
  293. func (m *ModelArtsLink) UploadAlgorithmCode(ctx context.Context, resourceType string, card string, taskType string, dataset string, algorithm string, code string) error {
  294. return nil
  295. }
  296. // Determine whether there is a necessary image in image management and query the image name based on the image name
  297. func (m *ModelArtsLink) getSourceLocationFromImages(ctx context.Context, option *option.InferOption) error {
  298. req := &modelarts.ListImagesReq{
  299. //Platform: m.platform,
  300. Limit: 50,
  301. Offset: 0,
  302. }
  303. ListImagesResp, err := m.modelArtsRpc.ListImages(ctx, req)
  304. if err != nil {
  305. return err
  306. }
  307. if ListImagesResp.Code != 200 {
  308. return errors.New("failed to get ListImages")
  309. }
  310. for _, ListImages := range ListImagesResp.Data {
  311. if option.ModelName == "ChatGLM-6B" {
  312. if ListImages.Name == "chatglm-6b" {
  313. m.SourceLocation = ListImages.SwrPath
  314. return nil
  315. }
  316. } else {
  317. if ListImages.Name == option.ModelName {
  318. m.SourceLocation = ListImages.SwrPath
  319. return nil
  320. }
  321. }
  322. }
  323. return errors.New("SourceLocation not set")
  324. }
  325. // Get AI Application List
  326. func (m *ModelArtsLink) GetModelId(ctx context.Context, option *option.InferOption) error {
  327. req := &modelarts.ListModelReq{
  328. Platform: m.platform,
  329. ModelName: option.ModelName,
  330. //ModelType: "Image",
  331. Limit: int64(m.pageIndex),
  332. Offset: int64(m.pageSize),
  333. }
  334. ListModelResp, err := m.modelArtsRpc.ListModels(ctx, req)
  335. if err != nil {
  336. return err
  337. }
  338. if ListModelResp.Code == 200 {
  339. //return errors.New("failed to get ModelId")
  340. for _, ListModel := range ListModelResp.Models {
  341. if ListModel.ModelName == option.ModelName {
  342. option.ModelId = ListModel.ModelId
  343. m.Version = ListModel.ModelVersion
  344. return nil
  345. }
  346. }
  347. }
  348. err = m.CreateModel(ctx, option)
  349. if err != nil {
  350. return err
  351. }
  352. return nil
  353. }
  354. func (m *ModelArtsLink) GetModel(ctx context.Context, option *option.InferOption) string {
  355. req := &modelarts.ShowModelReq{
  356. Platform: m.platform,
  357. ModelId: option.ModelId,
  358. }
  359. ctx, cancel := context.WithTimeout(context.Background(), 50*time.Second)
  360. defer cancel()
  361. ShowModelsResp, err := m.modelArtsRpc.ShowModels(ctx, req)
  362. if err != nil {
  363. if err == context.DeadlineExceeded {
  364. log.Println("Request timed out")
  365. // 重试请求或其他处理
  366. } else {
  367. log.Fatalf("could not call method: %v", err)
  368. }
  369. }
  370. if ShowModelsResp.Code != 200 {
  371. errors.New("failed to get findModelsStatus")
  372. }
  373. m.ModelType = ShowModelsResp.ShowModelDetail.ModelAlgorithm
  374. return ShowModelsResp.ShowModelDetail.ModelStatus
  375. }
  376. // Get AI Application List
  377. func (m *ModelArtsLink) GetModelStatus(ctx context.Context, option *option.InferOption) error {
  378. var wg sync.WaitGroup
  379. wg.Add(1)
  380. // 使用goroutine进行轮询
  381. //defer wg.Done()
  382. for {
  383. status := m.GetModel(ctx, option)
  384. if status == "published" {
  385. fmt.Println("Model is now published.")
  386. break // 一旦状态变为published,就退出循环
  387. }
  388. fmt.Println("Waiting for model to be published...")
  389. time.Sleep(5 * time.Second) // 等待一段时间后再次检查
  390. }
  391. // 在这里执行模型状态为published后需要进行的操作
  392. fmt.Println("Continuing with the program...")
  393. return nil
  394. }
  395. // Create an AI application
  396. func (m *ModelArtsLink) CreateModel(ctx context.Context, option *option.InferOption) error {
  397. //Before creating an AI application, check if there are any images that can be created
  398. err := m.getSourceLocationFromImages(ctx, option)
  399. if err != nil { //
  400. return errors.New("No image available for creationd")
  401. }
  402. //
  403. var CMD string
  404. if option.ModelName == "imagenet_resnet50" {
  405. CMD = ImageNetResnet50Cmd
  406. } else if option.ModelName == "ChatGLM-6B" {
  407. CMD = ChatGLM6BCmd
  408. }
  409. if m.Version == "" {
  410. m.Version = "0.0.1"
  411. }
  412. version, err := ParseVersion(m.Version)
  413. version.Increment()
  414. req := &modelarts.CreateModelReq{
  415. Platform: m.platform,
  416. ModelName: option.ModelName,
  417. ModelType: "Image",
  418. ModelVersion: version.String(),
  419. SourceLocation: m.SourceLocation,
  420. InstallType: []string{"real-time"},
  421. Cmd: CMD,
  422. ModelAlgorithm: option.ModelType,
  423. }
  424. ModelResp, err := m.modelArtsRpc.CreateModel(ctx, req)
  425. if err != nil {
  426. return err
  427. }
  428. if ModelResp.Code != 200 {
  429. return errors.New("failed to get ModelId")
  430. }
  431. option.ModelId = ModelResp.ModelId
  432. return nil
  433. }
  434. func (m *ModelArtsLink) GetSpecifications(ctx context.Context, option *option.AiOption, ifoption *option.InferOption) error {
  435. req := &modelarts.ListSpecificationsReq{
  436. //Platform: m.platform,
  437. IsPersonalCluster: false,
  438. InferType: "real-time",
  439. Limit: m.pageIndex,
  440. OffSet: m.pageSize,
  441. }
  442. ListSpecificationsResp, err := m.modelArtsRpc.ListSpecifications(ctx, req)
  443. if err != nil {
  444. return err
  445. }
  446. for _, ListSpecifications := range ListSpecificationsResp.Specifications {
  447. if ListSpecifications.Specification == "modelarts.kat1.xlarge" {
  448. ifoption.Specification = ListSpecifications.Specification
  449. return nil
  450. }
  451. }
  452. return nil
  453. }
  454. func (m *ModelArtsLink) GetTrainingTaskLog(ctx context.Context, taskId string, instanceNum string) (string, error) {
  455. req := &modelartsservice.GetTrainingJobLogsPreviewReq{
  456. Platform: m.platform,
  457. TaskId: "worker-0",
  458. TrainingJobId: taskId,
  459. }
  460. resp, err := m.modelArtsRpc.GetTrainingJobLogsPreview(ctx, req)
  461. if err != nil {
  462. return "", err
  463. }
  464. if strings.Contains(resp.Content, "404 Not Found") {
  465. resp.Content = "waiting for logs..."
  466. }
  467. return resp.Content, nil
  468. }
  469. func (m *ModelArtsLink) GetTrainingTask(ctx context.Context, taskId string) (*collector.Task, error) {
  470. resp, err := m.QueryTask(ctx, taskId)
  471. if err != nil {
  472. return nil, err
  473. }
  474. jobresp, ok := (resp).(*modelartsservice.JobResponse)
  475. if jobresp.ErrorMsg != "" || !ok {
  476. if jobresp.ErrorMsg != "" {
  477. return nil, errors.New(jobresp.ErrorMsg)
  478. } else {
  479. return nil, errors.New("get training task failed, empty error returned")
  480. }
  481. }
  482. var task collector.Task
  483. task.Id = jobresp.Metadata.Id
  484. switch strings.ToLower(jobresp.Status.Phase) {
  485. case "completed":
  486. milliTimestamp := int64(jobresp.Status.StartTime)
  487. task.Start = timeutils.MillisecondsToUTCString(milliTimestamp, time.DateTime)
  488. duration := int64(jobresp.Status.Duration)
  489. task.End = timeutils.MillisecondsToAddDurationToUTCString(milliTimestamp, duration, time.DateTime)
  490. task.Status = constants.Completed
  491. case "failed":
  492. milliTimestamp := int64(jobresp.Status.StartTime)
  493. task.Start = timeutils.MillisecondsToUTCString(milliTimestamp, time.DateTime)
  494. duration := int64(jobresp.Status.Duration)
  495. task.End = timeutils.MillisecondsToAddDurationToUTCString(milliTimestamp, duration, time.DateTime)
  496. task.Status = constants.Failed
  497. case "running":
  498. milliTimestamp := int64(jobresp.Status.StartTime)
  499. task.Start = timeutils.MillisecondsToUTCString(milliTimestamp, time.DateTime)
  500. task.Status = constants.Running
  501. case "stopped":
  502. task.Status = constants.Stopped
  503. case "pending":
  504. task.Status = constants.Pending
  505. case "terminated":
  506. //TODO Failed
  507. task.Status = constants.Failed
  508. default:
  509. task.Status = "undefined"
  510. }
  511. return &task, nil
  512. }
  513. func (m *ModelArtsLink) Execute(ctx context.Context, option *option.AiOption) (interface{}, error) {
  514. err := m.GenerateSubmitParams(ctx, option)
  515. if err != nil {
  516. return nil, err
  517. }
  518. task, err := m.SubmitTask(ctx, option.ImageId, option.Cmd, option.Envs, option.Params, option.ResourceId, option.DatasetsId, option.AlgorithmId, option.TaskType)
  519. if err != nil {
  520. return nil, err
  521. }
  522. return task, nil
  523. }
  524. func (m *ModelArtsLink) GenerateSubmitParams(ctx context.Context, option *option.AiOption) error {
  525. err := m.generateResourceId(ctx, option, nil)
  526. if err != nil {
  527. return err
  528. }
  529. err = m.generateAlgorithmId(ctx, option)
  530. if err != nil {
  531. return err
  532. }
  533. err = m.generateImageId(option)
  534. if err != nil {
  535. return err
  536. }
  537. err = m.generateCmd(option)
  538. if err != nil {
  539. return err
  540. }
  541. err = m.generateEnv(option)
  542. if err != nil {
  543. return err
  544. }
  545. err = m.generateParams(option)
  546. if err != nil {
  547. return err
  548. }
  549. return nil
  550. }
  551. func (m *ModelArtsLink) generateResourceId(ctx context.Context, option *option.AiOption, ifoption *option.InferOption) error {
  552. option.ResourceId = "modelarts.kat1.xlarge"
  553. return nil
  554. }
  555. func (m *ModelArtsLink) generateImageId(option *option.AiOption) error {
  556. return nil
  557. }
  558. func (m *ModelArtsLink) generateCmd(option *option.AiOption) error {
  559. return nil
  560. }
  561. func (m *ModelArtsLink) generateEnv(option *option.AiOption) error {
  562. return nil
  563. }
  564. func (m *ModelArtsLink) generateParams(option *option.AiOption) error {
  565. return nil
  566. }
  567. func (m *ModelArtsLink) generateAlgorithmId(ctx context.Context, option *option.AiOption) error {
  568. req := &modelarts.ListAlgorithmsReq{
  569. Platform: m.platform,
  570. Offset: m.pageIndex,
  571. Limit: m.pageSize,
  572. }
  573. resp, err := m.modelArtsRpc.ListAlgorithms(ctx, req)
  574. if err != nil {
  575. return err
  576. }
  577. if resp.ErrorMsg != "" {
  578. return errors.New("failed to get algorithmId")
  579. }
  580. for _, algorithm := range resp.Items {
  581. engVersion := algorithm.JobConfig.Engine.EngineVersion
  582. if strings.Contains(engVersion, option.TaskType) {
  583. ns := strings.Split(algorithm.Metadata.Name, DASH)
  584. if ns[0] != option.TaskType {
  585. continue
  586. }
  587. if ns[1] != option.DatasetsName {
  588. continue
  589. }
  590. if ns[2] != option.AlgorithmName {
  591. continue
  592. }
  593. option.AlgorithmId = algorithm.Metadata.Id
  594. return nil
  595. }
  596. }
  597. if option.AlgorithmId == "" {
  598. return errors.New("Algorithm does not exist")
  599. }
  600. return errors.New("failed to get AlgorithmId")
  601. }
  602. func (m *ModelArtsLink) GetClusterInferUrl(ctx context.Context, option *option.InferOption) (*inference.ClusterInferUrl, error) {
  603. var imageUrls []*inference.InferUrl
  604. urlReq := &modelartsclient.ImageReasoningUrlReq{
  605. ServiceName: option.ModelName,
  606. Type: option.ModelType,
  607. Card: "npu",
  608. }
  609. urlResp, err := m.modelArtsRpc.ImageReasoningUrl(ctx, urlReq)
  610. if err != nil {
  611. return nil, err
  612. }
  613. imageUrl := &inference.InferUrl{
  614. Url: urlResp.Url,
  615. Card: "npu",
  616. }
  617. imageUrls = append(imageUrls, imageUrl)
  618. clusterWithUrl := &inference.ClusterInferUrl{
  619. ClusterName: m.platform,
  620. ClusterType: TYPE_MODELARTS,
  621. InferUrls: imageUrls,
  622. }
  623. return clusterWithUrl, nil
  624. }
  625. func (m *ModelArtsLink) GetInferDeployInstanceList(ctx context.Context) ([]*inference.DeployInstance, error) {
  626. var insList []*inference.DeployInstance
  627. req := &modelarts.ListServicesReq{
  628. Platform: m.platform,
  629. OffSet: m.pageIndex,
  630. Limit: m.pageSize,
  631. }
  632. //list, err := m.modelArtsRpc.ListServices(ctx, req)
  633. resp, err := m.modelArtsRpc.ListServices(ctx, req)
  634. if err != nil {
  635. return nil, err
  636. }
  637. if resp.ErrorMsg != "" {
  638. return nil, errors.New(resp.Msg)
  639. }
  640. for _, services := range resp.Services {
  641. ins := &inference.DeployInstance{}
  642. ins.InstanceName = services.ServiceName
  643. ins.InstanceId = services.ServiceId
  644. ins.Status = services.Status
  645. ins.InferCard = "NPU"
  646. ins.ClusterName = m.platform
  647. ins.CreatedTime = string(services.StartTime)
  648. ins.ClusterType = TYPE_MODELARTS
  649. insList = append(insList, ins)
  650. }
  651. return insList, nil
  652. }
  653. func (m *ModelArtsLink) StartInferDeployInstance(ctx context.Context, id string) bool {
  654. req := &modelartsclient.UpdateServiceReq{
  655. ServiceId: id,
  656. Status: "running",
  657. }
  658. resp, err := m.modelArtsRpc.UpdateService(ctx, req)
  659. if err != nil || resp.Code != 0 {
  660. return false
  661. }
  662. if resp.Code == 0 {
  663. return true
  664. }
  665. return false
  666. }
  667. func (m *ModelArtsLink) StopInferDeployInstance(ctx context.Context, id string) bool {
  668. req := &modelartsclient.UpdateServiceReq{
  669. ServiceId: id,
  670. Status: "stopped",
  671. }
  672. resp, err := m.modelArtsRpc.UpdateService(ctx, req)
  673. if err != nil || resp.Code != 0 {
  674. return false
  675. }
  676. if resp.Code == 0 {
  677. return true
  678. }
  679. return false
  680. }
  681. func (m *ModelArtsLink) GetInferDeployInstance(ctx context.Context, id string) (*inference.DeployInstance, error) {
  682. req := &modelarts.ShowServiceReq{
  683. ServiceId: id,
  684. }
  685. resp, err := m.modelArtsRpc.ShowService(ctx, req)
  686. if err != nil {
  687. return nil, err
  688. }
  689. if resp.ErrorMsg != "" {
  690. return nil, errors.New(resp.Msg)
  691. }
  692. ins := &inference.DeployInstance{}
  693. ins.InstanceName = resp.ServiceName
  694. ins.InstanceId = resp.ServiceId
  695. ins.Status = resp.Status
  696. ins.InferCard = "NPU"
  697. ins.ClusterName = m.platform
  698. ins.CreatedTime = string(resp.StartTime)
  699. ins.ClusterType = TYPE_MODELARTS
  700. ins.ModelName = resp.Config[0].ModelName
  701. ins.ModelType = m.ModelType
  702. ins.InferUrl = resp.AccessAddress
  703. return ins, nil
  704. }
  705. func (m *ModelArtsLink) GetImageInferResult(ctx context.Context, url string, file multipart.File, fileName string) (string, error) {
  706. return "", nil
  707. }
  708. func (m *ModelArtsLink) CreateInferDeployInstance(ctx context.Context, option *option.InferOption) (string, error) {
  709. err := m.GetModelId(ctx, option)
  710. if err != nil {
  711. return "", err
  712. }
  713. err = m.GetModelStatus(ctx, option)
  714. if err != nil {
  715. return "", err
  716. }
  717. configParam := &modelarts.ServiceConfig{
  718. Specification: "modelarts.kat1.xlarge",
  719. Weight: 100,
  720. ModelId: option.ModelId,
  721. InstanceCount: 1,
  722. }
  723. var configItems []*modelarts.ServiceConfig
  724. configItems = append(configItems, configParam)
  725. now := time.Now()
  726. timestampSec := now.Unix()
  727. str := strconv.FormatInt(timestampSec, 10)
  728. req := &modelarts.CreateServiceReq{
  729. Platform: m.platform,
  730. Config: configItems,
  731. InferType: "real-time",
  732. ServiceName: option.ModelName + "_" + option.ModelType + "_" + Npu + "_" + str,
  733. }
  734. ctx, cancel := context.WithTimeout(context.Background(), 150*time.Second)
  735. defer cancel()
  736. resp, err := m.modelArtsRpc.CreateService(ctx, req)
  737. if err != nil {
  738. return "", err
  739. }
  740. return resp.ServiceId, nil
  741. }
  742. func (m *ModelArtsLink) CheckModelExistence(ctx context.Context, name string, mtype string) bool {
  743. ifoption := &option.InferOption{
  744. ModelName: name,
  745. ModelType: mtype,
  746. }
  747. err := m.CheckImageExist(ctx, ifoption)
  748. if err != nil {
  749. return false
  750. }
  751. return true
  752. }
  753. func (m *ModelArtsLink) CheckImageExist(ctx context.Context, option *option.InferOption) error {
  754. req := &modelarts.ListImagesReq{
  755. Limit: m.pageSize,
  756. Offset: m.pageIndex,
  757. }
  758. ListImageResp, err := m.modelArtsRpc.ListImages(ctx, req)
  759. if err != nil {
  760. return err
  761. }
  762. var modelName string
  763. if ListImageResp.Code == 200 {
  764. //return errors.New("failed to get ModelId")
  765. for _, ListImage := range ListImageResp.Data {
  766. if option.ModelName == "ChatGLM-6B" {
  767. modelName = "chatglm-6b"
  768. } else {
  769. modelName = option.ModelName
  770. }
  771. if ListImage.Name == modelName {
  772. return nil
  773. }
  774. }
  775. }
  776. return errors.New("failed to find Image ")
  777. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive, autonomous, peer-to-peer manner.