You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

aiStorage.go 11 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368
  1. package database
  import (
  	"fmt"
  	"strconv"
  	"time"

  	"github.com/zeromicro/go-zero/core/logx"
  	clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/api/client"
  	"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/scheduler/schedulers/option"
  	"gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
  	"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
  	"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models"
  	"gitlink.org.cn/JointCloud/pcm-coordinator/pkg/tracker"
  	"gorm.io/gorm"
  )
  14. type AiStorage struct {
  15. DbEngin *gorm.DB
  16. }
  17. func (s *AiStorage) GetParticipants() (*types.ClusterListResp, error) {
  18. var resp types.ClusterListResp
  19. tx := s.DbEngin.Raw("select * from t_cluster where `deleted_at` IS NULL ORDER BY create_time Desc").Scan(&resp.List)
  20. if tx.Error != nil {
  21. logx.Errorf(tx.Error.Error())
  22. return nil, tx.Error
  23. }
  24. return &resp, nil
  25. }
  26. func (s *AiStorage) GetClustersByAdapterId(id string) (*types.ClusterListResp, error) {
  27. var resp types.ClusterListResp
  28. tx := s.DbEngin.Raw("select * from t_cluster where `deleted_at` IS NULL and `adapter_id` = ? ORDER BY create_time Desc", id).Scan(&resp.List)
  29. if tx.Error != nil {
  30. logx.Errorf(tx.Error.Error())
  31. return nil, tx.Error
  32. }
  33. return &resp, nil
  34. }
  35. func (s *AiStorage) GetClusterNameById(id string) (string, error) {
  36. var name string
  37. tx := s.DbEngin.Raw("select `description` from t_cluster where `id` = ?", id).Scan(&name)
  38. if tx.Error != nil {
  39. logx.Errorf(tx.Error.Error())
  40. return "", tx.Error
  41. }
  42. return name, nil
  43. }
  44. func (s *AiStorage) GetAdapterNameById(id string) (string, error) {
  45. var name string
  46. tx := s.DbEngin.Raw("select `name` from t_adapter where `id` = ?", id).Scan(&name)
  47. if tx.Error != nil {
  48. logx.Errorf(tx.Error.Error())
  49. return "", tx.Error
  50. }
  51. return name, nil
  52. }
  53. func (s *AiStorage) GetAdapterIdsByType(adapterType string) ([]string, error) {
  54. var list []types.AdapterInfo
  55. var ids []string
  56. db := s.DbEngin.Model(&types.AdapterInfo{}).Table("t_adapter")
  57. db = db.Where("type = ?", adapterType)
  58. err := db.Order("create_time desc").Find(&list).Error
  59. if err != nil {
  60. return nil, err
  61. }
  62. for _, info := range list {
  63. ids = append(ids, info.Id)
  64. }
  65. return ids, nil
  66. }
  67. func (s *AiStorage) GetAdaptersByType(adapterType string) ([]*types.AdapterInfo, error) {
  68. var list []*types.AdapterInfo
  69. db := s.DbEngin.Model(&types.AdapterInfo{}).Table("t_adapter")
  70. db = db.Where("type = ?", adapterType)
  71. err := db.Order("create_time desc").Find(&list).Error
  72. if err != nil {
  73. return nil, err
  74. }
  75. return list, nil
  76. }
  77. func (s *AiStorage) GetAiTasksByAdapterId(adapterId string) ([]*models.TaskAi, error) {
  78. var resp []*models.TaskAi
  79. db := s.DbEngin.Model(&models.TaskAi{}).Table("task_ai")
  80. db = db.Where("adapter_id = ?", adapterId)
  81. err := db.Order("commit_time desc").Find(&resp).Error
  82. if err != nil {
  83. return nil, err
  84. }
  85. return resp, nil
  86. }
  87. func (s *AiStorage) GetAiTaskListById(id int64) ([]*models.TaskAi, error) {
  88. var aiTaskList []*models.TaskAi
  89. tx := s.DbEngin.Raw("select * from task_ai where `task_id` = ? ", id).Scan(&aiTaskList)
  90. if tx.Error != nil {
  91. return nil, tx.Error
  92. }
  93. return aiTaskList, nil
  94. }
  95. func (s *AiStorage) SaveTask(name string, strategyCode int64, synergyStatus int64, aiType string) (int64, error) {
  96. startTime := time.Now()
  97. // 构建主任务结构体
  98. taskModel := models.Task{
  99. Status: constants.Saved,
  100. Description: "ai task",
  101. Name: name,
  102. SynergyStatus: synergyStatus,
  103. Strategy: strategyCode,
  104. AdapterTypeDict: "1",
  105. TaskTypeDict: aiType,
  106. StartTime: &startTime,
  107. CommitTime: time.Now(),
  108. }
  109. // 保存任务数据到数据库
  110. tx := s.DbEngin.Create(&taskModel)
  111. if tx.Error != nil {
  112. return 0, tx.Error
  113. }
  114. return taskModel.Id, nil
  115. }
  116. func (s *AiStorage) UpdateTask(task *types.TaskModel) error {
  117. task.UpdatedTime = time.Now().Format(constants.Layout)
  118. tx := s.DbEngin.Table("task").Model(task).Updates(task)
  119. if tx.Error != nil {
  120. logx.Errorf(tx.Error.Error())
  121. return tx.Error
  122. }
  123. return nil
  124. }
  125. func (s *AiStorage) SaveAiTask(taskId int64, opt option.Option, adapterName string, clusterId string, clusterName string, jobId string, status string, msg string) error {
  126. var aiOpt *option.AiOption
  127. switch (opt).(type) {
  128. case *option.AiOption:
  129. aiOpt = (opt).(*option.AiOption)
  130. case *option.InferOption:
  131. inferOpt := (opt).(*option.InferOption)
  132. aiOpt = &option.AiOption{}
  133. aiOpt.TaskName = inferOpt.TaskName
  134. aiOpt.Replica = inferOpt.Replica
  135. aiOpt.AdapterId = inferOpt.AdapterId
  136. aiOpt.TaskType = inferOpt.ModelType
  137. aiOpt.StrategyName = inferOpt.Strategy
  138. }
  139. // 构建主任务结构体
  140. aId, err := strconv.ParseInt(aiOpt.AdapterId, 10, 64)
  141. if err != nil {
  142. return err
  143. }
  144. cId, err := strconv.ParseInt(clusterId, 10, 64)
  145. if err != nil {
  146. return err
  147. }
  148. aiTaskModel := models.TaskAi{
  149. TaskId: taskId,
  150. AdapterId: aId,
  151. AdapterName: adapterName,
  152. ClusterId: cId,
  153. ClusterName: clusterName,
  154. Name: aiOpt.TaskName,
  155. Replica: int64(aiOpt.Replica),
  156. JobId: jobId,
  157. TaskType: aiOpt.TaskType,
  158. Strategy: aiOpt.StrategyName,
  159. Status: status,
  160. Msg: msg,
  161. Card: aiOpt.ComputeCard,
  162. StartTime: time.Now().Format(time.RFC3339),
  163. CommitTime: time.Now(),
  164. }
  165. // 保存任务数据到数据库
  166. tx := s.DbEngin.Create(&aiTaskModel)
  167. if tx.Error != nil {
  168. return tx.Error
  169. }
  170. return nil
  171. }
  172. func (s *AiStorage) SaveAiTaskImageSubTask(ta *models.TaskAiSub) error {
  173. tx := s.DbEngin.Table("task_ai_sub").Create(ta)
  174. if tx.Error != nil {
  175. return tx.Error
  176. }
  177. return nil
  178. }
  179. func (s *AiStorage) SaveClusterTaskQueue(adapterId string, clusterId string, queueNum int64) error {
  180. aId, err := strconv.ParseInt(adapterId, 10, 64)
  181. if err != nil {
  182. return err
  183. }
  184. cId, err := strconv.ParseInt(clusterId, 10, 64)
  185. if err != nil {
  186. return err
  187. }
  188. taskQueue := models.TClusterTaskQueue{
  189. AdapterId: aId,
  190. ClusterId: cId,
  191. QueueNum: queueNum,
  192. }
  193. tx := s.DbEngin.Create(&taskQueue)
  194. if tx.Error != nil {
  195. return tx.Error
  196. }
  197. return nil
  198. }
  199. func (s *AiStorage) GetClusterTaskQueues(adapterId string, clusterId string) ([]*models.TClusterTaskQueue, error) {
  200. var taskQueues []*models.TClusterTaskQueue
  201. tx := s.DbEngin.Raw("select * from t_cluster_task_queue where `adapter_id` = ? and `cluster_id` = ?", adapterId, clusterId).Scan(&taskQueues)
  202. if tx.Error != nil {
  203. logx.Errorf(tx.Error.Error())
  204. return nil, tx.Error
  205. }
  206. return taskQueues, nil
  207. }
  208. func (s *AiStorage) GetAiTaskIdByClusterIdAndTaskId(clusterId string, taskId string) (string, error) {
  209. var aiTask models.TaskAi
  210. tx := s.DbEngin.Raw("select * from task_ai where `cluster_id` = ? and `task_id` = ?", clusterId, taskId).Scan(&aiTask)
  211. if tx.Error != nil {
  212. logx.Errorf(tx.Error.Error())
  213. return "", tx.Error
  214. }
  215. return aiTask.JobId, nil
  216. }
  217. func (s *AiStorage) GetClusterResourcesById(clusterId string) (*models.TClusterResource, error) {
  218. var clusterResource models.TClusterResource
  219. tx := s.DbEngin.Raw("select * from t_cluster_resource where `cluster_id` = ?", clusterId).Scan(&clusterResource)
  220. if tx.Error != nil {
  221. logx.Errorf(tx.Error.Error())
  222. return nil, tx.Error
  223. }
  224. return &clusterResource, nil
  225. }
  226. func (s *AiStorage) SaveClusterResources(adapterId string, clusterId string, clusterName string, clusterType int64, cpuAvail float64, cpuTotal float64,
  227. memAvail float64, memTotal float64, diskAvail float64, diskTotal float64, gpuAvail float64, gpuTotal float64, cardTotal int64, topsTotal float64) error {
  228. cId, err := strconv.ParseInt(clusterId, 10, 64)
  229. if err != nil {
  230. return err
  231. }
  232. aId, err := strconv.ParseInt(adapterId, 10, 64)
  233. if err != nil {
  234. return err
  235. }
  236. clusterResource := models.TClusterResource{
  237. AdapterId: aId,
  238. ClusterId: cId,
  239. ClusterName: clusterName,
  240. ClusterType: clusterType,
  241. CpuAvail: cpuAvail,
  242. CpuTotal: cpuTotal,
  243. MemAvail: memAvail,
  244. MemTotal: memTotal,
  245. DiskAvail: diskAvail,
  246. DiskTotal: diskTotal,
  247. GpuAvail: gpuAvail,
  248. GpuTotal: gpuTotal,
  249. CardTotal: cardTotal,
  250. CardTopsTotal: topsTotal,
  251. }
  252. tx := s.DbEngin.Create(&clusterResource)
  253. if tx.Error != nil {
  254. return tx.Error
  255. }
  256. // prometheus
  257. param := tracker.ClusterLoadRecord{
  258. AdapterId: aId,
  259. ClusterName: clusterName,
  260. CpuAvail: cpuAvail,
  261. CpuTotal: cpuTotal,
  262. CpuUtilisation: clusterResource.CpuAvail / clusterResource.CpuTotal,
  263. MemoryAvail: memAvail,
  264. MemoryTotal: memTotal,
  265. MemoryUtilisation: clusterResource.MemAvail / clusterResource.MemTotal,
  266. DiskAvail: diskAvail,
  267. DiskTotal: diskTotal,
  268. DiskUtilisation: clusterResource.DiskAvail / clusterResource.DiskTotal,
  269. }
  270. tracker.SyncClusterLoad(param)
  271. return nil
  272. }
  273. func (s *AiStorage) UpdateClusterResources(clusterResource *models.TClusterResource) error {
  274. tx := s.DbEngin.Where("cluster_id = ?", clusterResource.ClusterId).Updates(clusterResource)
  275. if tx.Error != nil {
  276. return tx.Error
  277. }
  278. // prometheus
  279. param := tracker.ClusterLoadRecord{
  280. AdapterId: clusterResource.AdapterId,
  281. ClusterName: clusterResource.ClusterName,
  282. CpuAvail: clusterResource.CpuAvail,
  283. CpuTotal: clusterResource.CpuTotal,
  284. CpuUtilisation: clusterResource.CpuAvail / clusterResource.CpuTotal,
  285. MemoryAvail: clusterResource.MemAvail,
  286. MemoryTotal: clusterResource.MemTotal,
  287. MemoryUtilisation: clusterResource.MemAvail / clusterResource.MemTotal,
  288. DiskAvail: clusterResource.DiskAvail,
  289. DiskTotal: clusterResource.DiskTotal,
  290. DiskUtilisation: clusterResource.DiskAvail / clusterResource.DiskTotal,
  291. }
  292. tracker.SyncClusterLoad(param)
  293. return nil
  294. }
  295. func (s *AiStorage) UpdateAiTask(task *models.TaskAi) error {
  296. tx := s.DbEngin.Updates(task)
  297. if tx.Error != nil {
  298. return tx.Error
  299. }
  300. return nil
  301. }
  302. func (s *AiStorage) GetStrategyCode(name string) (int64, error) {
  303. var strategy int64
  304. sqlStr := `select t_dict_item.item_value
  305. from t_dict
  306. left join t_dict_item on t_dict.id = t_dict_item.dict_id
  307. where item_text = ?
  308. and t_dict.dict_code = 'schedule_Strategy'`
  309. //查询调度策略
  310. err := s.DbEngin.Raw(sqlStr, name).Scan(&strategy).Error
  311. if err != nil {
  312. return strategy, nil
  313. }
  314. return strategy, nil
  315. }
  316. func (s *AiStorage) AddNoticeInfo(adapterId string, adapterName string, clusterId string, clusterName string, taskName string, noticeType string, incident string) {
  317. aId, err := strconv.ParseInt(adapterId, 10, 64)
  318. if err != nil {
  319. logx.Errorf("adapterId convert failure, err: %v", err)
  320. }
  321. var cId int64
  322. if clusterId != "" {
  323. cId, err = strconv.ParseInt(clusterId, 10, 64)
  324. if err != nil {
  325. logx.Errorf("clusterId convert failure, err: %v", err)
  326. }
  327. }
  328. noticeInfo := clientCore.NoticeInfo{
  329. AdapterId: aId,
  330. AdapterName: adapterName,
  331. ClusterId: cId,
  332. ClusterName: clusterName,
  333. NoticeType: noticeType,
  334. TaskName: taskName,
  335. Incident: incident,
  336. CreatedTime: time.Now(),
  337. }
  338. result := s.DbEngin.Table("t_notice").Create(&noticeInfo)
  339. if result.Error != nil {
  340. logx.Errorf("Task creation failure, err: %v", result.Error)
  341. }
  342. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive, autonomous, peer-to-peer manner.