You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

cloudbrain.go 37 kB

4 years ago
4 years ago
5 years ago
4 years ago
4 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
4 years ago
5 years ago
5 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
4 years ago
4 years ago
5 years ago
4 years ago
5 years ago
4 years ago
4 years ago
4 years ago
5 years ago
5 years ago
4 years ago
4 years ago
5 years ago
5 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago

  1. package models
  2. import (
  3. "encoding/json"
  4. "fmt"
  5. "strings"
  6. "time"
  7. "xorm.io/builder"
  8. "xorm.io/xorm"
  9. "code.gitea.io/gitea/modules/log"
  10. "code.gitea.io/gitea/modules/setting"
  11. "code.gitea.io/gitea/modules/timeutil"
  12. )
// CloudbrainStatus is the string-typed status of a cloudbrain job.
type CloudbrainStatus string

// JobType is the string-typed kind of a job.
type JobType string

// ModelArtsJobStatus is the string-typed status of a ModelArts job.
type ModelArtsJobStatus string

const (
	// Cloudbrain job statuses.
	JobWaiting CloudbrainStatus = "WAITING"
	JobStopped CloudbrainStatus = "STOPPED"
	JobSucceeded CloudbrainStatus = "SUCCEEDED"
	JobFailed CloudbrainStatus = "FAILED"
	JobRunning CloudbrainStatus = "RUNNING"

	// Job types.
	JobTypeDebug JobType = "DEBUG"
	JobTypeBenchmark JobType = "BENCHMARK"
	JobTypeSnn4imagenet JobType = "SNN4IMAGENET"
	JobTypeBrainScore JobType = "BRAINSCORE"
	JobTypeTrain JobType = "TRAIN"

	// NOTE(review): named like a version label but typed as JobType — confirm this is intended.
	JobVersionName JobType = "V0001"

	// ModelArts job statuses.
	ModelArtsCreateQueue ModelArtsJobStatus = "CREATE_QUEUING" // free resource: queued for creation
	ModelArtsCreating ModelArtsJobStatus = "CREATING" // being created
	ModelArtsCreateFailed ModelArtsJobStatus = "CREATE_FAILED" // creation failed
	ModelArtsStartQueuing ModelArtsJobStatus = "START_QUEUING" // free resource: queued for start
	ModelArtsReadyToStart ModelArtsJobStatus = "READY_TO_START" // free resource: waiting to start
	ModelArtsStarting ModelArtsJobStatus = "STARTING" // starting
	ModelArtsRestarting ModelArtsJobStatus = "RESTARTING" // restarting
	ModelArtsStartFailed ModelArtsJobStatus = "START_FAILED" // start failed
	ModelArtsRunning ModelArtsJobStatus = "RUNNING" // running
	ModelArtsStopping ModelArtsJobStatus = "STOPPING" // stopping
	ModelArtsStopped ModelArtsJobStatus = "STOPPED" // stopped
	ModelArtsUnavailable ModelArtsJobStatus = "UNAVAILABLE" // faulty
	ModelArtsDeleted ModelArtsJobStatus = "DELETED" // deleted
	ModelArtsResizing ModelArtsJobStatus = "RESIZING" // specification change in progress
	ModelArtsResizFailed ModelArtsJobStatus = "RESIZE_FAILED" // specification change failed
)
// Cloudbrain is the xorm model for one cloudbrain task record. Most columns
// are indexed; fields tagged `xorm:"-"` are not mapped to columns and are
// filled in by application code.
type Cloudbrain struct {
	ID int64 `xorm:"pk autoincr"`
	JobID string `xorm:"INDEX NOT NULL"`
	JobType string `xorm:"INDEX NOT NULL DEFAULT 'DEBUG'"`
	JobName string `xorm:"INDEX"`
	Status string `xorm:"INDEX"`
	UserID int64 `xorm:"INDEX"`
	RepoID int64 `xorm:"INDEX"`
	SubTaskName string `xorm:"INDEX"`
	ContainerID string
	ContainerIp string
	// CreatedUnix and UpdatedUnix carry xorm's created/updated tags.
	CreatedUnix timeutil.TimeStamp `xorm:"INDEX created"`
	UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"`
	// NOTE(review): unit of Duration is not visible here — confirm against writers.
	Duration int64 `xorm:"INDEX duration"`
	TrainJobDuration string
	// DeletedAt carries xorm's deleted tag (soft-delete marker).
	DeletedAt time.Time `xorm:"deleted"`
	CanDebug bool `xorm:"-"`
	CanDel bool `xorm:"-"`
	Type int `xorm:"INDEX DEFAULT 0"`
	VersionID int64 `xorm:"INDEX DEFAULT 0"`
	VersionName string
	Uuid string
	DatasetName string
	VersionCount int64 `xorm:"INDEX DEFAULT 1"`

	User *User `xorm:"-"`
	Repo *Repository `xorm:"-"`
}
// TrainjobConfigDetail is an xorm model holding the configuration values of a
// train job, keyed by an auto-increment ID. All columns except Params are
// indexed.
type TrainjobConfigDetail struct {
	ID int64 `xorm:"pk autoincr"`
	JobName string `xorm:"INDEX"`
	ResourcePools string `xorm:"INDEX"`
	EngineVersions int `xorm:"INDEX"`
	FlavorInfos string `xorm:"INDEX"`
	TrainUrl string `xorm:"INDEX"`
	BootFile string `xorm:"INDEX"`
	Uuid string `xorm:"INDEX"`
	DatasetName string `xorm:"INDEX"`
	// NOTE(review): the "deleted" tag is normally used on a time field as a
	// soft-delete marker; on a string Params column it looks unintended — confirm.
	Params string `xorm:"deleted"`
	BranchName string `xorm:"INDEX"`
	// User *User `xorm:"-"`
	// Repo *Repository `xorm:"-"`
}
// CloudbrainInfo is a query result row combining a Cloudbrain record with the
// User it is joined to; both are embedded with xorm's "extends" tag.
type CloudbrainInfo struct {
	Cloudbrain `xorm:"extends"`
	User `xorm:"extends"`
}

// CloudBrainLoginResult holds a code, a message and a free-form payload.
// It has no JSON tags, so encoding/json matches keys by field name.
type CloudBrainLoginResult struct {
	Code string
	Msg string
	Payload map[string]interface{}
}
// TaskRole describes one task role of a job: how many tasks it has, the
// resources each needs (CPU, GPU, memory and shared memory in MB) and the
// command to run.
type TaskRole struct {
	Name string `json:"name"`
	TaskNumber int `json:"taskNumber"`
	MinSucceededTaskCount int `json:"minSucceededTaskCount"`
	MinFailedTaskCount int `json:"minFailedTaskCount"`
	CPUNumber int `json:"cpuNumber"`
	GPUNumber int `json:"gpuNumber"`
	MemoryMB int `json:"memoryMB"`
	ShmMB int `json:"shmMB"`
	Command string `json:"command"`
	NeedIBDevice bool `json:"needIBDevice"`
	IsMainRole bool `json:"isMainRole"`
	UseNNI bool `json:"useNNI"`
}

// StHostPath describes a host path mount: the source path, the mount path and
// whether it is read-only.
type StHostPath struct {
	Path string `json:"path"`
	MountPath string `json:"mountPath"`
	ReadOnly bool `json:"readOnly"`
}

// Volume wraps a single host path mount.
type Volume struct {
	HostPath StHostPath `json:"hostPath"`
}

// CreateJobParams is the JSON body describing a job to create: name, retry
// count, GPU type, image, task roles and volumes.
type CreateJobParams struct {
	JobName string `json:"jobName"`
	RetryCount int8 `json:"retryCount"`
	GpuType string `json:"gpuType"`
	Image string `json:"image"`
	TaskRoles []TaskRole `json:"taskRoles"`
	Volumes []Volume `json:"volumes"`
}
// CreateJobResult is a JSON result with a code, a message and a free-form payload.
type CreateJobResult struct {
	Code string `json:"code"`
	Msg string `json:"msg"`
	Payload map[string]interface{} `json:"payload"`
}

// GetJobResult is a JSON result with a code, a message and a free-form
// payload. It has the same shape as CreateJobResult.
type GetJobResult struct {
	Code string `json:"code"`
	Msg string `json:"msg"`
	Payload map[string]interface{} `json:"payload"`
}

// GetImagesResult is a JSON result with a code, a message and a typed image
// listing payload.
type GetImagesResult struct {
	Code string `json:"code"`
	Msg string `json:"msg"`
	Payload GetImagesPayload `json:"payload"`
}

// GetImagesPayload is an image listing: a count, an optional total page count
// and the images themselves under the "rows" key.
type GetImagesPayload struct {
	Count int `json:"count"`
	TotalPages int `json:"totalPages,omitempty"`
	ImageInfo []*ImageInfo `json:"rows"`
}
// CloudbrainsOptions holds the filters and paging (via the embedded
// ListOptions) used when listing cloudbrain tasks.
type CloudbrainsOptions struct {
	ListOptions
	RepoID int64 // include all repos if empty
	UserID int64
	JobID string
	SortType string
	CloudbrainIDs []int64
	// JobStatus CloudbrainStatus
	Type int
	JobType string
	VersionName string
}
  157. type TaskPod struct {
  158. TaskRoleStatus struct {
  159. Name string `json:"name"`
  160. } `json:"taskRoleStatus"`
  161. //TaskStatuses []struct {
  162. // TaskIndex int `json:"taskIndex"`
  163. // PodUID string `json:"podUid"`
  164. // PodIP string `json:"podIp"`
  165. // PodName string `json:"podName"`
  166. // ContainerID string `json:"containerId"`
  167. // ContainerIP string `json:"containerIp"`
  168. // ContainerGpus string `json:"containerGpus"`
  169. // State string `json:"state"`
  170. // StartAt time.Time `json:"startAt"`
  171. // FinishedAt time.Time `json:"finishedAt"`
  172. // ExitCode int `json:"exitCode"`
  173. // ExitDiagnostics string `json:"exitDiagnostics"`
  174. // RetriedCount int `json:"retriedCount"`
  175. // StartTime string
  176. // FinishedTime string
  177. //} `json:"taskStatuses"`
  178. TaskStatuses []TaskStatuses `json:"taskStatuses"`
  179. }
  180. type TaskStatuses struct {
  181. TaskIndex int `json:"taskIndex"`
  182. PodUID string `json:"podUid"`
  183. PodIP string `json:"podIp"`
  184. PodName string `json:"podName"`
  185. ContainerID string `json:"containerId"`
  186. ContainerIP string `json:"containerIp"`
  187. ContainerGpus string `json:"containerGpus"`
  188. State string `json:"state"`
  189. StartAt time.Time `json:"startAt"`
  190. FinishedAt time.Time `json:"finishedAt"`
  191. ExitCode int `json:"exitCode"`
  192. ExitDiagnostics string `json:"exitDiagnostics"`
  193. RetriedCount int `json:"retriedCount"`
  194. StartTime string
  195. FinishedTime string
  196. }
  197. type TaskInfo struct {
  198. Username string `json:"username"`
  199. TaskName string `json:"task_name"`
  200. CodeName string `json:"code_name"`
  201. BenchmarkCategory []string `json:"selected_category"`
  202. CodeLink string `json:"code_link"`
  203. GpuType string `json:"gpu_type"`
  204. }
  205. func ConvertToTaskPod(input map[string]interface{}) (TaskPod, error) {
  206. data, _ := json.Marshal(input)
  207. var taskPod TaskPod
  208. err := json.Unmarshal(data, &taskPod)
  209. taskPod.TaskStatuses[0].StartTime = time.Unix(taskPod.TaskStatuses[0].StartAt.Unix()+8*3600, 0).UTC().Format("2006-01-02 15:04:05")
  210. taskPod.TaskStatuses[0].FinishedTime = time.Unix(taskPod.TaskStatuses[0].FinishedAt.Unix()+8*3600, 0).UTC().Format("2006-01-02 15:04:05")
  211. //if the task is not finished or stopped,the cloudbrain renturns 0001-01-01 08:00:00, the finishedTime shows with -
  212. if strings.HasPrefix(taskPod.TaskStatuses[0].FinishedTime, "0001") {
  213. taskPod.TaskStatuses[0].FinishedTime = "-"
  214. }
  215. return taskPod, err
  216. }
  217. type JobResultPayload struct {
  218. ID string `json:"id"`
  219. Name string `json:"name"`
  220. Platform string `json:"platform"`
  221. JobStatus struct {
  222. Username string `json:"username"`
  223. State string `json:"state"`
  224. SubState string `json:"subState"`
  225. ExecutionType string `json:"executionType"`
  226. Retries int `json:"retries"`
  227. CreatedTime int64 `json:"createdTime"`
  228. CompletedTime int64 `json:"completedTime"`
  229. AppID string `json:"appId"`
  230. AppProgress string `json:"appProgress"`
  231. AppTrackingURL string `json:"appTrackingUrl"`
  232. AppLaunchedTime int64 `json:"appLaunchedTime"`
  233. AppCompletedTime interface{} `json:"appCompletedTime"`
  234. AppExitCode int `json:"appExitCode"`
  235. AppExitDiagnostics string `json:"appExitDiagnostics"`
  236. AppExitType interface{} `json:"appExitType"`
  237. VirtualCluster string `json:"virtualCluster"`
  238. StartTime string
  239. EndTime string
  240. } `json:"jobStatus"`
  241. TaskRoles map[string]interface{} `json:"taskRoles"`
  242. Resource struct {
  243. CPU int `json:"cpu"`
  244. Memory string `json:"memory"`
  245. NvidiaComGpu int `json:"nvidia.com/gpu"`
  246. } `json:"resource"`
  247. Config struct {
  248. Image string `json:"image"`
  249. JobID string `json:"jobId"`
  250. GpuType string `json:"gpuType"`
  251. JobName string `json:"jobName"`
  252. JobType string `json:"jobType"`
  253. TaskRoles []struct {
  254. Name string `json:"name"`
  255. ShmMB int `json:"shmMB"`
  256. Command string `json:"command"`
  257. MemoryMB int `json:"memoryMB"`
  258. CPUNumber int `json:"cpuNumber"`
  259. GpuNumber int `json:"gpuNumber"`
  260. IsMainRole bool `json:"isMainRole"`
  261. TaskNumber int `json:"taskNumber"`
  262. NeedIBDevice bool `json:"needIBDevice"`
  263. MinFailedTaskCount int `json:"minFailedTaskCount"`
  264. MinSucceededTaskCount int `json:"minSucceededTaskCount"`
  265. } `json:"taskRoles"`
  266. RetryCount int `json:"retryCount"`
  267. } `json:"config"`
  268. Userinfo struct {
  269. User string `json:"user"`
  270. OrgID string `json:"org_id"`
  271. } `json:"userinfo"`
  272. }
  273. func ConvertToJobResultPayload(input map[string]interface{}) (JobResultPayload, error) {
  274. data, _ := json.Marshal(input)
  275. var jobResultPayload JobResultPayload
  276. err := json.Unmarshal(data, &jobResultPayload)
  277. jobResultPayload.JobStatus.StartTime = time.Unix(jobResultPayload.JobStatus.CreatedTime/1000, 0).Format("2006-01-02 15:04:05")
  278. jobResultPayload.JobStatus.EndTime = time.Unix(jobResultPayload.JobStatus.CompletedTime/1000, 0).Format("2006-01-02 15:04:05")
  279. if jobResultPayload.JobStatus.State == string(JobWaiting) {
  280. jobResultPayload.JobStatus.StartTime = "-"
  281. jobResultPayload.JobStatus.EndTime = "-"
  282. }
  283. return jobResultPayload, err
  284. }
// ImagesResultPayload is a list of image descriptions.
// NOTE(review): the list is decoded from the JSON key "taskStatuses", which
// looks copied from the task status types — confirm the key is correct.
type ImagesResultPayload struct {
	Images []struct {
		ID int `json:"id"`
		Name string `json:"name"`
		Place string `json:"place"`
		Description string `json:"description"`
		Provider string `json:"provider"`
		Createtime string `json:"createtime"`
		Remark string `json:"remark"`
	} `json:"taskStatuses"`
}

// ImageInfo describes one image. PlaceView has no JSON tag of its own;
// presumably it is a display value filled in by callers — confirm.
type ImageInfo struct {
	ID int `json:"id"`
	Name string `json:"name"`
	Place string `json:"place"`
	Description string `json:"description"`
	Provider string `json:"provider"`
	Createtime string `json:"createtime"`
	Remark string `json:"remark"`
	IsPublic int `json:"isPublic"`
	PlaceView string
}
// Categories is a JSON list of Category entries under the "category" key.
type Categories struct {
	Category []*Category `json:"category"`
}

// Category is an id/value pair.
type Category struct {
	Id int `json:"id"`
	Value string `json:"value"`
}

// GpuInfos is a JSON list of GpuInfo entries under the "gpu_type" key.
type GpuInfos struct {
	GpuInfo []*GpuInfo `json:"gpu_type"`
}

// GpuInfo is an id/value pair together with a queue name.
type GpuInfo struct {
	Id int `json:"id"`
	Value string `json:"value"`
	Queue string `json:"queue"`
}

// ResourceSpecs is a JSON list of ResourceSpec entries.
// NOTE(review): the key is spelled "resorce_specs"; it is a runtime key, so it
// must match whatever produces this JSON — do not "fix" it in isolation.
type ResourceSpecs struct {
	ResourceSpec []*ResourceSpec `json:"resorce_specs"`
}

// ResourceSpec is one resource specification: CPU count, GPU count, memory
// and shared memory in MiB.
type ResourceSpec struct {
	Id int `json:"id"`
	CpuNum int `json:"cpu"`
	GpuNum int `json:"gpu"`
	MemMiB int `json:"memMiB"`
	ShareMemMiB int `json:"shareMemMiB"`
}

// FlavorInfos is a JSON list of FlavorInfo entries under the "flavor_info" key.
type FlavorInfos struct {
	FlavorInfo []*FlavorInfo `json:"flavor_info"`
}

// FlavorInfo is an id/value pair.
type FlavorInfo struct {
	Id int `json:"id"`
	Value string `json:"value"`
}

// PoolInfos is a JSON list of PoolInfo entries under the "pool_info" key.
type PoolInfos struct {
	PoolInfo []*PoolInfo `json:"pool_info"`
}

// PoolInfo identifies a resource pool by id, name and type.
type PoolInfo struct {
	PoolId string `json:"pool_id"`
	PoolName string `json:"pool_name"`
	PoolType string `json:"pool_type"`
}
// CommitImageParams is the JSON body for committing a container as an image:
// the IP, the task container ID, and the tag and description of the new image.
type CommitImageParams struct {
	Ip string `json:"ip"`
	TaskContainerId string `json:"taskContainerId"`
	ImageTag string `json:"imageTag"`
	ImageDescription string `json:"imageDescription"`
}

// CommitImageResult is a JSON result with a code, a message and a free-form payload.
type CommitImageResult struct {
	Code string `json:"code"`
	Msg string `json:"msg"`
	Payload map[string]interface{} `json:"payload"`
}

// CloudBrainResult is a minimal JSON result carrying only a code and a message.
type CloudBrainResult struct {
	Code string `json:"code"`
	Msg string `json:"msg"`
}
// CreateNotebookParams is the JSON body describing a notebook to create.
// Note that JobName is serialized under the key "name".
type CreateNotebookParams struct {
	JobName string `json:"name"`
	Description string `json:"description"`
	ProfileID string `json:"profile_id"`
	Flavor string `json:"flavor"`
	Spec Spec `json:"spec"`
	Workspace Workspace `json:"workspace"`
	Pool Pool `json:"pool"`
}

// Pool identifies a resource pool by id, name and type.
type Pool struct {
	ID string `json:"id"`
	Name string `json:"name"`
	Type string `json:"type"`
}

// Workspace identifies a workspace by id.
type Workspace struct {
	ID string `json:"id"`
}

// Spec groups a notebook's storage and auto-stop settings.
type Spec struct {
	Storage Storage `json:"storage"`
	AutoStop AutoStop `json:"auto_stop"`
}

// AutoStop says whether auto-stop is enabled and its duration.
// NOTE(review): the unit of Duration is not stated here — confirm.
type AutoStop struct {
	Enable bool `json:"enable"`
	Duration int `json:"duration"`
}

// Storage describes a storage type and its location.
type Storage struct {
	Type string `json:"type"`
	Location Location `json:"location"`
}

// Location wraps a storage path.
type Location struct {
	Path string `json:"path"`
}
// NotebookResult is a minimal JSON result carrying only an error code and message.
type NotebookResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg string `json:"error_msg"`
}

// CreateNotebookResult is the JSON result of creating a notebook: error
// fields plus the notebook's identity, status, timestamps, profile and
// flavor details.
type CreateNotebookResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg string `json:"error_msg"`
	ID string `json:"id"`
	Name string `json:"name"`
	Description string `json:"description"`
	Status string `json:"status"`
	CreationTimestamp string `json:"creation_timestamp"`
	LatestUpdateTimestamp string `json:"latest_update_timestamp"`
	Profile struct {
		ID string `json:"id"`
		Name string `json:"name"`
		Description string `json:"description"`
		DeType string `json:"de_type"`
		FlavorType string `json:"flavor_type"`
	} `json:"profile"`
	Flavor string `json:"flavor"`
	FlavorDetails struct {
		Name string `json:"name"`
		Status string `json:"status"`
		QueuingNum int `json:"queuing_num"`
		QueueLeftTime int `json:"queue_left_time"` // seconds
		Duration int `json:"duration"` // auto-stop time, in seconds
	} `json:"flavor_details"`
}
// GetNotebookResult is the JSON result of fetching a notebook. It has the
// fields of CreateNotebookResult plus queuing information and spec
// annotations. CreateTime, LatestUpdateTime, BeginTime and EndTime have no
// JSON tags; presumably they are display strings derived from the matching
// timestamp fields by callers — confirm.
type GetNotebookResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg string `json:"error_msg"`
	ID string `json:"id"`
	Name string `json:"name"`
	Description string `json:"description"`
	Status string `json:"status"`
	CreationTimestamp string `json:"creation_timestamp"`
	CreateTime string
	LatestUpdateTimestamp string `json:"latest_update_timestamp"`
	LatestUpdateTime string
	Profile struct {
		ID string `json:"id"`
		Name string `json:"name"`
		Description string `json:"description"`
		DeType string `json:"de_type"`
		FlavorType string `json:"flavor_type"`
	} `json:"profile"`
	Flavor string `json:"flavor"`
	FlavorDetails struct {
		Name string `json:"name"`
		Status string `json:"status"`
		QueuingNum int `json:"queuing_num"`
		QueueLeftTime int `json:"queue_left_time"` // seconds
		Duration int `json:"duration"` // auto-stop time, in seconds
	} `json:"flavor_details"`
	QueuingInfo struct {
		ID string `json:"id"`
		Name string `json:"name"`
		Flavor string `json:"flavor"`
		DeType string `json:"de_type"`
		Status string `json:"status"`
		BeginTimestamp int `json:"begin_timestamp"` // time the instance entered the queue
		BeginTime string
		RemainTime int `json:"remain_time"` // remaining time of the instance
		EndTimestamp int `json:"end_timestamp"` //
		EndTime string
		Rank int `json:"rank"` // rank of the instance in the queue
	} `json:"queuing_info"`
	Spec struct {
		Annotations struct {
			TargetDomain string `json:"target_domain"`
			Url string `json:"url"`
		} `json:"annotations"`
	} `json:"spec"`
}
// GetTokenParams is the JSON body of a token request; everything is nested
// under the "auth" key.
type GetTokenParams struct {
	Auth Auth `json:"auth"`
}

// Auth combines the identity to authenticate with and the scope requested.
type Auth struct {
	Identity Identity `json:"identity"`
	Scope Scope `json:"scope"`
}

// Scope restricts a token request to a project.
type Scope struct {
	Project Project `json:"project"`
}

// Project identifies a project by name.
type Project struct {
	Name string `json:"name"`
}

// Identity lists the authentication methods and the password credentials.
type Identity struct {
	Methods []string `json:"methods"`
	Password Password `json:"password"`
}

// Password wraps the user whose password is used to authenticate.
type Password struct {
	User NotebookUser `json:"user"`
}

// NotebookUser holds a user's name, password and domain.
type NotebookUser struct {
	Name string `json:"name"`
	Password string `json:"password"`
	Domain Domain `json:"domain"`
}

// Domain identifies a domain by name.
type Domain struct {
	Name string `json:"name"`
}
// Action names usable as NotebookAction.Action values.
const (
	ActionStart = "start"
	ActionStop = "stop"
	ActionRestart = "restart"
	ActionQueue = "queue"
	ActionDequeue = "dequeue"
)

// NotebookAction is the JSON body naming an action to perform on a notebook.
type NotebookAction struct {
	Action string `json:"action"`
}

// NotebookActionResult is the JSON result of a notebook action: error fields
// plus the current status and previous state.
type NotebookActionResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg string `json:"error_msg"`
	CurrentStatus string `json:"current_status"`
	PreviousState string `json:"previous_state"`
}

// NotebookGetJobTokenResult is the JSON result of a token request: error
// fields plus the token.
type NotebookGetJobTokenResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg string `json:"error_msg"`
	Token string `json:"token"`
}

// NotebookDelResult is the JSON result of deleting a notebook: the instance ID.
type NotebookDelResult struct {
	InstanceID string `json:"instance_id"`
}
// CreateTrainJobParams is the JSON body describing a train job to create:
// name, description, configuration and workspace ID.
type CreateTrainJobParams struct {
	JobName string `json:"job_name"`
	Description string `json:"job_desc"`
	Config Config `json:"config"`
	WorkspaceID string `json:"workspace_id"`
}

// Config is the configuration section of a train job creation request.
type Config struct {
	WorkServerNum int `json:"worker_server_num"`
	AppUrl string `json:"app_url"` // code directory of the training job
	BootFileUrl string `json:"boot_file_url"` // boot file of the training job; must be under the code directory
	Parameter []Parameter `json:"parameter"`
	DataUrl string `json:"data_url"` // OBS path URL of the dataset the training job needs
	//DatasetID string `json:"dataset_id"`
	//DataVersionID string `json:"dataset_version_id"`
	//DataSource []DataSource `json:"data_source"`
	//SpecID int64 `json:"spec_id"`
	EngineID int64 `json:"engine_id"`
	//ModelID int64 `json:"model_id"`
	TrainUrl string `json:"train_url"` // OBS path URL for the training job's output files
	LogUrl string `json:"log_url"`
	//UserImageUrl string `json:"user_image_url"`
	//UserCommand string `json:"user_command"`
	CreateVersion bool `json:"create_version"`
	//Volumes []Volumes `json:"volumes"`
	Flavor Flavor `json:"flavor"`
	PoolID string `json:"pool_id"`
}
// CreateTrainJobVersionParams is the JSON body describing a new version of a
// train job: a description and the version's configuration.
type CreateTrainJobVersionParams struct {
	Description string `json:"job_desc"`
	Config TrainJobVersionConfig `json:"config"`
}

// TrainJobVersionConfig is the configuration section of a train job version
// request. It has the fields of Config without CreateVersion, plus
// PreVersionId.
type TrainJobVersionConfig struct {
	WorkServerNum int `json:"worker_server_num"`
	AppUrl string `json:"app_url"` // code directory of the training job
	BootFileUrl string `json:"boot_file_url"` // boot file of the training job; must be under the code directory
	Parameter []Parameter `json:"parameter"`
	DataUrl string `json:"data_url"` // OBS path URL of the dataset the training job needs
	//DatasetID string `json:"dataset_id"`
	//DataVersionID string `json:"dataset_version_id"`
	//DataSource []DataSource `json:"data_source"`
	//SpecID int64 `json:"spec_id"`
	EngineID int64 `json:"engine_id"`
	//ModelID int64 `json:"model_id"`
	TrainUrl string `json:"train_url"` // OBS path URL for the training job's output files
	LogUrl string `json:"log_url"`
	//UserImageUrl string `json:"user_image_url"`
	//UserCommand string `json:"user_command"`
	//Volumes []Volumes `json:"volumes"`
	Flavor Flavor `json:"flavor"`
	PoolID string `json:"pool_id"`
	PreVersionId int64 `json:"pre_version_id"`
}
// CreateConfigParams is the JSON body describing a train job configuration to
// save: a config name and description followed by the same settings a train
// job Config carries (without CreateVersion).
type CreateConfigParams struct {
	ConfigName string `json:"config_name"`
	Description string `json:"config_desc"`
	WorkServerNum int `json:"worker_server_num"`
	AppUrl string `json:"app_url"` // code directory of the training job
	BootFileUrl string `json:"boot_file_url"` // boot file of the training job; must be under the code directory
	Parameter []Parameter `json:"parameter"`
	DataUrl string `json:"data_url"` // OBS path URL of the dataset the training job needs
	//DatasetID string `json:"dataset_id"`
	//DataVersionID string `json:"dataset_version_id"`
	//DataSource []DataSource `json:"data_source"`
	//SpecID int64 `json:"spec_id"`
	EngineID int64 `json:"engine_id"`
	//ModelID int64 `json:"model_id"`
	TrainUrl string `json:"train_url"` // OBS path URL for the training job's output files
	LogUrl string `json:"log_url"`
	//UserImageUrl string `json:"user_image_url"`
	//UserCommand string `json:"user_command"`
	//CreateVersion bool `json:"create_version"`
	//Volumes []Volumes `json:"volumes"`
	Flavor Flavor `json:"flavor"`
	PoolID string `json:"pool_id"`
}
// Parameter is a label/value pair.
type Parameter struct {
	Label string `json:"label"`
	Value string `json:"value"`
}

// Parameters is a JSON list of Parameter entries under the "parameter" key.
type Parameters struct {
	Parameter []Parameter `json:"parameter"`
}

// DataSource describes a dataset by ID, version, type and data URL.
type DataSource struct {
	DatasetID string `json:"dataset_id"`
	DatasetVersion string `json:"dataset_version"`
	Type string `json:"type"`
	DataUrl string `json:"data_url"`
}

// Volumes groups an NFS mount and a host path mount.
type Volumes struct {
	Nfs Nfs `json:"nfs"`
	HostPath HostPath `json:"host_path"`
}

// Nfs describes an NFS mount: ID, source and destination paths and whether it
// is read-only.
type Nfs struct {
	ID string `json:"id"`
	SourcePath string `json:"src_path"`
	DestPath string `json:"dest_path"`
	ReadOnly bool `json:"read_only"`
}

// HostPath describes a host path mount: source and destination paths and
// whether it is read-only.
type HostPath struct {
	SourcePath string `json:"src_path"`
	DestPath string `json:"dest_path"`
	ReadOnly bool `json:"read_only"`
}

// Flavor identifies a flavor by code.
type Flavor struct {
	Code string `json:"code"`
}
// CreateTrainJobResult is the JSON result of creating a train job: error and
// success fields plus the job's name, ID, status, creation time, version and
// resource identifiers.
type CreateTrainJobResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg string `json:"error_msg"`
	IsSuccess bool `json:"is_success"`
	JobName string `json:"job_name"`
	JobID int64 `json:"job_id"`
	Status int `json:"status"`
	CreateTime int64 `json:"create_time"`
	VersionID int64 `json:"version_id"`
	ResourceID string `json:"resource_id"`
	VersionName string `json:"version_name"`
}

// CreateTrainJobConfigResult is a JSON result carrying only error and success fields.
type CreateTrainJobConfigResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg string `json:"error_msg"`
	IsSuccess bool `json:"is_success"`
}

// GetResourceSpecsResult is the JSON result of listing resource
// specifications: error and success fields, a total count and the specs.
type GetResourceSpecsResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg string `json:"error_msg"`
	IsSuccess bool `json:"is_success"`
	SpecTotalCount int `json:"spec_total_count"`
	Specs []Specs `json:"specs"`
}

// Specs is one resource specification entry of a GetResourceSpecsResult.
type Specs struct {
	Core string `json:"core"`
	Cpu string `json:"cpu"`
	IsNoResource bool `json:"no_resource"`
	GpuType string `json:"gpu_type"`
	SpecID int64 `json:"spec_id"`
	GpuNum int `json:"gpu_num"`
	SpecCode string `json:"spec_code"`
	Storage string `json:"storage"`
	MaxNum int `json:"max_num"`
	UnitNum int `json:"unit_num"`
	InterfaceType int `json:"interface_type"`
}
// GetConfigListResult is the JSON result of listing train job
// configurations: error and success fields, a total count and the configs.
type GetConfigListResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg string `json:"error_msg"`
	IsSuccess bool `json:"is_success"`
	ConfigTotalCount int `json:"config_total_count"`
	ParaConfigs []ParaConfig `json:"configs"`
}

// ParaConfig is one entry of a configuration listing. Result has no JSON tag;
// presumably callers attach the full configuration to it after a separate
// fetch — confirm.
type ParaConfig struct {
	ConfigName string `json:"config_name"`
	ConfigDesc string `json:"config_desc"`
	CreateTime int64 `json:"create_time"`
	EngineType int `json:"engine_type"`
	EngineName string `json:"engine_name"`
	EngineId int64 `json:"engine_id"`
	EngineVersion string `json:"engine_version"`
	UserImageUrl string `json:"user_image_url"`
	UserCommand string `json:"user_command"`
	Result GetConfigResult
}

// GetConfigResult is the JSON result of fetching one train job configuration:
// error and success fields followed by the same settings CreateConfigParams
// carries.
type GetConfigResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg string `json:"error_msg"`
	IsSuccess bool `json:"is_success"`
	ConfigName string `json:"config_name"`
	Description string `json:"config_desc"`
	WorkServerNum int `json:"worker_server_num"`
	AppUrl string `json:"app_url"` // code directory of the training job
	BootFileUrl string `json:"boot_file_url"` // boot file of the training job; must be under the code directory
	Parameter []Parameter `json:"parameter"`
	DataUrl string `json:"data_url"` // OBS path URL of the dataset the training job needs
	//DatasetID string `json:"dataset_id"`
	//DataVersionID string `json:"dataset_version_id"`
	//DataSource []DataSource `json:"data_source"`
	//SpecID int64 `json:"spec_id"`
	EngineID int64 `json:"engine_id"`
	//ModelID int64 `json:"model_id"`
	TrainUrl string `json:"train_url"` // OBS path URL for the training job's output files
	LogUrl string `json:"log_url"`
	//UserImageUrl string `json:"user_image_url"`
	//UserCommand string `json:"user_command"`
	//CreateVersion bool `json:"create_version"`
	//Volumes []Volumes `json:"volumes"`
	Flavor Flavor `json:"flavor"`
	PoolID string `json:"pool_id"`
}
// ErrorResult is a JSON error result. Unlike the other result types in this
// file, the message is read from the key "error_message" rather than
// "error_msg".
type ErrorResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg string `json:"error_message"`
	IsSuccess bool `json:"is_success"`
}

// GetTrainJobResult is the JSON result of fetching a train job. Status,
// CreateTime, TrainJobDuration and DatasetName have no JSON tags; presumably
// callers derive them from IntStatus, LongCreateTime and Duration or fill
// them from local data — confirm.
type GetTrainJobResult struct {
	IsSuccess bool `json:"is_success"`
	JobName string `json:"job_name"`
	JobID int64 `json:"job_id"`
	Description string `json:"job_desc"`
	IntStatus int `json:"status"`
	Status string
	LongCreateTime int64 `json:"create_time"`
	CreateTime string
	Duration int64 `json:"duration"` // running time of the training job, in milliseconds
	TrainJobDuration string // running time of the training job, formatted as hh:mm:ss
	VersionID int64 `json:"version_id"`
	ResourceID string `json:"resource_id"`
	VersionName string `json:"version_name"`
	PreVersionID int64 `json:"pre_version_id"`
	WorkServerNum int `json:"worker_server_num"`
	AppUrl string `json:"app_url"` // code directory of the training job
	BootFileUrl string `json:"boot_file_url"` // boot file of the training job; must be under the code directory
	Parameter []Parameter `json:"parameter"`
	DataUrl string `json:"data_url"` // OBS path URL of the dataset the training job needs
	//DatasetID string `json:"dataset_id"`
	//DataVersionID string `json:"dataset_version_id"`
	//DataSource []DataSource `json:"data_source"`
	//SpecID int64 `json:"spec_id"`
	EngineID int64 `json:"engine_id"`
	EngineName string `json:"engine_name"`
	EngineVersion string `json:"engine_version"`
	//ModelID int64 `json:"model_id"`
	TrainUrl string `json:"train_url"` // OBS path URL for the training job's output files
	LogUrl string `json:"log_url"`
	//UserImageUrl string `json:"user_image_url"`
	//UserCommand string `json:"user_command"`
	//Volumes []Volumes `json:"volumes"`
	Flavor Flavor `json:"flavor"`
	PoolID string `json:"pool_id"`
	PoolName string `json:"pool_name"`
	NasMountPath string `json:"nas_mount_path"`
	NasShareAddr string `json:"nas_share_addr"`
	DatasetName string
	ModelMetricList string `json:"model_metric_list"` // the list contains f1_score, recall, precision and accuracy, when present
}
// GetTrainJobLogResult is the JSON result of fetching a train job log: error
// and success fields, the log content, the line count and the start and end
// line markers.
type GetTrainJobLogResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg string `json:"error_msg"`
	IsSuccess bool `json:"is_success"`
	Content string `json:"content"`
	Lines int `json:"lines"`
	StartLine string `json:"start_line"`
	EndLine string `json:"end_line"`
}

// GetTrainJobLogFileNamesResult is the JSON result of listing a train job's
// log files: error and success fields plus the file names.
type GetTrainJobLogFileNamesResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg string `json:"error_msg"`
	IsSuccess bool `json:"is_success"`
	LogFileList []string `json:"log_file_list"`
}

// TrainJobResult is a JSON result carrying only error and success fields.
type TrainJobResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg string `json:"error_msg"`
	IsSuccess bool `json:"is_success"`
}

// LogFile names a log file.
type LogFile struct {
	Name string
}
  778. func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) {
  779. sess := x.NewSession()
  780. defer sess.Close()
  781. var cond = builder.NewCond()
  782. if opts.RepoID > 0 {
  783. cond = cond.And(
  784. builder.Eq{"cloudbrain.repo_id": opts.RepoID},
  785. )
  786. }
  787. if opts.UserID > 0 {
  788. cond = cond.And(
  789. builder.Eq{"cloudbrain.user_id": opts.UserID},
  790. )
  791. }
  792. if (opts.JobID) != "" {
  793. cond = cond.And(
  794. builder.Eq{"cloudbrain.job_id": opts.JobID},
  795. )
  796. }
  797. if (opts.Type) >= 0 {
  798. cond = cond.And(
  799. builder.Eq{"cloudbrain.type": opts.Type},
  800. )
  801. }
  802. if (opts.JobType) != "" {
  803. cond = cond.And(
  804. builder.Eq{"cloudbrain.job_type": opts.JobType},
  805. )
  806. }
  807. if (opts.VersionName) != "" {
  808. cond = cond.And(
  809. builder.Eq{"cloudbrain.version_name": opts.VersionName},
  810. )
  811. }
  812. // switch opts.JobStatus {
  813. // case JobWaiting:
  814. // cond.And(builder.Eq{"cloudbrain.status": int(JobWaiting)})
  815. // case JobFailed:
  816. // cond.And(builder.Eq{"cloudbrain.status": int(JobFailed)})
  817. // case JobStopped:
  818. // cond.And(builder.Eq{"cloudbrain.status": int(JobStopped)})
  819. // case JobSucceeded:
  820. // cond.And(builder.Eq{"cloudbrain.status": int(JobSucceeded)})
  821. // }
  822. if len(opts.CloudbrainIDs) > 0 {
  823. cond = cond.And(builder.In("cloudbrain.id", opts.CloudbrainIDs))
  824. }
  825. count, err := sess.Where(cond).Count(new(Cloudbrain))
  826. if err != nil {
  827. return nil, 0, fmt.Errorf("Count: %v", err)
  828. }
  829. if opts.Page >= 0 && opts.PageSize > 0 {
  830. var start int
  831. if opts.Page == 0 {
  832. start = 0
  833. } else {
  834. start = (opts.Page - 1) * opts.PageSize
  835. }
  836. sess.Limit(opts.PageSize, start)
  837. }
  838. sess.OrderBy("cloudbrain.created_unix DESC")
  839. cloudbrains := make([]*CloudbrainInfo, 0, setting.UI.IssuePagingNum)
  840. if err := sess.Table(&Cloudbrain{}).Where(cond).
  841. Join("left", "`user`", "cloudbrain.user_id = `user`.id").
  842. Find(&cloudbrains); err != nil {
  843. return nil, 0, fmt.Errorf("Find: %v", err)
  844. }
  845. sess.Close()
  846. return cloudbrains, count, nil
  847. }
  848. func CloudbrainsVersionList(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) {
  849. sess := x.NewSession()
  850. defer sess.Close()
  851. var cond = builder.NewCond()
  852. if opts.RepoID > 0 {
  853. cond = cond.And(
  854. builder.Eq{"cloudbrain.repo_id": opts.RepoID},
  855. )
  856. }
  857. if opts.UserID > 0 {
  858. cond = cond.And(
  859. builder.Eq{"cloudbrain.user_id": opts.UserID},
  860. )
  861. }
  862. if (opts.Type) >= 0 {
  863. cond = cond.And(
  864. builder.Eq{"cloudbrain.type": opts.Type},
  865. )
  866. }
  867. if (opts.JobID) != "" {
  868. cond = cond.And(
  869. builder.Eq{"cloudbrain.job_id": opts.JobID},
  870. )
  871. }
  872. if (opts.JobType) != "" {
  873. cond = cond.And(
  874. builder.Eq{"cloudbrain.job_type": opts.JobType},
  875. )
  876. }
  877. if len(opts.CloudbrainIDs) > 0 {
  878. cond = cond.And(builder.In("cloudbrain.id", opts.CloudbrainIDs))
  879. }
  880. count, err := sess.Where(cond).Count(new(Cloudbrain))
  881. if err != nil {
  882. return nil, 0, fmt.Errorf("Count: %v", err)
  883. }
  884. if opts.Page >= 0 && opts.PageSize > 0 {
  885. var start int
  886. if opts.Page == 0 {
  887. start = 0
  888. } else {
  889. start = (opts.Page - 1) * opts.PageSize
  890. }
  891. sess.Limit(opts.PageSize, start)
  892. }
  893. sess.OrderBy("cloudbrain.created_unix DESC")
  894. cloudbrains := make([]*CloudbrainInfo, 0, setting.UI.IssuePagingNum)
  895. if err := sess.Table(&Cloudbrain{}).Where(cond).
  896. Join("left", "`user`", "cloudbrain.user_id = `user`.id").
  897. Find(&cloudbrains); err != nil {
  898. return nil, 0, fmt.Errorf("Find: %v", err)
  899. }
  900. sess.Close()
  901. return cloudbrains, count, nil
  902. }
  903. func CreateCloudbrain(cloudbrain *Cloudbrain) (err error) {
  904. if _, err = x.Insert(cloudbrain); err != nil {
  905. return err
  906. }
  907. return nil
  908. }
  909. func CreateTrainjobConfigDetail(trainjobConfigDetail *TrainjobConfigDetail) (err error) {
  910. if _, err = x.Insert(trainjobConfigDetail); err != nil {
  911. return err
  912. }
  913. return nil
  914. }
  915. func getRepoCloudBrain(cb *Cloudbrain) (*Cloudbrain, error) {
  916. has, err := x.Get(cb)
  917. if err != nil {
  918. return nil, err
  919. } else if !has {
  920. return nil, ErrJobNotExist{}
  921. }
  922. return cb, nil
  923. }
  924. func GetRepoCloudBrainByJobID(repoID int64, jobID string) (*Cloudbrain, error) {
  925. cb := &Cloudbrain{JobID: jobID, RepoID: repoID}
  926. return getRepoCloudBrain(cb)
  927. }
  928. func GetCloudbrainByJobID(jobID string) (*Cloudbrain, error) {
  929. cb := &Cloudbrain{JobID: jobID}
  930. return getRepoCloudBrain(cb)
  931. }
  932. func GetCloudbrainByJobIDAndVersionName(jobID string, versionName string) (*Cloudbrain, error) {
  933. cb := &Cloudbrain{JobID: jobID, VersionName: versionName}
  934. return getRepoCloudBrain(cb)
  935. }
  936. func GetCloudbrainsNeededStopByUserID(userID int64) ([]*Cloudbrain, error) {
  937. cloudBrains := make([]*Cloudbrain, 0)
  938. err := x.Cols("job_id", "status", "type").Where("user_id=? AND status !=?", userID, string(JobStopped)).Find(&cloudBrains)
  939. return cloudBrains, err
  940. }
  941. func GetCloudbrainsNeededStopByRepoID(repoID int64) ([]*Cloudbrain, error) {
  942. cloudBrains := make([]*Cloudbrain, 0)
  943. err := x.Cols("job_id", "status", "type").Where("repo_id=? AND status !=?", repoID, string(JobStopped)).Find(&cloudBrains)
  944. return cloudBrains, err
  945. }
  946. func SetCloudbrainStatusByJobID(jobID string, status CloudbrainStatus) (err error) {
  947. cb := &Cloudbrain{JobID: jobID, Status: string(status)}
  948. _, err = x.Cols("status").Where("cloudbrain.job_id=?", jobID).Update(cb)
  949. return
  950. }
  951. func SetTrainJobStatusByJobID(jobID string, status string, duration int64, trainjobduration string) (err error) {
  952. cb := &Cloudbrain{JobID: jobID, Status: string(status), Duration: duration, TrainJobDuration: trainjobduration}
  953. _, err = x.Cols("status", "duration", "train_job_duration").Where("cloudbrain.job_id=?", jobID).Update(cb)
  954. return
  955. }
  956. func SetVersionCountByJobID(jobID string, versionName string, versionCount int64) (err error) {
  957. cb := &Cloudbrain{JobID: jobID, VersionName: versionName, VersionCount: versionCount}
  958. _, err = x.Cols("version_Count").Where("cloudbrain.job_id=? AND cloudbrain.version_name=?", jobID, versionName).Update(cb)
  959. return
  960. }
  961. func UpdateJob(job *Cloudbrain) error {
  962. return updateJob(x, job)
  963. }
  964. func updateJob(e Engine, job *Cloudbrain) error {
  965. var sess *xorm.Session
  966. sess = e.Where("job_id = ?", job.JobID)
  967. _, err := sess.Cols("status", "container_id", "container_ip").Update(job)
  968. return err
  969. }
  970. // func UpdateTrainJob(job *CloudbrainInfo) error {
  971. // return updateTrainJob(x, job)
  972. // }
  973. // func updateTrainJob(e Engine, job *CloudbrainInfo) error {
  974. // var sess *xorm.Session
  975. // sess = e.Where("job_id = ?", job.Cloudbrain.JobID)
  976. // _, err := sess.Cols("status", "container_id", "container_ip").Update(job)
  977. // return err
  978. // }
  979. func DeleteJob(job *Cloudbrain) error {
  980. return deleteJob(x, job)
  981. }
  982. func deleteJob(e Engine, job *Cloudbrain) error {
  983. _, err := e.ID(job.ID).Delete(job)
  984. return err
  985. }
  986. func GetCloudbrainByName(jobName string) (*Cloudbrain, error) {
  987. cb := &Cloudbrain{JobName: jobName}
  988. return getRepoCloudBrain(cb)
  989. }
  990. func CanDelJob(isSigned bool, user *User, job *CloudbrainInfo) bool {
  991. if !isSigned || (job.Status != string(JobStopped) && job.Status != string(JobFailed) && job.Status != string(ModelArtsStartFailed) && job.Status != string(ModelArtsCreateFailed)) {
  992. return false
  993. }
  994. repo, err := GetRepositoryByID(job.RepoID)
  995. if err != nil {
  996. log.Error("GetRepositoryByID failed:%v", err.Error())
  997. return false
  998. }
  999. permission, _ := GetUserRepoPermission(repo, user)
  1000. if err != nil {
  1001. log.Error("GetUserRepoPermission failed:%v", err.Error())
  1002. return false
  1003. }
  1004. if (user.ID == job.UserID && permission.AccessMode >= AccessModeWrite) || user.IsAdmin || permission.AccessMode >= AccessModeAdmin {
  1005. return true
  1006. }
  1007. return false
  1008. }