You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

cloudbrain.go 37 kB

4 years ago
4 years ago
5 years ago
4 years ago
4 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
4 years ago
5 years ago
5 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
4 years ago
4 years ago
5 years ago
4 years ago
5 years ago
4 years ago
4 years ago
4 years ago
5 years ago
5 years ago
4 years ago
4 years ago
5 years ago
5 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago

  1. package models
  2. import (
  3. "encoding/json"
  4. "fmt"
  5. "strings"
  6. "time"
  7. "xorm.io/builder"
  8. "xorm.io/xorm"
  9. "code.gitea.io/gitea/modules/log"
  10. "code.gitea.io/gitea/modules/setting"
  11. "code.gitea.io/gitea/modules/timeutil"
  12. )
  // CloudbrainStatus is the string type of the Job* state constants (JobWaiting, JobStopped, ...) declared in this file.
  13. type CloudbrainStatus string
  // JobType is the string type of the JobType* constants (and of JobVersionName) declared in this file.
  14. type JobType string
  // ModelArtsJobStatus is the string type of the ModelArts* state constants declared in this file.
  15. type ModelArtsJobStatus string
  // Job states, job types and ModelArts states used by this package.
  16. const (
  // Job states (CloudbrainStatus).
  17. JobWaiting CloudbrainStatus = "WAITING"
  18. JobStopped CloudbrainStatus = "STOPPED"
  19. JobSucceeded CloudbrainStatus = "SUCCEEDED"
  20. JobFailed CloudbrainStatus = "FAILED"
  21. JobRunning CloudbrainStatus = "RUNNING"
  // Job types (JobType).
  22. JobTypeDebug JobType = "DEBUG"
  23. JobTypeBenchmark JobType = "BENCHMARK"
  24. JobTypeSnn4imagenet JobType = "SNN4IMAGENET"
  25. JobTypeBrainScore JobType = "BRAINSCORE"
  26. JobTypeTrain JobType = "TRAIN"
  // NOTE(review): JobVersionName is typed JobType although its value looks like a version label — confirm the intended type.
  27. JobVersionName JobType = "V0001"
  // ModelArts states (ModelArtsJobStatus).
  28. ModelArtsCreateQueue ModelArtsJobStatus = "CREATE_QUEUING" // free resource: queued for creation
  29. ModelArtsCreating ModelArtsJobStatus = "CREATING" // creating
  30. ModelArtsCreateFailed ModelArtsJobStatus = "CREATE_FAILED" // creation failed
  31. ModelArtsStartQueuing ModelArtsJobStatus = "START_QUEUING" // free resource: queued for start
  32. ModelArtsReadyToStart ModelArtsJobStatus = "READY_TO_START" // free resource: waiting to start
  33. ModelArtsStarting ModelArtsJobStatus = "STARTING" // starting
  34. ModelArtsRestarting ModelArtsJobStatus = "RESTARTING" // restarting
  35. ModelArtsStartFailed ModelArtsJobStatus = "START_FAILED" // start failed
  36. ModelArtsRunning ModelArtsJobStatus = "RUNNING" // running
  37. ModelArtsStopping ModelArtsJobStatus = "STOPPING" // stopping
  38. ModelArtsStopped ModelArtsJobStatus = "STOPPED" // stopped
  39. ModelArtsUnavailable ModelArtsJobStatus = "UNAVAILABLE" // faulty
  40. ModelArtsDeleted ModelArtsJobStatus = "DELETED" // deleted
  41. ModelArtsResizing ModelArtsJobStatus = "RESIZING" // flavor (spec) change in progress
  42. ModelArtsResizFailed ModelArtsJobStatus = "RESIZE_FAILED" // flavor (spec) change failed
  43. )
  // Cloudbrain is the xorm-mapped record of a cloudbrain job.
  // Fields tagged `xorm:"-"` (CanDebug, CanDel, User, Repo) are excluded from
  // the table mapping and only exist on the in-memory value.
  44. type Cloudbrain struct {
  45. ID int64 `xorm:"pk autoincr"`
  46. JobID string `xorm:"INDEX NOT NULL"`
  // JobType defaults to 'DEBUG' at the column level (the value of JobTypeDebug).
  47. JobType string `xorm:"INDEX NOT NULL DEFAULT 'DEBUG'"`
  48. JobName string `xorm:"INDEX"`
  49. Status string `xorm:"INDEX"`
  50. UserID int64 `xorm:"INDEX"`
  51. RepoID int64 `xorm:"INDEX"`
  52. SubTaskName string `xorm:"INDEX"`
  53. ContainerID string
  54. ContainerIp string
  // CreatedUnix / UpdatedUnix carry xorm's `created` / `updated` tags.
  55. CreatedUnix timeutil.TimeStamp `xorm:"INDEX created"`
  56. UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"`
  57. Duration int64 `xorm:"INDEX duration"`
  58. TrainJobDuration string
  // DeletedAt carries xorm's `deleted` tag (soft-delete marker column).
  59. DeletedAt time.Time `xorm:"deleted"`
  60. CanDebug bool `xorm:"-"`
  61. CanDel bool `xorm:"-"`
  // NOTE(review): the meaning of the Type values is not visible in this file; Cloudbrains filters on it whenever opts.Type >= 0.
  62. Type int `xorm:"INDEX DEFAULT 0"`
  63. VersionID int64 `xorm:"INDEX DEFAULT 0"`
  64. VersionName string
  65. Uuid string
  66. DatasetName string
  67. VersionCount int64 `xorm:"INDEX DEFAULT 1"`
  // IsLatestVersion is stored as a string; Cloudbrains filters on it when the option is non-empty.
  68. IsLatestVersion string
  69. CommitID string
  70. User *User `xorm:"-"`
  71. Repo *Repository `xorm:"-"`
  72. }
  // TrainjobConfigDetail is an xorm-mapped record holding the configuration
  // values of a train job. Every persisted column is indexed; User and Repo
  // are tagged `xorm:"-"` and are not part of the table mapping.
  73. type TrainjobConfigDetail struct {
  74. ID int64 `xorm:"pk autoincr"`
  75. JobID string `xorm:"INDEX"`
  76. JobName string `xorm:"INDEX"`
  77. ResourcePools string `xorm:"INDEX"`
  78. EngineVersions int `xorm:"INDEX"`
  79. FlavorInfos string `xorm:"INDEX"`
  80. TrainUrl string `xorm:"INDEX"`
  81. BootFile string `xorm:"INDEX"`
  82. Uuid string `xorm:"INDEX"`
  83. DatasetName string `xorm:"INDEX"`
  84. Params string `xorm:"INDEX"`
  85. BranchName string `xorm:"INDEX"`
  86. VersionName string `xorm:"INDEX"`
  87. User *User `xorm:"-"`
  88. Repo *Repository `xorm:"-"`
  89. }
  // CloudbrainInfo combines a Cloudbrain row with its User row. Both are
  // embedded with `xorm:"extends"`; Cloudbrains fills a slice of these from a
  // cloudbrain LEFT JOIN user query.
  90. type CloudbrainInfo struct {
  91. Cloudbrain `xorm:"extends"`
  92. User `xorm:"extends"`
  93. }
  // CloudBrainLoginResult holds a code, a message and a free-form payload.
  // The fields carry no json tags, unlike the other *Result types in this file.
  // NOTE(review): presumably the response of a login call — confirm against the caller.
  94. type CloudBrainLoginResult struct {
  95. Code string
  96. Msg string
  97. Payload map[string]interface{}
  98. }
  // TaskRole is the JSON description of one task role; CreateJobParams carries
  // a slice of these under "taskRoles".
  99. type TaskRole struct {
  100. Name string `json:"name"`
  101. TaskNumber int `json:"taskNumber"`
  102. MinSucceededTaskCount int `json:"minSucceededTaskCount"`
  103. MinFailedTaskCount int `json:"minFailedTaskCount"`
  104. CPUNumber int `json:"cpuNumber"`
  105. GPUNumber int `json:"gpuNumber"`
  106. MemoryMB int `json:"memoryMB"`
  107. ShmMB int `json:"shmMB"`
  108. Command string `json:"command"`
  109. NeedIBDevice bool `json:"needIBDevice"`
  110. IsMainRole bool `json:"isMainRole"`
  111. UseNNI bool `json:"useNNI"`
  112. }
  // StHostPath is the JSON description of a host-path mount (path, mount path,
  // read-only flag); it is the HostPath field of Volume.
  113. type StHostPath struct {
  114. Path string `json:"path"`
  115. MountPath string `json:"mountPath"`
  116. ReadOnly bool `json:"readOnly"`
  117. }
  // Volume wraps a single StHostPath under the JSON key "hostPath";
  // CreateJobParams carries a slice of these under "volumes".
  118. type Volume struct {
  119. HostPath StHostPath `json:"hostPath"`
  120. }
  // CreateJobParams is the JSON body describing a job to create: name, retry
  // count, GPU type, image, task roles and volumes.
  121. type CreateJobParams struct {
  122. JobName string `json:"jobName"`
  123. RetryCount int8 `json:"retryCount"`
  124. GpuType string `json:"gpuType"`
  125. Image string `json:"image"`
  126. TaskRoles []TaskRole `json:"taskRoles"`
  127. Volumes []Volume `json:"volumes"`
  128. }
  // CreateJobResult is a JSON envelope with code, msg and an untyped payload.
  // NOTE(review): presumably the response to a CreateJobParams request — confirm against the caller.
  129. type CreateJobResult struct {
  130. Code string `json:"code"`
  131. Msg string `json:"msg"`
  132. Payload map[string]interface{} `json:"payload"`
  133. }
  // GetJobResult is a JSON envelope with code, msg and an untyped payload.
  // The payload has the same map type that ConvertToJobResultPayload and
  // ConvertToTaskPod accept as input.
  134. type GetJobResult struct {
  135. Code string `json:"code"`
  136. Msg string `json:"msg"`
  137. Payload map[string]interface{} `json:"payload"`
  138. }
  // GetImagesResult is a JSON envelope with code, msg and a typed
  // GetImagesPayload under "payload".
  139. type GetImagesResult struct {
  140. Code string `json:"code"`
  141. Msg string `json:"msg"`
  142. Payload GetImagesPayload `json:"payload"`
  143. }
  // GetImagesPayload is the payload of GetImagesResult: a count, an optional
  // page total (omitted from JSON when zero) and the image rows under "rows".
  144. type GetImagesPayload struct {
  145. Count int `json:"count"`
  146. TotalPages int `json:"totalPages,omitempty"`
  147. ImageInfo []*ImageInfo `json:"rows"`
  148. }
  // CloudbrainsOptions are the filter and paging options read by Cloudbrains.
  // Cloudbrains applies RepoID/UserID only when > 0, JobID/JobType/
  // IsLatestVersion only when non-empty, Type only when >= 0, and
  // CloudbrainIDs only when the slice is non-empty.
  149. type CloudbrainsOptions struct {
  150. ListOptions
  151. RepoID int64 // include all repos if empty
  152. UserID int64
  153. JobID string
  154. SortType string
  155. CloudbrainIDs []int64
  156. // JobStatus CloudbrainStatus
  157. Type int
  158. JobType string
  159. VersionName string
  160. IsLatestVersion string
  161. }
  // TaskPod is the decoded form of a task-role entry: the role name plus the
  // list of per-task statuses. ConvertToTaskPod produces it from a generic map.
  162. type TaskPod struct {
  163. TaskRoleStatus struct {
  164. Name string `json:"name"`
  165. } `json:"taskRoleStatus"`
  // The commented-out inline struct below has the same fields as the named
  // TaskStatuses type that the live field now uses.
  166. //TaskStatuses []struct {
  167. // TaskIndex int `json:"taskIndex"`
  168. // PodUID string `json:"podUid"`
  169. // PodIP string `json:"podIp"`
  170. // PodName string `json:"podName"`
  171. // ContainerID string `json:"containerId"`
  172. // ContainerIP string `json:"containerIp"`
  173. // ContainerGpus string `json:"containerGpus"`
  174. // State string `json:"state"`
  175. // StartAt time.Time `json:"startAt"`
  176. // FinishedAt time.Time `json:"finishedAt"`
  177. // ExitCode int `json:"exitCode"`
  178. // ExitDiagnostics string `json:"exitDiagnostics"`
  179. // RetriedCount int `json:"retriedCount"`
  180. // StartTime string
  181. // FinishedTime string
  182. //} `json:"taskStatuses"`
  183. TaskStatuses []TaskStatuses `json:"taskStatuses"`
  184. }
  // TaskStatuses is the JSON status of a single task (one element of
  // TaskPod.TaskStatuses), despite the plural name.
  185. type TaskStatuses struct {
  186. TaskIndex int `json:"taskIndex"`
  187. PodUID string `json:"podUid"`
  188. PodIP string `json:"podIp"`
  189. PodName string `json:"podName"`
  190. ContainerID string `json:"containerId"`
  191. ContainerIP string `json:"containerIp"`
  192. ContainerGpus string `json:"containerGpus"`
  193. State string `json:"state"`
  194. StartAt time.Time `json:"startAt"`
  195. FinishedAt time.Time `json:"finishedAt"`
  196. ExitCode int `json:"exitCode"`
  197. ExitDiagnostics string `json:"exitDiagnostics"`
  198. RetriedCount int `json:"retriedCount"`
  // StartTime and FinishedTime have no json tag; ConvertToTaskPod fills them
  // (for element 0 only) with formatted strings derived from StartAt and
  // FinishedAt, using "-" for a FinishedTime that starts with "0001".
  199. StartTime string
  200. FinishedTime string
  201. }
  // TaskInfo is a JSON record with snake_case keys describing a task: user,
  // task name, code name and link, selected categories and GPU type.
  202. type TaskInfo struct {
  203. Username string `json:"username"`
  204. TaskName string `json:"task_name"`
  205. CodeName string `json:"code_name"`
  // BenchmarkCategory is serialized under the key "selected_category".
  206. BenchmarkCategory []string `json:"selected_category"`
  207. CodeLink string `json:"code_link"`
  208. GpuType string `json:"gpu_type"`
  209. }
  210. func ConvertToTaskPod(input map[string]interface{}) (TaskPod, error) {
  211. data, _ := json.Marshal(input)
  212. var taskPod TaskPod
  213. err := json.Unmarshal(data, &taskPod)
  214. taskPod.TaskStatuses[0].StartTime = time.Unix(taskPod.TaskStatuses[0].StartAt.Unix()+8*3600, 0).UTC().Format("2006-01-02 15:04:05")
  215. taskPod.TaskStatuses[0].FinishedTime = time.Unix(taskPod.TaskStatuses[0].FinishedAt.Unix()+8*3600, 0).UTC().Format("2006-01-02 15:04:05")
  216. //if the task is not finished or stopped,the cloudbrain renturns 0001-01-01 08:00:00, the finishedTime shows with -
  217. if strings.HasPrefix(taskPod.TaskStatuses[0].FinishedTime, "0001") {
  218. taskPod.TaskStatuses[0].FinishedTime = "-"
  219. }
  220. return taskPod, err
  221. }
  // JobResultPayload is the typed form of a job-detail payload.
  // ConvertToJobResultPayload decodes a generic map into it.
  222. type JobResultPayload struct {
  223. ID string `json:"id"`
  224. Name string `json:"name"`
  225. Platform string `json:"platform"`
  226. JobStatus struct {
  227. Username string `json:"username"`
  228. State string `json:"state"`
  229. SubState string `json:"subState"`
  230. ExecutionType string `json:"executionType"`
  231. Retries int `json:"retries"`
  // CreatedTime and CompletedTime are divided by 1000 before being passed to
  // time.Unix in ConvertToJobResultPayload, i.e. they are treated as milliseconds.
  232. CreatedTime int64 `json:"createdTime"`
  233. CompletedTime int64 `json:"completedTime"`
  234. AppID string `json:"appId"`
  235. AppProgress string `json:"appProgress"`
  236. AppTrackingURL string `json:"appTrackingUrl"`
  237. AppLaunchedTime int64 `json:"appLaunchedTime"`
  238. AppCompletedTime interface{} `json:"appCompletedTime"`
  239. AppExitCode int `json:"appExitCode"`
  240. AppExitDiagnostics string `json:"appExitDiagnostics"`
  241. AppExitType interface{} `json:"appExitType"`
  242. VirtualCluster string `json:"virtualCluster"`
  // StartTime and EndTime have no json tag; ConvertToJobResultPayload sets
  // them to formatted strings, or to "-" when State equals JobWaiting.
  243. StartTime string
  244. EndTime string
  245. } `json:"jobStatus"`
  // TaskRoles stays untyped; its values have the map shape that
  // ConvertToTaskPod accepts — NOTE(review): confirm that callers pass them there.
  246. TaskRoles map[string]interface{} `json:"taskRoles"`
  247. Resource struct {
  248. CPU int `json:"cpu"`
  249. Memory string `json:"memory"`
  250. NvidiaComGpu int `json:"nvidia.com/gpu"`
  251. } `json:"resource"`
  252. Config struct {
  253. Image string `json:"image"`
  254. JobID string `json:"jobId"`
  255. GpuType string `json:"gpuType"`
  256. JobName string `json:"jobName"`
  257. JobType string `json:"jobType"`
  258. TaskRoles []struct {
  259. Name string `json:"name"`
  260. ShmMB int `json:"shmMB"`
  261. Command string `json:"command"`
  262. MemoryMB int `json:"memoryMB"`
  263. CPUNumber int `json:"cpuNumber"`
  264. GpuNumber int `json:"gpuNumber"`
  265. IsMainRole bool `json:"isMainRole"`
  266. TaskNumber int `json:"taskNumber"`
  267. NeedIBDevice bool `json:"needIBDevice"`
  268. MinFailedTaskCount int `json:"minFailedTaskCount"`
  269. MinSucceededTaskCount int `json:"minSucceededTaskCount"`
  270. } `json:"taskRoles"`
  271. RetryCount int `json:"retryCount"`
  272. } `json:"config"`
  273. Userinfo struct {
  274. User string `json:"user"`
  275. OrgID string `json:"org_id"`
  276. } `json:"userinfo"`
  277. }
  278. func ConvertToJobResultPayload(input map[string]interface{}) (JobResultPayload, error) {
  279. data, _ := json.Marshal(input)
  280. var jobResultPayload JobResultPayload
  281. err := json.Unmarshal(data, &jobResultPayload)
  282. jobResultPayload.JobStatus.StartTime = time.Unix(jobResultPayload.JobStatus.CreatedTime/1000, 0).Format("2006-01-02 15:04:05")
  283. jobResultPayload.JobStatus.EndTime = time.Unix(jobResultPayload.JobStatus.CompletedTime/1000, 0).Format("2006-01-02 15:04:05")
  284. if jobResultPayload.JobStatus.State == string(JobWaiting) {
  285. jobResultPayload.JobStatus.StartTime = "-"
  286. jobResultPayload.JobStatus.EndTime = "-"
  287. }
  288. return jobResultPayload, err
  289. }
  // ImagesResultPayload holds a list of image descriptions.
  // NOTE(review): the Images field is tagged json:"taskStatuses", which looks
  // copied from TaskPod — confirm the key the producer actually uses before
  // relying on this type.
  290. type ImagesResultPayload struct {
  291. Images []struct {
  292. ID int `json:"id"`
  293. Name string `json:"name"`
  294. Place string `json:"place"`
  295. Description string `json:"description"`
  296. Provider string `json:"provider"`
  297. Createtime string `json:"createtime"`
  298. Remark string `json:"remark"`
  299. } `json:"taskStatuses"`
  300. }
  // ImageInfo is the JSON description of one image; GetImagesPayload carries a
  // slice of pointers to these under "rows".
  301. type ImageInfo struct {
  302. ID int `json:"id"`
  303. Name string `json:"name"`
  304. Place string `json:"place"`
  305. Description string `json:"description"`
  306. Provider string `json:"provider"`
  307. Createtime string `json:"createtime"`
  308. Remark string `json:"remark"`
  309. IsPublic int `json:"isPublic"`
  // PlaceView has no json tag — presumably a display value derived from Place; TODO confirm.
  310. PlaceView string
  311. }
  // Categories wraps a list of Category entries under the JSON key "category".
  312. type Categories struct {
  313. Category []*Category `json:"category"`
  314. }
  // Category is an id/value pair (one element of Categories.Category).
  315. type Category struct {
  316. Id int `json:"id"`
  317. Value string `json:"value"`
  318. }
  // GpuInfos wraps a list of GpuInfo entries under the JSON key "gpu_type".
  319. type GpuInfos struct {
  320. GpuInfo []*GpuInfo `json:"gpu_type"`
  321. }
  // GpuInfo is one GPU option: id, value and queue (one element of GpuInfos.GpuInfo).
  322. type GpuInfo struct {
  323. Id int `json:"id"`
  324. Value string `json:"value"`
  325. Queue string `json:"queue"`
  326. }
  // ResourceSpecs wraps a list of ResourceSpec entries.
  // NOTE(review): the JSON key is spelled "resorce_specs" (sic). It is a
  // runtime string and must match the JSON producer, so do not correct the
  // spelling without changing both sides.
  327. type ResourceSpecs struct {
  328. ResourceSpec []*ResourceSpec `json:"resorce_specs"`
  329. }
  // ResourceSpec is one resource specification: id, CPU count, GPU count,
  // memory and shared memory (both in MiB, per the field and key names).
  330. type ResourceSpec struct {
  331. Id int `json:"id"`
  332. CpuNum int `json:"cpu"`
  333. GpuNum int `json:"gpu"`
  334. MemMiB int `json:"memMiB"`
  335. ShareMemMiB int `json:"shareMemMiB"`
  336. }
  // FlavorInfos wraps a list of FlavorInfo entries under the JSON key "flavor_info".
  337. type FlavorInfos struct {
  338. FlavorInfo []*FlavorInfo `json:"flavor_info"`
  339. }
  // FlavorInfo is an id/value pair (one element of FlavorInfos.FlavorInfo).
  340. type FlavorInfo struct {
  341. Id int `json:"id"`
  342. Value string `json:"value"`
  343. }
  // PoolInfos wraps a list of PoolInfo entries under the JSON key "pool_info".
  344. type PoolInfos struct {
  345. PoolInfo []*PoolInfo `json:"pool_info"`
  346. }
  // PoolInfo describes one pool by id, name and type (one element of PoolInfos.PoolInfo).
  347. type PoolInfo struct {
  348. PoolId string `json:"pool_id"`
  349. PoolName string `json:"pool_name"`
  350. PoolType string `json:"pool_type"`
  351. }
  // CommitImageParams is the JSON body for committing an image: the IP and
  // task container id to commit from, plus the tag and description to apply.
  352. type CommitImageParams struct {
  353. Ip string `json:"ip"`
  354. TaskContainerId string `json:"taskContainerId"`
  355. ImageTag string `json:"imageTag"`
  356. ImageDescription string `json:"imageDescription"`
  357. }
  // CommitImageResult is a JSON envelope with code, msg and an untyped payload.
  // NOTE(review): presumably the response to a CommitImageParams request — confirm against the caller.
  358. type CommitImageResult struct {
  359. Code string `json:"code"`
  360. Msg string `json:"msg"`
  361. Payload map[string]interface{} `json:"payload"`
  362. }
  // CloudBrainResult is the minimal JSON envelope: code and msg, no payload.
  363. type CloudBrainResult struct {
  364. Code string `json:"code"`
  365. Msg string `json:"msg"`
  366. }
  // CreateNotebookParams is the JSON body describing a notebook to create.
  367. type CreateNotebookParams struct {
  // JobName is serialized under the key "name".
  368. JobName string `json:"name"`
  369. Description string `json:"description"`
  370. ProfileID string `json:"profile_id"`
  371. Flavor string `json:"flavor"`
  372. Spec Spec `json:"spec"`
  373. Workspace Workspace `json:"workspace"`
  374. Pool Pool `json:"pool"`
  375. }
  // Pool identifies a pool by id, name and type; it is the "pool" field of CreateNotebookParams.
  376. type Pool struct {
  377. ID string `json:"id"`
  378. Name string `json:"name"`
  379. Type string `json:"type"`
  380. }
  // Workspace identifies a workspace by id; it is the "workspace" field of CreateNotebookParams.
  381. type Workspace struct {
  382. ID string `json:"id"`
  383. }
  // Spec groups the storage and auto-stop settings; it is the "spec" field of CreateNotebookParams.
  384. type Spec struct {
  385. Storage Storage `json:"storage"`
  386. AutoStop AutoStop `json:"auto_stop"`
  387. }
  // AutoStop is the auto-stop setting inside Spec: an enable flag and a duration.
  // NOTE(review): the unit of Duration is not stated here; the FlavorDetails
  // structs in this file annotate their auto-stop duration as seconds — confirm.
  388. type AutoStop struct {
  389. Enable bool `json:"enable"`
  390. Duration int `json:"duration"`
  391. }
  // Storage is the storage setting inside Spec: a type and a location.
  392. type Storage struct {
  393. Type string `json:"type"`
  394. Location Location `json:"location"`
  395. }
  // Location holds the path of a Storage.
  396. type Location struct {
  397. Path string `json:"path"`
  398. }
  // NotebookResult carries only the error code and error message of a notebook response.
  399. type NotebookResult struct {
  400. ErrorCode string `json:"error_code"`
  401. ErrorMsg string `json:"error_msg"`
  402. }
  // CreateNotebookResult is the JSON form of a notebook-creation response:
  // error fields, notebook identity/status/timestamps, profile and flavor details.
  403. type CreateNotebookResult struct {
  404. ErrorCode string `json:"error_code"`
  405. ErrorMsg string `json:"error_msg"`
  406. ID string `json:"id"`
  407. Name string `json:"name"`
  408. Description string `json:"description"`
  409. Status string `json:"status"`
  410. CreationTimestamp string `json:"creation_timestamp"`
  411. LatestUpdateTimestamp string `json:"latest_update_timestamp"`
  412. Profile struct {
  413. ID string `json:"id"`
  414. Name string `json:"name"`
  415. Description string `json:"description"`
  416. DeType string `json:"de_type"`
  417. FlavorType string `json:"flavor_type"`
  418. } `json:"profile"`
  419. Flavor string `json:"flavor"`
  420. FlavorDetails struct {
  421. Name string `json:"name"`
  422. Status string `json:"status"`
  423. QueuingNum int `json:"queuing_num"`
  424. QueueLeftTime int `json:"queue_left_time"` // seconds
  425. Duration int `json:"duration"` // auto-stop time, in seconds
  426. } `json:"flavor_details"`
  427. }
  // GetNotebookResult is the JSON form of a notebook-detail response. It has
  // the fields of CreateNotebookResult plus queuing information, spec
  // annotations, and untagged companion strings for some timestamps.
  428. type GetNotebookResult struct {
  429. ErrorCode string `json:"error_code"`
  430. ErrorMsg string `json:"error_msg"`
  431. ID string `json:"id"`
  432. Name string `json:"name"`
  433. Description string `json:"description"`
  434. Status string `json:"status"`
  435. CreationTimestamp string `json:"creation_timestamp"`
  // CreateTime has no json tag — presumably a display form of CreationTimestamp filled by callers; TODO confirm.
  436. CreateTime string
  437. LatestUpdateTimestamp string `json:"latest_update_timestamp"`
  // LatestUpdateTime has no json tag — presumably a display form of LatestUpdateTimestamp; TODO confirm.
  438. LatestUpdateTime string
  439. Profile struct {
  440. ID string `json:"id"`
  441. Name string `json:"name"`
  442. Description string `json:"description"`
  443. DeType string `json:"de_type"`
  444. FlavorType string `json:"flavor_type"`
  445. } `json:"profile"`
  446. Flavor string `json:"flavor"`
  447. FlavorDetails struct {
  448. Name string `json:"name"`
  449. Status string `json:"status"`
  450. QueuingNum int `json:"queuing_num"`
  451. QueueLeftTime int `json:"queue_left_time"` // seconds
  452. Duration int `json:"duration"` // auto-stop time, in seconds
  453. } `json:"flavor_details"`
  454. QueuingInfo struct {
  455. ID string `json:"id"`
  456. Name string `json:"name"`
  457. Flavor string `json:"flavor"`
  458. DeType string `json:"de_type"`
  459. Status string `json:"status"`
  460. BeginTimestamp int `json:"begin_timestamp"` // time at which the instance entered the queue
  // BeginTime has no json tag — presumably a display form of BeginTimestamp; TODO confirm.
  461. BeginTime string
  462. RemainTime int `json:"remain_time"` // remaining time of the instance
  463. EndTimestamp int `json:"end_timestamp"` //
  // EndTime has no json tag — presumably a display form of EndTimestamp; TODO confirm.
  464. EndTime string
  465. Rank int `json:"rank"` // rank of the instance in the queue
  466. } `json:"queuing_info"`
  467. Spec struct {
  468. Annotations struct {
  469. TargetDomain string `json:"target_domain"`
  470. Url string `json:"url"`
  471. } `json:"annotations"`
  472. } `json:"spec"`
  473. }
  // GetTokenParams is the JSON body of a token request; everything is nested under "auth".
  474. type GetTokenParams struct {
  475. Auth Auth `json:"auth"`
  476. }
  // Auth is the "auth" object of GetTokenParams: an identity and a scope.
  477. type Auth struct {
  478. Identity Identity `json:"identity"`
  479. Scope Scope `json:"scope"`
  480. }
  // Scope is the "scope" object of Auth; it names a project.
  481. type Scope struct {
  482. Project Project `json:"project"`
  483. }
  // Project identifies a project by name inside Scope.
  484. type Project struct {
  485. Name string `json:"name"`
  486. }
  // Identity is the "identity" object of Auth: the list of methods and the password credentials.
  487. type Identity struct {
  488. Methods []string `json:"methods"`
  489. Password Password `json:"password"`
  490. }
  // Password is the "password" object of Identity; it wraps the user credentials.
  491. type Password struct {
  492. User NotebookUser `json:"user"`
  493. }
  // NotebookUser holds the user name, password and domain used inside Password.
  // The Password field is serialized as-is under "password".
  494. type NotebookUser struct {
  495. Name string `json:"name"`
  496. Password string `json:"password"`
  497. Domain Domain `json:"domain"`
  498. }
  // Domain identifies a domain by name inside NotebookUser.
  499. type Domain struct {
  500. Name string `json:"name"`
  501. }
  // Action names as untyped string constants.
  // NOTE(review): presumably the values placed in NotebookAction.Action — confirm against the callers.
  502. const (
  503. ActionStart = "start"
  504. ActionStop = "stop"
  505. ActionRestart = "restart"
  506. ActionQueue = "queue"
  507. ActionDequeue = "dequeue"
  508. )
  // NotebookAction is the JSON body naming an action under the key "action".
  509. type NotebookAction struct {
  510. Action string `json:"action"`
  511. }
  // NotebookActionResult carries the error fields plus the current status and previous state.
  512. type NotebookActionResult struct {
  513. ErrorCode string `json:"error_code"`
  514. ErrorMsg string `json:"error_msg"`
  515. CurrentStatus string `json:"current_status"`
  516. PreviousState string `json:"previous_state"`
  517. }
  // NotebookGetJobTokenResult carries the error fields plus a token string.
  518. type NotebookGetJobTokenResult struct {
  519. ErrorCode string `json:"error_code"`
  520. ErrorMsg string `json:"error_msg"`
  521. Token string `json:"token"`
  522. }
  // NotebookDelResult carries the instance id under "instance_id".
  523. type NotebookDelResult struct {
  524. InstanceID string `json:"instance_id"`
  525. }
  // CreateTrainJobParams is the JSON body describing a train job to create:
  // name, description ("job_desc"), configuration and workspace id.
  526. type CreateTrainJobParams struct {
  527. JobName string `json:"job_name"`
  528. Description string `json:"job_desc"`
  529. Config Config `json:"config"`
  530. WorkspaceID string `json:"workspace_id"`
  531. }
  // Config is the "config" object of CreateTrainJobParams. The commented-out
  // fields are kept from the original source and are not serialized.
  532. type Config struct {
  533. WorkServerNum int `json:"worker_server_num"`
  534. AppUrl string `json:"app_url"` // code directory of the training job
  535. BootFileUrl string `json:"boot_file_url"` // boot file of the training job; must be under the code directory
  536. Parameter []Parameter `json:"parameter"`
  537. DataUrl string `json:"data_url"` // OBS path URL of the dataset the training job needs
  538. //DatasetID string `json:"dataset_id"`
  539. //DataVersionID string `json:"dataset_version_id"`
  540. //DataSource []DataSource `json:"data_source"`
  541. //SpecID int64 `json:"spec_id"`
  542. EngineID int64 `json:"engine_id"`
  543. //ModelID int64 `json:"model_id"`
  544. TrainUrl string `json:"train_url"` // OBS path URL for the training job's output files
  545. LogUrl string `json:"log_url"`
  546. //UserImageUrl string `json:"user_image_url"`
  547. //UserCommand string `json:"user_command"`
  548. CreateVersion bool `json:"create_version"`
  549. //Volumes []Volumes `json:"volumes"`
  550. Flavor Flavor `json:"flavor"`
  551. PoolID string `json:"pool_id"`
  552. }
  // CreateTrainJobVersionParams is the JSON body describing a new train-job
  // version: a description ("job_desc") and a TrainJobVersionConfig.
  553. type CreateTrainJobVersionParams struct {
  554. Description string `json:"job_desc"`
  555. Config TrainJobVersionConfig `json:"config"`
  556. }
  // TrainJobVersionConfig is the "config" object of CreateTrainJobVersionParams.
  // Compared with Config it has no CreateVersion field and adds PreVersionId.
  557. type TrainJobVersionConfig struct {
  558. WorkServerNum int `json:"worker_server_num"`
  559. AppUrl string `json:"app_url"` // code directory of the training job
  560. BootFileUrl string `json:"boot_file_url"` // boot file of the training job; must be under the code directory
  561. Parameter []Parameter `json:"parameter"`
  562. DataUrl string `json:"data_url"` // OBS path URL of the dataset the training job needs
  563. //DatasetID string `json:"dataset_id"`
  564. //DataVersionID string `json:"dataset_version_id"`
  565. //DataSource []DataSource `json:"data_source"`
  566. //SpecID int64 `json:"spec_id"`
  567. EngineID int64 `json:"engine_id"`
  568. //ModelID int64 `json:"model_id"`
  569. TrainUrl string `json:"train_url"` // OBS path URL for the training job's output files
  570. LogUrl string `json:"log_url"`
  571. //UserImageUrl string `json:"user_image_url"`
  572. //UserCommand string `json:"user_command"`
  573. //Volumes []Volumes `json:"volumes"`
  574. Flavor Flavor `json:"flavor"`
  575. PoolID string `json:"pool_id"`
  576. PreVersionId int64 `json:"pre_version_id"`
  577. }
  // CreateConfigParams is the JSON body describing a train-job configuration
  // to create: a config name and description plus the same settings as Config
  // (without CreateVersion, which is commented out here).
  578. type CreateConfigParams struct {
  579. ConfigName string `json:"config_name"`
  580. Description string `json:"config_desc"`
  581. WorkServerNum int `json:"worker_server_num"`
  582. AppUrl string `json:"app_url"` // code directory of the training job
  583. BootFileUrl string `json:"boot_file_url"` // boot file of the training job; must be under the code directory
  584. Parameter []Parameter `json:"parameter"`
  585. DataUrl string `json:"data_url"` // OBS path URL of the dataset the training job needs
  586. //DatasetID string `json:"dataset_id"`
  587. //DataVersionID string `json:"dataset_version_id"`
  588. //DataSource []DataSource `json:"data_source"`
  589. //SpecID int64 `json:"spec_id"`
  590. EngineID int64 `json:"engine_id"`
  591. //ModelID int64 `json:"model_id"`
  592. TrainUrl string `json:"train_url"` // OBS path URL for the training job's output files
  593. LogUrl string `json:"log_url"`
  594. //UserImageUrl string `json:"user_image_url"`
  595. //UserCommand string `json:"user_command"`
  596. //CreateVersion bool `json:"create_version"`
  597. //Volumes []Volumes `json:"volumes"`
  598. Flavor Flavor `json:"flavor"`
  599. PoolID string `json:"pool_id"`
  600. }
  // Parameter is a label/value pair; the train-job config types carry a slice
  // of these under "parameter".
  601. type Parameter struct {
  602. Label string `json:"label"`
  603. Value string `json:"value"`
  604. }
  // Parameters wraps a list of Parameter entries under the JSON key "parameter".
  605. type Parameters struct {
  606. Parameter []Parameter `json:"parameter"`
  607. }
  // DataSource describes a dataset source by id, version, type and data URL.
  // In this file it is referenced only from commented-out fields.
  608. type DataSource struct {
  609. DatasetID string `json:"dataset_id"`
  610. DatasetVersion string `json:"dataset_version"`
  611. Type string `json:"type"`
  612. DataUrl string `json:"data_url"`
  613. }
  // Volumes pairs an Nfs mount with a HostPath mount (snake_case JSON keys).
  // In this file it is referenced only from commented-out fields; it is
  // distinct from Volume, which uses StHostPath.
  614. type Volumes struct {
  615. Nfs Nfs `json:"nfs"`
  616. HostPath HostPath `json:"host_path"`
  617. }
  // Nfs describes an NFS mount: id, source path, destination path and read-only flag.
  618. type Nfs struct {
  619. ID string `json:"id"`
  620. SourcePath string `json:"src_path"`
  621. DestPath string `json:"dest_path"`
  622. ReadOnly bool `json:"read_only"`
  623. }
  // HostPath describes a host-path mount: source path, destination path and
  // read-only flag (snake_case keys, unlike StHostPath).
  624. type HostPath struct {
  625. SourcePath string `json:"src_path"`
  626. DestPath string `json:"dest_path"`
  627. ReadOnly bool `json:"read_only"`
  628. }
  // Flavor identifies a flavor by its code; the train-job config types carry it under "flavor".
  629. type Flavor struct {
  630. Code string `json:"code"`
  631. }
  // CreateTrainJobResult is the JSON form of a train-job creation response:
  // error fields, success flag, and the identity of the created job and version.
  632. type CreateTrainJobResult struct {
  633. ErrorCode string `json:"error_code"`
  634. ErrorMsg string `json:"error_msg"`
  635. IsSuccess bool `json:"is_success"`
  636. JobName string `json:"job_name"`
  637. JobID int64 `json:"job_id"`
  638. Status int `json:"status"`
  639. CreateTime int64 `json:"create_time"`
  640. VersionID int64 `json:"version_id"`
  641. ResourceID string `json:"resource_id"`
  642. VersionName string `json:"version_name"`
  643. }
  // CreateTrainJobConfigResult carries the error fields and success flag only.
  644. type CreateTrainJobConfigResult struct {
  645. ErrorCode string `json:"error_code"`
  646. ErrorMsg string `json:"error_msg"`
  647. IsSuccess bool `json:"is_success"`
  648. }
  // GetResourceSpecsResult is the JSON form of a resource-spec listing: error
  // fields, success flag, a total count and the list of Specs.
  649. type GetResourceSpecsResult struct {
  650. ErrorCode string `json:"error_code"`
  651. ErrorMsg string `json:"error_msg"`
  652. IsSuccess bool `json:"is_success"`
  653. SpecTotalCount int `json:"spec_total_count"`
  654. Specs []Specs `json:"specs"`
  655. }
  // Specs is one resource specification entry of GetResourceSpecsResult
  // (a single spec, despite the plural name).
  656. type Specs struct {
  657. Core string `json:"core"`
  658. Cpu string `json:"cpu"`
  // IsNoResource is serialized under the key "no_resource".
  659. IsNoResource bool `json:"no_resource"`
  660. GpuType string `json:"gpu_type"`
  661. SpecID int64 `json:"spec_id"`
  662. GpuNum int `json:"gpu_num"`
  663. SpecCode string `json:"spec_code"`
  664. Storage string `json:"storage"`
  665. MaxNum int `json:"max_num"`
  666. UnitNum int `json:"unit_num"`
  667. InterfaceType int `json:"interface_type"`
  668. }
  // GetConfigListResult is the JSON form of a configuration listing: error
  // fields, success flag, a total count and the entries under "configs".
  669. type GetConfigListResult struct {
  670. ErrorCode string `json:"error_code"`
  671. ErrorMsg string `json:"error_msg"`
  672. IsSuccess bool `json:"is_success"`
  673. ConfigTotalCount int `json:"config_total_count"`
  674. ParaConfigs []ParaConfig `json:"configs"`
  675. }
  // ParaConfig is one entry of GetConfigListResult.ParaConfigs: the summary of
  // a stored configuration (name, description, creation time, engine details).
  676. type ParaConfig struct {
  677. ConfigName string `json:"config_name"`
  678. ConfigDesc string `json:"config_desc"`
  679. CreateTime int64 `json:"create_time"`
  680. EngineType int `json:"engine_type"`
  681. EngineName string `json:"engine_name"`
  682. EngineId int64 `json:"engine_id"`
  683. EngineVersion string `json:"engine_version"`
  684. UserImageUrl string `json:"user_image_url"`
  685. UserCommand string `json:"user_command"`
  // Result has no json tag — presumably the full configuration fetched separately and attached by callers; TODO confirm.
  686. Result GetConfigResult
  687. }
  // GetConfigResult is the JSON form of a single stored configuration: error
  // fields and success flag followed by the same settings as CreateConfigParams.
  688. type GetConfigResult struct {
  689. ErrorCode string `json:"error_code"`
  690. ErrorMsg string `json:"error_msg"`
  691. IsSuccess bool `json:"is_success"`
  692. ConfigName string `json:"config_name"`
  693. Description string `json:"config_desc"`
  694. WorkServerNum int `json:"worker_server_num"`
  695. AppUrl string `json:"app_url"` // code directory of the training job
  696. BootFileUrl string `json:"boot_file_url"` // boot file of the training job; must be under the code directory
  697. Parameter []Parameter `json:"parameter"`
  698. DataUrl string `json:"data_url"` // OBS path URL of the dataset the training job needs
  699. //DatasetID string `json:"dataset_id"`
  700. //DataVersionID string `json:"dataset_version_id"`
  701. //DataSource []DataSource `json:"data_source"`
  702. //SpecID int64 `json:"spec_id"`
  703. EngineID int64 `json:"engine_id"`
  704. //ModelID int64 `json:"model_id"`
  705. TrainUrl string `json:"train_url"` // OBS path URL for the training job's output files
  706. LogUrl string `json:"log_url"`
  707. //UserImageUrl string `json:"user_image_url"`
  708. //UserCommand string `json:"user_command"`
  709. //CreateVersion bool `json:"create_version"`
  710. //Volumes []Volumes `json:"volumes"`
  711. Flavor Flavor `json:"flavor"`
  712. PoolID string `json:"pool_id"`
  713. }
  // ErrorResult carries an error code, error message and success flag.
  // Note that ErrorMsg is tagged "error_message" here, whereas the other
  // result types in this file use "error_msg".
  // NOTE(review): confirm the difference is intentional before unifying.
  714. type ErrorResult struct {
  715. ErrorCode string `json:"error_code"`
  716. ErrorMsg string `json:"error_message"`
  717. IsSuccess bool `json:"is_success"`
  718. }
  // GetTrainJobResult is the JSON form of a train-job detail response. Fields
  // without a json tag (Status, CreateTime, TrainJobDuration, DatasetName) are
  // presumably filled by callers from the tagged fields — TODO confirm.
  719. type GetTrainJobResult struct {
  720. IsSuccess bool `json:"is_success"`
  721. JobName string `json:"job_name"`
  722. JobID int64 `json:"job_id"`
  723. Description string `json:"job_desc"`
  // IntStatus is the numeric "status" value; Status is its untagged string companion.
  724. IntStatus int `json:"status"`
  725. Status string
  // LongCreateTime is the numeric "create_time" value; CreateTime is its untagged string companion.
  726. LongCreateTime int64 `json:"create_time"`
  727. CreateTime string
  728. Duration int64 `json:"duration"` // running time of the training job, in milliseconds
  729. TrainJobDuration string // running time of the training job, formatted as hh:mm:ss
  730. VersionID int64 `json:"version_id"`
  731. ResourceID string `json:"resource_id"`
  732. VersionName string `json:"version_name"`
  733. PreVersionID int64 `json:"pre_version_id"`
  734. WorkServerNum int `json:"worker_server_num"`
  735. AppUrl string `json:"app_url"` // code directory of the training job
  736. BootFileUrl string `json:"boot_file_url"` // boot file of the training job; must be under the code directory
  737. Parameter []Parameter `json:"parameter"`
  738. DataUrl string `json:"data_url"` // OBS path URL of the dataset the training job needs
  739. //DatasetID string `json:"dataset_id"`
  740. //DataVersionID string `json:"dataset_version_id"`
  741. //DataSource []DataSource `json:"data_source"`
  742. //SpecID int64 `json:"spec_id"`
  743. EngineID int64 `json:"engine_id"`
  744. EngineName string `json:"engine_name"`
  745. EngineVersion string `json:"engine_version"`
  746. //ModelID int64 `json:"model_id"`
  747. TrainUrl string `json:"train_url"` // OBS path URL for the training job's output files
  748. LogUrl string `json:"log_url"`
  749. //UserImageUrl string `json:"user_image_url"`
  750. //UserCommand string `json:"user_command"`
  751. //Volumes []Volumes `json:"volumes"`
  752. Flavor Flavor `json:"flavor"`
  753. PoolID string `json:"pool_id"`
  754. PoolName string `json:"pool_name"`
  755. NasMountPath string `json:"nas_mount_path"`
  756. NasShareAddr string `json:"nas_share_addr"`
  757. DatasetName string
  758. ModelMetricList string `json:"model_metric_list"` // the list contains f1_score, recall, precision and accuracy, when available
  759. }
  // GetTrainJobLogResult is the JSON form of a train-job log response: error
  // fields, success flag, the log content, a line count and the start/end
  // line markers (both strings).
  760. type GetTrainJobLogResult struct {
  761. ErrorCode string `json:"error_code"`
  762. ErrorMsg string `json:"error_msg"`
  763. IsSuccess bool `json:"is_success"`
  764. Content string `json:"content"`
  765. Lines int `json:"lines"`
  766. StartLine string `json:"start_line"`
  767. EndLine string `json:"end_line"`
  768. }
  // GetTrainJobLogFileNamesResult carries the error fields, success flag and
  // the list of log file names under "log_file_list".
  769. type GetTrainJobLogFileNamesResult struct {
  770. ErrorCode string `json:"error_code"`
  771. ErrorMsg string `json:"error_msg"`
  772. IsSuccess bool `json:"is_success"`
  773. LogFileList []string `json:"log_file_list"`
  774. }
  // TrainJobResult carries the error fields and success flag only.
  775. type TrainJobResult struct {
  776. ErrorCode string `json:"error_code"`
  777. ErrorMsg string `json:"error_msg"`
  778. IsSuccess bool `json:"is_success"`
  779. }
  // LogFile holds the name of a log file. It carries no json tag.
  780. type LogFile struct {
  781. Name string
  782. }
  783. func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) {
  784. sess := x.NewSession()
  785. defer sess.Close()
  786. var cond = builder.NewCond()
  787. if opts.RepoID > 0 {
  788. cond = cond.And(
  789. builder.Eq{"cloudbrain.repo_id": opts.RepoID},
  790. )
  791. }
  792. if opts.UserID > 0 {
  793. cond = cond.And(
  794. builder.Eq{"cloudbrain.user_id": opts.UserID},
  795. )
  796. }
  797. if (opts.JobID) != "" {
  798. cond = cond.And(
  799. builder.Eq{"cloudbrain.job_id": opts.JobID},
  800. )
  801. }
  802. if (opts.Type) >= 0 {
  803. cond = cond.And(
  804. builder.Eq{"cloudbrain.type": opts.Type},
  805. )
  806. }
  807. if (opts.JobType) != "" {
  808. cond = cond.And(
  809. builder.Eq{"cloudbrain.job_type": opts.JobType},
  810. )
  811. }
  812. if (opts.IsLatestVersion) != "" {
  813. cond = cond.And(
  814. builder.Eq{"cloudbrain.is_latest_version": opts.IsLatestVersion},
  815. )
  816. }
  817. // switch opts.JobStatus {
  818. // case JobWaiting:
  819. // cond.And(builder.Eq{"cloudbrain.status": int(JobWaiting)})
  820. // case JobFailed:
  821. // cond.And(builder.Eq{"cloudbrain.status": int(JobFailed)})
  822. // case JobStopped:
  823. // cond.And(builder.Eq{"cloudbrain.status": int(JobStopped)})
  824. // case JobSucceeded:
  825. // cond.And(builder.Eq{"cloudbrain.status": int(JobSucceeded)})
  826. // }
  827. if len(opts.CloudbrainIDs) > 0 {
  828. cond = cond.And(builder.In("cloudbrain.id", opts.CloudbrainIDs))
  829. }
  830. count, err := sess.Where(cond).Count(new(Cloudbrain))
  831. if err != nil {
  832. return nil, 0, fmt.Errorf("Count: %v", err)
  833. }
  834. if opts.Page >= 0 && opts.PageSize > 0 {
  835. var start int
  836. if opts.Page == 0 {
  837. start = 0
  838. } else {
  839. start = (opts.Page - 1) * opts.PageSize
  840. }
  841. sess.Limit(opts.PageSize, start)
  842. }
  843. sess.OrderBy("cloudbrain.created_unix DESC")
  844. cloudbrains := make([]*CloudbrainInfo, 0, setting.UI.IssuePagingNum)
  845. if err := sess.Table(&Cloudbrain{}).Where(cond).
  846. Join("left", "`user`", "cloudbrain.user_id = `user`.id").
  847. Find(&cloudbrains); err != nil {
  848. return nil, 0, fmt.Errorf("Find: %v", err)
  849. }
  850. sess.Close()
  851. return cloudbrains, count, nil
  852. }
  853. func CloudbrainsVersionList(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) {
  854. sess := x.NewSession()
  855. defer sess.Close()
  856. var cond = builder.NewCond()
  857. if opts.RepoID > 0 {
  858. cond = cond.And(
  859. builder.Eq{"cloudbrain.repo_id": opts.RepoID},
  860. )
  861. }
  862. if opts.UserID > 0 {
  863. cond = cond.And(
  864. builder.Eq{"cloudbrain.user_id": opts.UserID},
  865. )
  866. }
  867. if (opts.Type) >= 0 {
  868. cond = cond.And(
  869. builder.Eq{"cloudbrain.type": opts.Type},
  870. )
  871. }
  872. if (opts.JobID) != "" {
  873. cond = cond.And(
  874. builder.Eq{"cloudbrain.job_id": opts.JobID},
  875. )
  876. }
  877. if (opts.JobType) != "" {
  878. cond = cond.And(
  879. builder.Eq{"cloudbrain.job_type": opts.JobType},
  880. )
  881. }
  882. if len(opts.CloudbrainIDs) > 0 {
  883. cond = cond.And(builder.In("cloudbrain.id", opts.CloudbrainIDs))
  884. }
  885. count, err := sess.Where(cond).Count(new(Cloudbrain))
  886. if err != nil {
  887. return nil, 0, fmt.Errorf("Count: %v", err)
  888. }
  889. if opts.Page >= 0 && opts.PageSize > 0 {
  890. var start int
  891. if opts.Page == 0 {
  892. start = 0
  893. } else {
  894. start = (opts.Page - 1) * opts.PageSize
  895. }
  896. sess.Limit(opts.PageSize, start)
  897. }
  898. sess.OrderBy("cloudbrain.created_unix DESC")
  899. cloudbrains := make([]*CloudbrainInfo, 0, setting.UI.IssuePagingNum)
  900. if err := sess.Table(&Cloudbrain{}).Where(cond).
  901. Join("left", "`user`", "cloudbrain.user_id = `user`.id").
  902. Find(&cloudbrains); err != nil {
  903. return nil, 0, fmt.Errorf("Find: %v", err)
  904. }
  905. sess.Close()
  906. return cloudbrains, count, nil
  907. }
  908. func CreateCloudbrain(cloudbrain *Cloudbrain) (err error) {
  909. if _, err = x.Insert(cloudbrain); err != nil {
  910. return err
  911. }
  912. return nil
  913. }
  914. func CreateTrainjobConfigDetail(trainjobConfigDetail *TrainjobConfigDetail) (err error) {
  915. if _, err = x.Insert(trainjobConfigDetail); err != nil {
  916. return err
  917. }
  918. return nil
  919. }
  920. func getRepoCloudBrain(cb *Cloudbrain) (*Cloudbrain, error) {
  921. has, err := x.Get(cb)
  922. if err != nil {
  923. return nil, err
  924. } else if !has {
  925. return nil, ErrJobNotExist{}
  926. }
  927. return cb, nil
  928. }
  929. func GetRepoCloudBrainByJobID(repoID int64, jobID string) (*Cloudbrain, error) {
  930. cb := &Cloudbrain{JobID: jobID, RepoID: repoID}
  931. return getRepoCloudBrain(cb)
  932. }
  933. func GetCloudbrainByJobID(jobID string) (*Cloudbrain, error) {
  934. cb := &Cloudbrain{JobID: jobID}
  935. return getRepoCloudBrain(cb)
  936. }
  937. func GetCloudbrainByJobIDAndVersionName(jobID string, versionName string) (*Cloudbrain, error) {
  938. cb := &Cloudbrain{JobID: jobID, VersionName: versionName}
  939. return getRepoCloudBrain(cb)
  940. }
  941. func GetCloudbrainByJobIDAndIsLatestVersion(jobID string, isLatestVersion string) (*Cloudbrain, error) {
  942. cb := &Cloudbrain{JobID: jobID, IsLatestVersion: isLatestVersion}
  943. return getRepoCloudBrain(cb)
  944. }
  945. func GetCloudbrainsNeededStopByUserID(userID int64) ([]*Cloudbrain, error) {
  946. cloudBrains := make([]*Cloudbrain, 0)
  947. err := x.Cols("job_id", "status", "type").Where("user_id=? AND status !=?", userID, string(JobStopped)).Find(&cloudBrains)
  948. return cloudBrains, err
  949. }
  950. func GetCloudbrainsNeededStopByRepoID(repoID int64) ([]*Cloudbrain, error) {
  951. cloudBrains := make([]*Cloudbrain, 0)
  952. err := x.Cols("job_id", "status", "type").Where("repo_id=? AND status !=?", repoID, string(JobStopped)).Find(&cloudBrains)
  953. return cloudBrains, err
  954. }
  955. func SetCloudbrainStatusByJobID(jobID string, status CloudbrainStatus) (err error) {
  956. cb := &Cloudbrain{JobID: jobID, Status: string(status)}
  957. _, err = x.Cols("status").Where("cloudbrain.job_id=?", jobID).Update(cb)
  958. return
  959. }
  960. func SetTrainJobStatusByJobID(jobID string, status string, duration int64, trainjobduration string) (err error) {
  961. cb := &Cloudbrain{JobID: jobID, Status: string(status), Duration: duration, TrainJobDuration: trainjobduration}
  962. _, err = x.Cols("status", "duration", "train_job_duration").Where("cloudbrain.job_id=?", jobID).Update(cb)
  963. return
  964. }
  965. func SetVersionCountAndLatestVersionByJobIDAndVersionName(jobID string, versionName string, versionCount int64, isLatestVersion string) (err error) {
  966. cb := &Cloudbrain{JobID: jobID, VersionName: versionName, VersionCount: versionCount, IsLatestVersion: isLatestVersion}
  967. _, err = x.Cols("version_Count", "is_latest_version").Where("cloudbrain.job_id=? AND cloudbrain.version_name=?", jobID, versionName).Update(cb)
  968. return
  969. }
  970. func UpdateJob(job *Cloudbrain) error {
  971. return updateJob(x, job)
  972. }
  973. func updateJob(e Engine, job *Cloudbrain) error {
  974. var sess *xorm.Session
  975. sess = e.Where("job_id = ?", job.JobID)
  976. _, err := sess.Cols("status", "container_id", "container_ip").Update(job)
  977. return err
  978. }
  979. // func UpdateTrainJob(job *CloudbrainInfo) error {
  980. // return updateTrainJob(x, job)
  981. // }
  982. // func updateTrainJob(e Engine, job *CloudbrainInfo) error {
  983. // var sess *xorm.Session
  984. // sess = e.Where("job_id = ?", job.Cloudbrain.JobID)
  985. // _, err := sess.Cols("status", "container_id", "container_ip").Update(job)
  986. // return err
  987. // }
  988. func DeleteJob(job *Cloudbrain) error {
  989. return deleteJob(x, job)
  990. }
  991. func deleteJob(e Engine, job *Cloudbrain) error {
  992. _, err := e.ID(job.ID).Delete(job)
  993. return err
  994. }
  995. func GetCloudbrainByName(jobName string) (*Cloudbrain, error) {
  996. cb := &Cloudbrain{JobName: jobName}
  997. return getRepoCloudBrain(cb)
  998. }
  999. func CanDelJob(isSigned bool, user *User, job *CloudbrainInfo) bool {
  1000. if !isSigned || (job.Status != string(JobStopped) && job.Status != string(JobFailed) && job.Status != string(ModelArtsStartFailed) && job.Status != string(ModelArtsCreateFailed)) {
  1001. return false
  1002. }
  1003. repo, err := GetRepositoryByID(job.RepoID)
  1004. if err != nil {
  1005. log.Error("GetRepositoryByID failed:%v", err.Error())
  1006. return false
  1007. }
  1008. permission, _ := GetUserRepoPermission(repo, user)
  1009. if err != nil {
  1010. log.Error("GetUserRepoPermission failed:%v", err.Error())
  1011. return false
  1012. }
  1013. if (user.ID == job.UserID && permission.AccessMode >= AccessModeWrite) || user.IsAdmin || permission.AccessMode >= AccessModeAdmin {
  1014. return true
  1015. }
  1016. return false
  1017. }