You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

cloudbrain.go 31 kB

4 years ago
4 years ago
5 years ago
4 years ago
4 years ago
5 years ago
4 years ago
4 years ago
4 years ago
5 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
5 years ago
4 years ago
5 years ago
4 years ago
5 years ago
4 years ago
4 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago

  1. package models
  2. import (
  3. "encoding/json"
  4. "fmt"
  5. "strings"
  6. "time"
  7. "xorm.io/builder"
  8. "xorm.io/xorm"
  9. "code.gitea.io/gitea/modules/log"
  10. "code.gitea.io/gitea/modules/setting"
  11. "code.gitea.io/gitea/modules/timeutil"
  12. )
  // CloudbrainStatus is the string-typed status of a cloudbrain job
  // (see the Job* constants).
  type CloudbrainStatus string

  // JobType is the string-typed kind of a cloudbrain job
  // (see the JobType* constants).
  type JobType string

  // ModelArtsJobStatus is the string-typed status of a ModelArts job
  // (see the ModelArts* constants).
  type ModelArtsJobStatus string
  // Known values for CloudbrainStatus, JobType and ModelArtsJobStatus.
  const (
  	// Cloudbrain job statuses.
  	JobWaiting CloudbrainStatus = "WAITING"
  	JobStopped CloudbrainStatus = "STOPPED"
  	JobSucceeded CloudbrainStatus = "SUCCEEDED"
  	JobFailed CloudbrainStatus = "FAILED"
  	JobRunning CloudbrainStatus = "RUNNING"
  	// Job types.
  	JobTypeDebug JobType = "DEBUG"
  	JobTypeBenchmark JobType = "BENCHMARK"
  	JobTypeSnn4imagenet JobType = "SNN4IMAGENET"
  	// ModelArts job statuses.
  	ModelArtsCreateQueue ModelArtsJobStatus = "CREATE_QUEUING" // free resources: queuing for creation
  	ModelArtsCreating ModelArtsJobStatus = "CREATING" // being created
  	ModelArtsCreateFailed ModelArtsJobStatus = "CREATE_FAILED" // creation failed
  	ModelArtsStartQueuing ModelArtsJobStatus = "START_QUEUING" // free resources: queuing for start
  	ModelArtsReadyToStart ModelArtsJobStatus = "READY_TO_START" // free resources: waiting to start
  	ModelArtsStarting ModelArtsJobStatus = "STARTING" // starting
  	ModelArtsRestarting ModelArtsJobStatus = "RESTARTING" // restarting
  	ModelArtsStartFailed ModelArtsJobStatus = "START_FAILED" // start failed
  	ModelArtsRunning ModelArtsJobStatus = "RUNNING" // running
  	ModelArtsStopping ModelArtsJobStatus = "STOPPING" // stopping
  	ModelArtsStopped ModelArtsJobStatus = "STOPPED" // stopped
  	ModelArtsUnavailable ModelArtsJobStatus = "UNAVAILABLE" // faulty
  	ModelArtsDeleted ModelArtsJobStatus = "DELETED" // deleted
  	ModelArtsResizing ModelArtsJobStatus = "RESIZING" // specification change in progress
  	ModelArtsResizFailed ModelArtsJobStatus = "RESIZE_FAILED" // specification change failed
  )
  // Cloudbrain is the xorm-mapped record of a single cloudbrain job.
  // Fields tagged `xorm:"-"` are not mapped to a column and are filled in by
  // callers at runtime.
  type Cloudbrain struct {
  	ID int64 `xorm:"pk autoincr"`
  	JobID string `xorm:"INDEX NOT NULL"`
  	JobType string `xorm:"INDEX NOT NULL DEFAULT 'DEBUG'"`
  	JobName string `xorm:"INDEX"`
  	Status string `xorm:"INDEX"`
  	UserID int64 `xorm:"INDEX"`
  	RepoID int64 `xorm:"INDEX"`
  	SubTaskName string `xorm:"INDEX"`
  	ContainerID string
  	ContainerIp string
  	CreatedUnix timeutil.TimeStamp `xorm:"INDEX created"`
  	UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"`
  	Duration int `xorm:"INDEX duration"`
  	// DeletedAt carries xorm's "deleted" tag (soft-delete marker).
  	DeletedAt time.Time `xorm:"deleted"`
  	CanDebug bool `xorm:"-"`
  	CanDel bool `xorm:"-"`
  	Type int `xorm:"INDEX DEFAULT 0"`
  	VersionID int64 `xorm:"INDEX DEFAULT 0"`
  	VersionName string
  	Uuid string
  	User *User `xorm:"-"`
  	Repo *Repository `xorm:"-"`
  }
  // CloudbrainInfo is a Cloudbrain row together with the joined User row; both
  // are embedded with xorm's "extends" tag. Cloudbrains fills it from a left
  // join of the cloudbrain and user tables.
  type CloudbrainInfo struct {
  	Cloudbrain `xorm:"extends"`
  	User `xorm:"extends"`
  }
  // CloudBrainLoginResult holds a code, a message and a free-form payload;
  // presumably the reply to a cloudbrain login call (not shown in this file).
  type CloudBrainLoginResult struct {
  	Code string
  	Msg string
  	Payload map[string]interface{}
  }
  // TaskRole is the JSON description of one task role inside CreateJobParams:
  // its resources (CPU, GPU, memory, shared memory), command and task counts.
  type TaskRole struct {
  	Name string `json:"name"`
  	TaskNumber int `json:"taskNumber"`
  	MinSucceededTaskCount int `json:"minSucceededTaskCount"`
  	MinFailedTaskCount int `json:"minFailedTaskCount"`
  	CPUNumber int `json:"cpuNumber"`
  	GPUNumber int `json:"gpuNumber"`
  	MemoryMB int `json:"memoryMB"`
  	ShmMB int `json:"shmMB"`
  	Command string `json:"command"`
  	NeedIBDevice bool `json:"needIBDevice"`
  	IsMainRole bool `json:"isMainRole"`
  	UseNNI bool `json:"useNNI"`
  }
  // StHostPath is the JSON description of a host path mounted at MountPath,
  // optionally read-only.
  type StHostPath struct {
  	Path string `json:"path"`
  	MountPath string `json:"mountPath"`
  	ReadOnly bool `json:"readOnly"`
  }

  // Volume wraps a StHostPath under the "hostPath" JSON key; it is the element
  // type of CreateJobParams.Volumes.
  type Volume struct {
  	HostPath StHostPath `json:"hostPath"`
  }
  // CreateJobParams is the JSON body describing a job to create: name, retry
  // count, GPU type, image, task roles and volumes.
  type CreateJobParams struct {
  	JobName string `json:"jobName"`
  	RetryCount int8 `json:"retryCount"`
  	GpuType string `json:"gpuType"`
  	Image string `json:"image"`
  	TaskRoles []TaskRole `json:"taskRoles"`
  	Volumes []Volume `json:"volumes"`
  }
  // CreateJobResult is a code/msg/payload JSON envelope; presumably the reply
  // to a job-creation request.
  type CreateJobResult struct {
  	Code string `json:"code"`
  	Msg string `json:"msg"`
  	Payload map[string]interface{} `json:"payload"`
  }

  // GetJobResult is a code/msg/payload JSON envelope; presumably the reply to
  // a job query. Its untyped Payload has the shape that
  // ConvertToJobResultPayload accepts as input.
  type GetJobResult struct {
  	Code string `json:"code"`
  	Msg string `json:"msg"`
  	Payload map[string]interface{} `json:"payload"`
  }
  // GetImagesResult is a code/msg JSON envelope whose payload is a typed page
  // of images.
  type GetImagesResult struct {
  	Code string `json:"code"`
  	Msg string `json:"msg"`
  	Payload GetImagesPayload `json:"payload"`
  }

  // GetImagesPayload is one page of image records; the records are read from
  // the "rows" JSON key.
  type GetImagesPayload struct {
  	Count int `json:"count"`
  	TotalPages int `json:"totalPages,omitempty"`
  	ImageInfo []*ImageInfo `json:"rows"`
  }
  // CloudbrainsOptions carries the filters and paging used by Cloudbrains.
  // RepoID, UserID and JobID are applied only when greater than zero,
  // CloudbrainIDs only when non-empty, and Type whenever it is >= 0.
  type CloudbrainsOptions struct {
  	ListOptions
  	RepoID int64 // include all repos if empty
  	UserID int64
  	JobID int64
  	SortType string
  	CloudbrainIDs []int64
  	// JobStatus CloudbrainStatus
  	Type int
  }
  // TaskPod is the JSON description of one task role and the status of each
  // of its tasks, as decoded by ConvertToTaskPod.
  type TaskPod struct {
  	TaskRoleStatus struct {
  		Name string `json:"name"`
  	} `json:"taskRoleStatus"`
  	TaskStatuses []struct {
  		TaskIndex int `json:"taskIndex"`
  		PodUID string `json:"podUid"`
  		PodIP string `json:"podIp"`
  		PodName string `json:"podName"`
  		ContainerID string `json:"containerId"`
  		ContainerIP string `json:"containerIp"`
  		ContainerGpus string `json:"containerGpus"`
  		State string `json:"state"`
  		StartAt time.Time `json:"startAt"`
  		FinishedAt time.Time `json:"finishedAt"`
  		ExitCode int `json:"exitCode"`
  		ExitDiagnostics string `json:"exitDiagnostics"`
  		RetriedCount int `json:"retriedCount"`
  		// StartTime and FinishedTime are display strings that
  		// ConvertToTaskPod derives from StartAt and FinishedAt for the
  		// first entry.
  		StartTime string
  		FinishedTime string
  	} `json:"taskStatuses"`
  }

  // TaskInfo is the JSON description of a task: user name, task and code
  // names, selected benchmark categories, code link and GPU type.
  type TaskInfo struct {
  	Username string `json:"username"`
  	TaskName string `json:"task_name"`
  	CodeName string `json:"code_name"`
  	BenchmarkCategory []string `json:"selected_category"`
  	CodeLink string `json:"code_link"`
  	GpuType string `json:"gpu_type"`
  }

  // ConvertToTaskPod re-encodes input as JSON and decodes it into a TaskPod.
  // For the first task status it also fills StartTime and FinishedTime from
  // StartAt and FinishedAt, shifted by a fixed +8 hours and formatted as
  // "2006-01-02 15:04:05". Only the first task status is processed.
  //
  // It returns an error when input cannot be encoded or decoded, or when it
  // contains no task status (previously that case panicked with an index out
  // of range). On error the returned TaskPod must not be used.
  func ConvertToTaskPod(input map[string]interface{}) (TaskPod, error) {
  	const (
  		layout = "2006-01-02 15:04:05"
  		// displayOffset is the fixed shift, in seconds, applied before
  		// formatting (UTC+8).
  		displayOffset = 8 * 3600
  	)

  	data, err := json.Marshal(input)
  	if err != nil {
  		return TaskPod{}, err
  	}

  	var taskPod TaskPod
  	if err := json.Unmarshal(data, &taskPod); err != nil {
  		return TaskPod{}, err
  	}
  	if len(taskPod.TaskStatuses) == 0 {
  		return TaskPod{}, fmt.Errorf("task pod contains no task statuses")
  	}

  	first := &taskPod.TaskStatuses[0]
  	first.StartTime = time.Unix(first.StartAt.Unix()+displayOffset, 0).UTC().Format(layout)
  	first.FinishedTime = time.Unix(first.FinishedAt.Unix()+displayOffset, 0).UTC().Format(layout)
  	// If the task is not finished or stopped, cloudbrain returns the zero
  	// time, which formats as 0001-01-01 08:00:00; show "-" instead.
  	if strings.HasPrefix(first.FinishedTime, "0001") {
  		first.FinishedTime = "-"
  	}
  	return taskPod, nil
  }
  // JobResultPayload is the typed form of a job-query payload: identity, job
  // status, resources, the submitted configuration and user info.
  type JobResultPayload struct {
  	ID string `json:"id"`
  	Name string `json:"name"`
  	Platform string `json:"platform"`
  	JobStatus struct {
  		Username string `json:"username"`
  		State string `json:"state"`
  		SubState string `json:"subState"`
  		ExecutionType string `json:"executionType"`
  		Retries int `json:"retries"`
  		CreatedTime int64 `json:"createdTime"`
  		CompletedTime int64 `json:"completedTime"`
  		AppID string `json:"appId"`
  		AppProgress string `json:"appProgress"`
  		AppTrackingURL string `json:"appTrackingUrl"`
  		AppLaunchedTime int64 `json:"appLaunchedTime"`
  		AppCompletedTime interface{} `json:"appCompletedTime"`
  		AppExitCode int `json:"appExitCode"`
  		AppExitDiagnostics string `json:"appExitDiagnostics"`
  		AppExitType interface{} `json:"appExitType"`
  		VirtualCluster string `json:"virtualCluster"`
  		// StartTime and EndTime are display strings that
  		// ConvertToJobResultPayload derives from CreatedTime and
  		// CompletedTime.
  		StartTime string
  		EndTime string
  	} `json:"jobStatus"`
  	TaskRoles map[string]interface{} `json:"taskRoles"`
  	Resource struct {
  		CPU int `json:"cpu"`
  		Memory string `json:"memory"`
  		NvidiaComGpu int `json:"nvidia.com/gpu"`
  	} `json:"resource"`
  	Config struct {
  		Image string `json:"image"`
  		JobID string `json:"jobId"`
  		GpuType string `json:"gpuType"`
  		JobName string `json:"jobName"`
  		JobType string `json:"jobType"`
  		TaskRoles []struct {
  			Name string `json:"name"`
  			ShmMB int `json:"shmMB"`
  			Command string `json:"command"`
  			MemoryMB int `json:"memoryMB"`
  			CPUNumber int `json:"cpuNumber"`
  			GpuNumber int `json:"gpuNumber"`
  			IsMainRole bool `json:"isMainRole"`
  			TaskNumber int `json:"taskNumber"`
  			NeedIBDevice bool `json:"needIBDevice"`
  			MinFailedTaskCount int `json:"minFailedTaskCount"`
  			MinSucceededTaskCount int `json:"minSucceededTaskCount"`
  		} `json:"taskRoles"`
  		RetryCount int `json:"retryCount"`
  	} `json:"config"`
  	Userinfo struct {
  		User string `json:"user"`
  		OrgID string `json:"org_id"`
  	} `json:"userinfo"`
  }

  // ConvertToJobResultPayload re-encodes input as JSON and decodes it into a
  // JobResultPayload. It then fills JobStatus.StartTime and JobStatus.EndTime
  // from CreatedTime and CompletedTime, which are divided by 1000 before being
  // handed to time.Unix (i.e. treated as milliseconds) and formatted as
  // "2006-01-02 15:04:05" in the process's local time zone.
  //
  // Encoding and decoding errors are returned together with a zero payload;
  // previously the encoding error was discarded and the time strings were
  // filled in even after a failed decode.
  func ConvertToJobResultPayload(input map[string]interface{}) (JobResultPayload, error) {
  	const layout = "2006-01-02 15:04:05"

  	data, err := json.Marshal(input)
  	if err != nil {
  		return JobResultPayload{}, err
  	}

  	var jobResultPayload JobResultPayload
  	if err := json.Unmarshal(data, &jobResultPayload); err != nil {
  		return JobResultPayload{}, err
  	}

  	status := &jobResultPayload.JobStatus
  	status.StartTime = time.Unix(status.CreatedTime/1000, 0).Format(layout)
  	status.EndTime = time.Unix(status.CompletedTime/1000, 0).Format(layout)
  	return jobResultPayload, nil
  }
  // ImagesResultPayload is a list of image records.
  // NOTE(review): Images is read from the "taskStatuses" JSON key, which looks
  // copied from TaskPod — confirm against the API before changing the tag.
  type ImagesResultPayload struct {
  	Images []struct {
  		ID int `json:"id"`
  		Name string `json:"name"`
  		Place string `json:"place"`
  		Description string `json:"description"`
  		Provider string `json:"provider"`
  		Createtime string `json:"createtime"`
  		Remark string `json:"remark"`
  	} `json:"taskStatuses"`
  }
  // ImageInfo is the JSON record of a single image; it is the element type of
  // GetImagesPayload.ImageInfo. PlaceView has no json tag and is presumably
  // filled in by callers for display.
  type ImageInfo struct {
  	ID int `json:"id"`
  	Name string `json:"name"`
  	Place string `json:"place"`
  	Description string `json:"description"`
  	Provider string `json:"provider"`
  	Createtime string `json:"createtime"`
  	Remark string `json:"remark"`
  	IsPublic int `json:"isPublic"`
  	PlaceView string
  }
  // Categories is a list of Category entries under the "category" JSON key.
  type Categories struct {
  	Category []*Category `json:"category"`
  }

  // Category is one id/value pair of Categories.
  type Category struct {
  	Id int `json:"id"`
  	Value string `json:"value"`
  }
  // GpuInfos is a list of GpuInfo entries under the "gpu_type" JSON key.
  type GpuInfos struct {
  	GpuInfo []*GpuInfo `json:"gpu_type"`
  }

  // GpuInfo is one GPU type entry: an id, a value and a queue name.
  type GpuInfo struct {
  	Id int `json:"id"`
  	Value string `json:"value"`
  	Queue string `json:"queue"`
  }
  // ResourceSpecs is a list of ResourceSpec entries.
  // NOTE(review): the JSON key is spelled "resorce_specs" (sic); it has to
  // match whatever produces the JSON, so confirm before correcting it.
  type ResourceSpecs struct {
  	ResourceSpec []*ResourceSpec `json:"resorce_specs"`
  }

  // ResourceSpec is one resource specification: CPU count, GPU count, memory
  // and shared memory (both in MiB, per the field names).
  type ResourceSpec struct {
  	Id int `json:"id"`
  	CpuNum int `json:"cpu"`
  	GpuNum int `json:"gpu"`
  	MemMiB int `json:"memMiB"`
  	ShareMemMiB int `json:"shareMemMiB"`
  }
  // FlavorInfos is a list of FlavorInfo entries under the "flavor_info" JSON
  // key.
  type FlavorInfos struct {
  	FlavorInfo []*FlavorInfo `json:"flavor_info"`
  }

  // FlavorInfo is one id/value pair of FlavorInfos.
  type FlavorInfo struct {
  	Id int `json:"id"`
  	Value string `json:"value"`
  }
  // PoolInfos is a list of PoolInfo entries under the "pool_info" JSON key.
  type PoolInfos struct {
  	PoolInfo []*PoolInfo `json:"pool_info"`
  }

  // PoolInfo is one resource pool entry: its id, name and type.
  type PoolInfo struct {
  	PoolId string `json:"pool_id"`
  	PoolName string `json:"pool_name"`
  	PoolType string `json:"pool_type"`
  }
  // CommitImageParams is the JSON body naming the container (IP and container
  // id) to commit as an image, plus the new image's tag and description.
  type CommitImageParams struct {
  	Ip string `json:"ip"`
  	TaskContainerId string `json:"taskContainerId"`
  	ImageTag string `json:"imageTag"`
  	ImageDescription string `json:"imageDescription"`
  }

  // CommitImageResult is a code/msg/payload JSON envelope; presumably the
  // reply to an image commit.
  type CommitImageResult struct {
  	Code string `json:"code"`
  	Msg string `json:"msg"`
  	Payload map[string]interface{} `json:"payload"`
  }

  // CloudBrainResult is a bare code/msg JSON envelope without a payload.
  type CloudBrainResult struct {
  	Code string `json:"code"`
  	Msg string `json:"msg"`
  }
  // CreateNotebookParams is the JSON body describing a notebook to create:
  // name, description, profile, flavor, storage/auto-stop spec, workspace and
  // resource pool.
  type CreateNotebookParams struct {
  	JobName string `json:"name"`
  	Description string `json:"description"`
  	ProfileID string `json:"profile_id"`
  	Flavor string `json:"flavor"`
  	Spec Spec `json:"spec"`
  	Workspace Workspace `json:"workspace"`
  	Pool Pool `json:"pool"`
  }

  // Pool identifies a resource pool by id, name and type.
  type Pool struct {
  	ID string `json:"id"`
  	Name string `json:"name"`
  	Type string `json:"type"`
  }

  // Workspace identifies a workspace by id.
  type Workspace struct {
  	ID string `json:"id"`
  }

  // Spec groups the storage and auto-stop settings of a notebook.
  type Spec struct {
  	Storage Storage `json:"storage"`
  	AutoStop AutoStop `json:"auto_stop"`
  }

  // AutoStop says whether auto-stop is enabled and for what duration.
  // NOTE(review): the unit of Duration is not stated here; the "duration"
  // field of the notebook results is commented as seconds — confirm.
  type AutoStop struct {
  	Enable bool `json:"enable"`
  	Duration int `json:"duration"`
  }

  // Storage describes a storage backend by type and location.
  type Storage struct {
  	Type string `json:"type"`
  	Location Location `json:"location"`
  }

  // Location is a storage location given as a path.
  type Location struct {
  	Path string `json:"path"`
  }
  // NotebookResult is a bare error_code/error_msg JSON envelope.
  type NotebookResult struct {
  	ErrorCode string `json:"error_code"`
  	ErrorMsg string `json:"error_msg"`
  }
  // CreateNotebookResult is the JSON shape of a created notebook: error
  // fields, identity, status, timestamps, profile and flavor details.
  type CreateNotebookResult struct {
  	ErrorCode string `json:"error_code"`
  	ErrorMsg string `json:"error_msg"`
  	ID string `json:"id"`
  	Name string `json:"name"`
  	Description string `json:"description"`
  	Status string `json:"status"`
  	CreationTimestamp string `json:"creation_timestamp"`
  	LatestUpdateTimestamp string `json:"latest_update_timestamp"`
  	Profile struct {
  		ID string `json:"id"`
  		Name string `json:"name"`
  		Description string `json:"description"`
  		DeType string `json:"de_type"`
  		FlavorType string `json:"flavor_type"`
  	} `json:"profile"`
  	Flavor string `json:"flavor"`
  	FlavorDetails struct {
  		Name string `json:"name"`
  		Status string `json:"status"`
  		QueuingNum int `json:"queuing_num"`
  		QueueLeftTime int `json:"queue_left_time"` // seconds
  		Duration int `json:"duration"` // auto-stop time, in seconds
  	} `json:"flavor_details"`
  }
  // GetNotebookResult is the JSON shape of a notebook's details: the fields of
  // CreateNotebookResult plus queuing information and spec annotations.
  // CreateTime, LatestUpdateTime, BeginTime and EndTime have no json tag;
  // presumably callers fill them with formatted versions of the neighbouring
  // timestamp fields.
  type GetNotebookResult struct {
  	ErrorCode string `json:"error_code"`
  	ErrorMsg string `json:"error_msg"`
  	ID string `json:"id"`
  	Name string `json:"name"`
  	Description string `json:"description"`
  	Status string `json:"status"`
  	CreationTimestamp string `json:"creation_timestamp"`
  	CreateTime string
  	LatestUpdateTimestamp string `json:"latest_update_timestamp"`
  	LatestUpdateTime string
  	Profile struct {
  		ID string `json:"id"`
  		Name string `json:"name"`
  		Description string `json:"description"`
  		DeType string `json:"de_type"`
  		FlavorType string `json:"flavor_type"`
  	} `json:"profile"`
  	Flavor string `json:"flavor"`
  	FlavorDetails struct {
  		Name string `json:"name"`
  		Status string `json:"status"`
  		QueuingNum int `json:"queuing_num"`
  		QueueLeftTime int `json:"queue_left_time"` // seconds
  		Duration int `json:"duration"` // auto-stop time, in seconds
  	} `json:"flavor_details"`
  	QueuingInfo struct {
  		ID string `json:"id"`
  		Name string `json:"name"`
  		Flavor string `json:"flavor"`
  		DeType string `json:"de_type"`
  		Status string `json:"status"`
  		BeginTimestamp int `json:"begin_timestamp"` // time at which the instance entered the queue
  		BeginTime string
  		RemainTime int `json:"remain_time"` // remaining time of the instance
  		EndTimestamp int `json:"end_timestamp"`
  		EndTime string
  		Rank int `json:"rank"` // rank of the instance in the queue
  	} `json:"queuing_info"`
  	Spec struct {
  		Annotations struct {
  			TargetDomain string `json:"target_domain"`
  			Url string `json:"url"`
  		} `json:"annotations"`
  	} `json:"spec"`
  }
  // GetTokenParams is the JSON body of a token request; everything is nested
  // under the "auth" key.
  type GetTokenParams struct {
  	Auth Auth `json:"auth"`
  }

  // Auth pairs the identity being authenticated with the requested scope.
  type Auth struct {
  	Identity Identity `json:"identity"`
  	Scope Scope `json:"scope"`
  }

  // Scope restricts a token request to a project.
  type Scope struct {
  	Project Project `json:"project"`
  }

  // Project names the project of a Scope.
  type Project struct {
  	Name string `json:"name"`
  }

  // Identity lists the authentication methods and the password credentials.
  type Identity struct {
  	Methods []string `json:"methods"`
  	Password Password `json:"password"`
  }

  // Password wraps the user whose credentials are presented.
  type Password struct {
  	User NotebookUser `json:"user"`
  }

  // NotebookUser holds a user's name, password and domain.
  type NotebookUser struct {
  	Name string `json:"name"`
  	Password string `json:"password"`
  	Domain Domain `json:"domain"`
  }

  // Domain names the domain of a NotebookUser.
  type Domain struct {
  	Name string `json:"name"`
  }
  // Action names; presumably the accepted values of NotebookAction.Action.
  const (
  	ActionStart = "start"
  	ActionStop = "stop"
  	ActionRestart = "restart"
  	ActionQueue = "queue"
  	ActionDequeue = "dequeue"
  )
  // NotebookAction is the JSON body naming an action to perform on a notebook.
  type NotebookAction struct {
  	Action string `json:"action"`
  }

  // NotebookActionResult is the JSON shape of an action's outcome: error
  // fields plus the current status and the previous state.
  type NotebookActionResult struct {
  	ErrorCode string `json:"error_code"`
  	ErrorMsg string `json:"error_msg"`
  	CurrentStatus string `json:"current_status"`
  	PreviousState string `json:"previous_state"`
  }

  // NotebookGetJobTokenResult is the JSON shape carrying a token alongside
  // the usual error fields.
  type NotebookGetJobTokenResult struct {
  	ErrorCode string `json:"error_code"`
  	ErrorMsg string `json:"error_msg"`
  	Token string `json:"token"`
  }

  // NotebookDelResult is the JSON shape carrying the id of an instance;
  // presumably the reply to a notebook deletion.
  type NotebookDelResult struct {
  	InstanceID string `json:"instance_id"`
  }
  // CreateTrainJobParams is the JSON body describing a training job to create:
  // name, description, run configuration and workspace id.
  type CreateTrainJobParams struct {
  	JobName string `json:"job_name"`
  	Description string `json:"job_desc"`
  	Config Config `json:"config"`
  	WorkspaceID string `json:"workspace_id"`
  }
  // Config is the run configuration embedded in CreateTrainJobParams. The
  // commented-out fields are kept as they were left in the original source.
  type Config struct {
  	WorkServerNum int `json:"worker_server_num"`
  	AppUrl string `json:"app_url"` // code directory of the training job
  	BootFileUrl string `json:"boot_file_url"` // boot file of the training job; must be under the code directory
  	Parameter []Parameter `json:"parameter"`
  	DataUrl string `json:"data_url"` // OBS path URL of the dataset the training job needs
  	//DatasetID string `json:"dataset_id"`
  	//DataVersionID string `json:"dataset_version_id"`
  	//DataSource []DataSource `json:"data_source"`
  	//SpecID int64 `json:"spec_id"`
  	EngineID int64 `json:"engine_id"`
  	//ModelID int64 `json:"model_id"`
  	TrainUrl string `json:"train_url"` // OBS path URL for the training job's output files
  	LogUrl string `json:"log_url"`
  	//UserImageUrl string `json:"user_image_url"`
  	//UserCommand string `json:"user_command"`
  	CreateVersion bool `json:"create_version"`
  	//Volumes []Volumes `json:"volumes"`
  	Flavor Flavor `json:"flavor"`
  	PoolID string `json:"pool_id"`
  }
  // CreateConfigParams is the JSON body describing a named training-job
  // configuration. It has the same run settings as Config, plus a name and a
  // description, and without create_version.
  type CreateConfigParams struct {
  	ConfigName string `json:"config_name"`
  	Description string `json:"config_desc"`
  	WorkServerNum int `json:"worker_server_num"`
  	AppUrl string `json:"app_url"` // code directory of the training job
  	BootFileUrl string `json:"boot_file_url"` // boot file of the training job; must be under the code directory
  	Parameter []Parameter `json:"parameter"`
  	DataUrl string `json:"data_url"` // OBS path URL of the dataset the training job needs
  	//DatasetID string `json:"dataset_id"`
  	//DataVersionID string `json:"dataset_version_id"`
  	//DataSource []DataSource `json:"data_source"`
  	//SpecID int64 `json:"spec_id"`
  	EngineID int64 `json:"engine_id"`
  	//ModelID int64 `json:"model_id"`
  	TrainUrl string `json:"train_url"` // OBS path URL for the training job's output files
  	LogUrl string `json:"log_url"`
  	//UserImageUrl string `json:"user_image_url"`
  	//UserCommand string `json:"user_command"`
  	//CreateVersion bool `json:"create_version"`
  	//Volumes []Volumes `json:"volumes"`
  	Flavor Flavor `json:"flavor"`
  	PoolID string `json:"pool_id"`
  }
  // Parameter is one label/value pair passed to a training job.
  type Parameter struct {
  	Label string `json:"label"`
  	Value string `json:"value"`
  }

  // Parameters is a list of Parameter entries under the "parameter" JSON key.
  type Parameters struct {
  	Parameter []Parameter `json:"parameter"`
  }
  // DataSource is the JSON description of a dataset source: dataset id and
  // version, a type and a data URL. The only references to it in this file
  // are commented-out fields.
  type DataSource struct {
  	DatasetID string `json:"dataset_id"`
  	DatasetVersion string `json:"dataset_version"`
  	Type string `json:"type"`
  	DataUrl string `json:"data_url"`
  }
  // Volumes pairs an NFS mount with a host-path mount. The only references to
  // it in this file are commented-out fields.
  type Volumes struct {
  	Nfs Nfs `json:"nfs"`
  	HostPath HostPath `json:"host_path"`
  }

  // Nfs is the JSON description of an NFS mount: id, source path, destination
  // path and a read-only flag.
  type Nfs struct {
  	ID string `json:"id"`
  	SourcePath string `json:"src_path"`
  	DestPath string `json:"dest_path"`
  	ReadOnly bool `json:"read_only"`
  }

  // HostPath is the JSON description of a host-path mount: source path,
  // destination path and a read-only flag.
  type HostPath struct {
  	SourcePath string `json:"src_path"`
  	DestPath string `json:"dest_path"`
  	ReadOnly bool `json:"read_only"`
  }
  // Flavor identifies a resource flavor by its code.
  type Flavor struct {
  	Code string `json:"code"`
  }
  // CreateTrainJobResult is the JSON shape of a created training job: error
  // fields, a success flag, and the job and version identifiers.
  type CreateTrainJobResult struct {
  	ErrorCode string `json:"error_code"`
  	ErrorMsg string `json:"error_msg"`
  	IsSuccess bool `json:"is_success"`
  	JobName string `json:"job_name"`
  	JobID int64 `json:"job_id"`
  	Status int `json:"status"`
  	CreateTime int64 `json:"create_time"`
  	VersionID int64 `json:"version_id"`
  	ResourceID string `json:"resource_id"`
  	VersionName string `json:"version_name"`
  }

  // CreateTrainJobConfigResult is an error_code/error_msg/is_success JSON
  // envelope; presumably the reply to creating a job configuration.
  type CreateTrainJobConfigResult struct {
  	ErrorCode string `json:"error_code"`
  	ErrorMsg string `json:"error_msg"`
  	IsSuccess bool `json:"is_success"`
  }
  // GetResourceSpecsResult is the JSON shape of a resource-spec listing: error
  // fields, a success flag, the total count and the specs themselves.
  type GetResourceSpecsResult struct {
  	ErrorCode string `json:"error_code"`
  	ErrorMsg string `json:"error_msg"`
  	IsSuccess bool `json:"is_success"`
  	SpecTotalCount int `json:"spec_total_count"`
  	Specs []Specs `json:"specs"`
  }

  // Specs is one resource specification entry of GetResourceSpecsResult.
  type Specs struct {
  	Core string `json:"core"`
  	Cpu string `json:"cpu"`
  	IsNoResource bool `json:"no_resource"`
  	GpuType string `json:"gpu_type"`
  	SpecID int64 `json:"spec_id"`
  	GpuNum int `json:"gpu_num"`
  	SpecCode string `json:"spec_code"`
  	Storage string `json:"storage"`
  	MaxNum int `json:"max_num"`
  	UnitNum int `json:"unit_num"`
  	InterfaceType int `json:"interface_type"`
  }
  // GetConfigListResult is the JSON shape of a configuration listing: error
  // fields, a success flag, the total count and the configurations (read from
  // the "configs" key).
  type GetConfigListResult struct {
  	ErrorCode string `json:"error_code"`
  	ErrorMsg string `json:"error_msg"`
  	IsSuccess bool `json:"is_success"`
  	ConfigTotalCount int `json:"config_total_count"`
  	ParaConfigs []ParaConfig `json:"configs"`
  }

  // ParaConfig is one entry of GetConfigListResult. Result has no json tag;
  // presumably callers attach the configuration's full details there.
  type ParaConfig struct {
  	ConfigName string `json:"config_name"`
  	ConfigDesc string `json:"config_desc"`
  	CreateTime int64 `json:"create_time"`
  	EngineType int `json:"engine_type"`
  	EngineName string `json:"engine_name"`
  	EngineId int64 `json:"engine_id"`
  	EngineVersion string `json:"engine_version"`
  	UserImageUrl string `json:"user_image_url"`
  	UserCommand string `json:"user_command"`
  	Result GetConfigResult
  }
  // GetConfigResult is the JSON shape of one training-job configuration: the
  // fields of CreateConfigParams preceded by error fields and a success flag.
  type GetConfigResult struct {
  	ErrorCode string `json:"error_code"`
  	ErrorMsg string `json:"error_msg"`
  	IsSuccess bool `json:"is_success"`
  	ConfigName string `json:"config_name"`
  	Description string `json:"config_desc"`
  	WorkServerNum int `json:"worker_server_num"`
  	AppUrl string `json:"app_url"` // code directory of the training job
  	BootFileUrl string `json:"boot_file_url"` // boot file of the training job; must be under the code directory
  	Parameter []Parameter `json:"parameter"`
  	DataUrl string `json:"data_url"` // OBS path URL of the dataset the training job needs
  	//DatasetID string `json:"dataset_id"`
  	//DataVersionID string `json:"dataset_version_id"`
  	//DataSource []DataSource `json:"data_source"`
  	//SpecID int64 `json:"spec_id"`
  	EngineID int64 `json:"engine_id"`
  	//ModelID int64 `json:"model_id"`
  	TrainUrl string `json:"train_url"` // OBS path URL for the training job's output files
  	LogUrl string `json:"log_url"`
  	//UserImageUrl string `json:"user_image_url"`
  	//UserCommand string `json:"user_command"`
  	//CreateVersion bool `json:"create_version"`
  	//Volumes []Volumes `json:"volumes"`
  	Flavor Flavor `json:"flavor"`
  	PoolID string `json:"pool_id"`
  }
  // ErrorResult is an error envelope with a success flag.
  // NOTE(review): the message is read from "error_message" here, whereas the
  // other result types in this file use "error_msg" — confirm which key the
  // API sends.
  type ErrorResult struct {
  	ErrorCode string `json:"error_code"`
  	ErrorMsg string `json:"error_message"`
  	IsSuccess bool `json:"is_success"`
  }
  // GetTrainJobResult is the JSON shape of a training job's details. Status,
  // CreateTime, TrainJobDuration and DatasetName have no json tag; presumably
  // callers derive them from IntStatus, LongCreateTime and Duration, or look
  // them up elsewhere.
  type GetTrainJobResult struct {
  	IsSuccess bool `json:"is_success"`
  	JobName string `json:"job_name"`
  	JobID int64 `json:"job_id"`
  	Description string `json:"job_desc"`
  	IntStatus int `json:"status"`
  	Status string
  	LongCreateTime int64 `json:"create_time"`
  	CreateTime string
  	Duration int64 `json:"duration"` // running time of the training job, in milliseconds
  	TrainJobDuration string // running time of the training job, formatted as hh:mm:ss
  	VersionID int64 `json:"version_id"`
  	ResourceID string `json:"resource_id"`
  	VersionName string `json:"version_name"`
  	PreVersionID int64 `json:"pre_version_id"`
  	WorkServerNum int `json:"worker_server_num"`
  	AppUrl string `json:"app_url"` // code directory of the training job
  	BootFileUrl string `json:"boot_file_url"` // boot file of the training job; must be under the code directory
  	Parameter []Parameter `json:"parameter"`
  	DataUrl string `json:"data_url"` // OBS path URL of the dataset the training job needs
  	//DatasetID string `json:"dataset_id"`
  	//DataVersionID string `json:"dataset_version_id"`
  	//DataSource []DataSource `json:"data_source"`
  	//SpecID int64 `json:"spec_id"`
  	EngineID int64 `json:"engine_id"`
  	EngineName string `json:"engine_name"`
  	EngineVersion string `json:"engine_version"`
  	//ModelID int64 `json:"model_id"`
  	TrainUrl string `json:"train_url"` // OBS path URL for the training job's output files
  	LogUrl string `json:"log_url"`
  	//UserImageUrl string `json:"user_image_url"`
  	//UserCommand string `json:"user_command"`
  	//Volumes []Volumes `json:"volumes"`
  	Flavor Flavor `json:"flavor"`
  	PoolID string `json:"pool_id"`
  	PoolName string `json:"pool_name"`
  	NasMountPath string `json:"nas_mount_path"`
  	NasShareAddr string `json:"nas_share_addr"`
  	DatasetName string
  }
  // GetTrainJobLogResult is the JSON shape of a slice of a training job's
  // log: the content, its line count and the start/end line markers.
  type GetTrainJobLogResult struct {
  	ErrorCode string `json:"error_code"`
  	ErrorMsg string `json:"error_msg"`
  	IsSuccess bool `json:"is_success"`
  	Content string `json:"content"`
  	Lines int `json:"lines"`
  	StartLine string `json:"start_line"`
  	EndLine string `json:"end_line"`
  }

  // GetTrainJobLogFileNamesResult is the JSON shape listing the names of a
  // training job's log files.
  type GetTrainJobLogFileNamesResult struct {
  	ErrorCode string `json:"error_code"`
  	ErrorMsg string `json:"error_msg"`
  	IsSuccess bool `json:"is_success"`
  	LogFileList []string `json:"log_file_list"`
  }

  // TrainJobResult is a bare error_code/error_msg/is_success JSON envelope.
  type TrainJobResult struct {
  	ErrorCode string `json:"error_code"`
  	ErrorMsg string `json:"error_msg"`
  	IsSuccess bool `json:"is_success"`
  }

  // LogFile names a single log file.
  type LogFile struct {
  	Name string
  }
  707. func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) {
  708. sess := x.NewSession()
  709. defer sess.Close()
  710. var cond = builder.NewCond()
  711. if opts.RepoID > 0 {
  712. cond = cond.And(
  713. builder.Eq{"cloudbrain.repo_id": opts.RepoID},
  714. )
  715. }
  716. if opts.UserID > 0 {
  717. cond = cond.And(
  718. builder.Eq{"cloudbrain.user_id": opts.UserID},
  719. )
  720. }
  721. if (opts.JobID) > 0 {
  722. cond = cond.And(
  723. builder.Eq{"cloudbrain.job_id": opts.JobID},
  724. )
  725. }
  726. if (opts.Type) >= 0 {
  727. cond = cond.And(
  728. builder.Eq{"cloudbrain.type": opts.Type},
  729. )
  730. }
  731. // switch opts.JobStatus {
  732. // case JobWaiting:
  733. // cond.And(builder.Eq{"cloudbrain.status": int(JobWaiting)})
  734. // case JobFailed:
  735. // cond.And(builder.Eq{"cloudbrain.status": int(JobFailed)})
  736. // case JobStopped:
  737. // cond.And(builder.Eq{"cloudbrain.status": int(JobStopped)})
  738. // case JobSucceeded:
  739. // cond.And(builder.Eq{"cloudbrain.status": int(JobSucceeded)})
  740. // }
  741. if len(opts.CloudbrainIDs) > 0 {
  742. cond = cond.And(builder.In("cloudbrain.id", opts.CloudbrainIDs))
  743. }
  744. count, err := sess.Where(cond).Count(new(Cloudbrain))
  745. if err != nil {
  746. return nil, 0, fmt.Errorf("Count: %v", err)
  747. }
  748. if opts.Page >= 0 && opts.PageSize > 0 {
  749. var start int
  750. if opts.Page == 0 {
  751. start = 0
  752. } else {
  753. start = (opts.Page - 1) * opts.PageSize
  754. }
  755. sess.Limit(opts.PageSize, start)
  756. }
  757. sess.OrderBy("cloudbrain.created_unix DESC")
  758. cloudbrains := make([]*CloudbrainInfo, 0, setting.UI.IssuePagingNum)
  759. if err := sess.Table(&Cloudbrain{}).Where(cond).
  760. Join("left", "`user`", "cloudbrain.user_id = `user`.id").
  761. Find(&cloudbrains); err != nil {
  762. return nil, 0, fmt.Errorf("Find: %v", err)
  763. }
  764. sess.Close()
  765. return cloudbrains, count, nil
  766. }
  767. func CreateCloudbrain(cloudbrain *Cloudbrain) (err error) {
  768. if _, err = x.Insert(cloudbrain); err != nil {
  769. return err
  770. }
  771. return nil
  772. }
  773. func getRepoCloudBrain(cb *Cloudbrain) (*Cloudbrain, error) {
  774. has, err := x.Get(cb)
  775. if err != nil {
  776. return nil, err
  777. } else if !has {
  778. return nil, ErrJobNotExist{}
  779. }
  780. return cb, nil
  781. }
  782. func GetRepoCloudBrainByJobID(repoID int64, jobID string) (*Cloudbrain, error) {
  783. cb := &Cloudbrain{JobID: jobID, RepoID: repoID}
  784. return getRepoCloudBrain(cb)
  785. }
  786. func GetCloudbrainByJobID(jobID string) (*Cloudbrain, error) {
  787. cb := &Cloudbrain{JobID: jobID}
  788. return getRepoCloudBrain(cb)
  789. }
  790. func GetCloudbrainsNeededStopByUserID(userID int64) ([]*Cloudbrain, error) {
  791. cloudBrains := make([]*Cloudbrain, 0)
  792. err := x.Cols("job_id", "status", "type").Where("user_id=? AND status !=?", userID, string(JobStopped)).Find(&cloudBrains)
  793. return cloudBrains, err
  794. }
  795. func GetCloudbrainsNeededStopByRepoID(repoID int64) ([]*Cloudbrain, error) {
  796. cloudBrains := make([]*Cloudbrain, 0)
  797. err := x.Cols("job_id", "status", "type").Where("repo_id=? AND status !=?", repoID, string(JobStopped)).Find(&cloudBrains)
  798. return cloudBrains, err
  799. }
  800. func SetCloudbrainStatusByJobID(jobID string, status CloudbrainStatus) (err error) {
  801. cb := &Cloudbrain{JobID: jobID, Status: string(status)}
  802. _, err = x.Cols("status").Where("cloudbrain.job_id=?", jobID).Update(cb)
  803. return
  804. }
  805. func SetTrainJobStatusByJobID(jobID string, status string, duration int) (err error) {
  806. cb := &Cloudbrain{JobID: jobID, Status: string(status), Duration: duration}
  807. _, err = x.Cols("status", "duration").Where("cloudbrain.job_id=?", jobID).Update(cb)
  808. return
  809. }
  810. func UpdateJob(job *Cloudbrain) error {
  811. return updateJob(x, job)
  812. }
  813. func updateJob(e Engine, job *Cloudbrain) error {
  814. var sess *xorm.Session
  815. sess = e.Where("job_id = ?", job.JobID)
  816. _, err := sess.Cols("status", "container_id", "container_ip").Update(job)
  817. return err
  818. }
  819. // func UpdateTrainJob(job *CloudbrainInfo) error {
  820. // return updateTrainJob(x, job)
  821. // }
  822. // func updateTrainJob(e Engine, job *CloudbrainInfo) error {
  823. // var sess *xorm.Session
  824. // sess = e.Where("job_id = ?", job.Cloudbrain.JobID)
  825. // _, err := sess.Cols("status", "container_id", "container_ip").Update(job)
  826. // return err
  827. // }
  828. func DeleteJob(job *Cloudbrain) error {
  829. return deleteJob(x, job)
  830. }
  831. func deleteJob(e Engine, job *Cloudbrain) error {
  832. _, err := e.ID(job.ID).Delete(job)
  833. return err
  834. }
  835. func GetCloudbrainByName(jobName string) (*Cloudbrain, error) {
  836. cb := &Cloudbrain{JobName: jobName}
  837. return getRepoCloudBrain(cb)
  838. }
  839. func CanDelJob(isSigned bool, user *User, job *CloudbrainInfo) bool {
  840. if !isSigned || job.Status != string(JobStopped) {
  841. return false
  842. }
  843. repo, err := GetRepositoryByID(job.RepoID)
  844. if err != nil {
  845. log.Error("GetRepositoryByID failed:%v", err.Error())
  846. return false
  847. }
  848. permission, _ := GetUserRepoPermission(repo, user)
  849. if err != nil {
  850. log.Error("GetUserRepoPermission failed:%v", err.Error())
  851. return false
  852. }
  853. if (user.ID == job.UserID && permission.AccessMode >= AccessModeWrite) || user.IsAdmin || permission.AccessMode >= AccessModeAdmin {
  854. return true
  855. }
  856. return false
  857. }