You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

cloudbrain.go 37 kB

4 years ago
4 years ago
5 years ago
4 years ago
4 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
4 years ago
5 years ago
5 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
4 years ago
4 years ago
5 years ago
4 years ago
5 years ago
4 years ago
4 years ago
4 years ago
5 years ago
5 years ago
4 years ago
4 years ago
5 years ago
5 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago

  1. package models
  2. import (
  3. "encoding/json"
  4. "fmt"
  5. "strings"
  6. "time"
  7. "xorm.io/builder"
  8. "xorm.io/xorm"
  9. "code.gitea.io/gitea/modules/log"
  10. "code.gitea.io/gitea/modules/setting"
  11. "code.gitea.io/gitea/modules/timeutil"
  12. )
  13. type CloudbrainStatus string
  14. type JobType string
  15. type ModelArtsJobStatus string
  16. const (
  17. JobWaiting CloudbrainStatus = "WAITING"
  18. JobStopped CloudbrainStatus = "STOPPED"
  19. JobSucceeded CloudbrainStatus = "SUCCEEDED"
  20. JobFailed CloudbrainStatus = "FAILED"
  21. JobRunning CloudbrainStatus = "RUNNING"
  22. JobTypeDebug JobType = "DEBUG"
  23. JobTypeBenchmark JobType = "BENCHMARK"
  24. JobTypeSnn4imagenet JobType = "SNN4IMAGENET"
  25. JobTypeBrainScore JobType = "BRAINSCORE"
  26. JobTypeTrain JobType = "TRAIN"
  27. JobVersionName JobType = "V0001"
  28. ModelArtsCreateQueue ModelArtsJobStatus = "CREATE_QUEUING" //免费资源创建排队中
  29. ModelArtsCreating ModelArtsJobStatus = "CREATING" //创建中
  30. ModelArtsCreateFailed ModelArtsJobStatus = "CREATE_FAILED" //创建失败
  31. ModelArtsStartQueuing ModelArtsJobStatus = "START_QUEUING" //免费资源启动排队中
  32. ModelArtsReadyToStart ModelArtsJobStatus = "READY_TO_START" //免费资源等待启动
  33. ModelArtsStarting ModelArtsJobStatus = "STARTING" //启动中
  34. ModelArtsRestarting ModelArtsJobStatus = "RESTARTING" //重启中
  35. ModelArtsStartFailed ModelArtsJobStatus = "START_FAILED" //启动失败
  36. ModelArtsRunning ModelArtsJobStatus = "RUNNING" //运行中
  37. ModelArtsStopping ModelArtsJobStatus = "STOPPING" //停止中
  38. ModelArtsStopped ModelArtsJobStatus = "STOPPED" //停止
  39. ModelArtsUnavailable ModelArtsJobStatus = "UNAVAILABLE" //故障
  40. ModelArtsDeleted ModelArtsJobStatus = "DELETED" //已删除
  41. ModelArtsResizing ModelArtsJobStatus = "RESIZING" //规格变更中
  42. ModelArtsResizFailed ModelArtsJobStatus = "RESIZE_FAILED" //规格变更失败
  43. )
  44. type Cloudbrain struct {
  45. ID int64 `xorm:"pk autoincr"`
  46. JobID string `xorm:"INDEX NOT NULL"`
  47. JobType string `xorm:"INDEX NOT NULL DEFAULT 'DEBUG'"`
  48. JobName string `xorm:"INDEX"`
  49. Status string `xorm:"INDEX"`
  50. UserID int64 `xorm:"INDEX"`
  51. RepoID int64 `xorm:"INDEX"`
  52. SubTaskName string `xorm:"INDEX"`
  53. ContainerID string
  54. ContainerIp string
  55. CreatedUnix timeutil.TimeStamp `xorm:"INDEX created"`
  56. UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"`
  57. Duration int64 `xorm:"INDEX duration"`
  58. TrainJobDuration string
  59. DeletedAt time.Time `xorm:"deleted"`
  60. CanDebug bool `xorm:"-"`
  61. CanDel bool `xorm:"-"`
  62. Type int `xorm:"INDEX DEFAULT 0"`
  63. VersionID int64 `xorm:"INDEX DEFAULT 0"`
  64. VersionName string
  65. Uuid string
  66. DatasetName string
  67. VersionCount int64 `xorm:"INDEX DEFAULT 1"`
  68. IsLatestVersion string
  69. CommitID string
  70. FatherVersionName string
  71. ComputeResource string
  72. EngineID int64
  73. User *User `xorm:"-"`
  74. Repo *Repository `xorm:"-"`
  75. }
  76. type TrainjobConfigDetail struct {
  77. ID int64 `xorm:"pk autoincr"`
  78. JobID string `xorm:"INDEX"`
  79. JobName string `xorm:"INDEX"`
  80. ResourcePools string `xorm:"INDEX"`
  81. EngineVersions int `xorm:"INDEX"`
  82. FlavorInfos string `xorm:"INDEX"`
  83. TrainUrl string `xorm:"INDEX"`
  84. BootFile string `xorm:"INDEX"`
  85. Uuid string `xorm:"INDEX"`
  86. DatasetName string `xorm:"INDEX"`
  87. Params string `xorm:"INDEX"`
  88. BranchName string `xorm:"INDEX"`
  89. VersionName string `xorm:"INDEX"`
  90. User *User `xorm:"-"`
  91. Repo *Repository `xorm:"-"`
  92. }
  93. type CloudbrainInfo struct {
  94. Cloudbrain `xorm:"extends"`
  95. User `xorm:"extends"`
  96. }
  97. type CloudBrainLoginResult struct {
  98. Code string
  99. Msg string
  100. Payload map[string]interface{}
  101. }
  102. type TaskRole struct {
  103. Name string `json:"name"`
  104. TaskNumber int `json:"taskNumber"`
  105. MinSucceededTaskCount int `json:"minSucceededTaskCount"`
  106. MinFailedTaskCount int `json:"minFailedTaskCount"`
  107. CPUNumber int `json:"cpuNumber"`
  108. GPUNumber int `json:"gpuNumber"`
  109. MemoryMB int `json:"memoryMB"`
  110. ShmMB int `json:"shmMB"`
  111. Command string `json:"command"`
  112. NeedIBDevice bool `json:"needIBDevice"`
  113. IsMainRole bool `json:"isMainRole"`
  114. UseNNI bool `json:"useNNI"`
  115. }
  116. type StHostPath struct {
  117. Path string `json:"path"`
  118. MountPath string `json:"mountPath"`
  119. ReadOnly bool `json:"readOnly"`
  120. }
  121. type Volume struct {
  122. HostPath StHostPath `json:"hostPath"`
  123. }
  124. type CreateJobParams struct {
  125. JobName string `json:"jobName"`
  126. RetryCount int8 `json:"retryCount"`
  127. GpuType string `json:"gpuType"`
  128. Image string `json:"image"`
  129. TaskRoles []TaskRole `json:"taskRoles"`
  130. Volumes []Volume `json:"volumes"`
  131. }
  132. type CreateJobResult struct {
  133. Code string `json:"code"`
  134. Msg string `json:"msg"`
  135. Payload map[string]interface{} `json:"payload"`
  136. }
  137. type GetJobResult struct {
  138. Code string `json:"code"`
  139. Msg string `json:"msg"`
  140. Payload map[string]interface{} `json:"payload"`
  141. }
  142. type GetImagesResult struct {
  143. Code string `json:"code"`
  144. Msg string `json:"msg"`
  145. Payload GetImagesPayload `json:"payload"`
  146. }
  147. type GetImagesPayload struct {
  148. Count int `json:"count"`
  149. TotalPages int `json:"totalPages,omitempty"`
  150. ImageInfo []*ImageInfo `json:"rows"`
  151. }
  152. type CloudbrainsOptions struct {
  153. ListOptions
  154. RepoID int64 // include all repos if empty
  155. UserID int64
  156. JobID string
  157. SortType string
  158. CloudbrainIDs []int64
  159. // JobStatus CloudbrainStatus
  160. Type int
  161. JobType string
  162. VersionName string
  163. IsLatestVersion string
  164. }
  165. type TaskPod struct {
  166. TaskRoleStatus struct {
  167. Name string `json:"name"`
  168. } `json:"taskRoleStatus"`
  169. //TaskStatuses []struct {
  170. // TaskIndex int `json:"taskIndex"`
  171. // PodUID string `json:"podUid"`
  172. // PodIP string `json:"podIp"`
  173. // PodName string `json:"podName"`
  174. // ContainerID string `json:"containerId"`
  175. // ContainerIP string `json:"containerIp"`
  176. // ContainerGpus string `json:"containerGpus"`
  177. // State string `json:"state"`
  178. // StartAt time.Time `json:"startAt"`
  179. // FinishedAt time.Time `json:"finishedAt"`
  180. // ExitCode int `json:"exitCode"`
  181. // ExitDiagnostics string `json:"exitDiagnostics"`
  182. // RetriedCount int `json:"retriedCount"`
  183. // StartTime string
  184. // FinishedTime string
  185. //} `json:"taskStatuses"`
  186. TaskStatuses []TaskStatuses `json:"taskStatuses"`
  187. }
  188. type TaskStatuses struct {
  189. TaskIndex int `json:"taskIndex"`
  190. PodUID string `json:"podUid"`
  191. PodIP string `json:"podIp"`
  192. PodName string `json:"podName"`
  193. ContainerID string `json:"containerId"`
  194. ContainerIP string `json:"containerIp"`
  195. ContainerGpus string `json:"containerGpus"`
  196. State string `json:"state"`
  197. StartAt time.Time `json:"startAt"`
  198. FinishedAt time.Time `json:"finishedAt"`
  199. ExitCode int `json:"exitCode"`
  200. ExitDiagnostics string `json:"exitDiagnostics"`
  201. RetriedCount int `json:"retriedCount"`
  202. StartTime string
  203. FinishedTime string
  204. }
  205. type TaskInfo struct {
  206. Username string `json:"username"`
  207. TaskName string `json:"task_name"`
  208. CodeName string `json:"code_name"`
  209. BenchmarkCategory []string `json:"selected_category"`
  210. CodeLink string `json:"code_link"`
  211. GpuType string `json:"gpu_type"`
  212. }
  213. func ConvertToTaskPod(input map[string]interface{}) (TaskPod, error) {
  214. data, _ := json.Marshal(input)
  215. var taskPod TaskPod
  216. err := json.Unmarshal(data, &taskPod)
  217. taskPod.TaskStatuses[0].StartTime = time.Unix(taskPod.TaskStatuses[0].StartAt.Unix()+8*3600, 0).UTC().Format("2006-01-02 15:04:05")
  218. taskPod.TaskStatuses[0].FinishedTime = time.Unix(taskPod.TaskStatuses[0].FinishedAt.Unix()+8*3600, 0).UTC().Format("2006-01-02 15:04:05")
  219. //if the task is not finished or stopped,the cloudbrain renturns 0001-01-01 08:00:00, the finishedTime shows with -
  220. if strings.HasPrefix(taskPod.TaskStatuses[0].FinishedTime, "0001") {
  221. taskPod.TaskStatuses[0].FinishedTime = "-"
  222. }
  223. return taskPod, err
  224. }
  225. type JobResultPayload struct {
  226. ID string `json:"id"`
  227. Name string `json:"name"`
  228. Platform string `json:"platform"`
  229. JobStatus struct {
  230. Username string `json:"username"`
  231. State string `json:"state"`
  232. SubState string `json:"subState"`
  233. ExecutionType string `json:"executionType"`
  234. Retries int `json:"retries"`
  235. CreatedTime int64 `json:"createdTime"`
  236. CompletedTime int64 `json:"completedTime"`
  237. AppID string `json:"appId"`
  238. AppProgress string `json:"appProgress"`
  239. AppTrackingURL string `json:"appTrackingUrl"`
  240. AppLaunchedTime int64 `json:"appLaunchedTime"`
  241. AppCompletedTime interface{} `json:"appCompletedTime"`
  242. AppExitCode int `json:"appExitCode"`
  243. AppExitDiagnostics string `json:"appExitDiagnostics"`
  244. AppExitType interface{} `json:"appExitType"`
  245. VirtualCluster string `json:"virtualCluster"`
  246. StartTime string
  247. EndTime string
  248. } `json:"jobStatus"`
  249. TaskRoles map[string]interface{} `json:"taskRoles"`
  250. Resource struct {
  251. CPU int `json:"cpu"`
  252. Memory string `json:"memory"`
  253. NvidiaComGpu int `json:"nvidia.com/gpu"`
  254. } `json:"resource"`
  255. Config struct {
  256. Image string `json:"image"`
  257. JobID string `json:"jobId"`
  258. GpuType string `json:"gpuType"`
  259. JobName string `json:"jobName"`
  260. JobType string `json:"jobType"`
  261. TaskRoles []struct {
  262. Name string `json:"name"`
  263. ShmMB int `json:"shmMB"`
  264. Command string `json:"command"`
  265. MemoryMB int `json:"memoryMB"`
  266. CPUNumber int `json:"cpuNumber"`
  267. GpuNumber int `json:"gpuNumber"`
  268. IsMainRole bool `json:"isMainRole"`
  269. TaskNumber int `json:"taskNumber"`
  270. NeedIBDevice bool `json:"needIBDevice"`
  271. MinFailedTaskCount int `json:"minFailedTaskCount"`
  272. MinSucceededTaskCount int `json:"minSucceededTaskCount"`
  273. } `json:"taskRoles"`
  274. RetryCount int `json:"retryCount"`
  275. } `json:"config"`
  276. Userinfo struct {
  277. User string `json:"user"`
  278. OrgID string `json:"org_id"`
  279. } `json:"userinfo"`
  280. }
  281. func ConvertToJobResultPayload(input map[string]interface{}) (JobResultPayload, error) {
  282. data, _ := json.Marshal(input)
  283. var jobResultPayload JobResultPayload
  284. err := json.Unmarshal(data, &jobResultPayload)
  285. jobResultPayload.JobStatus.StartTime = time.Unix(jobResultPayload.JobStatus.CreatedTime/1000, 0).Format("2006-01-02 15:04:05")
  286. jobResultPayload.JobStatus.EndTime = time.Unix(jobResultPayload.JobStatus.CompletedTime/1000, 0).Format("2006-01-02 15:04:05")
  287. if jobResultPayload.JobStatus.State == string(JobWaiting) {
  288. jobResultPayload.JobStatus.StartTime = "-"
  289. jobResultPayload.JobStatus.EndTime = "-"
  290. }
  291. return jobResultPayload, err
  292. }
  293. type ImagesResultPayload struct {
  294. Images []struct {
  295. ID int `json:"id"`
  296. Name string `json:"name"`
  297. Place string `json:"place"`
  298. Description string `json:"description"`
  299. Provider string `json:"provider"`
  300. Createtime string `json:"createtime"`
  301. Remark string `json:"remark"`
  302. } `json:"taskStatuses"`
  303. }
  304. type ImageInfo struct {
  305. ID int `json:"id"`
  306. Name string `json:"name"`
  307. Place string `json:"place"`
  308. Description string `json:"description"`
  309. Provider string `json:"provider"`
  310. Createtime string `json:"createtime"`
  311. Remark string `json:"remark"`
  312. IsPublic int `json:"isPublic"`
  313. PlaceView string
  314. }
  315. type Categories struct {
  316. Category []*Category `json:"category"`
  317. }
  318. type Category struct {
  319. Id int `json:"id"`
  320. Value string `json:"value"`
  321. }
  322. type GpuInfos struct {
  323. GpuInfo []*GpuInfo `json:"gpu_type"`
  324. }
  325. type GpuInfo struct {
  326. Id int `json:"id"`
  327. Value string `json:"value"`
  328. Queue string `json:"queue"`
  329. }
  330. type ResourceSpecs struct {
  331. ResourceSpec []*ResourceSpec `json:"resorce_specs"`
  332. }
  333. type ResourceSpec struct {
  334. Id int `json:"id"`
  335. CpuNum int `json:"cpu"`
  336. GpuNum int `json:"gpu"`
  337. MemMiB int `json:"memMiB"`
  338. ShareMemMiB int `json:"shareMemMiB"`
  339. }
  340. type FlavorInfos struct {
  341. FlavorInfo []*FlavorInfo `json:"flavor_info"`
  342. }
  343. type FlavorInfo struct {
  344. Id int `json:"id"`
  345. Value string `json:"value"`
  346. }
  347. type PoolInfos struct {
  348. PoolInfo []*PoolInfo `json:"pool_info"`
  349. }
  350. type PoolInfo struct {
  351. PoolId string `json:"pool_id"`
  352. PoolName string `json:"pool_name"`
  353. PoolType string `json:"pool_type"`
  354. }
  355. type CommitImageParams struct {
  356. Ip string `json:"ip"`
  357. TaskContainerId string `json:"taskContainerId"`
  358. ImageTag string `json:"imageTag"`
  359. ImageDescription string `json:"imageDescription"`
  360. }
  361. type CommitImageResult struct {
  362. Code string `json:"code"`
  363. Msg string `json:"msg"`
  364. Payload map[string]interface{} `json:"payload"`
  365. }
  366. type CloudBrainResult struct {
  367. Code string `json:"code"`
  368. Msg string `json:"msg"`
  369. }
  370. type CreateNotebookParams struct {
  371. JobName string `json:"name"`
  372. Description string `json:"description"`
  373. ProfileID string `json:"profile_id"`
  374. Flavor string `json:"flavor"`
  375. Spec Spec `json:"spec"`
  376. Workspace Workspace `json:"workspace"`
  377. Pool Pool `json:"pool"`
  378. }
  379. type Pool struct {
  380. ID string `json:"id"`
  381. Name string `json:"name"`
  382. Type string `json:"type"`
  383. }
  384. type Workspace struct {
  385. ID string `json:"id"`
  386. }
  387. type Spec struct {
  388. Storage Storage `json:"storage"`
  389. AutoStop AutoStop `json:"auto_stop"`
  390. }
  391. type AutoStop struct {
  392. Enable bool `json:"enable"`
  393. Duration int `json:"duration"`
  394. }
  395. type Storage struct {
  396. Type string `json:"type"`
  397. Location Location `json:"location"`
  398. }
  399. type Location struct {
  400. Path string `json:"path"`
  401. }
  402. type NotebookResult struct {
  403. ErrorCode string `json:"error_code"`
  404. ErrorMsg string `json:"error_msg"`
  405. }
  406. type CreateNotebookResult struct {
  407. ErrorCode string `json:"error_code"`
  408. ErrorMsg string `json:"error_msg"`
  409. ID string `json:"id"`
  410. Name string `json:"name"`
  411. Description string `json:"description"`
  412. Status string `json:"status"`
  413. CreationTimestamp string `json:"creation_timestamp"`
  414. LatestUpdateTimestamp string `json:"latest_update_timestamp"`
  415. Profile struct {
  416. ID string `json:"id"`
  417. Name string `json:"name"`
  418. Description string `json:"description"`
  419. DeType string `json:"de_type"`
  420. FlavorType string `json:"flavor_type"`
  421. } `json:"profile"`
  422. Flavor string `json:"flavor"`
  423. FlavorDetails struct {
  424. Name string `json:"name"`
  425. Status string `json:"status"`
  426. QueuingNum int `json:"queuing_num"`
  427. QueueLeftTime int `json:"queue_left_time"` //s
  428. Duration int `json:"duration"` //auto_stop_time s
  429. } `json:"flavor_details"`
  430. }
  431. type GetNotebookResult struct {
  432. ErrorCode string `json:"error_code"`
  433. ErrorMsg string `json:"error_msg"`
  434. ID string `json:"id"`
  435. Name string `json:"name"`
  436. Description string `json:"description"`
  437. Status string `json:"status"`
  438. CreationTimestamp string `json:"creation_timestamp"`
  439. CreateTime string
  440. LatestUpdateTimestamp string `json:"latest_update_timestamp"`
  441. LatestUpdateTime string
  442. Profile struct {
  443. ID string `json:"id"`
  444. Name string `json:"name"`
  445. Description string `json:"description"`
  446. DeType string `json:"de_type"`
  447. FlavorType string `json:"flavor_type"`
  448. } `json:"profile"`
  449. Flavor string `json:"flavor"`
  450. FlavorDetails struct {
  451. Name string `json:"name"`
  452. Status string `json:"status"`
  453. QueuingNum int `json:"queuing_num"`
  454. QueueLeftTime int `json:"queue_left_time"` //s
  455. Duration int `json:"duration"` //auto_stop_time s
  456. } `json:"flavor_details"`
  457. QueuingInfo struct {
  458. ID string `json:"id"`
  459. Name string `json:"name"`
  460. Flavor string `json:"flavor"`
  461. DeType string `json:"de_type"`
  462. Status string `json:"status"`
  463. BeginTimestamp int `json:"begin_timestamp"` //time of instance begin in queue
  464. BeginTime string
  465. RemainTime int `json:"remain_time"` //remain time of instance
  466. EndTimestamp int `json:"end_timestamp"` //
  467. EndTime string
  468. Rank int `json:"rank"` //rank of instance in queue
  469. } `json:"queuing_info"`
  470. Spec struct {
  471. Annotations struct {
  472. TargetDomain string `json:"target_domain"`
  473. Url string `json:"url"`
  474. } `json:"annotations"`
  475. } `json:"spec"`
  476. }
  477. type GetTokenParams struct {
  478. Auth Auth `json:"auth"`
  479. }
  480. type Auth struct {
  481. Identity Identity `json:"identity"`
  482. Scope Scope `json:"scope"`
  483. }
  484. type Scope struct {
  485. Project Project `json:"project"`
  486. }
  487. type Project struct {
  488. Name string `json:"name"`
  489. }
  490. type Identity struct {
  491. Methods []string `json:"methods"`
  492. Password Password `json:"password"`
  493. }
  494. type Password struct {
  495. User NotebookUser `json:"user"`
  496. }
  497. type NotebookUser struct {
  498. Name string `json:"name"`
  499. Password string `json:"password"`
  500. Domain Domain `json:"domain"`
  501. }
  502. type Domain struct {
  503. Name string `json:"name"`
  504. }
  505. const (
  506. ActionStart = "start"
  507. ActionStop = "stop"
  508. ActionRestart = "restart"
  509. ActionQueue = "queue"
  510. ActionDequeue = "dequeue"
  511. )
  512. type NotebookAction struct {
  513. Action string `json:"action"`
  514. }
  515. type NotebookActionResult struct {
  516. ErrorCode string `json:"error_code"`
  517. ErrorMsg string `json:"error_msg"`
  518. CurrentStatus string `json:"current_status"`
  519. PreviousState string `json:"previous_state"`
  520. }
  521. type NotebookGetJobTokenResult struct {
  522. ErrorCode string `json:"error_code"`
  523. ErrorMsg string `json:"error_msg"`
  524. Token string `json:"token"`
  525. }
  526. type NotebookDelResult struct {
  527. InstanceID string `json:"instance_id"`
  528. }
  529. type CreateTrainJobParams struct {
  530. JobName string `json:"job_name"`
  531. Description string `json:"job_desc"`
  532. Config Config `json:"config"`
  533. WorkspaceID string `json:"workspace_id"`
  534. }
  535. type Config struct {
  536. WorkServerNum int `json:"worker_server_num"`
  537. AppUrl string `json:"app_url"` //训练作业的代码目录
  538. BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下
  539. Parameter []Parameter `json:"parameter"`
  540. DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL
  541. //DatasetID string `json:"dataset_id"`
  542. //DataVersionID string `json:"dataset_version_id"`
  543. //DataSource []DataSource `json:"data_source"`
  544. //SpecID int64 `json:"spec_id"`
  545. EngineID int64 `json:"engine_id"`
  546. //ModelID int64 `json:"model_id"`
  547. TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL
  548. LogUrl string `json:"log_url"`
  549. //UserImageUrl string `json:"user_image_url"`
  550. //UserCommand string `json:"user_command"`
  551. CreateVersion bool `json:"create_version"`
  552. //Volumes []Volumes `json:"volumes"`
  553. Flavor Flavor `json:"flavor"`
  554. PoolID string `json:"pool_id"`
  555. }
  556. type CreateTrainJobVersionParams struct {
  557. Description string `json:"job_desc"`
  558. Config TrainJobVersionConfig `json:"config"`
  559. }
  560. type TrainJobVersionConfig struct {
  561. WorkServerNum int `json:"worker_server_num"`
  562. AppUrl string `json:"app_url"` //训练作业的代码目录
  563. BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下
  564. Parameter []Parameter `json:"parameter"`
  565. DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL
  566. //DatasetID string `json:"dataset_id"`
  567. //DataVersionID string `json:"dataset_version_id"`
  568. //DataSource []DataSource `json:"data_source"`
  569. //SpecID int64 `json:"spec_id"`
  570. EngineID int64 `json:"engine_id"`
  571. //ModelID int64 `json:"model_id"`
  572. TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL
  573. LogUrl string `json:"log_url"`
  574. //UserImageUrl string `json:"user_image_url"`
  575. //UserCommand string `json:"user_command"`
  576. //Volumes []Volumes `json:"volumes"`
  577. Flavor Flavor `json:"flavor"`
  578. PoolID string `json:"pool_id"`
  579. PreVersionId int64 `json:"pre_version_id"`
  580. }
  581. type CreateConfigParams struct {
  582. ConfigName string `json:"config_name"`
  583. Description string `json:"config_desc"`
  584. WorkServerNum int `json:"worker_server_num"`
  585. AppUrl string `json:"app_url"` //训练作业的代码目录
  586. BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下
  587. Parameter []Parameter `json:"parameter"`
  588. DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL
  589. //DatasetID string `json:"dataset_id"`
  590. //DataVersionID string `json:"dataset_version_id"`
  591. //DataSource []DataSource `json:"data_source"`
  592. //SpecID int64 `json:"spec_id"`
  593. EngineID int64 `json:"engine_id"`
  594. //ModelID int64 `json:"model_id"`
  595. TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL
  596. LogUrl string `json:"log_url"`
  597. //UserImageUrl string `json:"user_image_url"`
  598. //UserCommand string `json:"user_command"`
  599. //CreateVersion bool `json:"create_version"`
  600. //Volumes []Volumes `json:"volumes"`
  601. Flavor Flavor `json:"flavor"`
  602. PoolID string `json:"pool_id"`
  603. }
  604. type Parameter struct {
  605. Label string `json:"label"`
  606. Value string `json:"value"`
  607. }
  608. type Parameters struct {
  609. Parameter []Parameter `json:"parameter"`
  610. }
  611. type DataSource struct {
  612. DatasetID string `json:"dataset_id"`
  613. DatasetVersion string `json:"dataset_version"`
  614. Type string `json:"type"`
  615. DataUrl string `json:"data_url"`
  616. }
  617. type Volumes struct {
  618. Nfs Nfs `json:"nfs"`
  619. HostPath HostPath `json:"host_path"`
  620. }
  621. type Nfs struct {
  622. ID string `json:"id"`
  623. SourcePath string `json:"src_path"`
  624. DestPath string `json:"dest_path"`
  625. ReadOnly bool `json:"read_only"`
  626. }
  627. type HostPath struct {
  628. SourcePath string `json:"src_path"`
  629. DestPath string `json:"dest_path"`
  630. ReadOnly bool `json:"read_only"`
  631. }
  632. type Flavor struct {
  633. Code string `json:"code"`
  634. }
  635. type CreateTrainJobResult struct {
  636. ErrorCode string `json:"error_code"`
  637. ErrorMsg string `json:"error_msg"`
  638. IsSuccess bool `json:"is_success"`
  639. JobName string `json:"job_name"`
  640. JobID int64 `json:"job_id"`
  641. Status int `json:"status"`
  642. CreateTime int64 `json:"create_time"`
  643. VersionID int64 `json:"version_id"`
  644. ResourceID string `json:"resource_id"`
  645. VersionName string `json:"version_name"`
  646. }
  647. type CreateTrainJobConfigResult struct {
  648. ErrorCode string `json:"error_code"`
  649. ErrorMsg string `json:"error_msg"`
  650. IsSuccess bool `json:"is_success"`
  651. }
  652. type GetResourceSpecsResult struct {
  653. ErrorCode string `json:"error_code"`
  654. ErrorMsg string `json:"error_msg"`
  655. IsSuccess bool `json:"is_success"`
  656. SpecTotalCount int `json:"spec_total_count"`
  657. Specs []Specs `json:"specs"`
  658. }
  659. type Specs struct {
  660. Core string `json:"core"`
  661. Cpu string `json:"cpu"`
  662. IsNoResource bool `json:"no_resource"`
  663. GpuType string `json:"gpu_type"`
  664. SpecID int64 `json:"spec_id"`
  665. GpuNum int `json:"gpu_num"`
  666. SpecCode string `json:"spec_code"`
  667. Storage string `json:"storage"`
  668. MaxNum int `json:"max_num"`
  669. UnitNum int `json:"unit_num"`
  670. InterfaceType int `json:"interface_type"`
  671. }
  672. type GetConfigListResult struct {
  673. ErrorCode string `json:"error_code"`
  674. ErrorMsg string `json:"error_msg"`
  675. IsSuccess bool `json:"is_success"`
  676. ConfigTotalCount int `json:"config_total_count"`
  677. ParaConfigs []ParaConfig `json:"configs"`
  678. }
  679. type ParaConfig struct {
  680. ConfigName string `json:"config_name"`
  681. ConfigDesc string `json:"config_desc"`
  682. CreateTime int64 `json:"create_time"`
  683. EngineType int `json:"engine_type"`
  684. EngineName string `json:"engine_name"`
  685. EngineId int64 `json:"engine_id"`
  686. EngineVersion string `json:"engine_version"`
  687. UserImageUrl string `json:"user_image_url"`
  688. UserCommand string `json:"user_command"`
  689. Result GetConfigResult
  690. }
  691. type GetConfigResult struct {
  692. ErrorCode string `json:"error_code"`
  693. ErrorMsg string `json:"error_msg"`
  694. IsSuccess bool `json:"is_success"`
  695. ConfigName string `json:"config_name"`
  696. Description string `json:"config_desc"`
  697. WorkServerNum int `json:"worker_server_num"`
  698. AppUrl string `json:"app_url"` //训练作业的代码目录
  699. BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下
  700. Parameter []Parameter `json:"parameter"`
  701. DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL
  702. //DatasetID string `json:"dataset_id"`
  703. //DataVersionID string `json:"dataset_version_id"`
  704. //DataSource []DataSource `json:"data_source"`
  705. //SpecID int64 `json:"spec_id"`
  706. EngineID int64 `json:"engine_id"`
  707. //ModelID int64 `json:"model_id"`
  708. TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL
  709. LogUrl string `json:"log_url"`
  710. //UserImageUrl string `json:"user_image_url"`
  711. //UserCommand string `json:"user_command"`
  712. //CreateVersion bool `json:"create_version"`
  713. //Volumes []Volumes `json:"volumes"`
  714. Flavor Flavor `json:"flavor"`
  715. PoolID string `json:"pool_id"`
  716. }
  717. type ErrorResult struct {
  718. ErrorCode string `json:"error_code"`
  719. ErrorMsg string `json:"error_message"`
  720. IsSuccess bool `json:"is_success"`
  721. }
  722. type GetTrainJobResult struct {
  723. IsSuccess bool `json:"is_success"`
  724. JobName string `json:"job_name"`
  725. JobID int64 `json:"job_id"`
  726. Description string `json:"job_desc"`
  727. IntStatus int `json:"status"`
  728. Status string
  729. LongCreateTime int64 `json:"create_time"`
  730. CreateTime string
  731. Duration int64 `json:"duration"` //训练作业的运行时间,单位为毫秒
  732. TrainJobDuration string //训练作业的运行时间,格式为hh:mm:ss
  733. VersionID int64 `json:"version_id"`
  734. ResourceID string `json:"resource_id"`
  735. VersionName string `json:"version_name"`
  736. PreVersionID int64 `json:"pre_version_id"`
  737. WorkServerNum int `json:"worker_server_num"`
  738. AppUrl string `json:"app_url"` //训练作业的代码目录
  739. BootFileUrl string `json:"boot_file_url"` //训练作业的代码启动文件,需要在代码目录下
  740. Parameter []Parameter `json:"parameter"`
  741. DataUrl string `json:"data_url"` //训练作业需要的数据集OBS路径URL
  742. //DatasetID string `json:"dataset_id"`
  743. //DataVersionID string `json:"dataset_version_id"`
  744. //DataSource []DataSource `json:"data_source"`
  745. //SpecID int64 `json:"spec_id"`
  746. EngineID int64 `json:"engine_id"`
  747. EngineName string `json:"engine_name"`
  748. EngineVersion string `json:"engine_version"`
  749. //ModelID int64 `json:"model_id"`
  750. TrainUrl string `json:"train_url"` //训练作业的输出文件OBS路径URL
  751. LogUrl string `json:"log_url"`
  752. //UserImageUrl string `json:"user_image_url"`
  753. //UserCommand string `json:"user_command"`
  754. //Volumes []Volumes `json:"volumes"`
  755. Flavor Flavor `json:"flavor"`
  756. PoolID string `json:"pool_id"`
  757. PoolName string `json:"pool_name"`
  758. NasMountPath string `json:"nas_mount_path"`
  759. NasShareAddr string `json:"nas_share_addr"`
  760. DatasetName string
  761. ModelMetricList string `json:"model_metric_list"` //列表里包含f1_score,recall,precision,accuracy,若有的话
  762. }
  763. type GetTrainJobLogResult struct {
  764. ErrorCode string `json:"error_code"`
  765. ErrorMsg string `json:"error_msg"`
  766. IsSuccess bool `json:"is_success"`
  767. Content string `json:"content"`
  768. Lines int `json:"lines"`
  769. StartLine string `json:"start_line"`
  770. EndLine string `json:"end_line"`
  771. }
  772. type GetTrainJobLogFileNamesResult struct {
  773. ErrorCode string `json:"error_code"`
  774. ErrorMsg string `json:"error_msg"`
  775. IsSuccess bool `json:"is_success"`
  776. LogFileList []string `json:"log_file_list"`
  777. }
  778. type TrainJobResult struct {
  779. ErrorCode string `json:"error_code"`
  780. ErrorMsg string `json:"error_msg"`
  781. IsSuccess bool `json:"is_success"`
  782. }
  783. type LogFile struct {
  784. Name string
  785. }
  786. func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) {
  787. sess := x.NewSession()
  788. defer sess.Close()
  789. var cond = builder.NewCond()
  790. if opts.RepoID > 0 {
  791. cond = cond.And(
  792. builder.Eq{"cloudbrain.repo_id": opts.RepoID},
  793. )
  794. }
  795. if opts.UserID > 0 {
  796. cond = cond.And(
  797. builder.Eq{"cloudbrain.user_id": opts.UserID},
  798. )
  799. }
  800. if (opts.JobID) != "" {
  801. cond = cond.And(
  802. builder.Eq{"cloudbrain.job_id": opts.JobID},
  803. )
  804. }
  805. if (opts.Type) >= 0 {
  806. cond = cond.And(
  807. builder.Eq{"cloudbrain.type": opts.Type},
  808. )
  809. }
  810. if (opts.JobType) != "" {
  811. cond = cond.And(
  812. builder.Eq{"cloudbrain.job_type": opts.JobType},
  813. )
  814. }
  815. if (opts.IsLatestVersion) != "" {
  816. cond = cond.And(
  817. builder.Eq{"cloudbrain.is_latest_version": opts.IsLatestVersion},
  818. )
  819. }
  820. // switch opts.JobStatus {
  821. // case JobWaiting:
  822. // cond.And(builder.Eq{"cloudbrain.status": int(JobWaiting)})
  823. // case JobFailed:
  824. // cond.And(builder.Eq{"cloudbrain.status": int(JobFailed)})
  825. // case JobStopped:
  826. // cond.And(builder.Eq{"cloudbrain.status": int(JobStopped)})
  827. // case JobSucceeded:
  828. // cond.And(builder.Eq{"cloudbrain.status": int(JobSucceeded)})
  829. // }
  830. if len(opts.CloudbrainIDs) > 0 {
  831. cond = cond.And(builder.In("cloudbrain.id", opts.CloudbrainIDs))
  832. }
  833. count, err := sess.Where(cond).Count(new(Cloudbrain))
  834. if err != nil {
  835. return nil, 0, fmt.Errorf("Count: %v", err)
  836. }
  837. if opts.Page >= 0 && opts.PageSize > 0 {
  838. var start int
  839. if opts.Page == 0 {
  840. start = 0
  841. } else {
  842. start = (opts.Page - 1) * opts.PageSize
  843. }
  844. sess.Limit(opts.PageSize, start)
  845. }
  846. sess.OrderBy("cloudbrain.created_unix DESC")
  847. cloudbrains := make([]*CloudbrainInfo, 0, setting.UI.IssuePagingNum)
  848. if err := sess.Table(&Cloudbrain{}).Where(cond).
  849. Join("left", "`user`", "cloudbrain.user_id = `user`.id").
  850. Find(&cloudbrains); err != nil {
  851. return nil, 0, fmt.Errorf("Find: %v", err)
  852. }
  853. sess.Close()
  854. return cloudbrains, count, nil
  855. }
  856. func CloudbrainsVersionList(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) {
  857. sess := x.NewSession()
  858. defer sess.Close()
  859. var cond = builder.NewCond()
  860. if opts.RepoID > 0 {
  861. cond = cond.And(
  862. builder.Eq{"cloudbrain.repo_id": opts.RepoID},
  863. )
  864. }
  865. if opts.UserID > 0 {
  866. cond = cond.And(
  867. builder.Eq{"cloudbrain.user_id": opts.UserID},
  868. )
  869. }
  870. if (opts.Type) >= 0 {
  871. cond = cond.And(
  872. builder.Eq{"cloudbrain.type": opts.Type},
  873. )
  874. }
  875. if (opts.JobID) != "" {
  876. cond = cond.And(
  877. builder.Eq{"cloudbrain.job_id": opts.JobID},
  878. )
  879. }
  880. if (opts.JobType) != "" {
  881. cond = cond.And(
  882. builder.Eq{"cloudbrain.job_type": opts.JobType},
  883. )
  884. }
  885. if len(opts.CloudbrainIDs) > 0 {
  886. cond = cond.And(builder.In("cloudbrain.id", opts.CloudbrainIDs))
  887. }
  888. count, err := sess.Where(cond).Count(new(Cloudbrain))
  889. if err != nil {
  890. return nil, 0, fmt.Errorf("Count: %v", err)
  891. }
  892. if opts.Page >= 0 && opts.PageSize > 0 {
  893. var start int
  894. if opts.Page == 0 {
  895. start = 0
  896. } else {
  897. start = (opts.Page - 1) * opts.PageSize
  898. }
  899. sess.Limit(opts.PageSize, start)
  900. }
  901. sess.OrderBy("cloudbrain.created_unix DESC")
  902. cloudbrains := make([]*CloudbrainInfo, 0, setting.UI.IssuePagingNum)
  903. if err := sess.Table(&Cloudbrain{}).Where(cond).
  904. Join("left", "`user`", "cloudbrain.user_id = `user`.id").
  905. Find(&cloudbrains); err != nil {
  906. return nil, 0, fmt.Errorf("Find: %v", err)
  907. }
  908. sess.Close()
  909. return cloudbrains, count, nil
  910. }
  911. func CreateCloudbrain(cloudbrain *Cloudbrain) (err error) {
  912. if _, err = x.Insert(cloudbrain); err != nil {
  913. return err
  914. }
  915. return nil
  916. }
  917. func CreateTrainjobConfigDetail(trainjobConfigDetail *TrainjobConfigDetail) (err error) {
  918. if _, err = x.Insert(trainjobConfigDetail); err != nil {
  919. return err
  920. }
  921. return nil
  922. }
  923. func getRepoCloudBrain(cb *Cloudbrain) (*Cloudbrain, error) {
  924. has, err := x.Get(cb)
  925. if err != nil {
  926. return nil, err
  927. } else if !has {
  928. return nil, ErrJobNotExist{}
  929. }
  930. return cb, nil
  931. }
  932. func GetRepoCloudBrainByJobID(repoID int64, jobID string) (*Cloudbrain, error) {
  933. cb := &Cloudbrain{JobID: jobID, RepoID: repoID}
  934. return getRepoCloudBrain(cb)
  935. }
  936. func GetCloudbrainByJobID(jobID string) (*Cloudbrain, error) {
  937. cb := &Cloudbrain{JobID: jobID}
  938. return getRepoCloudBrain(cb)
  939. }
  940. func GetCloudbrainByJobIDAndVersionName(jobID string, versionName string) (*Cloudbrain, error) {
  941. cb := &Cloudbrain{JobID: jobID, VersionName: versionName}
  942. return getRepoCloudBrain(cb)
  943. }
  944. func GetCloudbrainByJobIDAndIsLatestVersion(jobID string, isLatestVersion string) (*Cloudbrain, error) {
  945. cb := &Cloudbrain{JobID: jobID, IsLatestVersion: isLatestVersion}
  946. return getRepoCloudBrain(cb)
  947. }
  948. func GetCloudbrainsNeededStopByUserID(userID int64) ([]*Cloudbrain, error) {
  949. cloudBrains := make([]*Cloudbrain, 0)
  950. err := x.Cols("job_id", "status", "type").Where("user_id=? AND status !=?", userID, string(JobStopped)).Find(&cloudBrains)
  951. return cloudBrains, err
  952. }
  953. func GetCloudbrainsNeededStopByRepoID(repoID int64) ([]*Cloudbrain, error) {
  954. cloudBrains := make([]*Cloudbrain, 0)
  955. err := x.Cols("job_id", "status", "type").Where("repo_id=? AND status !=?", repoID, string(JobStopped)).Find(&cloudBrains)
  956. return cloudBrains, err
  957. }
  958. func SetCloudbrainStatusByJobID(jobID string, status CloudbrainStatus) (err error) {
  959. cb := &Cloudbrain{JobID: jobID, Status: string(status)}
  960. _, err = x.Cols("status").Where("cloudbrain.job_id=?", jobID).Update(cb)
  961. return
  962. }
  963. func SetTrainJobStatusByJobID(jobID string, status string, duration int64, trainjobduration string) (err error) {
  964. cb := &Cloudbrain{JobID: jobID, Status: string(status), Duration: duration, TrainJobDuration: trainjobduration}
  965. _, err = x.Cols("status", "duration", "train_job_duration").Where("cloudbrain.job_id=?", jobID).Update(cb)
  966. return
  967. }
  968. func SetVersionCountAndLatestVersionByJobIDAndVersionName(jobID string, versionName string, versionCount int64, isLatestVersion string) (err error) {
  969. cb := &Cloudbrain{JobID: jobID, VersionName: versionName, VersionCount: versionCount, IsLatestVersion: isLatestVersion}
  970. _, err = x.Cols("version_Count", "is_latest_version").Where("cloudbrain.job_id=? AND cloudbrain.version_name=?", jobID, versionName).Update(cb)
  971. return
  972. }
  973. func UpdateJob(job *Cloudbrain) error {
  974. return updateJob(x, job)
  975. }
  976. func updateJob(e Engine, job *Cloudbrain) error {
  977. var sess *xorm.Session
  978. sess = e.Where("job_id = ?", job.JobID)
  979. _, err := sess.Cols("status", "container_id", "container_ip").Update(job)
  980. return err
  981. }
  982. // func UpdateTrainJob(job *CloudbrainInfo) error {
  983. // return updateTrainJob(x, job)
  984. // }
  985. // func updateTrainJob(e Engine, job *CloudbrainInfo) error {
  986. // var sess *xorm.Session
  987. // sess = e.Where("job_id = ?", job.Cloudbrain.JobID)
  988. // _, err := sess.Cols("status", "container_id", "container_ip").Update(job)
  989. // return err
  990. // }
  991. func DeleteJob(job *Cloudbrain) error {
  992. return deleteJob(x, job)
  993. }
  994. func deleteJob(e Engine, job *Cloudbrain) error {
  995. _, err := e.ID(job.ID).Delete(job)
  996. return err
  997. }
  998. func GetCloudbrainByName(jobName string) (*Cloudbrain, error) {
  999. cb := &Cloudbrain{JobName: jobName}
  1000. return getRepoCloudBrain(cb)
  1001. }
  1002. func CanDelJob(isSigned bool, user *User, job *CloudbrainInfo) bool {
  1003. if !isSigned || (job.Status != string(JobStopped) && job.Status != string(JobFailed) && job.Status != string(ModelArtsStartFailed) && job.Status != string(ModelArtsCreateFailed)) {
  1004. return false
  1005. }
  1006. repo, err := GetRepositoryByID(job.RepoID)
  1007. if err != nil {
  1008. log.Error("GetRepositoryByID failed:%v", err.Error())
  1009. return false
  1010. }
  1011. permission, _ := GetUserRepoPermission(repo, user)
  1012. if err != nil {
  1013. log.Error("GetUserRepoPermission failed:%v", err.Error())
  1014. return false
  1015. }
  1016. if (user.ID == job.UserID && permission.AccessMode >= AccessModeWrite) || user.IsAdmin || permission.AccessMode >= AccessModeAdmin {
  1017. return true
  1018. }
  1019. return false
  1020. }