You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

cloudbrain.go 30 kB

4 years ago
4 years ago
5 years ago
4 years ago
4 years ago
5 years ago
4 years ago
4 years ago
4 years ago
5 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
5 years ago
5 years ago
4 years ago
5 years ago
4 years ago
5 years ago
4 years ago
4 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952
  1. package models
  2. import (
  3. "encoding/json"
  4. "fmt"
  5. "strings"
  6. "time"
  7. "xorm.io/builder"
  8. "xorm.io/xorm"
  9. "code.gitea.io/gitea/modules/log"
  10. "code.gitea.io/gitea/modules/setting"
  11. "code.gitea.io/gitea/modules/timeutil"
  12. )
// CloudbrainStatus is the string form of a cloudbrain job state.
type CloudbrainStatus string

// JobType is the string form of a cloudbrain job kind.
type JobType string

// ModelArtsJobStatus is the string form of a ModelArts job state.
type ModelArtsJobStatus string

const (
	// Cloudbrain job states.
	JobWaiting   CloudbrainStatus = "WAITING"
	JobStopped   CloudbrainStatus = "STOPPED"
	JobSucceeded CloudbrainStatus = "SUCCEEDED"
	JobFailed    CloudbrainStatus = "FAILED"
	JobRunning   CloudbrainStatus = "RUNNING"

	// Cloudbrain job kinds.
	JobTypeDebug        JobType = "DEBUG"
	JobTypeBenchmark    JobType = "BENCHMARK"
	JobTypeSnn4imagenet JobType = "SNN4IMAGENET"

	// ModelArts job states.
	ModelArtsCreateQueue  ModelArtsJobStatus = "CREATE_QUEUING" // free resource: queued for creation
	ModelArtsCreating     ModelArtsJobStatus = "CREATING"       // being created
	ModelArtsCreateFailed ModelArtsJobStatus = "CREATE_FAILED"  // creation failed
	ModelArtsStartQueuing ModelArtsJobStatus = "START_QUEUING"  // free resource: queued for start
	ModelArtsReadyToStart ModelArtsJobStatus = "READY_TO_START" // free resource: waiting to start
	ModelArtsStarting     ModelArtsJobStatus = "STARTING"       // starting
	ModelArtsRestarting   ModelArtsJobStatus = "RESTARTING"     // restarting
	ModelArtsStartFailed  ModelArtsJobStatus = "START_FAILED"   // start failed
	ModelArtsRunning      ModelArtsJobStatus = "RUNNING"        // running
	ModelArtsStopping     ModelArtsJobStatus = "STOPPING"       // stopping
	ModelArtsStopped      ModelArtsJobStatus = "STOPPED"        // stopped
	ModelArtsUnavailable  ModelArtsJobStatus = "UNAVAILABLE"    // faulty
	ModelArtsDeleted      ModelArtsJobStatus = "DELETED"        // deleted
	ModelArtsResizing     ModelArtsJobStatus = "RESIZING"       // specification change in progress
	ModelArtsResizFailed  ModelArtsJobStatus = "RESIZE_FAILED"  // specification change failed
)
// Cloudbrain is the xorm-mapped record of a single job.
//
// CanDebug, CanDel, User and Repo are tagged `xorm:"-"`, so they are not
// mapped to columns. NOTE(review): they are presumably filled in by callers
// for rendering — confirm against the routers.
type Cloudbrain struct {
	ID          int64              `xorm:"pk autoincr"`
	JobID       string             `xorm:"INDEX NOT NULL"`
	JobType     string             `xorm:"INDEX NOT NULL DEFAULT 'DEBUG'"`
	JobName     string             `xorm:"INDEX"`
	Status      string             `xorm:"INDEX"`
	UserID      int64              `xorm:"INDEX"`
	RepoID      int64              `xorm:"INDEX"`
	SubTaskName string             `xorm:"INDEX"`
	ContainerID string
	ContainerIp string
	CreatedUnix timeutil.TimeStamp `xorm:"INDEX created"`
	UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"`
	DeletedAt   time.Time          `xorm:"deleted"`
	CanDebug    bool               `xorm:"-"`
	CanDel      bool               `xorm:"-"`
	Type        int                `xorm:"INDEX DEFAULT 0"`
	VersionID   int64              `xorm:"INDEX DEFAULT 0"`
	VersionName string
	Uuid        string

	User *User       `xorm:"-"`
	Repo *Repository `xorm:"-"`
}

// CloudbrainInfo is the row shape produced by joining cloudbrain with user:
// both embedded structs are tagged `xorm:"extends"`.
type CloudbrainInfo struct {
	Cloudbrain `xorm:"extends"`
	User       `xorm:"extends"`
}
// CloudBrainLoginResult is a code/message/payload response envelope.
// Unlike the other result types in this file, its fields carry no JSON tags.
type CloudBrainLoginResult struct {
	Code    string
	Msg     string
	Payload map[string]interface{}
}
// TaskRole describes one task role of a job: how many tasks it has, the
// resources each task requests and the command it runs.
type TaskRole struct {
	Name                  string `json:"name"`
	TaskNumber            int    `json:"taskNumber"`
	MinSucceededTaskCount int    `json:"minSucceededTaskCount"`
	MinFailedTaskCount    int    `json:"minFailedTaskCount"`
	CPUNumber             int    `json:"cpuNumber"`
	GPUNumber             int    `json:"gpuNumber"`
	MemoryMB              int    `json:"memoryMB"`
	ShmMB                 int    `json:"shmMB"`
	Command               string `json:"command"`
	NeedIBDevice          bool   `json:"needIBDevice"`
	IsMainRole            bool   `json:"isMainRole"`
	UseNNI                bool   `json:"useNNI"`
}

// StHostPath maps a host path to a mount path, optionally read-only.
type StHostPath struct {
	Path      string `json:"path"`
	MountPath string `json:"mountPath"`
	ReadOnly  bool   `json:"readOnly"`
}

// Volume wraps a single host-path mount.
type Volume struct {
	HostPath StHostPath `json:"hostPath"`
}

// CreateJobParams is the JSON body describing a job to create: its name,
// image, GPU type, task roles and volumes.
type CreateJobParams struct {
	JobName    string     `json:"jobName"`
	RetryCount int8       `json:"retryCount"`
	GpuType    string     `json:"gpuType"`
	Image      string     `json:"image"`
	TaskRoles  []TaskRole `json:"taskRoles"`
	Volumes    []Volume   `json:"volumes"`
}
// CreateJobResult is a code/msg/payload response envelope with an untyped
// payload.
type CreateJobResult struct {
	Code    string                 `json:"code"`
	Msg     string                 `json:"msg"`
	Payload map[string]interface{} `json:"payload"`
}

// GetJobResult is a code/msg/payload response envelope with an untyped
// payload.
type GetJobResult struct {
	Code    string                 `json:"code"`
	Msg     string                 `json:"msg"`
	Payload map[string]interface{} `json:"payload"`
}

// GetImagesResult is a code/msg response envelope whose payload is a page of
// images.
type GetImagesResult struct {
	Code    string           `json:"code"`
	Msg     string           `json:"msg"`
	Payload GetImagesPayload `json:"payload"`
}

// GetImagesPayload is one page of images; the image list is decoded from the
// "rows" key.
type GetImagesPayload struct {
	Count      int          `json:"count"`
	TotalPages int          `json:"totalPages,omitempty"`
	ImageInfo  []*ImageInfo `json:"rows"`
}
// CloudbrainsOptions holds the filters and paging used by Cloudbrains.
type CloudbrainsOptions struct {
	ListOptions
	RepoID        int64 // include all repos if empty
	UserID        int64
	JobID         int64
	SortType      string
	CloudbrainIDs []int64
	//	JobStatus     CloudbrainStatus
	Type int
}
  133. type TaskPod struct {
  134. TaskRoleStatus struct {
  135. Name string `json:"name"`
  136. } `json:"taskRoleStatus"`
  137. TaskStatuses []struct {
  138. TaskIndex int `json:"taskIndex"`
  139. PodUID string `json:"podUid"`
  140. PodIP string `json:"podIp"`
  141. PodName string `json:"podName"`
  142. ContainerID string `json:"containerId"`
  143. ContainerIP string `json:"containerIp"`
  144. ContainerGpus string `json:"containerGpus"`
  145. State string `json:"state"`
  146. StartAt time.Time `json:"startAt"`
  147. FinishedAt time.Time `json:"finishedAt"`
  148. ExitCode int `json:"exitCode"`
  149. ExitDiagnostics string `json:"exitDiagnostics"`
  150. RetriedCount int `json:"retriedCount"`
  151. StartTime string
  152. FinishedTime string
  153. } `json:"taskStatuses"`
  154. }
  155. type TaskInfo struct {
  156. Username string `json:"username"`
  157. TaskName string `json:"task_name"`
  158. CodeName string `json:"code_name"`
  159. BenchmarkCategory []string `json:"selected_category"`
  160. CodeLink string `json:"code_link"`
  161. GpuType string `json:"gpu_type"`
  162. }
  163. func ConvertToTaskPod(input map[string]interface{}) (TaskPod, error) {
  164. data, _ := json.Marshal(input)
  165. var taskPod TaskPod
  166. err := json.Unmarshal(data, &taskPod)
  167. taskPod.TaskStatuses[0].StartTime = time.Unix(taskPod.TaskStatuses[0].StartAt.Unix()+8*3600, 0).UTC().Format("2006-01-02 15:04:05")
  168. taskPod.TaskStatuses[0].FinishedTime = time.Unix(taskPod.TaskStatuses[0].FinishedAt.Unix()+8*3600, 0).UTC().Format("2006-01-02 15:04:05")
  169. //if the task is not finished or stopped,the cloudbrain renturns 0001-01-01 08:00:00, the finishedTime shows with -
  170. if strings.HasPrefix(taskPod.TaskStatuses[0].FinishedTime, "0001") {
  171. taskPod.TaskStatuses[0].FinishedTime = "-"
  172. }
  173. return taskPod, err
  174. }
  175. type JobResultPayload struct {
  176. ID string `json:"id"`
  177. Name string `json:"name"`
  178. Platform string `json:"platform"`
  179. JobStatus struct {
  180. Username string `json:"username"`
  181. State string `json:"state"`
  182. SubState string `json:"subState"`
  183. ExecutionType string `json:"executionType"`
  184. Retries int `json:"retries"`
  185. CreatedTime int64 `json:"createdTime"`
  186. CompletedTime int64 `json:"completedTime"`
  187. AppID string `json:"appId"`
  188. AppProgress string `json:"appProgress"`
  189. AppTrackingURL string `json:"appTrackingUrl"`
  190. AppLaunchedTime int64 `json:"appLaunchedTime"`
  191. AppCompletedTime interface{} `json:"appCompletedTime"`
  192. AppExitCode int `json:"appExitCode"`
  193. AppExitDiagnostics string `json:"appExitDiagnostics"`
  194. AppExitType interface{} `json:"appExitType"`
  195. VirtualCluster string `json:"virtualCluster"`
  196. StartTime string
  197. EndTime string
  198. } `json:"jobStatus"`
  199. TaskRoles map[string]interface{} `json:"taskRoles"`
  200. Resource struct {
  201. CPU int `json:"cpu"`
  202. Memory string `json:"memory"`
  203. NvidiaComGpu int `json:"nvidia.com/gpu"`
  204. } `json:"resource"`
  205. Config struct {
  206. Image string `json:"image"`
  207. JobID string `json:"jobId"`
  208. GpuType string `json:"gpuType"`
  209. JobName string `json:"jobName"`
  210. JobType string `json:"jobType"`
  211. TaskRoles []struct {
  212. Name string `json:"name"`
  213. ShmMB int `json:"shmMB"`
  214. Command string `json:"command"`
  215. MemoryMB int `json:"memoryMB"`
  216. CPUNumber int `json:"cpuNumber"`
  217. GpuNumber int `json:"gpuNumber"`
  218. IsMainRole bool `json:"isMainRole"`
  219. TaskNumber int `json:"taskNumber"`
  220. NeedIBDevice bool `json:"needIBDevice"`
  221. MinFailedTaskCount int `json:"minFailedTaskCount"`
  222. MinSucceededTaskCount int `json:"minSucceededTaskCount"`
  223. } `json:"taskRoles"`
  224. RetryCount int `json:"retryCount"`
  225. } `json:"config"`
  226. Userinfo struct {
  227. User string `json:"user"`
  228. OrgID string `json:"org_id"`
  229. } `json:"userinfo"`
  230. }
  231. func ConvertToJobResultPayload(input map[string]interface{}) (JobResultPayload, error) {
  232. data, _ := json.Marshal(input)
  233. var jobResultPayload JobResultPayload
  234. err := json.Unmarshal(data, &jobResultPayload)
  235. jobResultPayload.JobStatus.StartTime = time.Unix(jobResultPayload.JobStatus.CreatedTime/1000, 0).Format("2006-01-02 15:04:05")
  236. jobResultPayload.JobStatus.EndTime = time.Unix(jobResultPayload.JobStatus.CompletedTime/1000, 0).Format("2006-01-02 15:04:05")
  237. return jobResultPayload, err
  238. }
// ImagesResultPayload is a list of image descriptions.
//
// NOTE(review): the list is decoded from the "taskStatuses" key, which looks
// copied from TaskPod — confirm against the upstream response.
type ImagesResultPayload struct {
	Images []struct {
		ID          int    `json:"id"`
		Name        string `json:"name"`
		Place       string `json:"place"`
		Description string `json:"description"`
		Provider    string `json:"provider"`
		Createtime  string `json:"createtime"`
		Remark      string `json:"remark"`
	} `json:"taskStatuses"`
}

// ImageInfo describes one image. PlaceView has no JSON tag.
type ImageInfo struct {
	ID          int    `json:"id"`
	Name        string `json:"name"`
	Place       string `json:"place"`
	Description string `json:"description"`
	Provider    string `json:"provider"`
	Createtime  string `json:"createtime"`
	Remark      string `json:"remark"`
	IsPublic    int    `json:"isPublic"`
	PlaceView   string
}
// Categories is a list of Category entries decoded from the "category" key.
type Categories struct {
	Category []*Category `json:"category"`
}

// Category is an id/value pair.
type Category struct {
	Id    int    `json:"id"`
	Value string `json:"value"`
}

// GpuInfos is a list of GpuInfo entries decoded from the "gpu_type" key.
type GpuInfos struct {
	GpuInfo []*GpuInfo `json:"gpu_type"`
}

// GpuInfo is an id/value pair with an associated queue name.
type GpuInfo struct {
	Id    int    `json:"id"`
	Value string `json:"value"`
	Queue string `json:"queue"`
}

// ResourceSpecs is a list of ResourceSpec entries.
//
// NOTE(review): the JSON key is spelled "resorce_specs"; it must match
// whatever produces the JSON, so confirm before correcting the spelling.
type ResourceSpecs struct {
	ResourceSpec []*ResourceSpec `json:"resorce_specs"`
}

// ResourceSpec is one resource specification: CPU and GPU counts plus memory
// and shared-memory sizes in MiB.
type ResourceSpec struct {
	Id          int `json:"id"`
	CpuNum      int `json:"cpu"`
	GpuNum      int `json:"gpu"`
	MemMiB      int `json:"memMiB"`
	ShareMemMiB int `json:"shareMemMiB"`
}

// FlavorInfos is a list of FlavorInfo entries decoded from the "flavor_info"
// key.
type FlavorInfos struct {
	FlavorInfo []*FlavorInfo `json:"flavor_info"`
}

// FlavorInfo is an id/value pair.
type FlavorInfo struct {
	Id    int    `json:"id"`
	Value string `json:"value"`
}

// PoolInfos is a list of PoolInfo entries decoded from the "pool_info" key.
type PoolInfos struct {
	PoolInfo []*PoolInfo `json:"pool_info"`
}

// PoolInfo identifies a resource pool by id, name and type.
type PoolInfo struct {
	PoolId   string `json:"pool_id"`
	PoolName string `json:"pool_name"`
	PoolType string `json:"pool_type"`
}
// CommitImageParams is the JSON body for committing a task container as an
// image: the container's IP and id plus the new image's tag and description.
type CommitImageParams struct {
	Ip               string `json:"ip"`
	TaskContainerId  string `json:"taskContainerId"`
	ImageTag         string `json:"imageTag"`
	ImageDescription string `json:"imageDescription"`
}

// CommitImageResult is a code/msg/payload response envelope with an untyped
// payload.
type CommitImageResult struct {
	Code    string                 `json:"code"`
	Msg     string                 `json:"msg"`
	Payload map[string]interface{} `json:"payload"`
}

// CloudBrainResult is a code/msg response envelope without a payload.
type CloudBrainResult struct {
	Code string `json:"code"`
	Msg  string `json:"msg"`
}
// CreateNotebookParams is the JSON body describing a notebook to create.
// Note that JobName is encoded under the "name" key.
type CreateNotebookParams struct {
	JobName     string    `json:"name"`
	Description string    `json:"description"`
	ProfileID   string    `json:"profile_id"`
	Flavor      string    `json:"flavor"`
	Spec        Spec      `json:"spec"`
	Workspace   Workspace `json:"workspace"`
	Pool        Pool      `json:"pool"`
}

// Pool identifies a resource pool by id, name and type.
type Pool struct {
	ID   string `json:"id"`
	Name string `json:"name"`
	Type string `json:"type"`
}

// Workspace identifies a workspace by id.
type Workspace struct {
	ID string `json:"id"`
}

// Spec groups a notebook's storage and auto-stop settings.
type Spec struct {
	Storage  Storage  `json:"storage"`
	AutoStop AutoStop `json:"auto_stop"`
}

// AutoStop holds the auto-stop switch and its duration.
// NOTE(review): the unit of Duration is not stated here; the result types
// below annotate their duration field as seconds — confirm.
type AutoStop struct {
	Enable   bool `json:"enable"`
	Duration int  `json:"duration"`
}

// Storage describes a storage of a given type at a location.
type Storage struct {
	Type     string   `json:"type"`
	Location Location `json:"location"`
}

// Location is a storage path.
type Location struct {
	Path string `json:"path"`
}

// NotebookResult carries only the error code and message of a notebook
// response.
type NotebookResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_msg"`
}
// CreateNotebookResult is the decoded response for a created notebook:
// error fields, identity, status, timestamps, profile and flavor details.
type CreateNotebookResult struct {
	ErrorCode             string `json:"error_code"`
	ErrorMsg              string `json:"error_msg"`
	ID                    string `json:"id"`
	Name                  string `json:"name"`
	Description           string `json:"description"`
	Status                string `json:"status"`
	CreationTimestamp     string `json:"creation_timestamp"`
	LatestUpdateTimestamp string `json:"latest_update_timestamp"`
	Profile               struct {
		ID          string `json:"id"`
		Name        string `json:"name"`
		Description string `json:"description"`
		DeType      string `json:"de_type"`
		FlavorType  string `json:"flavor_type"`
	} `json:"profile"`
	Flavor        string `json:"flavor"`
	FlavorDetails struct {
		Name          string `json:"name"`
		Status        string `json:"status"`
		QueuingNum    int    `json:"queuing_num"`
		QueueLeftTime int    `json:"queue_left_time"` // seconds
		Duration      int    `json:"duration"`        // auto-stop time, in seconds
	} `json:"flavor_details"`
}
// GetNotebookResult is the decoded detail of a notebook: error fields,
// identity, status, timestamps, profile, flavor details, queuing information
// and access annotations.
//
// CreateTime, LatestUpdateTime, QueuingInfo.BeginTime and
// QueuingInfo.EndTime carry no JSON tag.
// NOTE(review): presumably they are display strings filled in by callers
// from the matching timestamp fields — confirm.
type GetNotebookResult struct {
	ErrorCode             string `json:"error_code"`
	ErrorMsg              string `json:"error_msg"`
	ID                    string `json:"id"`
	Name                  string `json:"name"`
	Description           string `json:"description"`
	Status                string `json:"status"`
	CreationTimestamp     string `json:"creation_timestamp"`
	CreateTime            string
	LatestUpdateTimestamp string `json:"latest_update_timestamp"`
	LatestUpdateTime      string
	Profile               struct {
		ID          string `json:"id"`
		Name        string `json:"name"`
		Description string `json:"description"`
		DeType      string `json:"de_type"`
		FlavorType  string `json:"flavor_type"`
	} `json:"profile"`
	Flavor        string `json:"flavor"`
	FlavorDetails struct {
		Name          string `json:"name"`
		Status        string `json:"status"`
		QueuingNum    int    `json:"queuing_num"`
		QueueLeftTime int    `json:"queue_left_time"` // seconds
		Duration      int    `json:"duration"`        // auto-stop time, in seconds
	} `json:"flavor_details"`
	QueuingInfo struct {
		ID             string `json:"id"`
		Name           string `json:"name"`
		Flavor         string `json:"flavor"`
		DeType         string `json:"de_type"`
		Status         string `json:"status"`
		BeginTimestamp int    `json:"begin_timestamp"` // time the instance entered the queue
		BeginTime      string
		RemainTime     int `json:"remain_time"`   // remaining time of the instance
		EndTimestamp   int `json:"end_timestamp"` //
		EndTime        string
		Rank           int `json:"rank"` // rank of the instance in the queue
	} `json:"queuing_info"`
	Spec struct {
		Annotations struct {
			TargetDomain string `json:"target_domain"`
			Url          string `json:"url"`
		} `json:"annotations"`
	} `json:"spec"`
}
// GetTokenParams is the JSON body of a token request; everything is nested
// under the "auth" key.
type GetTokenParams struct {
	Auth Auth `json:"auth"`
}

// Auth pairs the identity to authenticate with the scope requested.
type Auth struct {
	Identity Identity `json:"identity"`
	Scope    Scope    `json:"scope"`
}

// Scope limits a token request to a project.
type Scope struct {
	Project Project `json:"project"`
}

// Project names a project.
type Project struct {
	Name string `json:"name"`
}

// Identity lists the authentication methods and the password credentials.
type Identity struct {
	Methods  []string `json:"methods"`
	Password Password `json:"password"`
}

// Password wraps the user whose password is used to authenticate.
type Password struct {
	User NotebookUser `json:"user"`
}

// NotebookUser holds a user name, password and domain.
type NotebookUser struct {
	Name     string `json:"name"`
	Password string `json:"password"`
	Domain   Domain `json:"domain"`
}

// Domain names a domain.
type Domain struct {
	Name string `json:"name"`
}
// Action names; NotebookAction.Action is a plain string field.
// NOTE(review): presumably it is set to one of these values — confirm
// against the callers.
const (
	ActionStart   = "start"
	ActionStop    = "stop"
	ActionRestart = "restart"
	ActionQueue   = "queue"
	ActionDequeue = "dequeue"
)

// NotebookAction is the JSON body naming an action to perform on a notebook.
type NotebookAction struct {
	Action string `json:"action"`
}

// NotebookActionResult is the decoded response to a notebook action: error
// fields plus the current status and previous state.
type NotebookActionResult struct {
	ErrorCode     string `json:"error_code"`
	ErrorMsg      string `json:"error_msg"`
	CurrentStatus string `json:"current_status"`
	PreviousState string `json:"previous_state"`
}

// NotebookGetJobTokenResult is the decoded response carrying a token.
type NotebookGetJobTokenResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_msg"`
	Token     string `json:"token"`
}

// NotebookDelResult is the decoded response carrying an instance id.
type NotebookDelResult struct {
	InstanceID string `json:"instance_id"`
}
// CreateTrainJobParams is the JSON body describing a training job to create.
type CreateTrainJobParams struct {
	JobName     string `json:"job_name"`
	Description string `json:"job_desc"`
	Config      Config `json:"config"`
	WorkspaceID string `json:"workspace_id"`
}

// Config is the run configuration of a training job. Fields that are
// commented out are kept as they appear in the original declaration.
type Config struct {
	WorkServerNum int         `json:"worker_server_num"`
	AppUrl        string      `json:"app_url"`       // code directory of the training job
	BootFileUrl   string      `json:"boot_file_url"` // boot file of the training job; must be under the code directory
	Parameter     []Parameter `json:"parameter"`
	DataUrl       string      `json:"data_url"` // OBS path URL of the dataset the training job needs
	//DatasetID string `json:"dataset_id"`
	//DataVersionID string `json:"dataset_version_id"`
	//DataSource []DataSource `json:"data_source"`
	//SpecID int64 `json:"spec_id"`
	EngineID int64 `json:"engine_id"`
	//ModelID int64 `json:"model_id"`
	TrainUrl string `json:"train_url"` // OBS path URL for the training job's output files
	LogUrl   string `json:"log_url"`
	//UserImageUrl string `json:"user_image_url"`
	//UserCommand string `json:"user_command"`
	CreateVersion bool `json:"create_version"`
	//Volumes []Volumes `json:"volumes"`
	Flavor Flavor `json:"flavor"`
	PoolID string `json:"pool_id"`
}
// CreateConfigParams is the JSON body describing a training-job
// configuration to save. It has the same run settings as Config (minus
// CreateVersion) plus a configuration name and description.
type CreateConfigParams struct {
	ConfigName    string      `json:"config_name"`
	Description   string      `json:"config_desc"`
	WorkServerNum int         `json:"worker_server_num"`
	AppUrl        string      `json:"app_url"`       // code directory of the training job
	BootFileUrl   string      `json:"boot_file_url"` // boot file of the training job; must be under the code directory
	Parameter     []Parameter `json:"parameter"`
	DataUrl       string      `json:"data_url"` // OBS path URL of the dataset the training job needs
	//DatasetID string `json:"dataset_id"`
	//DataVersionID string `json:"dataset_version_id"`
	//DataSource []DataSource `json:"data_source"`
	//SpecID int64 `json:"spec_id"`
	EngineID int64 `json:"engine_id"`
	//ModelID int64 `json:"model_id"`
	TrainUrl string `json:"train_url"` // OBS path URL for the training job's output files
	LogUrl   string `json:"log_url"`
	//UserImageUrl string `json:"user_image_url"`
	//UserCommand string `json:"user_command"`
	//CreateVersion bool `json:"create_version"`
	//Volumes []Volumes `json:"volumes"`
	Flavor Flavor `json:"flavor"`
	PoolID string `json:"pool_id"`
}
// Parameter is a label/value pair.
type Parameter struct {
	Label string `json:"label"`
	Value string `json:"value"`
}

// Parameters is a list of Parameter entries decoded from the "parameter" key.
type Parameters struct {
	Parameter []Parameter `json:"parameter"`
}

// DataSource identifies a dataset by id and version, together with its type
// and data URL.
type DataSource struct {
	DatasetID      string `json:"dataset_id"`
	DatasetVersion string `json:"dataset_version"`
	Type           string `json:"type"`
	DataUrl        string `json:"data_url"`
}

// Volumes groups an NFS mount and a host-path mount.
type Volumes struct {
	Nfs      Nfs      `json:"nfs"`
	HostPath HostPath `json:"host_path"`
}

// Nfs describes an NFS mount: id, source and destination paths, and whether
// it is read-only.
type Nfs struct {
	ID         string `json:"id"`
	SourcePath string `json:"src_path"`
	DestPath   string `json:"dest_path"`
	ReadOnly   bool   `json:"read_only"`
}

// HostPath describes a host-path mount: source and destination paths, and
// whether it is read-only.
type HostPath struct {
	SourcePath string `json:"src_path"`
	DestPath   string `json:"dest_path"`
	ReadOnly   bool   `json:"read_only"`
}

// Flavor identifies a resource flavor by its code.
type Flavor struct {
	Code string `json:"code"`
}
// CreateTrainJobResult is the decoded response for a created training job:
// error fields, success flag, and the job's identity, status and version.
type CreateTrainJobResult struct {
	ErrorCode   string `json:"error_code"`
	ErrorMsg    string `json:"error_msg"`
	IsSuccess   bool   `json:"is_success"`
	JobName     string `json:"job_name"`
	JobID       int64  `json:"job_id"`
	Status      int    `json:"status"`
	CreateTime  int64  `json:"create_time"`
	VersionID   int64  `json:"version_id"`
	ResourceID  string `json:"resource_id"`
	VersionName string `json:"version_name"`
}

// CreateTrainJobConfigResult carries only the error fields and success flag.
type CreateTrainJobConfigResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_msg"`
	IsSuccess bool   `json:"is_success"`
}

// GetResourceSpecsResult is the decoded list of resource specifications with
// its total count.
type GetResourceSpecsResult struct {
	ErrorCode      string  `json:"error_code"`
	ErrorMsg       string  `json:"error_msg"`
	IsSuccess      bool    `json:"is_success"`
	SpecTotalCount int     `json:"spec_total_count"`
	Specs          []Specs `json:"specs"`
}

// Specs is one resource specification. IsNoResource is decoded from the
// "no_resource" key.
type Specs struct {
	Core          string `json:"core"`
	Cpu           string `json:"cpu"`
	IsNoResource  bool   `json:"no_resource"`
	GpuType       string `json:"gpu_type"`
	SpecID        int64  `json:"spec_id"`
	GpuNum        int    `json:"gpu_num"`
	SpecCode      string `json:"spec_code"`
	Storage       string `json:"storage"`
	MaxNum        int    `json:"max_num"`
	UnitNum       int    `json:"unit_num"`
	InterfaceType int    `json:"interface_type"`
}
// GetConfigListResult is the decoded list of saved configurations with its
// total count.
type GetConfigListResult struct {
	ErrorCode        string       `json:"error_code"`
	ErrorMsg         string       `json:"error_msg"`
	IsSuccess        bool         `json:"is_success"`
	ConfigTotalCount int          `json:"config_total_count"`
	ParaConfigs      []ParaConfig `json:"configs"`
}

// ParaConfig is the summary of one saved configuration.
// Result has no JSON tag.
// NOTE(review): presumably callers attach the full configuration detail to
// Result after a separate lookup — confirm.
type ParaConfig struct {
	ConfigName    string `json:"config_name"`
	ConfigDesc    string `json:"config_desc"`
	CreateTime    int64  `json:"create_time"`
	EngineType    int    `json:"engine_type"`
	EngineName    string `json:"engine_name"`
	EngineId      int64  `json:"engine_id"`
	EngineVersion string `json:"engine_version"`
	UserImageUrl  string `json:"user_image_url"`
	UserCommand   string `json:"user_command"`
	Result        GetConfigResult
}

// GetConfigResult is the decoded detail of one saved configuration: error
// fields, success flag and the run settings.
type GetConfigResult struct {
	ErrorCode     string      `json:"error_code"`
	ErrorMsg      string      `json:"error_msg"`
	IsSuccess     bool        `json:"is_success"`
	ConfigName    string      `json:"config_name"`
	Description   string      `json:"config_desc"`
	WorkServerNum int         `json:"worker_server_num"`
	AppUrl        string      `json:"app_url"`       // code directory of the training job
	BootFileUrl   string      `json:"boot_file_url"` // boot file of the training job; must be under the code directory
	Parameter     []Parameter `json:"parameter"`
	DataUrl       string      `json:"data_url"` // OBS path URL of the dataset the training job needs
	//DatasetID string `json:"dataset_id"`
	//DataVersionID string `json:"dataset_version_id"`
	//DataSource []DataSource `json:"data_source"`
	//SpecID int64 `json:"spec_id"`
	EngineID int64 `json:"engine_id"`
	//ModelID int64 `json:"model_id"`
	TrainUrl string `json:"train_url"` // OBS path URL for the training job's output files
	LogUrl   string `json:"log_url"`
	//UserImageUrl string `json:"user_image_url"`
	//UserCommand string `json:"user_command"`
	//CreateVersion bool `json:"create_version"`
	//Volumes []Volumes `json:"volumes"`
	Flavor Flavor `json:"flavor"`
	PoolID string `json:"pool_id"`
}
// ErrorResult carries an error code, message and success flag. Unlike the
// other result types in this file, the message is decoded from
// "error_message" rather than "error_msg".
type ErrorResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_message"`
	IsSuccess bool   `json:"is_success"`
}
// GetTrainJobResult is the decoded detail of a training job version.
//
// Status, CreateTime and DatasetName carry no JSON tag.
// NOTE(review): presumably callers derive Status from IntStatus and
// CreateTime from LongCreateTime for display — confirm.
type GetTrainJobResult struct {
	IsSuccess      bool   `json:"is_success"`
	JobName        string `json:"job_name"`
	JobID          int64  `json:"job_id"`
	Description    string `json:"job_desc"`
	IntStatus      int    `json:"status"`
	Status         string
	LongCreateTime int64 `json:"create_time"`
	CreateTime     string
	Duration       int64       `json:"duration"` // run time of the training job, in milliseconds
	VersionID      int64       `json:"version_id"`
	ResourceID     string      `json:"resource_id"`
	VersionName    string      `json:"version_name"`
	PreVersionID   int64       `json:"pre_version_id"`
	WorkServerNum  int         `json:"worker_server_num"`
	AppUrl         string      `json:"app_url"`       // code directory of the training job
	BootFileUrl    string      `json:"boot_file_url"` // boot file of the training job; must be under the code directory
	Parameter      []Parameter `json:"parameter"`
	DataUrl        string      `json:"data_url"` // OBS path URL of the dataset the training job needs
	//DatasetID string `json:"dataset_id"`
	//DataVersionID string `json:"dataset_version_id"`
	//DataSource []DataSource `json:"data_source"`
	//SpecID int64 `json:"spec_id"`
	EngineID      int64  `json:"engine_id"`
	EngineName    string `json:"engine_name"`
	EngineVersion string `json:"engine_version"`
	//ModelID int64 `json:"model_id"`
	TrainUrl string `json:"train_url"` // OBS path URL for the training job's output files
	LogUrl   string `json:"log_url"`
	//UserImageUrl string `json:"user_image_url"`
	//UserCommand string `json:"user_command"`
	//Volumes []Volumes `json:"volumes"`
	Flavor       Flavor `json:"flavor"`
	PoolID       string `json:"pool_id"`
	PoolName     string `json:"pool_name"`
	NasMountPath string `json:"nas_mount_path"`
	NasShareAddr string `json:"nas_share_addr"`
	DatasetName  string
}
// GetTrainJobLogResult is one decoded chunk of training-job log: the content,
// its line count and the start and end line markers.
type GetTrainJobLogResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_msg"`
	IsSuccess bool   `json:"is_success"`
	Content   string `json:"content"`
	Lines     int    `json:"lines"`
	StartLine string `json:"start_line"`
	EndLine   string `json:"end_line"`
}

// GetTrainJobLogFileNamesResult is the decoded list of log file names.
type GetTrainJobLogFileNamesResult struct {
	ErrorCode   string   `json:"error_code"`
	ErrorMsg    string   `json:"error_msg"`
	IsSuccess   bool     `json:"is_success"`
	LogFileList []string `json:"log_file_list"`
}

// TrainJobResult carries only the error fields and success flag.
type TrainJobResult struct {
	ErrorCode string `json:"error_code"`
	ErrorMsg  string `json:"error_msg"`
	IsSuccess bool   `json:"is_success"`
}

// LogFile names a log file.
type LogFile struct {
	Name string
}
  705. func Cloudbrains(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int64, error) {
  706. sess := x.NewSession()
  707. defer sess.Close()
  708. var cond = builder.NewCond()
  709. if opts.RepoID > 0 {
  710. cond = cond.And(
  711. builder.Eq{"cloudbrain.repo_id": opts.RepoID},
  712. )
  713. }
  714. if opts.UserID > 0 {
  715. cond = cond.And(
  716. builder.Eq{"cloudbrain.user_id": opts.UserID},
  717. )
  718. }
  719. if (opts.JobID) > 0 {
  720. cond = cond.And(
  721. builder.Eq{"cloudbrain.job_id": opts.JobID},
  722. )
  723. }
  724. if (opts.Type) >= 0 {
  725. cond = cond.And(
  726. builder.Eq{"cloudbrain.type": opts.Type},
  727. )
  728. }
  729. // switch opts.JobStatus {
  730. // case JobWaiting:
  731. // cond.And(builder.Eq{"cloudbrain.status": int(JobWaiting)})
  732. // case JobFailed:
  733. // cond.And(builder.Eq{"cloudbrain.status": int(JobFailed)})
  734. // case JobStopped:
  735. // cond.And(builder.Eq{"cloudbrain.status": int(JobStopped)})
  736. // case JobSucceeded:
  737. // cond.And(builder.Eq{"cloudbrain.status": int(JobSucceeded)})
  738. // }
  739. if len(opts.CloudbrainIDs) > 0 {
  740. cond = cond.And(builder.In("cloudbrain.id", opts.CloudbrainIDs))
  741. }
  742. count, err := sess.Where(cond).Count(new(Cloudbrain))
  743. if err != nil {
  744. return nil, 0, fmt.Errorf("Count: %v", err)
  745. }
  746. if opts.Page >= 0 && opts.PageSize > 0 {
  747. var start int
  748. if opts.Page == 0 {
  749. start = 0
  750. } else {
  751. start = (opts.Page - 1) * opts.PageSize
  752. }
  753. sess.Limit(opts.PageSize, start)
  754. }
  755. sess.OrderBy("cloudbrain.created_unix DESC")
  756. cloudbrains := make([]*CloudbrainInfo, 0, setting.UI.IssuePagingNum)
  757. if err := sess.Table(&Cloudbrain{}).Where(cond).
  758. Join("left", "`user`", "cloudbrain.user_id = `user`.id").
  759. Find(&cloudbrains); err != nil {
  760. return nil, 0, fmt.Errorf("Find: %v", err)
  761. }
  762. sess.Close()
  763. return cloudbrains, count, nil
  764. }
  765. func CreateCloudbrain(cloudbrain *Cloudbrain) (err error) {
  766. if _, err = x.Insert(cloudbrain); err != nil {
  767. return err
  768. }
  769. return nil
  770. }
  771. func getRepoCloudBrain(cb *Cloudbrain) (*Cloudbrain, error) {
  772. has, err := x.Get(cb)
  773. if err != nil {
  774. return nil, err
  775. } else if !has {
  776. return nil, ErrJobNotExist{}
  777. }
  778. return cb, nil
  779. }
  780. func GetRepoCloudBrainByJobID(repoID int64, jobID string) (*Cloudbrain, error) {
  781. cb := &Cloudbrain{JobID: jobID, RepoID: repoID}
  782. return getRepoCloudBrain(cb)
  783. }
  784. func GetCloudbrainByJobID(jobID string) (*Cloudbrain, error) {
  785. cb := &Cloudbrain{JobID: jobID}
  786. return getRepoCloudBrain(cb)
  787. }
  788. func GetCloudbrainsNeededStopByUserID(userID int64) ([]*Cloudbrain, error) {
  789. cloudBrains := make([]*Cloudbrain, 0)
  790. err := x.Cols("job_id", "status", "type").Where("user_id=? AND status !=?", userID, string(JobStopped)).Find(&cloudBrains)
  791. return cloudBrains, err
  792. }
  793. func GetCloudbrainsNeededStopByRepoID(repoID int64) ([]*Cloudbrain, error) {
  794. cloudBrains := make([]*Cloudbrain, 0)
  795. err := x.Cols("job_id", "status", "type").Where("repo_id=? AND status !=?", repoID, string(JobStopped)).Find(&cloudBrains)
  796. return cloudBrains, err
  797. }
  798. func SetCloudbrainStatusByJobID(jobID string, status CloudbrainStatus) (err error) {
  799. cb := &Cloudbrain{JobID: jobID, Status: string(status)}
  800. _, err = x.Cols("status").Where("cloudbrain.job_id=?", jobID).Update(cb)
  801. return
  802. }
  803. func UpdateJob(job *Cloudbrain) error {
  804. return updateJob(x, job)
  805. }
  806. func updateJob(e Engine, job *Cloudbrain) error {
  807. var sess *xorm.Session
  808. sess = e.Where("job_id = ?", job.JobID)
  809. _, err := sess.Cols("status", "container_id", "container_ip").Update(job)
  810. return err
  811. }
  812. func DeleteJob(job *Cloudbrain) error {
  813. return deleteJob(x, job)
  814. }
  815. func deleteJob(e Engine, job *Cloudbrain) error {
  816. _, err := e.ID(job.ID).Delete(job)
  817. return err
  818. }
  819. func GetCloudbrainByName(jobName string) (*Cloudbrain, error) {
  820. cb := &Cloudbrain{JobName: jobName}
  821. return getRepoCloudBrain(cb)
  822. }
  823. func CanDelJob(isSigned bool, user *User, job *CloudbrainInfo) bool {
  824. if !isSigned || job.Status != string(JobStopped) {
  825. return false
  826. }
  827. repo, err := GetRepositoryByID(job.RepoID)
  828. if err != nil {
  829. log.Error("GetRepositoryByID failed:%v", err.Error())
  830. return false
  831. }
  832. permission, _ := GetUserRepoPermission(repo, user)
  833. if err != nil {
  834. log.Error("GetUserRepoPermission failed:%v", err.Error())
  835. return false
  836. }
  837. if (user.ID == job.UserID && permission.AccessMode >= AccessModeWrite) || user.IsAdmin || permission.AccessMode >= AccessModeAdmin {
  838. return true
  839. }
  840. return false
  841. }