You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

aisafety.go 35 kB

3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028
  1. package repo
  2. import (
  3. "bufio"
  4. "encoding/json"
  5. "errors"
  6. "fmt"
  7. "io"
  8. "io/ioutil"
  9. "net/http"
  10. "os"
  11. "strconv"
  12. "strings"
  13. "time"
  14. "code.gitea.io/gitea/models"
  15. "code.gitea.io/gitea/modules/aisafety"
  16. "code.gitea.io/gitea/modules/cloudbrain"
  17. "code.gitea.io/gitea/modules/context"
  18. "code.gitea.io/gitea/modules/git"
  19. "code.gitea.io/gitea/modules/log"
  20. "code.gitea.io/gitea/modules/modelarts"
  21. "code.gitea.io/gitea/modules/setting"
  22. "code.gitea.io/gitea/modules/storage"
  23. "code.gitea.io/gitea/modules/timeutil"
  24. "code.gitea.io/gitea/modules/util"
  25. "code.gitea.io/gitea/services/cloudbrain/resource"
  26. "code.gitea.io/gitea/services/reward/point/account"
  27. )
  // Template paths rendered by the model-safety evaluation handlers.
  const (
  	// tplModelSafetyTestShow is the detail page of a single evaluation task.
  	tplModelSafetyTestShow = "repo/modelsafety/show"

  	// Creation forms for the GPU and NPU task types.
  	tplModelSafetyTestCreateGpu = "repo/modelsafety/newgpu"
  	tplModelSafetyTestCreateNpu = "repo/modelsafety/newnpu"

  	// Creation forms for the Grampus variants.
  	tplModelSafetyTestCreateGrampusGpu = "repo/modelsafety/newgrampusgpu"
  	tplModelSafetyTestCreateGrampusNpu = "repo/modelsafety/newgrampusnpu"
  )
  35. func GetAiSafetyTaskByJob(job *models.Cloudbrain) {
  36. if job == nil {
  37. log.Error("GetCloudbrainByJobID failed")
  38. return
  39. }
  40. syncAiSafetyTaskStatus(job)
  41. }
  42. func GetAiSafetyTaskTmpl(ctx *context.Context) {
  43. ctx.Data["id"] = ctx.Params(":id")
  44. ctx.Data["PageIsCloudBrain"] = true
  45. ctx.HTML(200, tplModelSafetyTestShow)
  46. }
  47. func GetAiSafetyTask(ctx *context.Context) {
  48. var ID = ctx.Params(":id")
  49. job, err := models.GetCloudbrainByIDWithDeleted(ID)
  50. if err != nil {
  51. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  52. return
  53. }
  54. syncAiSafetyTaskStatus(job)
  55. job, err = models.GetCloudbrainByIDWithDeleted(ID)
  56. job.BenchmarkType = "安全评测"
  57. job.BenchmarkTypeName = "Image Classification"
  58. job.CanModify = cloudbrain.CanModifyJob(ctx, job)
  59. job.CanDel = cloudbrain.CanDeleteJob(ctx, job)
  60. if job.Parameters == "{\"parameter\":[]}" {
  61. job.Parameters = ""
  62. }
  63. s, err := resource.GetCloudbrainSpec(job.ID)
  64. if err == nil {
  65. job.Spec = s
  66. }
  67. user, err := models.GetUserByID(job.UserID)
  68. if err == nil {
  69. tmpUser := &models.User{
  70. Name: user.Name,
  71. }
  72. job.User = tmpUser
  73. }
  74. ctx.JSON(200, job)
  75. }
  76. func StopAiSafetyTask(ctx *context.Context) {
  77. log.Info("start to stop the task.")
  78. var ID = ctx.Params(":id")
  79. task, err := models.GetCloudbrainByIDWithDeleted(ID)
  80. result := make(map[string]interface{})
  81. result["result_code"] = "-1"
  82. if err != nil {
  83. log.Info("query task error.err=" + err.Error())
  84. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  85. result["msg"] = "No such task."
  86. ctx.JSON(200, result)
  87. return
  88. }
  89. if isTaskNotFinished(task.Status) {
  90. if task.Type == models.TypeCloudBrainTwo {
  91. log.Info("start to stop model arts task.")
  92. _, err := modelarts.StopTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10))
  93. if err != nil {
  94. log.Info("stop failed.err=" + err.Error())
  95. }
  96. task.Status = string(models.JobStopped)
  97. if task.EndTime == 0 {
  98. task.EndTime = timeutil.TimeStampNow()
  99. }
  100. task.ComputeAndSetDuration()
  101. err = models.UpdateJob(task)
  102. if err != nil {
  103. log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  104. result["msg"] = "cloudbrain.Stopped_success_update_status_fail"
  105. ctx.JSON(200, result)
  106. return
  107. }
  108. //queryTaskStatusFromCloudbrainTwo(job)
  109. } else if task.Type == models.TypeCloudBrainOne {
  110. if task.Status == string(models.JobStopped) || task.Status == string(models.JobFailed) || task.Status == string(models.JobSucceeded) {
  111. log.Error("the job(%s) has been stopped", task.JobName, ctx.Data["msgID"])
  112. result["msg"] = "cloudbrain.Already_stopped"
  113. ctx.JSON(200, result)
  114. return
  115. }
  116. err := cloudbrain.StopJob(task.JobID)
  117. if err != nil {
  118. log.Error("StopJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  119. result["msg"] = "cloudbrain.Stopped_failed"
  120. ctx.JSON(200, result)
  121. return
  122. }
  123. task.Status = string(models.JobStopped)
  124. if task.EndTime == 0 {
  125. task.EndTime = timeutil.TimeStampNow()
  126. }
  127. task.ComputeAndSetDuration()
  128. err = models.UpdateJob(task)
  129. if err != nil {
  130. log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  131. result["msg"] = "cloudbrain.Stopped_success_update_status_fail"
  132. ctx.JSON(200, result)
  133. return
  134. }
  135. }
  136. } else {
  137. if task.Status == string(models.ModelSafetyTesting) {
  138. //修改为Failed
  139. task.Status = string(models.JobStopped)
  140. if task.EndTime == 0 {
  141. task.EndTime = timeutil.TimeStampNow()
  142. }
  143. task.ComputeAndSetDuration()
  144. err = models.UpdateJob(task)
  145. if err != nil {
  146. log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  147. result["msg"] = "cloudbrain.Stopped_success_update_status_fail"
  148. ctx.JSON(200, result)
  149. return
  150. }
  151. } else {
  152. log.Info("The job is finished. status=" + task.Status)
  153. }
  154. }
  155. result["result_code"] = "0"
  156. result["msg"] = "succeed"
  157. ctx.JSON(200, result)
  158. }
  159. func DelAiSafetyTask(ctx *context.Context) {
  160. var ID = ctx.Params(":id")
  161. task, err := models.GetCloudbrainByIDWithDeleted(ID)
  162. if err != nil {
  163. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  164. ctx.ServerError("No such task.", err)
  165. return
  166. }
  167. if task.Status != string(models.JobStopped) && task.Status != string(models.JobFailed) && task.Status != string(models.JobSucceeded) {
  168. log.Error("the job(%s) has not been stopped", task.JobName, ctx.Data["msgID"])
  169. ctx.ServerError("the job("+task.JobName+") has not been stopped", nil)
  170. return
  171. }
  172. if task.Type == models.TypeCloudBrainOne {
  173. DeleteCloudbrainJobStorage(task.JobName, models.TypeCloudBrainOne)
  174. }
  175. err = models.DeleteJob(task)
  176. if err != nil {
  177. ctx.ServerError(err.Error(), err)
  178. return
  179. }
  180. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/cloudbrain/benchmark")
  181. }
  182. func syncAiSafetyTaskStatus(job *models.Cloudbrain) {
  183. log.Info("start to query safety task status.")
  184. if isTaskNotFinished(job.Status) {
  185. if job.Type == models.TypeCloudBrainTwo {
  186. queryTaskStatusFromCloudbrainTwo(job)
  187. } else if job.Type == models.TypeCloudBrainOne {
  188. queryTaskStatusFromCloudbrain(job)
  189. }
  190. } else {
  191. if job.Status == string(models.ModelSafetyTesting) {
  192. queryTaskStatusFromModelSafetyTestServer(job)
  193. } else {
  194. log.Info("The job is finished. status=" + job.Status)
  195. }
  196. }
  197. }
  198. func TimerHandleModelSafetyTestTask() {
  199. log.Info("start to TimerHandleModelSafetyTestTask")
  200. tasks, err := models.GetModelSafetyTestTask()
  201. if err == nil {
  202. if tasks != nil && len(tasks) > 0 {
  203. for _, job := range tasks {
  204. syncAiSafetyTaskStatus(job)
  205. }
  206. } else {
  207. log.Info("query running model safety test task 0.")
  208. }
  209. } else {
  210. log.Info("query running model safety test task err." + err.Error())
  211. }
  212. }
  213. func queryTaskStatusFromCloudbrainTwo(job *models.Cloudbrain) {
  214. log.Info("The task not finished,name=" + job.DisplayJobName)
  215. result, err := modelarts.GetTrainJob(job.JobID, strconv.FormatInt(job.VersionID, 10))
  216. if err != nil {
  217. log.Info("query train job error." + err.Error())
  218. return
  219. }
  220. job.Status = modelarts.TransTrainJobStatus(result.IntStatus)
  221. job.Duration = result.Duration / 1000
  222. job.TrainJobDuration = result.TrainJobDuration
  223. if job.StartTime == 0 && result.StartTime > 0 {
  224. job.StartTime = timeutil.TimeStamp(result.StartTime / 1000)
  225. }
  226. job.TrainJobDuration = models.ConvertDurationToStr(job.Duration)
  227. if job.EndTime == 0 && models.IsTrainJobTerminal(job.Status) && job.StartTime > 0 {
  228. job.EndTime = job.StartTime.Add(job.Duration)
  229. }
  230. job.CorrectCreateUnix()
  231. if job.Status != string(models.ModelArtsTrainJobCompleted) {
  232. log.Info("CloudbrainTwo task status=" + job.Status)
  233. err = models.UpdateJob(job)
  234. if err != nil {
  235. log.Error("UpdateJob failed:", err)
  236. }
  237. } else {
  238. log.Info("start to deal ModelSafetyTesting, task status=" + job.Status)
  239. job.Status = string(models.ModelSafetyTesting)
  240. err = models.UpdateJob(job)
  241. if err != nil {
  242. log.Error("UpdateJob failed:", err)
  243. }
  244. //send msg to beihang
  245. sendNPUInferenceResultToTest(job)
  246. }
  247. }
  248. func queryTaskStatusFromCloudbrain(job *models.Cloudbrain) {
  249. log.Info("The task not finished,name=" + job.DisplayJobName)
  250. jobResult, err := cloudbrain.GetJob(job.JobID)
  251. result, err := models.ConvertToJobResultPayload(jobResult.Payload)
  252. if err != nil {
  253. log.Error("ConvertToJobResultPayload failed:", err)
  254. return
  255. }
  256. job.Status = result.JobStatus.State
  257. if result.JobStatus.State != string(models.JobWaiting) && result.JobStatus.State != string(models.JobFailed) {
  258. taskRoles := result.TaskRoles
  259. taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{}))
  260. job.Status = taskRes.TaskStatuses[0].State
  261. }
  262. models.ParseAndSetDurationFromCloudBrainOne(result, job)
  263. //updateCloudBrainOneJobTime(job)
  264. log.Info("cloud brain one job status=" + job.Status)
  265. if result.JobStatus.State != string(models.JobSucceeded) {
  266. err = models.UpdateJob(job)
  267. if err != nil {
  268. log.Error("UpdateJob failed:", err)
  269. }
  270. } else {
  271. //
  272. job.Status = string(models.ModelSafetyTesting)
  273. job.EndTime = 0
  274. err = models.UpdateJob(job)
  275. if err != nil {
  276. log.Error("UpdateJob failed:", err)
  277. }
  278. //send msg to beihang
  279. sendGPUInferenceResultToTest(job)
  280. }
  281. }
  282. func queryTaskStatusFromModelSafetyTestServer(job *models.Cloudbrain) {
  283. result, err := aisafety.GetTaskStatus(job.PreVersionName)
  284. if err == nil {
  285. if result.Code == "0" {
  286. if result.Data.Status == 1 {
  287. log.Info("The task is running....")
  288. } else {
  289. job.EndTime = timeutil.TimeStampNow()
  290. job.Duration = (job.EndTime.AsTime().Unix() - job.StartTime.AsTime().Unix()) / 1000
  291. job.TrainJobDuration = models.ConvertDurationToStr(job.Duration)
  292. if result.Data.Code == 0 {
  293. job.ResultJson = result.Data.StandardJson
  294. job.Status = string(models.JobSucceeded)
  295. err = models.UpdateJob(job)
  296. if err != nil {
  297. log.Error("UpdateJob failed:", err)
  298. }
  299. } else {
  300. job.ResultJson = result.Data.Msg
  301. job.Status = string(models.JobFailed)
  302. err = models.UpdateJob(job)
  303. if err != nil {
  304. log.Error("UpdateJob failed:", err)
  305. }
  306. }
  307. }
  308. } else {
  309. log.Info("The task is failed.")
  310. job.Status = string(models.JobFailed)
  311. err = models.UpdateJob(job)
  312. if err != nil {
  313. log.Error("UpdateJob failed:", err)
  314. }
  315. }
  316. } else {
  317. log.Info("The task not found.....")
  318. }
  319. }
  320. func getAisafetyTaskReq(job *models.Cloudbrain) aisafety.TaskReq {
  321. datasetname := job.DatasetName
  322. datasetnames := strings.Split(datasetname, ";")
  323. indicator := job.LabelName
  324. EvalContent := "test1"
  325. if job.Description != "" {
  326. EvalContent = job.Description
  327. }
  328. req := aisafety.TaskReq{
  329. UnionId: job.JobID,
  330. EvalName: job.DisplayJobName,
  331. EvalContent: EvalContent,
  332. TLPath: "test1",
  333. Indicators: strings.Split(indicator, ";"),
  334. CDName: strings.Split(datasetnames[1], ".")[0],
  335. BDName: strings.Split(datasetnames[0], ".")[0] + "基础数据集",
  336. }
  337. log.Info("CDName=" + req.CDName)
  338. log.Info("BDName=" + req.BDName)
  339. return req
  340. }
  341. func sendGPUInferenceResultToTest(job *models.Cloudbrain) {
  342. log.Info("send sendGPUInferenceResultToTest")
  343. req := getAisafetyTaskReq(job)
  344. resultDir := "/result"
  345. prefix := setting.CBCodePathPrefix + job.JobName + resultDir
  346. files, err := storage.GetOneLevelAllObjectUnderDirMinio(setting.Attachment.Minio.Bucket, prefix, "")
  347. if err != nil {
  348. log.Error("query cloudbrain one model failed: %v", err)
  349. return
  350. }
  351. jsonContent := ""
  352. for _, file := range files {
  353. if strings.HasSuffix(file.FileName, "result.json") {
  354. path := storage.GetMinioPath(job.JobName+resultDir+"/", file.FileName)
  355. log.Info("path=" + path)
  356. reader, err := os.Open(path)
  357. defer reader.Close()
  358. if err == nil {
  359. r := bufio.NewReader(reader)
  360. for {
  361. line, error := r.ReadString('\n')
  362. jsonContent += line
  363. if error == io.EOF {
  364. log.Info("read file completed.")
  365. break
  366. }
  367. if error != nil {
  368. log.Info("read file error." + error.Error())
  369. break
  370. }
  371. }
  372. }
  373. break
  374. }
  375. }
  376. if jsonContent != "" {
  377. sendHttpReqToBeihang(job, jsonContent, req)
  378. } else {
  379. updateJobFailed(job, "推理生成的Json数据为空,无法进行评测。")
  380. }
  381. }
  382. func sendNPUInferenceResultToTest(job *models.Cloudbrain) {
  383. log.Info("start to sendNPUInferenceResultToTest")
  384. req := getAisafetyTaskReq(job)
  385. jsonContent := ""
  386. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  387. resultPath := modelarts.JobPath + job.JobName + modelarts.ResultPath + VersionOutputPath + "/result.json"
  388. resultPath = resultPath[1:]
  389. log.Info("bucket=" + setting.Bucket + " resultPath=" + resultPath)
  390. body, err := storage.ObsDownloadAFile(setting.Bucket, resultPath)
  391. if err != nil {
  392. log.Info("ObsDownloadAFile error." + err.Error() + " resultPath=" + resultPath)
  393. } else {
  394. defer body.Close()
  395. var data []byte
  396. p := make([]byte, 4096)
  397. var readErr error
  398. var readCount int
  399. for {
  400. readCount, readErr = body.Read(p)
  401. if readCount > 0 {
  402. data = append(data, p[:readCount]...)
  403. }
  404. if readErr != nil || readCount == 0 {
  405. break
  406. }
  407. }
  408. jsonContent = string(data)
  409. }
  410. if jsonContent != "" {
  411. sendHttpReqToBeihang(job, jsonContent, req)
  412. } else {
  413. updateJobFailed(job, "推理生成的Json数据为空,无法进行评测。")
  414. }
  415. }
  416. func updateJobFailed(job *models.Cloudbrain, msg string) {
  417. log.Info("The json is null. so set it failed.")
  418. //update task failed.
  419. job.Status = string(models.ModelArtsTrainJobFailed)
  420. job.ResultJson = msg
  421. job.EndTime = timeutil.TimeStampNow()
  422. job.Duration = (job.EndTime.AsTime().Unix() - job.StartTime.AsTime().Unix()) / 1000
  423. job.TrainJobDuration = models.ConvertDurationToStr(job.Duration)
  424. err := models.UpdateJob(job)
  425. if err != nil {
  426. log.Error("UpdateJob failed:", err)
  427. }
  428. }
  429. func sendHttpReqToBeihang(job *models.Cloudbrain, jsonContent string, req aisafety.TaskReq) {
  430. log.Info("start to send beihang ...")
  431. serialNo, err := aisafety.CreateSafetyTask(req, jsonContent)
  432. if err == nil {
  433. //update serial no to db
  434. job.PreVersionName = serialNo
  435. err = models.UpdateJob(job)
  436. if err != nil {
  437. log.Error("UpdateJob failed:", err)
  438. }
  439. }
  440. }
  441. func isTaskNotFinished(status string) bool {
  442. if status == string(models.ModelArtsTrainJobRunning) || status == string(models.ModelArtsTrainJobWaiting) {
  443. return true
  444. }
  445. if status == string(models.JobWaiting) || status == string(models.JobRunning) {
  446. return true
  447. }
  448. if status == string(models.ModelArtsTrainJobUnknown) || status == string(models.ModelArtsTrainJobInit) {
  449. return true
  450. }
  451. if status == string(models.ModelArtsTrainJobImageCreating) || status == string(models.ModelArtsTrainJobSubmitTrying) {
  452. return true
  453. }
  454. return false
  455. }
  456. func AiSafetyCreateForGetGPU(ctx *context.Context) {
  457. t := time.Now()
  458. ctx.Data["PageIsCloudBrain"] = true
  459. ctx.Data["IsCreate"] = true
  460. ctx.Data["type"] = models.TypeCloudBrainOne
  461. ctx.Data["compute_resource"] = models.GPUResource
  462. ctx.Data["datasetType"] = models.TypeCloudBrainOne
  463. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.GPUBaseDataSetName
  464. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.GPUBaseDataSetUUID
  465. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.GPUCombatDataSetName
  466. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.GPUCombatDataSetUUID
  467. log.Info("GPUBaseDataSetName=" + setting.ModelSafetyTest.GPUBaseDataSetName)
  468. log.Info("GPUBaseDataSetUUID=" + setting.ModelSafetyTest.GPUBaseDataSetUUID)
  469. log.Info("GPUCombatDataSetName=" + setting.ModelSafetyTest.GPUCombatDataSetName)
  470. log.Info("GPUCombatDataSetUUID=" + setting.ModelSafetyTest.GPUCombatDataSetUUID)
  471. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  472. ctx.Data["display_job_name"] = displayJobName
  473. prepareCloudbrainOneSpecs(ctx)
  474. queuesDetail, _ := cloudbrain.GetQueuesDetail()
  475. if queuesDetail != nil {
  476. ctx.Data["QueuesDetail"] = queuesDetail
  477. reqPara, _ := json.Marshal(queuesDetail)
  478. log.Warn("The GPU WaitCount json:", string(reqPara))
  479. } else {
  480. log.Info("The GPU WaitCount not get")
  481. }
  482. NotStopTaskCount, _ := models.GetModelSafetyCountByUserID(ctx.User.ID)
  483. ctx.Data["NotStopTaskCount"] = NotStopTaskCount
  484. ctx.HTML(200, tplModelSafetyTestCreateGpu)
  485. }
  486. func AiSafetyCreateForGetNPU(ctx *context.Context) {
  487. t := time.Now()
  488. ctx.Data["PageIsCloudBrain"] = true
  489. ctx.Data["IsCreate"] = true
  490. ctx.Data["type"] = models.TypeCloudBrainTwo
  491. ctx.Data["compute_resource"] = models.NPUResource
  492. var displayJobName = jobNamePrefixValid(cutString(ctx.User.Name, 5)) + t.Format("2006010215") + strconv.Itoa(int(t.Unix()))[5:]
  493. ctx.Data["display_job_name"] = displayJobName
  494. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  495. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.NPUBaseDataSetName
  496. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.NPUBaseDataSetUUID
  497. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.NPUCombatDataSetName
  498. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.NPUCombatDataSetUUID
  499. log.Info("NPUBaseDataSetName=" + setting.ModelSafetyTest.NPUBaseDataSetName)
  500. log.Info("NPUBaseDataSetUUID=" + setting.ModelSafetyTest.NPUBaseDataSetUUID)
  501. log.Info("NPUCombatDataSetName=" + setting.ModelSafetyTest.NPUCombatDataSetName)
  502. log.Info("NPUCombatDataSetUUID=" + setting.ModelSafetyTest.NPUCombatDataSetUUID)
  503. var resourcePools modelarts.ResourcePool
  504. if err := json.Unmarshal([]byte(setting.ResourcePools), &resourcePools); err != nil {
  505. ctx.ServerError("json.Unmarshal failed:", err)
  506. }
  507. ctx.Data["resource_pools"] = resourcePools.Info
  508. var engines modelarts.Engine
  509. if err := json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  510. ctx.ServerError("json.Unmarshal failed:", err)
  511. }
  512. ctx.Data["engines"] = engines.Info
  513. var versionInfos modelarts.VersionInfo
  514. if err := json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  515. ctx.ServerError("json.Unmarshal failed:", err)
  516. }
  517. ctx.Data["engine_versions"] = versionInfos.Version
  518. prepareCloudbrainTwoInferenceSpecs(ctx)
  519. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")
  520. ctx.Data["WaitCount"] = waitCount
  521. log.Info("The NPU WaitCount is " + fmt.Sprint(waitCount))
  522. NotStopTaskCount, _ := models.GetModelSafetyCountByUserID(ctx.User.ID)
  523. ctx.Data["NotStopTaskCount"] = NotStopTaskCount
  524. ctx.HTML(200, tplModelSafetyTestCreateNpu)
  525. }
  526. func AiSafetyCreateForPost(ctx *context.Context) {
  527. ctx.Data["PageIsCloudBrain"] = true
  528. displayJobName := ctx.Query("display_job_name")
  529. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  530. taskType := ctx.QueryInt("type")
  531. description := ctx.Query("description")
  532. ctx.Data["type"] = taskType
  533. ctx.Data["displayJobName"] = displayJobName
  534. ctx.Data["description"] = description
  535. repo := ctx.Repo.Repository
  536. tpname := tplCloudBrainModelSafetyNewNpu
  537. if taskType == models.TypeCloudBrainOne {
  538. tpname = tplCloudBrainModelSafetyNewGpu
  539. }
  540. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeModelSafety), displayJobName)
  541. if err == nil {
  542. if len(tasks) != 0 {
  543. log.Error("the job name did already exist", ctx.Data["MsgID"])
  544. modelSafetyNewDataPrepare(ctx)
  545. ctx.RenderWithErr("the job name did already exist", tpname, nil)
  546. return
  547. }
  548. } else {
  549. if !models.IsErrJobNotExist(err) {
  550. log.Error("system error, %v", err, ctx.Data["MsgID"])
  551. modelSafetyNewDataPrepare(ctx)
  552. ctx.RenderWithErr("system error", tpname, nil)
  553. return
  554. }
  555. }
  556. if !jobNamePattern.MatchString(jobName) {
  557. modelSafetyNewDataPrepare(ctx)
  558. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpname, nil)
  559. return
  560. }
  561. count, err := models.GetModelSafetyCountByUserID(ctx.User.ID)
  562. if err != nil {
  563. log.Error("GetCloudbrainCountByUserID failed:%v", err, ctx.Data["MsgID"])
  564. modelSafetyNewDataPrepare(ctx)
  565. ctx.RenderWithErr("system error", tpname, nil)
  566. return
  567. } else {
  568. if count >= 1 {
  569. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  570. modelSafetyNewDataPrepare(ctx)
  571. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain.morethanonejob"), tpname, nil)
  572. return
  573. }
  574. }
  575. BootFile := ctx.Query("boot_file")
  576. BootFile = strings.TrimSpace(BootFile)
  577. bootFileExist, err := ctx.Repo.FileExists(BootFile, cloudbrain.DefaultBranchName)
  578. if err != nil || !bootFileExist {
  579. log.Error("Get bootfile error:", err)
  580. modelSafetyNewDataPrepare(ctx)
  581. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tpname, nil)
  582. return
  583. }
  584. if taskType == models.TypeCloudBrainTwo {
  585. err = createForNPU(ctx, jobName)
  586. } else if taskType == models.TypeCloudBrainOne {
  587. err = createForGPU(ctx, jobName)
  588. }
  589. if err != nil {
  590. modelSafetyNewDataPrepare(ctx)
  591. ctx.RenderWithErr(err.Error(), tpname, nil)
  592. } else {
  593. log.Info("to redirect...")
  594. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/cloudbrain/benchmark")
  595. }
  596. }
  597. func createForNPU(ctx *context.Context, jobName string) error {
  598. VersionOutputPath := modelarts.GetOutputPathByCount(modelarts.TotalVersionCount)
  599. BootFile := ctx.Query("boot_file")
  600. BootFile = strings.TrimSpace(BootFile)
  601. displayJobName := ctx.Query("display_job_name")
  602. description := ctx.Query("description")
  603. srcDataset := ctx.Query("src_dataset") //uuid
  604. combatDataset := ctx.Query("combat_dataset") //uuid
  605. evaluationIndex := ctx.Query("evaluation_index")
  606. Params := ctx.Query("run_para_list")
  607. specId := ctx.QueryInt64("spec_id")
  608. engineID := ctx.QueryInt("engine_id")
  609. log.Info("engine_id=" + fmt.Sprint(engineID))
  610. poolID := ctx.Query("pool_id")
  611. repo := ctx.Repo.Repository
  612. trainUrl := ctx.Query("train_url")
  613. modelName := ctx.Query("model_name")
  614. modelVersion := ctx.Query("model_version")
  615. ckptName := ctx.Query("ckpt_name")
  616. ckptUrl := "/" + trainUrl + ckptName
  617. log.Info("ckpt url:" + ckptUrl)
  618. FlavorName := ctx.Query("flaver_names")
  619. EngineName := ctx.Query("engine_names")
  620. isLatestVersion := modelarts.IsLatestVersion
  621. VersionCount := modelarts.VersionCountOne
  622. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  623. codeObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.CodePath
  624. resultObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.ResultPath + VersionOutputPath + "/"
  625. logObsPath := "/" + setting.Bucket + modelarts.JobPath + jobName + modelarts.LogPath + VersionOutputPath + "/"
  626. log.Info("ckpt url:" + ckptUrl)
  627. spec, err := resource.GetAndCheckSpec(ctx.User.ID, specId, models.FindSpecsOptions{
  628. JobType: models.JobTypeInference,
  629. ComputeResource: models.NPU,
  630. Cluster: models.OpenICluster,
  631. AiCenterCode: models.AICenterOfCloudBrainTwo})
  632. if err != nil || spec == nil {
  633. //ctx.RenderWithErr("Resource specification not available", tplCloudBrainModelSafetyNewNpu, nil)
  634. return errors.New("Resource specification not available")
  635. }
  636. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  637. log.Error("point balance is not enough,userId=%d specId=%d ", ctx.User.ID, spec.ID)
  638. return errors.New(ctx.Tr("points.insufficient_points_balance"))
  639. }
  640. //todo: del the codeLocalPath
  641. _, err = ioutil.ReadDir(codeLocalPath)
  642. if err == nil {
  643. os.RemoveAll(codeLocalPath)
  644. }
  645. gitRepo, _ := git.OpenRepository(repo.RepoPath())
  646. commitID, _ := gitRepo.GetBranchCommitID(cloudbrain.DefaultBranchName)
  647. if err := downloadCode(repo, codeLocalPath, cloudbrain.DefaultBranchName); err != nil {
  648. log.Error("Create task failed, server timed out: %s (%v)", repo.FullName(), err)
  649. return errors.New(ctx.Tr("cloudbrain.load_code_failed"))
  650. }
  651. //todo: upload code (send to file_server todo this work?)
  652. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.ResultPath + VersionOutputPath + "/"); err != nil {
  653. log.Error("Failed to obsMkdir_result: %s (%v)", repo.FullName(), err)
  654. return errors.New("Failed to obsMkdir_result")
  655. }
  656. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.LogPath + VersionOutputPath + "/"); err != nil {
  657. log.Error("Failed to obsMkdir_log: %s (%v)", repo.FullName(), err)
  658. return errors.New("Failed to obsMkdir_log")
  659. }
  660. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  661. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  662. return errors.New(ctx.Tr("cloudbrain.load_code_failed"))
  663. }
  664. var parameters models.Parameters
  665. param := make([]models.Parameter, 0)
  666. param = append(param, models.Parameter{
  667. Label: modelarts.ResultUrl,
  668. Value: "s3:/" + resultObsPath,
  669. }, models.Parameter{
  670. Label: modelarts.CkptUrl,
  671. Value: "s3:/" + ckptUrl,
  672. })
  673. uuid := srcDataset + ";" + combatDataset
  674. datasUrlList, dataUrl, datasetNames, isMultiDataset, err := getDatasUrlListByUUIDS(uuid)
  675. if err != nil {
  676. return err
  677. }
  678. dataPath := dataUrl
  679. jsondatas, err := json.Marshal(datasUrlList)
  680. if err != nil {
  681. log.Error("Failed to Marshal: %v", err)
  682. return err
  683. }
  684. if isMultiDataset {
  685. param = append(param, models.Parameter{
  686. Label: modelarts.MultiDataUrl,
  687. Value: string(jsondatas),
  688. })
  689. }
  690. existDeviceTarget := false
  691. if len(Params) != 0 {
  692. err := json.Unmarshal([]byte(Params), &parameters)
  693. if err != nil {
  694. log.Error("Failed to Unmarshal params: %s (%v)", Params, err)
  695. return errors.New("运行参数错误")
  696. }
  697. for _, parameter := range parameters.Parameter {
  698. if parameter.Label == modelarts.DeviceTarget {
  699. existDeviceTarget = true
  700. }
  701. if parameter.Label != modelarts.TrainUrl && parameter.Label != modelarts.DataUrl {
  702. param = append(param, models.Parameter{
  703. Label: parameter.Label,
  704. Value: parameter.Value,
  705. })
  706. }
  707. }
  708. }
  709. if !existDeviceTarget {
  710. param = append(param, models.Parameter{
  711. Label: modelarts.DeviceTarget,
  712. Value: modelarts.Ascend,
  713. })
  714. }
  715. req := &modelarts.GenerateInferenceJobReq{
  716. JobName: jobName,
  717. DisplayJobName: displayJobName,
  718. DataUrl: dataPath,
  719. Description: description,
  720. CodeObsPath: codeObsPath,
  721. BootFileUrl: codeObsPath + BootFile,
  722. BootFile: BootFile,
  723. TrainUrl: trainUrl,
  724. WorkServerNumber: 1,
  725. EngineID: int64(engineID),
  726. LogUrl: logObsPath,
  727. PoolID: poolID,
  728. Uuid: uuid,
  729. Parameters: param, //modelarts train parameters
  730. CommitID: commitID,
  731. BranchName: cloudbrain.DefaultBranchName,
  732. Params: Params,
  733. FlavorName: FlavorName,
  734. EngineName: EngineName,
  735. LabelName: evaluationIndex,
  736. IsLatestVersion: isLatestVersion,
  737. VersionCount: VersionCount,
  738. TotalVersionCount: modelarts.TotalVersionCount,
  739. ModelName: modelName,
  740. ModelVersion: modelVersion,
  741. CkptName: ckptName,
  742. ResultUrl: resultObsPath,
  743. Spec: spec,
  744. DatasetName: datasetNames,
  745. JobType: string(models.JobTypeModelSafety),
  746. }
  747. _, err = modelarts.GenerateInferenceJob(ctx, req)
  748. if err != nil {
  749. log.Error("GenerateTrainJob failed:%v", err.Error())
  750. return err
  751. }
  752. return nil
  753. }
  754. func createForGPU(ctx *context.Context, jobName string) error {
  755. BootFile := ctx.Query("boot_file")
  756. BootFile = strings.TrimSpace(BootFile)
  757. displayJobName := ctx.Query("display_job_name")
  758. description := ctx.Query("description")
  759. image := strings.TrimSpace(ctx.Query("image"))
  760. srcDataset := ctx.Query("src_dataset") //uuid
  761. combatDataset := ctx.Query("combat_dataset") //uuid
  762. evaluationIndex := ctx.Query("evaluation_index")
  763. Params := ctx.Query("run_para_list")
  764. specId := ctx.QueryInt64("spec_id")
  765. TrainUrl := ctx.Query("train_url")
  766. CkptName := ctx.Query("ckpt_name")
  767. modelName := ctx.Query("model_name")
  768. modelVersion := ctx.Query("model_version")
  769. ckptUrl := setting.Attachment.Minio.RealPath + TrainUrl + CkptName
  770. log.Info("ckpt url:" + ckptUrl)
  771. spec, err := resource.GetAndCheckSpec(ctx.User.ID, specId, models.FindSpecsOptions{
  772. JobType: models.JobTypeBenchmark,
  773. ComputeResource: models.GPU,
  774. Cluster: models.OpenICluster,
  775. AiCenterCode: models.AICenterOfCloudBrainOne})
  776. if err != nil || spec == nil {
  777. return errors.New("Resource specification not available")
  778. }
  779. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  780. log.Error("point balance is not enough,userId=%d specId=%d ", ctx.User.ID, spec.ID)
  781. return errors.New(ctx.Tr("points.insufficient_points_balance"))
  782. }
  783. repo := ctx.Repo.Repository
  784. codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath
  785. os.RemoveAll(codePath)
  786. if err := downloadCode(repo, codePath, cloudbrain.DefaultBranchName); err != nil {
  787. log.Error("downloadCode failed, %v", err, ctx.Data["MsgID"])
  788. return errors.New("system error")
  789. }
  790. err = uploadCodeToMinio(codePath+"/", jobName, cloudbrain.CodeMountPath+"/")
  791. if err != nil {
  792. log.Error("uploadCodeToMinio failed, %v", err, ctx.Data["MsgID"])
  793. return errors.New("system error")
  794. }
  795. uuid := srcDataset + ";" + combatDataset
  796. datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid)
  797. log.Info("uuid=" + uuid)
  798. if err != nil {
  799. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  800. return errors.New(ctx.Tr("cloudbrain.error.dataset_select"))
  801. }
  802. command, err := getGpuModelSafetyCommand(BootFile, Params, CkptName, displayJobName)
  803. if err != nil {
  804. log.Error("Get Command failed: %v", err, ctx.Data["MsgID"])
  805. return errors.New(ctx.Tr("cloudbrain.error.dataset_select"))
  806. }
  807. log.Info("Command=" + command)
  808. req := cloudbrain.GenerateCloudBrainTaskReq{
  809. Ctx: ctx,
  810. DisplayJobName: displayJobName,
  811. JobName: jobName,
  812. Image: image,
  813. Command: command,
  814. Uuids: uuid,
  815. DatasetNames: datasetNames,
  816. DatasetInfos: datasetInfos,
  817. CodePath: storage.GetMinioPath(jobName, cloudbrain.CodeMountPath+"/"),
  818. ModelPath: setting.Attachment.Minio.RealPath + TrainUrl,
  819. BenchmarkPath: storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"),
  820. Snn4ImageNetPath: storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"),
  821. BrainScorePath: storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"),
  822. JobType: string(models.JobTypeModelSafety),
  823. Description: description,
  824. BranchName: cloudbrain.DefaultBranchName,
  825. BootFile: BootFile,
  826. Params: Params,
  827. CommitID: "",
  828. ModelName: modelName,
  829. ModelVersion: modelVersion,
  830. CkptName: CkptName,
  831. ResultPath: storage.GetMinioPath(jobName, cloudbrain.ResultPath+"/"),
  832. Spec: spec,
  833. LabelName: evaluationIndex,
  834. }
  835. _, err = cloudbrain.GenerateTask(req)
  836. if err != nil {
  837. return err
  838. }
  839. return nil
  840. }
  841. func getGpuModelSafetyCommand(BootFile string, params string, CkptName string, DisplayJobName string) (string, error) {
  842. var command string
  843. bootFile := strings.TrimSpace(BootFile)
  844. if !strings.HasSuffix(bootFile, ".py") {
  845. log.Error("bootFile(%s) format error", bootFile)
  846. return command, errors.New("bootFile format error")
  847. }
  848. var parameters models.Parameters
  849. var param string
  850. if len(params) != 0 {
  851. err := json.Unmarshal([]byte(params), &parameters)
  852. if err != nil {
  853. log.Error("Failed to Unmarshal params: %s (%v)", params, err)
  854. return command, err
  855. }
  856. for _, parameter := range parameters.Parameter {
  857. param += " --" + parameter.Label + "=" + parameter.Value
  858. }
  859. }
  860. param += " --ckpt_url=" + cloudbrain.ModelMountPath + "/" + CkptName
  861. command += "python /code/" + bootFile + param + " > " + cloudbrain.ResultPath + "/" + DisplayJobName + "-" + cloudbrain.LogFile
  862. return command, nil
  863. }
  864. func modelSafetyNewDataPrepare(ctx *context.Context) error {
  865. ctx.Data["PageIsCloudBrain"] = true
  866. ctx.Data["type"] = ctx.QueryInt("type")
  867. ctx.Data["boot_file"] = ctx.Query("boot_file")
  868. ctx.Data["display_job_name"] = ctx.Query("display_job_name")
  869. ctx.Data["description"] = ctx.Query("description")
  870. ctx.Data["image"] = strings.TrimSpace(ctx.Query("image"))
  871. ctx.Data["src_dataset"] = ctx.Query("src_dataset") //uuid
  872. ctx.Data["combat_dataset"] = ctx.Query("combat_dataset") //uuid
  873. ctx.Data["evaluationIndex"] = ctx.Query("evaluation_index")
  874. ctx.Data["run_para_list"] = ctx.Query("run_para_list")
  875. ctx.Data["spec_id"] = ctx.QueryInt64("spec_id")
  876. ctx.Data["train_url"] = ctx.Query("train_url")
  877. ctx.Data["ckpt_name"] = ctx.Query("ckpt_name")
  878. ctx.Data["train_url"] = ctx.Query("train_url")
  879. ctx.Data["ckpt_name"] = ctx.Query("ckpt_name")
  880. ctx.Data["model_name"] = ctx.Query("model_name")
  881. ctx.Data["model_version"] = ctx.Query("model_version")
  882. NotStopTaskCount, _ := models.GetModelSafetyCountByUserID(ctx.User.ID)
  883. ctx.Data["NotStopTaskCount"] = NotStopTaskCount
  884. if ctx.QueryInt("type") == models.TypeCloudBrainOne {
  885. ctx.Data["type"] = models.TypeCloudBrainOne
  886. ctx.Data["compute_resource"] = models.GPUResource
  887. ctx.Data["datasetType"] = models.TypeCloudBrainOne
  888. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.GPUBaseDataSetName
  889. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.GPUBaseDataSetUUID
  890. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.GPUCombatDataSetName
  891. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.GPUCombatDataSetUUID
  892. prepareCloudbrainOneSpecs(ctx)
  893. queuesDetail, _ := cloudbrain.GetQueuesDetail()
  894. if queuesDetail != nil {
  895. ctx.Data["QueuesDetail"] = queuesDetail
  896. }
  897. } else {
  898. ctx.Data["engine_id"] = ctx.QueryInt("engine_id")
  899. ctx.Data["pool_id"] = ctx.Query("pool_id")
  900. ctx.Data["type"] = models.TypeCloudBrainTwo
  901. ctx.Data["compute_resource"] = models.NPUResource
  902. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  903. ctx.Data["BaseDataSetName"] = setting.ModelSafetyTest.NPUBaseDataSetName
  904. ctx.Data["BaseDataSetUUID"] = setting.ModelSafetyTest.NPUBaseDataSetUUID
  905. ctx.Data["CombatDataSetName"] = setting.ModelSafetyTest.NPUCombatDataSetName
  906. ctx.Data["CombatDataSetUUID"] = setting.ModelSafetyTest.NPUCombatDataSetUUID
  907. var engines modelarts.Engine
  908. if err := json.Unmarshal([]byte(setting.Engines), &engines); err != nil {
  909. ctx.ServerError("json.Unmarshal failed:", err)
  910. }
  911. ctx.Data["engines"] = engines.Info
  912. var versionInfos modelarts.VersionInfo
  913. if err := json.Unmarshal([]byte(setting.EngineVersions), &versionInfos); err != nil {
  914. ctx.ServerError("json.Unmarshal failed:", err)
  915. }
  916. ctx.Data["engine_versions"] = versionInfos.Version
  917. prepareCloudbrainTwoInferenceSpecs(ctx)
  918. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeCloudBrainTwo, "")
  919. ctx.Data["WaitCount"] = waitCount
  920. }
  921. return nil
  922. }
  923. func getJsonContent(url string) (string, error) {
  924. resp, err := http.Get(url)
  925. if err != nil || resp.StatusCode != 200 {
  926. log.Info("Get organizations url error=" + err.Error())
  927. return "", err
  928. }
  929. bytes, err := ioutil.ReadAll(resp.Body)
  930. resp.Body.Close()
  931. if err != nil {
  932. log.Info("Get organizations url error=" + err.Error())
  933. return "", err
  934. }
  935. str := string(bytes)
  936. //log.Info("json str =" + str)
  937. return str, nil
  938. }