You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

grampus.go 60 kB

3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
3 years ago
2 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
2 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
3 years ago
2 years ago
2 years ago
2 years ago
3 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802
  1. package repo
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "fmt"
  6. "io/ioutil"
  7. "net/http"
  8. "os"
  9. "path"
  10. "strconv"
  11. "strings"
  12. "code.gitea.io/gitea/modules/urfs_client/urchin"
  13. "code.gitea.io/gitea/routers/response"
  14. "code.gitea.io/gitea/services/cloudbrain/cloudbrainTask"
  15. "code.gitea.io/gitea/modules/dataset"
  16. "code.gitea.io/gitea/services/cloudbrain/resource"
  17. "code.gitea.io/gitea/services/reward/point/account"
  18. "code.gitea.io/gitea/modules/auth"
  19. "code.gitea.io/gitea/modules/git"
  20. "code.gitea.io/gitea/modules/grampus"
  21. "code.gitea.io/gitea/modules/modelarts"
  22. "code.gitea.io/gitea/modules/notification"
  23. "code.gitea.io/gitea/modules/redis/redis_key"
  24. "code.gitea.io/gitea/modules/redis/redis_lock"
  25. "code.gitea.io/gitea/modules/timeutil"
  26. "code.gitea.io/gitea/modules/util"
  27. "github.com/unknwon/com"
  28. "code.gitea.io/gitea/models"
  29. "code.gitea.io/gitea/modules/base"
  30. "code.gitea.io/gitea/modules/cloudbrain"
  31. "code.gitea.io/gitea/modules/context"
  32. "code.gitea.io/gitea/modules/log"
  33. "code.gitea.io/gitea/modules/setting"
  34. cloudbrainService "code.gitea.io/gitea/services/cloudbrain"
  35. )
  36. const (
  37. tplGrampusTrainJobShow base.TplName = "repo/grampus/trainjob/show"
  38. tplGrampusNotebookShow base.TplName = "repo/grampus/notebook/show"
  39. //GPU
  40. tplGrampusNotebookGPUNew base.TplName = "repo/grampus/notebook/gpu/new"
  41. tplGrampusTrainJobGPUNew base.TplName = "repo/grampus/trainjob/gpu/new"
  42. //NPU
  43. tplGrampusNotebookNPUNew base.TplName = "repo/grampus/notebook/npu/new"
  44. tplGrampusTrainJobNPUNew base.TplName = "repo/grampus/trainjob/npu/new"
  45. //GCU
  46. tplGrampusNotebookGCUNew base.TplName = "repo/grampus/notebook/gcu/new"
  47. )
  48. func GrampusNotebookNew(ctx *context.Context) {
  49. ctx.Data["IsCreate"] = true
  50. notebookType := ctx.QueryInt("type")
  51. processType := grampus.ProcessorTypeGPU
  52. if notebookType == 1 {
  53. processType = grampus.ProcessorTypeNPU
  54. } else if notebookType == 2 {
  55. processType = grampus.ProcessorTypeGCU
  56. }
  57. err := grampusNotebookNewDataPrepare(ctx, processType)
  58. if err != nil {
  59. ctx.ServerError("get new notebook-job info failed", err)
  60. return
  61. }
  62. if processType == grampus.ProcessorTypeGPU {
  63. ctx.HTML(http.StatusOK, tplGrampusNotebookGPUNew)
  64. } else if processType == grampus.ProcessorTypeNPU {
  65. ctx.HTML(http.StatusOK, tplGrampusNotebookNPUNew)
  66. } else if processType == grampus.ProcessorTypeGCU {
  67. ctx.HTML(http.StatusOK, tplGrampusNotebookGCUNew)
  68. }
  69. }
  70. func GrampusTrainJobGPUNew(ctx *context.Context) {
  71. ctx.Data["IsCreate"] = true
  72. err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  73. if err != nil {
  74. ctx.ServerError("get new train-job info failed", err)
  75. return
  76. }
  77. ctx.HTML(http.StatusOK, tplGrampusTrainJobGPUNew)
  78. }
  79. func GrampusTrainJobNPUNew(ctx *context.Context) {
  80. ctx.Data["IsCreate"] = true
  81. err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  82. if err != nil {
  83. ctx.ServerError("get new train-job info failed", err)
  84. return
  85. }
  86. ctx.HTML(200, tplGrampusTrainJobNPUNew)
  87. }
  88. func GrampusNotebookCreate(ctx *context.Context, form auth.CreateGrampusNotebookForm) {
  89. ctx.Data["IsCreate"] = true
  90. displayJobName := form.DisplayJobName
  91. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  92. uuid := form.Attachment
  93. description := form.Description
  94. repo := ctx.Repo.Repository
  95. branchName := form.BranchName
  96. image := strings.TrimSpace(form.Image)
  97. codeStoragePath := setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/"
  98. tpl := tplGrampusNotebookGPUNew
  99. processType := grampus.ProcessorTypeGPU
  100. computeSource := models.GPUResource
  101. computeSourceSimple := models.GPU
  102. if form.Type == 1 {
  103. tpl = tplGrampusNotebookNPUNew
  104. processType = grampus.ProcessorTypeNPU
  105. computeSource = models.NPUResource
  106. computeSourceSimple = models.NPU
  107. codeStoragePath = grampus.JobPath + jobName + modelarts.CodePath
  108. } else if form.Type == 2 {
  109. tpl = tplGrampusNotebookGCUNew
  110. processType = grampus.ProcessorTypeGCU
  111. computeSource = models.GCUResource
  112. computeSourceSimple = models.GCU
  113. codeStoragePath = setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/"
  114. }
  115. lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), string(models.JobTypeDebug), displayJobName))
  116. defer lock.UnLock()
  117. isOk, err := lock.Lock(models.CloudbrainKeyDuration)
  118. if !isOk {
  119. log.Error("lock processed failed:%v", err, ctx.Data["MsgID"])
  120. grampusNotebookNewDataPrepare(ctx, processType)
  121. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_samejob_err"), tpl, &form)
  122. return
  123. }
  124. if !jobNamePattern.MatchString(displayJobName) {
  125. grampusNotebookNewDataPrepare(ctx, processType)
  126. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpl, &form)
  127. return
  128. }
  129. //check count limit
  130. count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeDebug), computeSource)
  131. if err != nil {
  132. log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"])
  133. grampusNotebookNewDataPrepare(ctx, processType)
  134. ctx.RenderWithErr("system error", tpl, &form)
  135. return
  136. } else {
  137. if count >= 1 {
  138. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  139. grampusNotebookNewDataPrepare(ctx, processType)
  140. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tpl, &form)
  141. return
  142. }
  143. }
  144. //check whether the task name in the project is duplicated
  145. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeDebug), displayJobName)
  146. if err == nil {
  147. if len(tasks) != 0 {
  148. log.Error("the job name did already exist", ctx.Data["MsgID"])
  149. grampusNotebookNewDataPrepare(ctx, processType)
  150. ctx.RenderWithErr("the job name did already exist", tpl, &form)
  151. return
  152. }
  153. } else {
  154. if !models.IsErrJobNotExist(err) {
  155. log.Error("system error, %v", err, ctx.Data["MsgID"])
  156. grampusNotebookNewDataPrepare(ctx, processType)
  157. ctx.RenderWithErr("system error", tpl, &form)
  158. return
  159. }
  160. }
  161. //check specification
  162. spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
  163. JobType: models.JobTypeDebug,
  164. ComputeResource: computeSourceSimple,
  165. Cluster: models.C2NetCluster,
  166. })
  167. if err != nil || spec == nil {
  168. grampusNotebookNewDataPrepare(ctx, processType)
  169. ctx.RenderWithErr("Resource specification not available", tpl, &form)
  170. return
  171. }
  172. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  173. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  174. grampusNotebookNewDataPrepare(ctx, processType)
  175. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tpl, &form)
  176. return
  177. }
  178. var datasetInfos map[string]models.DatasetInfo
  179. var datasetNames string
  180. //var
  181. if uuid != "" {
  182. datasetInfos, datasetNames, err = models.GetDatasetInfo(uuid, computeSourceSimple)
  183. if err != nil {
  184. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  185. grampusNotebookNewDataPrepare(ctx, processType)
  186. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form)
  187. return
  188. }
  189. uuidArray := strings.Split(uuid, ";")
  190. if datasetInfos == nil || len(datasetInfos) < len(uuidArray) {
  191. grampusNotebookNewDataPrepare(ctx, processType)
  192. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.partial_datasets_not_available"), tpl, &form)
  193. return
  194. }
  195. }
  196. //prepare code and out path
  197. codeLocalPath := setting.JobPath + jobName + cloudbrain.CodeMountPath + "/"
  198. _, err = ioutil.ReadDir(codeLocalPath)
  199. if err == nil {
  200. os.RemoveAll(codeLocalPath)
  201. }
  202. if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil {
  203. log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err)
  204. grampusNotebookNewDataPrepare(ctx, processType)
  205. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  206. return
  207. }
  208. if processType == grampus.ProcessorTypeGPU || processType == grampus.ProcessorTypeGCU {
  209. if err := uploadCodeToMinio(codeLocalPath+"/", jobName, cloudbrain.CodeMountPath+"/"); err != nil {
  210. log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  211. grampusNotebookNewDataPrepare(ctx, processType)
  212. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  213. return
  214. }
  215. } else {
  216. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  217. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  218. grampusNotebookNewDataPrepare(ctx, processType)
  219. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  220. return
  221. }
  222. }
  223. commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName)
  224. req := &grampus.GenerateNotebookJobReq{
  225. JobName: jobName,
  226. DisplayJobName: displayJobName,
  227. ComputeResource: computeSource,
  228. ProcessType: processType,
  229. ImageUrl: image,
  230. ImageId: form.ImageID,
  231. Description: description,
  232. Uuid: uuid,
  233. CommitID: commitID,
  234. BranchName: branchName,
  235. DatasetNames: datasetNames,
  236. DatasetInfos: datasetInfos,
  237. Spec: spec,
  238. CodeStoragePath: codeStoragePath,
  239. CodeName: strings.ToLower(repo.Name),
  240. }
  241. if form.ModelName != "" { //使用预训练模型训练
  242. m, err := models.QueryModelByPath(form.PreTrainModelUrl)
  243. if err != nil {
  244. log.Error("Can not find model", err)
  245. grampusNotebookNewDataPrepare(ctx, processType)
  246. ctx.RenderWithErr(ctx.Tr("repo.modelconvert.manage.model_not_exist"), tpl, &form)
  247. return
  248. }
  249. if !cloudbrainTask.IsModelFileExists(m, form.CkptName) {
  250. log.Error("model file not exist.name = %s", form.CkptName)
  251. grampusNotebookNewDataPrepare(ctx, processType)
  252. ctx.RenderWithErr(ctx.Tr("repo.modelconvert.manage.model_file_not_exist"), tpl, &form)
  253. return
  254. }
  255. req.ModelName = form.ModelName
  256. req.LabelName = form.LabelName
  257. req.CkptName = form.CkptName
  258. req.ModelVersion = form.ModelVersion
  259. req.PreTrainModelUrl = form.PreTrainModelUrl
  260. req.PreTrainModelPath = getPreTrainModelPath(form.PreTrainModelUrl, form.CkptName)
  261. req.ModelStorageType = m.Type
  262. }
  263. _, err = grampus.GenerateNotebookJob(ctx, req)
  264. if err != nil {
  265. log.Error("GenerateNotebookJob failed:%v", err.Error(), ctx.Data["MsgID"])
  266. grampusTrainJobNewDataPrepare(ctx, processType)
  267. ctx.RenderWithErr(err.Error(), tpl, &form)
  268. return
  269. }
  270. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=all")
  271. }
  272. func grampusNotebookNewDataPrepare(ctx *context.Context, processType string) error {
  273. ctx.Data["PageIsCloudBrain"] = true
  274. var displayJobName = cloudbrainService.GetDisplayJobName(ctx.User.Name)
  275. ctx.Data["display_job_name"] = displayJobName
  276. //get valid images
  277. if processType == grampus.ProcessorTypeNPU || processType == grampus.ProcessorTypeGCU {
  278. images, err := grampus.GetImages(processType, string(models.JobTypeDebug))
  279. if err != nil {
  280. log.Error("GetImages failed:", err.Error())
  281. } else {
  282. ctx.Data["images"] = images.Infos
  283. }
  284. }
  285. //prepare available specs
  286. computeResourceSimple := models.GPU
  287. datasetType := models.TypeCloudBrainOne
  288. computeResource := models.GPUResource
  289. if processType == grampus.ProcessorTypeNPU {
  290. computeResourceSimple = models.NPU
  291. datasetType = models.TypeCloudBrainTwo
  292. computeResource = models.NPUResource
  293. } else if processType == grampus.ProcessorTypeGCU {
  294. computeResourceSimple = models.GCU
  295. datasetType = models.TypeCloudBrainAll
  296. computeResource = models.GCUResource
  297. }
  298. prepareGrampusSpecs(ctx, computeResourceSimple, models.JobTypeDebug)
  299. //get branches
  300. branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0)
  301. if err != nil {
  302. log.Error("GetBranches error:", err.Error())
  303. } else {
  304. ctx.Data["branches"] = branches
  305. }
  306. ctx.Data["branchName"] = ctx.Repo.BranchName
  307. ctx.Data["datasetType"] = datasetType
  308. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeC2Net, computeResource, models.JobTypeDebug)
  309. ctx.Data["WaitCount"] = waitCount
  310. NotStopTaskCount, _ := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeDebug), computeResource)
  311. ctx.Data["NotStopTaskCount"] = NotStopTaskCount
  312. ctx.Data["code_path"] = cloudbrain.CodeMountPath
  313. ctx.Data["dataset_path"] = cloudbrain.DataSetMountPath
  314. ctx.Data["model_path"] = cloudbrain.ModelMountPath
  315. return nil
  316. }
  317. func grampusTrainJobNewDataPrepare(ctx *context.Context, processType string) error {
  318. ctx.Data["PageIsCloudBrain"] = true
  319. var displayJobName = cloudbrainService.GetDisplayJobName(ctx.User.Name)
  320. ctx.Data["display_job_name"] = displayJobName
  321. //get valid images
  322. if processType == grampus.ProcessorTypeNPU {
  323. images, err := grampus.GetImages(processType, string(models.JobTypeTrain))
  324. if err != nil {
  325. log.Error("GetImages failed:", err.Error())
  326. } else {
  327. ctx.Data["images"] = images.Infos
  328. }
  329. }
  330. //prepare available specs
  331. if processType == grampus.ProcessorTypeNPU {
  332. prepareGrampusSpecs(ctx, models.NPU)
  333. } else if processType == grampus.ProcessorTypeGPU {
  334. prepareGrampusSpecs(ctx, models.GPU)
  335. }
  336. //get branches
  337. branches, _, err := ctx.Repo.GitRepo.GetBranches(0, 0)
  338. if err != nil {
  339. log.Error("GetBranches error:", err.Error())
  340. } else {
  341. ctx.Data["branches"] = branches
  342. }
  343. ctx.Data["branchName"] = ctx.Repo.BranchName
  344. if processType == grampus.ProcessorTypeGPU {
  345. ctx.Data["datasetType"] = models.TypeCloudBrainOne
  346. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeC2Net, models.GPUResource, models.JobTypeTrain)
  347. ctx.Data["WaitCount"] = waitCount
  348. NotStopTaskCount, _ := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeTrain), models.GPUResource)
  349. ctx.Data["NotStopTaskCount"] = NotStopTaskCount
  350. } else if processType == grampus.ProcessorTypeNPU {
  351. ctx.Data["datasetType"] = models.TypeCloudBrainTwo
  352. waitCount := cloudbrain.GetWaitingCloudbrainCount(models.TypeC2Net, models.NPUResource, models.JobTypeTrain)
  353. ctx.Data["WaitCount"] = waitCount
  354. NotStopTaskCount, _ := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeTrain), models.NPUResource)
  355. ctx.Data["NotStopTaskCount"] = NotStopTaskCount
  356. }
  357. if ctx.Cloudbrain != nil {
  358. uuids, datasetNames := dataset.GetFilterDeletedAttachments(ctx.Cloudbrain.Uuid)
  359. ctx.Data["attachment"] = uuids
  360. ctx.Data["boot_file"] = ctx.Cloudbrain.BootFile
  361. ctx.Data["image_id"] = ctx.Cloudbrain.ImageID
  362. ctx.Data["run_para_list"] = ctx.Cloudbrain.Parameters
  363. ctx.Data["description"] = ctx.Cloudbrain.Description
  364. ctx.Data["branch_name"] = ctx.Cloudbrain.BranchName
  365. ctx.Data["engine_name"] = ctx.Cloudbrain.EngineName
  366. ctx.Data["work_server_number"] = ctx.Cloudbrain.WorkServerNumber
  367. if ctx.Cloudbrain.Image != "" {
  368. ctx.Data["image"] = ctx.Cloudbrain.Image
  369. } else {
  370. ctx.Data["image"] = ctx.Cloudbrain.EngineName
  371. }
  372. ctx.Data["dataset_name"] = datasetNames
  373. ctx.Data["model_name"] = ctx.Cloudbrain.ModelName
  374. ctx.Data["model_version"] = ctx.Cloudbrain.ModelVersion
  375. ctx.Data["ckpt_name"] = ctx.Cloudbrain.CkptName
  376. ctx.Data["label_names"] = ctx.Cloudbrain.LabelName
  377. ctx.Data["pre_train_model_url"] = ctx.Cloudbrain.PreTrainModelUrl
  378. spec, _ := resource.GetCloudbrainSpec(ctx.Cloudbrain.ID)
  379. if spec != nil {
  380. ctx.Data["spec_id"] = spec.ID
  381. }
  382. }
  383. return nil
  384. }
  385. func GrampusTrainJobVersionNew(ctx *context.Context) {
  386. task := ctx.Cloudbrain
  387. ctx.Data["IsCreate"] = false
  388. if task.ComputeResource == models.GPUResource {
  389. err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  390. if err != nil {
  391. ctx.ServerError("get new train-job version info failed", err)
  392. return
  393. }
  394. ctx.HTML(http.StatusOK, tplGrampusTrainJobGPUNew)
  395. } else if task.ComputeResource == models.NPUResource {
  396. err := grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  397. if err != nil {
  398. ctx.ServerError("get new train-job version info failed", err)
  399. return
  400. }
  401. ctx.HTML(200, tplGrampusTrainJobNPUNew)
  402. }
  403. }
  404. func prepareGrampusSpecs(ctx *context.Context, computeResource string, jobType ...models.JobType) {
  405. tempJobType := models.JobTypeTrain
  406. if len(jobType) > 0 {
  407. tempJobType = jobType[0]
  408. }
  409. noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{
  410. JobType: tempJobType,
  411. ComputeResource: computeResource,
  412. Cluster: models.C2NetCluster,
  413. })
  414. ctx.Data["Specs"] = noteBookSpecs
  415. }
  416. func grampusParamCheckCreateTrainJob(form auth.CreateGrampusTrainJobForm) error {
  417. if !strings.HasSuffix(strings.TrimSpace(form.BootFile), ".py") {
  418. log.Error("the boot file(%s) must be a python file", form.BootFile)
  419. return errors.New("启动文件必须是python文件")
  420. }
  421. if form.BranchName == "" {
  422. log.Error("the branch must not be null!", form.BranchName)
  423. return errors.New("代码分支不能为空!")
  424. }
  425. return nil
  426. }
  427. func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
  428. ctx.Data["IsCreate"] = true
  429. grampusTrainJobGpuCreate(ctx, form)
  430. }
  431. func grampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
  432. displayJobName := form.DisplayJobName
  433. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  434. uuid := form.Attachment
  435. description := form.Description
  436. bootFile := strings.TrimSpace(form.BootFile)
  437. params := form.Params
  438. repo := ctx.Repo.Repository
  439. codeLocalPath := setting.JobPath + jobName + cloudbrain.CodeMountPath + "/"
  440. codeMinioPath := setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/"
  441. branchName := form.BranchName
  442. image := strings.TrimSpace(form.Image)
  443. tpl := tplGrampusTrainJobGPUNew
  444. lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), string(models.JobTypeTrain), displayJobName))
  445. isOk, err := lock.Lock(models.CloudbrainKeyDuration)
  446. if !isOk {
  447. log.Error("lock processed failed:%v", err, ctx.Data["MsgID"])
  448. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  449. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_samejob_err"), tplGrampusTrainJobGPUNew, &form)
  450. return
  451. }
  452. defer lock.UnLock()
  453. if !jobNamePattern.MatchString(displayJobName) {
  454. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  455. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpl, &form)
  456. return
  457. }
  458. bootFileExist, err := ctx.Repo.FileExists(bootFile, branchName)
  459. if err != nil || !bootFileExist {
  460. log.Error("Get bootfile error:", err, ctx.Data["MsgID"])
  461. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  462. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tpl, &form)
  463. return
  464. }
  465. //check count limit
  466. count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeTrain), models.GPUResource)
  467. if err != nil {
  468. log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"])
  469. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  470. ctx.RenderWithErr("system error", tpl, &form)
  471. return
  472. } else {
  473. if count >= 1 {
  474. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  475. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  476. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tpl, &form)
  477. return
  478. }
  479. }
  480. //check param
  481. if err := grampusParamCheckCreateTrainJob(form); err != nil {
  482. log.Error("paramCheckCreateTrainJob failed:(%v)", err, ctx.Data["MsgID"])
  483. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  484. ctx.RenderWithErr(err.Error(), tpl, &form)
  485. return
  486. }
  487. //check whether the task name in the project is duplicated
  488. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), displayJobName)
  489. if err == nil {
  490. if len(tasks) != 0 {
  491. log.Error("the job name did already exist", ctx.Data["MsgID"])
  492. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  493. ctx.RenderWithErr("the job name did already exist", tpl, &form)
  494. return
  495. }
  496. } else {
  497. if !models.IsErrJobNotExist(err) {
  498. log.Error("system error, %v", err, ctx.Data["MsgID"])
  499. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  500. ctx.RenderWithErr("system error", tpl, &form)
  501. return
  502. }
  503. }
  504. //check specification
  505. spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
  506. JobType: models.JobTypeTrain,
  507. ComputeResource: models.GPU,
  508. Cluster: models.C2NetCluster,
  509. })
  510. if err != nil || spec == nil {
  511. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  512. ctx.RenderWithErr("Resource specification not available", tpl, &form)
  513. return
  514. }
  515. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  516. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  517. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  518. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplGrampusTrainJobGPUNew, &form)
  519. return
  520. }
  521. //check dataset
  522. datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid, models.GPU)
  523. if err != nil {
  524. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  525. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  526. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form)
  527. return
  528. }
  529. //prepare code and out path
  530. _, err = ioutil.ReadDir(codeLocalPath)
  531. if err == nil {
  532. os.RemoveAll(codeLocalPath)
  533. }
  534. if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil {
  535. log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  536. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  537. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  538. return
  539. }
  540. //todo: upload code (send to file_server todo this work?)
  541. //upload code
  542. if err := uploadCodeToMinio(codeLocalPath+"/", jobName, cloudbrain.CodeMountPath+"/"); err != nil {
  543. log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  544. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  545. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  546. return
  547. }
  548. modelPath := setting.JobPath + jobName + cloudbrain.ModelMountPath + "/"
  549. if err := mkModelPath(modelPath); err != nil {
  550. log.Error("Failed to mkModelPath: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  551. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  552. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  553. return
  554. }
  555. //init model readme
  556. if err := uploadCodeToMinio(modelPath, jobName, cloudbrain.ModelMountPath+"/"); err != nil {
  557. log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
  558. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  559. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  560. return
  561. }
  562. var datasetRemotePath, allFileName string
  563. for _, datasetInfo := range datasetInfos {
  564. if datasetRemotePath == "" {
  565. datasetRemotePath = datasetInfo.DataLocalPath
  566. allFileName = datasetInfo.FullName
  567. } else {
  568. datasetRemotePath = datasetRemotePath + ";" + datasetInfo.DataLocalPath
  569. allFileName = allFileName + ";" + datasetInfo.FullName
  570. }
  571. }
  572. //prepare command
  573. preTrainModelPath := getPreTrainModelPath(form.PreTrainModelUrl, form.CkptName)
  574. command, err := generateCommand(repo.Name, grampus.ProcessorTypeGPU, codeMinioPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, bootFile, params, setting.CBCodePathPrefix+jobName+cloudbrain.ModelMountPath+"/", allFileName, preTrainModelPath, form.CkptName, "")
  575. if err != nil {
  576. log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"])
  577. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  578. ctx.RenderWithErr("Create task failed, internal error", tpl, &form)
  579. return
  580. }
  581. commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName)
  582. req := &grampus.GenerateTrainJobReq{
  583. JobName: jobName,
  584. DisplayJobName: displayJobName,
  585. ComputeResource: models.GPUResource,
  586. ProcessType: grampus.ProcessorTypeGPU,
  587. Command: command,
  588. ImageUrl: image,
  589. Description: description,
  590. BootFile: bootFile,
  591. Uuid: uuid,
  592. CommitID: commitID,
  593. BranchName: branchName,
  594. Params: form.Params,
  595. EngineName: image,
  596. DatasetNames: datasetNames,
  597. DatasetInfos: datasetInfos,
  598. IsLatestVersion: modelarts.IsLatestVersion,
  599. VersionCount: modelarts.VersionCountOne,
  600. WorkServerNumber: 1,
  601. Spec: spec,
  602. }
  603. if form.ModelName != "" { //使用预训练模型训练
  604. req.ModelName = form.ModelName
  605. req.LabelName = form.LabelName
  606. req.CkptName = form.CkptName
  607. req.ModelVersion = form.ModelVersion
  608. req.PreTrainModelUrl = form.PreTrainModelUrl
  609. }
  610. _, err = grampus.GenerateTrainJob(ctx, req)
  611. if err != nil {
  612. log.Error("GenerateTrainJob failed:%v", err.Error(), ctx.Data["MsgID"])
  613. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
  614. ctx.RenderWithErr(err.Error(), tpl, &form)
  615. return
  616. }
  617. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
  618. }
  619. func getPreTrainModelPath(pretrainModelDir string, fileName string) string {
  620. index := strings.Index(pretrainModelDir, "/")
  621. if index > 0 {
  622. filterBucket := pretrainModelDir[index+1:]
  623. return filterBucket + fileName
  624. } else {
  625. return ""
  626. }
  627. }
  628. func GrampusTrainJobVersionCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
  629. ctx.Data["IsCreate"] = false
  630. computeResource := ctx.Query("compute_resource")
  631. if computeResource == models.GPUResource {
  632. grampusTrainJobGpuCreate(ctx, form)
  633. } else if computeResource == models.NPUResource {
  634. grampusTrainJobNpuCreate(ctx, form)
  635. } else {
  636. ctx.ServerError("resource error", errors.New("compute resource is not support"))
  637. return
  638. }
  639. }
  640. func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
  641. ctx.Data["IsCreate"] = true
  642. grampusTrainJobNpuCreate(ctx, form)
  643. }
  644. func grampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrainJobForm) {
  645. displayJobName := form.DisplayJobName
  646. jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
  647. uuid := form.Attachment
  648. description := form.Description
  649. bootFile := strings.TrimSpace(form.BootFile)
  650. params := form.Params
  651. repo := ctx.Repo.Repository
  652. codeLocalPath := setting.JobPath + jobName + modelarts.CodePath
  653. codeObsPath := grampus.JobPath + jobName + modelarts.CodePath
  654. //dataObsPath := setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/"
  655. branchName := form.BranchName
  656. isLatestVersion := modelarts.IsLatestVersion
  657. versionCount := modelarts.VersionCountOne
  658. engineName := form.EngineName
  659. tpl := tplGrampusTrainJobNPUNew
  660. lock := redis_lock.NewDistributeLock(redis_key.CloudbrainBindingJobNameKey(fmt.Sprint(repo.ID), string(models.JobTypeTrain), displayJobName))
  661. isOk, err := lock.Lock(models.CloudbrainKeyDuration)
  662. if !isOk {
  663. log.Error("lock processed failed:%v", err, ctx.Data["MsgID"])
  664. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  665. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_samejob_err"), tplGrampusTrainJobNPUNew, &form)
  666. return
  667. }
  668. defer lock.UnLock()
  669. if !jobNamePattern.MatchString(displayJobName) {
  670. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  671. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpl, &form)
  672. return
  673. }
  674. bootFileExist, err := ctx.Repo.FileExists(bootFile, branchName)
  675. if err != nil || !bootFileExist {
  676. log.Error("Get bootfile error:", err, ctx.Data["MsgID"])
  677. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  678. ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_bootfile_err"), tpl, &form)
  679. return
  680. }
  681. //check count limit
  682. count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeTrain), models.NPUResource)
  683. if err != nil {
  684. log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"])
  685. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  686. ctx.RenderWithErr("system error", tpl, &form)
  687. return
  688. } else {
  689. if count >= 1 {
  690. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  691. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  692. ctx.RenderWithErr("you have already a running or waiting task, can not create more", tpl, &form)
  693. return
  694. }
  695. }
  696. //check param
  697. if err := grampusParamCheckCreateTrainJob(form); err != nil {
  698. log.Error("paramCheckCreateTrainJob failed:(%v)", err)
  699. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  700. ctx.RenderWithErr(err.Error(), tpl, &form)
  701. return
  702. }
  703. //check whether the task name in the project is duplicated
  704. tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeTrain), displayJobName)
  705. if err == nil {
  706. if len(tasks) != 0 {
  707. log.Error("the job name did already exist", ctx.Data["MsgID"])
  708. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  709. ctx.RenderWithErr("the job name did already exist", tpl, &form)
  710. return
  711. }
  712. } else {
  713. if !models.IsErrJobNotExist(err) {
  714. log.Error("system error, %v", err, ctx.Data["MsgID"])
  715. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  716. ctx.RenderWithErr("system error", tpl, &form)
  717. return
  718. }
  719. }
  720. //check specification
  721. spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
  722. JobType: models.JobTypeTrain,
  723. ComputeResource: models.NPU,
  724. Cluster: models.C2NetCluster,
  725. })
  726. if err != nil || spec == nil {
  727. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  728. ctx.RenderWithErr("Resource specification not available", tpl, &form)
  729. return
  730. }
  731. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  732. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  733. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  734. ctx.RenderWithErr(ctx.Tr("points.insufficient_points_balance"), tplGrampusTrainJobNPUNew, &form)
  735. return
  736. }
  737. //check dataset
  738. datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid, models.NPU)
  739. if err != nil {
  740. log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
  741. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  742. ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form)
  743. return
  744. }
  745. //prepare code and out path
  746. _, err = ioutil.ReadDir(codeLocalPath)
  747. if err == nil {
  748. os.RemoveAll(codeLocalPath)
  749. }
  750. if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil {
  751. log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err)
  752. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  753. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  754. return
  755. }
  756. //todo: upload code (send to file_server todo this work?)
  757. if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil {
  758. log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
  759. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  760. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  761. return
  762. }
  763. if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
  764. log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
  765. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  766. ctx.RenderWithErr(ctx.Tr("cloudbrain.load_code_failed"), tpl, &form)
  767. return
  768. }
  769. var datasetRemotePath, allFileName string
  770. for _, datasetInfo := range datasetInfos {
  771. if datasetRemotePath == "" {
  772. datasetRemotePath = datasetInfo.DataLocalPath + "'" + datasetInfo.FullName + "'"
  773. allFileName = datasetInfo.FullName
  774. } else {
  775. datasetRemotePath = datasetRemotePath + ";" + datasetInfo.DataLocalPath + "'" + datasetInfo.FullName + "'"
  776. allFileName = allFileName + ";" + datasetInfo.FullName
  777. }
  778. }
  779. //prepare command
  780. preTrainModelPath := getPreTrainModelPath(form.PreTrainModelUrl, form.CkptName)
  781. command, err := generateCommand(repo.Name, grampus.ProcessorTypeNPU, codeObsPath+cloudbrain.DefaultBranchName+".zip", datasetRemotePath, bootFile, params, setting.CodePathPrefix+jobName+modelarts.OutputPath, allFileName, preTrainModelPath, form.CkptName, grampus.GetNpuModelRemoteObsUrl(jobName))
  782. if err != nil {
  783. log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"])
  784. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  785. ctx.RenderWithErr("Create task failed, internal error", tpl, &form)
  786. return
  787. }
  788. commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName)
  789. req := &grampus.GenerateTrainJobReq{
  790. JobName: jobName,
  791. DisplayJobName: displayJobName,
  792. ComputeResource: models.NPUResource,
  793. ProcessType: grampus.ProcessorTypeNPU,
  794. Command: command,
  795. ImageId: form.ImageID,
  796. Description: description,
  797. CodeObsPath: codeObsPath,
  798. BootFileUrl: codeObsPath + bootFile,
  799. BootFile: bootFile,
  800. WorkServerNumber: form.WorkServerNumber,
  801. Uuid: uuid,
  802. CommitID: commitID,
  803. IsLatestVersion: isLatestVersion,
  804. BranchName: branchName,
  805. Params: form.Params,
  806. EngineName: engineName,
  807. VersionCount: versionCount,
  808. TotalVersionCount: modelarts.TotalVersionCount,
  809. DatasetNames: datasetNames,
  810. DatasetInfos: datasetInfos,
  811. Spec: spec,
  812. CodeName: strings.ToLower(repo.Name),
  813. }
  814. if form.ModelName != "" { //使用预训练模型训练
  815. req.ModelName = form.ModelName
  816. req.LabelName = form.LabelName
  817. req.CkptName = form.CkptName
  818. req.ModelVersion = form.ModelVersion
  819. req.PreTrainModelUrl = form.PreTrainModelUrl
  820. req.PreTrainModelPath = preTrainModelPath
  821. }
  822. _, err = grampus.GenerateTrainJob(ctx, req)
  823. if err != nil {
  824. log.Error("GenerateTrainJob failed:%v", err.Error())
  825. grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU)
  826. ctx.RenderWithErr(err.Error(), tpl, &form)
  827. return
  828. }
  829. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job")
  830. }
  831. func GetGrampusNotebook(ctx *context.APIContext) {
  832. var (
  833. err error
  834. )
  835. ID := ctx.Params(":id")
  836. job, err := models.GetCloudbrainByID(ID)
  837. if err != nil {
  838. ctx.NotFound("", err)
  839. log.Error("GetCloudbrainByID failed:", err)
  840. return
  841. }
  842. jobAfter, err := cloudbrainTask.SyncGrampusNotebookStatus(job)
  843. aiCenterName := cloudbrainService.GetAiCenterShow(jobAfter.AiCenter, ctx.Context)
  844. if err != nil {
  845. ctx.NotFound(err)
  846. log.Error("Sync cloud brain one status failed:", err)
  847. return
  848. }
  849. ctx.JSON(http.StatusOK, map[string]interface{}{
  850. "ID": ID,
  851. "JobName": jobAfter.JobName,
  852. "JobStatus": jobAfter.Status,
  853. "AiCenter": aiCenterName,
  854. "CreatedTime": jobAfter.CreatedUnix.Format("2006-01-02 15:04:05"),
  855. "CompletedTime": jobAfter.UpdatedUnix.Format("2006-01-02 15:04:05"),
  856. "JobDuration": jobAfter.TrainJobDuration,
  857. })
  858. }
  859. func GrampusStopJob(ctx *context.Context) {
  860. var ID = ctx.Params(":id")
  861. var resultCode = "0"
  862. var errorMsg = ""
  863. var status = ""
  864. task := ctx.Cloudbrain
  865. for {
  866. if task.Status == models.GrampusStatusStopped || task.Status == models.GrampusStatusFailed || task.Status == models.GrampusStatusSucceeded {
  867. log.Error("the job(%s) has been stopped", task.JobName, ctx.Data["msgID"])
  868. resultCode = "-1"
  869. errorMsg = ctx.Tr("cloudbrain.Already_stopped")
  870. break
  871. }
  872. res, err := grampus.StopJob(task.JobID, task.JobType)
  873. if err != nil {
  874. log.Error("StopJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  875. resultCode = strconv.Itoa(res.ErrorCode)
  876. errorMsg = ctx.Tr("cloudbrain.Stopped_failed")
  877. break
  878. }
  879. oldStatus := task.Status
  880. task.Status = getStopJobResponseStatus(res)
  881. if task.EndTime == 0 {
  882. task.EndTime = timeutil.TimeStampNow()
  883. }
  884. task.ComputeAndSetDuration()
  885. if oldStatus != task.Status {
  886. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  887. }
  888. err = models.UpdateJob(task)
  889. if err != nil {
  890. log.Error("UpdateJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"])
  891. resultCode = "-1"
  892. errorMsg = "system error"
  893. break
  894. }
  895. status = task.Status
  896. break
  897. }
  898. ctx.JSON(200, map[string]interface{}{
  899. "result_code": resultCode,
  900. "error_msg": errorMsg,
  901. "status": status,
  902. "id": ID,
  903. "StatusOK": 0,
  904. })
  905. }
  906. func getStopJobResponseStatus(res *models.GrampusStopJobResponse) string {
  907. newStatus := models.GrampusStatusStopping
  908. if res.Status != "" {
  909. newStatus = grampus.TransTrainJobStatus(res.Status)
  910. }
  911. return newStatus
  912. }
  913. func GrampusNotebookDel(ctx *context.Context) {
  914. var listType = ctx.Query("listType")
  915. if err := deleteGrampusJob(ctx); err != nil {
  916. log.Error("deleteGrampusJob failed: %v", err, ctx.Data["msgID"])
  917. ctx.ServerError(err.Error(), err)
  918. return
  919. }
  920. var isAdminPage = ctx.Query("isadminpage")
  921. var isHomePage = ctx.Query("ishomepage")
  922. if ctx.IsUserSiteAdmin() && isAdminPage == "true" {
  923. ctx.Redirect(setting.AppSubURL + "/admin" + "/cloudbrains")
  924. } else if isHomePage == "true" {
  925. ctx.Redirect(setting.AppSubURL + "/cloudbrains")
  926. } else {
  927. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=" + listType)
  928. }
  929. }
  930. func GrampusTrainJobDel(ctx *context.Context) {
  931. var listType = ctx.Query("listType")
  932. if err := deleteGrampusJob(ctx); err != nil {
  933. log.Error("deleteGrampusJob failed: %v", err, ctx.Data["msgID"])
  934. ctx.ServerError(err.Error(), err)
  935. return
  936. }
  937. var isAdminPage = ctx.Query("isadminpage")
  938. var isHomePage = ctx.Query("ishomepage")
  939. if ctx.IsUserSiteAdmin() && isAdminPage == "true" {
  940. ctx.Redirect(setting.AppSubURL + "/admin" + "/cloudbrains")
  941. } else if isHomePage == "true" {
  942. ctx.Redirect(setting.AppSubURL + "/cloudbrains")
  943. } else {
  944. ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job?listType=" + listType)
  945. }
  946. }
  947. func deleteGrampusJob(ctx *context.Context) error {
  948. task := ctx.Cloudbrain
  949. if task.Status != models.GrampusStatusStopped && task.Status != models.GrampusStatusSucceeded && task.Status != models.GrampusStatusFailed {
  950. log.Error("the job(%s) has not been stopped", task.JobName, ctx.Data["msgID"])
  951. return errors.New(ctx.Tr("cloudbrain.Not_Stopped"))
  952. }
  953. err := models.DeleteJob(task)
  954. if err != nil {
  955. log.Error("DeleteJob failed: %v", err, ctx.Data["msgID"])
  956. return err
  957. }
  958. storageType := models.TypeCloudBrainOne
  959. if task.ComputeResource == models.NPUResource {
  960. storageType = models.TypeCloudBrainTwo
  961. }
  962. DeleteCloudbrainJobStorage(task.JobName, storageType)
  963. return nil
  964. }
  965. type NotebookDataset struct {
  966. DatasetUrl string `json:"dataset_url"`
  967. }
  968. func GrampusNotebookShow(ctx *context.Context) {
  969. ctx.Data["PageIsCloudBrain"] = true
  970. var task *models.Cloudbrain
  971. task, err := models.GetCloudbrainByIDWithDeleted(ctx.Params(":id"))
  972. if err != nil {
  973. log.Error("GetCloudbrainByID failed:" + err.Error())
  974. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  975. return
  976. }
  977. task.ContainerIp = ""
  978. if task.DeletedAt.IsZero() && cloudbrainTask.IsTaskNotStop(task) { //normal record
  979. result, err := grampus.GetNotebookJob(task.JobID)
  980. if err != nil {
  981. log.Error("GetJob failed:" + err.Error())
  982. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  983. return
  984. }
  985. if result != nil {
  986. if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 {
  987. task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0]
  988. }
  989. oldStatus := task.Status
  990. task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
  991. if task.Status != oldStatus || task.Status == models.GrampusStatusRunning {
  992. task.Duration = result.JobInfo.RunSec
  993. if task.Duration < 0 {
  994. task.Duration = 0
  995. }
  996. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  997. if task.StartTime == 0 && result.JobInfo.StartedAt > 0 {
  998. task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt)
  999. }
  1000. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  1001. task.EndTime = task.StartTime.Add(task.Duration)
  1002. }
  1003. task.CorrectCreateUnix()
  1004. if oldStatus != task.Status {
  1005. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  1006. if models.IsTrainJobTerminal(task.Status) && task.ComputeResource == models.NPUResource {
  1007. if len(result.JobInfo.Tasks[0].CenterID) == 1 {
  1008. urchin.GetBackNpuModel(task.ID, grampus.GetRemoteEndPoint(result.JobInfo.Tasks[0].CenterID[0]), grampus.BucketRemote, grampus.GetNpuModelObjectKey(task.JobName), grampus.GetCenterProxy(setting.Grampus.LocalCenterID))
  1009. }
  1010. }
  1011. }
  1012. }
  1013. err = models.UpdateJob(task)
  1014. if err != nil {
  1015. log.Error("UpdateJob failed:" + err.Error())
  1016. }
  1017. }
  1018. }
  1019. if len(task.Parameters) > 0 {
  1020. var parameters models.Parameters
  1021. err := json.Unmarshal([]byte(task.Parameters), &parameters)
  1022. if err != nil {
  1023. log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err)
  1024. ctx.ServerError("system error", err)
  1025. return
  1026. }
  1027. if len(parameters.Parameter) > 0 {
  1028. paramTemp := ""
  1029. for _, Parameter := range parameters.Parameter {
  1030. param := Parameter.Label + " = " + Parameter.Value + "; "
  1031. paramTemp = paramTemp + param
  1032. }
  1033. task.Parameters = paramTemp[:len(paramTemp)-2]
  1034. } else {
  1035. task.Parameters = ""
  1036. }
  1037. }
  1038. user, err := models.GetUserByID(task.UserID)
  1039. if err == nil {
  1040. task.User = user
  1041. }
  1042. prepareSpec4Show(ctx, task)
  1043. ctx.Data["task"] = task
  1044. ctx.Data["datasetDownload"] = getDatasetDownloadInfo(ctx, task)
  1045. ctx.Data["modelDownload"] = getModelDownloadInfo(ctx, task)
  1046. ctx.Data["canDownload"] = cloudbrain.CanModifyJob(ctx, task)
  1047. ctx.Data["ai_center"] = cloudbrainService.GetAiCenterShow(task.AiCenter, ctx)
  1048. ctx.Data["code_path"] = cloudbrain.CodeMountPath
  1049. ctx.Data["dataset_path"] = cloudbrain.DataSetMountPath
  1050. ctx.Data["model_path"] = cloudbrain.ModelMountPath
  1051. ctx.HTML(http.StatusOK, tplGrampusNotebookShow)
  1052. }
  1053. func getDatasetDownloadInfo(ctx *context.Context, task *models.Cloudbrain) []*models.DatasetDownload {
  1054. datasetDownload := make([]*models.DatasetDownload, 0)
  1055. if ctx.IsSigned {
  1056. if task.Uuid != "" && task.UserID == ctx.User.ID {
  1057. if task.IsGPUTask() {
  1058. return GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, false)
  1059. } else {
  1060. datasetDownload = GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, false)
  1061. datasetObsUrlList := make([]NotebookDataset, 0)
  1062. _ = json.Unmarshal([]byte(task.DataUrl), &datasetObsUrlList)
  1063. for _, datasetInfo := range datasetDownload {
  1064. for _, datasetObs := range datasetObsUrlList {
  1065. log.Info("datasetObsUrl:" + datasetObs.DatasetUrl + "datasetName:" + datasetInfo.DatasetName)
  1066. if strings.Contains(datasetObs.DatasetUrl, datasetInfo.DatasetName) {
  1067. datasetInfo.DatasetDownloadLink = datasetObs.DatasetUrl
  1068. break
  1069. }
  1070. }
  1071. }
  1072. }
  1073. }
  1074. }
  1075. return datasetDownload
  1076. }
  1077. func getModelDownloadInfo(ctx *context.Context, task *models.Cloudbrain) *models.ModelDownload {
  1078. var modelDownload models.ModelDownload
  1079. if ctx.IsSigned {
  1080. if task.ModelName != "" && task.UserID == ctx.User.ID {
  1081. if task.IsNPUTask() {
  1082. modelDownload = models.ModelDownload{
  1083. Name: task.CkptName,
  1084. DownloadLink: "",
  1085. IsDelete: false,
  1086. }
  1087. if !HasModelFile(task) {
  1088. modelDownload.IsDelete = true
  1089. }
  1090. datasetObsUrlList := make([]NotebookDataset, 0)
  1091. _ = json.Unmarshal([]byte(task.DataUrl), &datasetObsUrlList)
  1092. for _, datasetObs := range datasetObsUrlList {
  1093. if strings.Contains(datasetObs.DatasetUrl, task.CkptName) {
  1094. modelDownload.DownloadLink = datasetObs.DatasetUrl
  1095. break
  1096. }
  1097. }
  1098. }
  1099. }
  1100. }
  1101. return &modelDownload
  1102. }
  1103. func GrampusTrainJobShow(ctx *context.Context) {
  1104. ctx.Data["PageIsCloudBrain"] = true
  1105. var task *models.Cloudbrain
  1106. task, err := models.GetCloudbrainByJobIDWithDeleted(ctx.Params(":jobid"))
  1107. if err != nil {
  1108. log.Error("GetCloudbrainByJobID failed:" + err.Error())
  1109. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  1110. return
  1111. }
  1112. task.ContainerIp = ""
  1113. task.User, _ = models.GetUserByID(task.UserID)
  1114. if task.DeletedAt.IsZero() { //normal record
  1115. result, err := grampus.GetJob(task.JobID)
  1116. if err != nil {
  1117. log.Error("GetJob failed:" + err.Error())
  1118. ctx.NotFound(ctx.Req.URL.RequestURI(), nil)
  1119. return
  1120. }
  1121. if result != nil {
  1122. if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 {
  1123. task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0]
  1124. }
  1125. oldStatus := task.Status
  1126. task.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
  1127. if task.Status != oldStatus || task.Status == models.GrampusStatusRunning {
  1128. task.Duration = result.JobInfo.RunSec
  1129. if task.Duration < 0 {
  1130. task.Duration = 0
  1131. }
  1132. task.TrainJobDuration = models.ConvertDurationToStr(task.Duration)
  1133. if task.StartTime == 0 && result.JobInfo.StartedAt > 0 {
  1134. task.StartTime = timeutil.TimeStamp(result.JobInfo.StartedAt)
  1135. }
  1136. if task.EndTime == 0 && models.IsTrainJobTerminal(task.Status) && task.StartTime > 0 {
  1137. task.EndTime = task.StartTime.Add(task.Duration)
  1138. }
  1139. task.CorrectCreateUnix()
  1140. if oldStatus != task.Status {
  1141. notification.NotifyChangeCloudbrainStatus(task, oldStatus)
  1142. if models.IsTrainJobTerminal(task.Status) && task.ComputeResource == models.NPUResource {
  1143. if len(result.JobInfo.Tasks[0].CenterID) == 1 {
  1144. urchin.GetBackNpuModel(task.ID, grampus.GetRemoteEndPoint(result.JobInfo.Tasks[0].CenterID[0]), grampus.BucketRemote, grampus.GetNpuModelObjectKey(task.JobName), grampus.GetCenterProxy(setting.Grampus.LocalCenterID))
  1145. }
  1146. }
  1147. }
  1148. }
  1149. err = models.UpdateJob(task)
  1150. if err != nil {
  1151. log.Error("UpdateJob failed:" + err.Error())
  1152. }
  1153. }
  1154. }
  1155. if len(task.Parameters) > 0 {
  1156. var parameters models.Parameters
  1157. err := json.Unmarshal([]byte(task.Parameters), &parameters)
  1158. if err != nil {
  1159. log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err)
  1160. ctx.ServerError("system error", err)
  1161. return
  1162. }
  1163. if len(parameters.Parameter) > 0 {
  1164. paramTemp := ""
  1165. for _, Parameter := range parameters.Parameter {
  1166. param := Parameter.Label + " = " + Parameter.Value + "; "
  1167. paramTemp = paramTemp + param
  1168. }
  1169. task.Parameters = paramTemp[:len(paramTemp)-2]
  1170. } else {
  1171. task.Parameters = ""
  1172. }
  1173. }
  1174. taskList := make([]*models.Cloudbrain, 0)
  1175. taskList = append(taskList, task)
  1176. prepareSpec4Show(ctx, task)
  1177. ctx.Data["version_list_task"] = taskList
  1178. ctx.Data["datasetDownload"] = GetCloudBrainDataSetInfo(task.Uuid, task.DatasetName, false)
  1179. ctx.Data["canDownload"] = cloudbrain.CanModifyJob(ctx, task)
  1180. ctx.Data["displayJobName"] = task.DisplayJobName
  1181. ctx.Data["ai_center"] = cloudbrainService.GetAiCenterShow(task.AiCenter, ctx)
  1182. ctx.HTML(http.StatusOK, tplGrampusTrainJobShow)
  1183. }
  1184. func GrampusDownloadLog(ctx *context.Context) {
  1185. jobID := ctx.Params(":jobid")
  1186. job, err := models.GetCloudbrainByJobID(jobID)
  1187. if err != nil {
  1188. log.Error("GetCloudbrainByJobID failed: %v", err, ctx.Data["MsgID"])
  1189. ctx.ServerError(err.Error(), err)
  1190. return
  1191. }
  1192. content, err := grampus.GetTrainJobLog(job.JobID)
  1193. if err != nil {
  1194. log.Error("GetTrainJobLog failed: %v", err, ctx.Data["MsgID"])
  1195. content = ""
  1196. }
  1197. fileName := job.JobName + "-log.txt"
  1198. ctx.Resp.Header().Set("Content-Disposition", "attachment; filename="+fileName)
  1199. ctx.Resp.Header().Set("Content-Type", "application/octet-stream")
  1200. var b []byte = []byte(content)
  1201. ctx.Resp.Write(b)
  1202. }
  1203. func GrampusGetLog(ctx *context.Context) {
  1204. jobID := ctx.Params(":jobid")
  1205. job, err := models.GetCloudbrainByJobID(jobID)
  1206. if err != nil {
  1207. log.Error("GetCloudbrainByJobID failed: %v", err, ctx.Data["MsgID"])
  1208. ctx.ServerError(err.Error(), err)
  1209. return
  1210. }
  1211. result, err := grampus.GetJob(jobID)
  1212. if err != nil {
  1213. log.Error("GetJob(%s) failed:%v", job.JobName, err)
  1214. ctx.JSON(http.StatusOK, map[string]interface{}{
  1215. "JobName": job.JobName,
  1216. "Content": "",
  1217. "CanLogDownload": false,
  1218. })
  1219. return
  1220. }
  1221. exitDiagnostics := ""
  1222. if result != nil {
  1223. exitDiagnostics = result.ExitDiagnostics
  1224. }
  1225. content, err := grampus.GetTrainJobLog(job.JobID)
  1226. if err != nil {
  1227. log.Error("GetTrainJobLog failed: %v", err, ctx.Data["MsgID"])
  1228. ctx.JSON(http.StatusOK, map[string]interface{}{
  1229. "JobName": job.JobName,
  1230. "Content": exitDiagnostics,
  1231. "CanLogDownload": false,
  1232. })
  1233. return
  1234. }
  1235. if result != nil {
  1236. job.Status = grampus.TransTrainJobStatus(result.JobInfo.Status)
  1237. if job.Status == models.GrampusStatusFailed {
  1238. content = content + "\n" + exitDiagnostics
  1239. }
  1240. }
  1241. canLogDownload := err == nil && job.IsUserHasRight(ctx.User)
  1242. ctx.JSON(http.StatusOK, map[string]interface{}{
  1243. "JobName": job.JobName,
  1244. "Content": content,
  1245. "CanLogDownload": canLogDownload,
  1246. })
  1247. return
  1248. }
  1249. func GrampusMetrics(ctx *context.Context) {
  1250. jobID := ctx.Params(":jobid")
  1251. job, err := models.GetCloudbrainByJobID(jobID)
  1252. if err != nil {
  1253. log.Error("GetCloudbrainByJobID failed: %v", err, ctx.Data["MsgID"])
  1254. ctx.ServerError(err.Error(), err)
  1255. return
  1256. }
  1257. result, err := grampus.GetGrampusMetrics(job.JobID)
  1258. if err != nil {
  1259. log.Error("GetTrainJobLog failed: %v", err, ctx.Data["MsgID"])
  1260. }
  1261. ctx.JSON(http.StatusOK, map[string]interface{}{
  1262. "JobID": jobID,
  1263. "Interval": result.Interval,
  1264. "MetricsInfo": result.MetricsInfo,
  1265. })
  1266. return
  1267. }
  1268. func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bootFile, paramSrc, outputRemotePath, datasetName, pretrainModelPath, pretrainModelFileName, modelRemoteObsUrl string) (string, error) {
  1269. var command string
  1270. //prepare
  1271. workDir := grampus.NpuWorkDir
  1272. if processorType == grampus.ProcessorTypeNPU {
  1273. command += "pwd;cd " + workDir + grampus.CommandPrepareScriptNpu
  1274. } else if processorType == grampus.ProcessorTypeGPU {
  1275. workDir = grampus.GpuWorkDir
  1276. command += "pwd;cd " + workDir + fmt.Sprintf(grampus.CommandPrepareScriptGpu, setting.Grampus.SyncScriptProject, setting.Grampus.SyncScriptProject)
  1277. }
  1278. //download code & dataset
  1279. if processorType == grampus.ProcessorTypeNPU {
  1280. //no need to download code & dataset by internet
  1281. } else if processorType == grampus.ProcessorTypeGPU {
  1282. commandDownload := "./downloader_for_minio " + setting.Grampus.Env + " " + codeRemotePath + " " + grampus.CodeArchiveName + " '" + dataRemotePath + "' '" + datasetName + "'"
  1283. commandDownload = processPretrainModelParameter(pretrainModelPath, pretrainModelFileName, commandDownload)
  1284. command += commandDownload
  1285. }
  1286. //unzip code & dataset
  1287. if processorType == grampus.ProcessorTypeNPU {
  1288. //no need to process
  1289. } else if processorType == grampus.ProcessorTypeGPU {
  1290. unZipDatasetCommand := cloudbrainTask.GenerateDatasetUnzipCommand(datasetName)
  1291. commandUnzip := "cd " + workDir + "code;unzip -q master.zip;rm -f master.zip;echo \"start to unzip dataset\";cd " + workDir + "dataset;" + unZipDatasetCommand
  1292. command += commandUnzip
  1293. }
  1294. command += "echo \"unzip finished;start to exec code;\";"
  1295. // set export
  1296. var commandExport string
  1297. if processorType == grampus.ProcessorTypeNPU {
  1298. commandExport = "export bucket=" + setting.Bucket + " && export remote_path=" + outputRemotePath + ";"
  1299. } else if processorType == grampus.ProcessorTypeGPU {
  1300. commandExport = "export env=" + setting.Grampus.Env + " && export remote_path=" + outputRemotePath + ";"
  1301. }
  1302. command += commandExport
  1303. //exec code
  1304. var parameters models.Parameters
  1305. var paramCode string
  1306. if len(paramSrc) != 0 {
  1307. err := json.Unmarshal([]byte(paramSrc), &parameters)
  1308. if err != nil {
  1309. log.Error("Failed to Unmarshal params: %s (%v)", paramSrc, err)
  1310. return command, err
  1311. }
  1312. for _, parameter := range parameters.Parameter {
  1313. paramCode += " --" + parameter.Label + "=" + parameter.Value
  1314. }
  1315. }
  1316. var commandCode string
  1317. if processorType == grampus.ProcessorTypeNPU {
  1318. paramCode += " --model_url=" + modelRemoteObsUrl
  1319. commandCode = "/bin/bash /home/work/run_train_for_openi.sh /home/work/openi.py " + grampus.NpuLocalLogUrl + paramCode + ";"
  1320. } else if processorType == grampus.ProcessorTypeGPU {
  1321. if pretrainModelFileName != "" {
  1322. paramCode += " --ckpt_url" + "=" + workDir + "pretrainmodel/" + pretrainModelFileName
  1323. }
  1324. commandCode = "cd " + workDir + "code/" + strings.ToLower(repoName) + ";python " + bootFile + paramCode + ";"
  1325. }
  1326. command += commandCode
  1327. //get exec result
  1328. commandGetRes := "result=$?;"
  1329. command += commandGetRes
  1330. //upload models
  1331. if processorType == grampus.ProcessorTypeNPU {
  1332. // no need to upload
  1333. } else if processorType == grampus.ProcessorTypeGPU {
  1334. commandUpload := "cd " + workDir + setting.Grampus.SyncScriptProject + "/;./uploader_for_gpu " + setting.Grampus.Env + " " + outputRemotePath + " " + workDir + "output/;"
  1335. command += commandUpload
  1336. }
  1337. //check exec result
  1338. commandCheckRes := "bash -c \"[[ $result -eq 0 ]] && exit 0 || exit -1\""
  1339. command += commandCheckRes
  1340. return command, nil
  1341. }
  // processPretrainModelParameter terminates commandDownload with ';'. When
  // pretrainModelPath is non-empty, the single-quoted model path and file name
  // are appended as two extra arguments first.
  func processPretrainModelParameter(pretrainModelPath string, pretrainModelFileName string, commandDownload string) string {
  	if pretrainModelPath == "" {
  		return commandDownload + ";"
  	}
  	return commandDownload + " '" + pretrainModelPath + "' '" + pretrainModelFileName + "';"
  }
  1350. func downloadZipCode(ctx *context.Context, codePath, branchName string) error {
  1351. archiveType := git.ZIP
  1352. archivePath := codePath
  1353. if !com.IsDir(archivePath) {
  1354. if err := os.MkdirAll(archivePath, os.ModePerm); err != nil {
  1355. log.Error("MkdirAll failed:" + err.Error())
  1356. return err
  1357. }
  1358. }
  1359. // Get corresponding commit.
  1360. var (
  1361. commit *git.Commit
  1362. err error
  1363. )
  1364. gitRepo := ctx.Repo.GitRepo
  1365. if err != nil {
  1366. log.Error("OpenRepository failed:" + err.Error())
  1367. return err
  1368. }
  1369. if gitRepo.IsBranchExist(branchName) {
  1370. commit, err = gitRepo.GetBranchCommit(branchName)
  1371. if err != nil {
  1372. log.Error("GetBranchCommit failed:" + err.Error())
  1373. return err
  1374. }
  1375. } else {
  1376. log.Error("the branch is not exist: " + branchName)
  1377. return fmt.Errorf("The branch does not exist.")
  1378. }
  1379. archivePath = path.Join(archivePath, grampus.CodeArchiveName)
  1380. if !com.IsFile(archivePath) {
  1381. if err := commit.CreateArchive(archivePath, git.CreateArchiveOpts{
  1382. Format: archiveType,
  1383. Prefix: setting.Repository.PrefixArchiveFiles,
  1384. }); err != nil {
  1385. log.Error("CreateArchive failed:" + err.Error())
  1386. return err
  1387. }
  1388. }
  1389. return nil
  1390. }
  1391. func HandleTaskWithAiCenter(ctx *context.Context) {
  1392. log.Info("HandleTaskWithAiCenter start")
  1393. updateCounts := 0
  1394. cloudBrains, err := models.GetC2NetWithAiCenterWrongJob()
  1395. if err != nil {
  1396. log.Error("GetC2NetWithAiCenterWrongJob failed:" + err.Error())
  1397. return
  1398. }
  1399. if len(cloudBrains) == 0 {
  1400. log.Info("HandleC2NetWithAiCenterWrongJob:no task need handle")
  1401. return
  1402. }
  1403. cloudBrainCounts := len(cloudBrains)
  1404. for _, task := range cloudBrains {
  1405. result, err := grampus.GetJob(task.JobID)
  1406. if err != nil {
  1407. log.Error("GetJob failed:" + err.Error())
  1408. continue
  1409. }
  1410. if len(result.JobInfo.Tasks) != 0 {
  1411. if len(result.JobInfo.Tasks[0].CenterID) == 1 && len(result.JobInfo.Tasks[0].CenterName) == 1 {
  1412. task.AiCenter = result.JobInfo.Tasks[0].CenterID[0] + "+" + result.JobInfo.Tasks[0].CenterName[0]
  1413. }
  1414. err = models.UpdateJob(task)
  1415. if err != nil {
  1416. log.Error("UpdateJob failed:" + err.Error())
  1417. }
  1418. updateCounts++
  1419. }
  1420. }
  1421. r := make(map[string]interface{}, 0)
  1422. r["cloudBrainCounts"] = cloudBrainCounts
  1423. r["updateCounts"] = updateCounts
  1424. ctx.JSON(http.StatusOK, response.SuccessWithData(r))
  1425. }
  1426. func GrampusNotebookDebug(ctx *context.Context) {
  1427. result, err := grampus.GetNotebookJob(ctx.Cloudbrain.JobID)
  1428. if err != nil {
  1429. ctx.RenderWithErr(err.Error(), tplDebugJobIndex, nil)
  1430. return
  1431. }
  1432. if len(result.JobInfo.Tasks) > 0 {
  1433. ctx.Redirect(result.JobInfo.Tasks[0].Url + "?token=" + result.JobInfo.Tasks[0].Token)
  1434. return
  1435. }
  1436. ctx.NotFound("Can not find the job.", nil)
  1437. }
  1438. func GrampusNotebookRestart(ctx *context.Context) {
  1439. var id = ctx.Params(":id")
  1440. var resultCode = "-1"
  1441. var errorMsg = ""
  1442. var status = ""
  1443. var spec *models.Specification
  1444. task := ctx.Cloudbrain
  1445. if ctx.Written() {
  1446. return
  1447. }
  1448. for {
  1449. if task.Status != models.GrampusStatusStopped && task.Status != models.GrampusStatusSucceeded && task.Status != models.GrampusStatusFailed {
  1450. log.Error("the job(%s) is not stopped", task.JobName, ctx.Data["MsgID"])
  1451. errorMsg = "the job is not stopped"
  1452. break
  1453. }
  1454. count, err := cloudbrainTask.GetNotFinalStatusTaskCount(ctx.User.ID, models.TypeC2Net, string(models.JobTypeDebug), task.ComputeResource)
  1455. if err != nil {
  1456. log.Error("GetCloudbrainNotebookCountByUserID failed:%v", err, ctx.Data["MsgID"])
  1457. errorMsg = "system error"
  1458. break
  1459. } else {
  1460. if count >= 1 {
  1461. log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
  1462. resultCode = "2"
  1463. errorMsg = ctx.Tr("repo.cloudbrain.morethanonejob")
  1464. break
  1465. }
  1466. }
  1467. oldSpec, err := resource.GetCloudbrainSpec(task.ID)
  1468. if err != nil || oldSpec == nil {
  1469. log.Error("NotebookManage GetCloudbrainSpec error.%v", err)
  1470. errorMsg = "Resource specification not available"
  1471. break
  1472. }
  1473. computeSourceSimple := models.GPU
  1474. action := models.ActionCreateGrampusGPUDebugTask
  1475. if task.ComputeResource == models.NPUResource {
  1476. computeSourceSimple = models.NPU
  1477. action = models.ActionCreateGrampusNPUDebugTask
  1478. } else if task.ComputeResource == models.GCUResource {
  1479. computeSourceSimple = models.GCU
  1480. action = models.ActionCreateGrampusGCUDebugTask
  1481. }
  1482. spec, err = resource.GetAndCheckSpec(ctx.User.ID, oldSpec.ID, models.FindSpecsOptions{
  1483. JobType: models.JobType(task.JobType),
  1484. ComputeResource: computeSourceSimple,
  1485. Cluster: models.C2NetCluster,
  1486. })
  1487. if err != nil || spec == nil {
  1488. log.Error("NotebookManage GetAndCheckSpec error.task.id = %d", task.ID)
  1489. errorMsg = "Resource specification not support any more"
  1490. break
  1491. }
  1492. if !account.IsPointBalanceEnough(ctx.User.ID, spec.UnitPrice) {
  1493. log.Error("point balance is not enough,userId=%d specId=%d", ctx.User.ID, spec.ID)
  1494. errorMsg = ctx.Tr("points.insufficient_points_balance")
  1495. break
  1496. }
  1497. if task.IsGPUTask() || task.IsGCUTask() {
  1498. if _, err := os.Stat(getOldJobPath(task)); err != nil {
  1499. log.Error("Can not find job minio path", err)
  1500. resultCode = "-1"
  1501. errorMsg = ctx.Tr("cloudbrain.result_cleared")
  1502. break
  1503. }
  1504. }
  1505. if !HasModelFile(task) { //使用预训练模型训练
  1506. errorMsg = ctx.Tr("repo.debug.manage.model_not_exist")
  1507. break
  1508. }
  1509. if hasDatasetDeleted(task) {
  1510. errorMsg = ctx.Tr("repo.debug.manage.dataset_not_exist")
  1511. break
  1512. }
  1513. createTime := timeutil.TimeStampNow()
  1514. res, err := grampus.RestartNotebookJob(task.JobID)
  1515. if err != nil {
  1516. log.Error("ManageNotebook2(%s) failed:%v", task.DisplayJobName, err.Error(), ctx.Data["MsgID"])
  1517. errorMsg = ctx.Tr("repo.debug_again_fail")
  1518. break
  1519. }
  1520. if res.GrampusResult.ErrorCode != 0 || res.NewId == "" {
  1521. log.Error("ManageNotebook2 failed:" + res.GrampusResult.ErrorMsg)
  1522. errorMsg = ctx.Tr("repo.debug_again_fail")
  1523. if res.GrampusResult.ErrorCode == 5005 {
  1524. errorMsg = ctx.Tr("repo.debug_again_fail_forever")
  1525. }
  1526. break
  1527. }
  1528. newTask := &models.Cloudbrain{
  1529. Status: res.Status,
  1530. UserID: task.UserID,
  1531. RepoID: task.RepoID,
  1532. JobID: res.NewId,
  1533. JobName: task.JobName,
  1534. DisplayJobName: task.DisplayJobName,
  1535. JobType: task.JobType,
  1536. Type: task.Type,
  1537. Uuid: task.Uuid,
  1538. Image: task.Image,
  1539. ImageID: task.ImageID,
  1540. EngineID: task.EngineID,
  1541. CommitID: task.CommitID,
  1542. EngineName: task.EngineName,
  1543. IsLatestVersion: "1",
  1544. BranchName: task.BranchName,
  1545. DatasetName: task.DatasetName,
  1546. ComputeResource: task.ComputeResource,
  1547. Description: task.Description,
  1548. CreatedUnix: createTime,
  1549. UpdatedUnix: createTime,
  1550. Spec: spec,
  1551. ModelName: task.ModelName,
  1552. ModelVersion: task.ModelVersion,
  1553. LabelName: task.LabelName,
  1554. PreTrainModelUrl: task.PreTrainModelUrl,
  1555. CkptName: task.CkptName,
  1556. WorkServerNumber: 1,
  1557. }
  1558. err = models.RestartCloudbrain(task, newTask)
  1559. if err != nil {
  1560. log.Error("RestartCloudbrain(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"])
  1561. errorMsg = "system error"
  1562. break
  1563. }
  1564. id = strconv.FormatInt(newTask.ID, 10)
  1565. status = res.Status
  1566. resultCode = "0"
  1567. notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, id, newTask.DisplayJobName, action)
  1568. break
  1569. }
  1570. ctx.JSON(200, map[string]string{
  1571. "result_code": resultCode,
  1572. "error_msg": errorMsg,
  1573. "status": status,
  1574. "id": id,
  1575. })
  1576. }