Browse Source

Merge pull request '智算网络NPU训练任务优化--取消代码下载时对外网的依赖' (#3029) from npu-internet-limit into V20221019

Reviewed-on: https://git.openi.org.cn/OpenI/aiforge/pulls/3029
Reviewed-by: ychao_1983 <ychao_1983@sina.com>
tags/v1.22.10.1^2
ychao_1983 3 years ago
parent
commit
ec7a5c1cbf
3 changed files with 17 additions and 5 deletions
  1. models/cloudbrain.go (+2, -0)
  2. modules/grampus/grampus.go (+11, -0)
  3. routers/repo/grampus.go (+4, -5)

+ 2
- 0
models/cloudbrain.go View File

@@ -1490,6 +1490,8 @@ type GrampusTasks struct {
  ReplicaNum int `json:"replicaNum"`
  Datasets []GrampusDataset `json:"datasets"`
  Models []GrampusDataset `json:"models"`
+ Code GrampusDataset `json:"code"`
+ BootFile string `json:"bootFile"`
  }

  type GrampusDataset struct {


+ 11
- 0
modules/grampus/grampus.go View File

@@ -1,6 +1,7 @@
  package grampus

  import (
+ "code.gitea.io/gitea/modules/cloudbrain"
  "encoding/json"
  "strings"


@@ -73,6 +74,7 @@ type GenerateTrainJobReq struct {
  PreTrainModelPath string
  PreTrainModelUrl string
  Spec *models.Specification
+ CodeName string
  }

  func getEndPoint() string {
@@ -102,6 +104,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error
  centerID, centerName := getCentersParamter(ctx, req)

  var datasetGrampus, modelGrampus []models.GrampusDataset
+ var codeGrampus models.GrampusDataset
  if ProcessorTypeNPU == req.ProcessType {
  datasetGrampus = getDatasetGrampus(req.DatasetInfos)
  if len(req.ModelName) != 0 {
@@ -114,6 +117,12 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error
  },
  }
  }
+ codeGrampus = models.GrampusDataset{
+ Name: req.CodeName,
+ Bucket: setting.Bucket,
+ EndPoint: getEndPoint(),
+ ObjectKey: req.CodeObsPath + cloudbrain.DefaultBranchName + ".zip",
+ }
  }

  jobResult, err := createJob(models.CreateGrampusJobRequest{
@@ -130,6 +139,8 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error
  ReplicaNum: 1,
  Datasets: datasetGrampus,
  Models: modelGrampus,
+ Code: codeGrampus,
+ BootFile: req.BootFile,
  },
  },
  })


+ 4
- 5
routers/repo/grampus.go View File

@@ -713,6 +713,7 @@ func grampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
  DatasetNames: datasetNames,
  DatasetInfos: datasetInfos,
  Spec: spec,
+ CodeName: strings.ToLower(repo.Name),
  }
  if form.ModelName != "" { //使用预训练模型训练
  req.ModelName = form.ModelName
@@ -977,8 +978,7 @@ func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bo
  command += "pwd;cd " + workDir + fmt.Sprintf(grampus.CommandPrepareScript, setting.Grampus.SyncScriptProject, setting.Grampus.SyncScriptProject)
  //download code & dataset
  if processorType == grampus.ProcessorTypeNPU {
- commandDownload := "./downloader_for_obs " + setting.Bucket + " " + codeRemotePath + " " + grampus.CodeArchiveName + ";"
- command += commandDownload
+ //no need to download code & dataset by internet
  } else if processorType == grampus.ProcessorTypeGPU {
  commandDownload := "./downloader_for_minio " + setting.Grampus.Env + " " + codeRemotePath + " " + grampus.CodeArchiveName + " '" + dataRemotePath + "' '" + datasetName + "'"
  commandDownload = processPretrainModelParameter(pretrainModelPath, pretrainModelFileName, commandDownload)
@@ -987,8 +987,7 @@ func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bo


  //unzip code & dataset
  if processorType == grampus.ProcessorTypeNPU {
- commandUnzip := "cd " + workDir + "code;unzip -q master.zip;"
- command += commandUnzip
+ //no need to process
  } else if processorType == grampus.ProcessorTypeGPU {
  unZipDatasetCommand := generateDatasetUnzipCommand(datasetName)
  commandUnzip := "cd " + workDir + "code;unzip -q master.zip;echo \"start to unzip dataset\";cd " + workDir + "dataset;" + unZipDatasetCommand
@@ -1025,7 +1024,7 @@ func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bo


  var commandCode string
  if processorType == grampus.ProcessorTypeNPU {
- commandCode = "/bin/bash /home/work/run_train_for_openi.sh " + workDir + "code/" + strings.ToLower(repoName) + "/" + bootFile + " /tmp/log/train.log" + paramCode + ";"
+ commandCode = "/bin/bash /home/work/run_train_for_openi.sh /home/work/openi.py /tmp/log/train.log" + paramCode + ";"
  } else if processorType == grampus.ProcessorTypeGPU {
  if pretrainModelFileName != "" {
  paramCode += " --ckpt_url" + "=" + workDir + "pretrainmodel/" + pretrainModelFileName


Loading…
Cancel
Save