|
|
|
@@ -1,7 +1,11 @@ |
|
|
|
package repo |
|
|
|
|
|
|
|
import ( |
|
|
|
"encoding/json" |
|
|
|
"errors" |
|
|
|
|
|
|
|
"code.gitea.io/gitea/models" |
|
|
|
"code.gitea.io/gitea/modules/cloudbrain" |
|
|
|
"code.gitea.io/gitea/modules/context" |
|
|
|
"code.gitea.io/gitea/modules/log" |
|
|
|
"code.gitea.io/gitea/modules/setting" |
|
|
|
@@ -14,6 +18,19 @@ const ( |
|
|
|
PYTORCH_ENGINE = 0 |
|
|
|
TENSORFLOW_ENGINE = 1 |
|
|
|
MINDSPORE_ENGIN = 2 |
|
|
|
ModelMountPath = "/model" |
|
|
|
CodeMountPath = "/code" |
|
|
|
DataSetMountPath = "/dataset" |
|
|
|
LogFile = "log.txt" |
|
|
|
DefaultBranchName = "master" |
|
|
|
SubTaskName = "task1" |
|
|
|
GpuQueue = "openidgx" |
|
|
|
Success = "S000" |
|
|
|
GPU_PYTORCH_IMAGE = "dockerhub.pcl.ac.cn:5000/user-images/openi:tensorRT_7_zouap" |
|
|
|
) |
|
|
|
|
|
|
|
var ( |
|
|
|
TrainResourceSpecs *models.ResourceSpecs |
|
|
|
) |
|
|
|
|
|
|
|
func SaveModelConvert(ctx *context.Context) { |
|
|
|
@@ -25,6 +42,7 @@ func SaveModelConvert(ctx *context.Context) { |
|
|
|
name := ctx.Query("name") |
|
|
|
desc := ctx.Query("desc") |
|
|
|
modelId := ctx.Query("modelId") |
|
|
|
modelPath := ctx.Query("modelPath") |
|
|
|
SrcEngine := ctx.QueryInt("SrcEngine") |
|
|
|
InputShape := ctx.Query("inputshape") |
|
|
|
InputDataFormat := ctx.Query("inputdataformat") |
|
|
|
@@ -40,6 +58,7 @@ func SaveModelConvert(ctx *context.Context) { |
|
|
|
SrcEngine: SrcEngine, |
|
|
|
RepoId: ctx.Repo.Repository.ID, |
|
|
|
ModelId: modelId, |
|
|
|
ModelPath: modelPath, |
|
|
|
DestFormat: DestFormat, |
|
|
|
NetOutputFormat: NetOutputFormat, |
|
|
|
InputShape: InputShape, |
|
|
|
@@ -52,11 +71,89 @@ func SaveModelConvert(ctx *context.Context) { |
|
|
|
}) |
|
|
|
} |
|
|
|
|
|
|
|
func createTrainJob(modelId string, SrcEngine int, ctx *context.Context) { |
|
|
|
func createTrainJob(modelConvertId string, modelId string, SrcEngine int, ctx *context.Context, modelPath string) error { |
|
|
|
repo, _ := models.GetRepositoryByID(ctx.Repo.Repository.RepoID) |
|
|
|
if SrcEngine == PYTORCH_ENGINE { |
|
|
|
codePath := setting.JobPath + modelConvertId + CodeMountPath |
|
|
|
downloadCode(repo, codePath, DefaultBranchName) |
|
|
|
uploadCodeToMinio(codePath+"/", modelConvertId, CodeMountPath+"/") |
|
|
|
|
|
|
|
modelPath := setting.JobPath + modelConvertId + ModelMountPath + "/" |
|
|
|
mkModelPath(modelPath) |
|
|
|
uploadCodeToMinio(modelPath, modelConvertId, ModelMountPath+"/") |
|
|
|
command := getModelConvertCommand(modelConvertId) |
|
|
|
dataActualPath := setting.Attachment.Minio.RealPath + modelPath |
|
|
|
|
|
|
|
if TrainResourceSpecs == nil { |
|
|
|
json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs) |
|
|
|
} |
|
|
|
resourceSpec := TrainResourceSpecs.ResourceSpec[1] |
|
|
|
jobResult, err := cloudbrain.CreateJob(modelConvertId, models.CreateJobParams{ |
|
|
|
JobName: modelConvertId, |
|
|
|
RetryCount: 1, |
|
|
|
GpuType: GpuQueue, |
|
|
|
Image: GPU_PYTORCH_IMAGE, |
|
|
|
TaskRoles: []models.TaskRole{ |
|
|
|
{ |
|
|
|
Name: SubTaskName, |
|
|
|
TaskNumber: 1, |
|
|
|
MinSucceededTaskCount: 1, |
|
|
|
MinFailedTaskCount: 1, |
|
|
|
CPUNumber: resourceSpec.CpuNum, |
|
|
|
GPUNumber: resourceSpec.GpuNum, |
|
|
|
MemoryMB: resourceSpec.MemMiB, |
|
|
|
ShmMB: resourceSpec.ShareMemMiB, |
|
|
|
Command: command, |
|
|
|
NeedIBDevice: false, |
|
|
|
IsMainRole: false, |
|
|
|
UseNNI: false, |
|
|
|
}, |
|
|
|
}, |
|
|
|
Volumes: []models.Volume{ |
|
|
|
{ |
|
|
|
HostPath: models.StHostPath{ |
|
|
|
Path: codePath, |
|
|
|
MountPath: CodeMountPath, |
|
|
|
ReadOnly: false, |
|
|
|
}, |
|
|
|
}, |
|
|
|
{ |
|
|
|
HostPath: models.StHostPath{ |
|
|
|
Path: dataActualPath, |
|
|
|
MountPath: DataSetMountPath, |
|
|
|
ReadOnly: true, |
|
|
|
}, |
|
|
|
}, |
|
|
|
{ |
|
|
|
HostPath: models.StHostPath{ |
|
|
|
Path: modelPath, |
|
|
|
MountPath: ModelMountPath, |
|
|
|
ReadOnly: false, |
|
|
|
}, |
|
|
|
}, |
|
|
|
}, |
|
|
|
}) |
|
|
|
if err != nil { |
|
|
|
log.Error("CreateJob failed:", err.Error(), ctx.Data["MsgID"]) |
|
|
|
return err |
|
|
|
} |
|
|
|
if jobResult.Code != Success { |
|
|
|
log.Error("CreateJob(%s) failed:%s", modelConvertId, jobResult.Msg, ctx.Data["MsgID"]) |
|
|
|
return errors.New(jobResult.Msg) |
|
|
|
} |
|
|
|
|
|
|
|
var jobID = jobResult.Payload["jobId"].(string) |
|
|
|
log.Info("jobId=" + jobID) |
|
|
|
} |
|
|
|
|
|
|
|
return nil |
|
|
|
} |
|
|
|
|
|
|
|
func getModelConvertCommand(name string) string { |
|
|
|
var command string |
|
|
|
bootFile := "convert_pytorch.py" |
|
|
|
command += "python /code/" + bootFile + " > " + ModelMountPath + "/" + name + "-" + LogFile |
|
|
|
return command |
|
|
|
} |
|
|
|
|
|
|
|
func DeleteModelConvert(ctx *context.Context) { |
|
|
|
|