# Conflicts:
# internal/logic/adapters/createclusterlogic.go
Former-commit-id: 1e969cc3e7
pull/322/head
| @@ -38,10 +38,9 @@ jobs: | |||||
| username: ${{ secrets.ALIYUN_USERNAME }} | username: ${{ secrets.ALIYUN_USERNAME }} | ||||
| password: ${{ secrets.ALIYUN_PASSWORD }} | password: ${{ secrets.ALIYUN_PASSWORD }} | ||||
| - name: Build and push | |||||
| - name: Build and push multi-arch image | |||||
| run: | | run: | | ||||
| docker build -t ${{env.REGISTRY}}/${{env.IMAGE_NAME}}:${{env.IMAGE_TAG}} . | |||||
| docker push ${{env.REGISTRY}}/${{env.IMAGE_NAME}}:${{env.IMAGE_TAG}} | |||||
| docker buildx build --platform linux/amd64,linux/arm64 -t ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ env.IMAGE_TAG }} --push . | |||||
| - name: Set up SSH key | - name: Set up SSH key | ||||
| run: | | run: | | ||||
| @@ -1,4 +1,4 @@ | |||||
| FROM golang:1.22.4-alpine3.20 AS builder | |||||
| FROM --platform=$BUILDPLATFORM golang:1.22.4-alpine3.20 AS builder | |||||
| WORKDIR /app | WORKDIR /app | ||||
| COPY . . | COPY . . | ||||
| @@ -6,10 +6,12 @@ COPY . . | |||||
| ENV GO111MODULE=on GOPROXY=https://goproxy.cn,direct | ENV GO111MODULE=on GOPROXY=https://goproxy.cn,direct | ||||
| RUN go mod download | RUN go mod download | ||||
| RUN CGO_ENABLED=0 go build -ldflags="-w -s" -o pcm-core-api | |||||
| ARG TARGETOS | |||||
| ARG TARGETARCH | |||||
| # 使用 GOOS 和 GOARCH 环境变量来构建不同架构的二进制文件 | |||||
| RUN CGO_ENABLED=0 GOOS=$TARGETOS GOARCH=$TARGETARCH go build -ldflags="-w -s" -o pcm-core-api | |||||
| FROM alpine:latest | |||||
| FROM --platform=$TARGETPLATFORM alpine:latest | |||||
| WORKDIR /app | WORKDIR /app | ||||
| #修改alpine源为上海交通大学 | #修改alpine源为上海交通大学 | ||||
| @@ -18,7 +20,7 @@ RUN apk add --no-cache ca-certificates && update-ca-certificates && \ | |||||
| rm -rf /var/cache/apk/* | rm -rf /var/cache/apk/* | ||||
| COPY --from=builder /app/pcm-core-api . | COPY --from=builder /app/pcm-core-api . | ||||
| COPY etc/pcm.yaml . | |||||
| COPY --from=builder /app/etc/pcm.yaml . | |||||
| ENV TZ=Asia/Shanghai | ENV TZ=Asia/Shanghai | ||||
| @@ -136,24 +136,61 @@ type CloudInfo struct { | |||||
| } | } | ||||
| type AiInfo struct { | type AiInfo struct { | ||||
| TaskId int64 `json:"taskId,omitempty"` | |||||
| ProjectId string `json:"project_id,omitempty"` | |||||
| AdapterId int64 `json:"adapterId,omitempty,optional"` | |||||
| AdapterName string `json:"adapterName,omitempty,optional"` | |||||
| ClusterId int64 `json:"clusterId,omitempty,optional"` | |||||
| ClusterName string `json:"clusterName,omitempty,optional"` | |||||
| Name string `json:"name,omitempty"` | |||||
| Status string `json:"status,omitempty"` | |||||
| StartTime string `json:"startTime,omitempty"` | |||||
| RunningTime int64 `json:"runningTime,omitempty"` | |||||
| Result string `json:"result,omitempty"` | |||||
| JobId string `json:"jobId,omitempty"` | |||||
| Id int64 `json:"id"` // id | |||||
| AdapterId int64 `json:"adapterId,omitempty,optional"` | |||||
| AdapterName string `json:"adapterName,omitempty,optional"` | |||||
| ClusterId int64 `json:"clusterId,omitempty,optional"` | |||||
| ClusterIds []int64 `json:"clusterIds,omitempty,optional"` | |||||
| TaskId int64 `json:"taskId,omitempty"` | |||||
| TaskName string `json:"taskName,omitempty"` | |||||
| Replica int32 `json:"replica,omitempty"` | |||||
| ResourceType string `json:"resourceType,omitempty"` | |||||
| CpuCoreNum int32 `json:"cpuCoreNum,omitempty"` | |||||
| TaskType string `json:"taskType,omitempty"` | |||||
| DatasetsName string `json:"datasetsName,omitempty"` | |||||
| ProjectId string `json:"project_id,omitempty"` | |||||
| StrategyName string `json:"strategyName,omitempty"` | |||||
| ClusterToStaticWeight map[string]int32 `json:"clusterToStaticWeight,omitempty"` | |||||
| Tops float64 `json:"tops,omitempty"` | |||||
| ComputeCard string `json:"computeCard,omitempty,optional"` | |||||
| CodeType string `json:"codeType,omitempty,optional"` | |||||
| ClusterName string `json:"clusterName,omitempty,optional"` | |||||
| ModelName string `json:"ModelName,omitempty,optional"` | |||||
| AlgorithmName string `json:"algorithmName,omitempty,optional"` | |||||
| Strategy string `json:"strategy,omitempty"` | |||||
| ImageId string `json:"imageId,omitempty"` | |||||
| SpecId string `json:"specId,omitempty"` | |||||
| DatasetsId string `json:"datasetsId,omitempty"` | |||||
| CodeId string `json:"codeId,omitempty"` | |||||
| ResourceId string `json:"resourceId,omitempty"` | |||||
| AlgorithmId string `json:"algorithmId,omitempty"` | |||||
| MetadataName string `json:"metadataName,omitempty"` | |||||
| Cmd string `json:"cmd,omitempty"` | |||||
| Envs []string `json:"envs,omitempty"` | |||||
| Params []string `json:"params,omitempty"` | |||||
| Environments string `json:"environments,omitempty"` | |||||
| Parameters string `json:"parameters,omitempty"` | |||||
| Name string `json:"name,omitempty"` | |||||
| Status string `json:"status,omitempty"` | |||||
| StartTime string `json:"startTime,omitempty"` | |||||
| RunningTime int64 `json:"runningTime,omitempty"` | |||||
| Result string `json:"result,omitempty"` | |||||
| JobId string `json:"jobId,omitempty"` | |||||
| Datasets string `json:"datasets,omitempty"` | |||||
| AlgorithmCode string `json:"algorithmCode,omitempty"` | |||||
| Image string `json:"image,omitempty"` | |||||
| CreateTime string `json:"createTime,omitempty"` | CreateTime string `json:"createTime,omitempty"` | ||||
| ImageUrl string `json:"imageUrl,omitempty"` | ImageUrl string `json:"imageUrl,omitempty"` | ||||
| Command string `json:"command,omitempty"` | Command string `json:"command,omitempty"` | ||||
| FlavorId string `json:"flavorId,omitempty"` | FlavorId string `json:"flavorId,omitempty"` | ||||
| SubscriptionId string `json:"subscriptionId,omitempty"` | SubscriptionId string `json:"subscriptionId,omitempty"` | ||||
| ItemVersionId string `json:"itemVersionId,omitempty"` | ItemVersionId string `json:"itemVersionId,omitempty"` | ||||
| ObsUrl string `json:"obsUrl,omitempty"` | |||||
| } | } | ||||
| type VmInfo struct { | type VmInfo struct { | ||||
| @@ -0,0 +1,44 @@ | |||||
| **PCM arm环境部署流程总结** | |||||
| 10月12日: | |||||
| 现场环境为银河麒麟V10(GFB) | |||||
| 预先准备的安装文档依赖底层环境为X86架构的centos操作系统,所以很多工作都需要重新整理 | |||||
| 我们尝试在阿里云服务器 创建一台按需计费的arm64 ubuntu系统虚拟机来模拟现场环境 | |||||
| 首先需要安装K8S集群,参考sealos官方的文档可以提供arm版本的二进制工具,所以沿用sealos进行二进制安装的方式 | |||||
| sealos安装集群要求原操作系统没有docker环境,现场有三台机器,但是有两台已安装docker并不能调整,所以使用另外一台没有安装docker的机器进行实验 | |||||
| 10月14日: | |||||
| mysql现场本地有安装包可以提供安装包,但是后期现场直接提供了mysql环境,实施过程中仅用脚本导入了数据即可,结合此次经验,可以考虑将数据库也放到容器环境中,但是要做好pvc部分的处理 | |||||
| sealos启动k8s集群的过程中,出现了can't get ip address of node kylin-pc,error:no default route found in "/proc/net/route" or "/proc/net/ipv6_route" node="kylin-pc",检查了一下服务器的名称为kylin-PC,这里查询后发现**k8s不支持机器名称为大写**,调整虚拟机名称为kylin-pc之后此报错依旧存在,此时经过网络搜索排查初步怀疑是机器的路由配置存在问题,但问题没有解决 | |||||
| 10月15日: | |||||
| 重新调整思路开始排查docker 进程,发现apiserver和cm和scheduler启动的日志报错都是提示 | |||||
| exec /pause:permission denied,此时怀疑是镜像打包的系统版本和现场环境的版本差异导致镜像应用不兼容, 找舒总帮忙协调找一个同样版本的系统镜像搭建模拟环境,然后给舒总看了相关的报错之后,舒总提示在麒麟的系统中有一个安全相关的设置要关闭,通过执行**sudo setstatus softmode -p关闭系统安全限制**(后续验证其实就是将系统界面设置里面的防火墙关闭即可),不然在镜像中会提示二进制无法执行。然后sealos安装集群正常往下走。 | |||||
| 10月16日: | |||||
| 发现sealos安装流程可以正常走完,apiserver、cm和scheduler都能正常启动,但是在执行网络插件cilium安装的过程中出现报错"Envoy; Binary "cilium-envoy" cannot be executed" error="signal: aborted(core dumped)" subsys=envoy-manager,此时咨询舒总,推测是**应用版本和系统内核不兼容**,这个问题在长沙办公室同版本镜像装在PC机器上可以重现出来。 | |||||
| 然后尝试将k8s版本降低,同时网络插件从cilium换成了calico,中途还是会出现coredns容器起不来的情况,中间过程尝试用gfb镜像在阿里云操作发现经常会死机所以放弃使用。 | |||||
| 10月17日: | |||||
| 联系舒总看是否可以联系到麒麟云的同事提供一个容器云的安装包来部署K8S集群,中途发现需要使用rpm命令来安装工具包,而现场环境不支持rpm命令所以放弃。 | |||||
| 因为K8S集群还存在问题无法搭建,所以中途考虑使用纯docker的方案也把容器暂时跑起来运行了。但是结合业务场景提供不了太多支持。 | |||||
| 10月18日: | |||||
| 长沙办公室GFB系统安装完成,在上文提到的关闭防火墙操作执行之后,1.25版本的集群安装没有出现问题。然后在这个系统中的所有素材打包发往现场。用这个包在安装之后,发现所有的pod都能够启动,状态为running,网络插件中的calico-node的pod可用数一直为0,查看calico-node一直报错kubelet Readiness probe failed calico/node is not ready: BIRD is not ready:Error querying BIRD:unable to connect to BIRDv4 socket:dial unix /var/run/calico/bird.ctl: connection refused或者是二进制文件找不到,总之是网络相关的配置有问题,然后发现calico-server的镜像没有,pod也没有起来,以为是打的包有问题,所以又重新怀疑到网络问题,调整dns发现没有效果。 | |||||
| 10月19日: | |||||
| 找到阳哥帮忙来查看一下,起初以为是因为calico没有识别到网卡,需要在calico的cm中去配置网卡识别规则来重新识别,改动cm之后没有效果,所有其他pod的报错日志都是Error from server: no preferred addresses found; known addresses: [],然后查看kubelet日志,其中报错can't get ip address of node error:no default routes,阳哥提醒再检查一下路由,发现果然没有配置默认路由,然后**配置了一个同网段不存在的地址作为默认网关**,此时所有的pod都正常启动可以运行了。 | |||||
| 后续pcm服务的部署过程相对比较顺利,其中一个点是pcm-kubernetes的配置文件中服务名的配置需要和集群名称一一对应,这个配置规则当时没有注意到,后续需要把规则调整,一个服务要可以代理多个集群,**服务的名称不应该和集群名称相关联**。 | |||||
| 至此所有的服务和业务部署流程基本完成,PCM平台在arm64的麒麟系统中部署基本形成一套完整的素材。过程中遇到了很多问题,总结起来可以提高效率的地方: | |||||
| **1. 在得知现场环境的版本之后,如果有条件第一时间搭建对应的模拟环境,方便后续的问题复现以及排查 | |||||
| 2. 目前团队对于网络方面的储备略有不足,需要多加学习 | |||||
| 3. 有些安装繁琐或者对系统版本要求多但是可以通过容器化的东西,可以考虑转到容器中运行,例如mysql等** | |||||
| @@ -26,7 +26,8 @@ func (l *ListNoticeLogic) ListNotice(req *clientCore.ListNoticeReq) (*clientCore | |||||
| var resp clientCore.ListNoticeResp | var resp clientCore.ListNoticeResp | ||||
| l.svcCtx.DbEngin.Raw("select * from t_notice order by created_time desc").Scan(&notices) | |||||
| //TODO 防止数据量过大,限制查询记录100条 | |||||
| l.svcCtx.DbEngin.Raw("select * from t_notice order by created_time desc limit 100").Scan(&notices) | |||||
| for _, notice := range notices { | for _, notice := range notices { | ||||
| resp.Data = append(resp.Data, notice) | resp.Data = append(resp.Data, notice) | ||||
| } | } | ||||
| @@ -2,6 +2,8 @@ package core | |||||
| import ( | import ( | ||||
| "context" | "context" | ||||
| "encoding/json" | |||||
| "fmt" | |||||
| "github.com/jinzhu/copier" | "github.com/jinzhu/copier" | ||||
| clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/client" | clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/client" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" | "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" | ||||
| @@ -9,6 +11,7 @@ import ( | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" | "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" | ||||
| "gitlink.org.cn/jcce-pcm/utils/tool" | "gitlink.org.cn/jcce-pcm/utils/tool" | ||||
| "gorm.io/gorm" | "gorm.io/gorm" | ||||
| "log" | |||||
| "github.com/zeromicro/go-zero/core/logx" | "github.com/zeromicro/go-zero/core/logx" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" | "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" | ||||
| @@ -76,12 +79,45 @@ func (l *PullTaskInfoLogic) PullTaskInfo(req *clientCore.PullTaskInfoReq) (*clie | |||||
| } | } | ||||
| case 1: | case 1: | ||||
| var aiModelList []models.Ai | |||||
| var aiModelList []models.TaskAi | |||||
| err := findModelList(req.AdapterId, l.svcCtx.DbEngin, &aiModelList) | err := findModelList(req.AdapterId, l.svcCtx.DbEngin, &aiModelList) | ||||
| if err != nil { | if err != nil { | ||||
| return nil, err | return nil, err | ||||
| } | } | ||||
| utils.Convert(aiModelList, &resp.AiInfoList) | utils.Convert(aiModelList, &resp.AiInfoList) | ||||
| if len(resp.AiInfoList) > 0 { | |||||
| for i, aiInfo := range aiModelList { | |||||
| if resp.AiInfoList[i].Environments != "" { | |||||
| // 定义一个map来存储解析后的JSON数据 | |||||
| var result map[string]interface{} | |||||
| // 解析JSON字符串 | |||||
| err := json.Unmarshal([]byte(resp.AiInfoList[i].Environments), &result) | |||||
| if err != nil { | |||||
| log.Fatalf("Error parsing JSON: %v", err) | |||||
| } | |||||
| // 如果你需要将解析后的map再次转换为JSON字符串,可以使用json.MarshalIndent | |||||
| formattedJSON, err := json.MarshalIndent(result, "", " ") | |||||
| aiInfo.Environments = string(formattedJSON) | |||||
| fmt.Println(aiInfo.Environments) | |||||
| resp.AiInfoList[i].Environments = aiInfo.Environments | |||||
| } | |||||
| if resp.AiInfoList[i].Parameters != "" { | |||||
| // 定义一个map来存储解析后的JSON数据 | |||||
| var result []interface{} | |||||
| // 解析JSON字符串 | |||||
| err := json.Unmarshal([]byte(resp.AiInfoList[i].Parameters), &result) | |||||
| if err != nil { | |||||
| log.Fatalf("Error parsing JSON: %v", err) | |||||
| } | |||||
| // 如果你需要将解析后的map再次转换为JSON字符串,可以使用json.MarshalIndent | |||||
| formattedJSON, err := json.MarshalIndent(result, "", " ") | |||||
| aiInfo.Parameters = string(formattedJSON) | |||||
| fmt.Println(aiInfo.Parameters) | |||||
| resp.AiInfoList[i].Parameters = aiInfo.Parameters | |||||
| } | |||||
| } | |||||
| } | |||||
| } | } | ||||
| return &resp, nil | return &resp, nil | ||||
| } | } | ||||
| @@ -9,7 +9,6 @@ import ( | |||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" | "gitlink.org.cn/JointCloud/pcm-coordinator/internal/svc" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types" | "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types" | ||||
| "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" | "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" | ||||
| "time" | |||||
| ) | ) | ||||
| type DeployInstanceListLogic struct { | type DeployInstanceListLogic struct { | ||||
| @@ -72,18 +71,6 @@ func (l *DeployInstanceListLogic) DeployInstanceList(req *types.DeployInstanceLi | |||||
| if len(list) != 0 { | if len(list) != 0 { | ||||
| go status.UpdateDeployInstanceStatusBatch(l.svcCtx, list, true) | go status.UpdateDeployInstanceStatusBatch(l.svcCtx, list, true) | ||||
| ins := list[0] | |||||
| for i := range list { | |||||
| uTime, _ := time.Parse(time.RFC3339, ins.UpdateTime) | |||||
| latest, _ := time.Parse(time.RFC3339, list[i].UpdateTime) | |||||
| if latest.After(uTime) { | |||||
| ins = list[i] | |||||
| } | |||||
| } | |||||
| go status.UpdateDeployInstanceStatus(l.svcCtx, ins, true, nil) | |||||
| go status.UpdateDeployTaskStatus(l.svcCtx) | |||||
| } | } | ||||
| resp.List = &deployTasks | resp.List = &deployTasks | ||||
| @@ -103,6 +103,15 @@ func (l *StartAllByDeployTaskIdLogic) startAll(list []*models.AiInferDeployInsta | |||||
| return | return | ||||
| } | } | ||||
| } | } | ||||
| ins.Status = "Updating" | |||||
| err = l.svcCtx.Scheduler.AiStorages.UpdateInferDeployInstance(ins, true) | |||||
| if err != nil { | |||||
| wg.Done() | |||||
| <-buf | |||||
| return | |||||
| } | |||||
| wg.Done() | wg.Done() | ||||
| <-buf | <-buf | ||||
| }() | }() | ||||
| @@ -45,7 +45,11 @@ func (l *StartDeployInstanceListLogic) StartDeployInstanceList(req *types.StartD | |||||
| } | } | ||||
| } | } | ||||
| go status.UpdateDeployInstanceStatus(l.svcCtx, ins, true, nil) | |||||
| ins.Status = "Updating" | |||||
| err = l.svcCtx.Scheduler.AiStorages.UpdateInferDeployInstance(ins, true) | |||||
| if err != nil { | |||||
| return nil, err | |||||
| } | |||||
| return resp, nil | return resp, nil | ||||
| } | } | ||||
| @@ -104,6 +104,15 @@ func (l *StopAllByDeployTaskIdLogic) stopAll(list []*models.AiInferDeployInstanc | |||||
| return | return | ||||
| } | } | ||||
| } | } | ||||
| ins.Status = "Updating" | |||||
| err = l.svcCtx.Scheduler.AiStorages.UpdateInferDeployInstance(ins, true) | |||||
| if err != nil { | |||||
| wg.Done() | |||||
| <-buf | |||||
| return | |||||
| } | |||||
| wg.Done() | wg.Done() | ||||
| <-buf | <-buf | ||||
| }() | }() | ||||
| @@ -45,7 +45,11 @@ func (l *StopDeployInstanceLogic) StopDeployInstance(req *types.StopDeployInstan | |||||
| } | } | ||||
| } | } | ||||
| go status.UpdateDeployInstanceStatus(l.svcCtx, ins, true, nil) | |||||
| ins.Status = "Updating" | |||||
| err = l.svcCtx.Scheduler.AiStorages.UpdateInferDeployInstance(ins, true) | |||||
| if err != nil { | |||||
| return nil, err | |||||
| } | |||||
| return resp, nil | return resp, nil | ||||
| } | } | ||||
| @@ -28,6 +28,7 @@ import ( | |||||
| "gitlink.org.cn/JointCloud/pcm-modelarts/client/modelartsservice" | "gitlink.org.cn/JointCloud/pcm-modelarts/client/modelartsservice" | ||||
| "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts" | "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts" | ||||
| modelartsclient "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts" | modelartsclient "gitlink.org.cn/JointCloud/pcm-modelarts/modelarts" | ||||
| "gorm.io/gorm" | |||||
| "log" | "log" | ||||
| "mime/multipart" | "mime/multipart" | ||||
| "strconv" | "strconv" | ||||
| @@ -54,6 +55,7 @@ type ModelArtsLink struct { | |||||
| Version string | Version string | ||||
| ModelId string | ModelId string | ||||
| ModelType string | ModelType string | ||||
| DbEngin *gorm.DB | |||||
| } | } | ||||
| // Version 结构体表示版本号 | // Version 结构体表示版本号 | ||||
| @@ -61,6 +63,10 @@ type Version struct { | |||||
| Major, Minor, Patch int | Major, Minor, Patch int | ||||
| } | } | ||||
| type AiStorage struct { | |||||
| DbEngin *gorm.DB | |||||
| } | |||||
| // ParseVersion 从字符串解析版本号 | // ParseVersion 从字符串解析版本号 | ||||
| func ParseVersion(versionStr string) (*Version, error) { | func ParseVersion(versionStr string) (*Version, error) { | ||||
| parts := strings.Split(versionStr, ".") | parts := strings.Split(versionStr, ".") | ||||
| @@ -174,6 +180,11 @@ func (m *ModelArtsLink) SubmitTask(ctx context.Context, imageId string, cmd stri | |||||
| Platform: m.platform, | Platform: m.platform, | ||||
| } | } | ||||
| resp, err := m.modelArtsRpc.CreateTrainingJob(ctx, req) | resp, err := m.modelArtsRpc.CreateTrainingJob(ctx, req) | ||||
| //tx := m.DbEngin.Create(adapterId) | |||||
| /*if tx.Error != nil { | |||||
| return tx.Error, nil | |||||
| }*/ | |||||
| if err != nil { | if err != nil { | ||||
| return nil, err | return nil, err | ||||
| } | } | ||||
| @@ -45,8 +45,8 @@ const ( | |||||
| WorkPath = "/work/home/acgnnmfbwo/pcmv1/" | WorkPath = "/work/home/acgnnmfbwo/pcmv1/" | ||||
| TimeoutLimit = "10:00:00" | TimeoutLimit = "10:00:00" | ||||
| PythonCodePath = "/work/home/acgnnmfbwo/111111/py/test.py" | PythonCodePath = "/work/home/acgnnmfbwo/111111/py/test.py" | ||||
| DATASETS_DIR = "/work/home/acgnnmfbwo/pcmv1/dataset" | |||||
| ALGORITHM_DIR = "/work/home/acgnnmfbwo/pcmv1/algorithm" | |||||
| DATASETS_DIR = KUNSHAN_DIR + "/dataset" | |||||
| ALGORITHM_DIR = KUNSHAN_DIR + "/algorithm" | |||||
| KUNSHAN_DIR = "/public/home/acgnnmfbwo/pcmv1" | KUNSHAN_DIR = "/public/home/acgnnmfbwo/pcmv1" | ||||
| TRAIN_FILE = "train.py" | TRAIN_FILE = "train.py" | ||||
| CPUCOREPRICEPERHOUR = 0.09 | CPUCOREPRICEPERHOUR = 0.09 | ||||
| @@ -97,8 +97,8 @@ var ( | |||||
| } | } | ||||
| ModelNameCmdMap = map[string]string{ | ModelNameCmdMap = map[string]string{ | ||||
| "blip-image-captioning-base": "pip install transformers python-multipart fastapi uvicorn[standard]; python /public/home/acgnnmfbwo/pcmv1/inference/pytorch/blip_image_captioning_base/infer.py", | |||||
| "imagenet_resnet50": "pip install fastapi uvicorn[standard] python-multipart; python /public/home/acgnnmfbwo/pcmv1/inference/pytorch/imagenet_resnet50/infer.py", | |||||
| "blip-image-captioning-base": "sudo pip install transformers python-multipart fastapi uvicorn[standard]; sudo python /public/home/acgnnmfbwo/pcmv1/inference/pytorch/blip_image_captioning_base/infer.py", | |||||
| "imagenet_resnet50": "sudo pip install fastapi uvicorn[standard] python-multipart; sudo python /public/home/acgnnmfbwo/pcmv1/inference/pytorch/imagenet_resnet50/infer.py", | |||||
| } | } | ||||
| ) | ) | ||||
| @@ -883,6 +883,10 @@ func (s *ShuguangAi) GetInferDeployInstance(ctx context.Context, id string) (*in | |||||
| return nil, err | return nil, err | ||||
| } | } | ||||
| if resp.Data == nil { | |||||
| return nil, errors.New("GetInferDeployInstance empty") | |||||
| } | |||||
| var url string | var url string | ||||
| if resp.Data.Status == constants.Running { | if resp.Data.Status == constants.Running { | ||||
| url = resp.Data.ContainerPortInfoList[0].AccessUrl | url = resp.Data.ContainerPortInfoList[0].AccessUrl | ||||
| @@ -10,7 +10,6 @@ import ( | |||||
| "time" | "time" | ||||
| "github.com/zeromicro/go-zero/core/stores/builder" | "github.com/zeromicro/go-zero/core/stores/builder" | ||||
| "github.com/zeromicro/go-zero/core/stores/sqlc" | |||||
| "github.com/zeromicro/go-zero/core/stores/sqlx" | "github.com/zeromicro/go-zero/core/stores/sqlx" | ||||
| "github.com/zeromicro/go-zero/core/stringx" | "github.com/zeromicro/go-zero/core/stringx" | ||||
| ) | ) | ||||
| @@ -36,26 +35,34 @@ type ( | |||||
| } | } | ||||
| TaskAi struct { | TaskAi struct { | ||||
| Id int64 `db:"id"` // id | |||||
| TaskId int64 `db:"task_id"` // 任务id | |||||
| AdapterId int64 `db:"adapter_id"` // 适配器id | |||||
| AdapterName string `db:"adapter_name"` // 适配器名称 | |||||
| ClusterId int64 `db:"cluster_id"` // 集群id | |||||
| ClusterName string `db:"cluster_name"` // 集群名称 | |||||
| Name string `db:"name"` // 任务名 | |||||
| Replica int64 `db:"replica"` // 执行数 | |||||
| JobId string `db:"job_id"` // 集群返回任务id | |||||
| Strategy string `db:"strategy"` // 主任务使用策略 | |||||
| Status string `db:"status"` // 任务状态 | |||||
| Msg string `db:"msg"` // 集群返回任务信息 | |||||
| CommitTime time.Time `db:"commit_time"` // 提交时间 | |||||
| StartTime string `db:"start_time"` // 开始时间 | |||||
| EndTime string `db:"end_time"` // 结束时间 | |||||
| TaskType string `db:"task_type"` | |||||
| DeletedAt *time.Time `db:"deleted_at"` | |||||
| Card string `db:"card"` | |||||
| InferUrl string `db:"infer_url"` | |||||
| ModelName string `db:"model_name"` | |||||
| Id int64 `db:"id"` // id | |||||
| TaskId int64 `db:"task_id"` // 任务id | |||||
| AdapterId int64 `db:"adapter_id"` // 适配器id | |||||
| AdapterName string `db:"adapter_name"` // 适配器名称 | |||||
| ClusterId int64 `db:"cluster_id"` // 集群id | |||||
| ClusterName string `db:"cluster_name"` // 集群名称 | |||||
| Name string `db:"name"` // 任务名 | |||||
| Replica int64 `db:"replica"` // 执行数 | |||||
| JobId string `db:"job_id"` // 集群返回任务id | |||||
| Strategy string `db:"strategy"` // 主任务使用策略 | |||||
| Status string `db:"status"` // 任务状态 | |||||
| Msg string `db:"msg"` // 集群返回任务信息 | |||||
| CommitTime time.Time `db:"commit_time"` // 提交时间 | |||||
| StartTime string `db:"start_time"` // 开始时间 | |||||
| EndTime string `db:"end_time"` // 结束时间 | |||||
| TaskType string `db:"task_type"` | |||||
| DeletedAt time.Time `db:"deleted_at"` | |||||
| Card string `db:"card"` | |||||
| Remark string `db:"remark"` // 备注 | |||||
| InferUrl string `db:"infer_url"` | |||||
| ModelName string `db:"model_name"` | |||||
| AlgorithmId string `db:"algorithm_id"` // 算法id | |||||
| ImageId string `db:"image_id"` // 镜像id | |||||
| Command string `db:"command"` // 启动命令 | |||||
| Environments string `db:"environments"` // 训练作业的环境变量 | |||||
| Parameters string `db:"parameters"` // 训练作业的运行参数 | |||||
| FlavorId string `db:"flavor_id"` // 规格id | |||||
| MetadataName string `db:"metadata_name"` // 训练作业名称 | |||||
| } | } | ||||
| ) | ) | ||||
| @@ -66,13 +73,6 @@ func newTaskAiModel(conn sqlx.SqlConn) *defaultTaskAiModel { | |||||
| } | } | ||||
| } | } | ||||
| func (m *defaultTaskAiModel) withSession(session sqlx.Session) *defaultTaskAiModel { | |||||
| return &defaultTaskAiModel{ | |||||
| conn: sqlx.NewSqlConnFromSession(session), | |||||
| table: "`task_ai`", | |||||
| } | |||||
| } | |||||
| func (m *defaultTaskAiModel) Delete(ctx context.Context, id int64) error { | func (m *defaultTaskAiModel) Delete(ctx context.Context, id int64) error { | ||||
| query := fmt.Sprintf("delete from %s where `id` = ?", m.table) | query := fmt.Sprintf("delete from %s where `id` = ?", m.table) | ||||
| _, err := m.conn.ExecCtx(ctx, query, id) | _, err := m.conn.ExecCtx(ctx, query, id) | ||||
| @@ -86,7 +86,7 @@ func (m *defaultTaskAiModel) FindOne(ctx context.Context, id int64) (*TaskAi, er | |||||
| switch err { | switch err { | ||||
| case nil: | case nil: | ||||
| return &resp, nil | return &resp, nil | ||||
| case sqlc.ErrNotFound: | |||||
| case sqlx.ErrNotFound: | |||||
| return nil, ErrNotFound | return nil, ErrNotFound | ||||
| default: | default: | ||||
| return nil, err | return nil, err | ||||
| @@ -94,14 +94,14 @@ func (m *defaultTaskAiModel) FindOne(ctx context.Context, id int64) (*TaskAi, er | |||||
| } | } | ||||
| func (m *defaultTaskAiModel) Insert(ctx context.Context, data *TaskAi) (sql.Result, error) { | func (m *defaultTaskAiModel) Insert(ctx context.Context, data *TaskAi) (sql.Result, error) { | ||||
| query := fmt.Sprintf("insert into %s (%s) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", m.table, taskAiRowsExpectAutoSet) | |||||
| ret, err := m.conn.ExecCtx(ctx, query, data.TaskId, data.AdapterId, data.AdapterName, data.ClusterId, data.ClusterName, data.Name, data.Replica, data.JobId, data.Strategy, data.Status, data.Msg, data.CommitTime, data.StartTime, data.EndTime, data.TaskType, data.DeletedAt, data.Card) | |||||
| query := fmt.Sprintf("insert into %s (%s) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", m.table, taskAiRowsExpectAutoSet) | |||||
| ret, err := m.conn.ExecCtx(ctx, query, data.TaskId, data.AdapterId, data.AdapterName, data.ClusterId, data.ClusterName, data.Name, data.Replica, data.JobId, data.Strategy, data.Status, data.Msg, data.CommitTime, data.StartTime, data.EndTime, data.TaskType, data.DeletedAt, data.Card, data.Remark, data.InferUrl, data.ModelName, data.AlgorithmId, data.ImageId, data.Command, data.Environments, data.Parameters, data.FlavorId, data.MetadataName) | |||||
| return ret, err | return ret, err | ||||
| } | } | ||||
| func (m *defaultTaskAiModel) Update(ctx context.Context, data *TaskAi) error { | func (m *defaultTaskAiModel) Update(ctx context.Context, data *TaskAi) error { | ||||
| query := fmt.Sprintf("update %s set %s where `id` = ?", m.table, taskAiRowsWithPlaceHolder) | query := fmt.Sprintf("update %s set %s where `id` = ?", m.table, taskAiRowsWithPlaceHolder) | ||||
| _, err := m.conn.ExecCtx(ctx, query, data.TaskId, data.AdapterId, data.AdapterName, data.ClusterId, data.ClusterName, data.Name, data.Replica, data.JobId, data.Strategy, data.Status, data.Msg, data.CommitTime, data.StartTime, data.EndTime, data.TaskType, data.DeletedAt, data.Card, data.Id) | |||||
| _, err := m.conn.ExecCtx(ctx, query, data.TaskId, data.AdapterId, data.AdapterName, data.ClusterId, data.ClusterName, data.Name, data.Replica, data.JobId, data.Strategy, data.Status, data.Msg, data.CommitTime, data.StartTime, data.EndTime, data.TaskType, data.DeletedAt, data.Card, data.Remark, data.InferUrl, data.ModelName, data.AlgorithmId, data.ImageId, data.Command, data.Environments, data.Parameters, data.FlavorId, data.MetadataName, data.Id) | |||||
| return err | return err | ||||
| } | } | ||||