| @@ -0,0 +1,31 @@ | |||||
| HELP.md | |||||
| target/ | |||||
| !.mvn/wrapper/maven-wrapper.jar | |||||
| !**/src/main/** | |||||
| !**/src/test/** | |||||
| ### STS ### | |||||
| .apt_generated | |||||
| .classpath | |||||
| .factorypath | |||||
| .project | |||||
| .settings | |||||
| .springBeans | |||||
| .sts4-cache | |||||
| ### IntelliJ IDEA ### | |||||
| .idea | |||||
| *.iws | |||||
| *.iml | |||||
| *.ipr | |||||
| ### NetBeans ### | |||||
| /nbproject/private/ | |||||
| /nbbuild/ | |||||
| /dist/ | |||||
| /nbdist/ | |||||
| /.nb-gradle/ | |||||
| build/ | |||||
| ### VS Code ### | |||||
| .vscode/ | |||||
| @@ -23,4 +23,4 @@ mvn clean compile package | |||||
| ``` | ``` | ||||
| ### 部署 | ### 部署 | ||||
| 部署过程参看文档:[部署 分布式训练operator](http://tianshu.org.cn/?/course/1.html) | |||||
| 部署过程参看文档:[部署 分布式训练operator](http://docs.dubhe.ai/docs/setup/deploy-distribute-train-operator) | |||||
| @@ -48,18 +48,22 @@ spec: | |||||
| value: 6 | value: 6 | ||||
| - name: NCCL_DEBUG | - name: NCCL_DEBUG | ||||
| value: INFO | value: INFO | ||||
| datasetStorage: | |||||
| name: pvc-dataset | |||||
| nfs: | |||||
| path: {{DATASET}} | |||||
| server: {{NFS}} | |||||
| workspaceStorage: | |||||
| name: pvc-workspace | |||||
| nfs: | |||||
| path: /nfs/resnet50/workspace | |||||
| server: {{WORKSPACE}} | |||||
| modelStorage: | |||||
| name: pvc-model | |||||
| nfs: | |||||
| path: /nfs/resnet50/model | |||||
| server: {{MODEL}} | |||||
| volumeMounts: | |||||
| - mountPath: /dataset | |||||
| name: volume-0 | |||||
| - mountPath: /workspace | |||||
| name: volume-1 | |||||
| volumes: | |||||
| - name: volume-0 | |||||
| nfs: | |||||
| path: /nfs/dubhe-prod/dataset/5/versionFile/V0001/ofrecord/train | |||||
| server: {{NFS IP}} | |||||
| - name: volume-1 | |||||
| nfs: | |||||
| path: /nfs/dubhe-prod/train-manage/1/train-1-20200825173815-v0020 | |||||
| server: {{NFS IP}} | |||||
| tolerations: | |||||
| - key: "platform/node-isolate" | |||||
| operator: "Equal" | |||||
| value: "prod-isolate-1" | |||||
| effect: "NoSchedule" | |||||
| @@ -45,12 +45,12 @@ spec: | |||||
| type: object | type: object | ||||
| initContainer: | initContainer: | ||||
| type: object | type: object | ||||
| datasetStorage: | |||||
| type: object | |||||
| workspaceStorage: | |||||
| type: object | |||||
| modelStorage: | |||||
| type: object | |||||
| volumeMounts: | |||||
| type: array | |||||
| volumes: | |||||
| type: array | |||||
| tolerations: | |||||
| type: array | |||||
| required: | required: | ||||
| - image | - image | ||||
| - imagePullPolicy | - imagePullPolicy | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -184,11 +184,11 @@ public class DistributeTrainOperatorManager { | |||||
| .addToProperties("slaveResources", objectType) | .addToProperties("slaveResources", objectType) | ||||
| .addToProperties("nodeSelector", objectType) | .addToProperties("nodeSelector", objectType) | ||||
| .addToProperties("initContainer", objectType) | .addToProperties("initContainer", objectType) | ||||
| .addToProperties("datasetStorage", objectType) | |||||
| .addToProperties("workspaceStorage", objectType) | |||||
| .addToProperties("modelStorage", objectType) | |||||
| .addToProperties("volumeMounts", arrayType) | |||||
| .addToProperties("volumes", arrayType) | |||||
| .addToProperties("tolerations", arrayType) | |||||
| .withType("object") | .withType("object") | ||||
| .addToRequired("image", "imagePullPolicy", "size", "masterCmd", "slaveCmd", "workspaceStorage") | |||||
| .addToRequired("image", "imagePullPolicy", "size", "masterCmd", "slaveCmd") | |||||
| .build(); | .build(); | ||||
| properties.put("apiVersion", stringType); | properties.put("apiVersion", stringType); | ||||
| properties.put("kind", stringType); | properties.put("kind", stringType); | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -123,29 +123,27 @@ public class ChildResourceCreateInfo extends AbstractResourceCreateInfo { | |||||
| private Container initContainer; | private Container initContainer; | ||||
| /** | /** | ||||
| * 工作目录挂载 | |||||
| * 环境变量 | |||||
| */ | */ | ||||
| private Volume workspaceVolume; | |||||
| private List<EnvVar> env; | |||||
| /** | /** | ||||
| * 数据集目录挂载 | |||||
| * 拥有者信息 | |||||
| */ | */ | ||||
| private Volume datasetVolume; | |||||
| private OwnerReference ownerReference; | |||||
| /** | /** | ||||
| * 模型目录挂载 | |||||
| * 内部映射 | |||||
| */ | */ | ||||
| private Volume modelVolume; | |||||
| private List<VolumeMount> volumeMounts; | |||||
| /** | /** | ||||
| * 环境变量 | |||||
| * 外部挂载 | |||||
| */ | */ | ||||
| private List<EnvVar> env; | |||||
| private List<Volume> volumes; | |||||
| /** | /** | ||||
| * 拥有者信息 | |||||
| * 容忍度 | |||||
| */ | */ | ||||
| private OwnerReference ownerReference; | |||||
| private List<Toleration> tolerations; | |||||
| /** | /** | ||||
| * 将分布式训练转换为K8S的资源信息 | * 将分布式训练转换为K8S的资源信息 | ||||
| @@ -173,12 +171,14 @@ public class ChildResourceCreateInfo extends AbstractResourceCreateInfo { | |||||
| info.setMasterCmd(distributeTrain.getSpec().getMasterCmd()) | info.setMasterCmd(distributeTrain.getSpec().getMasterCmd()) | ||||
| .setSlaveCmd(distributeTrain.getSpec().getSlaveCmd()); | .setSlaveCmd(distributeTrain.getSpec().getSlaveCmd()); | ||||
| //挂载 | //挂载 | ||||
| Optional.ofNullable(distributeTrain.getSpec().getWorkspaceStorage()) | |||||
| .ifPresent(v -> info.setWorkspaceVolume(v)); | |||||
| Optional.ofNullable(distributeTrain.getSpec().getDatasetStorage()) | |||||
| .ifPresent(v -> info.setDatasetVolume(v)); | |||||
| Optional.ofNullable(distributeTrain.getSpec().getModelStorage()) | |||||
| .ifPresent(v -> info.setModelVolume(v)); | |||||
| Optional.ofNullable(distributeTrain.getSpec().getVolumeMounts()) | |||||
| .ifPresent(v -> info.setVolumeMounts(v)); | |||||
| Optional.ofNullable(distributeTrain.getSpec().getVolumes()) | |||||
| .ifPresent(v -> info.setVolumes(v)); | |||||
| //容忍度 | |||||
| Optional.ofNullable(distributeTrain.getSpec().getTolerations()) | |||||
| .ifPresent(v -> info.setTolerations(v)); | |||||
| //主从两组资源限制 | //主从两组资源限制 | ||||
| Optional.ofNullable(distributeTrain.getSpec().getMasterResources()) | Optional.ofNullable(distributeTrain.getSpec().getMasterResources()) | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -33,6 +33,7 @@ import io.fabric8.kubernetes.api.model.batch.JobBuilder; | |||||
| import org.onebrain.operator.action.deployer.ChildResourceCreateInfo; | import org.onebrain.operator.action.deployer.ChildResourceCreateInfo; | ||||
| import org.onebrain.operator.action.deployer.JobDeployer; | import org.onebrain.operator.action.deployer.JobDeployer; | ||||
| import org.onebrain.operator.constants.KubeConstants; | import org.onebrain.operator.constants.KubeConstants; | ||||
| import org.springframework.util.CollectionUtils; | |||||
| import java.util.*; | import java.util.*; | ||||
| @@ -46,13 +47,7 @@ import static org.onebrain.operator.constants.NumberConstant.NUMBER_22; | |||||
| */ | */ | ||||
| public class BaseJobDeployer implements JobDeployer<ChildResourceCreateInfo> { | public class BaseJobDeployer implements JobDeployer<ChildResourceCreateInfo> { | ||||
| public static final String PVC_WORKSPACE = "pvc-workspace"; | |||||
| public static final String SSH = "ssh"; | public static final String SSH = "ssh"; | ||||
| public static final String WORKSPACE = "/workspace"; | |||||
| public static final String PVC_DATASET = "pvc-dataset"; | |||||
| public static final String DATASET = "/dataset"; | |||||
| public static final String PVC_MODEL = "pvc-model"; | |||||
| public static final String MODEL = "/model"; | |||||
| public static final String MEMORY = "Memory"; | public static final String MEMORY = "Memory"; | ||||
| public static final String DEV_SHM = "/dev/shm"; | public static final String DEV_SHM = "/dev/shm"; | ||||
| public static final String BIN_BASH = "/bin/bash"; | public static final String BIN_BASH = "/bin/bash"; | ||||
| @@ -74,6 +69,11 @@ public class BaseJobDeployer implements JobDeployer<ChildResourceCreateInfo> { | |||||
| //挂载 | //挂载 | ||||
| List<VolumeMount> volumeMounts = buildVolumeMounts(volumes); | List<VolumeMount> volumeMounts = buildVolumeMounts(volumes); | ||||
| if (!CollectionUtils.isEmpty(info.getVolumes()) && !CollectionUtils.isEmpty(info.getVolumeMounts())){ | |||||
| volumes.addAll(info.getVolumes()); | |||||
| volumeMounts.addAll(info.getVolumeMounts()); | |||||
| } | |||||
| container.setVolumeMounts(volumeMounts); | container.setVolumeMounts(volumeMounts); | ||||
| //启动命令 | //启动命令 | ||||
| @@ -123,6 +123,7 @@ public class BaseJobDeployer implements JobDeployer<ChildResourceCreateInfo> { | |||||
| .addToContainers(container) | .addToContainers(container) | ||||
| .addToVolumes(volumes.toArray(new Volume[volumes.size()])) | .addToVolumes(volumes.toArray(new Volume[volumes.size()])) | ||||
| .withRestartPolicy(RESTART_POLICY_NEVER) | .withRestartPolicy(RESTART_POLICY_NEVER) | ||||
| .withTolerations(info.getTolerations()) | |||||
| .endSpec() | .endSpec() | ||||
| .endTemplate() | .endTemplate() | ||||
| .endSpec(); | .endSpec(); | ||||
| @@ -192,9 +193,6 @@ public class BaseJobDeployer implements JobDeployer<ChildResourceCreateInfo> { | |||||
| private List<Volume> buildVolumes(ChildResourceCreateInfo info){ | private List<Volume> buildVolumes(ChildResourceCreateInfo info){ | ||||
| //存储卷 | //存储卷 | ||||
| List<Volume> volumes = new LinkedList<>(); | List<Volume> volumes = new LinkedList<>(); | ||||
| Optional.ofNullable(info.getWorkspaceVolume()).ifPresent(v-> volumes.add(v)); | |||||
| Optional.ofNullable(info.getDatasetVolume()).ifPresent(v-> volumes.add(v)); | |||||
| Optional.ofNullable(info.getModelVolume()).ifPresent(v-> volumes.add(v)); | |||||
| //shm默认就有 | //shm默认就有 | ||||
| volumes.add(new VolumeBuilder() | volumes.add(new VolumeBuilder() | ||||
| .withName(KubeConstants.VOLUME_SHM) | .withName(KubeConstants.VOLUME_SHM) | ||||
| @@ -213,30 +211,6 @@ public class BaseJobDeployer implements JobDeployer<ChildResourceCreateInfo> { | |||||
| */ | */ | ||||
| private List<VolumeMount> buildVolumeMounts(List<Volume> volumes) { | private List<VolumeMount> buildVolumeMounts(List<Volume> volumes) { | ||||
| List<VolumeMount> volumeMounts = new LinkedList<>(); | List<VolumeMount> volumeMounts = new LinkedList<>(); | ||||
| for (Volume volume : volumes) { | |||||
| if(PVC_WORKSPACE.equals(volume.getName())){ | |||||
| volumeMounts.add(new VolumeMountBuilder() | |||||
| .withName(volume.getName()) | |||||
| .withMountPath(WORKSPACE) | |||||
| .build()); | |||||
| continue; | |||||
| } | |||||
| if(PVC_DATASET.equals(volume.getName())){ | |||||
| volumeMounts.add(new VolumeMountBuilder() | |||||
| .withName(volume.getName()) | |||||
| .withMountPath(DATASET) | |||||
| .build()); | |||||
| continue; | |||||
| } | |||||
| if(PVC_MODEL.equals(volume.getName())){ | |||||
| volumeMounts.add(new VolumeMountBuilder() | |||||
| .withName(volume.getName()) | |||||
| .withMountPath(MODEL) | |||||
| .build()); | |||||
| continue; | |||||
| } | |||||
| } | |||||
| volumeMounts.add(new VolumeMountBuilder() | volumeMounts.add(new VolumeMountBuilder() | ||||
| .withName(KubeConstants.VOLUME_SHM) | .withName(KubeConstants.VOLUME_SHM) | ||||
| .withMountPath(DEV_SHM) | .withMountPath(DEV_SHM) | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -35,6 +35,7 @@ import io.fabric8.kubernetes.api.model.apps.StatefulSetBuilder; | |||||
| import org.onebrain.operator.action.deployer.ChildResourceCreateInfo; | import org.onebrain.operator.action.deployer.ChildResourceCreateInfo; | ||||
| import org.onebrain.operator.action.deployer.StatefulSetDeployer; | import org.onebrain.operator.action.deployer.StatefulSetDeployer; | ||||
| import org.onebrain.operator.constants.KubeConstants; | import org.onebrain.operator.constants.KubeConstants; | ||||
| import org.springframework.util.CollectionUtils; | |||||
| import java.util.Arrays; | import java.util.Arrays; | ||||
| import java.util.Collections; | import java.util.Collections; | ||||
| @@ -55,12 +56,6 @@ import static org.onebrain.operator.constants.NumberConstant.NUMBER_22; | |||||
| public class BaseStatefulSetDeployer implements StatefulSetDeployer<ChildResourceCreateInfo> { | public class BaseStatefulSetDeployer implements StatefulSetDeployer<ChildResourceCreateInfo> { | ||||
| public static final String SSH = "ssh"; | public static final String SSH = "ssh"; | ||||
| public static final String PVC_WORKSPACE = "pvc-workspace"; | |||||
| public static final String WORKSPACE = "/workspace"; | |||||
| public static final String PVC_DATASET = "pvc-dataset"; | |||||
| public static final String DATASET = "/dataset"; | |||||
| public static final String PVC_MODEL = "pvc-model"; | |||||
| public static final String MODEL = "/model"; | |||||
| public static final String MEMORY = "Memory"; | public static final String MEMORY = "Memory"; | ||||
| public static final String DEV_SHM = "/dev/shm"; | public static final String DEV_SHM = "/dev/shm"; | ||||
| public static final String BIN_BASH = "/bin/bash"; | public static final String BIN_BASH = "/bin/bash"; | ||||
| @@ -83,6 +78,11 @@ public class BaseStatefulSetDeployer implements StatefulSetDeployer<ChildResourc | |||||
| //挂载 | //挂载 | ||||
| List<VolumeMount> volumeMounts = buildVolumeMounts(volumes); | List<VolumeMount> volumeMounts = buildVolumeMounts(volumes); | ||||
| if (!CollectionUtils.isEmpty(info.getVolumes()) && !CollectionUtils.isEmpty(info.getVolumeMounts())){ | |||||
| volumes.addAll(info.getVolumes()); | |||||
| volumeMounts.addAll(info.getVolumeMounts()); | |||||
| } | |||||
| container.setVolumeMounts(volumeMounts); | container.setVolumeMounts(volumeMounts); | ||||
| //启动命令 | //启动命令 | ||||
| @@ -126,6 +126,7 @@ public class BaseStatefulSetDeployer implements StatefulSetDeployer<ChildResourc | |||||
| .withTerminationGracePeriodSeconds(LONG_NUMBER_60) | .withTerminationGracePeriodSeconds(LONG_NUMBER_60) | ||||
| .addToContainers(container) | .addToContainers(container) | ||||
| .addToVolumes(volumes.toArray(new Volume[0])) | .addToVolumes(volumes.toArray(new Volume[0])) | ||||
| .withTolerations(info.getTolerations()) | |||||
| .endSpec() | .endSpec() | ||||
| .endTemplate() | .endTemplate() | ||||
| .endSpec(); | .endSpec(); | ||||
| @@ -191,9 +192,6 @@ public class BaseStatefulSetDeployer implements StatefulSetDeployer<ChildResourc | |||||
| */ | */ | ||||
| private List<Volume> buildVolumes(ChildResourceCreateInfo info) { | private List<Volume> buildVolumes(ChildResourceCreateInfo info) { | ||||
| List<Volume> volumes = new LinkedList<>(); | List<Volume> volumes = new LinkedList<>(); | ||||
| Optional.ofNullable(info.getWorkspaceVolume()).ifPresent(v-> volumes.add(v)); | |||||
| Optional.ofNullable(info.getDatasetVolume()).ifPresent(v-> volumes.add(v)); | |||||
| Optional.ofNullable(info.getModelVolume()).ifPresent(v-> volumes.add(v)); | |||||
| //shm默认就有 | //shm默认就有 | ||||
| volumes.add(new VolumeBuilder() | volumes.add(new VolumeBuilder() | ||||
| @@ -213,30 +211,6 @@ public class BaseStatefulSetDeployer implements StatefulSetDeployer<ChildResourc | |||||
| */ | */ | ||||
| private List<VolumeMount> buildVolumeMounts(List<Volume> volumes) { | private List<VolumeMount> buildVolumeMounts(List<Volume> volumes) { | ||||
| List<VolumeMount> volumeMounts=new LinkedList<>(); | List<VolumeMount> volumeMounts=new LinkedList<>(); | ||||
| for (Volume volume : volumes) { | |||||
| if(PVC_WORKSPACE.equals(volume.getName())){ | |||||
| volumeMounts.add(new VolumeMountBuilder() | |||||
| .withName(volume.getName()) | |||||
| .withMountPath(WORKSPACE) | |||||
| .build()); | |||||
| continue; | |||||
| } | |||||
| if(PVC_DATASET.equals(volume.getName())){ | |||||
| volumeMounts.add(new VolumeMountBuilder() | |||||
| .withName(volume.getName()) | |||||
| .withMountPath(DATASET) | |||||
| .build()); | |||||
| continue; | |||||
| } | |||||
| if(PVC_MODEL.equals(volume.getName())){ | |||||
| volumeMounts.add(new VolumeMountBuilder() | |||||
| .withName(volume.getName()) | |||||
| .withMountPath(MODEL) | |||||
| .build()); | |||||
| continue; | |||||
| } | |||||
| } | |||||
| volumeMounts.add(new VolumeMountBuilder() | volumeMounts.add(new VolumeMountBuilder() | ||||
| .withName(KubeConstants.VOLUME_SHM) | .withName(KubeConstants.VOLUME_SHM) | ||||
| .withMountPath(DEV_SHM) | .withMountPath(DEV_SHM) | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -143,7 +143,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { | |||||
| * @param distributeTrain | * @param distributeTrain | ||||
| */ | */ | ||||
| public void doAction(DistributeTrain distributeTrain) { | public void doAction(DistributeTrain distributeTrain) { | ||||
| log.info("doAction=>distributeTrain : 【{}】", distributeTrain); | |||||
| log.info("doAction=>distributeTrain : 【{}】", distributeTrain.getMetadata().getName()); | |||||
| ChildResourceCreateInfo info = null; | ChildResourceCreateInfo info = null; | ||||
| try { | try { | ||||
| //redis重复检查 | //redis重复检查 | ||||
| @@ -200,7 +200,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { | |||||
| */ | */ | ||||
| @Override | @Override | ||||
| public void handlerAction(DistributeTrain distributeTrain) { | public void handlerAction(DistributeTrain distributeTrain) { | ||||
| log.info("handlerAction=>distributeTrain : 【{}】", distributeTrain); | |||||
| log.info("handlerAction=>distributeTrain : 【{}】", distributeTrain.getMetadata().getName()); | |||||
| HandlerActionTask handlerActionTask = new HandlerActionTask(distributeTrain); | HandlerActionTask handlerActionTask = new HandlerActionTask(distributeTrain); | ||||
| pool.getActiveCount(); | pool.getActiveCount(); | ||||
| pool.execute(handlerActionTask); | pool.execute(handlerActionTask); | ||||
| @@ -211,7 +211,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { | |||||
| * @param distributeTrain 分布式训练 | * @param distributeTrain 分布式训练 | ||||
| */ | */ | ||||
| private void validateParams(DistributeTrain distributeTrain) { | private void validateParams(DistributeTrain distributeTrain) { | ||||
| log.info("validateParams=>distributeTrain : 【{}】", distributeTrain); | |||||
| log.info("validateParams=>distributeTrain : 【{}】", distributeTrain.getMetadata().getName()); | |||||
| Integer size = distributeTrain.getSpec().getSize(); | Integer size = distributeTrain.getSpec().getSize(); | ||||
| if (size < NUMBER_2) { | if (size < NUMBER_2) { | ||||
| throw new OperatorException("size must be greater than 1"); | throw new OperatorException("size must be greater than 1"); | ||||
| @@ -254,7 +254,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { | |||||
| * @param info 资源信息 | * @param info 资源信息 | ||||
| */ | */ | ||||
| private void createStatefulSet(ChildResourceCreateInfo info) { | private void createStatefulSet(ChildResourceCreateInfo info) { | ||||
| log.info("createStatefulSet=>childResourceCreateInfo : 【{}】", info); | |||||
| log.info("createStatefulSet=>childResourceCreateInfo : 【{}】", info.getParentName()); | |||||
| StatefulSet statefulSet = client.apps().statefulSets() | StatefulSet statefulSet = client.apps().statefulSets() | ||||
| .inNamespace(info.getNamespace()) | .inNamespace(info.getNamespace()) | ||||
| .withName(info.getStatefulSetName()).get(); | .withName(info.getStatefulSetName()).get(); | ||||
| @@ -298,7 +298,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { | |||||
| * @param info Job信息 | * @param info Job信息 | ||||
| */ | */ | ||||
| private void createJob(ChildResourceCreateInfo info) { | private void createJob(ChildResourceCreateInfo info) { | ||||
| log.info("createJob=>childResourceCreateInfo : 【{}】", info); | |||||
| log.info("createJob=>childResourceCreateInfo : 【{}】", info.getParentName()); | |||||
| Job job = client.batch().jobs() | Job job = client.batch().jobs() | ||||
| .inNamespace(info.getNamespace()) | .inNamespace(info.getNamespace()) | ||||
| .withName(info.getJobName()).get(); | .withName(info.getJobName()).get(); | ||||
| @@ -311,7 +311,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { | |||||
| JobDeployer deployer = new BaseJobDeployer(); | JobDeployer deployer = new BaseJobDeployer(); | ||||
| JobBuilder builder = deployer.deploy(info); | JobBuilder builder = deployer.deploy(info); | ||||
| job = builder.build(); | job = builder.build(); | ||||
| log.info("job is : 【{}】", job); | |||||
| log.info("job is : 【{}】", job.getMetadata().getName()); | |||||
| client.batch().jobs().create(job); | client.batch().jobs().create(job); | ||||
| log.info("create job【{}】 successfully", job.getMetadata().getName()); | log.info("create job【{}】 successfully", job.getMetadata().getName()); | ||||
| } | } | ||||
| @@ -394,7 +394,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { | |||||
| * @param slavePods | * @param slavePods | ||||
| */ | */ | ||||
| private void collectChildPodInfo(ChildResourceCreateInfo info, Pod masterPod, List<Pod> slavePods) { | private void collectChildPodInfo(ChildResourceCreateInfo info, Pod masterPod, List<Pod> slavePods) { | ||||
| log.info("collectChildPodInfo=>childResourceCreateInfo : 【{}】, masterPod : 【{}】, slavePods : 【{}】", info, masterPod, slavePods); | |||||
| log.info("collectChildPodInfo=>childResourceCreateInfo : 【{}】, masterPod : 【{}】", info.getParentName(), masterPod.getMetadata().getName()); | |||||
| String key = info.getOwnerReference().getUid(); | String key = info.getOwnerReference().getUid(); | ||||
| if (dtMap.containsKey(key)) { | if (dtMap.containsKey(key)) { | ||||
| dtMap.remove(key); | dtMap.remove(key); | ||||
| @@ -527,7 +527,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { | |||||
| * @param info | * @param info | ||||
| */ | */ | ||||
| private void recycleCr(ChildResourceCreateInfo info) { | private void recycleCr(ChildResourceCreateInfo info) { | ||||
| log.info("recycleCr=>childResourceCreateInfo : 【{}】", info); | |||||
| log.info("recycleCr=>childResourceCreateInfo : 【{}】", info.getParentName()); | |||||
| Optional.ofNullable(DistributeTrainClientHolder.getClient()) | Optional.ofNullable(DistributeTrainClientHolder.getClient()) | ||||
| .ifPresent(distributeTrainClient -> { | .ifPresent(distributeTrainClient -> { | ||||
| ObjectMeta metadata = new ObjectMeta(); | ObjectMeta metadata = new ObjectMeta(); | ||||
| @@ -542,7 +542,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { | |||||
| /**更新状态*/ | /**更新状态*/ | ||||
| private void updateStatus(ChildResourceCreateInfo info, DistributeTrain distributeTrain) { | private void updateStatus(ChildResourceCreateInfo info, DistributeTrain distributeTrain) { | ||||
| log.info("updateStatus=>childResourceCreateInfo : 【{}】, distributeTrain : 【{}】", info, distributeTrain); | |||||
| log.info("updateStatus=>childResourceCreateInfo : 【{}】, distributeTrain : 【{}】", info.getParentName(), distributeTrain.getMetadata().getName()); | |||||
| if (distributeTrain.getStatus() == null) { | if (distributeTrain.getStatus() == null) { | ||||
| distributeTrain.setStatus(new DistributeTrainStatus()); | distributeTrain.setStatus(new DistributeTrainStatus()); | ||||
| } | } | ||||
| @@ -568,7 +568,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { | |||||
| * @return List<Pod> 分布式相关Pod集合 | * @return List<Pod> 分布式相关Pod集合 | ||||
| */ | */ | ||||
| private List<Pod> getPods(ChildResourceCreateInfo info) { | private List<Pod> getPods(ChildResourceCreateInfo info) { | ||||
| log.info("getPods=>childResourceCreateInfo : 【{}】", info); | |||||
| log.info("getPods=>childResourceCreateInfo : 【{}】", info.getParentName()); | |||||
| List<Pod> pods = Lists.newArrayList(); | List<Pod> pods = Lists.newArrayList(); | ||||
| pods.add(getMasterPod(info)); | pods.add(getMasterPod(info)); | ||||
| pods.addAll(getSlavePods(info)); | pods.addAll(getSlavePods(info)); | ||||
| @@ -584,7 +584,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { | |||||
| * @return Pod Master节点对应的Pod | * @return Pod Master节点对应的Pod | ||||
| */ | */ | ||||
| private Pod getMasterPod(ChildResourceCreateInfo info) { | private Pod getMasterPod(ChildResourceCreateInfo info) { | ||||
| log.info("getMasterPod=>childResourceCreateInfo : 【{}】", info); | |||||
| log.info("getMasterPod=>childResourceCreateInfo : 【{}】", info.getParentName()); | |||||
| List<Pod> masterPods = client.pods().inNamespace(info.getNamespace()) | List<Pod> masterPods = client.pods().inNamespace(info.getNamespace()) | ||||
| .withLabel(JOB_LABEL, info.getJobName()) | .withLabel(JOB_LABEL, info.getJobName()) | ||||
| .list().getItems(); | .list().getItems(); | ||||
| @@ -600,7 +600,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { | |||||
| * @return List<Pod> Slave节点对应的Pod集合 | * @return List<Pod> Slave节点对应的Pod集合 | ||||
| */ | */ | ||||
| private List<Pod> getSlavePods(ChildResourceCreateInfo info) { | private List<Pod> getSlavePods(ChildResourceCreateInfo info) { | ||||
| log.info("getSlavePods=>childResourceCreateInfo : 【{}】", info); | |||||
| log.info("getSlavePods=>childResourceCreateInfo : 【{}】", info.getParentName()); | |||||
| //取得从的所有pod | //取得从的所有pod | ||||
| List<Pod> slavePods = client.pods().inNamespace(info.getNamespace()) | List<Pod> slavePods = client.pods().inNamespace(info.getNamespace()) | ||||
| .withLabel(STATEFULSET_LABEL, info.getStatefulSetName()) | .withLabel(STATEFULSET_LABEL, info.getStatefulSetName()) | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -105,4 +105,18 @@ public class DistributeTrainSpec implements KubernetesResource { | |||||
| */ | */ | ||||
| private Volume modelStorage; | private Volume modelStorage; | ||||
| /** | |||||
| * 内部映射 | |||||
| */ | |||||
| private List<VolumeMount> volumeMounts; | |||||
| /** | |||||
| * 外部挂载 | |||||
| */ | |||||
| private List<Volume> volumes; | |||||
| /** | |||||
| * 容忍度 | |||||
| */ | |||||
| private List<Toleration> tolerations; | |||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -58,7 +58,7 @@ public class JobHandler { | |||||
| * @param job | * @param job | ||||
| */ | */ | ||||
| public void handleJob(Job job) { | public void handleJob(Job job) { | ||||
| log.info("handleJob=>job : 【{}】", job); | |||||
| log.info("handleJob=>job : 【{}】", job.getMetadata().getName()); | |||||
| //筛选出DistributeTrain下的job | //筛选出DistributeTrain下的job | ||||
| List<OwnerReference> ownerReferences = job.getMetadata().getOwnerReferences(); | List<OwnerReference> ownerReferences = job.getMetadata().getOwnerReferences(); | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -51,7 +51,7 @@ public class JobWatcher implements Watcher<Job> { | |||||
| */ | */ | ||||
| @Override | @Override | ||||
| public void eventReceived(Action action, Job job) { | public void eventReceived(Action action, Job job) { | ||||
| log.info("Job Event received: {} at {}", job.getMetadata().getUid(), job.getMetadata().getCreationTimestamp()); | |||||
| log.info("Job Event received: {} action {}", job.getMetadata().getName(), action.toString()); | |||||
| jobHandler.handleJob(job); | jobHandler.handleJob(job); | ||||
| } | } | ||||
| @@ -1,5 +1,5 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| /** | |||||
| * Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
| * | * | ||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| * you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
| @@ -1,46 +1,43 @@ | |||||
| #!/bin/bash | #!/bin/bash | ||||
| if [ ! -f "/etc/init.d/ssh" ]; then | |||||
| if [ ! -f "/etc/redhat-release" ]; then | |||||
| if [ -f "/etc/lsb-release" ]; then | |||||
| if [ ! -f "/etc/init.d/ssh" ]; then | |||||
| echo 'apt install -y openssh-server' >> pretreatment.log | echo 'apt install -y openssh-server' >> pretreatment.log | ||||
| apt update >> pretreatment.log | apt update >> pretreatment.log | ||||
| apt install -y openssh-server >> pretreatment.log | apt install -y openssh-server >> pretreatment.log | ||||
| echo '/etc/init.d/ssh start' >> pretreatment.log | |||||
| /etc/init.d/ssh start >> pretreatment.log | |||||
| fi | fi | ||||
| if [ ! -f "/etc/lsb-release" ]; then | |||||
| echo 'yum install -y sshd' >> pretreatment.log | |||||
| yum update >> pretreatment.log | |||||
| yum install -y sshd >> pretreatment.log | |||||
| fi | |||||
| fi | |||||
| echo '/etc/init.d/ssh start' >> pretreatment.log | |||||
| /etc/init.d/ssh start >> pretreatment.log | |||||
| if [ -f "/etc/redhat-release" ]; then | |||||
| if command -v nslookup >/dev/null 2>&1; then | |||||
| echo 'exists nslookup' >> pretreatment.log | |||||
| else | |||||
| echo 'yum install dnsutils jq' >> pretreatment.log | |||||
| yum install -y dnsutils >> pretreatment.log | |||||
| yum install -y jq >> pretreatment.log | |||||
| fi | |||||
| if command -v nslookup >/dev/null 2>&1; then | |||||
| echo 'exists nslookup' >> pretreatment.log | |||||
| else | |||||
| echo 'yum install dnsutils jq' >> pretreatment.log | |||||
| yum install -y dnsutils >> pretreatment.log | |||||
| yum install -y jq >> pretreatment.log | |||||
| fi | |||||
| fi | |||||
| if [ -f "/etc/lsb-release" ]; then | |||||
| if command -v jq >/dev/null 2>&1; then | |||||
| if command -v jq >/dev/null 2>&1; then | |||||
| echo 'exists jq' >> pretreatment.log | echo 'exists jq' >> pretreatment.log | ||||
| else | |||||
| else | |||||
| echo 'apt install jq' >> pretreatment.log | echo 'apt install jq' >> pretreatment.log | ||||
| apt install -y jq >> pretreatment.log | apt install -y jq >> pretreatment.log | ||||
| fi | fi | ||||
| if command -v nslookup >/dev/null 2>&1; then | |||||
| if command -v nslookup >/dev/null 2>&1; then | |||||
| echo 'exists nslookup' >> pretreatment.log | echo 'exists nslookup' >> pretreatment.log | ||||
| else | |||||
| else | |||||
| echo 'apt install dnsutils' >> pretreatment.log | echo 'apt install dnsutils' >> pretreatment.log | ||||
| apt install -y dnsutils >> pretreatment.log | apt install -y dnsutils >> pretreatment.log | ||||
| fi | fi | ||||
| fi | fi | ||||
| # RHEL/CentOS branch: ensure sshd, nslookup and jq are present; every step is appended to pretreatment.log. | |||||
| if [ -f "/etc/redhat-release" ]; then | |||||
| if [ ! -f "/usr/sbin/sshd" ]; then | |||||
| # /usr/sbin/sshd is shipped by the openssh-server package; yum has no package named "sshd". | |||||
| echo 'yum install -y openssh-server' >> pretreatment.log | |||||
| # Refresh repository metadata only (counterpart of `apt update`); a bare `yum update` asks to upgrade every package and waits for confirmation. | |||||
| yum makecache >> pretreatment.log | |||||
| yum install -y openssh-server >> pretreatment.log | |||||
| echo 'sshd start' >> pretreatment.log | |||||
| # NOTE(review): sshd is only started when it had to be installed here; confirm that pre-installed images start it elsewhere. | |||||
| service sshd start >> pretreatment.log | |||||
| fi | |||||
| # nslookup is provided by bind-utils on yum-based systems. | |||||
| if command -v nslookup >/dev/null 2>&1; then | |||||
| echo 'exists nslookup' >> pretreatment.log | |||||
| else | |||||
| echo 'yum install -y bind-utils' >> pretreatment.log | |||||
| yum install -y bind-utils >> pretreatment.log | |||||
| fi | |||||
| # Install jq only when it is not already on PATH. | |||||
| if command -v jq >/dev/null 2>&1; then | |||||
| echo 'exists jq' >> pretreatment.log | |||||
| else | |||||
| echo 'yum install -y jq' >> pretreatment.log | |||||
| yum install -y jq >> pretreatment.log | |||||
| fi | |||||
| fi | |||||
| @@ -1,43 +0,0 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator; | |||||
| import org.onebrain.operator.api.pod.PodApi; | |||||
| import org.onebrain.operator.constants.KubeConstants; | |||||
| import org.springframework.beans.factory.annotation.Autowired; | |||||
| import org.springframework.boot.test.context.SpringBootTest; | |||||
| import java.io.File; | |||||
| import java.net.URISyntaxException; | |||||
| import java.net.URL; | |||||
| @SpringBootTest | |||||
| public class DistributeTrainOperatorApplicationTests { | |||||
| @Autowired | |||||
| private PodApi podApi; | |||||
| // @Test | |||||
| public void contextLoads() throws URISyntaxException { | |||||
| final URL url = getClass().getClassLoader().getResource("key/id_rsa"); | |||||
| File file = new File(url.toURI()); | |||||
| podApi.copyToPod("default", "distribute-train-test-job-sv2dj", KubeConstants.MASTER_CONTAINER_NAME, file, "/root/.ssh/id_rsa"); | |||||
| } | |||||
| } | |||||