@@ -0,0 +1,31 @@ | |||||
HELP.md | |||||
target/ | |||||
!.mvn/wrapper/maven-wrapper.jar | |||||
!**/src/main/** | |||||
!**/src/test/** | |||||
### STS ### | |||||
.apt_generated | |||||
.classpath | |||||
.factorypath | |||||
.project | |||||
.settings | |||||
.springBeans | |||||
.sts4-cache | |||||
### IntelliJ IDEA ### | |||||
.idea | |||||
*.iws | |||||
*.iml | |||||
*.ipr | |||||
### NetBeans ### | |||||
/nbproject/private/ | |||||
/nbbuild/ | |||||
/dist/ | |||||
/nbdist/ | |||||
/.nb-gradle/ | |||||
build/ | |||||
### VS Code ### | |||||
.vscode/ |
@@ -23,4 +23,4 @@ mvn clean compile package | |||||
``` | ``` | ||||
### 部署 | ### 部署 | ||||
部署过程参看文档:[部署 分布式训练operator](http://tianshu.org.cn/?/course/1.html) | |||||
部署过程参看文档:[部署 分布式训练operator](http://docs.dubhe.ai/docs/setup/deploy-distribute-train-operator) |
@@ -48,18 +48,22 @@ spec: | |||||
value: 6 | value: 6 | ||||
- name: NCCL_DEBUG | - name: NCCL_DEBUG | ||||
value: INFO | value: INFO | ||||
datasetStorage: | |||||
name: pvc-dataset | |||||
nfs: | |||||
path: {{DATASET}} | |||||
server: {{NFS}} | |||||
workspaceStorage: | |||||
name: pvc-workspace | |||||
nfs: | |||||
path: /nfs/resnet50/workspace | |||||
server: {{WORKSPACE}} | |||||
modelStorage: | |||||
name: pvc-model | |||||
nfs: | |||||
path: /nfs/resnet50/model | |||||
server: {{MODEL}} | |||||
volumeMounts: | |||||
- mountPath: /dataset | |||||
name: volume-0 | |||||
- mountPath: /workspace | |||||
name: volume-1 | |||||
volumes: | |||||
- name: volume-0 | |||||
nfs: | |||||
path: /nfs/dubhe-prod/dataset/5/versionFile/V0001/ofrecord/train | |||||
server: {{NFS IP}} | |||||
- name: volume-1 | |||||
nfs: | |||||
path: /nfs/dubhe-prod/train-manage/1/train-1-20200825173815-v0020 | |||||
server: {{NFS IP}} | |||||
tolerations: | |||||
- key: "platform/node-isolate" | |||||
operator: "Equal" | |||||
value: "prod-isolate-1" | |||||
effect: "NoSchedule" |
@@ -45,12 +45,12 @@ spec: | |||||
type: object | type: object | ||||
initContainer: | initContainer: | ||||
type: object | type: object | ||||
datasetStorage: | |||||
type: object | |||||
workspaceStorage: | |||||
type: object | |||||
modelStorage: | |||||
type: object | |||||
volumeMounts: | |||||
type: array | |||||
volumes: | |||||
type: array | |||||
tolerations: | |||||
type: array | |||||
required: | required: | ||||
- image | - image | ||||
- imagePullPolicy | - imagePullPolicy | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -184,11 +184,11 @@ public class DistributeTrainOperatorManager { | |||||
.addToProperties("slaveResources", objectType) | .addToProperties("slaveResources", objectType) | ||||
.addToProperties("nodeSelector", objectType) | .addToProperties("nodeSelector", objectType) | ||||
.addToProperties("initContainer", objectType) | .addToProperties("initContainer", objectType) | ||||
.addToProperties("datasetStorage", objectType) | |||||
.addToProperties("workspaceStorage", objectType) | |||||
.addToProperties("modelStorage", objectType) | |||||
.addToProperties("volumeMounts", arrayType) | |||||
.addToProperties("volumes", arrayType) | |||||
.addToProperties("tolerations", arrayType) | |||||
.withType("object") | .withType("object") | ||||
.addToRequired("image", "imagePullPolicy", "size", "masterCmd", "slaveCmd", "workspaceStorage") | |||||
.addToRequired("image", "imagePullPolicy", "size", "masterCmd", "slaveCmd") | |||||
.build(); | .build(); | ||||
properties.put("apiVersion", stringType); | properties.put("apiVersion", stringType); | ||||
properties.put("kind", stringType); | properties.put("kind", stringType); | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -123,29 +123,27 @@ public class ChildResourceCreateInfo extends AbstractResourceCreateInfo { | |||||
private Container initContainer; | private Container initContainer; | ||||
/** | /** | ||||
* 工作目录挂载 | |||||
* 环境变量 | |||||
*/ | */ | ||||
private Volume workspaceVolume; | |||||
private List<EnvVar> env; | |||||
/** | /** | ||||
* 数据集目录挂载 | |||||
* 拥有者信息 | |||||
*/ | */ | ||||
private Volume datasetVolume; | |||||
private OwnerReference ownerReference; | |||||
/** | /** | ||||
* 模型目录挂载 | |||||
* 内部映射 | |||||
*/ | */ | ||||
private Volume modelVolume; | |||||
private List<VolumeMount> volumeMounts; | |||||
/** | /** | ||||
* 环境变量 | |||||
* 外部挂载 | |||||
*/ | */ | ||||
private List<EnvVar> env; | |||||
private List<Volume> volumes; | |||||
/** | /** | ||||
* 拥有者信息 | |||||
* 容忍度 | |||||
*/ | */ | ||||
private OwnerReference ownerReference; | |||||
private List<Toleration> tolerations; | |||||
/** | /** | ||||
* 将分布式训练转换为K8S的资源信息 | * 将分布式训练转换为K8S的资源信息 | ||||
@@ -173,12 +171,14 @@ public class ChildResourceCreateInfo extends AbstractResourceCreateInfo { | |||||
info.setMasterCmd(distributeTrain.getSpec().getMasterCmd()) | info.setMasterCmd(distributeTrain.getSpec().getMasterCmd()) | ||||
.setSlaveCmd(distributeTrain.getSpec().getSlaveCmd()); | .setSlaveCmd(distributeTrain.getSpec().getSlaveCmd()); | ||||
//挂载 | //挂载 | ||||
Optional.ofNullable(distributeTrain.getSpec().getWorkspaceStorage()) | |||||
.ifPresent(v -> info.setWorkspaceVolume(v)); | |||||
Optional.ofNullable(distributeTrain.getSpec().getDatasetStorage()) | |||||
.ifPresent(v -> info.setDatasetVolume(v)); | |||||
Optional.ofNullable(distributeTrain.getSpec().getModelStorage()) | |||||
.ifPresent(v -> info.setModelVolume(v)); | |||||
Optional.ofNullable(distributeTrain.getSpec().getVolumeMounts()) | |||||
.ifPresent(v -> info.setVolumeMounts(v)); | |||||
Optional.ofNullable(distributeTrain.getSpec().getVolumes()) | |||||
.ifPresent(v -> info.setVolumes(v)); | |||||
//容忍度 | |||||
Optional.ofNullable(distributeTrain.getSpec().getTolerations()) | |||||
.ifPresent(v -> info.setTolerations(v)); | |||||
//主从两组资源限制 | //主从两组资源限制 | ||||
Optional.ofNullable(distributeTrain.getSpec().getMasterResources()) | Optional.ofNullable(distributeTrain.getSpec().getMasterResources()) | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -33,6 +33,7 @@ import io.fabric8.kubernetes.api.model.batch.JobBuilder; | |||||
import org.onebrain.operator.action.deployer.ChildResourceCreateInfo; | import org.onebrain.operator.action.deployer.ChildResourceCreateInfo; | ||||
import org.onebrain.operator.action.deployer.JobDeployer; | import org.onebrain.operator.action.deployer.JobDeployer; | ||||
import org.onebrain.operator.constants.KubeConstants; | import org.onebrain.operator.constants.KubeConstants; | ||||
import org.springframework.util.CollectionUtils; | |||||
import java.util.*; | import java.util.*; | ||||
@@ -46,13 +47,7 @@ import static org.onebrain.operator.constants.NumberConstant.NUMBER_22; | |||||
*/ | */ | ||||
public class BaseJobDeployer implements JobDeployer<ChildResourceCreateInfo> { | public class BaseJobDeployer implements JobDeployer<ChildResourceCreateInfo> { | ||||
public static final String PVC_WORKSPACE = "pvc-workspace"; | |||||
public static final String SSH = "ssh"; | public static final String SSH = "ssh"; | ||||
public static final String WORKSPACE = "/workspace"; | |||||
public static final String PVC_DATASET = "pvc-dataset"; | |||||
public static final String DATASET = "/dataset"; | |||||
public static final String PVC_MODEL = "pvc-model"; | |||||
public static final String MODEL = "/model"; | |||||
public static final String MEMORY = "Memory"; | public static final String MEMORY = "Memory"; | ||||
public static final String DEV_SHM = "/dev/shm"; | public static final String DEV_SHM = "/dev/shm"; | ||||
public static final String BIN_BASH = "/bin/bash"; | public static final String BIN_BASH = "/bin/bash"; | ||||
@@ -74,6 +69,11 @@ public class BaseJobDeployer implements JobDeployer<ChildResourceCreateInfo> { | |||||
//挂载 | //挂载 | ||||
List<VolumeMount> volumeMounts = buildVolumeMounts(volumes); | List<VolumeMount> volumeMounts = buildVolumeMounts(volumes); | ||||
if (!CollectionUtils.isEmpty(info.getVolumes()) && !CollectionUtils.isEmpty(info.getVolumeMounts())){ | |||||
volumes.addAll(info.getVolumes()); | |||||
volumeMounts.addAll(info.getVolumeMounts()); | |||||
} | |||||
container.setVolumeMounts(volumeMounts); | container.setVolumeMounts(volumeMounts); | ||||
//启动命令 | //启动命令 | ||||
@@ -123,6 +123,7 @@ public class BaseJobDeployer implements JobDeployer<ChildResourceCreateInfo> { | |||||
.addToContainers(container) | .addToContainers(container) | ||||
.addToVolumes(volumes.toArray(new Volume[volumes.size()])) | .addToVolumes(volumes.toArray(new Volume[volumes.size()])) | ||||
.withRestartPolicy(RESTART_POLICY_NEVER) | .withRestartPolicy(RESTART_POLICY_NEVER) | ||||
.withTolerations(info.getTolerations()) | |||||
.endSpec() | .endSpec() | ||||
.endTemplate() | .endTemplate() | ||||
.endSpec(); | .endSpec(); | ||||
@@ -192,9 +193,6 @@ public class BaseJobDeployer implements JobDeployer<ChildResourceCreateInfo> { | |||||
private List<Volume> buildVolumes(ChildResourceCreateInfo info){ | private List<Volume> buildVolumes(ChildResourceCreateInfo info){ | ||||
//存储卷 | //存储卷 | ||||
List<Volume> volumes = new LinkedList<>(); | List<Volume> volumes = new LinkedList<>(); | ||||
Optional.ofNullable(info.getWorkspaceVolume()).ifPresent(v-> volumes.add(v)); | |||||
Optional.ofNullable(info.getDatasetVolume()).ifPresent(v-> volumes.add(v)); | |||||
Optional.ofNullable(info.getModelVolume()).ifPresent(v-> volumes.add(v)); | |||||
//shm默认就有 | //shm默认就有 | ||||
volumes.add(new VolumeBuilder() | volumes.add(new VolumeBuilder() | ||||
.withName(KubeConstants.VOLUME_SHM) | .withName(KubeConstants.VOLUME_SHM) | ||||
@@ -213,30 +211,6 @@ public class BaseJobDeployer implements JobDeployer<ChildResourceCreateInfo> { | |||||
*/ | */ | ||||
private List<VolumeMount> buildVolumeMounts(List<Volume> volumes) { | private List<VolumeMount> buildVolumeMounts(List<Volume> volumes) { | ||||
List<VolumeMount> volumeMounts = new LinkedList<>(); | List<VolumeMount> volumeMounts = new LinkedList<>(); | ||||
for (Volume volume : volumes) { | |||||
if(PVC_WORKSPACE.equals(volume.getName())){ | |||||
volumeMounts.add(new VolumeMountBuilder() | |||||
.withName(volume.getName()) | |||||
.withMountPath(WORKSPACE) | |||||
.build()); | |||||
continue; | |||||
} | |||||
if(PVC_DATASET.equals(volume.getName())){ | |||||
volumeMounts.add(new VolumeMountBuilder() | |||||
.withName(volume.getName()) | |||||
.withMountPath(DATASET) | |||||
.build()); | |||||
continue; | |||||
} | |||||
if(PVC_MODEL.equals(volume.getName())){ | |||||
volumeMounts.add(new VolumeMountBuilder() | |||||
.withName(volume.getName()) | |||||
.withMountPath(MODEL) | |||||
.build()); | |||||
continue; | |||||
} | |||||
} | |||||
volumeMounts.add(new VolumeMountBuilder() | volumeMounts.add(new VolumeMountBuilder() | ||||
.withName(KubeConstants.VOLUME_SHM) | .withName(KubeConstants.VOLUME_SHM) | ||||
.withMountPath(DEV_SHM) | .withMountPath(DEV_SHM) | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -35,6 +35,7 @@ import io.fabric8.kubernetes.api.model.apps.StatefulSetBuilder; | |||||
import org.onebrain.operator.action.deployer.ChildResourceCreateInfo; | import org.onebrain.operator.action.deployer.ChildResourceCreateInfo; | ||||
import org.onebrain.operator.action.deployer.StatefulSetDeployer; | import org.onebrain.operator.action.deployer.StatefulSetDeployer; | ||||
import org.onebrain.operator.constants.KubeConstants; | import org.onebrain.operator.constants.KubeConstants; | ||||
import org.springframework.util.CollectionUtils; | |||||
import java.util.Arrays; | import java.util.Arrays; | ||||
import java.util.Collections; | import java.util.Collections; | ||||
@@ -55,12 +56,6 @@ import static org.onebrain.operator.constants.NumberConstant.NUMBER_22; | |||||
public class BaseStatefulSetDeployer implements StatefulSetDeployer<ChildResourceCreateInfo> { | public class BaseStatefulSetDeployer implements StatefulSetDeployer<ChildResourceCreateInfo> { | ||||
public static final String SSH = "ssh"; | public static final String SSH = "ssh"; | ||||
public static final String PVC_WORKSPACE = "pvc-workspace"; | |||||
public static final String WORKSPACE = "/workspace"; | |||||
public static final String PVC_DATASET = "pvc-dataset"; | |||||
public static final String DATASET = "/dataset"; | |||||
public static final String PVC_MODEL = "pvc-model"; | |||||
public static final String MODEL = "/model"; | |||||
public static final String MEMORY = "Memory"; | public static final String MEMORY = "Memory"; | ||||
public static final String DEV_SHM = "/dev/shm"; | public static final String DEV_SHM = "/dev/shm"; | ||||
public static final String BIN_BASH = "/bin/bash"; | public static final String BIN_BASH = "/bin/bash"; | ||||
@@ -83,6 +78,11 @@ public class BaseStatefulSetDeployer implements StatefulSetDeployer<ChildResourc | |||||
//挂载 | //挂载 | ||||
List<VolumeMount> volumeMounts = buildVolumeMounts(volumes); | List<VolumeMount> volumeMounts = buildVolumeMounts(volumes); | ||||
if (!CollectionUtils.isEmpty(info.getVolumes()) && !CollectionUtils.isEmpty(info.getVolumeMounts())){ | |||||
volumes.addAll(info.getVolumes()); | |||||
volumeMounts.addAll(info.getVolumeMounts()); | |||||
} | |||||
container.setVolumeMounts(volumeMounts); | container.setVolumeMounts(volumeMounts); | ||||
//启动命令 | //启动命令 | ||||
@@ -126,6 +126,7 @@ public class BaseStatefulSetDeployer implements StatefulSetDeployer<ChildResourc | |||||
.withTerminationGracePeriodSeconds(LONG_NUMBER_60) | .withTerminationGracePeriodSeconds(LONG_NUMBER_60) | ||||
.addToContainers(container) | .addToContainers(container) | ||||
.addToVolumes(volumes.toArray(new Volume[0])) | .addToVolumes(volumes.toArray(new Volume[0])) | ||||
.withTolerations(info.getTolerations()) | |||||
.endSpec() | .endSpec() | ||||
.endTemplate() | .endTemplate() | ||||
.endSpec(); | .endSpec(); | ||||
@@ -191,9 +192,6 @@ public class BaseStatefulSetDeployer implements StatefulSetDeployer<ChildResourc | |||||
*/ | */ | ||||
private List<Volume> buildVolumes(ChildResourceCreateInfo info) { | private List<Volume> buildVolumes(ChildResourceCreateInfo info) { | ||||
List<Volume> volumes = new LinkedList<>(); | List<Volume> volumes = new LinkedList<>(); | ||||
Optional.ofNullable(info.getWorkspaceVolume()).ifPresent(v-> volumes.add(v)); | |||||
Optional.ofNullable(info.getDatasetVolume()).ifPresent(v-> volumes.add(v)); | |||||
Optional.ofNullable(info.getModelVolume()).ifPresent(v-> volumes.add(v)); | |||||
//shm默认就有 | //shm默认就有 | ||||
volumes.add(new VolumeBuilder() | volumes.add(new VolumeBuilder() | ||||
@@ -213,30 +211,6 @@ public class BaseStatefulSetDeployer implements StatefulSetDeployer<ChildResourc | |||||
*/ | */ | ||||
private List<VolumeMount> buildVolumeMounts(List<Volume> volumes) { | private List<VolumeMount> buildVolumeMounts(List<Volume> volumes) { | ||||
List<VolumeMount> volumeMounts=new LinkedList<>(); | List<VolumeMount> volumeMounts=new LinkedList<>(); | ||||
for (Volume volume : volumes) { | |||||
if(PVC_WORKSPACE.equals(volume.getName())){ | |||||
volumeMounts.add(new VolumeMountBuilder() | |||||
.withName(volume.getName()) | |||||
.withMountPath(WORKSPACE) | |||||
.build()); | |||||
continue; | |||||
} | |||||
if(PVC_DATASET.equals(volume.getName())){ | |||||
volumeMounts.add(new VolumeMountBuilder() | |||||
.withName(volume.getName()) | |||||
.withMountPath(DATASET) | |||||
.build()); | |||||
continue; | |||||
} | |||||
if(PVC_MODEL.equals(volume.getName())){ | |||||
volumeMounts.add(new VolumeMountBuilder() | |||||
.withName(volume.getName()) | |||||
.withMountPath(MODEL) | |||||
.build()); | |||||
continue; | |||||
} | |||||
} | |||||
volumeMounts.add(new VolumeMountBuilder() | volumeMounts.add(new VolumeMountBuilder() | ||||
.withName(KubeConstants.VOLUME_SHM) | .withName(KubeConstants.VOLUME_SHM) | ||||
.withMountPath(DEV_SHM) | .withMountPath(DEV_SHM) | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -143,7 +143,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { | |||||
* @param distributeTrain | * @param distributeTrain | ||||
*/ | */ | ||||
public void doAction(DistributeTrain distributeTrain) { | public void doAction(DistributeTrain distributeTrain) { | ||||
log.info("doAction=>distributeTrain : 【{}】", distributeTrain); | |||||
log.info("doAction=>distributeTrain : 【{}】", distributeTrain.getMetadata().getName()); | |||||
ChildResourceCreateInfo info = null; | ChildResourceCreateInfo info = null; | ||||
try { | try { | ||||
//redis重复检查 | //redis重复检查 | ||||
@@ -200,7 +200,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { | |||||
*/ | */ | ||||
@Override | @Override | ||||
public void handlerAction(DistributeTrain distributeTrain) { | public void handlerAction(DistributeTrain distributeTrain) { | ||||
log.info("handlerAction=>distributeTrain : 【{}】", distributeTrain); | |||||
log.info("handlerAction=>distributeTrain : 【{}】", distributeTrain.getMetadata().getName()); | |||||
HandlerActionTask handlerActionTask = new HandlerActionTask(distributeTrain); | HandlerActionTask handlerActionTask = new HandlerActionTask(distributeTrain); | ||||
pool.getActiveCount(); | pool.getActiveCount(); | ||||
pool.execute(handlerActionTask); | pool.execute(handlerActionTask); | ||||
@@ -211,7 +211,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { | |||||
* @param distributeTrain 分布式训练 | * @param distributeTrain 分布式训练 | ||||
*/ | */ | ||||
private void validateParams(DistributeTrain distributeTrain) { | private void validateParams(DistributeTrain distributeTrain) { | ||||
log.info("validateParams=>distributeTrain : 【{}】", distributeTrain); | |||||
log.info("validateParams=>distributeTrain : 【{}】", distributeTrain.getMetadata().getName()); | |||||
Integer size = distributeTrain.getSpec().getSize(); | Integer size = distributeTrain.getSpec().getSize(); | ||||
if (size < NUMBER_2) { | if (size < NUMBER_2) { | ||||
throw new OperatorException("size must be greater than 1"); | throw new OperatorException("size must be greater than 1"); | ||||
@@ -254,7 +254,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { | |||||
* @param info 资源信息 | * @param info 资源信息 | ||||
*/ | */ | ||||
private void createStatefulSet(ChildResourceCreateInfo info) { | private void createStatefulSet(ChildResourceCreateInfo info) { | ||||
log.info("createStatefulSet=>childResourceCreateInfo : 【{}】", info); | |||||
log.info("createStatefulSet=>childResourceCreateInfo : 【{}】", info.getParentName()); | |||||
StatefulSet statefulSet = client.apps().statefulSets() | StatefulSet statefulSet = client.apps().statefulSets() | ||||
.inNamespace(info.getNamespace()) | .inNamespace(info.getNamespace()) | ||||
.withName(info.getStatefulSetName()).get(); | .withName(info.getStatefulSetName()).get(); | ||||
@@ -298,7 +298,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { | |||||
* @param info Job信息 | * @param info Job信息 | ||||
*/ | */ | ||||
private void createJob(ChildResourceCreateInfo info) { | private void createJob(ChildResourceCreateInfo info) { | ||||
log.info("createJob=>childResourceCreateInfo : 【{}】", info); | |||||
log.info("createJob=>childResourceCreateInfo : 【{}】", info.getParentName()); | |||||
Job job = client.batch().jobs() | Job job = client.batch().jobs() | ||||
.inNamespace(info.getNamespace()) | .inNamespace(info.getNamespace()) | ||||
.withName(info.getJobName()).get(); | .withName(info.getJobName()).get(); | ||||
@@ -311,7 +311,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { | |||||
JobDeployer deployer = new BaseJobDeployer(); | JobDeployer deployer = new BaseJobDeployer(); | ||||
JobBuilder builder = deployer.deploy(info); | JobBuilder builder = deployer.deploy(info); | ||||
job = builder.build(); | job = builder.build(); | ||||
log.info("job is : 【{}】", job); | |||||
log.info("job is : 【{}】", job.getMetadata().getName()); | |||||
client.batch().jobs().create(job); | client.batch().jobs().create(job); | ||||
log.info("create job【{}】 successfully", job.getMetadata().getName()); | log.info("create job【{}】 successfully", job.getMetadata().getName()); | ||||
} | } | ||||
@@ -394,7 +394,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { | |||||
* @param slavePods | * @param slavePods | ||||
*/ | */ | ||||
private void collectChildPodInfo(ChildResourceCreateInfo info, Pod masterPod, List<Pod> slavePods) { | private void collectChildPodInfo(ChildResourceCreateInfo info, Pod masterPod, List<Pod> slavePods) { | ||||
log.info("collectChildPodInfo=>childResourceCreateInfo : 【{}】, masterPod : 【{}】, slavePods : 【{}】", info, masterPod, slavePods); | |||||
log.info("collectChildPodInfo=>childResourceCreateInfo : 【{}】, masterPod : 【{}】", info.getParentName(), masterPod.getMetadata().getName()); | |||||
String key = info.getOwnerReference().getUid(); | String key = info.getOwnerReference().getUid(); | ||||
if (dtMap.containsKey(key)) { | if (dtMap.containsKey(key)) { | ||||
dtMap.remove(key); | dtMap.remove(key); | ||||
@@ -527,7 +527,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { | |||||
* @param info | * @param info | ||||
*/ | */ | ||||
private void recycleCr(ChildResourceCreateInfo info) { | private void recycleCr(ChildResourceCreateInfo info) { | ||||
log.info("recycleCr=>childResourceCreateInfo : 【{}】", info); | |||||
log.info("recycleCr=>childResourceCreateInfo : 【{}】", info.getParentName()); | |||||
Optional.ofNullable(DistributeTrainClientHolder.getClient()) | Optional.ofNullable(DistributeTrainClientHolder.getClient()) | ||||
.ifPresent(distributeTrainClient -> { | .ifPresent(distributeTrainClient -> { | ||||
ObjectMeta metadata = new ObjectMeta(); | ObjectMeta metadata = new ObjectMeta(); | ||||
@@ -542,7 +542,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { | |||||
/**更新状态*/ | /**更新状态*/ | ||||
private void updateStatus(ChildResourceCreateInfo info, DistributeTrain distributeTrain) { | private void updateStatus(ChildResourceCreateInfo info, DistributeTrain distributeTrain) { | ||||
log.info("updateStatus=>childResourceCreateInfo : 【{}】, distributeTrain : 【{}】", info, distributeTrain); | |||||
log.info("updateStatus=>childResourceCreateInfo : 【{}】, distributeTrain : 【{}】", info.getParentName(), distributeTrain.getMetadata().getName()); | |||||
if (distributeTrain.getStatus() == null) { | if (distributeTrain.getStatus() == null) { | ||||
distributeTrain.setStatus(new DistributeTrainStatus()); | distributeTrain.setStatus(new DistributeTrainStatus()); | ||||
} | } | ||||
@@ -568,7 +568,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { | |||||
* @return List<Pod> 分布式相关Pod集合 | * @return List<Pod> 分布式相关Pod集合 | ||||
*/ | */ | ||||
private List<Pod> getPods(ChildResourceCreateInfo info) { | private List<Pod> getPods(ChildResourceCreateInfo info) { | ||||
log.info("getPods=>childResourceCreateInfo : 【{}】", info); | |||||
log.info("getPods=>childResourceCreateInfo : 【{}】", info.getParentName()); | |||||
List<Pod> pods = Lists.newArrayList(); | List<Pod> pods = Lists.newArrayList(); | ||||
pods.add(getMasterPod(info)); | pods.add(getMasterPod(info)); | ||||
pods.addAll(getSlavePods(info)); | pods.addAll(getSlavePods(info)); | ||||
@@ -584,7 +584,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { | |||||
* @return Pod Master节点对应的Pod | * @return Pod Master节点对应的Pod | ||||
*/ | */ | ||||
private Pod getMasterPod(ChildResourceCreateInfo info) { | private Pod getMasterPod(ChildResourceCreateInfo info) { | ||||
log.info("getMasterPod=>childResourceCreateInfo : 【{}】", info); | |||||
log.info("getMasterPod=>childResourceCreateInfo : 【{}】", info.getParentName()); | |||||
List<Pod> masterPods = client.pods().inNamespace(info.getNamespace()) | List<Pod> masterPods = client.pods().inNamespace(info.getNamespace()) | ||||
.withLabel(JOB_LABEL, info.getJobName()) | .withLabel(JOB_LABEL, info.getJobName()) | ||||
.list().getItems(); | .list().getItems(); | ||||
@@ -600,7 +600,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { | |||||
* @return List<Pod> Slave节点对应的Pod集合 | * @return List<Pod> Slave节点对应的Pod集合 | ||||
*/ | */ | ||||
private List<Pod> getSlavePods(ChildResourceCreateInfo info) { | private List<Pod> getSlavePods(ChildResourceCreateInfo info) { | ||||
log.info("getSlavePods=>childResourceCreateInfo : 【{}】", info); | |||||
log.info("getSlavePods=>childResourceCreateInfo : 【{}】", info.getParentName()); | |||||
//取得从的所有pod | //取得从的所有pod | ||||
List<Pod> slavePods = client.pods().inNamespace(info.getNamespace()) | List<Pod> slavePods = client.pods().inNamespace(info.getNamespace()) | ||||
.withLabel(STATEFULSET_LABEL, info.getStatefulSetName()) | .withLabel(STATEFULSET_LABEL, info.getStatefulSetName()) | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -105,4 +105,18 @@ public class DistributeTrainSpec implements KubernetesResource { | |||||
*/ | */ | ||||
private Volume modelStorage; | private Volume modelStorage; | ||||
/** | |||||
* 内部映射 | |||||
*/ | |||||
private List<VolumeMount> volumeMounts; | |||||
/** | |||||
* 外部挂载 | |||||
*/ | |||||
private List<Volume> volumes; | |||||
/** | |||||
* 容忍度 | |||||
*/ | |||||
private List<Toleration> tolerations; | |||||
} | } |
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -58,7 +58,7 @@ public class JobHandler { | |||||
* @param job | * @param job | ||||
*/ | */ | ||||
public void handleJob(Job job) { | public void handleJob(Job job) { | ||||
log.info("handleJob=>job : 【{}】", job); | |||||
log.info("handleJob=>job : 【{}】", job.getMetadata().getName()); | |||||
//筛选出DistributeTrain下的job | //筛选出DistributeTrain下的job | ||||
List<OwnerReference> ownerReferences = job.getMetadata().getOwnerReferences(); | List<OwnerReference> ownerReferences = job.getMetadata().getOwnerReferences(); | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -51,7 +51,7 @@ public class JobWatcher implements Watcher<Job> { | |||||
*/ | */ | ||||
@Override | @Override | ||||
public void eventReceived(Action action, Job job) { | public void eventReceived(Action action, Job job) { | ||||
log.info("Job Event received: {} at {}", job.getMetadata().getUid(), job.getMetadata().getCreationTimestamp()); | |||||
log.info("Job Event received: {} action {}", job.getMetadata().getName(), action.toString()); | |||||
jobHandler.handleJob(job); | jobHandler.handleJob(job); | ||||
} | } | ||||
@@ -1,5 +1,5 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
/** | |||||
* Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
* | * | ||||
* Licensed under the Apache License, Version 2.0 (the "License"); | * Licensed under the Apache License, Version 2.0 (the "License"); | ||||
* you may not use this file except in compliance with the License. | * you may not use this file except in compliance with the License. | ||||
@@ -1,46 +1,43 @@ | |||||
#!/bin/bash | #!/bin/bash | ||||
if [ ! -f "/etc/init.d/ssh" ]; then | |||||
if [ ! -f "/etc/redhat-release" ]; then | |||||
if [ -f "/etc/lsb-release" ]; then | |||||
if [ ! -f "/etc/init.d/ssh" ]; then | |||||
echo 'apt install -y openssh-server' >> pretreatment.log | echo 'apt install -y openssh-server' >> pretreatment.log | ||||
apt update >> pretreatment.log | apt update >> pretreatment.log | ||||
apt install -y openssh-server >> pretreatment.log | apt install -y openssh-server >> pretreatment.log | ||||
echo '/etc/init.d/ssh start' >> pretreatment.log | |||||
/etc/init.d/ssh start >> pretreatment.log | |||||
fi | fi | ||||
if [ ! -f "/etc/lsb-release" ]; then | |||||
echo 'yum install -y sshd' >> pretreatment.log | |||||
yum update >> pretreatment.log | |||||
yum install -y sshd >> pretreatment.log | |||||
fi | |||||
fi | |||||
echo '/etc/init.d/ssh start' >> pretreatment.log | |||||
/etc/init.d/ssh start >> pretreatment.log | |||||
if [ -f "/etc/redhat-release" ]; then | |||||
if command -v nslookup >/dev/null 2>&1; then | |||||
echo 'exists nslookup' >> pretreatment.log | |||||
else | |||||
echo 'yum install dnsutils jq' >> pretreatment.log | |||||
yum install -y dnsutils >> pretreatment.log | |||||
yum install -y jq >> pretreatment.log | |||||
fi | |||||
if command -v nslookup >/dev/null 2>&1; then | |||||
echo 'exists nslookup' >> pretreatment.log | |||||
else | |||||
echo 'yum install dnsutils jq' >> pretreatment.log | |||||
yum install -y dnsutils >> pretreatment.log | |||||
yum install -y jq >> pretreatment.log | |||||
fi | |||||
fi | |||||
if [ -f "/etc/lsb-release" ]; then | |||||
if command -v jq >/dev/null 2>&1; then | |||||
if command -v jq >/dev/null 2>&1; then | |||||
echo 'exists jq' >> pretreatment.log | echo 'exists jq' >> pretreatment.log | ||||
else | |||||
else | |||||
echo 'apt install jq' >> pretreatment.log | echo 'apt install jq' >> pretreatment.log | ||||
apt install -y jq >> pretreatment.log | apt install -y jq >> pretreatment.log | ||||
fi | fi | ||||
if command -v nslookup >/dev/null 2>&1; then | |||||
if command -v nslookup >/dev/null 2>&1; then | |||||
echo 'exists nslookup' >> pretreatment.log | echo 'exists nslookup' >> pretreatment.log | ||||
else | |||||
else | |||||
echo 'apt install dnsutils' >> pretreatment.log | echo 'apt install dnsutils' >> pretreatment.log | ||||
apt install -y dnsutils >> pretreatment.log | apt install -y dnsutils >> pretreatment.log | ||||
fi | fi | ||||
fi | fi | ||||
if [ -f "/etc/redhat-release" ]; then | |||||
if [ ! -f "/usr/sbin/sshd" ]; then | |||||
echo 'yum install -y sshd' >> pretreatment.log | |||||
yum update >> pretreatment.log | |||||
yum install -y sshd >> pretreatment.log | |||||
echo 'sshd start' >> pretreatment.log | |||||
service sshd start >> pretreatment.log | |||||
fi | |||||
if command -v nslookup >/dev/null 2>&1; then | |||||
echo 'exists nslookup' >> pretreatment.log | |||||
else | |||||
echo 'yum install -y bind-utils' >> pretreatment.log | |||||
yum install -y bind-utils >> pretreatment.log | |||||
fi | |||||
if command -v jq >/dev/null 2>&1; then | |||||
echo 'exists jq' >> pretreatment.log | |||||
else | |||||
echo 'yum install -y jq' >> pretreatment.log | |||||
yum install -y jq >> pretreatment.log | |||||
fi | |||||
fi |
@@ -1,43 +0,0 @@ | |||||
/** | |||||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
* | |||||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||||
* you may not use this file except in compliance with the License. | |||||
* You may obtain a copy of the License at | |||||
* | |||||
* http://www.apache.org/licenses/LICENSE-2.0 | |||||
* | |||||
* Unless required by applicable law or agreed to in writing, software | |||||
* distributed under the License is distributed on an "AS IS" BASIS, | |||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
* See the License for the specific language governing permissions and | |||||
* limitations under the License. | |||||
* ============================================================= | |||||
*/ | |||||
package org.onebrain.operator; | |||||
import org.onebrain.operator.api.pod.PodApi; | |||||
import org.onebrain.operator.constants.KubeConstants; | |||||
import org.springframework.beans.factory.annotation.Autowired; | |||||
import org.springframework.boot.test.context.SpringBootTest; | |||||
import java.io.File; | |||||
import java.net.URISyntaxException; | |||||
import java.net.URL; | |||||
@SpringBootTest | |||||
public class DistributeTrainOperatorApplicationTests { | |||||
@Autowired | |||||
private PodApi podApi; | |||||
// @Test | |||||
public void contextLoads() throws URISyntaxException { | |||||
final URL url = getClass().getClassLoader().getResource("key/id_rsa"); | |||||
File file = new File(url.toURI()); | |||||
podApi.copyToPod("default", "distribute-train-test-job-sv2dj", KubeConstants.MASTER_CONTAINER_NAME, file, "/root/.ssh/id_rsa"); | |||||
} | |||||
} |