Browse Source

update operator

tags/v0.4.0
之江天枢 3 years ago
parent
commit
87e58112aa
49 changed files with 236 additions and 285 deletions
  1. +31
    -0
      distribute-train-operator/.gitignore
  2. +1
    -1
      distribute-train-operator/README.md
  3. +19
    -15
      distribute-train-operator/docs/crds/distribute-train-cr.yaml
  4. +6
    -6
      distribute-train-operator/docs/crds/distribute-train-crd.yaml
  5. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/DistributeTrainOperatorApplication.java
  6. +6
    -6
      distribute-train-operator/src/main/java/org/onebrain/operator/action/DistributeTrainOperatorManager.java
  7. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/action/OperatorRunner.java
  8. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/action/PodInfo.java
  9. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/AbstractResourceCreateInfo.java
  10. +20
    -20
      distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/ChildResourceCreateInfo.java
  11. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/JobDeployer.java
  12. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/ServiceDeployer.java
  13. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/StatefulSetDeployer.java
  14. +9
    -35
      distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/impl/BaseJobDeployer.java
  15. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/impl/BaseServiceDeployer.java
  16. +9
    -35
      distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/impl/BaseStatefulSetDeployer.java
  17. +14
    -14
      distribute-train-operator/src/main/java/org/onebrain/operator/action/handler/AddActionHandler.java
  18. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/action/handler/DeleteActionHandler.java
  19. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/action/handler/DistributeTrainActionHandler.java
  20. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/api/pod/DefaultPodExecListener.java
  21. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/api/pod/PodApi.java
  22. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/api/pod/StdPodExecListener.java
  23. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/config/KubeConfig.java
  24. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/constants/CrdConstants.java
  25. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/constants/KubeConstants.java
  26. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/constants/NumberConstant.java
  27. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/context/KubeContext.java
  28. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/controller/DistributeTrainController.java
  29. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/crd/DistributeTrain.java
  30. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/crd/DistributeTrainList.java
  31. +16
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/crd/DistributeTrainSpec.java
  32. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/crd/DistributeTrainStatus.java
  33. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/crd/DoneableDistributeTrain.java
  34. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/enums/AccessModeEnum.java
  35. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/exception/OperatorException.java
  36. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/properties/KubeProperties.java
  37. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/redis/AbstractKeyPrefix.java
  38. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/redis/RedisService.java
  39. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/redis/key/OperatorKey.java
  40. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/utils/DistributeTrainClientHolder.java
  41. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/utils/FastjsonUtils.java
  42. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/utils/IOUtils.java
  43. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/utils/RedisUtils.java
  44. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/utils/SpringContextHolder.java
  45. +3
    -3
      distribute-train-operator/src/main/java/org/onebrain/operator/watcher/JobHandler.java
  46. +3
    -3
      distribute-train-operator/src/main/java/org/onebrain/operator/watcher/JobWatcher.java
  47. +2
    -2
      distribute-train-operator/src/main/java/org/onebrain/operator/watcher/KubeWatcherManager.java
  48. +29
    -32
      distribute-train-operator/src/main/resources/shell/pretreatment
  49. +0
    -43
      distribute-train-operator/src/test/java/org/onebrain/operator/DistributeTrainOperatorApplicationTests.java

+ 31
- 0
distribute-train-operator/.gitignore View File

@@ -0,0 +1,31 @@
HELP.md
target/
!.mvn/wrapper/maven-wrapper.jar
!**/src/main/**
!**/src/test/**

### STS ###
.apt_generated
.classpath
.factorypath
.project
.settings
.springBeans
.sts4-cache

### IntelliJ IDEA ###
.idea
*.iws
*.iml
*.ipr

### NetBeans ###
/nbproject/private/
/nbbuild/
/dist/
/nbdist/
/.nb-gradle/
build/

### VS Code ###
.vscode/

+ 1
- 1
distribute-train-operator/README.md View File

@@ -23,4 +23,4 @@ mvn clean compile package
```


### 部署
部署过程参看文档:[部署 分布式训练operator](http://tianshu.org.cn/?/course/1.html)
部署过程参看文档:[部署 分布式训练operator](http://docs.dubhe.ai/docs/setup/deploy-distribute-train-operator)

+ 19
- 15
distribute-train-operator/docs/crds/distribute-train-cr.yaml View File

@@ -48,18 +48,22 @@ spec:
value: 6 value: 6
- name: NCCL_DEBUG - name: NCCL_DEBUG
value: INFO value: INFO
datasetStorage:
name: pvc-dataset
nfs:
path: {{DATASET}}
server: {{NFS}}
workspaceStorage:
name: pvc-workspace
nfs:
path: /nfs/resnet50/workspace
server: {{WORKSPACE}}
modelStorage:
name: pvc-model
nfs:
path: /nfs/resnet50/model
server: {{MODEL}}
volumeMounts:
- mountPath: /dataset
name: volume-0
- mountPath: /workspace
name: volume-1
volumes:
- name: volume-0
nfs:
path: /nfs/dubhe-prod/dataset/5/versionFile/V0001/ofrecord/train
server: {{NFS IP}}
- name: volume-1
nfs:
path: /nfs/dubhe-prod/train-manage/1/train-1-20200825173815-v0020
server: {{NFS IP}}
tolerations:
- key: "platform/node-isolate"
operator: "Equal"
value: "prod-isolate-1"
effect: "NoSchedule"

+ 6
- 6
distribute-train-operator/docs/crds/distribute-train-crd.yaml View File

@@ -45,12 +45,12 @@ spec:
type: object type: object
initContainer: initContainer:
type: object type: object
datasetStorage:
type: object
workspaceStorage:
type: object
modelStorage:
type: object
volumeMounts:
type: array
volumes:
type: array
tolerations:
type: array
required: required:
- image - image
- imagePullPolicy - imagePullPolicy


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/DistributeTrainOperatorApplication.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 6
- 6
distribute-train-operator/src/main/java/org/onebrain/operator/action/DistributeTrainOperatorManager.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
@@ -184,11 +184,11 @@ public class DistributeTrainOperatorManager {
.addToProperties("slaveResources", objectType) .addToProperties("slaveResources", objectType)
.addToProperties("nodeSelector", objectType) .addToProperties("nodeSelector", objectType)
.addToProperties("initContainer", objectType) .addToProperties("initContainer", objectType)
.addToProperties("datasetStorage", objectType)
.addToProperties("workspaceStorage", objectType)
.addToProperties("modelStorage", objectType)
.addToProperties("volumeMounts", arrayType)
.addToProperties("volumes", arrayType)
.addToProperties("tolerations", arrayType)
.withType("object") .withType("object")
.addToRequired("image", "imagePullPolicy", "size", "masterCmd", "slaveCmd", "workspaceStorage")
.addToRequired("image", "imagePullPolicy", "size", "masterCmd", "slaveCmd")
.build(); .build();
properties.put("apiVersion", stringType); properties.put("apiVersion", stringType);
properties.put("kind", stringType); properties.put("kind", stringType);


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/action/OperatorRunner.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/action/PodInfo.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/AbstractResourceCreateInfo.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 20
- 20
distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/ChildResourceCreateInfo.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
@@ -123,29 +123,27 @@ public class ChildResourceCreateInfo extends AbstractResourceCreateInfo {
private Container initContainer; private Container initContainer;


/** /**
* 工作目录挂载
* 环境变量
*/ */
private Volume workspaceVolume;
private List<EnvVar> env;


/** /**
* 数据集目录挂载
* 拥有者信息
*/ */
private Volume datasetVolume;
private OwnerReference ownerReference;


/** /**
* 模型目录挂载
* 内部映射
*/ */
private Volume modelVolume;

private List<VolumeMount> volumeMounts;
/** /**
* 环境变量
* 外部挂载
*/ */
private List<EnvVar> env;

private List<Volume> volumes;
/** /**
* 拥有者信息
* 容忍度
*/ */
private OwnerReference ownerReference;
private List<Toleration> tolerations;


/** /**
* 将分布式训练转换为K8S的资源信息 * 将分布式训练转换为K8S的资源信息
@@ -173,12 +171,14 @@ public class ChildResourceCreateInfo extends AbstractResourceCreateInfo {
info.setMasterCmd(distributeTrain.getSpec().getMasterCmd()) info.setMasterCmd(distributeTrain.getSpec().getMasterCmd())
.setSlaveCmd(distributeTrain.getSpec().getSlaveCmd()); .setSlaveCmd(distributeTrain.getSpec().getSlaveCmd());
//挂载 //挂载
Optional.ofNullable(distributeTrain.getSpec().getWorkspaceStorage())
.ifPresent(v -> info.setWorkspaceVolume(v));
Optional.ofNullable(distributeTrain.getSpec().getDatasetStorage())
.ifPresent(v -> info.setDatasetVolume(v));
Optional.ofNullable(distributeTrain.getSpec().getModelStorage())
.ifPresent(v -> info.setModelVolume(v));
Optional.ofNullable(distributeTrain.getSpec().getVolumeMounts())
.ifPresent(v -> info.setVolumeMounts(v));
Optional.ofNullable(distributeTrain.getSpec().getVolumes())
.ifPresent(v -> info.setVolumes(v));

//容忍度
Optional.ofNullable(distributeTrain.getSpec().getTolerations())
.ifPresent(v -> info.setTolerations(v));


//主从两组资源限制 //主从两组资源限制
Optional.ofNullable(distributeTrain.getSpec().getMasterResources()) Optional.ofNullable(distributeTrain.getSpec().getMasterResources())


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/JobDeployer.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/ServiceDeployer.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/StatefulSetDeployer.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 9
- 35
distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/impl/BaseJobDeployer.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
@@ -33,6 +33,7 @@ import io.fabric8.kubernetes.api.model.batch.JobBuilder;
import org.onebrain.operator.action.deployer.ChildResourceCreateInfo; import org.onebrain.operator.action.deployer.ChildResourceCreateInfo;
import org.onebrain.operator.action.deployer.JobDeployer; import org.onebrain.operator.action.deployer.JobDeployer;
import org.onebrain.operator.constants.KubeConstants; import org.onebrain.operator.constants.KubeConstants;
import org.springframework.util.CollectionUtils;


import java.util.*; import java.util.*;


@@ -46,13 +47,7 @@ import static org.onebrain.operator.constants.NumberConstant.NUMBER_22;
*/ */
public class BaseJobDeployer implements JobDeployer<ChildResourceCreateInfo> { public class BaseJobDeployer implements JobDeployer<ChildResourceCreateInfo> {


public static final String PVC_WORKSPACE = "pvc-workspace";
public static final String SSH = "ssh"; public static final String SSH = "ssh";
public static final String WORKSPACE = "/workspace";
public static final String PVC_DATASET = "pvc-dataset";
public static final String DATASET = "/dataset";
public static final String PVC_MODEL = "pvc-model";
public static final String MODEL = "/model";
public static final String MEMORY = "Memory"; public static final String MEMORY = "Memory";
public static final String DEV_SHM = "/dev/shm"; public static final String DEV_SHM = "/dev/shm";
public static final String BIN_BASH = "/bin/bash"; public static final String BIN_BASH = "/bin/bash";
@@ -74,6 +69,11 @@ public class BaseJobDeployer implements JobDeployer<ChildResourceCreateInfo> {
//挂载 //挂载
List<VolumeMount> volumeMounts = buildVolumeMounts(volumes); List<VolumeMount> volumeMounts = buildVolumeMounts(volumes);


if (!CollectionUtils.isEmpty(info.getVolumes()) && !CollectionUtils.isEmpty(info.getVolumeMounts())){
volumes.addAll(info.getVolumes());
volumeMounts.addAll(info.getVolumeMounts());
}

container.setVolumeMounts(volumeMounts); container.setVolumeMounts(volumeMounts);


//启动命令 //启动命令
@@ -123,6 +123,7 @@ public class BaseJobDeployer implements JobDeployer<ChildResourceCreateInfo> {
.addToContainers(container) .addToContainers(container)
.addToVolumes(volumes.toArray(new Volume[volumes.size()])) .addToVolumes(volumes.toArray(new Volume[volumes.size()]))
.withRestartPolicy(RESTART_POLICY_NEVER) .withRestartPolicy(RESTART_POLICY_NEVER)
.withTolerations(info.getTolerations())
.endSpec() .endSpec()
.endTemplate() .endTemplate()
.endSpec(); .endSpec();
@@ -192,9 +193,6 @@ public class BaseJobDeployer implements JobDeployer<ChildResourceCreateInfo> {
private List<Volume> buildVolumes(ChildResourceCreateInfo info){ private List<Volume> buildVolumes(ChildResourceCreateInfo info){
//存储卷 //存储卷
List<Volume> volumes = new LinkedList<>(); List<Volume> volumes = new LinkedList<>();
Optional.ofNullable(info.getWorkspaceVolume()).ifPresent(v-> volumes.add(v));
Optional.ofNullable(info.getDatasetVolume()).ifPresent(v-> volumes.add(v));
Optional.ofNullable(info.getModelVolume()).ifPresent(v-> volumes.add(v));
//shm默认就有 //shm默认就有
volumes.add(new VolumeBuilder() volumes.add(new VolumeBuilder()
.withName(KubeConstants.VOLUME_SHM) .withName(KubeConstants.VOLUME_SHM)
@@ -213,30 +211,6 @@ public class BaseJobDeployer implements JobDeployer<ChildResourceCreateInfo> {
*/ */
private List<VolumeMount> buildVolumeMounts(List<Volume> volumes) { private List<VolumeMount> buildVolumeMounts(List<Volume> volumes) {
List<VolumeMount> volumeMounts = new LinkedList<>(); List<VolumeMount> volumeMounts = new LinkedList<>();
for (Volume volume : volumes) {
if(PVC_WORKSPACE.equals(volume.getName())){
volumeMounts.add(new VolumeMountBuilder()
.withName(volume.getName())
.withMountPath(WORKSPACE)
.build());
continue;
}
if(PVC_DATASET.equals(volume.getName())){
volumeMounts.add(new VolumeMountBuilder()
.withName(volume.getName())
.withMountPath(DATASET)
.build());
continue;
}
if(PVC_MODEL.equals(volume.getName())){
volumeMounts.add(new VolumeMountBuilder()
.withName(volume.getName())
.withMountPath(MODEL)
.build());
continue;
}
}

volumeMounts.add(new VolumeMountBuilder() volumeMounts.add(new VolumeMountBuilder()
.withName(KubeConstants.VOLUME_SHM) .withName(KubeConstants.VOLUME_SHM)
.withMountPath(DEV_SHM) .withMountPath(DEV_SHM)


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/impl/BaseServiceDeployer.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 9
- 35
distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/impl/BaseStatefulSetDeployer.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
@@ -35,6 +35,7 @@ import io.fabric8.kubernetes.api.model.apps.StatefulSetBuilder;
import org.onebrain.operator.action.deployer.ChildResourceCreateInfo; import org.onebrain.operator.action.deployer.ChildResourceCreateInfo;
import org.onebrain.operator.action.deployer.StatefulSetDeployer; import org.onebrain.operator.action.deployer.StatefulSetDeployer;
import org.onebrain.operator.constants.KubeConstants; import org.onebrain.operator.constants.KubeConstants;
import org.springframework.util.CollectionUtils;


import java.util.Arrays; import java.util.Arrays;
import java.util.Collections; import java.util.Collections;
@@ -55,12 +56,6 @@ import static org.onebrain.operator.constants.NumberConstant.NUMBER_22;
public class BaseStatefulSetDeployer implements StatefulSetDeployer<ChildResourceCreateInfo> { public class BaseStatefulSetDeployer implements StatefulSetDeployer<ChildResourceCreateInfo> {


public static final String SSH = "ssh"; public static final String SSH = "ssh";
public static final String PVC_WORKSPACE = "pvc-workspace";
public static final String WORKSPACE = "/workspace";
public static final String PVC_DATASET = "pvc-dataset";
public static final String DATASET = "/dataset";
public static final String PVC_MODEL = "pvc-model";
public static final String MODEL = "/model";
public static final String MEMORY = "Memory"; public static final String MEMORY = "Memory";
public static final String DEV_SHM = "/dev/shm"; public static final String DEV_SHM = "/dev/shm";
public static final String BIN_BASH = "/bin/bash"; public static final String BIN_BASH = "/bin/bash";
@@ -83,6 +78,11 @@ public class BaseStatefulSetDeployer implements StatefulSetDeployer<ChildResourc
//挂载 //挂载
List<VolumeMount> volumeMounts = buildVolumeMounts(volumes); List<VolumeMount> volumeMounts = buildVolumeMounts(volumes);


if (!CollectionUtils.isEmpty(info.getVolumes()) && !CollectionUtils.isEmpty(info.getVolumeMounts())){
volumes.addAll(info.getVolumes());
volumeMounts.addAll(info.getVolumeMounts());
}

container.setVolumeMounts(volumeMounts); container.setVolumeMounts(volumeMounts);


//启动命令 //启动命令
@@ -126,6 +126,7 @@ public class BaseStatefulSetDeployer implements StatefulSetDeployer<ChildResourc
.withTerminationGracePeriodSeconds(LONG_NUMBER_60) .withTerminationGracePeriodSeconds(LONG_NUMBER_60)
.addToContainers(container) .addToContainers(container)
.addToVolumes(volumes.toArray(new Volume[0])) .addToVolumes(volumes.toArray(new Volume[0]))
.withTolerations(info.getTolerations())
.endSpec() .endSpec()
.endTemplate() .endTemplate()
.endSpec(); .endSpec();
@@ -191,9 +192,6 @@ public class BaseStatefulSetDeployer implements StatefulSetDeployer<ChildResourc
*/ */
private List<Volume> buildVolumes(ChildResourceCreateInfo info) { private List<Volume> buildVolumes(ChildResourceCreateInfo info) {
List<Volume> volumes = new LinkedList<>(); List<Volume> volumes = new LinkedList<>();
Optional.ofNullable(info.getWorkspaceVolume()).ifPresent(v-> volumes.add(v));
Optional.ofNullable(info.getDatasetVolume()).ifPresent(v-> volumes.add(v));
Optional.ofNullable(info.getModelVolume()).ifPresent(v-> volumes.add(v));


//shm默认就有 //shm默认就有
volumes.add(new VolumeBuilder() volumes.add(new VolumeBuilder()
@@ -213,30 +211,6 @@ public class BaseStatefulSetDeployer implements StatefulSetDeployer<ChildResourc
*/ */
private List<VolumeMount> buildVolumeMounts(List<Volume> volumes) { private List<VolumeMount> buildVolumeMounts(List<Volume> volumes) {
List<VolumeMount> volumeMounts=new LinkedList<>(); List<VolumeMount> volumeMounts=new LinkedList<>();
for (Volume volume : volumes) {
if(PVC_WORKSPACE.equals(volume.getName())){
volumeMounts.add(new VolumeMountBuilder()
.withName(volume.getName())
.withMountPath(WORKSPACE)
.build());
continue;
}
if(PVC_DATASET.equals(volume.getName())){
volumeMounts.add(new VolumeMountBuilder()
.withName(volume.getName())
.withMountPath(DATASET)
.build());
continue;
}
if(PVC_MODEL.equals(volume.getName())){
volumeMounts.add(new VolumeMountBuilder()
.withName(volume.getName())
.withMountPath(MODEL)
.build());
continue;
}
}

volumeMounts.add(new VolumeMountBuilder() volumeMounts.add(new VolumeMountBuilder()
.withName(KubeConstants.VOLUME_SHM) .withName(KubeConstants.VOLUME_SHM)
.withMountPath(DEV_SHM) .withMountPath(DEV_SHM)


+ 14
- 14
distribute-train-operator/src/main/java/org/onebrain/operator/action/handler/AddActionHandler.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
@@ -143,7 +143,7 @@ public class AddActionHandler implements DistributeTrainActionHandler {
* @param distributeTrain * @param distributeTrain
*/ */
public void doAction(DistributeTrain distributeTrain) { public void doAction(DistributeTrain distributeTrain) {
log.info("doAction=>distributeTrain : 【{}】", distributeTrain);
log.info("doAction=>distributeTrain : 【{}】", distributeTrain.getMetadata().getName());
ChildResourceCreateInfo info = null; ChildResourceCreateInfo info = null;
try { try {
//redis重复检查 //redis重复检查
@@ -200,7 +200,7 @@ public class AddActionHandler implements DistributeTrainActionHandler {
*/ */
@Override @Override
public void handlerAction(DistributeTrain distributeTrain) { public void handlerAction(DistributeTrain distributeTrain) {
log.info("handlerAction=>distributeTrain : 【{}】", distributeTrain);
log.info("handlerAction=>distributeTrain : 【{}】", distributeTrain.getMetadata().getName());
HandlerActionTask handlerActionTask = new HandlerActionTask(distributeTrain); HandlerActionTask handlerActionTask = new HandlerActionTask(distributeTrain);
pool.getActiveCount(); pool.getActiveCount();
pool.execute(handlerActionTask); pool.execute(handlerActionTask);
@@ -211,7 +211,7 @@ public class AddActionHandler implements DistributeTrainActionHandler {
* @param distributeTrain 分布式训练 * @param distributeTrain 分布式训练
*/ */
private void validateParams(DistributeTrain distributeTrain) { private void validateParams(DistributeTrain distributeTrain) {
log.info("validateParams=>distributeTrain : 【{}】", distributeTrain);
log.info("validateParams=>distributeTrain : 【{}】", distributeTrain.getMetadata().getName());
Integer size = distributeTrain.getSpec().getSize(); Integer size = distributeTrain.getSpec().getSize();
if (size < NUMBER_2) { if (size < NUMBER_2) {
throw new OperatorException("size must be greater than 1"); throw new OperatorException("size must be greater than 1");
@@ -254,7 +254,7 @@ public class AddActionHandler implements DistributeTrainActionHandler {
* @param info 资源信息 * @param info 资源信息
*/ */
private void createStatefulSet(ChildResourceCreateInfo info) { private void createStatefulSet(ChildResourceCreateInfo info) {
log.info("createStatefulSet=>childResourceCreateInfo : 【{}】", info);
log.info("createStatefulSet=>childResourceCreateInfo : 【{}】", info.getParentName());
StatefulSet statefulSet = client.apps().statefulSets() StatefulSet statefulSet = client.apps().statefulSets()
.inNamespace(info.getNamespace()) .inNamespace(info.getNamespace())
.withName(info.getStatefulSetName()).get(); .withName(info.getStatefulSetName()).get();
@@ -298,7 +298,7 @@ public class AddActionHandler implements DistributeTrainActionHandler {
* @param info Job信息 * @param info Job信息
*/ */
private void createJob(ChildResourceCreateInfo info) { private void createJob(ChildResourceCreateInfo info) {
log.info("createJob=>childResourceCreateInfo : 【{}】", info);
log.info("createJob=>childResourceCreateInfo : 【{}】", info.getParentName());
Job job = client.batch().jobs() Job job = client.batch().jobs()
.inNamespace(info.getNamespace()) .inNamespace(info.getNamespace())
.withName(info.getJobName()).get(); .withName(info.getJobName()).get();
@@ -311,7 +311,7 @@ public class AddActionHandler implements DistributeTrainActionHandler {
JobDeployer deployer = new BaseJobDeployer(); JobDeployer deployer = new BaseJobDeployer();
JobBuilder builder = deployer.deploy(info); JobBuilder builder = deployer.deploy(info);
job = builder.build(); job = builder.build();
log.info("job is : 【{}】", job);
log.info("job is : 【{}】", job.getMetadata().getName());
client.batch().jobs().create(job); client.batch().jobs().create(job);
log.info("create job【{}】 successfully", job.getMetadata().getName()); log.info("create job【{}】 successfully", job.getMetadata().getName());
} }
@@ -394,7 +394,7 @@ public class AddActionHandler implements DistributeTrainActionHandler {
* @param slavePods * @param slavePods
*/ */
private void collectChildPodInfo(ChildResourceCreateInfo info, Pod masterPod, List<Pod> slavePods) { private void collectChildPodInfo(ChildResourceCreateInfo info, Pod masterPod, List<Pod> slavePods) {
log.info("collectChildPodInfo=>childResourceCreateInfo : 【{}】, masterPod : 【{}】, slavePods : 【{}】", info, masterPod, slavePods);
log.info("collectChildPodInfo=>childResourceCreateInfo : 【{}】, masterPod : 【{}】", info.getParentName(), masterPod.getMetadata().getName());
String key = info.getOwnerReference().getUid(); String key = info.getOwnerReference().getUid();
if (dtMap.containsKey(key)) { if (dtMap.containsKey(key)) {
dtMap.remove(key); dtMap.remove(key);
@@ -527,7 +527,7 @@ public class AddActionHandler implements DistributeTrainActionHandler {
* @param info * @param info
*/ */
private void recycleCr(ChildResourceCreateInfo info) { private void recycleCr(ChildResourceCreateInfo info) {
log.info("recycleCr=>childResourceCreateInfo : 【{}】", info);
log.info("recycleCr=>childResourceCreateInfo : 【{}】", info.getParentName());
Optional.ofNullable(DistributeTrainClientHolder.getClient()) Optional.ofNullable(DistributeTrainClientHolder.getClient())
.ifPresent(distributeTrainClient -> { .ifPresent(distributeTrainClient -> {
ObjectMeta metadata = new ObjectMeta(); ObjectMeta metadata = new ObjectMeta();
@@ -542,7 +542,7 @@ public class AddActionHandler implements DistributeTrainActionHandler {


/**更新状态*/ /**更新状态*/
private void updateStatus(ChildResourceCreateInfo info, DistributeTrain distributeTrain) { private void updateStatus(ChildResourceCreateInfo info, DistributeTrain distributeTrain) {
log.info("updateStatus=>childResourceCreateInfo : 【{}】, distributeTrain : 【{}】", info, distributeTrain);
log.info("updateStatus=>childResourceCreateInfo : 【{}】, distributeTrain : 【{}】", info.getParentName(), distributeTrain.getMetadata().getName());
if (distributeTrain.getStatus() == null) { if (distributeTrain.getStatus() == null) {
distributeTrain.setStatus(new DistributeTrainStatus()); distributeTrain.setStatus(new DistributeTrainStatus());
} }
@@ -568,7 +568,7 @@ public class AddActionHandler implements DistributeTrainActionHandler {
* @return List<Pod> 分布式相关Pod集合 * @return List<Pod> 分布式相关Pod集合
*/ */
private List<Pod> getPods(ChildResourceCreateInfo info) { private List<Pod> getPods(ChildResourceCreateInfo info) {
log.info("getPods=>childResourceCreateInfo : 【{}】", info);
log.info("getPods=>childResourceCreateInfo : 【{}】", info.getParentName());
List<Pod> pods = Lists.newArrayList(); List<Pod> pods = Lists.newArrayList();
pods.add(getMasterPod(info)); pods.add(getMasterPod(info));
pods.addAll(getSlavePods(info)); pods.addAll(getSlavePods(info));
@@ -584,7 +584,7 @@ public class AddActionHandler implements DistributeTrainActionHandler {
* @return Pod Master节点对应的Pod * @return Pod Master节点对应的Pod
*/ */
private Pod getMasterPod(ChildResourceCreateInfo info) { private Pod getMasterPod(ChildResourceCreateInfo info) {
log.info("getMasterPod=>childResourceCreateInfo : 【{}】", info);
log.info("getMasterPod=>childResourceCreateInfo : 【{}】", info.getParentName());
List<Pod> masterPods = client.pods().inNamespace(info.getNamespace()) List<Pod> masterPods = client.pods().inNamespace(info.getNamespace())
.withLabel(JOB_LABEL, info.getJobName()) .withLabel(JOB_LABEL, info.getJobName())
.list().getItems(); .list().getItems();
@@ -600,7 +600,7 @@ public class AddActionHandler implements DistributeTrainActionHandler {
* @return List<Pod> Slave节点对应的Pod集合 * @return List<Pod> Slave节点对应的Pod集合
*/ */
private List<Pod> getSlavePods(ChildResourceCreateInfo info) { private List<Pod> getSlavePods(ChildResourceCreateInfo info) {
log.info("getSlavePods=>childResourceCreateInfo : 【{}】", info);
log.info("getSlavePods=>childResourceCreateInfo : 【{}】", info.getParentName());
//取得从的所有pod //取得从的所有pod
List<Pod> slavePods = client.pods().inNamespace(info.getNamespace()) List<Pod> slavePods = client.pods().inNamespace(info.getNamespace())
.withLabel(STATEFULSET_LABEL, info.getStatefulSetName()) .withLabel(STATEFULSET_LABEL, info.getStatefulSetName())


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/action/handler/DeleteActionHandler.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/action/handler/DistributeTrainActionHandler.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/api/pod/DefaultPodExecListener.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/api/pod/PodApi.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/api/pod/StdPodExecListener.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/config/KubeConfig.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/constants/CrdConstants.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/constants/KubeConstants.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/constants/NumberConstant.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/context/KubeContext.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/controller/DistributeTrainController.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/crd/DistributeTrain.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/crd/DistributeTrainList.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 16
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/crd/DistributeTrainSpec.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
@@ -105,4 +105,18 @@ public class DistributeTrainSpec implements KubernetesResource {
*/ */
private Volume modelStorage; private Volume modelStorage;


/**
* 内部映射
*/
private List<VolumeMount> volumeMounts;
/**
* 外部挂载
*/
private List<Volume> volumes;

/**
* 容忍度
*/
private List<Toleration> tolerations;

} }

+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/crd/DistributeTrainStatus.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/crd/DoneableDistributeTrain.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/enums/AccessModeEnum.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/exception/OperatorException.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/properties/KubeProperties.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/redis/AbstractKeyPrefix.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/redis/RedisService.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/redis/key/OperatorKey.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/utils/DistributeTrainClientHolder.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/utils/FastjsonUtils.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/utils/IOUtils.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/utils/RedisUtils.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/utils/SpringContextHolder.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 3
- 3
distribute-train-operator/src/main/java/org/onebrain/operator/watcher/JobHandler.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
@@ -58,7 +58,7 @@ public class JobHandler {
* @param job * @param job
*/ */
public void handleJob(Job job) { public void handleJob(Job job) {
log.info("handleJob=>job : 【{}】", job);
log.info("handleJob=>job : 【{}】", job.getMetadata().getName());


//筛选出DistributeTrain下的job //筛选出DistributeTrain下的job
List<OwnerReference> ownerReferences = job.getMetadata().getOwnerReferences(); List<OwnerReference> ownerReferences = job.getMetadata().getOwnerReferences();


+ 3
- 3
distribute-train-operator/src/main/java/org/onebrain/operator/watcher/JobWatcher.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
@@ -51,7 +51,7 @@ public class JobWatcher implements Watcher<Job> {
*/ */
@Override @Override
public void eventReceived(Action action, Job job) { public void eventReceived(Action action, Job job) {
log.info("Job Event received: {} at {}", job.getMetadata().getUid(), job.getMetadata().getCreationTimestamp());
log.info("Job Event received: {} action {}", job.getMetadata().getName(), action.toString());
jobHandler.handleJob(job); jobHandler.handleJob(job);
} }




+ 2
- 2
distribute-train-operator/src/main/java/org/onebrain/operator/watcher/KubeWatcherManager.java View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.


+ 29
- 32
distribute-train-operator/src/main/resources/shell/pretreatment View File

@@ -1,46 +1,43 @@
#!/bin/bash #!/bin/bash
if [ ! -f "/etc/init.d/ssh" ]; then
if [ ! -f "/etc/redhat-release" ]; then
if [ -f "/etc/lsb-release" ]; then
if [ ! -f "/etc/init.d/ssh" ]; then
echo 'apt install -y openssh-server' >> pretreatment.log echo 'apt install -y openssh-server' >> pretreatment.log
apt update >> pretreatment.log apt update >> pretreatment.log
apt install -y openssh-server >> pretreatment.log apt install -y openssh-server >> pretreatment.log
echo '/etc/init.d/ssh start' >> pretreatment.log
/etc/init.d/ssh start >> pretreatment.log
fi fi
if [ ! -f "/etc/lsb-release" ]; then
echo 'yum install -y sshd' >> pretreatment.log
yum update >> pretreatment.log
yum install -y sshd >> pretreatment.log
fi
fi
echo '/etc/init.d/ssh start' >> pretreatment.log
/etc/init.d/ssh start >> pretreatment.log
if [ -f "/etc/redhat-release" ]; then
if command -v nslookup >/dev/null 2>&1; then
echo 'exists nslookup' >> pretreatment.log
else
echo 'yum install dnsutils jq' >> pretreatment.log
yum install -y dnsutils >> pretreatment.log
yum install -y jq >> pretreatment.log
fi
if command -v nslookup >/dev/null 2>&1; then
echo 'exists nslookup' >> pretreatment.log
else
echo 'yum install dnsutils jq' >> pretreatment.log
yum install -y dnsutils >> pretreatment.log
yum install -y jq >> pretreatment.log
fi
fi

if [ -f "/etc/lsb-release" ]; then
if command -v jq >/dev/null 2>&1; then
if command -v jq >/dev/null 2>&1; then
echo 'exists jq' >> pretreatment.log echo 'exists jq' >> pretreatment.log
else
else
echo 'apt install jq' >> pretreatment.log echo 'apt install jq' >> pretreatment.log
apt install -y jq >> pretreatment.log apt install -y jq >> pretreatment.log
fi fi
if command -v nslookup >/dev/null 2>&1; then
if command -v nslookup >/dev/null 2>&1; then
echo 'exists nslookup' >> pretreatment.log echo 'exists nslookup' >> pretreatment.log
else
else
echo 'apt install dnsutils' >> pretreatment.log echo 'apt install dnsutils' >> pretreatment.log
apt install -y dnsutils >> pretreatment.log apt install -y dnsutils >> pretreatment.log
fi fi
fi fi
if [ -f "/etc/redhat-release" ]; then
if [ ! -f "/usr/sbin/sshd" ]; then
echo 'yum install -y sshd' >> pretreatment.log
yum update >> pretreatment.log
yum install -y sshd >> pretreatment.log
echo 'sshd start' >> pretreatment.log
service sshd start >> pretreatment.log
fi
if command -v nslookup >/dev/null 2>&1; then
echo 'exists nslookup' >> pretreatment.log
else
echo 'yum install -y bind-utils' >> pretreatment.log
yum install -y bind-utils >> pretreatment.log
fi
if command -v jq >/dev/null 2>&1; then
echo 'exists jq' >> pretreatment.log
else
echo 'yum install -y jq' >> pretreatment.log
yum install -y jq >> pretreatment.log
fi
fi

+ 0
- 43
distribute-train-operator/src/test/java/org/onebrain/operator/DistributeTrainOperatorApplicationTests.java View File

@@ -1,43 +0,0 @@
/**
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* =============================================================
*/


package org.onebrain.operator;

import org.onebrain.operator.api.pod.PodApi;
import org.onebrain.operator.constants.KubeConstants;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;

import java.io.File;
import java.net.URISyntaxException;
import java.net.URL;

/**
 * Spring Boot test class holding a manual check that pushes the bundled SSH private key
 * into a pod through {@link PodApi}.
 */
@SpringBootTest
public class DistributeTrainOperatorApplicationTests {

    @Autowired
    private PodApi podApi;

    /**
     * Copies the classpath resource {@code key/id_rsa} to {@code /root/.ssh/id_rsa} via
     * {@link PodApi#copyToPod}.
     *
     * <p>The {@code @Test} annotation below is commented out, so this method is not marked
     * as a test. The first two arguments are hard-coded literals — presumably the namespace
     * and the name of one specific pod (TODO confirm against {@code PodApi}).
     *
     * @throws URISyntaxException if the resource URL cannot be converted to a URI
     * @throws IllegalStateException if {@code key/id_rsa} is not present on the classpath
     */
    // @Test
    public void contextLoads() throws URISyntaxException {
        final URL url = getClass().getClassLoader().getResource("key/id_rsa");
        if (url == null) {
            // getResource signals a missing resource with null; fail with a clear message
            // instead of a NullPointerException on url.toURI().
            throw new IllegalStateException("Classpath resource not found: key/id_rsa");
        }
        File file = new File(url.toURI());
        podApi.copyToPod("default", "distribute-train-test-job-sv2dj", KubeConstants.MASTER_CONTAINER_NAME, file, "/root/.ssh/id_rsa");
    }

}

Loading…
Cancel
Save