diff --git a/distribute-train-operator/.gitignore b/distribute-train-operator/.gitignore new file mode 100644 index 0000000..a2a3040 --- /dev/null +++ b/distribute-train-operator/.gitignore @@ -0,0 +1,31 @@ +HELP.md +target/ +!.mvn/wrapper/maven-wrapper.jar +!**/src/main/** +!**/src/test/** + +### STS ### +.apt_generated +.classpath +.factorypath +.project +.settings +.springBeans +.sts4-cache + +### IntelliJ IDEA ### +.idea +*.iws +*.iml +*.ipr + +### NetBeans ### +/nbproject/private/ +/nbbuild/ +/dist/ +/nbdist/ +/.nb-gradle/ +build/ + +### VS Code ### +.vscode/ diff --git a/distribute-train-operator/README.md b/distribute-train-operator/README.md index 926fb8b..64ab334 100644 --- a/distribute-train-operator/README.md +++ b/distribute-train-operator/README.md @@ -23,4 +23,4 @@ mvn clean compile package ``` ### 部署 -部署过程参看文档:[部署 分布式训练operator](http://tianshu.org.cn/?/course/1.html) +部署过程参看文档:[部署 分布式训练operator](http://docs.dubhe.ai/docs/setup/deploy-distribute-train-operator) diff --git a/distribute-train-operator/docs/crds/distribute-train-cr.yaml b/distribute-train-operator/docs/crds/distribute-train-cr.yaml index 40b2f72..af9fe29 100644 --- a/distribute-train-operator/docs/crds/distribute-train-cr.yaml +++ b/distribute-train-operator/docs/crds/distribute-train-cr.yaml @@ -48,18 +48,22 @@ spec: value: 6 - name: NCCL_DEBUG value: INFO - datasetStorage: - name: pvc-dataset - nfs: - path: {{DATASET}} - server: {{NFS}} - workspaceStorage: - name: pvc-workspace - nfs: - path: /nfs/resnet50/workspace - server: {{WORKSPACE}} - modelStorage: - name: pvc-model - nfs: - path: /nfs/resnet50/model - server: {{MODEL}} \ No newline at end of file + volumeMounts: + - mountPath: /dataset + name: volume-0 + - mountPath: /workspace + name: volume-1 + volumes: + - name: volume-0 + nfs: + path: /nfs/dubhe-prod/dataset/5/versionFile/V0001/ofrecord/train + server: {{NFS IP}} + - name: volume-1 + nfs: + path: /nfs/dubhe-prod/train-manage/1/train-1-20200825173815-v0020 + server: {{NFS IP}} + tolerations: + - key: "platform/node-isolate" + operator: "Equal" + value: "prod-isolate-1" + effect: "NoSchedule" \ No newline at end of file diff --git a/distribute-train-operator/docs/crds/distribute-train-crd.yaml b/distribute-train-operator/docs/crds/distribute-train-crd.yaml index 07bea5a..c5a4133 100644 --- a/distribute-train-operator/docs/crds/distribute-train-crd.yaml +++ b/distribute-train-operator/docs/crds/distribute-train-crd.yaml @@ -45,12 +45,12 @@ spec: type: object initContainer: type: object - datasetStorage: - type: object - workspaceStorage: - type: object - modelStorage: - type: object + volumeMounts: + type: array + volumes: + type: array + tolerations: + type: array required: - image - imagePullPolicy diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/DistributeTrainOperatorApplication.java b/distribute-train-operator/src/main/java/org/onebrain/operator/DistributeTrainOperatorApplication.java index 1479e00..f27190a 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/DistributeTrainOperatorApplication.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/DistributeTrainOperatorApplication.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/action/DistributeTrainOperatorManager.java b/distribute-train-operator/src/main/java/org/onebrain/operator/action/DistributeTrainOperatorManager.java index 7756545..dab9f5c 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/action/DistributeTrainOperatorManager.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/action/DistributeTrainOperatorManager.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -184,11 +184,11 @@ public class DistributeTrainOperatorManager { .addToProperties("slaveResources", objectType) .addToProperties("nodeSelector", objectType) .addToProperties("initContainer", objectType) - .addToProperties("datasetStorage", objectType) - .addToProperties("workspaceStorage", objectType) - .addToProperties("modelStorage", objectType) + .addToProperties("volumeMounts", arrayType) + .addToProperties("volumes", arrayType) + .addToProperties("tolerations", arrayType) .withType("object") - .addToRequired("image", "imagePullPolicy", "size", "masterCmd", "slaveCmd", "workspaceStorage") + .addToRequired("image", "imagePullPolicy", "size", "masterCmd", "slaveCmd") .build(); properties.put("apiVersion", stringType); properties.put("kind", stringType); diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/action/OperatorRunner.java b/distribute-train-operator/src/main/java/org/onebrain/operator/action/OperatorRunner.java index e6f5002..ed0d6de 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/action/OperatorRunner.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/action/OperatorRunner.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/action/PodInfo.java b/distribute-train-operator/src/main/java/org/onebrain/operator/action/PodInfo.java index b863ab3..fd1df6b 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/action/PodInfo.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/action/PodInfo.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/AbstractResourceCreateInfo.java b/distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/AbstractResourceCreateInfo.java index a8f031e..bba5838 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/AbstractResourceCreateInfo.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/AbstractResourceCreateInfo.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/ChildResourceCreateInfo.java b/distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/ChildResourceCreateInfo.java index 94d27eb..c4ee78c 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/ChildResourceCreateInfo.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/ChildResourceCreateInfo.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -123,29 +123,27 @@ public class ChildResourceCreateInfo extends AbstractResourceCreateInfo { private Container initContainer; /** - * 工作目录挂载 + * 环境变量 */ - private Volume workspaceVolume; + private List env; /** - * 数据集目录挂载 + * 拥有者信息 */ - private Volume datasetVolume; + private OwnerReference ownerReference; /** - * 模型目录挂载 + * 内部映射 */ - private Volume modelVolume; - + private List volumeMounts; /** - * 环境变量 + * 外部挂载 */ - private List env; - + private List volumes; /** - * 拥有者信息 + * 容忍度 */ - private OwnerReference ownerReference; + private List tolerations; /** * 将分布式训练转换为K8S的资源信息 @@ -173,12 +171,14 @@ public class ChildResourceCreateInfo extends AbstractResourceCreateInfo { info.setMasterCmd(distributeTrain.getSpec().getMasterCmd()) .setSlaveCmd(distributeTrain.getSpec().getSlaveCmd()); //挂载 - Optional.ofNullable(distributeTrain.getSpec().getWorkspaceStorage()) - .ifPresent(v -> info.setWorkspaceVolume(v)); - Optional.ofNullable(distributeTrain.getSpec().getDatasetStorage()) - .ifPresent(v -> info.setDatasetVolume(v)); - Optional.ofNullable(distributeTrain.getSpec().getModelStorage()) - .ifPresent(v -> info.setModelVolume(v)); + Optional.ofNullable(distributeTrain.getSpec().getVolumeMounts()) + .ifPresent(v -> info.setVolumeMounts(v)); + Optional.ofNullable(distributeTrain.getSpec().getVolumes()) + .ifPresent(v -> info.setVolumes(v)); + + //容忍度 + Optional.ofNullable(distributeTrain.getSpec().getTolerations()) + .ifPresent(v -> info.setTolerations(v)); //主从两组资源限制 Optional.ofNullable(distributeTrain.getSpec().getMasterResources()) diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/JobDeployer.java b/distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/JobDeployer.java index 68f4161..6b22951 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/JobDeployer.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/JobDeployer.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/ServiceDeployer.java b/distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/ServiceDeployer.java index 4221c04..ba6fdb6 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/ServiceDeployer.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/ServiceDeployer.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/StatefulSetDeployer.java b/distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/StatefulSetDeployer.java index 3be8d96..3c666b9 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/StatefulSetDeployer.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/StatefulSetDeployer.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/impl/BaseJobDeployer.java b/distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/impl/BaseJobDeployer.java index 68bf788..6a5ff0c 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/impl/BaseJobDeployer.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/impl/BaseJobDeployer.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,6 +33,7 @@ import io.fabric8.kubernetes.api.model.batch.JobBuilder; import org.onebrain.operator.action.deployer.ChildResourceCreateInfo; import org.onebrain.operator.action.deployer.JobDeployer; import org.onebrain.operator.constants.KubeConstants; +import org.springframework.util.CollectionUtils; import java.util.*; @@ -46,13 +47,7 @@ import static org.onebrain.operator.constants.NumberConstant.NUMBER_22; */ public class BaseJobDeployer implements JobDeployer { - public static final String PVC_WORKSPACE = "pvc-workspace"; public static final String SSH = "ssh"; - public static final String WORKSPACE = "/workspace"; - public static final String PVC_DATASET = "pvc-dataset"; - public static final String DATASET = "/dataset"; - public static final String PVC_MODEL = "pvc-model"; - public static final String MODEL = "/model"; public static final String MEMORY = "Memory"; public static final String DEV_SHM = "/dev/shm"; public static final String BIN_BASH = "/bin/bash"; @@ -74,6 +69,11 @@ public class BaseJobDeployer implements JobDeployer { //挂载 List volumeMounts = buildVolumeMounts(volumes); + if (!CollectionUtils.isEmpty(info.getVolumes()) && !CollectionUtils.isEmpty(info.getVolumeMounts())){ + volumes.addAll(info.getVolumes()); + volumeMounts.addAll(info.getVolumeMounts()); + } + container.setVolumeMounts(volumeMounts); //启动命令 @@ -123,6 +123,7 @@ public class BaseJobDeployer implements JobDeployer { .addToContainers(container) .addToVolumes(volumes.toArray(new Volume[volumes.size()])) .withRestartPolicy(RESTART_POLICY_NEVER) + .withTolerations(info.getTolerations()) .endSpec() .endTemplate() .endSpec(); @@ -192,9 +193,6 @@ public class BaseJobDeployer implements JobDeployer { private List buildVolumes(ChildResourceCreateInfo info){ //存储卷 List volumes = new LinkedList<>(); - Optional.ofNullable(info.getWorkspaceVolume()).ifPresent(v-> volumes.add(v)); - Optional.ofNullable(info.getDatasetVolume()).ifPresent(v-> volumes.add(v)); - Optional.ofNullable(info.getModelVolume()).ifPresent(v-> volumes.add(v)); //shm默认就有 volumes.add(new VolumeBuilder() .withName(KubeConstants.VOLUME_SHM) @@ -213,30 +211,6 @@ public class BaseJobDeployer implements JobDeployer { */ private List buildVolumeMounts(List volumes) { List volumeMounts = new LinkedList<>(); - for (Volume volume : volumes) { - if(PVC_WORKSPACE.equals(volume.getName())){ - volumeMounts.add(new VolumeMountBuilder() - .withName(volume.getName()) - .withMountPath(WORKSPACE) - .build()); - continue; - } - if(PVC_DATASET.equals(volume.getName())){ - volumeMounts.add(new VolumeMountBuilder() - .withName(volume.getName()) - .withMountPath(DATASET) - .build()); - continue; - } - if(PVC_MODEL.equals(volume.getName())){ - volumeMounts.add(new VolumeMountBuilder() - .withName(volume.getName()) - .withMountPath(MODEL) - .build()); - continue; - } - } - volumeMounts.add(new VolumeMountBuilder() .withName(KubeConstants.VOLUME_SHM) .withMountPath(DEV_SHM) diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/impl/BaseServiceDeployer.java b/distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/impl/BaseServiceDeployer.java index e18a625..d3e8be7 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/impl/BaseServiceDeployer.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/impl/BaseServiceDeployer.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/impl/BaseStatefulSetDeployer.java b/distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/impl/BaseStatefulSetDeployer.java index c51e43a..d9c07f8 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/impl/BaseStatefulSetDeployer.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/action/deployer/impl/BaseStatefulSetDeployer.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,6 +35,7 @@ import io.fabric8.kubernetes.api.model.apps.StatefulSetBuilder; import org.onebrain.operator.action.deployer.ChildResourceCreateInfo; import org.onebrain.operator.action.deployer.StatefulSetDeployer; import org.onebrain.operator.constants.KubeConstants; +import org.springframework.util.CollectionUtils; import java.util.Arrays; import java.util.Collections; @@ -55,12 +56,6 @@ import static org.onebrain.operator.constants.NumberConstant.NUMBER_22; public class BaseStatefulSetDeployer implements StatefulSetDeployer { public static final String SSH = "ssh"; - public static final String PVC_WORKSPACE = "pvc-workspace"; - public static final String WORKSPACE = "/workspace"; - public static final String PVC_DATASET = "pvc-dataset"; - public static final String DATASET = "/dataset"; - public static final String PVC_MODEL = "pvc-model"; - public static final String MODEL = "/model"; public static final String MEMORY = "Memory"; public static final String DEV_SHM = "/dev/shm"; public static final String BIN_BASH = "/bin/bash"; @@ -83,6 +78,11 @@ public class BaseStatefulSetDeployer implements StatefulSetDeployer volumeMounts = buildVolumeMounts(volumes); + if (!CollectionUtils.isEmpty(info.getVolumes()) && !CollectionUtils.isEmpty(info.getVolumeMounts())){ + volumes.addAll(info.getVolumes()); + volumeMounts.addAll(info.getVolumeMounts()); + } + container.setVolumeMounts(volumeMounts); //启动命令 @@ -126,6 +126,7 @@ public class BaseStatefulSetDeployer implements StatefulSetDeployer buildVolumes(ChildResourceCreateInfo info) { List volumes = new LinkedList<>(); - Optional.ofNullable(info.getWorkspaceVolume()).ifPresent(v-> volumes.add(v)); - Optional.ofNullable(info.getDatasetVolume()).ifPresent(v-> volumes.add(v)); - Optional.ofNullable(info.getModelVolume()).ifPresent(v-> volumes.add(v)); //shm默认就有 volumes.add(new VolumeBuilder() @@ -213,30 +211,6 @@ public class BaseStatefulSetDeployer implements StatefulSetDeployer buildVolumeMounts(List volumes) { List volumeMounts=new LinkedList<>(); - for (Volume volume : volumes) { - if(PVC_WORKSPACE.equals(volume.getName())){ - volumeMounts.add(new VolumeMountBuilder() - .withName(volume.getName()) - .withMountPath(WORKSPACE) - .build()); - continue; - } - if(PVC_DATASET.equals(volume.getName())){ - volumeMounts.add(new VolumeMountBuilder() - .withName(volume.getName()) - .withMountPath(DATASET) - .build()); - continue; - } - if(PVC_MODEL.equals(volume.getName())){ - volumeMounts.add(new VolumeMountBuilder() - .withName(volume.getName()) - .withMountPath(MODEL) - .build()); - continue; - } - } - volumeMounts.add(new VolumeMountBuilder() .withName(KubeConstants.VOLUME_SHM) .withMountPath(DEV_SHM) diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/action/handler/AddActionHandler.java b/distribute-train-operator/src/main/java/org/onebrain/operator/action/handler/AddActionHandler.java index 99f6f52..bed6dba 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/action/handler/AddActionHandler.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/action/handler/AddActionHandler.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -143,7 +143,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { * @param distributeTrain */ public void doAction(DistributeTrain distributeTrain) { - log.info("doAction=>distributeTrain : 【{}】", distributeTrain); + log.info("doAction=>distributeTrain : 【{}】", distributeTrain.getMetadata().getName()); ChildResourceCreateInfo info = null; try { //redis重复检查 @@ -200,7 +200,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { */ @Override public void handlerAction(DistributeTrain distributeTrain) { - log.info("handlerAction=>distributeTrain : 【{}】", distributeTrain); + log.info("handlerAction=>distributeTrain : 【{}】", distributeTrain.getMetadata().getName()); HandlerActionTask handlerActionTask = new HandlerActionTask(distributeTrain); pool.getActiveCount(); pool.execute(handlerActionTask); @@ -211,7 +211,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { * @param distributeTrain 分布式训练 */ private void validateParams(DistributeTrain distributeTrain) { - log.info("validateParams=>distributeTrain : 【{}】", distributeTrain); + log.info("validateParams=>distributeTrain : 【{}】", distributeTrain.getMetadata().getName()); Integer size = distributeTrain.getSpec().getSize(); if (size < NUMBER_2) { throw new OperatorException("size must be greater than 1"); @@ -254,7 +254,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { * @param info 资源信息 */ private void createStatefulSet(ChildResourceCreateInfo info) { - log.info("createStatefulSet=>childResourceCreateInfo : 【{}】", info); + log.info("createStatefulSet=>childResourceCreateInfo : 【{}】", info.getParentName()); StatefulSet statefulSet = client.apps().statefulSets() .inNamespace(info.getNamespace()) .withName(info.getStatefulSetName()).get(); @@ -298,7 +298,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { * @param info Job信息 */ private void createJob(ChildResourceCreateInfo info) { - log.info("createJob=>childResourceCreateInfo : 【{}】", info); + log.info("createJob=>childResourceCreateInfo : 【{}】", info.getParentName()); Job job = client.batch().jobs() .inNamespace(info.getNamespace()) .withName(info.getJobName()).get(); @@ -311,7 +311,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { JobDeployer deployer = new BaseJobDeployer(); JobBuilder builder = deployer.deploy(info); job = builder.build(); - log.info("job is : 【{}】", job); + log.info("job is : 【{}】", job.getMetadata().getName()); client.batch().jobs().create(job); log.info("create job【{}】 successfully", job.getMetadata().getName()); } @@ -394,7 +394,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { * @param slavePods */ private void collectChildPodInfo(ChildResourceCreateInfo info, Pod masterPod, List slavePods) { - log.info("collectChildPodInfo=>childResourceCreateInfo : 【{}】, masterPod : 【{}】, slavePods : 【{}】", info, masterPod, slavePods); + log.info("collectChildPodInfo=>childResourceCreateInfo : 【{}】, masterPod : 【{}】", info.getParentName(), masterPod.getMetadata().getName()); String key = info.getOwnerReference().getUid(); if (dtMap.containsKey(key)) { dtMap.remove(key); @@ -527,7 +527,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { * @param info */ private void recycleCr(ChildResourceCreateInfo info) { - log.info("recycleCr=>childResourceCreateInfo : 【{}】", info); + log.info("recycleCr=>childResourceCreateInfo : 【{}】", info.getParentName()); Optional.ofNullable(DistributeTrainClientHolder.getClient()) .ifPresent(distributeTrainClient -> { ObjectMeta metadata = new ObjectMeta(); @@ -542,7 +542,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { /**更新状态*/ private void updateStatus(ChildResourceCreateInfo info, DistributeTrain distributeTrain) { - log.info("updateStatus=>childResourceCreateInfo : 【{}】, distributeTrain : 【{}】", info, distributeTrain); + log.info("updateStatus=>childResourceCreateInfo : 【{}】, distributeTrain : 【{}】", info.getParentName(), distributeTrain.getMetadata().getName()); if (distributeTrain.getStatus() == null) { distributeTrain.setStatus(new DistributeTrainStatus()); } @@ -568,7 +568,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { * @return List 分布式相关Pod集合 */ private List getPods(ChildResourceCreateInfo info) { - log.info("getPods=>childResourceCreateInfo : 【{}】", info); + log.info("getPods=>childResourceCreateInfo : 【{}】", info.getParentName()); List pods = Lists.newArrayList(); pods.add(getMasterPod(info)); pods.addAll(getSlavePods(info)); @@ -584,7 +584,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { * @return Pod Master节点对应的Pod */ private Pod getMasterPod(ChildResourceCreateInfo info) { - log.info("getMasterPod=>childResourceCreateInfo : 【{}】", info); + log.info("getMasterPod=>childResourceCreateInfo : 【{}】", info.getParentName()); List masterPods = client.pods().inNamespace(info.getNamespace()) .withLabel(JOB_LABEL, info.getJobName()) .list().getItems(); @@ -600,7 +600,7 @@ public class AddActionHandler implements DistributeTrainActionHandler { * @return List Slave节点对应的Pod集合 */ private List getSlavePods(ChildResourceCreateInfo info) { - log.info("getSlavePods=>childResourceCreateInfo : 【{}】", info); + log.info("getSlavePods=>childResourceCreateInfo : 【{}】", info.getParentName()); //取得从的所有pod List slavePods = client.pods().inNamespace(info.getNamespace()) .withLabel(STATEFULSET_LABEL, info.getStatefulSetName()) diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/action/handler/DeleteActionHandler.java b/distribute-train-operator/src/main/java/org/onebrain/operator/action/handler/DeleteActionHandler.java index f0b1465..8c3727b 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/action/handler/DeleteActionHandler.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/action/handler/DeleteActionHandler.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/action/handler/DistributeTrainActionHandler.java b/distribute-train-operator/src/main/java/org/onebrain/operator/action/handler/DistributeTrainActionHandler.java index 70931e8..5396fbb 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/action/handler/DistributeTrainActionHandler.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/action/handler/DistributeTrainActionHandler.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/api/pod/DefaultPodExecListener.java b/distribute-train-operator/src/main/java/org/onebrain/operator/api/pod/DefaultPodExecListener.java index c163c21..43ceb3d 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/api/pod/DefaultPodExecListener.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/api/pod/DefaultPodExecListener.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/api/pod/PodApi.java b/distribute-train-operator/src/main/java/org/onebrain/operator/api/pod/PodApi.java index a3a11f1..01f3607 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/api/pod/PodApi.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/api/pod/PodApi.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/api/pod/StdPodExecListener.java b/distribute-train-operator/src/main/java/org/onebrain/operator/api/pod/StdPodExecListener.java index bd0aa79..680c5be 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/api/pod/StdPodExecListener.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/api/pod/StdPodExecListener.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/config/KubeConfig.java b/distribute-train-operator/src/main/java/org/onebrain/operator/config/KubeConfig.java index f586440..e7e394f 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/config/KubeConfig.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/config/KubeConfig.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/constants/CrdConstants.java b/distribute-train-operator/src/main/java/org/onebrain/operator/constants/CrdConstants.java index e945ed0..b00a58f 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/constants/CrdConstants.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/constants/CrdConstants.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/constants/KubeConstants.java b/distribute-train-operator/src/main/java/org/onebrain/operator/constants/KubeConstants.java index f2de52d..3653550 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/constants/KubeConstants.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/constants/KubeConstants.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/constants/NumberConstant.java b/distribute-train-operator/src/main/java/org/onebrain/operator/constants/NumberConstant.java index da5821f..709bc3d 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/constants/NumberConstant.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/constants/NumberConstant.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/context/KubeContext.java b/distribute-train-operator/src/main/java/org/onebrain/operator/context/KubeContext.java index 37f8393..d6895ca 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/context/KubeContext.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/context/KubeContext.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/controller/DistributeTrainController.java b/distribute-train-operator/src/main/java/org/onebrain/operator/controller/DistributeTrainController.java index e34e4bc..b88c3a0 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/controller/DistributeTrainController.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/controller/DistributeTrainController.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/crd/DistributeTrain.java b/distribute-train-operator/src/main/java/org/onebrain/operator/crd/DistributeTrain.java index 6128263..df0904d 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/crd/DistributeTrain.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/crd/DistributeTrain.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/crd/DistributeTrainList.java b/distribute-train-operator/src/main/java/org/onebrain/operator/crd/DistributeTrainList.java index 3550d53..4287b14 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/crd/DistributeTrainList.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/crd/DistributeTrainList.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/crd/DistributeTrainSpec.java b/distribute-train-operator/src/main/java/org/onebrain/operator/crd/DistributeTrainSpec.java index fbeaa48..ed3843e 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/crd/DistributeTrainSpec.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/crd/DistributeTrainSpec.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -105,4 +105,18 @@ public class DistributeTrainSpec implements KubernetesResource { */ private Volume modelStorage; + /** + * 内部映射 + */ + private List volumeMounts; + /** + * 外部挂载 + */ + private List volumes; + + /** + * 容忍度 + */ + private List tolerations; + } diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/crd/DistributeTrainStatus.java b/distribute-train-operator/src/main/java/org/onebrain/operator/crd/DistributeTrainStatus.java index 2e8c68e..d1f5c91 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/crd/DistributeTrainStatus.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/crd/DistributeTrainStatus.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/crd/DoneableDistributeTrain.java b/distribute-train-operator/src/main/java/org/onebrain/operator/crd/DoneableDistributeTrain.java index 677bac6..9a4c513 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/crd/DoneableDistributeTrain.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/crd/DoneableDistributeTrain.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/enums/AccessModeEnum.java b/distribute-train-operator/src/main/java/org/onebrain/operator/enums/AccessModeEnum.java index 6ba7693..b132949 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/enums/AccessModeEnum.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/enums/AccessModeEnum.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/exception/OperatorException.java b/distribute-train-operator/src/main/java/org/onebrain/operator/exception/OperatorException.java index 1e70905..26e52a1 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/exception/OperatorException.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/exception/OperatorException.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/properties/KubeProperties.java b/distribute-train-operator/src/main/java/org/onebrain/operator/properties/KubeProperties.java index d5de775..b156333 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/properties/KubeProperties.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/properties/KubeProperties.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/redis/AbstractKeyPrefix.java b/distribute-train-operator/src/main/java/org/onebrain/operator/redis/AbstractKeyPrefix.java index 914bd4f..c0081dd 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/redis/AbstractKeyPrefix.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/redis/AbstractKeyPrefix.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/redis/RedisService.java b/distribute-train-operator/src/main/java/org/onebrain/operator/redis/RedisService.java index a4d767c..b2bbb3e 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/redis/RedisService.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/redis/RedisService.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/redis/key/OperatorKey.java b/distribute-train-operator/src/main/java/org/onebrain/operator/redis/key/OperatorKey.java index 8cdba91..4188f52 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/redis/key/OperatorKey.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/redis/key/OperatorKey.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/utils/DistributeTrainClientHolder.java b/distribute-train-operator/src/main/java/org/onebrain/operator/utils/DistributeTrainClientHolder.java index 18e85c2..906ef59 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/utils/DistributeTrainClientHolder.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/utils/DistributeTrainClientHolder.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/utils/FastjsonUtils.java b/distribute-train-operator/src/main/java/org/onebrain/operator/utils/FastjsonUtils.java index e4af46a..28f55c6 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/utils/FastjsonUtils.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/utils/FastjsonUtils.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/utils/IOUtils.java b/distribute-train-operator/src/main/java/org/onebrain/operator/utils/IOUtils.java index a2a6f5d..58a88e5 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/utils/IOUtils.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/utils/IOUtils.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/utils/RedisUtils.java b/distribute-train-operator/src/main/java/org/onebrain/operator/utils/RedisUtils.java index d556e77..3400219 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/utils/RedisUtils.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/utils/RedisUtils.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/utils/SpringContextHolder.java b/distribute-train-operator/src/main/java/org/onebrain/operator/utils/SpringContextHolder.java index 90e89c6..dcc2ac8 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/utils/SpringContextHolder.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/utils/SpringContextHolder.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/watcher/JobHandler.java b/distribute-train-operator/src/main/java/org/onebrain/operator/watcher/JobHandler.java index a95ce71..5f8c6cc 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/watcher/JobHandler.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/watcher/JobHandler.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -58,7 +58,7 @@ public class JobHandler { * @param job */ public void handleJob(Job job) { - log.info("handleJob=>job : 【{}】", job); + log.info("handleJob=>job : 【{}】", job.getMetadata().getName()); //筛选出DistributeTrain下的job List ownerReferences = job.getMetadata().getOwnerReferences(); diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/watcher/JobWatcher.java b/distribute-train-operator/src/main/java/org/onebrain/operator/watcher/JobWatcher.java index ddc708f..321167f 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/watcher/JobWatcher.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/watcher/JobWatcher.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -51,7 +51,7 @@ public class JobWatcher implements Watcher { */ @Override public void eventReceived(Action action, Job job) { - log.info("Job Event received: {} at {}", job.getMetadata().getUid(), job.getMetadata().getCreationTimestamp()); + log.info("Job Event received: {} action {}", job.getMetadata().getName(), action.toString()); jobHandler.handleJob(job); } diff --git a/distribute-train-operator/src/main/java/org/onebrain/operator/watcher/KubeWatcherManager.java b/distribute-train-operator/src/main/java/org/onebrain/operator/watcher/KubeWatcherManager.java index 3394913..35074a7 100644 --- a/distribute-train-operator/src/main/java/org/onebrain/operator/watcher/KubeWatcherManager.java +++ b/distribute-train-operator/src/main/java/org/onebrain/operator/watcher/KubeWatcherManager.java @@ -1,5 +1,5 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. + /** + * Copyright 2020 Tianshu AI Platform. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/distribute-train-operator/src/main/resources/shell/pretreatment b/distribute-train-operator/src/main/resources/shell/pretreatment index b6b1566..1c3e511 100644 --- a/distribute-train-operator/src/main/resources/shell/pretreatment +++ b/distribute-train-operator/src/main/resources/shell/pretreatment @@ -1,46 +1,43 @@ #!/bin/bash -if [ ! -f "/etc/init.d/ssh" ]; then - if [ ! -f "/etc/redhat-release" ]; then +if [ -f "/etc/lsb-release" ]; then + if [ ! -f "/etc/init.d/ssh" ]; then echo 'apt install -y openssh-server' >> pretreatment.log apt update >> pretreatment.log apt install -y openssh-server >> pretreatment.log + echo '/etc/init.d/ssh start' >> pretreatment.log + /etc/init.d/ssh start >> pretreatment.log fi - if [ ! -f "/etc/lsb-release" ]; then - echo 'yum install -y sshd' >> pretreatment.log - yum update >> pretreatment.log - yum install -y sshd >> pretreatment.log - fi -fi -echo '/etc/init.d/ssh start' >> pretreatment.log -/etc/init.d/ssh start >> pretreatment.log -if [ -f "/etc/redhat-release" ]; then - if command -v nslookup >/dev/null 2>&1; then - echo 'exists nslookup' >> pretreatment.log - else - echo 'yum install dnsutils jq' >> pretreatment.log - yum install -y dnsutils >> pretreatment.log - yum install -y jq >> pretreatment.log - fi - if command -v nslookup >/dev/null 2>&1; then - echo 'exists nslookup' >> pretreatment.log - else - echo 'yum install dnsutils jq' >> pretreatment.log - yum install -y dnsutils >> pretreatment.log - yum install -y jq >> pretreatment.log - fi -fi - -if [ -f "/etc/lsb-release" ]; then - if command -v jq >/dev/null 2>&1; then + if command -v jq >/dev/null 2>&1; then echo 'exists jq' >> pretreatment.log - else + else echo 'apt install jq' >> pretreatment.log apt install -y jq >> pretreatment.log fi - if command -v nslookup >/dev/null 2>&1; then + if command -v nslookup >/dev/null 2>&1; then echo 'exists nslookup' >> pretreatment.log - else + else echo 'apt install dnsutils' >> pretreatment.log apt install -y dnsutils >> pretreatment.log fi fi +if [ -f "/etc/redhat-release" ]; then + if [ ! -f "/usr/sbin/sshd" ]; then + echo 'yum install -y sshd' >> pretreatment.log + yum update >> pretreatment.log + yum install -y sshd >> pretreatment.log + echo 'sshd start' >> pretreatment.log + service sshd start >> pretreatment.log + fi + if command -v nslookup >/dev/null 2>&1; then + echo 'exists nslookup' >> pretreatment.log + else + echo 'yum install -y bind-utils' >> pretreatment.log + yum install -y bind-utils >> pretreatment.log + fi + if command -v jq >/dev/null 2>&1; then + echo 'exists jq' >> pretreatment.log + else + echo 'yum install -y jq' >> pretreatment.log + yum install -y jq >> pretreatment.log + fi +fi \ No newline at end of file diff --git a/distribute-train-operator/src/test/java/org/onebrain/operator/DistributeTrainOperatorApplicationTests.java b/distribute-train-operator/src/test/java/org/onebrain/operator/DistributeTrainOperatorApplicationTests.java deleted file mode 100644 index a64de1f..0000000 --- a/distribute-train-operator/src/test/java/org/onebrain/operator/DistributeTrainOperatorApplicationTests.java +++ /dev/null @@ -1,43 +0,0 @@ -/** - * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * ============================================================= - */ - - -package org.onebrain.operator; - -import org.onebrain.operator.api.pod.PodApi; -import org.onebrain.operator.constants.KubeConstants; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.boot.test.context.SpringBootTest; - -import java.io.File; -import java.net.URISyntaxException; -import java.net.URL; - -@SpringBootTest -public class DistributeTrainOperatorApplicationTests { - - @Autowired - private PodApi podApi; - -// @Test - public void contextLoads() throws URISyntaxException { - final URL url = getClass().getClassLoader().getResource("key/id_rsa"); - File file = new File(url.toURI()); - podApi.copyToPod("default", "distribute-train-test-job-sv2dj", KubeConstants.MASTER_CONTAINER_NAME, file, "/root/.ssh/id_rsa"); - } - -}