@@ -0,0 +1,26 @@ | |||
# 之江天枢-分布式训练 operator | |||
该模块是分布式训练CRD的控制器,管理分布式训练容器生命周期,为分布式训练容器注入其他容器ip。 | |||
## 源码部署 | |||
### 准备环境 | |||
安装如下软件环境。 | |||
- OpenJDK:1.8+ | |||
- Redis: 3.0+ | |||
- Maven: 3.0+ | |||
### 下载源码 | |||
``` bash | |||
git clone https://codeup.teambition.com/zhejianglab/distribute-train-operator.git | |||
# 进入项目根目录 | |||
cd distribute-train-operator | |||
``` | |||
### 构建 | |||
``` bash | |||
# 构建,生成的 jar 包位于 ./target/distribute-train-operator-1.0.jar | |||
mvn clean compile package | |||
``` | |||
### 部署 | |||
部署过程参看文档:[部署 分布式训练operator](http://tianshu.org.cn/?/course/1.html) |
@@ -0,0 +1,65 @@ | |||
# Example DistributeTrain custom resource: a 3-node ResNet50 training job.
# The {{...}} tokens are template placeholders that must be substituted before applying.
apiVersion: onebrain.oneflow.org/v1alpha1
kind: DistributeTrain
metadata:
  name: dt-resnet50
  namespace: resnet50
  labels:
    key: value
spec:
  # Total number of training nodes.
  size: 3
  image: {{IMAGE}}
  imagePullPolicy: IfNotPresent
  masterCmd: export NODE_IPS=`cat /home/hostfile.json |jq -r '.[]|.ip'|paste -d "," -s` && cd /workspace/Classification/cnns && rm -rf core.* && rm -rf ./output/snapshots/* && python3 of_cnn_train_val.py --train_data_dir=$DATA_ROOT/train --train_data_part_num=$TRAIN_DATA_PART_NUM --val_data_dir=$DATA_ROOT/validation --val_data_part_num=$VAL_DATA_PART_NUM --num_nodes=$NODE_NUM --node_ips="$NODE_IPS" --gpu_num_per_node=$GPU_NUM_PER_NODE --model_update="momentum" --learning_rate=0.256 --loss_print_every_n_iter=1 --batch_size_per_device=64 --val_batch_size_per_device=64 --num_epoch=1 --model="resnet50" --model_save_dir=/model
  masterResources:
    requests:
      nvidia.com/gpu: 2
      memory: "16Gi"
      cpu: "2"
    limits:
      nvidia.com/gpu: 2
      memory: "16Gi"
      cpu: "2"
  slaveCmd: export NODE_IPS=`cat /home/hostfile.json |jq -r '.[]|.ip'|paste -d "," -s` && cd /workspace/Classification/cnns && rm -rf core.* && rm -rf ./output/snapshots/* && python3 of_cnn_train_val.py --train_data_dir=$DATA_ROOT/train --train_data_part_num=$TRAIN_DATA_PART_NUM --val_data_dir=$DATA_ROOT/validation --val_data_part_num=$VAL_DATA_PART_NUM --num_nodes=$NODE_NUM --node_ips="$NODE_IPS" --gpu_num_per_node=$GPU_NUM_PER_NODE --model_update="momentum" --learning_rate=0.256 --loss_print_every_n_iter=1 --batch_size_per_device=64 --val_batch_size_per_device=64 --num_epoch=1 --model="resnet50" --model_save_dir=/model
  slaveResources:
    requests:
      nvidia.com/gpu: 2
      memory: "16Gi"
      cpu: "2"
    limits:
      nvidia.com/gpu: 2
      memory: "16Gi"
      cpu: "2"
  nodeSelector:
    kubernetes.io/hostname: node02
  # Environment variable values must be strings, so numeric values are quoted.
  env:
    - name: ENABLE_USER_OP
      value: 'True'
    - name: DATA_ROOT
      value: '/dataset'
    - name: NODE_NUM
      value: "3"
    - name: GPU_NUM_PER_NODE
      value: "2"
    - name: ONEFLOW_DEBUG_MODE
      value: ""
    - name: TRAIN_DATA_PART_NUM
      value: "6"
    - name: VAL_DATA_PART_NUM
      value: "6"
    - name: NCCL_DEBUG
      value: INFO
  datasetStorage:
    name: pvc-dataset
    nfs:
      path: {{DATASET}}
      server: {{NFS}}
  workspaceStorage:
    name: pvc-workspace
    nfs:
      path: /nfs/resnet50/workspace
      server: {{WORKSPACE}}
  modelStorage:
    name: pvc-model
    nfs:
      path: /nfs/resnet50/model
      server: {{MODEL}}
@@ -0,0 +1,61 @@ | |||
---
# CustomResourceDefinition for the DistributeTrain resource handled by the operator.
apiVersion: apiextensions.k8s.io/v1beta1
kind: CustomResourceDefinition
metadata:
  name: distributetrains.onebrain.oneflow.org
spec:
  group: onebrain.oneflow.org
  names:
    kind: DistributeTrain
    singular: distributetrain
    plural: distributetrains
    shortNames:
      - dt
  scope: Namespaced
  subresources:
    status: {}
  version: v1alpha1
  validation:
    openAPIV3Schema:
      properties:
        apiVersion:
          type: string
        kind:
          type: string
        metadata:
          type: object
        spec:
          properties:
            image:
              type: string
            imagePullPolicy:
              type: string
            size:
              format: int32
              type: integer
            # List of name/value environment variable entries.
            env:
              type: array
              items:
                type: object
            masterCmd:
              type: string
            slaveCmd:
              type: string
            masterResources:
              type: object
            slaveResources:
              type: object
            nodeSelector:
              type: object
            initContainer:
              type: object
            datasetStorage:
              type: object
            workspaceStorage:
              type: object
            modelStorage:
              type: object
          required:
            - image
            - imagePullPolicy
            - size
            - masterCmd
            - slaveCmd
            - workspaceStorage
          type: object
@@ -0,0 +1,47 @@ | |||
# Deployment that runs the distribute-train operator as a single replica.
# {{IMAGE}} is a template placeholder that must be substituted before applying.
kind: Deployment
apiVersion: apps/v1
metadata:
  name: distribute-train-operator
  namespace: test-ns
  labels:
    name: distribute-train-operator
spec:
  replicas: 1
  selector:
    matchLabels:
      name: distribute-train-operator
  template:
    metadata:
      labels:
        name: distribute-train-operator
    spec:
      containers:
        - name: distribute-train-operator
          image: {{IMAGE}}
          ports:
            - containerPort: 8080
              protocol: TCP
          # Mounts the host kubeconfig at the path passed via --k8s.kubeconfig below.
          volumeMounts:
            - mountPath: /root/config
              name: config-volume
          env:
            - name: JAR_BALL
              value: "distribute-train-operator-1.0.jar --k8s.kubeconfig=/root/config --spring.redis.host=192.168.1.104"
          imagePullPolicy: IfNotPresent
      volumes:
        - name: config-volume
          hostPath:
            path: /root/.kube/config
      restartPolicy: Always
      terminationGracePeriodSeconds: 30
      securityContext:
        runAsUser: 0
      schedulerName: default-scheduler
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxUnavailable: 1
      maxSurge: 1
  revisionHistoryLimit: 7
  progressDeadlineSeconds: 600
@@ -0,0 +1,150 @@ | |||
<?xml version="1.0" encoding="UTF-8"?> | |||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | |||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd"> | |||
<modelVersion>4.0.0</modelVersion> | |||
<parent> | |||
<groupId>org.springframework.boot</groupId> | |||
<artifactId>spring-boot-starter-parent</artifactId> | |||
<version>2.2.5.RELEASE</version> | |||
</parent> | |||
<groupId>org.onebrain</groupId> | |||
<artifactId>distribute-train-operator</artifactId> | |||
<version>1.0</version> | |||
<name>distribute-train-operator</name> | |||
<description>distribute-train operator</description>
<properties> | |||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> | |||
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding> | |||
<java.version>1.8</java.version> | |||
<fabric.io.version>4.9.0</fabric.io.version> | |||
</properties> | |||
<dependencies> | |||
<!-- web --> | |||
<dependency> | |||
<groupId>org.springframework.boot</groupId> | |||
<artifactId>spring-boot-starter-web</artifactId> | |||
</dependency> | |||
<!-- k8s --> | |||
<dependency> | |||
<groupId>io.fabric8</groupId> | |||
<artifactId>kubernetes-client</artifactId> | |||
<version>${fabric.io.version}</version> | |||
</dependency> | |||
<dependency> | |||
<groupId>io.fabric8</groupId> | |||
<artifactId>kubernetes-assertions</artifactId> | |||
<version>4.0.0</version> | |||
<scope>test</scope> | |||
</dependency> | |||
<!-- configuration processor --> | |||
<dependency> | |||
<groupId>org.springframework.boot</groupId> | |||
<artifactId>spring-boot-configuration-processor</artifactId> | |||
</dependency> | |||
<!-- redis --> | |||
<dependency> | |||
<groupId>org.springframework.boot</groupId> | |||
<artifactId>spring-boot-starter-data-redis</artifactId> | |||
</dependency> | |||
<dependency> | |||
<groupId>redis.clients</groupId> | |||
<artifactId>jedis</artifactId> | |||
</dependency> | |||
<!-- common jars --> | |||
<dependency> | |||
<groupId>commons-io</groupId> | |||
<artifactId>commons-io</artifactId> | |||
<version>2.6</version> | |||
</dependency> | |||
<dependency> | |||
<groupId>org.apache.commons</groupId> | |||
<artifactId>commons-compress</artifactId> | |||
<version>1.19</version> | |||
</dependency> | |||
<dependency> | |||
<groupId>commons-codec</groupId> | |||
<artifactId>commons-codec</artifactId> | |||
</dependency> | |||
<!-- tools --> | |||
<dependency> | |||
<groupId>cn.hutool</groupId> | |||
<artifactId>hutool-all</artifactId> | |||
<version>5.1.1</version> | |||
</dependency> | |||
<dependency> | |||
<groupId>com.google.guava</groupId> | |||
<artifactId>guava</artifactId> | |||
<version>27.0.1-jre</version> | |||
</dependency> | |||
<dependency> | |||
<groupId>com.alibaba</groupId> | |||
<artifactId>fastjson</artifactId> | |||
<version>1.2.54</version> | |||
</dependency> | |||
<dependency> | |||
<groupId>org.projectlombok</groupId> | |||
<artifactId>lombok</artifactId> | |||
<optional>true</optional> | |||
</dependency> | |||
<dependency> | |||
<groupId>org.springframework.boot</groupId> | |||
<artifactId>spring-boot-starter-test</artifactId> | |||
<scope>test</scope> | |||
</dependency> | |||
</dependencies> | |||
<build> | |||
<plugins> | |||
<plugin> | |||
<groupId>org.springframework.boot</groupId> | |||
<artifactId>spring-boot-maven-plugin</artifactId> | |||
</plugin> | |||
<!-- 打包时跳过测试 --> | |||
<plugin> | |||
<groupId>org.apache.maven.plugins</groupId> | |||
<artifactId>maven-surefire-plugin</artifactId> | |||
<configuration> | |||
<skip>true</skip> | |||
</configuration> | |||
</plugin> | |||
</plugins> | |||
</build> | |||
<!-- Artifact repository; served over HTTPS because recent Maven releases block plain-HTTP repositories. -->
<repositories>
    <repository>
        <id>public</id>
        <name>aliyun nexus</name>
        <url>https://maven.aliyun.com/nexus/content/groups/public/</url>
        <releases>
            <enabled>true</enabled>
        </releases>
    </repository>
</repositories>
<!-- Plugin repository; served over HTTPS because recent Maven releases block plain-HTTP repositories. -->
<pluginRepositories>
    <pluginRepository>
        <id>public</id>
        <name>aliyun nexus</name>
        <url>https://maven.aliyun.com/nexus/content/groups/public/</url>
        <releases>
            <enabled>true</enabled>
        </releases>
        <snapshots>
            <enabled>false</enabled>
        </snapshots>
    </pluginRepository>
</pluginRepositories>
</project> |
@@ -0,0 +1,35 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator; | |||
import org.springframework.boot.SpringApplication; | |||
import org.springframework.boot.autoconfigure.SpringBootApplication; | |||
import org.springframework.scheduling.annotation.EnableAsync; | |||
/** | |||
* @description Operator启动类 | |||
* @date 2020-09-03 | |||
*/ | |||
@SpringBootApplication | |||
@EnableAsync | |||
public class DistributeTrainOperatorApplication { | |||
public static void main(String[] args) { | |||
SpringApplication.run(DistributeTrainOperatorApplication.class, args); | |||
} | |||
} |
@@ -0,0 +1,199 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.action; | |||
import cn.hutool.core.util.StrUtil; | |||
import com.fasterxml.jackson.core.JsonProcessingException; | |||
import com.google.common.collect.Maps; | |||
import io.fabric8.kubernetes.api.model.apiextensions.*; | |||
import io.fabric8.kubernetes.client.KubernetesClient; | |||
import io.fabric8.kubernetes.client.dsl.MixedOperation; | |||
import io.fabric8.kubernetes.client.dsl.Resource; | |||
import io.fabric8.kubernetes.client.dsl.base.CustomResourceDefinitionContext; | |||
import io.fabric8.kubernetes.client.informers.SharedIndexInformer; | |||
import io.fabric8.kubernetes.client.informers.SharedInformerFactory; | |||
import io.fabric8.kubernetes.client.internal.SerializationUtils; | |||
import io.fabric8.kubernetes.internal.KubernetesDeserializer; | |||
import lombok.extern.slf4j.Slf4j; | |||
import org.onebrain.operator.controller.DistributeTrainController; | |||
import org.onebrain.operator.crd.DistributeTrain; | |||
import org.onebrain.operator.crd.DistributeTrainList; | |||
import org.onebrain.operator.crd.DoneableDistributeTrain; | |||
import org.onebrain.operator.utils.DistributeTrainClientHolder; | |||
import org.onebrain.operator.utils.SpringContextHolder; | |||
import org.springframework.beans.factory.annotation.Autowired; | |||
import org.springframework.beans.factory.support.BeanDefinitionBuilder; | |||
import org.springframework.beans.factory.support.DefaultListableBeanFactory; | |||
import org.springframework.context.ConfigurableApplicationContext; | |||
import org.springframework.stereotype.Component; | |||
import java.util.Map; | |||
import static org.onebrain.operator.constants.CrdConstants.*; | |||
/** | |||
* @description operator 主控制器 | |||
* @date 2020-09-23 | |||
*/ | |||
@Component | |||
@Slf4j | |||
public class DistributeTrainOperatorManager { | |||
public static final String NAMESPACE_DEFAULT = "default"; | |||
public static final String TYPE_STRING = "string"; | |||
public static final String TYPE_INTEGER = "integer"; | |||
public static final String TYPE_OBJECT = "object"; | |||
public static final String TYPE_ARRAY = "array"; | |||
public static final String FORMAT_INT_32 = "int32"; | |||
@Autowired | |||
private KubernetesClient client; | |||
private CustomResourceDefinition crd; | |||
private String namespace; | |||
/** | |||
* 检查crd是否存在,若不存在则创建 | |||
* @throws JsonProcessingException | |||
*/ | |||
public void createCrdIfNotExists() throws JsonProcessingException { | |||
String namespace = client.getNamespace(); | |||
if (namespace == null) { | |||
log.info("No namespace found via config, assuming default."); | |||
namespace = NAMESPACE_DEFAULT; | |||
} | |||
this.namespace = namespace; | |||
log.info("Using namespace : {}", namespace); | |||
//检查crd是否已存在 | |||
CustomResourceDefinition crd = client.customResourceDefinitions().withName(CRD_NAME).get(); | |||
if(crd == null){ | |||
Map<String, JSONSchemaProps> crdPropsMap = buildCrdProperties(); | |||
log.info("crd props map is : 【{}】",crdPropsMap); | |||
//如不存在,则创建 | |||
CustomResourceDefinition distributeTrainCustomResourceDefinition = new CustomResourceDefinitionBuilder() | |||
.withApiVersion(CRD_API_VERSION) | |||
.withNewMetadata() | |||
.withName(CRD_NAME) | |||
.endMetadata() | |||
.withNewSpec() | |||
.withGroup(CRD_GROUP) | |||
.withVersion(CRD_VERSION) | |||
.withScope(CRD_SCOPE) | |||
.withNewNames() | |||
.withKind(CRD_KIND) | |||
.withSingular(CRD_SINGULAR_NAME) | |||
.withPlural(CRD_PLURAL_NAME) | |||
.withShortNames(CRD_SHORT_NAME) | |||
.endNames() | |||
.withNewValidation() | |||
.withNewOpenAPIV3Schema() | |||
.addToProperties(crdPropsMap) | |||
.endOpenAPIV3Schema() | |||
.endValidation() | |||
.endSpec() | |||
.build(); | |||
distributeTrainCustomResourceDefinition = client.customResourceDefinitions().create(distributeTrainCustomResourceDefinition); | |||
log.info("create crd successfully : \n{}", SerializationUtils.dumpAsYaml(distributeTrainCustomResourceDefinition)); | |||
crd = distributeTrainCustomResourceDefinition; | |||
} | |||
//注册到k8s反序列化解析器 | |||
KubernetesDeserializer.registerCustomKind(CRD_GROUP + StrUtil.SLASH + CRD_VERSION, CRD_KIND, DistributeTrain.class); | |||
this.crd = crd; | |||
} | |||
/** | |||
* 初始化informer | |||
*/ | |||
public void initInformer(){ | |||
CustomResourceDefinitionContext distributeTrainCustomResourceDefinitionContext = new CustomResourceDefinitionContext.Builder() | |||
.withVersion(CRD_VERSION) | |||
.withScope(CRD_SCOPE) | |||
.withGroup(CRD_GROUP) | |||
.withPlural(CRD_PLURAL_NAME) | |||
.build(); | |||
SharedInformerFactory informerFactory = client.informers(); | |||
MixedOperation<DistributeTrain, DistributeTrainList, DoneableDistributeTrain, Resource<DistributeTrain, DoneableDistributeTrain>> distributeTrainClient = client.customResources(this.crd, DistributeTrain.class, DistributeTrainList.class, DoneableDistributeTrain.class); | |||
SharedIndexInformer<DistributeTrain> distributeTrainSharedIndexInformer = informerFactory.sharedIndexInformerForCustomResource(distributeTrainCustomResourceDefinitionContext, DistributeTrain.class, DistributeTrainList.class, 10 * 60 * 1000); | |||
//使用静态变量维持 | |||
DistributeTrainClientHolder.setDistributeTrainClient(distributeTrainClient); | |||
//手动注册controller到ioc容器 | |||
BeanDefinitionBuilder beanDefinitionBuilder = BeanDefinitionBuilder.genericBeanDefinition(DistributeTrainController.class); | |||
DefaultListableBeanFactory beanFactory = (DefaultListableBeanFactory)((ConfigurableApplicationContext) SpringContextHolder.applicationContext).getBeanFactory(); | |||
beanDefinitionBuilder.addConstructorArgValue(distributeTrainClient); | |||
beanDefinitionBuilder.addConstructorArgValue(distributeTrainSharedIndexInformer); | |||
beanDefinitionBuilder.addConstructorArgValue(namespace); | |||
beanFactory.registerBeanDefinition("org.onebrain.operator.controller.DistributeTrainController", beanDefinitionBuilder.getRawBeanDefinition()); | |||
//取得托管的controller | |||
DistributeTrainController controller = SpringContextHolder.getBean(DistributeTrainController.class); | |||
//注册informer监听 | |||
controller.create(); | |||
informerFactory.startAllRegisteredInformers(); | |||
//等待就绪 | |||
controller.run(); | |||
} | |||
/** | |||
* 生成crd属性 | |||
* @return crd属性集合 | |||
*/ | |||
private Map<String, JSONSchemaProps> buildCrdProperties(){ | |||
Map<String, JSONSchemaProps> properties = Maps.newHashMap(); | |||
JSONSchemaProps stringType = new JSONSchemaPropsBuilder() | |||
.withType(TYPE_STRING) | |||
.build(); | |||
JSONSchemaProps intType = new JSONSchemaPropsBuilder() | |||
.withType(TYPE_INTEGER) | |||
.withFormat(FORMAT_INT_32) | |||
.build(); | |||
JSONSchemaProps objectType = new JSONSchemaPropsBuilder() | |||
.withType(TYPE_OBJECT) | |||
.build(); | |||
JSONSchemaProps arrayType = new JSONSchemaPropsBuilder() | |||
.withType(TYPE_ARRAY) | |||
.withNewItems() | |||
.endItems() | |||
.build(); | |||
//添加属性校验规则 | |||
JSONSchemaProps specObjectType = new JSONSchemaPropsBuilder() | |||
.addToProperties("image", stringType) | |||
.addToProperties("imagePullPolicy", stringType) | |||
.addToProperties("size", intType) | |||
.addToProperties("env", arrayType) | |||
.addToProperties("masterCmd", stringType) | |||
.addToProperties("slaveCmd", stringType) | |||
.addToProperties("masterResources", objectType) | |||
.addToProperties("slaveResources", objectType) | |||
.addToProperties("nodeSelector", objectType) | |||
.addToProperties("initContainer", objectType) | |||
.addToProperties("datasetStorage", objectType) | |||
.addToProperties("workspaceStorage", objectType) | |||
.addToProperties("modelStorage", objectType) | |||
.withType("object") | |||
.addToRequired("image", "imagePullPolicy", "size", "masterCmd", "slaveCmd", "workspaceStorage") | |||
.build(); | |||
properties.put("apiVersion", stringType); | |||
properties.put("kind", stringType); | |||
properties.put("metadata", objectType); | |||
properties.put("spec", specObjectType); | |||
return properties; | |||
} | |||
} |
@@ -0,0 +1,58 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.action; | |||
import lombok.extern.slf4j.Slf4j; | |||
import org.onebrain.operator.watcher.KubeWatcherManager; | |||
import org.springframework.beans.factory.annotation.Autowired; | |||
import org.springframework.boot.ApplicationArguments; | |||
import org.springframework.boot.ApplicationRunner; | |||
import org.springframework.stereotype.Component; | |||
/** | |||
* @description Operator运行入口 | |||
* @date 2020-09-23 | |||
*/ | |||
@Component | |||
@Slf4j | |||
public class OperatorRunner implements ApplicationRunner { | |||
@Autowired | |||
private DistributeTrainOperatorManager operatorManager; | |||
@Autowired | |||
private KubeWatcherManager watcherManager; | |||
/** | |||
* spring 容器完全启动后 注册operator运行逻辑 | |||
* @param args | |||
* @throws Exception | |||
*/ | |||
@Override | |||
public void run(ApplicationArguments args) throws Exception { | |||
//检查crd是否已存在,如果不存在则创建 | |||
operatorManager.createCrdIfNotExists(); | |||
//job监控者启动 | |||
watcherManager.startWatching(); | |||
log.info("job watcher is running"); | |||
//初始化informer | |||
operatorManager.initInformer(); | |||
} | |||
} |
@@ -0,0 +1,44 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.action; | |||
import lombok.AllArgsConstructor; | |||
import lombok.Builder; | |||
import lombok.Data; | |||
import lombok.NoArgsConstructor; | |||
/** | |||
* @description pod信息类 | |||
* @date 2020-09-23 | |||
*/ | |||
@Data | |||
@NoArgsConstructor | |||
@AllArgsConstructor | |||
@Builder | |||
public class PodInfo { | |||
/** | |||
* ip地址 | |||
*/ | |||
private String ip; | |||
/** | |||
* 角色 | |||
*/ | |||
private String role; | |||
} |
@@ -0,0 +1,41 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.action.deployer; | |||
import cn.hutool.core.util.RandomUtil; | |||
import lombok.Data; | |||
import lombok.experimental.Accessors; | |||
/** | |||
* @description 创建资源的信息的抽象类 | |||
* @date 2020-04-30 | |||
*/ | |||
@Data | |||
@Accessors(chain = true) | |||
public abstract class AbstractResourceCreateInfo { | |||
/** | |||
* 生成随机字符串 | |||
* @param digits 位数 | |||
* @return | |||
*/ | |||
protected static String getRandomStr(Integer digits){ | |||
return RandomUtil.randomString(digits); | |||
} | |||
} |
@@ -0,0 +1,227 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.action.deployer; | |||
import cn.hutool.core.collection.CollectionUtil; | |||
import cn.hutool.core.util.StrUtil; | |||
import io.fabric8.kubernetes.api.model.*; | |||
import lombok.Data; | |||
import lombok.experimental.Accessors; | |||
import org.onebrain.operator.constants.KubeConstants; | |||
import org.onebrain.operator.constants.NumberConstant; | |||
import org.onebrain.operator.crd.DistributeTrain; | |||
import java.util.List; | |||
import java.util.Map; | |||
import java.util.Optional; | |||
import java.util.stream.Collectors; | |||
/** | |||
* @description 暂存创建子资源所需的信息 | |||
* @date 2020-06-16 | |||
*/ | |||
@Data | |||
@Accessors(chain = true) | |||
public class ChildResourceCreateInfo extends AbstractResourceCreateInfo { | |||
public static final String SLAVE_TEMPLATE = "{}-slave-{}"; | |||
public static final String MASTER_TEMPLATE = "{}-master-{}"; | |||
public static final String SVC_TEMPLATE = "{}-svc"; | |||
/** | |||
* 父级名称(分布式训练名称) | |||
*/ | |||
private String parentName; | |||
/** | |||
* job名称 | |||
*/ | |||
private String jobName; | |||
/** | |||
* statefullSet名称 | |||
*/ | |||
private String statefulSetName; | |||
/** | |||
* 服务名称 | |||
*/ | |||
private String svcName; | |||
/** | |||
* 命名空间 | |||
*/ | |||
private String namespace; | |||
/** | |||
* 镜像 | |||
*/ | |||
private String image; | |||
/** | |||
* 镜像拉取策略 | |||
*/ | |||
private String imagePullPolicy; | |||
/** | |||
* 标签 | |||
*/ | |||
private Map<String, String> labels; | |||
/** | |||
* master副本数 | |||
*/ | |||
private Integer masterReplicas; | |||
/** | |||
* slave副本数 | |||
*/ | |||
private Integer slaveReplicas; | |||
/** | |||
* master命令 | |||
*/ | |||
private String masterCmd; | |||
/** | |||
* slave命令 | |||
*/ | |||
private String slaveCmd; | |||
/** | |||
* master 资源节点限制 | |||
*/ | |||
private ResourceRequirements masterResources; | |||
/** | |||
* slave 资源节点限制 | |||
*/ | |||
private ResourceRequirements slaveResources; | |||
/** | |||
* 节点调度选择器 | |||
*/ | |||
private Map<String, String> nodeSelector; | |||
/** | |||
* 初始化容器 | |||
*/ | |||
private Container initContainer; | |||
/** | |||
* 工作目录挂载 | |||
*/ | |||
private Volume workspaceVolume; | |||
/** | |||
* 数据集目录挂载 | |||
*/ | |||
private Volume datasetVolume; | |||
/** | |||
* 模型目录挂载 | |||
*/ | |||
private Volume modelVolume; | |||
/** | |||
* 环境变量 | |||
*/ | |||
private List<EnvVar> env; | |||
/** | |||
* 拥有者信息 | |||
*/ | |||
private OwnerReference ownerReference; | |||
/** | |||
* 将分布式训练转换为K8S的资源信息 | |||
* @param distributeTrain 分布式训练 | |||
* @return ChildResourceCreateInfo | |||
*/ | |||
public static ChildResourceCreateInfo fromCr(DistributeTrain distributeTrain){ | |||
ChildResourceCreateInfo info = new ChildResourceCreateInfo(); | |||
//ownerReferece信息 | |||
info.generateOwnerReference(distributeTrain); | |||
//各种资源的名称 | |||
info.setNamespace(distributeTrain.getMetadata().getNamespace()); | |||
info.setParentName(distributeTrain.getMetadata().getName()); | |||
info.generateResoureName(); | |||
//标签 | |||
info.setLabels(distributeTrain.getMetadata().getLabels()); | |||
//镜像 | |||
info.setImage(distributeTrain.getSpec().getImage()) | |||
.setImagePullPolicy(distributeTrain.getSpec().getImagePullPolicy()); | |||
//副本数 | |||
Integer size = distributeTrain.getSpec().getSize(); | |||
info.setMasterReplicas(NumberConstant.NUMBER_1); | |||
info.setSlaveReplicas(size - NumberConstant.NUMBER_1); | |||
//命令行 | |||
info.setMasterCmd(distributeTrain.getSpec().getMasterCmd()) | |||
.setSlaveCmd(distributeTrain.getSpec().getSlaveCmd()); | |||
//挂载 | |||
Optional.ofNullable(distributeTrain.getSpec().getWorkspaceStorage()) | |||
.ifPresent(v -> info.setWorkspaceVolume(v)); | |||
Optional.ofNullable(distributeTrain.getSpec().getDatasetStorage()) | |||
.ifPresent(v -> info.setDatasetVolume(v)); | |||
Optional.ofNullable(distributeTrain.getSpec().getModelStorage()) | |||
.ifPresent(v -> info.setModelVolume(v)); | |||
//主从两组资源限制 | |||
Optional.ofNullable(distributeTrain.getSpec().getMasterResources()) | |||
.ifPresent(v -> info.setMasterResources(v)); | |||
Optional.ofNullable(distributeTrain.getSpec().getSlaveResources()) | |||
.ifPresent(v -> info.setSlaveResources(v)); | |||
//环境变量 | |||
List<EnvVar> env = distributeTrain.getSpec().getEnv(); | |||
if(CollectionUtil.isNotEmpty(env)){ | |||
env = env.stream().filter(e -> !KubeConstants.ENV_NODE_NUM.equals(e.getName())).collect(Collectors.toList()); | |||
info.setEnv(env); | |||
} | |||
//node调度 | |||
info.setNodeSelector(distributeTrain.getSpec().getNodeSelector()); | |||
//init-container | |||
info.setInitContainer(distributeTrain.getSpec().getInitContainer()); | |||
return info; | |||
} | |||
/** | |||
* 生成资源名称 | |||
*/ | |||
private void generateResoureName(){ | |||
String suffix = getRandomStr(NumberConstant.NUMBER_5); | |||
this.statefulSetName = StrUtil.format(SLAVE_TEMPLATE, this.parentName, suffix); | |||
this.jobName = StrUtil.format(MASTER_TEMPLATE, this.parentName, suffix); | |||
this.svcName = StrUtil.format(SVC_TEMPLATE, this.parentName); | |||
} | |||
/** | |||
* 生成所有者信息 | |||
* @param distributeTrain 分布式训练 | |||
*/ | |||
private void generateOwnerReference(DistributeTrain distributeTrain){ | |||
this.ownerReference = new OwnerReferenceBuilder() | |||
.withApiVersion(distributeTrain.getApiVersion()) | |||
.withKind(distributeTrain.getKind()) | |||
.withName(distributeTrain.getMetadata().getName()) | |||
.withNewUid(distributeTrain.getMetadata().getUid()) | |||
.build(); | |||
} | |||
} |
@@ -0,0 +1,35 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.action.deployer; | |||
import io.fabric8.kubernetes.api.model.batch.JobBuilder; | |||
/** | |||
* @description Job deployer contract that standardizes the deploy method. | |||
* T must be a subtype of AbstractResourceCreateInfo. | |||
* @date 2020-09-23 | |||
*/ | |||
public interface JobDeployer<T extends AbstractResourceCreateInfo> { | |||
/** | |||
* Builds the Job description from the given resource information. | |||
* @param info resource information | |||
* @return the Job builder | |||
*/ | |||
JobBuilder deploy(T info); | |||
} |
@@ -0,0 +1,33 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.action.deployer; | |||
import io.fabric8.kubernetes.api.model.ServiceBuilder; | |||
/** | |||
* @description Service deployer contract. | |||
* T must be a subtype of AbstractResourceCreateInfo. | |||
* @date 2020-09-23 | |||
*/ | |||
public interface ServiceDeployer<T extends AbstractResourceCreateInfo> { | |||
/** | |||
* Builds the Service description from the given resource information. | |||
* @param info resource information | |||
* @return the Service builder | |||
*/ | |||
ServiceBuilder deploy(T info); | |||
} |
@@ -0,0 +1,33 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.action.deployer; | |||
import io.fabric8.kubernetes.api.model.apps.StatefulSetBuilder; | |||
/** | |||
* @description StatefulSet deployer contract. | |||
* T must be a subtype of AbstractResourceCreateInfo. | |||
* @date 2020-09-23 | |||
*/ | |||
public interface StatefulSetDeployer<T extends AbstractResourceCreateInfo> { | |||
/** | |||
* Builds the StatefulSet description from the given resource information. | |||
* @param info resource information | |||
* @return the StatefulSet builder | |||
*/ | |||
StatefulSetBuilder deploy(T info); | |||
} |
@@ -0,0 +1,246 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.action.deployer.impl; | |||
import cn.hutool.core.collection.CollectionUtil; | |||
import com.google.common.collect.Lists; | |||
import io.fabric8.kubernetes.api.model.CapabilitiesBuilder; | |||
import io.fabric8.kubernetes.api.model.Container; | |||
import io.fabric8.kubernetes.api.model.ContainerPortBuilder; | |||
import io.fabric8.kubernetes.api.model.EnvVar; | |||
import io.fabric8.kubernetes.api.model.EnvVarBuilder; | |||
import io.fabric8.kubernetes.api.model.SecurityContextBuilder; | |||
import io.fabric8.kubernetes.api.model.Volume; | |||
import io.fabric8.kubernetes.api.model.VolumeBuilder; | |||
import io.fabric8.kubernetes.api.model.VolumeMount; | |||
import io.fabric8.kubernetes.api.model.VolumeMountBuilder; | |||
import io.fabric8.kubernetes.api.model.batch.JobBuilder; | |||
import org.onebrain.operator.action.deployer.ChildResourceCreateInfo; | |||
import org.onebrain.operator.action.deployer.JobDeployer; | |||
import org.onebrain.operator.constants.KubeConstants; | |||
import java.util.*; | |||
import static org.onebrain.operator.constants.NumberConstant.LONG_NUMBER_0; | |||
import static org.onebrain.operator.constants.NumberConstant.NUMBER_1; | |||
import static org.onebrain.operator.constants.NumberConstant.NUMBER_22; | |||
/** | |||
* @description Job部署器 | |||
* @date 2020-09-23 | |||
*/ | |||
public class BaseJobDeployer implements JobDeployer<ChildResourceCreateInfo> { | |||
public static final String PVC_WORKSPACE = "pvc-workspace"; | |||
public static final String SSH = "ssh"; | |||
public static final String WORKSPACE = "/workspace"; | |||
public static final String PVC_DATASET = "pvc-dataset"; | |||
public static final String DATASET = "/dataset"; | |||
public static final String PVC_MODEL = "pvc-model"; | |||
public static final String MODEL = "/model"; | |||
public static final String MEMORY = "Memory"; | |||
public static final String DEV_SHM = "/dev/shm"; | |||
public static final String BIN_BASH = "/bin/bash"; | |||
public static final String IPC_LOCK = "IPC_LOCK"; | |||
public static final String RESTART_POLICY_NEVER = "Never"; | |||
/** | |||
* 部署Job | |||
* @param info 资源信息 | |||
* @return | |||
*/ | |||
@Override | |||
public JobBuilder deploy(ChildResourceCreateInfo info) { | |||
//容器 | |||
Container container = buildContainer(info); | |||
//存储卷 | |||
List<Volume> volumes = buildVolumes(info); | |||
//挂载 | |||
List<VolumeMount> volumeMounts = buildVolumeMounts(volumes); | |||
container.setVolumeMounts(volumeMounts); | |||
//启动命令 | |||
container.setCommand(Collections.singletonList(BIN_BASH)); | |||
//训练等待命令 | |||
//一个是等待 pretreatment 文件 通过 podApi 拷贝 到pod上 | |||
//另一个是等待 服务(svc)创建成功 | |||
List<String> cmdLines = Arrays.asList("while [ ! -f /home/pretreatment ]; do echo pretreatment not exist >> pretreatment.log; sleep 1;done && chmod a+x /home/pretreatment && bash /home/pretreatment ", "until nslookup " + info.getSvcName() + "; do sleep 5; done", info.getMasterCmd()); | |||
container.setArgs(Arrays.asList("-c", CollectionUtil.join(cmdLines, " && "))); | |||
//权限 | |||
container.setSecurityContext(new SecurityContextBuilder() | |||
.withAllowPrivilegeEscalation(true) | |||
.withCapabilities(new CapabilitiesBuilder() | |||
.withAdd(Collections.singletonList(IPC_LOCK)) | |||
.build()) | |||
.build()); | |||
//用户自定义的标签 | |||
Map<String,String> customizeLabels = CollectionUtil.isNotEmpty(info.getLabels())? info.getLabels(): new HashMap<>(); | |||
JobBuilder builder = new JobBuilder(); | |||
builder.withNewMetadata() | |||
.withName(info.getJobName()) | |||
.withNamespace(info.getNamespace()) | |||
.addToLabels(KubeConstants.DISTRIBUTE_TRAIN_LABEL, info.getParentName()) | |||
.addToLabels(customizeLabels) | |||
.addToOwnerReferences(info.getOwnerReference()) | |||
.endMetadata() | |||
.withNewSpec() | |||
//并行1个 | |||
.withParallelism(NUMBER_1) | |||
//共计运行1次 | |||
.withCompletions(NUMBER_1) | |||
//失败重试次数 | |||
.withBackoffLimit(KubeConstants.BACKOFFLIMIT) | |||
.withNewTemplate() | |||
.withNewMetadata() | |||
.withName(info.getJobName()) | |||
.addToLabels(KubeConstants.DISTRIBUTE_TRAIN_LABEL, info.getParentName()) | |||
.addToLabels(KubeConstants.JOB_LABEL, info.getJobName()) | |||
.addToLabels(customizeLabels) | |||
.endMetadata() | |||
.withNewSpec() | |||
//关闭指令发出时 立即执行 | |||
.withTerminationGracePeriodSeconds(LONG_NUMBER_0) | |||
.addToContainers(container) | |||
.addToVolumes(volumes.toArray(new Volume[volumes.size()])) | |||
.withRestartPolicy(RESTART_POLICY_NEVER) | |||
.endSpec() | |||
.endTemplate() | |||
.endSpec(); | |||
//init-container | |||
JobBuilder finalBuilder = builder; | |||
Optional.ofNullable(info.getInitContainer()) | |||
.ifPresent(initContainer -> { | |||
finalBuilder.editSpec() | |||
.editTemplate() | |||
.editSpec() | |||
.addToInitContainers(initContainer) | |||
.endSpec() | |||
.endTemplate() | |||
.endSpec(); | |||
}); | |||
//固定节点调度 | |||
if(CollectionUtil.isNotEmpty(info.getNodeSelector())){ | |||
builder = builder.editSpec() | |||
.editTemplate().editSpec() | |||
.addToNodeSelector(info.getNodeSelector()) | |||
.endSpec().endTemplate() | |||
.endSpec(); | |||
} | |||
return builder; | |||
} | |||
/** | |||
* 构建容器 | |||
* @param info 资源信息 | |||
* @return 容器信息 | |||
*/ | |||
private Container buildContainer(ChildResourceCreateInfo info){ | |||
//容器 | |||
Container container = new Container(); | |||
//镜像 | |||
container.setName(KubeConstants.MASTER_CONTAINER_NAME); | |||
container.setImage(info.getImage()); | |||
container.setImagePullPolicy(info.getImagePullPolicy()); | |||
//端口映射 | |||
container.setPorts(Arrays.asList(new ContainerPortBuilder() | |||
.withContainerPort(NUMBER_22) | |||
.withName(SSH).build())); | |||
//环境变量 | |||
List<EnvVar> envVars = Lists.newArrayList(new EnvVarBuilder() | |||
.withName(KubeConstants.ENV_NODE_NUM) | |||
.withValue(String.valueOf(info.getSlaveReplicas() + info.getMasterReplicas())) | |||
.build()); | |||
Optional.ofNullable(info.getEnv()).ifPresent(v -> envVars.addAll(v)); | |||
container.setEnv(envVars); | |||
//资源限制 | |||
Optional.ofNullable(info.getMasterResources()).ifPresent(v->container.setResources(v)); | |||
return container; | |||
} | |||
/** | |||
* 构建存储卷集合 | |||
* @param info 资源信息 | |||
* @return 存储卷集合 | |||
*/ | |||
private List<Volume> buildVolumes(ChildResourceCreateInfo info){ | |||
//存储卷 | |||
List<Volume> volumes = new LinkedList<>(); | |||
Optional.ofNullable(info.getWorkspaceVolume()).ifPresent(v-> volumes.add(v)); | |||
Optional.ofNullable(info.getDatasetVolume()).ifPresent(v-> volumes.add(v)); | |||
Optional.ofNullable(info.getModelVolume()).ifPresent(v-> volumes.add(v)); | |||
//shm默认就有 | |||
volumes.add(new VolumeBuilder() | |||
.withName(KubeConstants.VOLUME_SHM) | |||
.withNewEmptyDir() | |||
.withMedium(MEMORY) | |||
.endEmptyDir() | |||
.build()); | |||
return volumes; | |||
} | |||
/** | |||
* 构建挂载存储卷集合 | |||
* @param volumes 存储卷集合 | |||
* @return 构建挂载存储卷集合 | |||
*/ | |||
private List<VolumeMount> buildVolumeMounts(List<Volume> volumes) { | |||
List<VolumeMount> volumeMounts = new LinkedList<>(); | |||
for (Volume volume : volumes) { | |||
if(PVC_WORKSPACE.equals(volume.getName())){ | |||
volumeMounts.add(new VolumeMountBuilder() | |||
.withName(volume.getName()) | |||
.withMountPath(WORKSPACE) | |||
.build()); | |||
continue; | |||
} | |||
if(PVC_DATASET.equals(volume.getName())){ | |||
volumeMounts.add(new VolumeMountBuilder() | |||
.withName(volume.getName()) | |||
.withMountPath(DATASET) | |||
.build()); | |||
continue; | |||
} | |||
if(PVC_MODEL.equals(volume.getName())){ | |||
volumeMounts.add(new VolumeMountBuilder() | |||
.withName(volume.getName()) | |||
.withMountPath(MODEL) | |||
.build()); | |||
continue; | |||
} | |||
} | |||
volumeMounts.add(new VolumeMountBuilder() | |||
.withName(KubeConstants.VOLUME_SHM) | |||
.withMountPath(DEV_SHM) | |||
.build()); | |||
return volumeMounts; | |||
} | |||
} |
@@ -0,0 +1,73 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.action.deployer.impl; | |||
import cn.hutool.core.collection.CollectionUtil; | |||
import io.fabric8.kubernetes.api.model.IntOrString; | |||
import io.fabric8.kubernetes.api.model.ServiceBuilder; | |||
import org.onebrain.operator.action.deployer.ChildResourceCreateInfo; | |||
import org.onebrain.operator.action.deployer.ServiceDeployer; | |||
import org.onebrain.operator.constants.KubeConstants; | |||
import java.util.Collections; | |||
import java.util.HashMap; | |||
import java.util.Map; | |||
import static org.onebrain.operator.constants.NumberConstant.NUMBER_22; | |||
import static org.onebrain.operator.constants.NumberConstant.NUMBER_30000; | |||
/** | |||
* @description Service部署器 | |||
* @date 2020-09-23 | |||
*/ | |||
public class BaseServiceDeployer implements ServiceDeployer<ChildResourceCreateInfo> { | |||
public static final String WEB_SSH = "web-ssh"; | |||
public static final String NONE = "None"; | |||
/** | |||
* 构建service信息 | |||
* @param info 资源信息 | |||
* @return | |||
*/ | |||
@Override | |||
public ServiceBuilder deploy(ChildResourceCreateInfo info) { | |||
//用户自定义的标签 | |||
Map<String,String> customizeLabels = CollectionUtil.isNotEmpty(info.getLabels())? info.getLabels(): new HashMap<>(); | |||
return new ServiceBuilder() | |||
.withNewMetadata() | |||
.withName(info.getSvcName()) | |||
.addToLabels(KubeConstants.DISTRIBUTE_TRAIN_LABEL, info.getParentName()) | |||
.addToLabels(customizeLabels) | |||
.withNamespace(info.getNamespace()) | |||
.addToOwnerReferences(info.getOwnerReference()) | |||
.endMetadata() | |||
.withNewSpec() | |||
.addNewPort() | |||
.withPort(NUMBER_30000) | |||
.withTargetPort(new IntOrString(NUMBER_22)) | |||
.withName(WEB_SSH) | |||
.endPort() | |||
.withClusterIP(NONE) | |||
//选择带有分布式训练的节点 | |||
.withSelector(Collections.singletonMap(KubeConstants.DISTRIBUTE_TRAIN_LABEL, info.getParentName())) | |||
.endSpec(); | |||
} | |||
} |
@@ -0,0 +1,246 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.action.deployer.impl; | |||
import cn.hutool.core.collection.CollectionUtil; | |||
import com.google.common.collect.ImmutableMap; | |||
import com.google.common.collect.Lists; | |||
import io.fabric8.kubernetes.api.model.CapabilitiesBuilder; | |||
import io.fabric8.kubernetes.api.model.Container; | |||
import io.fabric8.kubernetes.api.model.ContainerPortBuilder; | |||
import io.fabric8.kubernetes.api.model.EnvVar; | |||
import io.fabric8.kubernetes.api.model.EnvVarBuilder; | |||
import io.fabric8.kubernetes.api.model.LabelSelector; | |||
import io.fabric8.kubernetes.api.model.SecurityContextBuilder; | |||
import io.fabric8.kubernetes.api.model.Volume; | |||
import io.fabric8.kubernetes.api.model.VolumeBuilder; | |||
import io.fabric8.kubernetes.api.model.VolumeMount; | |||
import io.fabric8.kubernetes.api.model.VolumeMountBuilder; | |||
import io.fabric8.kubernetes.api.model.apps.StatefulSetBuilder; | |||
import org.onebrain.operator.action.deployer.ChildResourceCreateInfo; | |||
import org.onebrain.operator.action.deployer.StatefulSetDeployer; | |||
import org.onebrain.operator.constants.KubeConstants; | |||
import java.util.Arrays; | |||
import java.util.Collections; | |||
import java.util.HashMap; | |||
import java.util.LinkedList; | |||
import java.util.List; | |||
import java.util.Map; | |||
import java.util.Optional; | |||
import static org.onebrain.operator.constants.NumberConstant.LONG_NUMBER_0; | |||
import static org.onebrain.operator.constants.NumberConstant.LONG_NUMBER_60; | |||
import static org.onebrain.operator.constants.NumberConstant.NUMBER_22; | |||
/** | |||
* @description StatefullSet部署器 | |||
* @date 2020-09-23 | |||
*/ | |||
public class BaseStatefulSetDeployer implements StatefulSetDeployer<ChildResourceCreateInfo> { | |||
public static final String SSH = "ssh"; | |||
public static final String PVC_WORKSPACE = "pvc-workspace"; | |||
public static final String WORKSPACE = "/workspace"; | |||
public static final String PVC_DATASET = "pvc-dataset"; | |||
public static final String DATASET = "/dataset"; | |||
public static final String PVC_MODEL = "pvc-model"; | |||
public static final String MODEL = "/model"; | |||
public static final String MEMORY = "Memory"; | |||
public static final String DEV_SHM = "/dev/shm"; | |||
public static final String BIN_BASH = "/bin/bash"; | |||
public static final String IPC_LOCK = "IPC_LOCK"; | |||
/** | |||
* 生成 StatefullSet 信息 | |||
* @param info 资源信息 | |||
* @return | |||
*/ | |||
@Override | |||
public StatefulSetBuilder deploy(ChildResourceCreateInfo info) { | |||
//标签筛选 | |||
LabelSelector labelSelector = new LabelSelector(); | |||
labelSelector.setMatchLabels(ImmutableMap.of(KubeConstants.STATEFULSET_LABEL, info.getStatefulSetName())); | |||
//存储卷 | |||
List<Volume> volumes = buildVolumes(info); | |||
//容器 | |||
Container container = buildContainer(info); | |||
//挂载 | |||
List<VolumeMount> volumeMounts = buildVolumeMounts(volumes); | |||
container.setVolumeMounts(volumeMounts); | |||
//启动命令 | |||
List<String> cmdLines = Arrays.asList("while [ ! -f /home/pretreatment ]; do echo pretreatment not exist >> pretreatment.log; sleep 1;done && chmod a+x /home/pretreatment && bash /home/pretreatment ", "until nslookup " + info.getSvcName() + "; do sleep 5; done", info.getSlaveCmd()); | |||
container.setCommand(Collections.singletonList(BIN_BASH)); | |||
container.setArgs(Arrays.asList("-c", CollectionUtil.join(cmdLines, " && "))); | |||
//权限 | |||
container.setSecurityContext(new SecurityContextBuilder() | |||
.withAllowPrivilegeEscalation(true) | |||
// .withPrivileged(true) | |||
.withCapabilities(new CapabilitiesBuilder() | |||
.withAdd(Collections.singletonList(IPC_LOCK)) | |||
.build()) | |||
.build()); | |||
//用户自定义的标签 | |||
Map<String,String> customizeLabels = CollectionUtil.isNotEmpty(info.getLabels())? info.getLabels(): new HashMap<>(); | |||
StatefulSetBuilder builder = new StatefulSetBuilder(); | |||
builder.withNewMetadata() | |||
.withName(info.getStatefulSetName()) | |||
.withNamespace(info.getNamespace()) | |||
.addToOwnerReferences(info.getOwnerReference()) | |||
.addToLabels(KubeConstants.DISTRIBUTE_TRAIN_LABEL, info.getParentName()) | |||
.endMetadata() | |||
.withNewSpec() | |||
.withSelector(labelSelector) | |||
.withServiceName(info.getStatefulSetName()) | |||
.withReplicas(info.getSlaveReplicas()) | |||
.withNewTemplate() | |||
.withNewMetadata() | |||
.withName(info.getStatefulSetName()) | |||
.addToLabels(KubeConstants.DISTRIBUTE_TRAIN_LABEL, info.getParentName()) | |||
.addToLabels(KubeConstants.STATEFULSET_LABEL, info.getStatefulSetName()) | |||
.addToLabels(customizeLabels) | |||
.endMetadata() | |||
.withNewSpec() | |||
.withTerminationGracePeriodSeconds(LONG_NUMBER_0) | |||
.withTerminationGracePeriodSeconds(LONG_NUMBER_60) | |||
.addToContainers(container) | |||
.addToVolumes(volumes.toArray(new Volume[0])) | |||
.endSpec() | |||
.endTemplate() | |||
.endSpec(); | |||
//init-container | |||
StatefulSetBuilder finalBuilder = builder; | |||
Optional.ofNullable(info.getInitContainer()) | |||
.ifPresent(initContainer -> { | |||
finalBuilder.editSpec() | |||
.editTemplate() | |||
.editSpec() | |||
.addToInitContainers(initContainer) | |||
.endSpec() | |||
.endTemplate() | |||
.endSpec(); | |||
}); | |||
//固定节点调度 | |||
if(CollectionUtil.isNotEmpty(info.getNodeSelector())){ | |||
builder = builder.editSpec() | |||
.editTemplate().editSpec() | |||
.addToNodeSelector(info.getNodeSelector()) | |||
.endSpec().endTemplate() | |||
.endSpec(); | |||
} | |||
return builder; | |||
} | |||
/** | |||
* 构建容器 | |||
* @param info 资源信息 | |||
* @return 容器信息 | |||
*/ | |||
private Container buildContainer(ChildResourceCreateInfo info) { | |||
Container container = new Container(); | |||
//镜像 | |||
container.setName(KubeConstants.SLAVE_CONTAINER_NAME); | |||
container.setImage(info.getImage()); | |||
container.setImagePullPolicy(info.getImagePullPolicy()); | |||
//端口映射 | |||
container.setPorts(Arrays.asList(new ContainerPortBuilder() | |||
.withContainerPort(NUMBER_22) | |||
.withName(SSH).build())); | |||
//环境变量 | |||
List<EnvVar> envVars = Lists.newArrayList(new EnvVarBuilder() | |||
.withName(KubeConstants.ENV_NODE_NUM) | |||
.withValue(String.valueOf(info.getSlaveReplicas() + info.getMasterReplicas())) | |||
.build()); | |||
Optional.ofNullable(info.getEnv()).ifPresent(v -> envVars.addAll(v)); | |||
container.setEnv(envVars); | |||
//资源限制 | |||
Optional.ofNullable(info.getSlaveResources()).ifPresent(v -> container.setResources(v)); | |||
return container; | |||
} | |||
/** | |||
* 构建存储卷集合 | |||
* @param info 资源信息 | |||
* @return 存储卷集合 | |||
*/ | |||
private List<Volume> buildVolumes(ChildResourceCreateInfo info) { | |||
List<Volume> volumes = buildVolumes(info); | |||
Optional.ofNullable(info.getWorkspaceVolume()).ifPresent(v-> volumes.add(v)); | |||
Optional.ofNullable(info.getDatasetVolume()).ifPresent(v-> volumes.add(v)); | |||
Optional.ofNullable(info.getModelVolume()).ifPresent(v-> volumes.add(v)); | |||
//shm默认就有 | |||
volumes.add(new VolumeBuilder() | |||
.withName(KubeConstants.VOLUME_SHM) | |||
.withNewEmptyDir() | |||
.withMedium(MEMORY) | |||
.endEmptyDir() | |||
.build()); | |||
return volumes; | |||
} | |||
/** | |||
* 构建挂载存储卷集合 | |||
* @param volumes 存储卷集合 | |||
* @return 构建挂载存储卷集合 | |||
*/ | |||
private List<VolumeMount> buildVolumeMounts(List<Volume> volumes) { | |||
List<VolumeMount> volumeMounts=new LinkedList<>(); | |||
for (Volume volume : volumes) { | |||
if(PVC_WORKSPACE.equals(volume.getName())){ | |||
volumeMounts.add(new VolumeMountBuilder() | |||
.withName(volume.getName()) | |||
.withMountPath(WORKSPACE) | |||
.build()); | |||
continue; | |||
} | |||
if(PVC_DATASET.equals(volume.getName())){ | |||
volumeMounts.add(new VolumeMountBuilder() | |||
.withName(volume.getName()) | |||
.withMountPath(DATASET) | |||
.build()); | |||
continue; | |||
} | |||
if(PVC_MODEL.equals(volume.getName())){ | |||
volumeMounts.add(new VolumeMountBuilder() | |||
.withName(volume.getName()) | |||
.withMountPath(MODEL) | |||
.build()); | |||
continue; | |||
} | |||
} | |||
volumeMounts.add(new VolumeMountBuilder() | |||
.withName(KubeConstants.VOLUME_SHM) | |||
.withMountPath(DEV_SHM) | |||
.build()); | |||
return volumeMounts; | |||
} | |||
} |
@@ -0,0 +1,614 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.action.handler; | |||
import cn.hutool.core.collection.CollectionUtil; | |||
import cn.hutool.core.io.FileUtil; | |||
import cn.hutool.core.util.ObjectUtil; | |||
import cn.hutool.core.util.StrUtil; | |||
import com.alibaba.fastjson.JSONArray; | |||
import com.alibaba.fastjson.JSONObject; | |||
import com.google.common.collect.Lists; | |||
import com.google.common.io.Files; | |||
import io.fabric8.kubernetes.api.model.ObjectMeta; | |||
import io.fabric8.kubernetes.api.model.Pod; | |||
import io.fabric8.kubernetes.api.model.Service; | |||
import io.fabric8.kubernetes.api.model.ServiceBuilder; | |||
import io.fabric8.kubernetes.api.model.apps.StatefulSet; | |||
import io.fabric8.kubernetes.api.model.apps.StatefulSetBuilder; | |||
import io.fabric8.kubernetes.api.model.batch.Job; | |||
import io.fabric8.kubernetes.api.model.batch.JobBuilder; | |||
import io.fabric8.kubernetes.client.KubernetesClient; | |||
import lombok.extern.slf4j.Slf4j; | |||
import org.onebrain.operator.action.PodInfo; | |||
import org.onebrain.operator.action.deployer.ChildResourceCreateInfo; | |||
import org.onebrain.operator.action.deployer.JobDeployer; | |||
import org.onebrain.operator.action.deployer.ServiceDeployer; | |||
import org.onebrain.operator.action.deployer.StatefulSetDeployer; | |||
import org.onebrain.operator.action.deployer.impl.BaseJobDeployer; | |||
import org.onebrain.operator.action.deployer.impl.BaseServiceDeployer; | |||
import org.onebrain.operator.action.deployer.impl.BaseStatefulSetDeployer; | |||
import org.onebrain.operator.api.pod.PodApi; | |||
import org.onebrain.operator.constants.KubeConstants; | |||
import org.onebrain.operator.crd.DistributeTrain; | |||
import org.onebrain.operator.crd.DistributeTrainSpec; | |||
import org.onebrain.operator.crd.DistributeTrainStatus; | |||
import org.onebrain.operator.exception.OperatorException; | |||
import org.onebrain.operator.redis.RedisService; | |||
import org.onebrain.operator.redis.key.OperatorKey; | |||
import org.onebrain.operator.utils.DistributeTrainClientHolder; | |||
import org.onebrain.operator.utils.IOUtils; | |||
import org.springframework.beans.factory.annotation.Autowired; | |||
import org.springframework.core.io.ClassPathResource; | |||
import org.springframework.stereotype.Component; | |||
import java.io.File; | |||
import java.io.InputStream; | |||
import java.util.Collections; | |||
import java.util.List; | |||
import java.util.Map; | |||
import java.util.Optional; | |||
import java.util.concurrent.ConcurrentHashMap; | |||
import java.util.concurrent.LinkedBlockingQueue; | |||
import java.util.concurrent.ThreadFactory; | |||
import java.util.concurrent.ThreadPoolExecutor; | |||
import java.util.concurrent.TimeUnit; | |||
import java.util.concurrent.atomic.AtomicInteger; | |||
import static org.onebrain.operator.constants.KubeConstants.CHARSET; | |||
import static org.onebrain.operator.constants.KubeConstants.JOB_LABEL; | |||
import static org.onebrain.operator.constants.KubeConstants.MASTER_CONTAINER_NAME; | |||
import static org.onebrain.operator.constants.KubeConstants.SLAVE_CONTAINER_NAME; | |||
import static org.onebrain.operator.constants.KubeConstants.STATEFULSET_LABEL; | |||
import static org.onebrain.operator.constants.NumberConstant.NUMBER_2; | |||
/** | |||
* @description 分布式训练添加事件的处理器 | |||
* @date 2020-09-23 | |||
*/ | |||
@Component("addActionHandler") | |||
@Slf4j | |||
public class AddActionHandler implements DistributeTrainActionHandler { | |||
public static final String JOB_WATCHER = "job-watcher-"; | |||
public static final String PRETREATMENT = "pretreatment"; | |||
public static final String JOB_NAME = "job-name"; | |||
public static final String RUNNING = "Running"; | |||
public static final String MASTER = "master"; | |||
public static final String SLAVE = "slave"; | |||
public static final String PRETREATMENT_TARGET_DIR = "/home/pretreatment"; | |||
public static final String IP = "ip"; | |||
public static final String ROLE = "role"; | |||
public static final String HOSTFILE_TARGET_DIR = "/home/hostfile.json"; | |||
@Autowired | |||
private KubernetesClient client; | |||
@Autowired | |||
private PodApi podApi; | |||
/** | |||
* String 训练uid List pod信息 | |||
*/ | |||
private Map<String, List<PodInfo>> dtMap = new ConcurrentHashMap(); | |||
@Autowired | |||
private RedisService redis; | |||
/** | |||
* 线程池 | |||
*/ | |||
private ThreadPoolExecutor pool = new ThreadPoolExecutor(5, 10, 10, TimeUnit.SECONDS, new LinkedBlockingQueue<>(1), new ThreadFactory() { | |||
private final AtomicInteger mThreadNum = new AtomicInteger(1); | |||
@Override | |||
public Thread newThread(Runnable r) { | |||
return new Thread(r, JOB_WATCHER + mThreadNum.getAndIncrement()); | |||
} | |||
}, new ThreadPoolExecutor.DiscardOldestPolicy()); | |||
/** | |||
* 处理事件的任务 | |||
*/ | |||
class HandlerActionTask implements Runnable { | |||
private DistributeTrain distributeTrain; | |||
public HandlerActionTask(DistributeTrain distributeTrain) { | |||
this.distributeTrain = distributeTrain; | |||
} | |||
@Override | |||
public void run() { | |||
doAction(distributeTrain); | |||
} | |||
} | |||
/** | |||
* 执行任务动作 | |||
* @param distributeTrain | |||
*/ | |||
public void doAction(DistributeTrain distributeTrain) { | |||
log.info("doAction=>distributeTrain : 【{}】", distributeTrain); | |||
ChildResourceCreateInfo info = null; | |||
try { | |||
//redis重复检查 | |||
//根据k8s 创建DistributionTrain 的uid去重 | |||
if (null != redis.get(OperatorKey.CR, distributeTrain.getMetadata().getUid())) { | |||
log.info("distribute train 【{}】 in namespace 【{}】 already exists", distributeTrain.getMetadata().getName(), distributeTrain.getMetadata().getNamespace()); | |||
return; | |||
} else { | |||
//录入redis做消费记录 | |||
redis.set(OperatorKey.CR, distributeTrain.getMetadata().getUid(), System.currentTimeMillis()); | |||
} | |||
//参数检查,提取并生成所需参数 | |||
validateParams(distributeTrain); | |||
info = ChildResourceCreateInfo.fromCr(distributeTrain); | |||
//按照size,创建副本数为size-1的statefulSet | |||
createStatefulSet(info); | |||
//等待statefulset全部ready | |||
waitUntilStatefulSetReady(info); | |||
//创建job,job此时在死循环 | |||
createJob(info); | |||
//等待job ready | |||
waitUntilJobReady(info); | |||
//复制 /home/pretreatment 到 pod | |||
copyPretreatmentShell(info); | |||
//收集statefulSet和job的ip | |||
validateAndCollectPods(info); | |||
//本地生成公私钥、认证文件,并拷贝到所有节点的~/.ssh目录下 | |||
sshAuthWithoutPass(info); | |||
//本地生成hostfile,并拷贝到所有节点的指定目录下 | |||
generateAndUploadHostFile(info); | |||
//解锁job的死循环 | |||
releaseInterLock(info); | |||
//改状态 | |||
//updateStatus(info, distributeTrain); | |||
//为job注册监听器 | |||
registerJobListener(info); | |||
log.info("all parts of【{}】 are ready", info.getParentName()); | |||
} catch (Exception e) { | |||
log.error("doAction error:【{}】", e); | |||
//移除缓存 | |||
redis.del(OperatorKey.CR, distributeTrain.getMetadata().getUid()); | |||
//回收创建的资源 | |||
if (info != null) { | |||
recycleCr(info); | |||
} | |||
} | |||
} | |||
/** | |||
* 处理分布式训练 | |||
* @param distributeTrain 分布式训练信息 | |||
*/ | |||
@Override | |||
public void handlerAction(DistributeTrain distributeTrain) { | |||
log.info("handlerAction=>distributeTrain : 【{}】", distributeTrain); | |||
HandlerActionTask handlerActionTask = new HandlerActionTask(distributeTrain); | |||
pool.getActiveCount(); | |||
pool.execute(handlerActionTask); | |||
} | |||
/** | |||
* 校验参数合法性 | |||
* @param distributeTrain 分布式训练 | |||
*/ | |||
private void validateParams(DistributeTrain distributeTrain) { | |||
log.info("validateParams=>distributeTrain : 【{}】", distributeTrain); | |||
Integer size = distributeTrain.getSpec().getSize(); | |||
if (size < NUMBER_2) { | |||
throw new OperatorException("size must be greater than 1"); | |||
} | |||
String masterCmd = distributeTrain.getSpec().getMasterCmd(); | |||
String slaveCmd = distributeTrain.getSpec().getSlaveCmd(); | |||
if (StrUtil.isEmpty(slaveCmd) || StrUtil.isEmpty(masterCmd)) { | |||
throw new OperatorException("cmd lines must not be empty"); | |||
} | |||
} | |||
/** | |||
* 拷贝文件pretreatment到pod | |||
* @param info 资源信息 | |||
*/ | |||
private void copyPretreatmentShell(ChildResourceCreateInfo info) { | |||
log.info("start to copy pretreatment for 【{}】 ", info.getParentName()); | |||
try { | |||
String path = System.getProperty(KubeConstants.USER_DIR_SYSTEM_PROPERTY) + File.separator + PRETREATMENT; | |||
if (!FileUtil.exist(path)) { | |||
FileUtil.writeFromStream(new ClassPathResource("/shell/pretreatment").getInputStream(), path); | |||
} | |||
File pretreatment = new File(path); | |||
//上传到pod指定目录 | |||
List<Pod> pods = getPods(info); | |||
for (int i = 0; i < pods.size(); i++) { | |||
Pod pod = pods.get(i); | |||
//默认第一个为master | |||
String containerName = i < 1 ? MASTER_CONTAINER_NAME : SLAVE_CONTAINER_NAME; | |||
podApi.copyToPod(info.getNamespace(), pod.getMetadata().getName(), containerName, pretreatment, PRETREATMENT_TARGET_DIR); | |||
} | |||
} catch (Exception e) { | |||
log.error("copy pretreatment shell error: 【{}】",e); | |||
throw new OperatorException("exception is thrown when copy pretreatment for 【" + info.getParentName() + "】 : \n" + e.getMessage()); | |||
} | |||
} | |||
/** | |||
* 创建statefulSet | |||
* @param info 资源信息 | |||
*/ | |||
private void createStatefulSet(ChildResourceCreateInfo info) { | |||
log.info("createStatefulSet=>childResourceCreateInfo : 【{}】", info); | |||
StatefulSet statefulSet = client.apps().statefulSets() | |||
.inNamespace(info.getNamespace()) | |||
.withName(info.getStatefulSetName()).get(); | |||
//已存在 | |||
if (statefulSet != null) { | |||
log.info("statefulSet 【{}】 already exists", statefulSet.getMetadata().getName()); | |||
return; | |||
} | |||
//不存在,新建 | |||
StatefulSetDeployer deployer = new BaseStatefulSetDeployer(); | |||
StatefulSetBuilder builder = deployer.deploy(info); | |||
statefulSet = builder.build(); | |||
client.apps().statefulSets().create(statefulSet); | |||
log.info("create statefulSet【{}】 successfully", statefulSet.getMetadata().getName()); | |||
} | |||
/** | |||
* 等待statefulSet全部ready | |||
* @param info 资源信息 | |||
*/ | |||
private void waitUntilStatefulSetReady(ChildResourceCreateInfo info) { | |||
log.info("wait for statefulSet 【{}】 in namespace 【{}】 ready", info.getStatefulSetName(), info.getNamespace()); | |||
try { | |||
client.apps().statefulSets() | |||
.inNamespace(info.getNamespace()) | |||
.withName(info.getStatefulSetName()) | |||
//阻塞 直到全部pod Ready 最长阻塞时间2小时 | |||
.waitUntilCondition(c -> | |||
c.getStatus().getReplicas() != null | |||
&& ObjectUtil.equal(c.getStatus().getReplicas(), c.getStatus().getReadyReplicas()), | |||
NUMBER_2, TimeUnit.HOURS); | |||
log.info("statefulSet 【{}】 in namespace 【{}】 is ready", info.getStatefulSetName(), info.getNamespace()); | |||
} catch (Exception e) { | |||
log.error("wait until statefulSet ready error:【{}】", e); | |||
throw new OperatorException("exception is thrown when waiting for statefulSet 【" + info.getStatefulSetName() + "】 ready : \n" + e.getMessage()); | |||
} | |||
} | |||
/** | |||
* 创建job | |||
* @param info Job信息 | |||
*/ | |||
private void createJob(ChildResourceCreateInfo info) { | |||
log.info("createJob=>childResourceCreateInfo : 【{}】", info); | |||
Job job = client.batch().jobs() | |||
.inNamespace(info.getNamespace()) | |||
.withName(info.getJobName()).get(); | |||
//已存在 | |||
if (job != null) { | |||
log.info("job 【{}】 already exists", job.getMetadata().getName()); | |||
return; | |||
} | |||
//不存在,新建 | |||
JobDeployer deployer = new BaseJobDeployer(); | |||
JobBuilder builder = deployer.deploy(info); | |||
job = builder.build(); | |||
log.info("job is : 【{}】", job); | |||
client.batch().jobs().create(job); | |||
log.info("create job【{}】 successfully", job.getMetadata().getName()); | |||
} | |||
/** | |||
* 等待job全部ready | |||
* @param info 资源信息 | |||
*/ | |||
private void waitUntilJobReady(ChildResourceCreateInfo info) { | |||
log.info("wait for job 【{}】 in namespace 【{}】 ready", info.getStatefulSetName(), info.getNamespace()); | |||
try { | |||
List<Pod> podList = client.pods().inNamespace(info.getNamespace()) | |||
.withLabel(JOB_NAME, info.getJobName()) | |||
.list().getItems(); | |||
while (CollectionUtil.isEmpty(podList)) { | |||
TimeUnit.SECONDS.sleep(2); | |||
podList = client.pods().inNamespace(info.getNamespace()) | |||
.withLabel(JOB_NAME, info.getJobName()) | |||
.list().getItems(); | |||
} | |||
Pod pod = podList.get(0); | |||
client.pods().inNamespace(info.getNamespace()) | |||
.withName(pod.getMetadata().getName()) | |||
//等待直到Ready状态 最长2小时 | |||
.waitUntilReady(2, TimeUnit.HOURS); | |||
log.info("job 【{}】 in namespace 【{}】 is ready", info.getJobName(), info.getNamespace()); | |||
} catch (Exception e) { | |||
log.info(e.getMessage(), e); | |||
throw new OperatorException("exception is thrown when waiting for job 【" + info.getJobName() + "】 ready : \n" + e.getMessage()); | |||
} | |||
} | |||
/** | |||
* 收集资源的podInfo | |||
* @param info 资源信息 | |||
*/ | |||
private void validateAndCollectPods(ChildResourceCreateInfo info) { | |||
//检查是否都在正常运行 | |||
log.info("validate pods status for 【{}】", info.getParentName()); | |||
boolean isAllSlaveRunning = true; | |||
boolean isMasterRunning = true; | |||
Pod masterPod = null; | |||
List<Pod> slavePods = null; | |||
do { | |||
//取得主的pod | |||
masterPod = getMasterPod(info); | |||
//取得从的所有pod | |||
slavePods = getSlavePods(info); | |||
if (masterPod == null) { | |||
log.info("can not find pod belongs to job 【{}】", info.getJobName()); | |||
return; | |||
} | |||
if (CollectionUtil.isEmpty(slavePods)) { | |||
log.info("can not find pod belongs to statefulSet 【{}】", info.getStatefulSetName()); | |||
return; | |||
} | |||
isMasterRunning = RUNNING.equals(masterPod.getStatus().getPhase()); | |||
isAllSlaveRunning = true; | |||
for (Pod slavePod : slavePods) { | |||
boolean isSlaveRunning = RUNNING.equals(slavePod.getStatus().getPhase()); | |||
if (!isSlaveRunning) { | |||
isAllSlaveRunning = false; | |||
break; | |||
} | |||
} | |||
} while (!(isMasterRunning && isAllSlaveRunning)); | |||
log.info("status checked 【{}】 all right", info.getParentName()); | |||
collectChildPodInfo(info, masterPod, slavePods); | |||
} | |||
/** | |||
* 收集pod基本信息 | |||
* @param info 资源信息 | |||
* @param masterPod | |||
* @param slavePods | |||
*/ | |||
private void collectChildPodInfo(ChildResourceCreateInfo info, Pod masterPod, List<Pod> slavePods) { | |||
log.info("collectChildPodInfo=>childResourceCreateInfo : 【{}】, masterPod : 【{}】, slavePods : 【{}】", info, masterPod, slavePods); | |||
String key = info.getOwnerReference().getUid(); | |||
if (dtMap.containsKey(key)) { | |||
dtMap.remove(key); | |||
} | |||
List<PodInfo> podInfos = Lists.newArrayList(); | |||
PodInfo masterPodInfo = PodInfo.builder() | |||
.ip(masterPod.getStatus().getPodIP()) | |||
.role(MASTER) | |||
.build(); | |||
podInfos.add(masterPodInfo); | |||
for (Pod slavePod : slavePods) { | |||
PodInfo slavePodInfo = PodInfo.builder() | |||
.ip(slavePod.getStatus().getPodIP()) | |||
.role(SLAVE) | |||
.build(); | |||
podInfos.add(slavePodInfo); | |||
} | |||
dtMap.put(key, podInfos); | |||
} | |||
/** | |||
* ssh免密互通相关配置 | |||
* @param info 资源信息 | |||
*/ | |||
private void sshAuthWithoutPass(ChildResourceCreateInfo info) { | |||
log.info("start to configure ssh no password environment for 【{}】 ", info.getParentName()); | |||
File tempDir = Files.createTempDir(); | |||
try ( | |||
InputStream isRsa = getClass().getClassLoader().getResourceAsStream("key/id_rsa"); | |||
InputStream isRsaPub = getClass().getClassLoader().getResourceAsStream("key/id_rsa.pub") | |||
) { | |||
//id_rsa | |||
File tempIdRsa = FileUtil.createTempFile(tempDir); | |||
IOUtils.copy(isRsa, tempIdRsa); | |||
//id_rsa.pub | |||
File tempIdRsaPub = FileUtil.createTempFile(tempDir); | |||
IOUtils.copy(isRsaPub, tempIdRsaPub); | |||
List<String> pubLines = FileUtil.readLines(tempIdRsaPub, CHARSET); | |||
String pubKeyContent = pubLines.get(0); | |||
//按机器修改id_rsa.pub, 并组装一个大而全的authorized_keys | |||
List<File> idRsaPubFiles = Lists.newArrayList(); | |||
File tempAuthorizedKeys = FileUtil.createTempFile(tempDir); | |||
List<String> pubKeys = Lists.newArrayList(); | |||
for (PodInfo podInfo : dtMap.get(info.getOwnerReference().getUid())) { | |||
String podPubKeyContent = pubKeyContent.replace("{{ip}}", podInfo.getIp()); | |||
File tempIdRsaPubOnPod = FileUtil.createTempFile(tempDir); | |||
FileUtil.writeLines(Collections.singletonList(podPubKeyContent), tempIdRsaPubOnPod, CHARSET); | |||
idRsaPubFiles.add(tempIdRsaPubOnPod); | |||
pubKeys.add(podPubKeyContent); | |||
} | |||
FileUtil.writeLines(pubKeys, tempAuthorizedKeys, CHARSET); | |||
//获得所有pod, 上传三个文件 | |||
List<Pod> pods = getPods(info); | |||
for (int i = 0; i < pods.size(); i++) { | |||
Pod pod = pods.get(i); | |||
String containerName = i < 1 ? MASTER_CONTAINER_NAME : SLAVE_CONTAINER_NAME; | |||
//上传id_rsa | |||
podApi.copyToPod(info.getNamespace(), pod.getMetadata().getName(), containerName, tempIdRsa, "/root/.ssh/id_rsa"); | |||
//上传id_rsa.pub | |||
File tempIdRsaPubOnPod = idRsaPubFiles.get(i); | |||
podApi.copyToPod(info.getNamespace(), pod.getMetadata().getName(), containerName, tempIdRsaPubOnPod, "/root/.ssh/id_rsa.pub"); | |||
//上传authorized_keys | |||
podApi.copyToPod(info.getNamespace(), pod.getMetadata().getName(), containerName, tempAuthorizedKeys, "/root/.ssh/authorized_keys"); | |||
//修改权限 | |||
String chmodCmd = StrUtil.format("chmod 644 /root/.ssh/authorized_keys && chmod 600 /root/.ssh/id_rsa && chmod 644 /root/.ssh/id_rsa.pub"); | |||
podApi.exec(info.getNamespace(), pod.getMetadata().getName(), containerName, chmodCmd); | |||
} | |||
log.info("configure ssh no password environment for 【{}】 successfully ", info.getParentName()); | |||
} catch (Exception e) { | |||
log.error("sshAuthWithoutPass error:【{}】", e); | |||
throw new OperatorException("exception is thrown when configure ssh no password environment for 【" + info.getParentName() + "】 : \n" + e.getMessage()); | |||
} finally { | |||
//清理临时文件 | |||
FileUtil.del(tempDir); | |||
} | |||
} | |||
/** | |||
* 生成并上传hostfile | |||
* @param info 资源信息 | |||
*/ | |||
private void generateAndUploadHostFile(ChildResourceCreateInfo info) { | |||
log.info("start to configure hostfile for 【{}】 ", info.getParentName()); | |||
File tempDir = Files.createTempDir(); | |||
try { | |||
//生成hostfile | |||
JSONArray jsonArray = new JSONArray(); | |||
List<PodInfo> podInfos = dtMap.get(info.getOwnerReference().getUid()); | |||
for (PodInfo podInfo : podInfos) { | |||
JSONObject podJson = new JSONObject(); | |||
podJson.put(IP, podInfo.getIp()); | |||
podJson.put(ROLE, podInfo.getRole()); | |||
jsonArray.add(podJson); | |||
} | |||
File tempHostFile = FileUtil.createTempFile(tempDir); | |||
FileUtil.writeLines(Collections.singletonList(jsonArray.toJSONString()), tempHostFile, CHARSET); | |||
//上传到pod指定目录 | |||
List<Pod> pods = getPods(info); | |||
for (int i = 0; i < pods.size(); i++) { | |||
Pod pod = pods.get(i); | |||
String containerName = i < 1 ? MASTER_CONTAINER_NAME : SLAVE_CONTAINER_NAME; | |||
podApi.copyToPod(info.getNamespace(), pod.getMetadata().getName(), containerName, tempHostFile, HOSTFILE_TARGET_DIR); | |||
} | |||
} catch (Exception e) { | |||
log.error("generateAndUploadHostFile error:【{}】", e); | |||
throw new OperatorException("exception is thrown when generate and upload hostfile for 【" + info.getParentName() + "】 : \n" + e.getMessage()); | |||
} finally { | |||
//清理临时文件 | |||
FileUtil.del(tempDir); | |||
} | |||
} | |||
/** | |||
* 创建service 解除闭锁 | |||
* @param info | |||
*/ | |||
private void releaseInterLock(ChildResourceCreateInfo info) { | |||
log.info("release lock for 【{}】", info.getParentName()); | |||
ServiceDeployer deployer = new BaseServiceDeployer(); | |||
ServiceBuilder builder = deployer.deploy(info); | |||
Service svc = builder.build(); | |||
client.services().create(svc); | |||
log.info("lock for 【{}】 released", info.getParentName()); | |||
} | |||
/** | |||
* 回收cr | |||
* @param info | |||
*/ | |||
private void recycleCr(ChildResourceCreateInfo info) { | |||
log.info("recycleCr=>childResourceCreateInfo : 【{}】", info); | |||
Optional.ofNullable(DistributeTrainClientHolder.getClient()) | |||
.ifPresent(distributeTrainClient -> { | |||
ObjectMeta metadata = new ObjectMeta(); | |||
metadata.setName(info.getParentName()); | |||
metadata.setNamespace(info.getNamespace()); | |||
DistributeTrain dt = new DistributeTrain(metadata, DistributeTrainSpec.builder() | |||
.build()); | |||
distributeTrainClient.delete(dt); | |||
log.info("recycle distribute train 【{}】", info.getParentName()); | |||
}); | |||
} | |||
/**更新状态*/ | |||
private void updateStatus(ChildResourceCreateInfo info, DistributeTrain distributeTrain) { | |||
log.info("updateStatus=>childResourceCreateInfo : 【{}】, distributeTrain : 【{}】", info, distributeTrain); | |||
if (distributeTrain.getStatus() == null) { | |||
distributeTrain.setStatus(new DistributeTrainStatus()); | |||
} | |||
Integer size = distributeTrain.getSpec().getSize(); | |||
distributeTrain.getStatus().setReplicas(size); | |||
distributeTrain.getStatus().setReadyReplicas(size); | |||
} | |||
/** | |||
* Placeholder for registering a listener on the job of a distributed training. | |||
* At the moment it only writes a log line; the watch registration below is commented out. | |||
* @param info resource information; only its parent name is read here | |||
*/ | |||
private void registerJobListener(ChildResourceCreateInfo info) { | |||
log.info("register listener for distribute train 【{}】", info.getParentName()); | |||
// TODO: the job watch is disabled; the original registration is kept below for reference. | |||
// client.batch().jobs() | |||
// .inNamespace(info.getNamespace()) | |||
// .withName(info.getJobName()).watch(null); | |||
} | |||
/** | |||
* 获取所有分布式训练相关的pod | |||
* @param info | |||
* @return List<Pod> 分布式相关Pod集合 | |||
*/ | |||
private List<Pod> getPods(ChildResourceCreateInfo info) { | |||
log.info("getPods=>childResourceCreateInfo : 【{}】", info); | |||
List<Pod> pods = Lists.newArrayList(); | |||
pods.add(getMasterPod(info)); | |||
pods.addAll(getSlavePods(info)); | |||
if (CollectionUtil.hasNull(pods) || pods.size() != info.getSlaveReplicas() + 1) { | |||
throw new OperatorException("can not get pods in correct numbers"); | |||
} | |||
return pods; | |||
} | |||
/** | |||
* 获取master信息 | |||
* @param info 资源信息 | |||
* @return Pod Master节点对应的Pod | |||
*/ | |||
private Pod getMasterPod(ChildResourceCreateInfo info) { | |||
log.info("getMasterPod=>childResourceCreateInfo : 【{}】", info); | |||
List<Pod> masterPods = client.pods().inNamespace(info.getNamespace()) | |||
.withLabel(JOB_LABEL, info.getJobName()) | |||
.list().getItems(); | |||
if (CollectionUtil.isEmpty(masterPods)) { | |||
return null; | |||
} | |||
return masterPods.get(0); | |||
} | |||
/** | |||
* 取得从的所有pod | |||
* @param info 资源信息 | |||
* @return List<Pod> Slave节点对应的Pod集合 | |||
*/ | |||
private List<Pod> getSlavePods(ChildResourceCreateInfo info) { | |||
log.info("getSlavePods=>childResourceCreateInfo : 【{}】", info); | |||
//取得从的所有pod | |||
List<Pod> slavePods = client.pods().inNamespace(info.getNamespace()) | |||
.withLabel(STATEFULSET_LABEL, info.getStatefulSetName()) | |||
.list().getItems(); | |||
if (CollectionUtil.isEmpty(slavePods)) { | |||
return null; | |||
} | |||
return slavePods; | |||
} | |||
} |
@@ -0,0 +1,88 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.action.handler; | |||
import cn.hutool.core.collection.CollectionUtil; | |||
import io.fabric8.kubernetes.api.model.Service; | |||
import io.fabric8.kubernetes.api.model.ServiceList; | |||
import io.fabric8.kubernetes.api.model.apps.StatefulSet; | |||
import io.fabric8.kubernetes.api.model.apps.StatefulSetList; | |||
import io.fabric8.kubernetes.api.model.batch.Job; | |||
import io.fabric8.kubernetes.api.model.batch.JobList; | |||
import io.fabric8.kubernetes.client.KubernetesClient; | |||
import lombok.extern.slf4j.Slf4j; | |||
import org.onebrain.operator.constants.KubeConstants; | |||
import org.onebrain.operator.crd.DistributeTrain; | |||
import org.onebrain.operator.redis.RedisService; | |||
import org.onebrain.operator.redis.key.OperatorKey; | |||
import org.springframework.beans.factory.annotation.Autowired; | |||
import org.springframework.stereotype.Component; | |||
/** | |||
* @description 删除事件的处理器 | |||
* @date 2020-09-23 | |||
*/ | |||
@Component("deleteActionHandler") | |||
@Slf4j | |||
public class DeleteActionHandler implements DistributeTrainActionHandler { | |||
@Autowired | |||
private KubernetesClient client; | |||
@Autowired | |||
private RedisService redis; | |||
/** | |||
* 处理删除事件 | |||
* @param distributeTrain 分布式训练信息 | |||
*/ | |||
@Override | |||
public void handlerAction(DistributeTrain distributeTrain) { | |||
log.info("handlerAction=>distributeTrain : 【{}】", distributeTrain); | |||
String namespace = distributeTrain.getMetadata().getNamespace(); | |||
String parentName = distributeTrain.getMetadata().getName(); | |||
// namespace+parentName(分布式训练名称) 确定相应的资源 | |||
//删除job | |||
JobList jobList = client.batch().jobs().inNamespace(namespace).withLabel(KubeConstants.DISTRIBUTE_TRAIN_LABEL, parentName).list(); | |||
if(CollectionUtil.isNotEmpty(jobList.getItems())){ | |||
for (Job item : jobList.getItems()) { | |||
client.batch().jobs().delete(item); | |||
} | |||
log.info("delete job in distributeTrain 【{}】", parentName); | |||
} | |||
//删除statefullSete | |||
StatefulSetList statefulSetList = client.apps().statefulSets().inNamespace(namespace).withLabel(KubeConstants.DISTRIBUTE_TRAIN_LABEL, parentName).list(); | |||
if(CollectionUtil.isNotEmpty(statefulSetList.getItems())){ | |||
for (StatefulSet item : statefulSetList.getItems()) { | |||
client.apps().statefulSets().delete(item); | |||
} | |||
log.info("delete statefulSet in distributeTrain 【{}】", parentName); | |||
} | |||
//删除service | |||
ServiceList svcList = client.services().inNamespace(namespace).withLabel(KubeConstants.DISTRIBUTE_TRAIN_LABEL, parentName).list(); | |||
if(CollectionUtil.isNotEmpty(svcList.getItems())){ | |||
for (Service item : svcList.getItems()) { | |||
client.services().delete(item); | |||
} | |||
log.info("delete svc in distributeTrain 【{}】", parentName); | |||
} | |||
//删除redis里记录的分布式训练信息 | |||
redis.del(OperatorKey.CR, distributeTrain.getMetadata().getUid()); | |||
log.info("delete distributeTrain 【{}】 successfully", parentName); | |||
} | |||
} |
@@ -0,0 +1,33 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.action.handler; | |||
import org.onebrain.operator.crd.DistributeTrain; | |||
/** | |||
* @description Event handler for distributed training | |||
* @date 2020-09-23 | |||
*/ | |||
public interface DistributeTrainActionHandler { | |||
/** | |||
* Handles the corresponding event | |||
* @param distributeTrain distributed training information | |||
*/ | |||
void handlerAction(DistributeTrain distributeTrain); | |||
} | |||
@@ -0,0 +1,85 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.api.pod; | |||
import io.fabric8.kubernetes.client.dsl.ExecListener; | |||
import lombok.Getter; | |||
import lombok.extern.slf4j.Slf4j; | |||
import okhttp3.Response; | |||
import java.util.concurrent.CountDownLatch; | |||
/** | |||
* @description 默认命令执行监听器 | |||
* @date 2020-09-23 | |||
*/ | |||
@Slf4j | |||
@Getter | |||
public class DefaultPodExecListener implements ExecListener { | |||
/** | |||
* pod名称 | |||
*/ | |||
private String podName; | |||
/** | |||
* 命名空间 | |||
*/ | |||
private String namespace; | |||
/** | |||
* 容器名称 | |||
*/ | |||
private String containerName; | |||
/** | |||
* 执行门栓 线程通信用 | |||
*/ | |||
private CountDownLatch execLatch; | |||
public DefaultPodExecListener(String podName, String namespace, String containerName, CountDownLatch execLatch) { | |||
this.podName = podName; | |||
this.namespace = namespace; | |||
this.containerName = containerName; | |||
this.execLatch = execLatch; | |||
} | |||
@Override | |||
public void onOpen(Response response) { | |||
log.debug("shell environment in pod '{}', namespace '{}' is opened", podName, namespace); | |||
log.debug("onOpen: {}", response); | |||
} | |||
@Override | |||
public void onFailure(Throwable t, Response response) { | |||
log.error("shell environment in pod '{}', namespace '{}' barfed", podName, namespace); | |||
log.error("onFailure: {} {}", t.getMessage(), response); | |||
if (execLatch != null) { | |||
execLatch.countDown(); | |||
} | |||
} | |||
@Override | |||
public void onClose(int code, String reason) { | |||
log.debug("shell environment in pod '{}', namespace '{}' closed", podName, namespace); | |||
log.debug("onClose: {} {}", code, reason); | |||
if (execLatch != null) { | |||
execLatch.countDown(); | |||
} | |||
} | |||
} |
@@ -0,0 +1,177 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.api.pod; | |||
import cn.hutool.core.util.StrUtil; | |||
import io.fabric8.kubernetes.client.KubernetesClient; | |||
import io.fabric8.kubernetes.client.dsl.ExecWatch; | |||
import lombok.extern.slf4j.Slf4j; | |||
import org.apache.commons.io.FileUtils; | |||
import org.onebrain.operator.context.KubeContext; | |||
import org.springframework.beans.factory.annotation.Autowired; | |||
import org.springframework.stereotype.Component; | |||
import java.io.File; | |||
import java.io.IOException; | |||
import java.io.PipedInputStream; | |||
import java.io.PipedOutputStream; | |||
import java.util.concurrent.CountDownLatch; | |||
import java.util.concurrent.atomic.AtomicBoolean; | |||
/** | |||
* | |||
* @description PodApi 操作pod 里的容器用于上传文件等操作吧 | |||
* @date 2020-09-23 | |||
*/ | |||
@Component | |||
@Slf4j | |||
public class PodApi { | |||
private static final Integer DEFAULT_LOG_LINES = 50; | |||
@Autowired | |||
private KubeContext kubeContext; | |||
@Autowired | |||
private KubernetesClient client; | |||
/** | |||
* 从Pod下载单个文件 | |||
* @return File 临时文件,用后需要及时清理 | |||
* **/ | |||
public File copyFileFromPod(String namespace, String podName, String containerName, String filePath){ | |||
try { | |||
File tmpFile = File.createTempFile("copy-from-pod-", ""); | |||
client.pods().inNamespace(namespace).withName(podName) | |||
.inContainer(containerName) | |||
.file(filePath) | |||
.copy(tmpFile.toPath()); | |||
if(tmpFile.length() == 0){ | |||
return null; | |||
} | |||
return tmpFile; | |||
} catch (IOException e) { | |||
log.error(" File copy error : 【{}】",e); | |||
} | |||
return null; | |||
} | |||
/** | |||
* 从Pod下载目录 | |||
* @return File 临时文件,用后需要及时清理 | |||
* **/ | |||
public File copyFolderFromPod(String namespace, String podName, String containerName, String folderPath){ | |||
final PipedInputStream stdoutInput = new PipedInputStream(); | |||
final PipedOutputStream stdoutOutput = new PipedOutputStream(); | |||
final PipedInputStream stderrInput = new PipedInputStream(); | |||
final PipedOutputStream stderrOutput = new PipedOutputStream(); | |||
final AtomicBoolean failed = new AtomicBoolean(false); | |||
try { | |||
stdoutInput.connect(stdoutOutput); | |||
stderrInput.connect(stderrOutput); | |||
//去除路径上的/前缀 | |||
if(folderPath.startsWith(StrUtil.SLASH)){ | |||
folderPath = StrUtil.removePrefix(folderPath, StrUtil.SLASH); | |||
} | |||
//监听器异步执行 | |||
DefaultPodExecListener defaultPodExecListener = new DefaultPodExecListener(podName, namespace, containerName, null); | |||
StdPodExecListener stdPodExecListener = new StdPodExecListener(defaultPodExecListener, stdoutOutput, stderrOutput, failed); | |||
ExecWatch watch = client.pods().inNamespace(namespace) | |||
.withName(podName).inContainer(containerName) | |||
.writingOutput(stdoutOutput).writingError(stderrOutput) | |||
.usingListener(stdPodExecListener) | |||
.exec("tar", "cf", "-", "-C", folderPath, "."); | |||
// execLatch.await(); | |||
} catch (IOException e) { | |||
log.error("copyFolderFromPod:【{}】",e); | |||
} | |||
File tmpFile = null; | |||
try { | |||
tmpFile = File.createTempFile("copy-from-pod-", ".tar"); | |||
int length; | |||
byte[] buffer = new byte[1024]; | |||
while (!Thread.currentThread().isInterrupted() | |||
&& (length = stdoutInput.read(buffer)) != -1) { | |||
byte[] content = new byte[length]; | |||
System.arraycopy(buffer, 0, content, 0, length); | |||
FileUtils.writeByteArrayToFile(tmpFile, content, true); | |||
} | |||
while (!Thread.currentThread().isInterrupted() | |||
&& (length = stderrInput.read(buffer)) != -1) { | |||
log.error(new String(buffer, 0, length)); | |||
} | |||
} catch (IOException e) { | |||
if (!Thread.currentThread().isInterrupted()) { | |||
log.error("Error while pumping stream. 【{}】", e); | |||
} else { | |||
log.error("Interrupted while pumping stream. 【{}】", e); | |||
} | |||
} | |||
return tmpFile; | |||
} | |||
/** | |||
* 拷贝文件到pod | |||
* @param namespace 命名空间 | |||
* @param podName pod名称 | |||
* @param containerName 容器名称 | |||
* @param file 文件 | |||
* @param targetDir 目标路径 | |||
*/ | |||
public void copyToPod(String namespace, String podName, String containerName, File file, String targetDir){ | |||
client.pods().inNamespace(namespace).withName(podName) | |||
.inContainer(containerName) | |||
.file(targetDir) | |||
.upload(file.toPath()); | |||
} | |||
/** | |||
* 同步执行 | |||
* @param namespace 命名空间 | |||
* @param podName pod名称 | |||
* @param containerName 容器名称 | |||
* @param cmd 命令 | |||
*/ | |||
public void exec(String namespace, String podName, String containerName, String cmd){ | |||
try { | |||
final CountDownLatch execLatch = new CountDownLatch(1); | |||
ExecWatch execWatch = client.pods().inNamespace(namespace).withName(podName).inContainer(containerName) | |||
.redirectingOutput() | |||
.withTTY() //不展示输出 | |||
.usingListener(new DefaultPodExecListener(namespace, podName, containerName, execLatch)) | |||
.exec("sh", "-c", cmd); | |||
execLatch.await(); | |||
} catch (InterruptedException e) { | |||
log.error(" PodApi execute cmd error : 【{}】",e); | |||
} | |||
} | |||
} |
@@ -0,0 +1,83 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.api.pod; | |||
import io.fabric8.kubernetes.client.dsl.ExecListener; | |||
import lombok.extern.slf4j.Slf4j; | |||
import okhttp3.Response; | |||
import java.io.IOException; | |||
import java.io.PipedOutputStream; | |||
import java.util.concurrent.atomic.AtomicBoolean; | |||
/** | |||
* @description 标准pod执行监听器 | |||
* @date 2020-09-23 | |||
*/ | |||
@Slf4j | |||
public class StdPodExecListener implements ExecListener { | |||
private ExecListener defaultExecListener; | |||
private PipedOutputStream stdoutOutput; | |||
private PipedOutputStream stderrOutput; | |||
private AtomicBoolean failed; | |||
public StdPodExecListener(ExecListener defaultExecListener, PipedOutputStream stdoutOutput, PipedOutputStream stderrOutput, AtomicBoolean failed) { | |||
this.defaultExecListener = defaultExecListener; | |||
this.stdoutOutput = stdoutOutput; | |||
this.stderrOutput = stderrOutput; | |||
this.failed = failed; | |||
} | |||
@Override | |||
public void onOpen(Response response) { | |||
log.info("onOpen=>response : 【{}】",response); | |||
defaultExecListener.onOpen(response); | |||
} | |||
@Override | |||
public void onFailure(Throwable t, Response response) { | |||
log.info("onFailure=> t :【{}】,response : 【{}】",t,response); | |||
try { | |||
failed.set(true); | |||
stdoutOutput.close(); | |||
stderrOutput.close(); | |||
} catch (IOException e) { | |||
log.error("Failed to close stdout and stderr pipes. 【{}】", e); | |||
} finally { | |||
defaultExecListener.onFailure(t, response); | |||
} | |||
} | |||
@Override | |||
public void onClose(int code, String reason) { | |||
log.info("onClose=>code : 【{}】,reason : 【{}】",code,reason); | |||
try { | |||
stdoutOutput.close(); | |||
stderrOutput.close(); | |||
} catch (IOException e) { | |||
log.error("Failed to close stdout and stderr pipes. 【{}】", e); | |||
} finally { | |||
defaultExecListener.onClose(code, reason); | |||
} | |||
} | |||
} |
@@ -0,0 +1,66 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.config; | |||
import cn.hutool.core.util.StrUtil; | |||
import io.fabric8.kubernetes.client.KubernetesClient; | |||
import org.onebrain.operator.context.KubeContext; | |||
import org.onebrain.operator.properties.KubeProperties; | |||
import org.springframework.beans.factory.annotation.Autowired; | |||
import org.springframework.boot.context.properties.EnableConfigurationProperties; | |||
import org.springframework.context.annotation.Bean; | |||
import org.springframework.context.annotation.Configuration; | |||
/** | |||
* @description k8s配置类 | |||
* @date 2020-09-23 | |||
*/ | |||
@Configuration | |||
@EnableConfigurationProperties(KubeProperties.class) | |||
public class KubeConfig { | |||
@Autowired | |||
private KubeProperties kubeProperties; | |||
/** | |||
* 注册k8s配置 | |||
* @return | |||
*/ | |||
@Bean | |||
public KubeContext kubeContext() { | |||
if (kubeProperties == null) { | |||
return null; | |||
} | |||
final String configSource = kubeProperties.getKubeconfig(); | |||
if(StrUtil.isEmpty(configSource)){ | |||
return null; | |||
} | |||
return new KubeContext(kubeProperties); | |||
} | |||
/** | |||
* 注册k8s客户端 | |||
* @param kubeContext k8s配置 | |||
* @return | |||
*/ | |||
@Bean | |||
public KubernetesClient kubernetesClient(KubeContext kubeContext){ | |||
return kubeContext.getClient(); | |||
} | |||
} |
@@ -0,0 +1,34 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.constants; | |||
/**
 * Constants describing the DistributeTrain custom resource definition (CRD).
 *
 * @date 2020-09-23
 */
public class CrdConstants {

    /** API group of the CRD. */
    public static final String CRD_GROUP = "onebrain.oneflow.org";

    /** Singular resource name. */
    public static final String CRD_SINGULAR_NAME = "distributetrain";

    /** Plural resource name. */
    public static final String CRD_PLURAL_NAME = "distributetrains";

    /** Full CRD name, built as {@code <plural>.<group>}. */
    public static final String CRD_NAME = CRD_PLURAL_NAME + "." + CRD_GROUP;

    /** Resource kind. */
    public static final String CRD_KIND = "DistributeTrain";

    /** Scope of the resource. */
    public static final String CRD_SCOPE = "Namespaced";

    /** Short name of the resource. */
    public static final String CRD_SHORT_NAME = "dt";

    /** Version of the custom resource. */
    public static final String CRD_VERSION = "v1alpha1";

    /** API version string kept for the CRD object itself. */
    public static final String CRD_API_VERSION = "apiextensions.k8s.io/v1beta1";
}
@@ -0,0 +1,40 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.constants; | |||
/**
 * Kubernetes-related constants used by the operator.
 *
 * @date 2020-09-23
 */
public class KubeConstants {

    // Label keys, going by the constant names.
    public static final String DISTRIBUTE_TRAIN_LABEL = "dt-name";
    public static final String STATEFULSET_LABEL = "dt-ss-name";
    public static final String JOB_LABEL = "dt-job-name";

    // Container names for the master and slave roles.
    public static final String MASTER_CONTAINER_NAME = "distribute-train-master";
    public static final String SLAVE_CONTAINER_NAME = "distribute-train-slave";

    /** Name of the JVM system property holding the working directory. */
    public static final String USER_DIR_SYSTEM_PROPERTY = "user.dir";

    /** Back-off limit of zero: retries are not allowed. */
    public static final Integer BACKOFFLIMIT = 0;

    /** Charset name. */
    public static final String CHARSET = "utf-8";

    /** Name of the environment variable carrying the node count. */
    public static final String ENV_NODE_NUM = "NODE_NUM";

    /** Volume name; presumably the shared-memory volume (verify against usage). */
    public static final String VOLUME_SHM = "dshm";
}
@@ -0,0 +1,43 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.constants; | |||
/**
 * Numeric constants.
 *
 * @date 2020-6-9
 */
public class NumberConstant {

    // Plain numbers.
    public static final int NUMBER_0 = 0;
    public static final long LONG_NUMBER_0 = 0L;
    public static final int NUMBER_1 = 1;
    public static final int NUMBER_2 = 2;
    public static final int NUMBER_3 = 3;
    public static final int NUMBER_5 = 5;
    public static final int NUMBER_10 = 10;
    public static final int NUMBER_22 = 22;
    public static final int NUMBER_30 = 30;
    public static final int NUMBER_50 = 50;
    public static final int NUMBER_60 = 60;
    public static final long LONG_NUMBER_60 = 60L;

    /** Seconds in one hour. */
    public static final int HOUR_SECOND = NUMBER_60 * NUMBER_60;

    /** Seconds in one day. */
    public static final int DAY_SECOND = HOUR_SECOND * 24;

    /** Seconds in one week. */
    public static final int WEEK_SECOND = DAY_SECOND * 7;

    /** Maximum page size. */
    public static final int MAX_PAGE_SIZE = 2000;

    public static final int NUMBER_30000 = 30000;
}
@@ -0,0 +1,117 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.context; | |||
import cn.hutool.core.util.StrUtil; | |||
import com.fasterxml.jackson.core.JsonProcessingException; | |||
import io.fabric8.kubernetes.api.model.HasMetadata; | |||
import io.fabric8.kubernetes.client.Config; | |||
import io.fabric8.kubernetes.client.DefaultKubernetesClient; | |||
import io.fabric8.kubernetes.client.KubernetesClient; | |||
import io.fabric8.kubernetes.client.VersionInfo; | |||
import io.fabric8.kubernetes.client.internal.SerializationUtils; | |||
import io.fabric8.kubernetes.client.utils.Utils; | |||
import lombok.Getter; | |||
import lombok.extern.slf4j.Slf4j; | |||
import org.onebrain.operator.properties.KubeProperties; | |||
import org.springframework.beans.BeansException; | |||
import org.springframework.context.ApplicationContext; | |||
import org.springframework.context.ApplicationContextAware; | |||
/** | |||
* @description k8s上下文 | |||
* @date 2020-09-23 | |||
*/ | |||
@Slf4j | |||
@Getter | |||
public class KubeContext implements ApplicationContextAware { | |||
private static final String AUTO = "auto"; | |||
private ApplicationContext applicationContext; | |||
private KubernetesClient client; | |||
private Config config; | |||
public KubeContext(KubeProperties kubeProperties) { | |||
String configSource = kubeProperties.getKubeconfig(); | |||
try { | |||
if(AUTO.equals(configSource)){ | |||
//在集群内部可自动侦测 | |||
log.info("kubernetes client is in cluster mode"); | |||
client = new DefaultKubernetesClient(); | |||
config = client.getConfiguration(); | |||
}else{ | |||
if(configSource.startsWith(StrUtil.SLASH)){ | |||
log.info("read kubeconfig from file system:{}", configSource); | |||
System.setProperty(Config.KUBERNETES_KUBECONFIG_FILE, configSource); | |||
}else{ | |||
log.info("read kubeconfig from classpath:{}", configSource); | |||
final String testKubeconfigFile = Utils.filePath(getClass().getResource(StrUtil.SLASH + configSource)); | |||
//修改环境变量,重新指定kubeconfig读取位置 | |||
System.setProperty(Config.KUBERNETES_KUBECONFIG_FILE, testKubeconfigFile); | |||
} | |||
client = new DefaultKubernetesClient(); | |||
config = client.getConfiguration(); | |||
} | |||
//打印集群信息 | |||
log.info("ApiVersion : {}", client.getApiVersion()); | |||
log.info("MasterUrl : {}", client.getMasterUrl()); | |||
if(log.isDebugEnabled()){ | |||
VersionInfo versionInfo = client.getVersion(); | |||
log.debug("Version details of this Kubernetes cluster :-"); | |||
log.debug("Major : {}", versionInfo.getMajor()); | |||
log.debug("Minor : {}", versionInfo.getMinor()); | |||
log.debug("GitVersion : {}", versionInfo.getGitVersion()); | |||
log.debug("GitCommit : {}", versionInfo.getGitCommit()); | |||
log.debug("BuildDate : {}", versionInfo.getBuildDate()); | |||
log.debug("GitTreeState : {}", versionInfo.getGitTreeState()); | |||
log.debug("Platform : {}", versionInfo.getPlatform()); | |||
log.debug("GoVersion : {}", versionInfo.getGoVersion()); | |||
} | |||
}catch (Exception e){ | |||
client = null; | |||
log.error("初始化 K8sUtils 失败!", e); | |||
e.printStackTrace(); | |||
} | |||
} | |||
/** | |||
* 导出成yaml字符串 | |||
* @param resource k8s元数据 | |||
* @return | |||
*/ | |||
public String convertToYaml(HasMetadata resource) { | |||
try { | |||
return SerializationUtils.dumpAsYaml(resource); | |||
} catch (JsonProcessingException e) { | |||
e.printStackTrace(); | |||
throw new RuntimeException("can not transform resource to yaml"); | |||
} | |||
} | |||
@Override | |||
public void setApplicationContext(ApplicationContext applicationContext) throws BeansException { | |||
this.applicationContext = applicationContext; | |||
} | |||
} |
@@ -0,0 +1,131 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.controller; | |||
import io.fabric8.kubernetes.client.KubernetesClient; | |||
import io.fabric8.kubernetes.client.dsl.MixedOperation; | |||
import io.fabric8.kubernetes.client.dsl.Resource; | |||
import io.fabric8.kubernetes.client.informers.ResourceEventHandler; | |||
import io.fabric8.kubernetes.client.informers.SharedIndexInformer; | |||
import io.fabric8.kubernetes.client.informers.cache.Lister; | |||
import lombok.extern.slf4j.Slf4j; | |||
import org.onebrain.operator.action.handler.DistributeTrainActionHandler; | |||
import org.onebrain.operator.crd.DistributeTrain; | |||
import org.onebrain.operator.crd.DistributeTrainList; | |||
import org.onebrain.operator.crd.DoneableDistributeTrain; | |||
import org.springframework.beans.factory.annotation.Autowired; | |||
import org.springframework.beans.factory.annotation.Qualifier; | |||
import org.springframework.scheduling.annotation.Async; | |||
import java.util.concurrent.TimeUnit; | |||
/** | |||
* @description 分布式训练控制器 | |||
* @date 2020-06-16 | |||
*/ | |||
@Slf4j | |||
public class DistributeTrainController { | |||
@Autowired | |||
private KubernetesClient client; | |||
/** | |||
* 分布式训练informer | |||
*/ | |||
private SharedIndexInformer<DistributeTrain> distributeTrainSharedIndexInformer; | |||
/** | |||
* 分布式训练k8s访问客户端 | |||
*/ | |||
private MixedOperation<DistributeTrain, DistributeTrainList, DoneableDistributeTrain, Resource<DistributeTrain, DoneableDistributeTrain>> distributeTrainClient; | |||
/** | |||
* 分布式训练lister | |||
*/ | |||
private Lister<DistributeTrain> distributeTrainLister; | |||
@Autowired | |||
@Qualifier("addActionHandler") | |||
private DistributeTrainActionHandler addActionHandler; | |||
@Autowired | |||
@Qualifier("deleteActionHandler") | |||
private DistributeTrainActionHandler deleteActionHandler; | |||
public DistributeTrainController(MixedOperation<DistributeTrain, DistributeTrainList, DoneableDistributeTrain, Resource<DistributeTrain, DoneableDistributeTrain>> distributeTrainClient, SharedIndexInformer<DistributeTrain> distributeTrainSharedIndexInformer, String namespace) { | |||
this.distributeTrainSharedIndexInformer = distributeTrainSharedIndexInformer; | |||
this.distributeTrainClient = distributeTrainClient; | |||
this.distributeTrainLister = new Lister<>(distributeTrainSharedIndexInformer.getIndexer()); | |||
} | |||
/** | |||
* 添加事件监听器 | |||
*/ | |||
public void create() { | |||
distributeTrainSharedIndexInformer.addEventHandler(new ResourceEventHandler<DistributeTrain>() { | |||
/** | |||
* 处理添加事件 | |||
* @param distributeTrain 分布式训练信息 | |||
*/ | |||
@Override | |||
public void onAdd(DistributeTrain distributeTrain) { | |||
log.info("add distributeTrain named 【{}】 in namespace 【{}】", distributeTrain.getMetadata().getName(), distributeTrain.getMetadata().getNamespace()); | |||
addActionHandler.handlerAction(distributeTrain); | |||
} | |||
/** | |||
* 处理更内心事件 | |||
* @param distributeTrain 旧的 分布式训练信息 | |||
* @param newDistributeTrain 新的 分布式训练信息 | |||
*/ | |||
@Override | |||
public void onUpdate(DistributeTrain distributeTrain, DistributeTrain newDistributeTrain) { | |||
log.info("update distributeTrain named 【{}】 in namespace 【{}】", distributeTrain.getMetadata().getName(), distributeTrain.getMetadata().getNamespace()); | |||
} | |||
/** | |||
* 处理删除事件 | |||
* @param distributeTrain 分布式训练信息 | |||
* @param b 是否为未知事件 | |||
*/ | |||
@Override | |||
public void onDelete(DistributeTrain distributeTrain, boolean b) { | |||
log.info("delete distributeTrain named 【{}】 in namespace 【{}】", distributeTrain.getMetadata().getName(), distributeTrain.getMetadata().getNamespace()); | |||
deleteActionHandler.handlerAction(distributeTrain); | |||
} | |||
}); | |||
} | |||
/** | |||
* 运行 | |||
*/ | |||
@Async | |||
public void run() { | |||
log.info("Starting DistributeTrain controller"); | |||
try { | |||
//分布式训练信息同步 | |||
while (!distributeTrainSharedIndexInformer.hasSynced()){ | |||
TimeUnit.SECONDS.sleep(1); | |||
} | |||
} catch (InterruptedException e) { | |||
e.printStackTrace(); | |||
log.error("run error:【{}】",e); | |||
} | |||
log.info("DistributeTrain controller is Running"); | |||
} | |||
} |
@@ -0,0 +1,47 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.crd; | |||
import io.fabric8.kubernetes.api.model.ObjectMeta; | |||
import io.fabric8.kubernetes.client.CustomResource; | |||
import lombok.Data; | |||
import lombok.NoArgsConstructor; | |||
/** | |||
* @description 分布式训练 | |||
* @date 2020-09-24 | |||
*/ | |||
@Data | |||
@NoArgsConstructor | |||
public class DistributeTrain extends CustomResource { | |||
/** | |||
* 分布式训练详细规格 | |||
*/ | |||
private DistributeTrainSpec spec; | |||
/** | |||
* 分布式训练状态 | |||
*/ | |||
private DistributeTrainStatus status; | |||
public DistributeTrain(ObjectMeta objectMeta, DistributeTrainSpec spec) { | |||
this.setMetadata(objectMeta); | |||
this.spec = spec; | |||
} | |||
} |
@@ -0,0 +1,27 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.crd; | |||
import io.fabric8.kubernetes.client.CustomResourceList; | |||
/** | |||
* @description CRD资源列表(分布式训练) | |||
* @date 2020-09-24 | |||
*/ | |||
public class DistributeTrainList extends CustomResourceList<DistributeTrain> { | |||
} |
@@ -0,0 +1,108 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.crd; | |||
import com.fasterxml.jackson.databind.JsonDeserializer; | |||
import com.fasterxml.jackson.databind.annotation.JsonDeserialize; | |||
import io.fabric8.kubernetes.api.model.*; | |||
import lombok.AllArgsConstructor; | |||
import lombok.Builder; | |||
import lombok.Data; | |||
import lombok.NoArgsConstructor; | |||
import java.util.List; | |||
import java.util.Map; | |||
/** | |||
* @description 分布式训练详细规格 | |||
* @date 2020-09-23 | |||
*/ | |||
@JsonDeserialize( | |||
using = JsonDeserializer.None.class | |||
) | |||
@Data | |||
@NoArgsConstructor | |||
@AllArgsConstructor | |||
@Builder | |||
public class DistributeTrainSpec implements KubernetesResource { | |||
/** | |||
* 镜像 | |||
*/ | |||
private String image; | |||
/** | |||
* 镜像拉取策略 | |||
*/ | |||
private String imagePullPolicy; | |||
/** | |||
* 机器数 | |||
*/ | |||
private Integer size; | |||
/** | |||
* 环境参数 | |||
*/ | |||
private List<EnvVar> env; | |||
/** | |||
* master 命令 | |||
*/ | |||
private String masterCmd; | |||
/** | |||
* slave命令 | |||
*/ | |||
private String slaveCmd; | |||
/** | |||
* master 资源节点限制 | |||
*/ | |||
private ResourceRequirements masterResources; | |||
/** | |||
* slave 资源节点限制 | |||
*/ | |||
private ResourceRequirements slaveResources; | |||
/** | |||
* 节点调度选择器 | |||
*/ | |||
private Map<String,String> nodeSelector; | |||
/** | |||
* 初始化容器 | |||
*/ | |||
private Container initContainer; | |||
/** | |||
* 工作目录挂载 | |||
*/ | |||
private Volume workspaceStorage; | |||
/** | |||
* 数据集目录挂载 | |||
*/ | |||
private Volume datasetStorage; | |||
/** | |||
* 模型目录挂载 | |||
*/ | |||
private Volume modelStorage; | |||
} |
@@ -0,0 +1,55 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.crd; | |||
import com.fasterxml.jackson.databind.JsonDeserializer; | |||
import com.fasterxml.jackson.databind.annotation.JsonDeserialize; | |||
import io.fabric8.kubernetes.api.model.KubernetesResource; | |||
import lombok.Data; | |||
/** | |||
* @description 分布式训练状态 | |||
* @date 2020-09-23 | |||
*/ | |||
@JsonDeserialize( | |||
using = JsonDeserializer.None.class | |||
) | |||
@Data | |||
public class DistributeTrainStatus implements KubernetesResource { | |||
/** | |||
* 副本数 | |||
*/ | |||
private Integer replicas; | |||
/** | |||
* 处在ready状态的副本数 | |||
*/ | |||
private Integer readyReplicas; | |||
/** | |||
* 成功数 | |||
*/ | |||
private Integer success; | |||
/** | |||
* 失败数 | |||
*/ | |||
private Integer failed; | |||
} |
@@ -0,0 +1,31 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.crd; | |||
import io.fabric8.kubernetes.api.builder.Function; | |||
import io.fabric8.kubernetes.client.CustomResourceDoneable; | |||
/** | |||
* @description CRD资源的修改Builder | |||
* @date 2020-09-24 | |||
*/ | |||
public class DoneableDistributeTrain extends CustomResourceDoneable<DistributeTrain> { | |||
public DoneableDistributeTrain(DistributeTrain resource, Function<DistributeTrain, DistributeTrain> function) { | |||
super(resource, function); | |||
} | |||
} |
@@ -0,0 +1,56 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.enums; | |||
/**
 * Access modes of a PersistentVolumeClaim.
 *
 * <p>Not every storage backend supports all three modes.
 *
 * @date 2020-09-24
 */
public enum AccessModeEnum {

    /** Read-write, mounted by a single consumer; the most basic mode. */
    RWO("ReadWriteOnce"),

    /** Read-only, may be mounted by many consumers. */
    ROX("ReadOnlyMany"),

    /** Read-write, may be shared by many consumers. */
    RWX("ReadWriteMany");

    /** Mode name as written in Kubernetes manifests. */
    private final String mode;

    AccessModeEnum(String modeName) {
        this.mode = modeName;
    }

    /**
     * @return the mode name of this access mode
     */
    public String getMode() {
        return this.mode;
    }
}
@@ -0,0 +1,49 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.exception; | |||
import lombok.Getter; | |||
import lombok.extern.slf4j.Slf4j; | |||
/**
 * Custom unchecked exception raised by the operator.
 *
 * <p>The message and cause are handed to {@link RuntimeException}, so {@link #getMessage()},
 * {@link #getCause()} and printed stack traces all carry them.
 *
 * @date 2020-09-24
 */
public class OperatorException extends RuntimeException {

    /**
     * Message, also available through {@link #getMessage()}.
     */
    private final String msg;

    /**
     * @param msg   description of the failure
     * @param cause underlying cause, may be {@code null}
     */
    public OperatorException(String msg, Throwable cause) {
        super(msg, cause);
        this.msg = msg;
    }

    /**
     * @param msg description of the failure
     */
    public OperatorException(String msg) {
        super(msg);
        this.msg = msg;
    }

    /**
     * @return the message this exception was created with
     */
    public String getMsg() {
        return msg;
    }
}
@@ -0,0 +1,34 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.properties; | |||
import lombok.Data; | |||
import org.springframework.boot.context.properties.ConfigurationProperties; | |||
import org.springframework.stereotype.Component; | |||
/** | |||
* @description 属性配置 | |||
* @date 2020-09-24 | |||
*/ | |||
@Data | |||
@ConfigurationProperties("k8s") | |||
@Component | |||
public class KubeProperties { | |||
private String kubeconfig; | |||
} |
@@ -0,0 +1,65 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.redis; | |||
/**
 * Base class for Redis key prefixes.
 *
 * @date 2020-09-23
 */
public abstract class AbstractKeyPrefix {

    /**
     * Template applied to the raw prefix.
     */
    private static final String KEY_TEMPLATE = "Operator:%s";

    /**
     * Raw prefix, before the template is applied.
     */
    private final String prefix;

    /**
     * Expiry in seconds; 0 means the key never expires.
     */
    private final int expireSeconds;

    /**
     * Creates a prefix that never expires (expiry of 0 seconds).
     *
     * @param prefix raw prefix
     */
    public AbstractKeyPrefix(String prefix) {
        this(prefix, 0);
    }

    /**
     * @param prefix        raw prefix
     * @param expireSeconds expiry in seconds; 0 means never expire
     */
    public AbstractKeyPrefix(String prefix, int expireSeconds) {
        this.prefix = prefix;
        this.expireSeconds = expireSeconds;
    }

    /**
     * @return expiry in seconds; 0 (the default) means never expire
     */
    public int getExpireSeconds() {
        return this.expireSeconds;
    }

    /**
     * @return the raw prefix formatted with the key template, i.e. {@code Operator:<prefix>}
     */
    public String getPrefix() {
        final String formatted = String.format(KEY_TEMPLATE, this.prefix);
        return formatted;
    }
}
@@ -0,0 +1,290 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.redis; | |||
import org.onebrain.operator.utils.FastjsonUtils; | |||
import org.onebrain.operator.utils.RedisUtils; | |||
import org.springframework.beans.factory.annotation.Autowired; | |||
import org.springframework.stereotype.Service; | |||
import java.util.Set; | |||
/** | |||
* @description redis服务 | |||
* @date 2020-09-03 | |||
*/ | |||
@Service | |||
public class RedisService { | |||
@Autowired | |||
private RedisUtils redisUtils; | |||
/** | |||
* 真正key模板 | |||
*/ | |||
private static final String REAL_KEY_TEMPLATE = "%s:%s"; | |||
/** | |||
* 获取真正的key | |||
* @param prefix 前缀 | |||
* @param key key值 | |||
* @return 放入redis里的key值 | |||
*/ | |||
private String getRealKey(AbstractKeyPrefix prefix, String key){ | |||
return String.format(REAL_KEY_TEMPLATE, prefix.getPrefix(), key); | |||
} | |||
/** | |||
* 实现命令:TTL key,以秒为单位,返回给定 key的剩余生存时间(TTL, time to live)。 | |||
* @param prefix 前缀 | |||
* @param key key值 | |||
* @return 返回过期时间秒数 | |||
*/ | |||
public long ttl(AbstractKeyPrefix prefix, String key) { | |||
return redisUtils.ttl(getRealKey(prefix, key)); | |||
} | |||
/** | |||
* 实现命令:expire 设置过期时间,单位秒 | |||
* @param prefix 前缀 | |||
* @param key key值 | |||
* @param timeout 期望过期时间 | |||
*/ | |||
public void expire(AbstractKeyPrefix prefix, String key, long timeout) { | |||
redisUtils.expire(getRealKey(prefix, key), timeout); | |||
} | |||
/** | |||
* 实现命令:INCR key,增加key一次 | |||
* @param prefix 前缀 | |||
* @param key key值 | |||
* @param delta 增量 | |||
* @return 计数值 | |||
*/ | |||
public long incr(AbstractKeyPrefix prefix, String key, long delta) { | |||
return redisUtils.incr(getRealKey(prefix, key), delta); | |||
} | |||
/** | |||
* 实现命令: key,减少key一次 | |||
* @param prefix 前缀 | |||
* @param key key值 | |||
* @param delta 增量 | |||
* @return 计数值 | |||
*/ | |||
public long decr(AbstractKeyPrefix prefix, String key, long delta) { | |||
String realKey = getRealKey(prefix, key); | |||
if(delta < 0){ | |||
//throw new RuntimeException("递减因子必须大于0"); | |||
del(realKey); | |||
return 0; | |||
} | |||
return redisUtils.decr(realKey, delta); | |||
} | |||
/** | |||
* 实现命令:KEYS pattern,查找所有符合给定模式 pattern的 key | |||
* @param prefix key前缀 | |||
* @return key集合 | |||
*/ | |||
public Set<String> keys(AbstractKeyPrefix prefix) { | |||
String pattern = prefix.getPrefix(); | |||
return redisUtils.keys(pattern + ":*"); | |||
} | |||
/** | |||
* 实现命令:KEYS pattern,查找所有符合给定模式 pattern的 key | |||
* @param prefix key前缀 | |||
* @param key key值 | |||
* @return key集合 | |||
*/ | |||
public Set<String> keys(AbstractKeyPrefix prefix, String key) { | |||
String pattern = prefix.getPrefix(); | |||
return redisUtils.keys(pattern + ":" + key + ":*"); | |||
} | |||
/** | |||
* 实现命令:DEL key,删除一个key | |||
* @param prefix key前缀 | |||
* @param key key值 | |||
*/ | |||
public void del(AbstractKeyPrefix prefix, String key) { | |||
redisUtils.del(getRealKey(prefix, key)); | |||
} | |||
/** | |||
* 删除一个key | |||
* @param realKey 真正的key | |||
*/ | |||
public void del(String realKey) { | |||
redisUtils.del(realKey); | |||
} | |||
/** | |||
* 实现命令:SET key value,设置一个key-value(将字符串值 value关联到 key) | |||
* @param prefix key前缀 | |||
* @param key key值 | |||
* @param value 值 | |||
*/ | |||
public void set(AbstractKeyPrefix prefix, String key, String value) { | |||
if(prefix.getExpireSeconds() <= 0){ | |||
redisUtils.set(getRealKey(prefix, key), value); | |||
}else{ | |||
redisUtils.set(getRealKey(prefix, key), value, prefix.getExpireSeconds()); | |||
} | |||
} | |||
/** | |||
* 实现命令:SET key value,设置一个key-value(将字符串值 value关联到 key) | |||
* @param prefix key前缀 | |||
* @param key key值 | |||
* @param value 值 | |||
* @param <T> 指定类型 | |||
*/ | |||
public <T> void set(AbstractKeyPrefix prefix, String key, T value) { | |||
if(prefix.getExpireSeconds() <= 0){ | |||
redisUtils.set(getRealKey(prefix, key), FastjsonUtils.convertObjectToJSON(value)); | |||
}else{ | |||
redisUtils.set(getRealKey(prefix, key), FastjsonUtils.convertObjectToJSON(value), prefix.getExpireSeconds()); | |||
} | |||
} | |||
/** | |||
* 实现命令:SET key value EX seconds,设置key-value和超时时间(秒) | |||
* @param prefix key前缀 | |||
* @param key key值 | |||
* @param value 值 | |||
* @param timeout 过期时间 | |||
*/ | |||
public void set(AbstractKeyPrefix prefix, String key, String value, long timeout) { | |||
redisUtils.set(getRealKey(prefix, key), value, timeout); | |||
} | |||
/** | |||
* 实现命令:SET key value EX seconds,设置key-value和超时时间(秒) | |||
* @param prefix key前缀 | |||
* @param key key值 | |||
* @param value 值 | |||
* @param timeout 过期时间 | |||
* @param <T> 指定类型 | |||
*/ | |||
public <T> void set(AbstractKeyPrefix prefix, String key, T value, long timeout) { | |||
redisUtils.set(getRealKey(prefix, key), FastjsonUtils.convertObjectToJSON(value), timeout); | |||
} | |||
/** | |||
* 实现命令:SETNX key value,设置一个key-value(将字符串值 value关联到 key) | |||
* @param prefix key前缀 | |||
* @param key key值 | |||
* @param value 值 | |||
* @return 是否设值成功 | |||
*/ | |||
public Boolean setnx(AbstractKeyPrefix prefix, String key, String value){ | |||
if(prefix.getExpireSeconds() <= 0){ | |||
return redisUtils.setnx(getRealKey(prefix, key), value); | |||
}else{ | |||
return redisUtils.setnx(getRealKey(prefix, key), value, prefix.getExpireSeconds()); | |||
} | |||
} | |||
/** | |||
* 实现命令:SETNX key value,设置一个key-value(将字符串值 value关联到 key) | |||
* @param prefix key前缀 | |||
* @param key key值 | |||
* @param value 值 | |||
* @param <T> 指定类型 | |||
* @return 是否设值成功 | |||
*/ | |||
public <T> Boolean setnx(AbstractKeyPrefix prefix, String key, T value){ | |||
if(prefix.getExpireSeconds() <= 0){ | |||
return redisUtils.setnx(getRealKey(prefix, key), FastjsonUtils.convertObjectToJSON(value)); | |||
}else{ | |||
return redisUtils.setnx(getRealKey(prefix, key), FastjsonUtils.convertObjectToJSON(value), prefix.getExpireSeconds()); | |||
} | |||
} | |||
/** | |||
* 实现命令:SETNX key value EX seconds,设置key-value和超时时间(秒) | |||
* @param prefix key前缀 | |||
* @param key key值 | |||
* @param value 值 | |||
* @param timeout 过期时间 | |||
* @return 是否设值成功 | |||
*/ | |||
public Boolean setnx(AbstractKeyPrefix prefix, String key, String value, long timeout) { | |||
return redisUtils.setnx(getRealKey(prefix, key), value, timeout); | |||
} | |||
/** | |||
* 实现命令:SETNX key value EX seconds,设置key-value和超时时间(秒) | |||
* @param prefix key前缀 | |||
* @param key key值 | |||
* @param value 值 | |||
* @param timeout 过期时间 | |||
* @param <T> 指定类型 | |||
* @return 是否设值成功 | |||
*/ | |||
public <T> Boolean setnx(AbstractKeyPrefix prefix, String key, T value, long timeout) { | |||
return redisUtils.setnx(getRealKey(prefix, key), FastjsonUtils.convertObjectToJSON(value), timeout); | |||
} | |||
/** | |||
* 实现命令:GET key,返回 key所关联的字符串值。 | |||
* @param prefix key前缀 | |||
* @param key key值 | |||
* @return 值 | |||
*/ | |||
public String get(AbstractKeyPrefix prefix, String key) { | |||
return redisUtils.get(getRealKey(prefix, key)); | |||
} | |||
/** | |||
* 实现命令:GET key,返回 key所关联的字符串值。 | |||
* @param prefix key前缀 | |||
* @param key key值 | |||
* @param <T> 指定类型 | |||
* @return 值 | |||
*/ | |||
public <T> T get(AbstractKeyPrefix prefix, String key, Class<T> clazz) { | |||
return redisUtils.get(getRealKey(prefix, key), clazz); | |||
} | |||
/** | |||
* 根据key获取值 | |||
* @param lastKey 真正的key | |||
* @param clazz 类型 | |||
* @param <T> 泛型 | |||
* @return | |||
*/ | |||
public <T> T get(String lastKey, Class<T> clazz) { | |||
return redisUtils.get(lastKey, clazz); | |||
} | |||
/** | |||
* 实现命令:GET key,返回 key所关联的字符串值。 | |||
* @param prefix key前缀 | |||
* @param key key值 | |||
* @return 是否存在 | |||
*/ | |||
public Boolean exists(AbstractKeyPrefix prefix, String key) { | |||
return redisUtils.exists(getRealKey(prefix, key)); | |||
} | |||
} |
@@ -0,0 +1,45 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.redis.key; | |||
import org.onebrain.operator.redis.AbstractKeyPrefix; | |||
/** | |||
* @description 由operator产生的cr的唯一标识 | |||
* @date 2020-09-23 | |||
*/ | |||
public class OperatorKey extends AbstractKeyPrefix { | |||
public OperatorKey(String prefix) { | |||
super(prefix); | |||
} | |||
public OperatorKey(String prefix, int expireSeconds) { | |||
super(prefix, expireSeconds); | |||
} | |||
/** | |||
* 分布式训练 Key | |||
*/ | |||
public static final OperatorKey CR = new OperatorKey("DistributeTrain"); | |||
/** | |||
* 分布式训练Job Key | |||
*/ | |||
public static final OperatorKey CR_JOB = new OperatorKey("DistributeTrain:Job"); | |||
} |
@@ -0,0 +1,41 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.utils; | |||
import io.fabric8.kubernetes.client.dsl.MixedOperation; | |||
import io.fabric8.kubernetes.client.dsl.Resource; | |||
import org.onebrain.operator.crd.DistributeTrain; | |||
import org.onebrain.operator.crd.DistributeTrainList; | |||
import org.onebrain.operator.crd.DoneableDistributeTrain; | |||
/** | |||
* @description 分布式训练客户端持有器 | |||
* @date 2020-09-23 | |||
*/ | |||
public class DistributeTrainClientHolder { | |||
private static MixedOperation<DistributeTrain, DistributeTrainList, DoneableDistributeTrain, Resource<DistributeTrain, DoneableDistributeTrain>> distributeTrainClient; | |||
public static MixedOperation<DistributeTrain, DistributeTrainList, DoneableDistributeTrain, Resource<DistributeTrain, DoneableDistributeTrain>> getClient(){ | |||
return distributeTrainClient; | |||
} | |||
public static void setDistributeTrainClient(MixedOperation<DistributeTrain, DistributeTrainList, DoneableDistributeTrain, Resource<DistributeTrain, DoneableDistributeTrain>> client){ | |||
distributeTrainClient = client; | |||
} | |||
} |
@@ -0,0 +1,188 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.utils; | |||
import com.alibaba.fastjson.JSON; | |||
import com.alibaba.fastjson.JSONObject; | |||
import com.alibaba.fastjson.serializer.SerializerFeature; | |||
import java.util.List; | |||
import java.util.Map; | |||
/** | |||
* @description json工具类 | |||
* @date 2020-09-24 | |||
*/ | |||
public class FastjsonUtils { | |||
private static final SerializerFeature[] FEATURES = { | |||
// 输出空置字段 | |||
SerializerFeature.WriteMapNullValue, | |||
//日期类型用日期字符串 yyyy-MM-dd HH:mm:ss | |||
SerializerFeature.WriteDateUseDateFormat, | |||
// list字段如果为null,输出为[],而不是null | |||
SerializerFeature.WriteNullListAsEmpty, | |||
// 数值字段如果为null,输出为0,而不是null | |||
SerializerFeature.WriteNullNumberAsZero, | |||
// Boolean字段如果为null,输出为false,而不是null | |||
SerializerFeature.WriteNullBooleanAsFalse, | |||
// 字符类型字段如果为null,输出为"",而不是null | |||
SerializerFeature.WriteNullStringAsEmpty | |||
}; | |||
/** | |||
* 将对象转为json | |||
* @param object | |||
* @return json的String | |||
*/ | |||
public static String convertObjectToJSON(Object object) { | |||
return JSON.toJSONString(object, FEATURES); | |||
} | |||
/** | |||
* 将对象转为json(无循环引用) | |||
* @param object | |||
* @return json的String | |||
*/ | |||
public static String toJSONNoFeatures(Object object) { | |||
return JSON.toJSONString(object, SerializerFeature.DisableCircularReferenceDetect); | |||
} | |||
/** | |||
* 将json转为对象 | |||
* @param text | |||
* @return 对象 | |||
*/ | |||
public static Object toBean(String text) { | |||
return JSON.parse(text); | |||
} | |||
/** | |||
* 将json转为对象 | |||
* @param text 文本字符串 | |||
* @param clazz 类型 | |||
* @param <T> 泛型 | |||
* @return 泛型对象 | |||
*/ | |||
public static <T> T toBean(String text, Class<T> clazz) { | |||
return JSON.parseObject(text, clazz); | |||
} | |||
/** | |||
* 转换为数组 | |||
* @param text 文本字符串 | |||
* @return 泛型对象 | |||
*/ | |||
public static <T> Object[] toArray(String text) { | |||
return toArray(text, null); | |||
} | |||
/** | |||
* 转换为数组 | |||
* @param text 文本字符串 | |||
* @param clazz 类型 | |||
* @return | |||
*/ | |||
public static <T> Object[] toArray(String text, Class<T> clazz) { | |||
return JSON.parseArray(text, clazz).toArray(); | |||
} | |||
/** | |||
* 转换为List | |||
* @param text 文本字符串 | |||
* @param clazz 类型 | |||
* @return | |||
*/ | |||
public static <T> List<T> toList(String text, Class<T> clazz) { | |||
return JSON.parseArray(text, clazz); | |||
} | |||
/** | |||
* 将string转化为序列化的json字符串 | |||
* @param text 文本字符串 | |||
* @return json对象 | |||
*/ | |||
public static Object textToJson(String text) { | |||
Object objectJson = JSON.parse(text); | |||
return objectJson; | |||
} | |||
/** | |||
* json字符串转化为map | |||
* @param text json字符串 | |||
* @return Map集合 | |||
*/ | |||
public static <K, V> Map<K, V> stringToCollect(String text) { | |||
Map<K, V> m = (Map<K, V>) JSONObject.parseObject(text); | |||
return m; | |||
} | |||
/** | |||
* 转换JSON字符串为对象 | |||
* @param jsonData json字符串 | |||
* @param clazz 转换目标对象的类型 | |||
* @return json对象 | |||
*/ | |||
public static Object convertJsonToObject(String jsonData, Class<?> clazz) { | |||
return JSONObject.parseObject(jsonData, clazz); | |||
} | |||
/** | |||
* 将map转化为string | |||
* @param m Map集合 | |||
* @return 字符串 | |||
*/ | |||
public static <K, V> String collectToString(Map<K, V> m) { | |||
String s = JSONObject.toJSONString(m); | |||
return s; | |||
} | |||
/** | |||
* json字符串转化为map | |||
* | |||
* @param text 字符串 | |||
* @return Map 对象 | |||
*/ | |||
public static Map stringToMap(String text) { | |||
Map m = JSONObject.parseObject(text); | |||
return m; | |||
} | |||
/** | |||
* 将map转化为string | |||
* | |||
* @param m Map集合 | |||
* @return 字符串 | |||
*/ | |||
public static String mapToString(Map m) { | |||
String s = JSONObject.toJSONString(m); | |||
return s; | |||
} | |||
/** | |||
* 把对象转换为指定对象 | |||
* @param source 原对象 | |||
* @param target 目标class | |||
* @param <T> 泛型 | |||
* @return 泛型对象 | |||
*/ | |||
public static <T> T toObjectFromSource(Object source,Class<T> target) { | |||
return toBean(convertObjectToJSON(source), target); | |||
} | |||
} |
@@ -0,0 +1,56 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.utils; | |||
import lombok.extern.slf4j.Slf4j; | |||
import java.io.File; | |||
import java.io.FileOutputStream; | |||
import java.io.IOException; | |||
import java.io.InputStream; | |||
/** | |||
* @description IO工具类 | |||
* @date 2020-09-24 | |||
*/ | |||
@Slf4j | |||
public class IOUtils { | |||
/** | |||
* 将input流转换为文件 | |||
* | |||
* @param is 输入流 | |||
* @param targetFile 目标文件 | |||
*/ | |||
public static void copy(InputStream is, File targetFile) { | |||
try (FileOutputStream fos = new FileOutputStream(targetFile)) { | |||
byte[] b = new byte[1024]; | |||
int readCount = is.read(b); | |||
while (readCount != -1) { | |||
// 写入数据 | |||
fos.write(b, 0, readCount); | |||
readCount = is.read(b); | |||
} | |||
is.close(); | |||
fos.flush(); | |||
} catch (IOException e) { | |||
log.error("copy file error:【{}】", e); | |||
} | |||
} | |||
} |
@@ -0,0 +1,289 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.utils; | |||
import org.springframework.beans.factory.annotation.Autowired; | |||
import org.springframework.data.redis.core.StringRedisTemplate; | |||
import org.springframework.stereotype.Component; | |||
import java.util.Map; | |||
import java.util.Set; | |||
import java.util.concurrent.TimeUnit; | |||
/** | |||
* @description 封装redis简单的key-value操作 | |||
* @date 2020-09-23 | |||
*/ | |||
@Component | |||
public class RedisUtils { | |||
@Autowired | |||
private StringRedisTemplate redisTemplate; | |||
/** | |||
* 实现命令:TTL key,以秒为单位,返回给定 key的剩余生存时间(TTL, time to live)。 | |||
* @param key key值 | |||
* @return 返回过期时间秒数 | |||
*/ | |||
public long ttl(String key) { | |||
return redisTemplate.getExpire(key); | |||
} | |||
/** | |||
* 实现命令:expire 设置过期时间,单位秒 | |||
* @param key key值 | |||
* @param timeout 期望过期时间 | |||
*/ | |||
public void expire(String key, long timeout) { | |||
redisTemplate.expire(key, timeout, TimeUnit.SECONDS); | |||
} | |||
/** | |||
* 实现命令:INCR key,增加key一次 | |||
* @param key key值 | |||
* @param delta 增量 | |||
* @return 计数值 | |||
*/ | |||
public long incr(String key, long delta) { | |||
return redisTemplate.opsForValue().increment(key, delta); | |||
} | |||
/** | |||
* 实现命令: key,减少key一次 | |||
* @param key key值 | |||
* @param delta 增量 | |||
* @return 计数值 | |||
*/ | |||
public long decr(String key, long delta) { | |||
if(delta < 0){ | |||
//throw new RuntimeException("递减因子必须大于0"); | |||
del(key); | |||
return 0; | |||
} | |||
return redisTemplate.opsForValue().increment(key, -delta); | |||
} | |||
/** | |||
* 实现命令:KEYS pattern,查找所有符合给定模式 pattern的 key | |||
* @return key集合 | |||
*/ | |||
public Set<String> keys(String pattern) { | |||
return redisTemplate.keys(pattern); | |||
} | |||
/** | |||
* 实现命令:DEL key,删除一个key | |||
* @param key key值 | |||
*/ | |||
public void del(String key) { | |||
redisTemplate.delete(key); | |||
} | |||
/** | |||
* 实现命令:SET key value,设置一个key-value(将字符串值 value关联到 key) | |||
* @param key key值 | |||
* @param value 值 | |||
*/ | |||
public void set(String key, String value) { | |||
redisTemplate.opsForValue().set(key, value); | |||
} | |||
/** | |||
* 实现命令:SET key value,设置一个key-value(将字符串值 value关联到 key) | |||
* @param key key值 | |||
* @param value 值 | |||
* @param <T> 指定类型 | |||
*/ | |||
public <T> void set(String key, T value) { | |||
redisTemplate.opsForValue().set(key, FastjsonUtils.convertObjectToJSON(value)); | |||
} | |||
/** | |||
* 实现命令:SET key value EX seconds,设置key-value和超时时间(秒) | |||
* @param key key值 | |||
* @param value 值 | |||
* @param timeout 过期时间 | |||
*/ | |||
public void set(String key, String value, long timeout) { | |||
redisTemplate.opsForValue().set(key, value, timeout, TimeUnit.SECONDS); | |||
} | |||
/** | |||
* 实现命令:SET key value EX seconds,设置key-value和超时时间(秒) | |||
* @param key key值 | |||
* @param value 值 | |||
* @param timeout 过期时间 | |||
* @param <T> 指定类型 | |||
*/ | |||
public <T> void set(String key, T value, long timeout) { | |||
redisTemplate.opsForValue().set(key, FastjsonUtils.convertObjectToJSON(value), timeout, TimeUnit.SECONDS); | |||
} | |||
/** | |||
* 实现命令:SETNX key value,设置一个key-value(将字符串值 value关联到 key) | |||
* @param key key值 | |||
* @param value 值 | |||
* @return 是否设值成功 | |||
*/ | |||
public Boolean setnx(String key, String value){ | |||
return redisTemplate.opsForValue().setIfAbsent(key, value); | |||
} | |||
/** | |||
* 实现命令:SETNX key value,设置一个key-value(将字符串值 value关联到 key) | |||
* @param key key值 | |||
* @param value 值 | |||
* @param <T> 指定类型 | |||
* @return 是否设值成功 | |||
*/ | |||
public <T> Boolean setnx(String key, T value){ | |||
return redisTemplate.opsForValue().setIfAbsent(key, FastjsonUtils.convertObjectToJSON(value)); | |||
} | |||
/** | |||
* 实现命令:SETNX key value EX seconds,设置key-value和超时时间(秒) | |||
* @param key key值 | |||
* @param value 值 | |||
* @param timeout 过期时间 | |||
* @return 是否设值成功 | |||
*/ | |||
public Boolean setnx(String key, String value, long timeout) { | |||
return redisTemplate.opsForValue().setIfAbsent(key, value, timeout, TimeUnit.SECONDS); | |||
} | |||
/** | |||
* 实现命令:SETNX key value EX seconds,设置key-value和超时时间(秒) | |||
* @param key key值 | |||
* @param value 值 | |||
* @param timeout 过期时间 | |||
* @param <T> 指定类型 | |||
* @return 是否设值成功 | |||
*/ | |||
public <T> Boolean setnx(String key, T value, long timeout) { | |||
return redisTemplate.opsForValue().setIfAbsent(key, FastjsonUtils.convertObjectToJSON(value), timeout, TimeUnit.SECONDS); | |||
} | |||
/** | |||
* 实现命令:GET key,返回 key所关联的字符串值。 | |||
* @param key key值 | |||
* @return 值 | |||
*/ | |||
public String get(String key) { | |||
return (String) redisTemplate.opsForValue().get(key); | |||
} | |||
/** | |||
* | |||
* 根据key获取值 | |||
* @param key 真正的key | |||
* @param clazz 类型 | |||
* @param <T> 泛型 | |||
* @return | |||
*/ | |||
public <T> T get(String key, Class<T> clazz) { | |||
String value = (String) redisTemplate.opsForValue().get(key); | |||
return (T) FastjsonUtils.convertJsonToObject(value, clazz); | |||
} | |||
/** | |||
* 实现命令:GET key,返回 key所关联的字符串值。 | |||
* @param key key值 | |||
* @return 是否存在 | |||
*/ | |||
public Boolean exists(String key) { | |||
return redisTemplate.hasKey(key); | |||
} | |||
/****----------------------------------Hash----------------------------------------****/ | |||
/** | |||
* 实现命令:HSET key field value,将哈希表 key中的域 field的值设为 value | |||
* | |||
* @param key key | |||
* @param field 域 | |||
* @param value 值 | |||
*/ | |||
public void hset(String key, String field, Object value) { | |||
redisTemplate.opsForHash().put(key, field, value); | |||
} | |||
/** | |||
* 实现命令:HGET key field,返回哈希表 key中给定域 field的值 | |||
* | |||
* @param key key | |||
* @param field 域 | |||
* @return | |||
*/ | |||
public String hget(String key, String field) { | |||
return (String) redisTemplate.opsForHash().get(key, field); | |||
} | |||
/** | |||
* 实现命令:HDEL key field [field ...],删除哈希表 key 中的一个或多个指定域,不存在的域将被忽略。 | |||
* | |||
* @param key key | |||
* @param fields 域 | |||
*/ | |||
public void hdel(String key, Object... fields) { | |||
redisTemplate.opsForHash().delete(key, fields); | |||
} | |||
/** | |||
* 实现命令:HGETALL key,返回哈希表 key中,所有的域和值。 | |||
* | |||
* @param key | |||
* @return 域和值 | |||
*/ | |||
public Map<Object, Object> hgetall(String key) { | |||
return redisTemplate.opsForHash().entries(key); | |||
} | |||
/****----------------------------------List----------------------------------------****/ | |||
/** | |||
* 实现命令:LPUSH key value,将一个值 value插入到列表 key的表头 | |||
* | |||
* @param key | |||
* @param value | |||
* @return 执行 LPUSH命令后,列表的长度。 | |||
*/ | |||
public long lpush(String key, String value) { | |||
return redisTemplate.opsForList().leftPush(key, value); | |||
} | |||
/** | |||
* 实现命令:LPOP key,移除并返回列表 key的头元素。 | |||
* | |||
* @param key | |||
* @return 列表key的头元素。 | |||
*/ | |||
public String lpop(String key) { | |||
return (String) redisTemplate.opsForList().leftPop(key); | |||
} | |||
/** | |||
* 实现命令:RPUSH key value,将一个值 value插入到列表 key的表尾(最右边)。 | |||
* | |||
* @param key | |||
* @param value | |||
* @return 执行 LPUSH命令后,列表的长度。 | |||
*/ | |||
public long rpush(String key, String value) { | |||
return redisTemplate.opsForList().rightPush(key, value); | |||
} | |||
} |
@@ -0,0 +1,99 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.utils; | |||
import lombok.extern.slf4j.Slf4j; | |||
import org.springframework.beans.BeansException; | |||
import org.springframework.beans.factory.DisposableBean; | |||
import org.springframework.context.ApplicationContext; | |||
import org.springframework.context.ApplicationContextAware; | |||
import org.springframework.stereotype.Component; | |||
/** | |||
* @description 上下文工具类 | |||
* @date 2020-09-24 | |||
*/ | |||
@Component | |||
@Slf4j | |||
public class SpringContextHolder implements ApplicationContextAware, DisposableBean { | |||
public static ApplicationContext applicationContext = null; | |||
/** | |||
* 从静态变量applicationContext中取得Bean, 自动转型为所赋值对象的类型. | |||
* @param name bean名称 | |||
* @param <T> 类型 | |||
* @return bean对象 | |||
*/ | |||
@SuppressWarnings("unchecked") | |||
public static <T> T getBean(String name) { | |||
assertContextInjected(); | |||
return (T) applicationContext.getBean(name); | |||
} | |||
/** | |||
* 从静态变量applicationContext中取得Bean, 自动转型为所赋值对象的类型. | |||
* @param requiredType bean类型 class | |||
* @param <T> 泛型 | |||
* @return bean对象 | |||
*/ | |||
public static <T> T getBean(Class<T> requiredType) { | |||
assertContextInjected(); | |||
return applicationContext.getBean(requiredType); | |||
} | |||
/** | |||
* 检查ApplicationContext不为空. | |||
*/ | |||
private static void assertContextInjected() { | |||
if (applicationContext == null) { | |||
throw new IllegalStateException("applicaitonContext属性未注入, 请在applicationContext" + | |||
".xml中定义SpringContextHolder或在SpringBoot启动类中注册SpringContextHolder."); | |||
} | |||
} | |||
/** | |||
* 清除SpringContextHolder中的ApplicationContext为Null. | |||
*/ | |||
private static void clearHolder() { | |||
log.debug("清除SpringContextHolder中的ApplicationContext:" | |||
+ applicationContext); | |||
applicationContext = null; | |||
} | |||
/** | |||
* 销毁回调函数 | |||
*/ | |||
@Override | |||
public void destroy() { | |||
SpringContextHolder.clearHolder(); | |||
} | |||
/** | |||
* spring上下文设置 | |||
* @param applicationContext | |||
* @throws BeansException | |||
*/ | |||
@Override | |||
public void setApplicationContext(ApplicationContext applicationContext) throws BeansException { | |||
if (SpringContextHolder.applicationContext != null) { | |||
log.warn("SpringContextHolder中的ApplicationContext被覆盖, 原有ApplicationContext为:" + SpringContextHolder.applicationContext); | |||
} | |||
SpringContextHolder.applicationContext = applicationContext; | |||
} | |||
} |
@@ -0,0 +1,111 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.watcher; | |||
import cn.hutool.core.collection.CollectionUtil; | |||
import cn.hutool.core.util.StrUtil; | |||
import io.fabric8.kubernetes.api.model.OwnerReference; | |||
import io.fabric8.kubernetes.api.model.apps.StatefulSet; | |||
import io.fabric8.kubernetes.api.model.batch.Job; | |||
import io.fabric8.kubernetes.client.KubernetesClient; | |||
import lombok.Data; | |||
import lombok.extern.slf4j.Slf4j; | |||
import org.onebrain.operator.constants.KubeConstants; | |||
import org.onebrain.operator.redis.RedisService; | |||
import org.onebrain.operator.redis.key.OperatorKey; | |||
import org.springframework.beans.factory.annotation.Autowired; | |||
import org.springframework.stereotype.Component; | |||
import java.util.List; | |||
import static org.onebrain.operator.constants.CrdConstants.CRD_KIND; | |||
/** | |||
* @description Job处理器 | |||
* @date 2020-09-24 | |||
*/ | |||
@Data | |||
@Slf4j | |||
@Component | |||
public class JobHandler { | |||
public static final String FINISHED = "finished"; | |||
public static final String PENDING = "pending"; | |||
@Autowired | |||
private RedisService redis; | |||
@Autowired | |||
private KubernetesClient client; | |||
/** | |||
* 处理Job | |||
* | |||
* @param job | |||
*/ | |||
public void handleJob(Job job) { | |||
log.info("handleJob=>job : 【{}】", job); | |||
//筛选出DistributeTrain下的job | |||
List<OwnerReference> ownerReferences = job.getMetadata().getOwnerReferences(); | |||
if (CollectionUtil.isEmpty(ownerReferences) || !CRD_KIND.equals(ownerReferences.get(0).getKind())) { | |||
return; | |||
} | |||
String key = job.getMetadata().getUid(); | |||
if (StrUtil.equals(redis.get(OperatorKey.CR_JOB, key), FINISHED)) { | |||
return; | |||
} | |||
try { | |||
redis.set(OperatorKey.CR_JOB, key, PENDING); | |||
final Integer parallelism = job.getSpec().getParallelism(); | |||
final Integer backoffLimit = job.getSpec().getBackoffLimit(); | |||
//成功 或者 失败达到最大次数 | |||
if (job.getStatus() != null | |||
&& ((job.getStatus().getFailed() != null && job.getStatus().getFailed() + 1 >= backoffLimit) | |||
|| (job.getStatus().getSucceeded() != null && parallelism.equals(job.getStatus().getSucceeded())))) { | |||
//得到DistributeTrain的Statefulset | |||
String dtName = ownerReferences.get(0).getName(); | |||
String namespace = job.getMetadata().getNamespace(); | |||
List<StatefulSet> statefulsetList = client.apps().statefulSets() | |||
.inNamespace(namespace) | |||
.withLabel(KubeConstants.DISTRIBUTE_TRAIN_LABEL, dtName) | |||
.list().getItems(); | |||
if (CollectionUtil.isEmpty(statefulsetList)) { | |||
log.info("jobWatcher: statefulset of 【{}】 not exists", dtName); | |||
return; | |||
} | |||
//缩容Statefulset的replica到0 | |||
StatefulSet statefulSet = statefulsetList.get(0); | |||
statefulSet.getSpec().setReplicas(0); | |||
client.resource(statefulSet).createOrReplace(); | |||
log.info("jobWatcher: reduce replicas of 【{}】 to zero", dtName); | |||
redis.set(OperatorKey.CR_JOB, key, "finished"); | |||
} | |||
} catch (Exception e) { | |||
redis.set(OperatorKey.CR_JOB, key, "error"); | |||
log.error("handle job error:【{}】", e); | |||
} | |||
} | |||
} |
@@ -0,0 +1,71 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.watcher; | |||
import io.fabric8.kubernetes.api.model.batch.Job; | |||
import io.fabric8.kubernetes.client.KubernetesClientException; | |||
import io.fabric8.kubernetes.client.Watcher; | |||
import lombok.Data; | |||
import lombok.extern.slf4j.Slf4j; | |||
/** | |||
* @description Job监视器 | |||
* @date 2020-09-24 | |||
*/ | |||
@Data | |||
@Slf4j | |||
public class JobWatcher implements Watcher<Job> { | |||
private String namespace; | |||
private String jobName; | |||
private KubeWatcherManager manager; | |||
private JobHandler jobHandler; | |||
public JobWatcher(JobHandler jobHandler, KubeWatcherManager manager) { | |||
this.manager = manager; | |||
this.jobHandler = jobHandler; | |||
} | |||
/** | |||
* 接收事件进行处理 | |||
* @param action 事件类型 | |||
* @param job job信息 | |||
*/ | |||
@Override | |||
public void eventReceived(Action action, Job job) { | |||
log.info("Job Event received: {} at {}", job.getMetadata().getUid(), job.getMetadata().getCreationTimestamp()); | |||
jobHandler.handleJob(job); | |||
} | |||
/** | |||
* 关闭事件 | |||
* @param e 客户端异常 | |||
*/ | |||
@Override | |||
public void onClose(KubernetesClientException e) { | |||
log.debug("job watcher close"); | |||
if (e != null) { | |||
log.error(e.getMessage()); | |||
log.info("restart new job watcher thread"); | |||
manager.putNewWatcher(); | |||
} | |||
} | |||
} |
@@ -0,0 +1,120 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator.watcher; | |||
import io.fabric8.kubernetes.client.KubernetesClient; | |||
import lombok.extern.slf4j.Slf4j; | |||
import org.onebrain.operator.context.KubeContext; | |||
import org.springframework.beans.factory.annotation.Autowired; | |||
import org.springframework.stereotype.Component; | |||
import java.util.concurrent.LinkedBlockingQueue; | |||
import java.util.concurrent.ThreadFactory; | |||
import java.util.concurrent.ThreadPoolExecutor; | |||
import java.util.concurrent.TimeUnit; | |||
import java.util.concurrent.atomic.AtomicInteger; | |||
/** | |||
* @description 监视器的管理器 | |||
* @date 2020-09-24 | |||
*/ | |||
@Slf4j | |||
@Component | |||
public class KubeWatcherManager { | |||
/** | |||
* 监视队列 | |||
*/ | |||
private static final LinkedBlockingQueue<JobWatcher> watchQueue = new LinkedBlockingQueue<>(1000); | |||
/** | |||
* 单例线程池 | |||
*/ | |||
private ThreadPoolExecutor pool = new ThreadPoolExecutor(1, 1, 1, TimeUnit.SECONDS, new LinkedBlockingQueue<>(1), new ThreadFactory() { | |||
private final AtomicInteger mThreadNum = new AtomicInteger(1); | |||
@Override | |||
public Thread newThread(Runnable r) { | |||
return new Thread(r, "job-watcher-" + mThreadNum.getAndIncrement()); | |||
} | |||
}); | |||
@Autowired | |||
private KubeContext kubeContext; | |||
@Autowired | |||
private JobHandler jobHandler; | |||
/** | |||
* 第一次启动时 | |||
*/ | |||
public void startWatching(){ | |||
JobWatchHolder jobWatchHolder = new JobWatchHolder(); | |||
pool.execute(jobWatchHolder); | |||
putNewWatcher(); | |||
} | |||
/** | |||
* 监听指定job | |||
* @param jobWatcher | |||
*/ | |||
public void watch(JobWatcher jobWatcher){ | |||
KubernetesClient client = kubeContext.getClient(); | |||
//监听指定job | |||
client.batch().jobs() | |||
.inAnyNamespace().watch(jobWatcher); | |||
} | |||
/** | |||
* 加入新watcher | |||
*/ | |||
public void putNewWatcher(){ | |||
try { | |||
JobWatcher jobWatcher = new JobWatcher(jobHandler, this); | |||
watchQueue.put(jobWatcher); | |||
} catch (InterruptedException e) { | |||
e.printStackTrace(); | |||
} | |||
} | |||
/** | |||
* Job监视器持有者 | |||
*/ | |||
class JobWatchHolder implements Runnable { | |||
@Override | |||
public void run() { | |||
while(true){ | |||
try { | |||
//无监视器时阻塞 | |||
JobWatcher jobWatcher = watchQueue.take(); | |||
//启动监视器 | |||
try{ | |||
watch(jobWatcher); | |||
}catch (Exception e){ | |||
//出错不影响其他listener | |||
log.error("JobWatchHolder watch error:【{}】",e); | |||
} | |||
} catch (InterruptedException e) { | |||
log.error("JobWatchHolder run error:【{}】",e); | |||
} | |||
} | |||
} | |||
} | |||
} |
@@ -0,0 +1,27 @@ | |||
-----BEGIN RSA PRIVATE KEY----- | |||
MIIEogIBAAKCAQEA06ZOLQq4pzBZL+bybsxdl4PzYg3jB4kRVc771nm5Y8JenDAT | |||
hlOTz6+nGH4EDT63J7oNj4JYLufsONKYhJkya8p0btWeKHqz5LgEfLGwz/FTMRH5 | |||
WTCZCZUa/3i9gQeKK/CKEned1h4l2w4agrYrnXHpnuNSw6HSlTpX8FgaQGfmTkL3 | |||
XtzSCeY9F2fXGOm9fMfVmv5I5uP6B4TmKwtWPvx3a/1MDgHbmtoaCqYP/JmzWHyi | |||
mc9l2ilX3kTPxh57oRtW9N3FATc8/OCYkNt4vDUTRVB4drODaR5TgUbFtkBVGcFR | |||
f7MrQo4Krd2g8rtEv7PaWN/wlNle5ANXJ/oL3wIDAQABAoIBADiqC8APYMSSMy6Z | |||
/EohuOT51M1pvmCkF9oLYm1XhYTp4v6Z+IA8HBS8iFYMVvVc1xhxvXOwh/925E2K | |||
RH8rrM4jE+0gkAlyYHtZsQnZYOcrSwSWNVXlpvNj0iiXoNTMufdtnOm40K8kvynY | |||
qsxYDXFHsC5z2hK6XnDJgAW+8LhRHCizWwxc0dSN9r33VGry0rgndUZsj2ZBf7u5 | |||
rdslZKvRzMymXct7CIQQ3s5IUO3qbaj7TIzMIo14bmHgD3zlBQ66ESCX1o5A+hPq | |||
1gfUNqUPBtJhsNJg4YYJ/bGgGhBxAxam8jWz3DFZEuYHr6fCDIhLJzL5ulxoQS2z | |||
vJYBwsECgYEA8JGfw004BxqcBVxqBveestsCVGIWDtb+Zx4OI+uBAmYMXd2WCzxv | |||
XxgQJ/IrpNx6FAXZ/bFdE0HRZWR6H07wtNgABuBgd0tAfcH8sw2CJkTO/0N2Xr6/ | |||
O4kh3yHNMy/wAxnktISf1hE/ElEdPI6slhwGDQObRdXxaqBEq+Tjc28CgYEA4TnM | |||
rCaJ8aMaUE0nvVzrev3VTLp4f1qOcPUOnrHDdyrPs1SjYzmAOC72X/FylJZmtkvh | |||
coMQUKVQgiBn1dTtnALANq705b1S+0U07m6+dGJ7LWchOY2tFPiIsx3SZvNJeEKJ | |||
38PsaFi2eDcDP8cKriNoAoby8TbqjqiyHgDX9pECfxww9IfuhKJQe/gk3Ef0vKQ5 | |||
BgzdcbhLeYScAQw0jOm7C7f0P6ERc/uw1jPYLUUkkSnHhcQ1BLM9A0zeeXExzwNi | |||
TJ6BrMxOBUC3euWAr7/MUHWZckWoFMDlURLU4zccZwP2BNcis5hibQG4f7SZA6CT | |||
qCHeSlPkvmXAYkvChuUCgYEA0DNlL9KkfBqBja/1R4jpKhYSIs7R6zCkMmlm7W54 | |||
ueV6gVWBgI08KTPIj2KcwBzUsDovG3NrFpHrfY9FTZd7W1fzpdlQDDxaxGryhmMb | |||
bm1HXu5R+WktkhA6FhJAWOkXhrNDzvXHyaIQc8qvFzsBdX7HfGaRmEhixiPOHAw9 | |||
l/ECgYEAwNywUARR9HtmgoyrwifrzIkMo6jcmLNEIzi2kJ4OQQxW5eKj5JgSV0ND | |||
QUoAIWDAhHQd3ygSfbeShcvtcw+zoF92iOVFn0SLiSe1TgA5ggzC/VJUnInO7zx7 | |||
8Sj8Zk5tHrVmTlelEA2Nbq5H7/U1Q33c1AWbw8yxqD/JRxudHKA= | |||
-----END RSA PRIVATE KEY----- |
@@ -0,0 +1 @@ | |||
ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDTpk4tCrinMFkv5vJuzF2Xg/NiDeMHiRFVzvvWebljwl6cMBOGU5PPr6cYfgQNPrcnug2Pglgu5+w40piEmTJrynRu1Z4oerPkuAR8sbDP8VMxEflZMJkJlRr/eL2BB4or8IoSd53WHiXbDhqCtiudceme41LDodKVOlfwWBpAZ+ZOQvde3NIJ5j0XZ9cY6b18x9Wa/kjm4/oHhOYrC1Y+/Hdr/UwOAdua2hoKpg/8mbNYfKKZz2XaKVfeRM/GHnuhG1b03cUBNzz84JiQ23i8NRNFUHh2s4NpHlOBRsW2QFUZwVF/sytCjgqt3aDyu0S/s9pY3/CU2V7kA1cn+gvf root@{{ip}} |
@@ -0,0 +1,19 @@ | |||
apiVersion: v1 | |||
clusters: | |||
- cluster: | |||
certificate-authority-data: {} | |||
server: {} | |||
name: kubernetes | |||
contexts: | |||
- context: | |||
cluster: kubernetes | |||
user: kubernetes-admin | |||
name: kubernetes-admin@kubernetes | |||
current-context: kubernetes-admin@kubernetes | |||
kind: Config | |||
preferences: {} | |||
users: | |||
- name: kubernetes-admin | |||
user: | |||
client-certificate-data: {} | |||
client-key-data: {} |
@@ -0,0 +1,46 @@ | |||
#!/bin/bash
# Container pre-treatment script.
# Makes sure an SSH server is installed and started and that the `jq` and `nslookup`
# tools are available, using apt or yum depending on which release file exists.
# Every step appends its output to pretreatment.log in the current directory.

# Install the SSH server when the Debian-style init script is missing.
if [ ! -f "/etc/init.d/ssh" ]; then
    # No RedHat release file: use apt.
    if [ ! -f "/etc/redhat-release" ]; then
        echo 'apt install -y openssh-server' >> pretreatment.log
        apt update >> pretreatment.log
        apt install -y openssh-server >> pretreatment.log
    fi
    # No lsb-release file: use yum.
    if [ ! -f "/etc/lsb-release" ]; then
        # The yum package is openssh-server; `sshd` is only the daemon name.
        echo 'yum install -y openssh-server' >> pretreatment.log
        # Refresh repository metadata (the yum counterpart of `apt update`);
        # a plain `yum update` would stop at its confirmation prompt.
        yum makecache >> pretreatment.log
        yum install -y openssh-server >> pretreatment.log
    fi
fi

# NOTE(review): /etc/init.d/ssh is the Debian/Ubuntu init script; on RedHat-family images the
# daemon may have to be started differently - confirm against the images actually used.
echo '/etc/init.d/ssh start' >> pretreatment.log
/etc/init.d/ssh start >> pretreatment.log

# RedHat family: install jq and nslookup independently of each other.
if [ -f "/etc/redhat-release" ]; then
    if command -v jq >/dev/null 2>&1; then
        echo 'exists jq' >> pretreatment.log
    else
        # NOTE(review): jq may need an extra repository (e.g. EPEL) on some releases - verify.
        echo 'yum install jq' >> pretreatment.log
        yum install -y jq >> pretreatment.log
    fi

    if command -v nslookup >/dev/null 2>&1; then
        echo 'exists nslookup' >> pretreatment.log
    else
        # nslookup is shipped in bind-utils on yum-based systems (dnsutils is the apt name).
        echo 'yum install bind-utils' >> pretreatment.log
        yum install -y bind-utils >> pretreatment.log
    fi
fi

# Debian/Ubuntu family: install jq and nslookup independently of each other.
if [ -f "/etc/lsb-release" ]; then
    if command -v jq >/dev/null 2>&1; then
        echo 'exists jq' >> pretreatment.log
    else
        echo 'apt install jq' >> pretreatment.log
        apt install -y jq >> pretreatment.log
    fi

    if command -v nslookup >/dev/null 2>&1; then
        echo 'exists nslookup' >> pretreatment.log
    else
        echo 'apt install dnsutils' >> pretreatment.log
        apt install -y dnsutils >> pretreatment.log
    fi
fi
@@ -0,0 +1,43 @@ | |||
/** | |||
* Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
* | |||
* Licensed under the Apache License, Version 2.0 (the "License"); | |||
* you may not use this file except in compliance with the License. | |||
* You may obtain a copy of the License at | |||
* | |||
* http://www.apache.org/licenses/LICENSE-2.0 | |||
* | |||
* Unless required by applicable law or agreed to in writing, software | |||
* distributed under the License is distributed on an "AS IS" BASIS, | |||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* See the License for the specific language governing permissions and | |||
* limitations under the License. | |||
* ============================================================= | |||
*/ | |||
package org.onebrain.operator; | |||
import org.onebrain.operator.api.pod.PodApi; | |||
import org.onebrain.operator.constants.KubeConstants; | |||
import org.springframework.beans.factory.annotation.Autowired; | |||
import org.springframework.boot.test.context.SpringBootTest; | |||
import java.io.File; | |||
import java.net.URISyntaxException; | |||
import java.net.URL; | |||
@SpringBootTest | |||
public class DistributeTrainOperatorApplicationTests { | |||
@Autowired | |||
private PodApi podApi; | |||
// @Test | |||
public void contextLoads() throws URISyntaxException { | |||
final URL url = getClass().getClassLoader().getResource("key/id_rsa"); | |||
File file = new File(url.toURI()); | |||
podApi.copyToPod("default", "distribute-train-test-job-sv2dj", KubeConstants.MASTER_CONTAINER_NAME, file, "/root/.ssh/id_rsa"); | |||
} | |||
} |