| @@ -0,0 +1,26 @@ | |||
| # 之江天枢-分布式训练 operator | |||
| 该模块是分布式训练CRD的控制器,管理分布式训练容器生命周期,为分布式训练容器注入其他容器ip。 | |||
| ## 源码部署 | |||
| ### 准备环境 | |||
| 安装如下软件环境。 | |||
| - OpenJDK: 1.8+ | |||
| - Redis: 3.0+ | |||
| - Maven: 3.0+ | |||
| ### 下载源码 | |||
| ``` bash | |||
| git clone https://codeup.teambition.com/zhejianglab/distribute-train-operator.git | |||
| # 进入项目根目录 | |||
| cd distribute-train-operator | |||
| ``` | |||
| ### 构建 | |||
| ``` bash | |||
| # 构建,生成的 jar 包位于 ./target/distribute-train-operator-1.0.jar | |||
| mvn clean compile package | |||
| ``` | |||
| ### 部署 | |||
| 部署过程参看文档:[部署 分布式训练operator](http://tianshu.org.cn/?/course/1.html) | |||
| @@ -0,0 +1,65 @@ | |||
| apiVersion: onebrain.oneflow.org/v1alpha1 | |||
| kind: DistributeTrain | |||
| metadata: | |||
| name: dt-resnet50 | |||
| namespace: resnet50 | |||
| labels: | |||
| key: value | |||
| spec: | |||
| size: 3 | |||
| image: {{IMAGE}} | |||
| imagePullPolicy: IfNotPresent | |||
| masterCmd: export NODE_IPS=`cat /home/hostfile.json |jq -r '.[]|.ip'|paste -d "," -s` && cd /workspace/Classification/cnns && rm -rf core.* && rm -rf ./output/snapshots/* && python3 of_cnn_train_val.py --train_data_dir=$DATA_ROOT/train --train_data_part_num=$TRAIN_DATA_PART_NUM --val_data_dir=$DATA_ROOT/validation --val_data_part_num=$VAL_DATA_PART_NUM --num_nodes=$NODE_NUM --node_ips="$NODE_IPS" --gpu_num_per_node=$GPU_NUM_PER_NODE --model_update="momentum" --learning_rate=0.256 --loss_print_every_n_iter=1 --batch_size_per_device=64 --val_batch_size_per_device=64 --num_epoch=1 --model="resnet50" --model_save_dir=/model | |||
| masterResources: | |||
| requests: | |||
| nvidia.com/gpu: 2 | |||
| memory: "16Gi" | |||
| cpu: "2" | |||
| limits: | |||
| nvidia.com/gpu: 2 | |||
| memory: "16Gi" | |||
| cpu: "2" | |||
| slaveCmd: export NODE_IPS=`cat /home/hostfile.json |jq -r '.[]|.ip'|paste -d "," -s` && cd /workspace/Classification/cnns && rm -rf core.* && rm -rf ./output/snapshots/* && python3 of_cnn_train_val.py --train_data_dir=$DATA_ROOT/train --train_data_part_num=$TRAIN_DATA_PART_NUM --val_data_dir=$DATA_ROOT/validation --val_data_part_num=$VAL_DATA_PART_NUM --num_nodes=$NODE_NUM --node_ips="$NODE_IPS" --gpu_num_per_node=$GPU_NUM_PER_NODE --model_update="momentum" --learning_rate=0.256 --loss_print_every_n_iter=1 --batch_size_per_device=64 --val_batch_size_per_device=64 --num_epoch=1 --model="resnet50" --model_save_dir=/model | |||
| slaveResources: | |||
| requests: | |||
| nvidia.com/gpu: 2 | |||
| memory: "16Gi" | |||
| cpu: "2" | |||
| limits: | |||
| nvidia.com/gpu: 2 | |||
| memory: "16Gi" | |||
| cpu: "2" | |||
| nodeSelector: | |||
| kubernetes.io/hostname: node02 | |||
| env: | |||
| - name: ENABLE_USER_OP | |||
| value: 'True' | |||
| - name: DATA_ROOT | |||
| value: '/dataset' | |||
| - name: NODE_NUM | |||
| value: 3 | |||
| - name: GPU_NUM_PER_NODE | |||
| value: 2 | |||
| - name: ONEFLOW_DEBUG_MODE | |||
| value: "" | |||
| - name: TRAIN_DATA_PART_NUM | |||
| value: 6 | |||
| - name: VAL_DATA_PART_NUM | |||
| value: 6 | |||
| - name: NCCL_DEBUG | |||
| value: INFO | |||
| datasetStorage: | |||
| name: pvc-dataset | |||
| nfs: | |||
| path: {{DATASET}} | |||
| server: {{NFS}} | |||
| workspaceStorage: | |||
| name: pvc-workspace | |||
| nfs: | |||
| path: /nfs/resnet50/workspace | |||
| server: {{WORKSPACE}} | |||
| modelStorage: | |||
| name: pvc-model | |||
| nfs: | |||
| path: /nfs/resnet50/model | |||
| server: {{MODEL}} | |||
| @@ -0,0 +1,61 @@ | |||
| --- | |||
| apiVersion: apiextensions.k8s.io/v1beta1 | |||
| kind: CustomResourceDefinition | |||
| metadata: | |||
| name: distributetrains.onebrain.oneflow.org | |||
| spec: | |||
| group: onebrain.oneflow.org | |||
| names: | |||
| kind: DistributeTrain | |||
| singular: distributetrain | |||
| plural: distributetrains | |||
| shortNames: | |||
| - dt | |||
| scope: Namespaced | |||
| subresources: | |||
| status: {} | |||
| version: v1alpha1 | |||
| validation: | |||
| openAPIV3Schema: | |||
| properties: | |||
| apiVersion: | |||
| type: string | |||
| kind: | |||
| type: string | |||
| metadata: | |||
| type: object | |||
| spec: | |||
| properties: | |||
| image: | |||
| type: string | |||
| imagePullPolicy: | |||
| type: string | |||
| size: | |||
| format: int32 | |||
| type: integer | |||
| masterCmd: | |||
| type: string | |||
| slaveCmd: | |||
| type: string | |||
| masterResources: | |||
| type: object | |||
| slaveResources: | |||
| type: object | |||
| nodeSelector: | |||
| type: object | |||
| initContainer: | |||
| type: object | |||
| datasetStorage: | |||
| type: object | |||
| workspaceStorage: | |||
| type: object | |||
| modelStorage: | |||
| type: object | |||
| required: | |||
| - image | |||
| - imagePullPolicy | |||
| - size | |||
| - masterCmd | |||
| - slaveCmd | |||
| - workspaceStorage | |||
| type: object | |||
| @@ -0,0 +1,47 @@ | |||
| kind: Deployment | |||
| apiVersion: apps/v1 | |||
| metadata: | |||
| name: distribute-train-operator | |||
| namespace: test-ns | |||
| labels: | |||
| name: distribute-train-operator | |||
| spec: | |||
| replicas: 1 | |||
| selector: | |||
| matchLabels: | |||
| name: distribute-train-operator | |||
| template: | |||
| metadata: | |||
| labels: | |||
| name: distribute-train-operator | |||
| spec: | |||
| containers: | |||
| - name: distribute-train-operator | |||
| image: {{IMAGE}} | |||
| ports: | |||
| - containerPort: 8080 | |||
| protocol: TCP | |||
| volumeMounts: | |||
| - mountPath: /root/config | |||
| name: config-volume | |||
| env: | |||
| - name: JAR_BALL | |||
| value: "distribute-train-operator-1.0.jar --k8s.kubeconfig=/root/config --spring.redis.host=192.168.1.104" | |||
| imagePullPolicy: IfNotPresent | |||
| volumes: | |||
| - name: config-volume | |||
| hostPath: | |||
| path: /root/.kube/config | |||
| restartPolicy: Always | |||
| terminationGracePeriodSeconds: 30 | |||
| securityContext: | |||
| runAsUser: 0 | |||
| schedulerName: default-scheduler | |||
| strategy: | |||
| type: RollingUpdate | |||
| rollingUpdate: | |||
| maxUnavailable: 1 | |||
| maxSurge: 1 | |||
| revisionHistoryLimit: 7 | |||
| progressDeadlineSeconds: 600 | |||
| @@ -0,0 +1,150 @@ | |||
| <?xml version="1.0" encoding="UTF-8"?> | |||
| <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | |||
| xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd"> | |||
| <modelVersion>4.0.0</modelVersion> | |||
| <parent> | |||
| <groupId>org.springframework.boot</groupId> | |||
| <artifactId>spring-boot-starter-parent</artifactId> | |||
| <version>2.2.5.RELEASE</version> | |||
| </parent> | |||
| <groupId>org.onebrain</groupId> | |||
| <artifactId>distribute-train-operator</artifactId> | |||
| <version>1.0</version> | |||
| <name>distribute-train-operator</name> | |||
| <description>distribute-train operator</description> | |||
| <properties> | |||
| <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> | |||
| <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding> | |||
| <java.version>1.8</java.version> | |||
| <fabric.io.version>4.9.0</fabric.io.version> | |||
| </properties> | |||
| <dependencies> | |||
| <!-- web --> | |||
| <dependency> | |||
| <groupId>org.springframework.boot</groupId> | |||
| <artifactId>spring-boot-starter-web</artifactId> | |||
| </dependency> | |||
| <!-- k8s --> | |||
| <dependency> | |||
| <groupId>io.fabric8</groupId> | |||
| <artifactId>kubernetes-client</artifactId> | |||
| <version>${fabric.io.version}</version> | |||
| </dependency> | |||
| <dependency> | |||
| <groupId>io.fabric8</groupId> | |||
| <artifactId>kubernetes-assertions</artifactId> | |||
| <version>4.0.0</version> | |||
| <scope>test</scope> | |||
| </dependency> | |||
| <!-- configuration processor --> | |||
| <dependency> | |||
| <groupId>org.springframework.boot</groupId> | |||
| <artifactId>spring-boot-configuration-processor</artifactId> | |||
| </dependency> | |||
| <!-- redis --> | |||
| <dependency> | |||
| <groupId>org.springframework.boot</groupId> | |||
| <artifactId>spring-boot-starter-data-redis</artifactId> | |||
| </dependency> | |||
| <dependency> | |||
| <groupId>redis.clients</groupId> | |||
| <artifactId>jedis</artifactId> | |||
| </dependency> | |||
| <!-- common jars --> | |||
| <dependency> | |||
| <groupId>commons-io</groupId> | |||
| <artifactId>commons-io</artifactId> | |||
| <version>2.6</version> | |||
| </dependency> | |||
| <dependency> | |||
| <groupId>org.apache.commons</groupId> | |||
| <artifactId>commons-compress</artifactId> | |||
| <version>1.19</version> | |||
| </dependency> | |||
| <dependency> | |||
| <groupId>commons-codec</groupId> | |||
| <artifactId>commons-codec</artifactId> | |||
| </dependency> | |||
| <!-- tools --> | |||
| <dependency> | |||
| <groupId>cn.hutool</groupId> | |||
| <artifactId>hutool-all</artifactId> | |||
| <version>5.1.1</version> | |||
| </dependency> | |||
| <dependency> | |||
| <groupId>com.google.guava</groupId> | |||
| <artifactId>guava</artifactId> | |||
| <version>27.0.1-jre</version> | |||
| </dependency> | |||
| <dependency> | |||
| <groupId>com.alibaba</groupId> | |||
| <artifactId>fastjson</artifactId> | |||
| <version>1.2.54</version> | |||
| </dependency> | |||
| <dependency> | |||
| <groupId>org.projectlombok</groupId> | |||
| <artifactId>lombok</artifactId> | |||
| <optional>true</optional> | |||
| </dependency> | |||
| <dependency> | |||
| <groupId>org.springframework.boot</groupId> | |||
| <artifactId>spring-boot-starter-test</artifactId> | |||
| <scope>test</scope> | |||
| </dependency> | |||
| </dependencies> | |||
| <build> | |||
| <plugins> | |||
| <plugin> | |||
| <groupId>org.springframework.boot</groupId> | |||
| <artifactId>spring-boot-maven-plugin</artifactId> | |||
| </plugin> | |||
| <!-- 打包时跳过测试 --> | |||
| <plugin> | |||
| <groupId>org.apache.maven.plugins</groupId> | |||
| <artifactId>maven-surefire-plugin</artifactId> | |||
| <configuration> | |||
| <skip>true</skip> | |||
| </configuration> | |||
| </plugin> | |||
| </plugins> | |||
| </build> | |||
| <repositories> | |||
| <repository> | |||
| <id>public</id> | |||
| <name>aliyun nexus</name> | |||
| <url>http://maven.aliyun.com/nexus/content/groups/public/</url> | |||
| <releases> | |||
| <enabled>true</enabled> | |||
| </releases> | |||
| </repository> | |||
| </repositories> | |||
| <pluginRepositories> | |||
| <pluginRepository> | |||
| <id>public</id> | |||
| <name>aliyun nexus</name> | |||
| <url>http://maven.aliyun.com/nexus/content/groups/public/</url> | |||
| <releases> | |||
| <enabled>true</enabled> | |||
| </releases> | |||
| <snapshots> | |||
| <enabled>false</enabled> | |||
| </snapshots> | |||
| </pluginRepository> | |||
| </pluginRepositories> | |||
| </project> | |||
| @@ -0,0 +1,35 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator; | |||
| import org.springframework.boot.SpringApplication; | |||
| import org.springframework.boot.autoconfigure.SpringBootApplication; | |||
| import org.springframework.scheduling.annotation.EnableAsync; | |||
| /** | |||
| * @description Operator启动类 | |||
| * @date 2020-09-03 | |||
| */ | |||
| @SpringBootApplication | |||
| @EnableAsync | |||
| public class DistributeTrainOperatorApplication { | |||
| public static void main(String[] args) { | |||
| SpringApplication.run(DistributeTrainOperatorApplication.class, args); | |||
| } | |||
| } | |||
| @@ -0,0 +1,199 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.action; | |||
| import cn.hutool.core.util.StrUtil; | |||
| import com.fasterxml.jackson.core.JsonProcessingException; | |||
| import com.google.common.collect.Maps; | |||
| import io.fabric8.kubernetes.api.model.apiextensions.*; | |||
| import io.fabric8.kubernetes.client.KubernetesClient; | |||
| import io.fabric8.kubernetes.client.dsl.MixedOperation; | |||
| import io.fabric8.kubernetes.client.dsl.Resource; | |||
| import io.fabric8.kubernetes.client.dsl.base.CustomResourceDefinitionContext; | |||
| import io.fabric8.kubernetes.client.informers.SharedIndexInformer; | |||
| import io.fabric8.kubernetes.client.informers.SharedInformerFactory; | |||
| import io.fabric8.kubernetes.client.internal.SerializationUtils; | |||
| import io.fabric8.kubernetes.internal.KubernetesDeserializer; | |||
| import lombok.extern.slf4j.Slf4j; | |||
| import org.onebrain.operator.controller.DistributeTrainController; | |||
| import org.onebrain.operator.crd.DistributeTrain; | |||
| import org.onebrain.operator.crd.DistributeTrainList; | |||
| import org.onebrain.operator.crd.DoneableDistributeTrain; | |||
| import org.onebrain.operator.utils.DistributeTrainClientHolder; | |||
| import org.onebrain.operator.utils.SpringContextHolder; | |||
| import org.springframework.beans.factory.annotation.Autowired; | |||
| import org.springframework.beans.factory.support.BeanDefinitionBuilder; | |||
| import org.springframework.beans.factory.support.DefaultListableBeanFactory; | |||
| import org.springframework.context.ConfigurableApplicationContext; | |||
| import org.springframework.stereotype.Component; | |||
| import java.util.Map; | |||
| import static org.onebrain.operator.constants.CrdConstants.*; | |||
| /** | |||
| * @description operator 主控制器 | |||
| * @date 2020-09-23 | |||
| */ | |||
| @Component | |||
| @Slf4j | |||
| public class DistributeTrainOperatorManager { | |||
| public static final String NAMESPACE_DEFAULT = "default"; | |||
| public static final String TYPE_STRING = "string"; | |||
| public static final String TYPE_INTEGER = "integer"; | |||
| public static final String TYPE_OBJECT = "object"; | |||
| public static final String TYPE_ARRAY = "array"; | |||
| public static final String FORMAT_INT_32 = "int32"; | |||
| @Autowired | |||
| private KubernetesClient client; | |||
| private CustomResourceDefinition crd; | |||
| private String namespace; | |||
| /** | |||
| * 检查crd是否存在,若不存在则创建 | |||
| * @throws JsonProcessingException | |||
| */ | |||
| public void createCrdIfNotExists() throws JsonProcessingException { | |||
| String namespace = client.getNamespace(); | |||
| if (namespace == null) { | |||
| log.info("No namespace found via config, assuming default."); | |||
| namespace = NAMESPACE_DEFAULT; | |||
| } | |||
| this.namespace = namespace; | |||
| log.info("Using namespace : {}", namespace); | |||
| //检查crd是否已存在 | |||
| CustomResourceDefinition crd = client.customResourceDefinitions().withName(CRD_NAME).get(); | |||
| if(crd == null){ | |||
| Map<String, JSONSchemaProps> crdPropsMap = buildCrdProperties(); | |||
| log.info("crd props map is : 【{}】",crdPropsMap); | |||
| //如不存在,则创建 | |||
| CustomResourceDefinition distributeTrainCustomResourceDefinition = new CustomResourceDefinitionBuilder() | |||
| .withApiVersion(CRD_API_VERSION) | |||
| .withNewMetadata() | |||
| .withName(CRD_NAME) | |||
| .endMetadata() | |||
| .withNewSpec() | |||
| .withGroup(CRD_GROUP) | |||
| .withVersion(CRD_VERSION) | |||
| .withScope(CRD_SCOPE) | |||
| .withNewNames() | |||
| .withKind(CRD_KIND) | |||
| .withSingular(CRD_SINGULAR_NAME) | |||
| .withPlural(CRD_PLURAL_NAME) | |||
| .withShortNames(CRD_SHORT_NAME) | |||
| .endNames() | |||
| .withNewValidation() | |||
| .withNewOpenAPIV3Schema() | |||
| .addToProperties(crdPropsMap) | |||
| .endOpenAPIV3Schema() | |||
| .endValidation() | |||
| .endSpec() | |||
| .build(); | |||
| distributeTrainCustomResourceDefinition = client.customResourceDefinitions().create(distributeTrainCustomResourceDefinition); | |||
| log.info("create crd successfully : \n{}", SerializationUtils.dumpAsYaml(distributeTrainCustomResourceDefinition)); | |||
| crd = distributeTrainCustomResourceDefinition; | |||
| } | |||
| //注册到k8s反序列化解析器 | |||
| KubernetesDeserializer.registerCustomKind(CRD_GROUP + StrUtil.SLASH + CRD_VERSION, CRD_KIND, DistributeTrain.class); | |||
| this.crd = crd; | |||
| } | |||
| /** | |||
| * 初始化informer | |||
| */ | |||
| public void initInformer(){ | |||
| CustomResourceDefinitionContext distributeTrainCustomResourceDefinitionContext = new CustomResourceDefinitionContext.Builder() | |||
| .withVersion(CRD_VERSION) | |||
| .withScope(CRD_SCOPE) | |||
| .withGroup(CRD_GROUP) | |||
| .withPlural(CRD_PLURAL_NAME) | |||
| .build(); | |||
| SharedInformerFactory informerFactory = client.informers(); | |||
| MixedOperation<DistributeTrain, DistributeTrainList, DoneableDistributeTrain, Resource<DistributeTrain, DoneableDistributeTrain>> distributeTrainClient = client.customResources(this.crd, DistributeTrain.class, DistributeTrainList.class, DoneableDistributeTrain.class); | |||
| SharedIndexInformer<DistributeTrain> distributeTrainSharedIndexInformer = informerFactory.sharedIndexInformerForCustomResource(distributeTrainCustomResourceDefinitionContext, DistributeTrain.class, DistributeTrainList.class, 10 * 60 * 1000); | |||
| //使用静态变量维持 | |||
| DistributeTrainClientHolder.setDistributeTrainClient(distributeTrainClient); | |||
| //手动注册controller到ioc容器 | |||
| BeanDefinitionBuilder beanDefinitionBuilder = BeanDefinitionBuilder.genericBeanDefinition(DistributeTrainController.class); | |||
| DefaultListableBeanFactory beanFactory = (DefaultListableBeanFactory)((ConfigurableApplicationContext) SpringContextHolder.applicationContext).getBeanFactory(); | |||
| beanDefinitionBuilder.addConstructorArgValue(distributeTrainClient); | |||
| beanDefinitionBuilder.addConstructorArgValue(distributeTrainSharedIndexInformer); | |||
| beanDefinitionBuilder.addConstructorArgValue(namespace); | |||
| beanFactory.registerBeanDefinition("org.onebrain.operator.controller.DistributeTrainController", beanDefinitionBuilder.getRawBeanDefinition()); | |||
| //取得托管的controller | |||
| DistributeTrainController controller = SpringContextHolder.getBean(DistributeTrainController.class); | |||
| //注册informer监听 | |||
| controller.create(); | |||
| informerFactory.startAllRegisteredInformers(); | |||
| //等待就绪 | |||
| controller.run(); | |||
| } | |||
| /** | |||
| * 生成crd属性 | |||
| * @return crd属性集合 | |||
| */ | |||
| private Map<String, JSONSchemaProps> buildCrdProperties(){ | |||
| Map<String, JSONSchemaProps> properties = Maps.newHashMap(); | |||
| JSONSchemaProps stringType = new JSONSchemaPropsBuilder() | |||
| .withType(TYPE_STRING) | |||
| .build(); | |||
| JSONSchemaProps intType = new JSONSchemaPropsBuilder() | |||
| .withType(TYPE_INTEGER) | |||
| .withFormat(FORMAT_INT_32) | |||
| .build(); | |||
| JSONSchemaProps objectType = new JSONSchemaPropsBuilder() | |||
| .withType(TYPE_OBJECT) | |||
| .build(); | |||
| JSONSchemaProps arrayType = new JSONSchemaPropsBuilder() | |||
| .withType(TYPE_ARRAY) | |||
| .withNewItems() | |||
| .endItems() | |||
| .build(); | |||
| //添加属性校验规则 | |||
| JSONSchemaProps specObjectType = new JSONSchemaPropsBuilder() | |||
| .addToProperties("image", stringType) | |||
| .addToProperties("imagePullPolicy", stringType) | |||
| .addToProperties("size", intType) | |||
| .addToProperties("env", arrayType) | |||
| .addToProperties("masterCmd", stringType) | |||
| .addToProperties("slaveCmd", stringType) | |||
| .addToProperties("masterResources", objectType) | |||
| .addToProperties("slaveResources", objectType) | |||
| .addToProperties("nodeSelector", objectType) | |||
| .addToProperties("initContainer", objectType) | |||
| .addToProperties("datasetStorage", objectType) | |||
| .addToProperties("workspaceStorage", objectType) | |||
| .addToProperties("modelStorage", objectType) | |||
| .withType("object") | |||
| .addToRequired("image", "imagePullPolicy", "size", "masterCmd", "slaveCmd", "workspaceStorage") | |||
| .build(); | |||
| properties.put("apiVersion", stringType); | |||
| properties.put("kind", stringType); | |||
| properties.put("metadata", objectType); | |||
| properties.put("spec", specObjectType); | |||
| return properties; | |||
| } | |||
| } | |||
| @@ -0,0 +1,58 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.action; | |||
| import lombok.extern.slf4j.Slf4j; | |||
| import org.onebrain.operator.watcher.KubeWatcherManager; | |||
| import org.springframework.beans.factory.annotation.Autowired; | |||
| import org.springframework.boot.ApplicationArguments; | |||
| import org.springframework.boot.ApplicationRunner; | |||
| import org.springframework.stereotype.Component; | |||
| /** | |||
| * @description Operator运行入口 | |||
| * @date 2020-09-23 | |||
| */ | |||
| @Component | |||
| @Slf4j | |||
| public class OperatorRunner implements ApplicationRunner { | |||
| @Autowired | |||
| private DistributeTrainOperatorManager operatorManager; | |||
| @Autowired | |||
| private KubeWatcherManager watcherManager; | |||
| /** | |||
| * spring 容器完全启动后 注册operator运行逻辑 | |||
| * @param args | |||
| * @throws Exception | |||
| */ | |||
| @Override | |||
| public void run(ApplicationArguments args) throws Exception { | |||
| //检查crd是否已存在,如果不存在则创建 | |||
| operatorManager.createCrdIfNotExists(); | |||
| //job监控者启动 | |||
| watcherManager.startWatching(); | |||
| log.info("job watcher is running"); | |||
| //初始化informer | |||
| operatorManager.initInformer(); | |||
| } | |||
| } | |||
| @@ -0,0 +1,44 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.action; | |||
| import lombok.AllArgsConstructor; | |||
| import lombok.Builder; | |||
| import lombok.Data; | |||
| import lombok.NoArgsConstructor; | |||
| /** | |||
| * @description pod信息类 | |||
| * @date 2020-09-23 | |||
| */ | |||
| @Data | |||
| @NoArgsConstructor | |||
| @AllArgsConstructor | |||
| @Builder | |||
| public class PodInfo { | |||
| /** | |||
| * ip地址 | |||
| */ | |||
| private String ip; | |||
| /** | |||
| * 角色 | |||
| */ | |||
| private String role; | |||
| } | |||
| @@ -0,0 +1,41 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.action.deployer; | |||
| import cn.hutool.core.util.RandomUtil; | |||
| import lombok.Data; | |||
| import lombok.experimental.Accessors; | |||
| /** | |||
| * @description 创建资源的信息的抽象类 | |||
| * @date 2020-04-30 | |||
| */ | |||
| @Data | |||
| @Accessors(chain = true) | |||
| public abstract class AbstractResourceCreateInfo { | |||
| /** | |||
| * 生成随机字符串 | |||
| * @param digits 位数 | |||
| * @return | |||
| */ | |||
| protected static String getRandomStr(Integer digits){ | |||
| return RandomUtil.randomString(digits); | |||
| } | |||
| } | |||
| @@ -0,0 +1,227 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.action.deployer; | |||
| import cn.hutool.core.collection.CollectionUtil; | |||
| import cn.hutool.core.util.StrUtil; | |||
| import io.fabric8.kubernetes.api.model.*; | |||
| import lombok.Data; | |||
| import lombok.experimental.Accessors; | |||
| import org.onebrain.operator.constants.KubeConstants; | |||
| import org.onebrain.operator.constants.NumberConstant; | |||
| import org.onebrain.operator.crd.DistributeTrain; | |||
| import java.util.List; | |||
| import java.util.Map; | |||
| import java.util.Optional; | |||
| import java.util.stream.Collectors; | |||
| /** | |||
| * @description 暂存创建子资源所需的信息 | |||
| * @date 2020-06-16 | |||
| */ | |||
| @Data | |||
| @Accessors(chain = true) | |||
| public class ChildResourceCreateInfo extends AbstractResourceCreateInfo { | |||
| public static final String SLAVE_TEMPLATE = "{}-slave-{}"; | |||
| public static final String MASTER_TEMPLATE = "{}-master-{}"; | |||
| public static final String SVC_TEMPLATE = "{}-svc"; | |||
| /** | |||
| * 父级名称(分布式训练名称) | |||
| */ | |||
| private String parentName; | |||
| /** | |||
| * job名称 | |||
| */ | |||
| private String jobName; | |||
| /** | |||
| * statefullSet名称 | |||
| */ | |||
| private String statefulSetName; | |||
| /** | |||
| * 服务名称 | |||
| */ | |||
| private String svcName; | |||
| /** | |||
| * 命名空间 | |||
| */ | |||
| private String namespace; | |||
| /** | |||
| * 镜像 | |||
| */ | |||
| private String image; | |||
| /** | |||
| * 镜像拉取策略 | |||
| */ | |||
| private String imagePullPolicy; | |||
| /** | |||
| * 标签 | |||
| */ | |||
| private Map<String, String> labels; | |||
| /** | |||
| * master副本数 | |||
| */ | |||
| private Integer masterReplicas; | |||
| /** | |||
| * slave副本数 | |||
| */ | |||
| private Integer slaveReplicas; | |||
| /** | |||
| * master命令 | |||
| */ | |||
| private String masterCmd; | |||
| /** | |||
| * slave命令 | |||
| */ | |||
| private String slaveCmd; | |||
| /** | |||
| * master 资源节点限制 | |||
| */ | |||
| private ResourceRequirements masterResources; | |||
| /** | |||
| * slave 资源节点限制 | |||
| */ | |||
| private ResourceRequirements slaveResources; | |||
| /** | |||
| * 节点调度选择器 | |||
| */ | |||
| private Map<String, String> nodeSelector; | |||
| /** | |||
| * 初始化容器 | |||
| */ | |||
| private Container initContainer; | |||
| /** | |||
| * 工作目录挂载 | |||
| */ | |||
| private Volume workspaceVolume; | |||
| /** | |||
| * 数据集目录挂载 | |||
| */ | |||
| private Volume datasetVolume; | |||
| /** | |||
| * 模型目录挂载 | |||
| */ | |||
| private Volume modelVolume; | |||
| /** | |||
| * 环境变量 | |||
| */ | |||
| private List<EnvVar> env; | |||
| /** | |||
| * 拥有者信息 | |||
| */ | |||
| private OwnerReference ownerReference; | |||
| /** | |||
| * 将分布式训练转换为K8S的资源信息 | |||
| * @param distributeTrain 分布式训练 | |||
| * @return ChildResourceCreateInfo | |||
| */ | |||
| public static ChildResourceCreateInfo fromCr(DistributeTrain distributeTrain){ | |||
| // Flattens the DistributeTrain custom resource into the values used to create its child resources. | |||
| ChildResourceCreateInfo result = new ChildResourceCreateInfo(); | |||
| // owner reference pointing back at the custom resource | |||
| result.generateOwnerReference(distributeTrain); | |||
| // the parent name is set first because the child resource names are derived from it | |||
| result.setNamespace(distributeTrain.getMetadata().getNamespace()); | |||
| result.setParentName(distributeTrain.getMetadata().getName()); | |||
| result.generateResoureName(); | |||
| // labels | |||
| result.setLabels(distributeTrain.getMetadata().getLabels()); | |||
| // image and pull policy | |||
| result.setImage(distributeTrain.getSpec().getImage()); | |||
| result.setImagePullPolicy(distributeTrain.getSpec().getImagePullPolicy()); | |||
| // replicas: one master, the remaining (size - 1) are slaves | |||
| Integer totalSize = distributeTrain.getSpec().getSize(); | |||
| result.setMasterReplicas(NumberConstant.NUMBER_1); | |||
| result.setSlaveReplicas(totalSize - NumberConstant.NUMBER_1); | |||
| // command lines | |||
| result.setMasterCmd(distributeTrain.getSpec().getMasterCmd()); | |||
| result.setSlaveCmd(distributeTrain.getSpec().getSlaveCmd()); | |||
| // storage: each volume is copied only when the spec declares it | |||
| Optional.ofNullable(distributeTrain.getSpec().getWorkspaceStorage()) | |||
| .ifPresent(result::setWorkspaceVolume); | |||
| Optional.ofNullable(distributeTrain.getSpec().getDatasetStorage()) | |||
| .ifPresent(result::setDatasetVolume); | |||
| Optional.ofNullable(distributeTrain.getSpec().getModelStorage()) | |||
| .ifPresent(result::setModelVolume); | |||
| // resource requirements for the master and the slaves | |||
| Optional.ofNullable(distributeTrain.getSpec().getMasterResources()) | |||
| .ifPresent(result::setMasterResources); | |||
| Optional.ofNullable(distributeTrain.getSpec().getSlaveResources()) | |||
| .ifPresent(result::setSlaveResources); | |||
| // environment variables: entries named KubeConstants.ENV_NODE_NUM are removed | |||
| List<EnvVar> declaredEnv = distributeTrain.getSpec().getEnv(); | |||
| if(CollectionUtil.isNotEmpty(declaredEnv)){ | |||
| List<EnvVar> filteredEnv = declaredEnv.stream() | |||
| .filter(envVar -> !KubeConstants.ENV_NODE_NUM.equals(envVar.getName())) | |||
| .collect(Collectors.toList()); | |||
| result.setEnv(filteredEnv); | |||
| } | |||
| // node scheduling | |||
| result.setNodeSelector(distributeTrain.getSpec().getNodeSelector()); | |||
| // init container | |||
| result.setInitContainer(distributeTrain.getSpec().getInitContainer()); | |||
| return result; | |||
| } | |||
| /** | |||
| * Derives the child resource names from the parent name. | |||
| * The StatefulSet and Job names share one suffix obtained from getRandomStr; | |||
| * the Service name is formatted from the parent name only. | |||
| */ | |||
| private void generateResoureName(){ | |||
| String sharedSuffix = getRandomStr(NumberConstant.NUMBER_5); | |||
| this.svcName = StrUtil.format(SVC_TEMPLATE, this.parentName); | |||
| this.jobName = StrUtil.format(MASTER_TEMPLATE, this.parentName, sharedSuffix); | |||
| this.statefulSetName = StrUtil.format(SLAVE_TEMPLATE, this.parentName, sharedSuffix); | |||
| } | |||
| /** | |||
| * Builds the owner reference that points at the given custom resource. | |||
| * @param distributeTrain the distributed training resource that owns the child resources | |||
| */ | |||
| private void generateOwnerReference(DistributeTrain distributeTrain){ | |||
| OwnerReferenceBuilder owner = new OwnerReferenceBuilder(); | |||
| // type information of the owner | |||
| owner.withApiVersion(distributeTrain.getApiVersion()); | |||
| owner.withKind(distributeTrain.getKind()); | |||
| // identity of the owner | |||
| owner.withName(distributeTrain.getMetadata().getName()); | |||
| owner.withNewUid(distributeTrain.getMetadata().getUid()); | |||
| this.ownerReference = owner.build(); | |||
| } | |||
| } | |||
| @@ -0,0 +1,35 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.action.deployer; | |||
| import io.fabric8.kubernetes.api.model.batch.JobBuilder; | |||
| /** | |||
| * @description Job deployer contract; standardizes the method used to build a Job. | |||
| * T must be a subtype of AbstractResourceCreateInfo. | |||
| * @date 2020-09-23 | |||
| */ | |||
| public interface JobDeployer<T extends AbstractResourceCreateInfo> { | |||
| /** | |||
| * Builds the Job definition. | |||
| * @param info resource information | |||
| * @return the Job builder | |||
| */ | |||
| JobBuilder deploy(T info); | |||
| } | |||
| @@ -0,0 +1,33 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.action.deployer; | |||
| import io.fabric8.kubernetes.api.model.ServiceBuilder; | |||
| /** | |||
| * @description Service deployer contract. | |||
| * T must be a subtype of AbstractResourceCreateInfo. | |||
| * @date 2020-09-23 | |||
| */ | |||
| public interface ServiceDeployer<T extends AbstractResourceCreateInfo> { | |||
| /** | |||
| * Builds the Service definition. | |||
| * @param info resource information | |||
| * @return the Service builder | |||
| */ | |||
| ServiceBuilder deploy(T info); | |||
| } | |||
| @@ -0,0 +1,33 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.action.deployer; | |||
| import io.fabric8.kubernetes.api.model.apps.StatefulSetBuilder; | |||
| /** | |||
| * @description StatefulSet deployer contract. | |||
| * T must be a subtype of AbstractResourceCreateInfo. | |||
| * @date 2020-09-23 | |||
| */ | |||
| public interface StatefulSetDeployer<T extends AbstractResourceCreateInfo> { | |||
| /** | |||
| * Builds the StatefulSet definition. | |||
| * @param info resource information | |||
| * @return the StatefulSet builder | |||
| */ | |||
| StatefulSetBuilder deploy(T info); | |||
| } | |||
| @@ -0,0 +1,246 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.action.deployer.impl; | |||
| import cn.hutool.core.collection.CollectionUtil; | |||
| import com.google.common.collect.Lists; | |||
| import io.fabric8.kubernetes.api.model.CapabilitiesBuilder; | |||
| import io.fabric8.kubernetes.api.model.Container; | |||
| import io.fabric8.kubernetes.api.model.ContainerPortBuilder; | |||
| import io.fabric8.kubernetes.api.model.EnvVar; | |||
| import io.fabric8.kubernetes.api.model.EnvVarBuilder; | |||
| import io.fabric8.kubernetes.api.model.SecurityContextBuilder; | |||
| import io.fabric8.kubernetes.api.model.Volume; | |||
| import io.fabric8.kubernetes.api.model.VolumeBuilder; | |||
| import io.fabric8.kubernetes.api.model.VolumeMount; | |||
| import io.fabric8.kubernetes.api.model.VolumeMountBuilder; | |||
| import io.fabric8.kubernetes.api.model.batch.JobBuilder; | |||
| import org.onebrain.operator.action.deployer.ChildResourceCreateInfo; | |||
| import org.onebrain.operator.action.deployer.JobDeployer; | |||
| import org.onebrain.operator.constants.KubeConstants; | |||
| import java.util.*; | |||
| import static org.onebrain.operator.constants.NumberConstant.LONG_NUMBER_0; | |||
| import static org.onebrain.operator.constants.NumberConstant.NUMBER_1; | |||
| import static org.onebrain.operator.constants.NumberConstant.NUMBER_22; | |||
| /** | |||
| * @description Job deployer: builds the Job that runs the master container. | |||
| * @date 2020-09-23 | |||
| */ | |||
| public class BaseJobDeployer implements JobDeployer<ChildResourceCreateInfo> { | |||
| public static final String PVC_WORKSPACE = "pvc-workspace"; | |||
| public static final String SSH = "ssh"; | |||
| public static final String WORKSPACE = "/workspace"; | |||
| public static final String PVC_DATASET = "pvc-dataset"; | |||
| public static final String DATASET = "/dataset"; | |||
| public static final String PVC_MODEL = "pvc-model"; | |||
| public static final String MODEL = "/model"; | |||
| public static final String MEMORY = "Memory"; | |||
| public static final String DEV_SHM = "/dev/shm"; | |||
| public static final String BIN_BASH = "/bin/bash"; | |||
| public static final String IPC_LOCK = "IPC_LOCK"; | |||
| public static final String RESTART_POLICY_NEVER = "Never"; | |||
| /** | |||
| * Builds the Job definition for the master container. | |||
| * @param info resource information | |||
| * @return the Job builder | |||
| */ | |||
| @Override | |||
| public JobBuilder deploy(ChildResourceCreateInfo info) { | |||
| // volumes, and the container that mounts them | |||
| List<Volume> volumes = buildVolumes(info); | |||
| Container container = buildContainer(info); | |||
| container.setVolumeMounts(buildVolumeMounts(volumes)); | |||
| // the container runs bash with one "-c" script made of three steps | |||
| container.setCommand(Collections.singletonList(BIN_BASH)); | |||
| // step 1: wait for the pretreatment file to be copied into the pod (through the pod API), then run it | |||
| String runPretreatment = "while [ ! -f /home/pretreatment ]; do echo pretreatment not exist >> pretreatment.log; sleep 1;done && chmod a+x /home/pretreatment && bash /home/pretreatment "; | |||
| // step 2: wait until the service (svc) has been created and its name resolves | |||
| String awaitService = "until nslookup " + info.getSvcName() + "; do sleep 5; done"; | |||
| // step 3: the master command taken from the resource information | |||
| List<String> steps = Arrays.asList(runPretreatment, awaitService, info.getMasterCmd()); | |||
| container.setArgs(Arrays.asList("-c", CollectionUtil.join(steps, " && "))); | |||
| // permissions | |||
| CapabilitiesBuilder capabilities = new CapabilitiesBuilder() | |||
| .withAdd(Collections.singletonList(IPC_LOCK)); | |||
| container.setSecurityContext(new SecurityContextBuilder() | |||
| .withAllowPrivilegeEscalation(true) | |||
| .withCapabilities(capabilities.build()) | |||
| .build()); | |||
| // user-defined labels; an empty map is used when none are set | |||
| Map<String,String> customizeLabels = new HashMap<>(); | |||
| if(CollectionUtil.isNotEmpty(info.getLabels())){ | |||
| customizeLabels = info.getLabels(); | |||
| } | |||
| JobBuilder builder = new JobBuilder(); | |||
| builder.withNewMetadata() | |||
| .withName(info.getJobName()) | |||
| .withNamespace(info.getNamespace()) | |||
| .addToLabels(KubeConstants.DISTRIBUTE_TRAIN_LABEL, info.getParentName()) | |||
| .addToLabels(customizeLabels) | |||
| .addToOwnerReferences(info.getOwnerReference()) | |||
| .endMetadata() | |||
| .withNewSpec() | |||
| // one pod in parallel | |||
| .withParallelism(NUMBER_1) | |||
| // one completion in total | |||
| .withCompletions(NUMBER_1) | |||
| // retry limit on failure | |||
| .withBackoffLimit(KubeConstants.BACKOFFLIMIT) | |||
| .withNewTemplate() | |||
| .withNewMetadata() | |||
| .withName(info.getJobName()) | |||
| .addToLabels(KubeConstants.DISTRIBUTE_TRAIN_LABEL, info.getParentName()) | |||
| .addToLabels(KubeConstants.JOB_LABEL, info.getJobName()) | |||
| .addToLabels(customizeLabels) | |||
| .endMetadata() | |||
| .withNewSpec() | |||
| // terminate immediately once shutdown is requested | |||
| .withTerminationGracePeriodSeconds(LONG_NUMBER_0) | |||
| .addToContainers(container) | |||
| .addToVolumes(volumes.toArray(new Volume[0])) | |||
| .withRestartPolicy(RESTART_POLICY_NEVER) | |||
| .endSpec() | |||
| .endTemplate() | |||
| .endSpec(); | |||
| // init container, only when one is configured | |||
| if(info.getInitContainer() != null){ | |||
| builder.editSpec() | |||
| .editTemplate() | |||
| .editSpec() | |||
| .addToInitContainers(info.getInitContainer()) | |||
| .endSpec() | |||
| .endTemplate() | |||
| .endSpec(); | |||
| } | |||
| // pin scheduling to the selected nodes, only when a selector is configured | |||
| if(CollectionUtil.isNotEmpty(info.getNodeSelector())){ | |||
| builder.editSpec() | |||
| .editTemplate() | |||
| .editSpec() | |||
| .addToNodeSelector(info.getNodeSelector()) | |||
| .endSpec() | |||
| .endTemplate() | |||
| .endSpec(); | |||
| } | |||
| return builder; | |||
| } | |||
| /** | |||
| * Builds the master container: image, port, environment variables and resources. | |||
| * @param info resource information | |||
| * @return the container | |||
| */ | |||
| private Container buildContainer(ChildResourceCreateInfo info){ | |||
| Container master = new Container(); | |||
| // name and image | |||
| master.setName(KubeConstants.MASTER_CONTAINER_NAME); | |||
| master.setImage(info.getImage()); | |||
| master.setImagePullPolicy(info.getImagePullPolicy()); | |||
| // container port | |||
| master.setPorts(Arrays.asList(new ContainerPortBuilder() | |||
| .withName(SSH) | |||
| .withContainerPort(NUMBER_22) | |||
| .build())); | |||
| // environment: the node count (slaves + masters) comes first, then the configured variables | |||
| int nodeCount = info.getSlaveReplicas() + info.getMasterReplicas(); | |||
| List<EnvVar> envVars = new ArrayList<>(); | |||
| envVars.add(new EnvVarBuilder() | |||
| .withName(KubeConstants.ENV_NODE_NUM) | |||
| .withValue(String.valueOf(nodeCount)) | |||
| .build()); | |||
| if(info.getEnv() != null){ | |||
| envVars.addAll(info.getEnv()); | |||
| } | |||
| master.setEnv(envVars); | |||
| // resource requirements, only when configured | |||
| if(info.getMasterResources() != null){ | |||
| master.setResources(info.getMasterResources()); | |||
| } | |||
| return master; | |||
| } | |||
| /** | |||
| * Collects the volumes of the pod: the configured workspace, dataset and model volumes | |||
| * (in that order) followed by the shared-memory volume, which is always present. | |||
| * @param info resource information | |||
| * @return the volumes | |||
| */ | |||
| private List<Volume> buildVolumes(ChildResourceCreateInfo info){ | |||
| List<Volume> volumes = new LinkedList<>(); | |||
| for (Volume configured : Arrays.asList(info.getWorkspaceVolume(), info.getDatasetVolume(), info.getModelVolume())) { | |||
| if(configured != null){ | |||
| volumes.add(configured); | |||
| } | |||
| } | |||
| // shm is always added | |||
| volumes.add(new VolumeBuilder() | |||
| .withName(KubeConstants.VOLUME_SHM) | |||
| .withNewEmptyDir() | |||
| .withMedium(MEMORY) | |||
| .endEmptyDir() | |||
| .build()); | |||
| return volumes; | |||
| } | |||
| /** | |||
| * Maps the known volumes to their mount paths and always adds the shared-memory mount. | |||
| * Volumes whose name is not one of the three PVC names get no mount here. | |||
| * @param volumes the volumes | |||
| * @return the volume mounts | |||
| */ | |||
| private List<VolumeMount> buildVolumeMounts(List<Volume> volumes) { | |||
| List<VolumeMount> mounts = new LinkedList<>(); | |||
| for (Volume volume : volumes) { | |||
| String volumeName = volume.getName(); | |||
| String mountPath; | |||
| if(PVC_WORKSPACE.equals(volumeName)){ | |||
| mountPath = WORKSPACE; | |||
| } else if(PVC_DATASET.equals(volumeName)){ | |||
| mountPath = DATASET; | |||
| } else if(PVC_MODEL.equals(volumeName)){ | |||
| mountPath = MODEL; | |||
| } else { | |||
| continue; | |||
| } | |||
| mounts.add(new VolumeMountBuilder() | |||
| .withName(volumeName) | |||
| .withMountPath(mountPath) | |||
| .build()); | |||
| } | |||
| mounts.add(new VolumeMountBuilder() | |||
| .withName(KubeConstants.VOLUME_SHM) | |||
| .withMountPath(DEV_SHM) | |||
| .build()); | |||
| return mounts; | |||
| } | |||
| } | |||
| @@ -0,0 +1,73 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.action.deployer.impl; | |||
| import cn.hutool.core.collection.CollectionUtil; | |||
| import io.fabric8.kubernetes.api.model.IntOrString; | |||
| import io.fabric8.kubernetes.api.model.ServiceBuilder; | |||
| import org.onebrain.operator.action.deployer.ChildResourceCreateInfo; | |||
| import org.onebrain.operator.action.deployer.ServiceDeployer; | |||
| import org.onebrain.operator.constants.KubeConstants; | |||
| import java.util.Collections; | |||
| import java.util.HashMap; | |||
| import java.util.Map; | |||
| import static org.onebrain.operator.constants.NumberConstant.NUMBER_22; | |||
| import static org.onebrain.operator.constants.NumberConstant.NUMBER_30000; | |||
| /** | |||
| * @description Service deployer: builds the Service placed in front of the training pods. | |||
| * @date 2020-09-23 | |||
| */ | |||
| public class BaseServiceDeployer implements ServiceDeployer<ChildResourceCreateInfo> { | |||
| public static final String WEB_SSH = "web-ssh"; | |||
| public static final String NONE = "None"; | |||
| /** | |||
| * Builds the Service definition. | |||
| * @param info resource information | |||
| * @return the Service builder | |||
| */ | |||
| @Override | |||
| public ServiceBuilder deploy(ChildResourceCreateInfo info) { | |||
| // user-defined labels; an empty map is used when none are set | |||
| Map<String,String> customizeLabels = new HashMap<>(); | |||
| if(CollectionUtil.isNotEmpty(info.getLabels())){ | |||
| customizeLabels = info.getLabels(); | |||
| } | |||
| ServiceBuilder service = new ServiceBuilder(); | |||
| service.withNewMetadata() | |||
| .withName(info.getSvcName()) | |||
| .withNamespace(info.getNamespace()) | |||
| .addToLabels(KubeConstants.DISTRIBUTE_TRAIN_LABEL, info.getParentName()) | |||
| .addToLabels(customizeLabels) | |||
| .addToOwnerReferences(info.getOwnerReference()) | |||
| .endMetadata(); | |||
| service.withNewSpec() | |||
| .withClusterIP(NONE) | |||
| // select the pods that carry the distributed-training label of this parent | |||
| .withSelector(Collections.singletonMap(KubeConstants.DISTRIBUTE_TRAIN_LABEL, info.getParentName())) | |||
| .addNewPort() | |||
| .withName(WEB_SSH) | |||
| .withPort(NUMBER_30000) | |||
| .withTargetPort(new IntOrString(NUMBER_22)) | |||
| .endPort() | |||
| .endSpec(); | |||
| return service; | |||
| } | |||
| } | |||
| @@ -0,0 +1,246 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.action.deployer.impl; | |||
| import cn.hutool.core.collection.CollectionUtil; | |||
| import com.google.common.collect.ImmutableMap; | |||
| import com.google.common.collect.Lists; | |||
| import io.fabric8.kubernetes.api.model.CapabilitiesBuilder; | |||
| import io.fabric8.kubernetes.api.model.Container; | |||
| import io.fabric8.kubernetes.api.model.ContainerPortBuilder; | |||
| import io.fabric8.kubernetes.api.model.EnvVar; | |||
| import io.fabric8.kubernetes.api.model.EnvVarBuilder; | |||
| import io.fabric8.kubernetes.api.model.LabelSelector; | |||
| import io.fabric8.kubernetes.api.model.SecurityContextBuilder; | |||
| import io.fabric8.kubernetes.api.model.Volume; | |||
| import io.fabric8.kubernetes.api.model.VolumeBuilder; | |||
| import io.fabric8.kubernetes.api.model.VolumeMount; | |||
| import io.fabric8.kubernetes.api.model.VolumeMountBuilder; | |||
| import io.fabric8.kubernetes.api.model.apps.StatefulSetBuilder; | |||
| import org.onebrain.operator.action.deployer.ChildResourceCreateInfo; | |||
| import org.onebrain.operator.action.deployer.StatefulSetDeployer; | |||
| import org.onebrain.operator.constants.KubeConstants; | |||
| import java.util.Arrays; | |||
| import java.util.Collections; | |||
| import java.util.HashMap; | |||
| import java.util.LinkedList; | |||
| import java.util.List; | |||
| import java.util.Map; | |||
| import java.util.Optional; | |||
| import static org.onebrain.operator.constants.NumberConstant.LONG_NUMBER_0; | |||
| import static org.onebrain.operator.constants.NumberConstant.LONG_NUMBER_60; | |||
| import static org.onebrain.operator.constants.NumberConstant.NUMBER_22; | |||
| /** | |||
| * @description StatefulSet deployer: builds the StatefulSet that runs the slave containers. | |||
| * @date 2020-09-23 | |||
| */ | |||
| public class BaseStatefulSetDeployer implements StatefulSetDeployer<ChildResourceCreateInfo> { | |||
| public static final String SSH = "ssh"; | |||
| public static final String PVC_WORKSPACE = "pvc-workspace"; | |||
| public static final String WORKSPACE = "/workspace"; | |||
| public static final String PVC_DATASET = "pvc-dataset"; | |||
| public static final String DATASET = "/dataset"; | |||
| public static final String PVC_MODEL = "pvc-model"; | |||
| public static final String MODEL = "/model"; | |||
| public static final String MEMORY = "Memory"; | |||
| public static final String DEV_SHM = "/dev/shm"; | |||
| public static final String BIN_BASH = "/bin/bash"; | |||
| public static final String IPC_LOCK = "IPC_LOCK"; | |||
| /** | |||
| * Builds the StatefulSet definition for the slave containers. | |||
| * @param info resource information | |||
| * @return the StatefulSet builder | |||
| */ | |||
| @Override | |||
| public StatefulSetBuilder deploy(ChildResourceCreateInfo info) { | |||
| //label selector: matches the pods labelled with this StatefulSet's name | |||
| LabelSelector labelSelector = new LabelSelector(); | |||
| labelSelector.setMatchLabels(ImmutableMap.of(KubeConstants.STATEFULSET_LABEL, info.getStatefulSetName())); | |||
| //volumes | |||
| List<Volume> volumes = buildVolumes(info); | |||
| //container | |||
| Container container = buildContainer(info); | |||
| //volume mounts | |||
| List<VolumeMount> volumeMounts = buildVolumeMounts(volumes); | |||
| container.setVolumeMounts(volumeMounts); | |||
| //start command: wait for and run /home/pretreatment, wait until the service name resolves, then run the slave command | |||
| List<String> cmdLines = Arrays.asList("while [ ! -f /home/pretreatment ]; do echo pretreatment not exist >> pretreatment.log; sleep 1;done && chmod a+x /home/pretreatment && bash /home/pretreatment ", "until nslookup " + info.getSvcName() + "; do sleep 5; done", info.getSlaveCmd()); | |||
| container.setCommand(Collections.singletonList(BIN_BASH)); | |||
| container.setArgs(Arrays.asList("-c", CollectionUtil.join(cmdLines, " && "))); | |||
| //permissions | |||
| container.setSecurityContext(new SecurityContextBuilder() | |||
| .withAllowPrivilegeEscalation(true) | |||
| // .withPrivileged(true) | |||
| .withCapabilities(new CapabilitiesBuilder() | |||
| .withAdd(Collections.singletonList(IPC_LOCK)) | |||
| .build()) | |||
| .build()); | |||
| //user-defined labels; an empty map is used when none are set | |||
| Map<String,String> customizeLabels = CollectionUtil.isNotEmpty(info.getLabels())? info.getLabels(): new HashMap<>(); | |||
| StatefulSetBuilder builder = new StatefulSetBuilder(); | |||
| builder.withNewMetadata() | |||
| .withName(info.getStatefulSetName()) | |||
| .withNamespace(info.getNamespace()) | |||
| .addToOwnerReferences(info.getOwnerReference()) | |||
| .addToLabels(KubeConstants.DISTRIBUTE_TRAIN_LABEL, info.getParentName()) | |||
| .endMetadata() | |||
| .withNewSpec() | |||
| .withSelector(labelSelector) | |||
| // NOTE(review): serviceName is the StatefulSet's own name, not info.getSvcName() - confirm this is intended | |||
| .withServiceName(info.getStatefulSetName()) | |||
| .withReplicas(info.getSlaveReplicas()) | |||
| .withNewTemplate() | |||
| .withNewMetadata() | |||
| .withName(info.getStatefulSetName()) | |||
| .addToLabels(KubeConstants.DISTRIBUTE_TRAIN_LABEL, info.getParentName()) | |||
| .addToLabels(KubeConstants.STATEFULSET_LABEL, info.getStatefulSetName()) | |||
| .addToLabels(customizeLabels) | |||
| .endMetadata() | |||
| .withNewSpec() | |||
| // set exactly once; a second call on the same builder would silently replace the value | |||
| .withTerminationGracePeriodSeconds(LONG_NUMBER_60) | |||
| .addToContainers(container) | |||
| .addToVolumes(volumes.toArray(new Volume[0])) | |||
| .endSpec() | |||
| .endTemplate() | |||
| .endSpec(); | |||
| //init container, only when one is configured | |||
| StatefulSetBuilder finalBuilder = builder; | |||
| Optional.ofNullable(info.getInitContainer()) | |||
| .ifPresent(initContainer -> { | |||
| finalBuilder.editSpec() | |||
| .editTemplate() | |||
| .editSpec() | |||
| .addToInitContainers(initContainer) | |||
| .endSpec() | |||
| .endTemplate() | |||
| .endSpec(); | |||
| }); | |||
| //pin scheduling to the selected nodes, only when a selector is configured | |||
| if(CollectionUtil.isNotEmpty(info.getNodeSelector())){ | |||
| builder = builder.editSpec() | |||
| .editTemplate().editSpec() | |||
| .addToNodeSelector(info.getNodeSelector()) | |||
| .endSpec().endTemplate() | |||
| .endSpec(); | |||
| } | |||
| return builder; | |||
| } | |||
| /** | |||
| * Builds the slave container: image, port, environment variables and resources. | |||
| * @param info resource information | |||
| * @return the container | |||
| */ | |||
| private Container buildContainer(ChildResourceCreateInfo info) { | |||
| Container container = new Container(); | |||
| //name and image | |||
| container.setName(KubeConstants.SLAVE_CONTAINER_NAME); | |||
| container.setImage(info.getImage()); | |||
| container.setImagePullPolicy(info.getImagePullPolicy()); | |||
| //container port | |||
| container.setPorts(Arrays.asList(new ContainerPortBuilder() | |||
| .withContainerPort(NUMBER_22) | |||
| .withName(SSH).build())); | |||
| //environment: the node count (slaves + masters) comes first, then the configured variables | |||
| List<EnvVar> envVars = Lists.newArrayList(new EnvVarBuilder() | |||
| .withName(KubeConstants.ENV_NODE_NUM) | |||
| .withValue(String.valueOf(info.getSlaveReplicas() + info.getMasterReplicas())) | |||
| .build()); | |||
| Optional.ofNullable(info.getEnv()).ifPresent(v -> envVars.addAll(v)); | |||
| container.setEnv(envVars); | |||
| //resource requirements, only when configured | |||
| Optional.ofNullable(info.getSlaveResources()).ifPresent(v -> container.setResources(v)); | |||
| return container; | |||
| } | |||
| /** | |||
| * Collects the volumes of the pod: the configured workspace, dataset and model volumes | |||
| * (in that order) followed by the shared-memory volume, which is always present. | |||
| * @param info resource information | |||
| * @return the volumes | |||
| */ | |||
| private List<Volume> buildVolumes(ChildResourceCreateInfo info) { | |||
| // start from a fresh list; this method must not call itself, which would recurse without end | |||
| List<Volume> volumes = new LinkedList<>(); | |||
| Optional.ofNullable(info.getWorkspaceVolume()).ifPresent(v-> volumes.add(v)); | |||
| Optional.ofNullable(info.getDatasetVolume()).ifPresent(v-> volumes.add(v)); | |||
| Optional.ofNullable(info.getModelVolume()).ifPresent(v-> volumes.add(v)); | |||
| //shm is always added | |||
| volumes.add(new VolumeBuilder() | |||
| .withName(KubeConstants.VOLUME_SHM) | |||
| .withNewEmptyDir() | |||
| .withMedium(MEMORY) | |||
| .endEmptyDir() | |||
| .build()); | |||
| return volumes; | |||
| } | |||
| /** | |||
| * Maps the known volumes to their mount paths and always adds the shared-memory mount. | |||
| * Volumes whose name is not one of the three PVC names get no mount here. | |||
| * @param volumes the volumes | |||
| * @return the volume mounts | |||
| */ | |||
| private List<VolumeMount> buildVolumeMounts(List<Volume> volumes) { | |||
| List<VolumeMount> volumeMounts=new LinkedList<>(); | |||
| for (Volume volume : volumes) { | |||
| if(PVC_WORKSPACE.equals(volume.getName())){ | |||
| volumeMounts.add(new VolumeMountBuilder() | |||
| .withName(volume.getName()) | |||
| .withMountPath(WORKSPACE) | |||
| .build()); | |||
| continue; | |||
| } | |||
| if(PVC_DATASET.equals(volume.getName())){ | |||
| volumeMounts.add(new VolumeMountBuilder() | |||
| .withName(volume.getName()) | |||
| .withMountPath(DATASET) | |||
| .build()); | |||
| continue; | |||
| } | |||
| if(PVC_MODEL.equals(volume.getName())){ | |||
| volumeMounts.add(new VolumeMountBuilder() | |||
| .withName(volume.getName()) | |||
| .withMountPath(MODEL) | |||
| .build()); | |||
| continue; | |||
| } | |||
| } | |||
| volumeMounts.add(new VolumeMountBuilder() | |||
| .withName(KubeConstants.VOLUME_SHM) | |||
| .withMountPath(DEV_SHM) | |||
| .build()); | |||
| return volumeMounts; | |||
| } | |||
| } | |||
| @@ -0,0 +1,614 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.action.handler; | |||
| import cn.hutool.core.collection.CollectionUtil; | |||
| import cn.hutool.core.io.FileUtil; | |||
| import cn.hutool.core.util.ObjectUtil; | |||
| import cn.hutool.core.util.StrUtil; | |||
| import com.alibaba.fastjson.JSONArray; | |||
| import com.alibaba.fastjson.JSONObject; | |||
| import com.google.common.collect.Lists; | |||
| import com.google.common.io.Files; | |||
| import io.fabric8.kubernetes.api.model.ObjectMeta; | |||
| import io.fabric8.kubernetes.api.model.Pod; | |||
| import io.fabric8.kubernetes.api.model.Service; | |||
| import io.fabric8.kubernetes.api.model.ServiceBuilder; | |||
| import io.fabric8.kubernetes.api.model.apps.StatefulSet; | |||
| import io.fabric8.kubernetes.api.model.apps.StatefulSetBuilder; | |||
| import io.fabric8.kubernetes.api.model.batch.Job; | |||
| import io.fabric8.kubernetes.api.model.batch.JobBuilder; | |||
| import io.fabric8.kubernetes.client.KubernetesClient; | |||
| import lombok.extern.slf4j.Slf4j; | |||
| import org.onebrain.operator.action.PodInfo; | |||
| import org.onebrain.operator.action.deployer.ChildResourceCreateInfo; | |||
| import org.onebrain.operator.action.deployer.JobDeployer; | |||
| import org.onebrain.operator.action.deployer.ServiceDeployer; | |||
| import org.onebrain.operator.action.deployer.StatefulSetDeployer; | |||
| import org.onebrain.operator.action.deployer.impl.BaseJobDeployer; | |||
| import org.onebrain.operator.action.deployer.impl.BaseServiceDeployer; | |||
| import org.onebrain.operator.action.deployer.impl.BaseStatefulSetDeployer; | |||
| import org.onebrain.operator.api.pod.PodApi; | |||
| import org.onebrain.operator.constants.KubeConstants; | |||
| import org.onebrain.operator.crd.DistributeTrain; | |||
| import org.onebrain.operator.crd.DistributeTrainSpec; | |||
| import org.onebrain.operator.crd.DistributeTrainStatus; | |||
| import org.onebrain.operator.exception.OperatorException; | |||
| import org.onebrain.operator.redis.RedisService; | |||
| import org.onebrain.operator.redis.key.OperatorKey; | |||
| import org.onebrain.operator.utils.DistributeTrainClientHolder; | |||
| import org.onebrain.operator.utils.IOUtils; | |||
| import org.springframework.beans.factory.annotation.Autowired; | |||
| import org.springframework.core.io.ClassPathResource; | |||
| import org.springframework.stereotype.Component; | |||
| import java.io.File; | |||
| import java.io.InputStream; | |||
| import java.util.Collections; | |||
| import java.util.List; | |||
| import java.util.Map; | |||
| import java.util.Optional; | |||
| import java.util.concurrent.ConcurrentHashMap; | |||
| import java.util.concurrent.LinkedBlockingQueue; | |||
| import java.util.concurrent.ThreadFactory; | |||
| import java.util.concurrent.ThreadPoolExecutor; | |||
| import java.util.concurrent.TimeUnit; | |||
| import java.util.concurrent.atomic.AtomicInteger; | |||
| import static org.onebrain.operator.constants.KubeConstants.CHARSET; | |||
| import static org.onebrain.operator.constants.KubeConstants.JOB_LABEL; | |||
| import static org.onebrain.operator.constants.KubeConstants.MASTER_CONTAINER_NAME; | |||
| import static org.onebrain.operator.constants.KubeConstants.SLAVE_CONTAINER_NAME; | |||
| import static org.onebrain.operator.constants.KubeConstants.STATEFULSET_LABEL; | |||
| import static org.onebrain.operator.constants.NumberConstant.NUMBER_2; | |||
| /** | |||
| * @description 分布式训练添加事件的处理器 | |||
| * @date 2020-09-23 | |||
| */ | |||
| @Component("addActionHandler") | |||
| @Slf4j | |||
| public class AddActionHandler implements DistributeTrainActionHandler { | |||
| public static final String JOB_WATCHER = "job-watcher-"; | |||
| public static final String PRETREATMENT = "pretreatment"; | |||
| public static final String JOB_NAME = "job-name"; | |||
| public static final String RUNNING = "Running"; | |||
| public static final String MASTER = "master"; | |||
| public static final String SLAVE = "slave"; | |||
| public static final String PRETREATMENT_TARGET_DIR = "/home/pretreatment"; | |||
| public static final String IP = "ip"; | |||
| public static final String ROLE = "role"; | |||
| public static final String HOSTFILE_TARGET_DIR = "/home/hostfile.json"; | |||
| @Autowired | |||
| private KubernetesClient client; | |||
| @Autowired | |||
| private PodApi podApi; | |||
| /** | |||
| * Key: training UID; value: list of pod information. | |||
| */ | |||
| private Map<String, List<PodInfo>> dtMap = new ConcurrentHashMap(); | |||
| @Autowired | |||
| private RedisService redis; | |||
| /** | |||
| * Thread pool: 5 core threads, at most 10 threads, a keep-alive of 10 seconds and a bounded queue of capacity 1. | |||
| * Threads are named JOB_WATCHER followed by a counter that starts at 1. | |||
| * NOTE(review): the rejection handler is DiscardOldestPolicy, so once the pool is saturated the | |||
| * oldest queued task is dropped without running; confirm that losing a task is acceptable. | |||
| */ | |||
| private ThreadPoolExecutor pool = new ThreadPoolExecutor(5, 10, 10, TimeUnit.SECONDS, new LinkedBlockingQueue<>(1), new ThreadFactory() { | |||
| private final AtomicInteger mThreadNum = new AtomicInteger(1); | |||
| @Override | |||
| public Thread newThread(Runnable r) { | |||
| return new Thread(r, JOB_WATCHER + mThreadNum.getAndIncrement()); | |||
| } | |||
| }, new ThreadPoolExecutor.DiscardOldestPolicy()); | |||
| /** | |||
| * 处理事件的任务 | |||
| */ | |||
| class HandlerActionTask implements Runnable { | |||
| private DistributeTrain distributeTrain; | |||
| public HandlerActionTask(DistributeTrain distributeTrain) { | |||
| this.distributeTrain = distributeTrain; | |||
| } | |||
| @Override | |||
| public void run() { | |||
| doAction(distributeTrain); | |||
| } | |||
| } | |||
| /** | |||
| * 执行任务动作 | |||
| * @param distributeTrain | |||
| */ | |||
| public void doAction(DistributeTrain distributeTrain) { | |||
| log.info("doAction=>distributeTrain : 【{}】", distributeTrain); | |||
| ChildResourceCreateInfo info = null; | |||
| try { | |||
| //redis重复检查 | |||
| //根据k8s 创建DistributionTrain 的uid去重 | |||
| if (null != redis.get(OperatorKey.CR, distributeTrain.getMetadata().getUid())) { | |||
| log.info("distribute train 【{}】 in namespace 【{}】 already exists", distributeTrain.getMetadata().getName(), distributeTrain.getMetadata().getNamespace()); | |||
| return; | |||
| } else { | |||
| //录入redis做消费记录 | |||
| redis.set(OperatorKey.CR, distributeTrain.getMetadata().getUid(), System.currentTimeMillis()); | |||
| } | |||
| //参数检查,提取并生成所需参数 | |||
| validateParams(distributeTrain); | |||
| info = ChildResourceCreateInfo.fromCr(distributeTrain); | |||
| //按照size,创建副本数为size-1的statefulSet | |||
| createStatefulSet(info); | |||
| //等待statefulset全部ready | |||
| waitUntilStatefulSetReady(info); | |||
| //创建job,job此时在死循环 | |||
| createJob(info); | |||
| //等待job ready | |||
| waitUntilJobReady(info); | |||
| //复制 /home/pretreatment 到 pod | |||
| copyPretreatmentShell(info); | |||
| //收集statefulSet和job的ip | |||
| validateAndCollectPods(info); | |||
| //本地生成公私钥、认证文件,并拷贝到所有节点的~/.ssh目录下 | |||
| sshAuthWithoutPass(info); | |||
| //本地生成hostfile,并拷贝到所有节点的指定目录下 | |||
| generateAndUploadHostFile(info); | |||
| //解锁job的死循环 | |||
| releaseInterLock(info); | |||
| //改状态 | |||
| //updateStatus(info, distributeTrain); | |||
| //为job注册监听器 | |||
| registerJobListener(info); | |||
| log.info("all parts of【{}】 are ready", info.getParentName()); | |||
| } catch (Exception e) { | |||
| log.error("doAction error:【{}】", e); | |||
| //移除缓存 | |||
| redis.del(OperatorKey.CR, distributeTrain.getMetadata().getUid()); | |||
| //回收创建的资源 | |||
| if (info != null) { | |||
| recycleCr(info); | |||
| } | |||
| } | |||
| } | |||
| /** | |||
| * 处理分布式训练 | |||
| * @param distributeTrain 分布式训练信息 | |||
| */ | |||
| @Override | |||
| public void handlerAction(DistributeTrain distributeTrain) { | |||
| log.info("handlerAction=>distributeTrain : 【{}】", distributeTrain); | |||
| HandlerActionTask handlerActionTask = new HandlerActionTask(distributeTrain); | |||
| pool.getActiveCount(); | |||
| pool.execute(handlerActionTask); | |||
| } | |||
| /** | |||
| * 校验参数合法性 | |||
| * @param distributeTrain 分布式训练 | |||
| */ | |||
| private void validateParams(DistributeTrain distributeTrain) { | |||
| log.info("validateParams=>distributeTrain : 【{}】", distributeTrain); | |||
| Integer size = distributeTrain.getSpec().getSize(); | |||
| if (size < NUMBER_2) { | |||
| throw new OperatorException("size must be greater than 1"); | |||
| } | |||
| String masterCmd = distributeTrain.getSpec().getMasterCmd(); | |||
| String slaveCmd = distributeTrain.getSpec().getSlaveCmd(); | |||
| if (StrUtil.isEmpty(slaveCmd) || StrUtil.isEmpty(masterCmd)) { | |||
| throw new OperatorException("cmd lines must not be empty"); | |||
| } | |||
| } | |||
| /** | |||
| * 拷贝文件pretreatment到pod | |||
| * @param info 资源信息 | |||
| */ | |||
| private void copyPretreatmentShell(ChildResourceCreateInfo info) { | |||
| log.info("start to copy pretreatment for 【{}】 ", info.getParentName()); | |||
| try { | |||
| String path = System.getProperty(KubeConstants.USER_DIR_SYSTEM_PROPERTY) + File.separator + PRETREATMENT; | |||
| if (!FileUtil.exist(path)) { | |||
| FileUtil.writeFromStream(new ClassPathResource("/shell/pretreatment").getInputStream(), path); | |||
| } | |||
| File pretreatment = new File(path); | |||
| //上传到pod指定目录 | |||
| List<Pod> pods = getPods(info); | |||
| for (int i = 0; i < pods.size(); i++) { | |||
| Pod pod = pods.get(i); | |||
| //默认第一个为master | |||
| String containerName = i < 1 ? MASTER_CONTAINER_NAME : SLAVE_CONTAINER_NAME; | |||
| podApi.copyToPod(info.getNamespace(), pod.getMetadata().getName(), containerName, pretreatment, PRETREATMENT_TARGET_DIR); | |||
| } | |||
| } catch (Exception e) { | |||
| log.error("copy pretreatment shell error: 【{}】",e); | |||
| throw new OperatorException("exception is thrown when copy pretreatment for 【" + info.getParentName() + "】 : \n" + e.getMessage()); | |||
| } | |||
| } | |||
| /** | |||
| * 创建statefulSet | |||
| * @param info 资源信息 | |||
| */ | |||
| private void createStatefulSet(ChildResourceCreateInfo info) { | |||
| log.info("createStatefulSet=>childResourceCreateInfo : 【{}】", info); | |||
| StatefulSet statefulSet = client.apps().statefulSets() | |||
| .inNamespace(info.getNamespace()) | |||
| .withName(info.getStatefulSetName()).get(); | |||
| //已存在 | |||
| if (statefulSet != null) { | |||
| log.info("statefulSet 【{}】 already exists", statefulSet.getMetadata().getName()); | |||
| return; | |||
| } | |||
| //不存在,新建 | |||
| StatefulSetDeployer deployer = new BaseStatefulSetDeployer(); | |||
| StatefulSetBuilder builder = deployer.deploy(info); | |||
| statefulSet = builder.build(); | |||
| client.apps().statefulSets().create(statefulSet); | |||
| log.info("create statefulSet【{}】 successfully", statefulSet.getMetadata().getName()); | |||
| } | |||
| /** | |||
| * 等待statefulSet全部ready | |||
| * @param info 资源信息 | |||
| */ | |||
| private void waitUntilStatefulSetReady(ChildResourceCreateInfo info) { | |||
| log.info("wait for statefulSet 【{}】 in namespace 【{}】 ready", info.getStatefulSetName(), info.getNamespace()); | |||
| try { | |||
| client.apps().statefulSets() | |||
| .inNamespace(info.getNamespace()) | |||
| .withName(info.getStatefulSetName()) | |||
| //阻塞 直到全部pod Ready 最长阻塞时间2小时 | |||
| .waitUntilCondition(c -> | |||
| c.getStatus().getReplicas() != null | |||
| && ObjectUtil.equal(c.getStatus().getReplicas(), c.getStatus().getReadyReplicas()), | |||
| NUMBER_2, TimeUnit.HOURS); | |||
| log.info("statefulSet 【{}】 in namespace 【{}】 is ready", info.getStatefulSetName(), info.getNamespace()); | |||
| } catch (Exception e) { | |||
| log.error("wait until statefulSet ready error:【{}】", e); | |||
| throw new OperatorException("exception is thrown when waiting for statefulSet 【" + info.getStatefulSetName() + "】 ready : \n" + e.getMessage()); | |||
| } | |||
| } | |||
| /** | |||
| * 创建job | |||
| * @param info Job信息 | |||
| */ | |||
| private void createJob(ChildResourceCreateInfo info) { | |||
| log.info("createJob=>childResourceCreateInfo : 【{}】", info); | |||
| Job job = client.batch().jobs() | |||
| .inNamespace(info.getNamespace()) | |||
| .withName(info.getJobName()).get(); | |||
| //已存在 | |||
| if (job != null) { | |||
| log.info("job 【{}】 already exists", job.getMetadata().getName()); | |||
| return; | |||
| } | |||
| //不存在,新建 | |||
| JobDeployer deployer = new BaseJobDeployer(); | |||
| JobBuilder builder = deployer.deploy(info); | |||
| job = builder.build(); | |||
| log.info("job is : 【{}】", job); | |||
| client.batch().jobs().create(job); | |||
| log.info("create job【{}】 successfully", job.getMetadata().getName()); | |||
| } | |||
| /** | |||
| * 等待job全部ready | |||
| * @param info 资源信息 | |||
| */ | |||
| private void waitUntilJobReady(ChildResourceCreateInfo info) { | |||
| log.info("wait for job 【{}】 in namespace 【{}】 ready", info.getStatefulSetName(), info.getNamespace()); | |||
| try { | |||
| List<Pod> podList = client.pods().inNamespace(info.getNamespace()) | |||
| .withLabel(JOB_NAME, info.getJobName()) | |||
| .list().getItems(); | |||
| while (CollectionUtil.isEmpty(podList)) { | |||
| TimeUnit.SECONDS.sleep(2); | |||
| podList = client.pods().inNamespace(info.getNamespace()) | |||
| .withLabel(JOB_NAME, info.getJobName()) | |||
| .list().getItems(); | |||
| } | |||
| Pod pod = podList.get(0); | |||
| client.pods().inNamespace(info.getNamespace()) | |||
| .withName(pod.getMetadata().getName()) | |||
| //等待直到Ready状态 最长2小时 | |||
| .waitUntilReady(2, TimeUnit.HOURS); | |||
| log.info("job 【{}】 in namespace 【{}】 is ready", info.getJobName(), info.getNamespace()); | |||
| } catch (Exception e) { | |||
| log.info(e.getMessage(), e); | |||
| throw new OperatorException("exception is thrown when waiting for job 【" + info.getJobName() + "】 ready : \n" + e.getMessage()); | |||
| } | |||
| } | |||
| /** | |||
| * 收集资源的podInfo | |||
| * @param info 资源信息 | |||
| */ | |||
| private void validateAndCollectPods(ChildResourceCreateInfo info) { | |||
| //检查是否都在正常运行 | |||
| log.info("validate pods status for 【{}】", info.getParentName()); | |||
| boolean isAllSlaveRunning = true; | |||
| boolean isMasterRunning = true; | |||
| Pod masterPod = null; | |||
| List<Pod> slavePods = null; | |||
| do { | |||
| //取得主的pod | |||
| masterPod = getMasterPod(info); | |||
| //取得从的所有pod | |||
| slavePods = getSlavePods(info); | |||
| if (masterPod == null) { | |||
| log.info("can not find pod belongs to job 【{}】", info.getJobName()); | |||
| return; | |||
| } | |||
| if (CollectionUtil.isEmpty(slavePods)) { | |||
| log.info("can not find pod belongs to statefulSet 【{}】", info.getStatefulSetName()); | |||
| return; | |||
| } | |||
| isMasterRunning = RUNNING.equals(masterPod.getStatus().getPhase()); | |||
| isAllSlaveRunning = true; | |||
| for (Pod slavePod : slavePods) { | |||
| boolean isSlaveRunning = RUNNING.equals(slavePod.getStatus().getPhase()); | |||
| if (!isSlaveRunning) { | |||
| isAllSlaveRunning = false; | |||
| break; | |||
| } | |||
| } | |||
| } while (!(isMasterRunning && isAllSlaveRunning)); | |||
| log.info("status checked 【{}】 all right", info.getParentName()); | |||
| collectChildPodInfo(info, masterPod, slavePods); | |||
| } | |||
| /** | |||
| * 收集pod基本信息 | |||
| * @param info 资源信息 | |||
| * @param masterPod | |||
| * @param slavePods | |||
| */ | |||
| private void collectChildPodInfo(ChildResourceCreateInfo info, Pod masterPod, List<Pod> slavePods) { | |||
| log.info("collectChildPodInfo=>childResourceCreateInfo : 【{}】, masterPod : 【{}】, slavePods : 【{}】", info, masterPod, slavePods); | |||
| String key = info.getOwnerReference().getUid(); | |||
| if (dtMap.containsKey(key)) { | |||
| dtMap.remove(key); | |||
| } | |||
| List<PodInfo> podInfos = Lists.newArrayList(); | |||
| PodInfo masterPodInfo = PodInfo.builder() | |||
| .ip(masterPod.getStatus().getPodIP()) | |||
| .role(MASTER) | |||
| .build(); | |||
| podInfos.add(masterPodInfo); | |||
| for (Pod slavePod : slavePods) { | |||
| PodInfo slavePodInfo = PodInfo.builder() | |||
| .ip(slavePod.getStatus().getPodIP()) | |||
| .role(SLAVE) | |||
| .build(); | |||
| podInfos.add(slavePodInfo); | |||
| } | |||
| dtMap.put(key, podInfos); | |||
| } | |||
| /** | |||
| * ssh免密互通相关配置 | |||
| * @param info 资源信息 | |||
| */ | |||
| private void sshAuthWithoutPass(ChildResourceCreateInfo info) { | |||
| log.info("start to configure ssh no password environment for 【{}】 ", info.getParentName()); | |||
| File tempDir = Files.createTempDir(); | |||
| try ( | |||
| InputStream isRsa = getClass().getClassLoader().getResourceAsStream("key/id_rsa"); | |||
| InputStream isRsaPub = getClass().getClassLoader().getResourceAsStream("key/id_rsa.pub") | |||
| ) { | |||
| //id_rsa | |||
| File tempIdRsa = FileUtil.createTempFile(tempDir); | |||
| IOUtils.copy(isRsa, tempIdRsa); | |||
| //id_rsa.pub | |||
| File tempIdRsaPub = FileUtil.createTempFile(tempDir); | |||
| IOUtils.copy(isRsaPub, tempIdRsaPub); | |||
| List<String> pubLines = FileUtil.readLines(tempIdRsaPub, CHARSET); | |||
| String pubKeyContent = pubLines.get(0); | |||
| //按机器修改id_rsa.pub, 并组装一个大而全的authorized_keys | |||
| List<File> idRsaPubFiles = Lists.newArrayList(); | |||
| File tempAuthorizedKeys = FileUtil.createTempFile(tempDir); | |||
| List<String> pubKeys = Lists.newArrayList(); | |||
| for (PodInfo podInfo : dtMap.get(info.getOwnerReference().getUid())) { | |||
| String podPubKeyContent = pubKeyContent.replace("{{ip}}", podInfo.getIp()); | |||
| File tempIdRsaPubOnPod = FileUtil.createTempFile(tempDir); | |||
| FileUtil.writeLines(Collections.singletonList(podPubKeyContent), tempIdRsaPubOnPod, CHARSET); | |||
| idRsaPubFiles.add(tempIdRsaPubOnPod); | |||
| pubKeys.add(podPubKeyContent); | |||
| } | |||
| FileUtil.writeLines(pubKeys, tempAuthorizedKeys, CHARSET); | |||
| //获得所有pod, 上传三个文件 | |||
| List<Pod> pods = getPods(info); | |||
| for (int i = 0; i < pods.size(); i++) { | |||
| Pod pod = pods.get(i); | |||
| String containerName = i < 1 ? MASTER_CONTAINER_NAME : SLAVE_CONTAINER_NAME; | |||
| //上传id_rsa | |||
| podApi.copyToPod(info.getNamespace(), pod.getMetadata().getName(), containerName, tempIdRsa, "/root/.ssh/id_rsa"); | |||
| //上传id_rsa.pub | |||
| File tempIdRsaPubOnPod = idRsaPubFiles.get(i); | |||
| podApi.copyToPod(info.getNamespace(), pod.getMetadata().getName(), containerName, tempIdRsaPubOnPod, "/root/.ssh/id_rsa.pub"); | |||
| //上传authorized_keys | |||
| podApi.copyToPod(info.getNamespace(), pod.getMetadata().getName(), containerName, tempAuthorizedKeys, "/root/.ssh/authorized_keys"); | |||
| //修改权限 | |||
| String chmodCmd = StrUtil.format("chmod 644 /root/.ssh/authorized_keys && chmod 600 /root/.ssh/id_rsa && chmod 644 /root/.ssh/id_rsa.pub"); | |||
| podApi.exec(info.getNamespace(), pod.getMetadata().getName(), containerName, chmodCmd); | |||
| } | |||
| log.info("configure ssh no password environment for 【{}】 successfully ", info.getParentName()); | |||
| } catch (Exception e) { | |||
| log.error("sshAuthWithoutPass error:【{}】", e); | |||
| throw new OperatorException("exception is thrown when configure ssh no password environment for 【" + info.getParentName() + "】 : \n" + e.getMessage()); | |||
| } finally { | |||
| //清理临时文件 | |||
| FileUtil.del(tempDir); | |||
| } | |||
| } | |||
| /** | |||
| * 生成并上传hostfile | |||
| * @param info 资源信息 | |||
| */ | |||
| private void generateAndUploadHostFile(ChildResourceCreateInfo info) { | |||
| log.info("start to configure hostfile for 【{}】 ", info.getParentName()); | |||
| File tempDir = Files.createTempDir(); | |||
| try { | |||
| //生成hostfile | |||
| JSONArray jsonArray = new JSONArray(); | |||
| List<PodInfo> podInfos = dtMap.get(info.getOwnerReference().getUid()); | |||
| for (PodInfo podInfo : podInfos) { | |||
| JSONObject podJson = new JSONObject(); | |||
| podJson.put(IP, podInfo.getIp()); | |||
| podJson.put(ROLE, podInfo.getRole()); | |||
| jsonArray.add(podJson); | |||
| } | |||
| File tempHostFile = FileUtil.createTempFile(tempDir); | |||
| FileUtil.writeLines(Collections.singletonList(jsonArray.toJSONString()), tempHostFile, CHARSET); | |||
| //上传到pod指定目录 | |||
| List<Pod> pods = getPods(info); | |||
| for (int i = 0; i < pods.size(); i++) { | |||
| Pod pod = pods.get(i); | |||
| String containerName = i < 1 ? MASTER_CONTAINER_NAME : SLAVE_CONTAINER_NAME; | |||
| podApi.copyToPod(info.getNamespace(), pod.getMetadata().getName(), containerName, tempHostFile, HOSTFILE_TARGET_DIR); | |||
| } | |||
| } catch (Exception e) { | |||
| log.error("generateAndUploadHostFile error:【{}】", e); | |||
| throw new OperatorException("exception is thrown when generate and upload hostfile for 【" + info.getParentName() + "】 : \n" + e.getMessage()); | |||
| } finally { | |||
| //清理临时文件 | |||
| FileUtil.del(tempDir); | |||
| } | |||
| } | |||
| /** | |||
| * 创建service 解除闭锁 | |||
| * @param info | |||
| */ | |||
| private void releaseInterLock(ChildResourceCreateInfo info) { | |||
| log.info("release lock for 【{}】", info.getParentName()); | |||
| ServiceDeployer deployer = new BaseServiceDeployer(); | |||
| ServiceBuilder builder = deployer.deploy(info); | |||
| Service svc = builder.build(); | |||
| client.services().create(svc); | |||
| log.info("lock for 【{}】 released", info.getParentName()); | |||
| } | |||
| /** | |||
| * 回收cr | |||
| * @param info | |||
| */ | |||
| private void recycleCr(ChildResourceCreateInfo info) { | |||
| log.info("recycleCr=>childResourceCreateInfo : 【{}】", info); | |||
| Optional.ofNullable(DistributeTrainClientHolder.getClient()) | |||
| .ifPresent(distributeTrainClient -> { | |||
| ObjectMeta metadata = new ObjectMeta(); | |||
| metadata.setName(info.getParentName()); | |||
| metadata.setNamespace(info.getNamespace()); | |||
| DistributeTrain dt = new DistributeTrain(metadata, DistributeTrainSpec.builder() | |||
| .build()); | |||
| distributeTrainClient.delete(dt); | |||
| log.info("recycle distribute train 【{}】", info.getParentName()); | |||
| }); | |||
| } | |||
| /**更新状态*/ | |||
| private void updateStatus(ChildResourceCreateInfo info, DistributeTrain distributeTrain) { | |||
| log.info("updateStatus=>childResourceCreateInfo : 【{}】, distributeTrain : 【{}】", info, distributeTrain); | |||
| if (distributeTrain.getStatus() == null) { | |||
| distributeTrain.setStatus(new DistributeTrainStatus()); | |||
| } | |||
| Integer size = distributeTrain.getSpec().getSize(); | |||
| distributeTrain.getStatus().setReplicas(size); | |||
| distributeTrain.getStatus().setReadyReplicas(size); | |||
| } | |||
| /** | |||
| * 为job注册监听器 | |||
| * @param info | |||
| */ | |||
| private void registerJobListener(ChildResourceCreateInfo info) { | |||
| log.info("register listener for distribute train 【{}】", info.getParentName()); | |||
| // client.batch().jobs() | |||
| // .inNamespace(info.getNamespace()) | |||
| // .withName(info.getJobName()).watch(null); | |||
| } | |||
| /** | |||
| * 获取所有分布式训练相关的pod | |||
| * @param info | |||
| * @return List<Pod> 分布式相关Pod集合 | |||
| */ | |||
| private List<Pod> getPods(ChildResourceCreateInfo info) { | |||
| log.info("getPods=>childResourceCreateInfo : 【{}】", info); | |||
| List<Pod> pods = Lists.newArrayList(); | |||
| pods.add(getMasterPod(info)); | |||
| pods.addAll(getSlavePods(info)); | |||
| if (CollectionUtil.hasNull(pods) || pods.size() != info.getSlaveReplicas() + 1) { | |||
| throw new OperatorException("can not get pods in correct numbers"); | |||
| } | |||
| return pods; | |||
| } | |||
| /** | |||
| * 获取master信息 | |||
| * @param info 资源信息 | |||
| * @return Pod Master节点对应的Pod | |||
| */ | |||
| private Pod getMasterPod(ChildResourceCreateInfo info) { | |||
| log.info("getMasterPod=>childResourceCreateInfo : 【{}】", info); | |||
| List<Pod> masterPods = client.pods().inNamespace(info.getNamespace()) | |||
| .withLabel(JOB_LABEL, info.getJobName()) | |||
| .list().getItems(); | |||
| if (CollectionUtil.isEmpty(masterPods)) { | |||
| return null; | |||
| } | |||
| return masterPods.get(0); | |||
| } | |||
| /** | |||
| * 取得从的所有pod | |||
| * @param info 资源信息 | |||
| * @return List<Pod> Slave节点对应的Pod集合 | |||
| */ | |||
| private List<Pod> getSlavePods(ChildResourceCreateInfo info) { | |||
| log.info("getSlavePods=>childResourceCreateInfo : 【{}】", info); | |||
| //取得从的所有pod | |||
| List<Pod> slavePods = client.pods().inNamespace(info.getNamespace()) | |||
| .withLabel(STATEFULSET_LABEL, info.getStatefulSetName()) | |||
| .list().getItems(); | |||
| if (CollectionUtil.isEmpty(slavePods)) { | |||
| return null; | |||
| } | |||
| return slavePods; | |||
| } | |||
| } | |||
| @@ -0,0 +1,88 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.action.handler; | |||
| import cn.hutool.core.collection.CollectionUtil; | |||
| import io.fabric8.kubernetes.api.model.Service; | |||
| import io.fabric8.kubernetes.api.model.ServiceList; | |||
| import io.fabric8.kubernetes.api.model.apps.StatefulSet; | |||
| import io.fabric8.kubernetes.api.model.apps.StatefulSetList; | |||
| import io.fabric8.kubernetes.api.model.batch.Job; | |||
| import io.fabric8.kubernetes.api.model.batch.JobList; | |||
| import io.fabric8.kubernetes.client.KubernetesClient; | |||
| import lombok.extern.slf4j.Slf4j; | |||
| import org.onebrain.operator.constants.KubeConstants; | |||
| import org.onebrain.operator.crd.DistributeTrain; | |||
| import org.onebrain.operator.redis.RedisService; | |||
| import org.onebrain.operator.redis.key.OperatorKey; | |||
| import org.springframework.beans.factory.annotation.Autowired; | |||
| import org.springframework.stereotype.Component; | |||
| /** | |||
| * @description 删除事件的处理器 | |||
| * @date 2020-09-23 | |||
| */ | |||
| @Component("deleteActionHandler") | |||
| @Slf4j | |||
| public class DeleteActionHandler implements DistributeTrainActionHandler { | |||
| @Autowired | |||
| private KubernetesClient client; | |||
| @Autowired | |||
| private RedisService redis; | |||
| /** | |||
| * 处理删除事件 | |||
| * @param distributeTrain 分布式训练信息 | |||
| */ | |||
| @Override | |||
| public void handlerAction(DistributeTrain distributeTrain) { | |||
| log.info("handlerAction=>distributeTrain : 【{}】", distributeTrain); | |||
| String namespace = distributeTrain.getMetadata().getNamespace(); | |||
| String parentName = distributeTrain.getMetadata().getName(); | |||
| // namespace+parentName(分布式训练名称) 确定相应的资源 | |||
| //删除job | |||
| JobList jobList = client.batch().jobs().inNamespace(namespace).withLabel(KubeConstants.DISTRIBUTE_TRAIN_LABEL, parentName).list(); | |||
| if(CollectionUtil.isNotEmpty(jobList.getItems())){ | |||
| for (Job item : jobList.getItems()) { | |||
| client.batch().jobs().delete(item); | |||
| } | |||
| log.info("delete job in distributeTrain 【{}】", parentName); | |||
| } | |||
| //删除statefullSete | |||
| StatefulSetList statefulSetList = client.apps().statefulSets().inNamespace(namespace).withLabel(KubeConstants.DISTRIBUTE_TRAIN_LABEL, parentName).list(); | |||
| if(CollectionUtil.isNotEmpty(statefulSetList.getItems())){ | |||
| for (StatefulSet item : statefulSetList.getItems()) { | |||
| client.apps().statefulSets().delete(item); | |||
| } | |||
| log.info("delete statefulSet in distributeTrain 【{}】", parentName); | |||
| } | |||
| //删除service | |||
| ServiceList svcList = client.services().inNamespace(namespace).withLabel(KubeConstants.DISTRIBUTE_TRAIN_LABEL, parentName).list(); | |||
| if(CollectionUtil.isNotEmpty(svcList.getItems())){ | |||
| for (Service item : svcList.getItems()) { | |||
| client.services().delete(item); | |||
| } | |||
| log.info("delete svc in distributeTrain 【{}】", parentName); | |||
| } | |||
| //删除redis里记录的分布式训练信息 | |||
| redis.del(OperatorKey.CR, distributeTrain.getMetadata().getUid()); | |||
| log.info("delete distributeTrain 【{}】 successfully", parentName); | |||
| } | |||
| } | |||
| @@ -0,0 +1,33 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.action.handler; | |||
| import org.onebrain.operator.crd.DistributeTrain; | |||
/**
 * @description Handler for an event of a distributed training resource
 * @date 2020-09-23
 */
public interface DistributeTrainActionHandler {
    /**
     * Handles the event this handler is responsible for.
     * @param distributeTrain distributed training resource the event refers to
     */
    void handlerAction(DistributeTrain distributeTrain);
}
| @@ -0,0 +1,85 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.api.pod; | |||
| import io.fabric8.kubernetes.client.dsl.ExecListener; | |||
| import lombok.Getter; | |||
| import lombok.extern.slf4j.Slf4j; | |||
| import okhttp3.Response; | |||
| import java.util.concurrent.CountDownLatch; | |||
| /** | |||
| * @description 默认命令执行监听器 | |||
| * @date 2020-09-23 | |||
| */ | |||
| @Slf4j | |||
| @Getter | |||
| public class DefaultPodExecListener implements ExecListener { | |||
| /** | |||
| * pod名称 | |||
| */ | |||
| private String podName; | |||
| /** | |||
| * 命名空间 | |||
| */ | |||
| private String namespace; | |||
| /** | |||
| * 容器名称 | |||
| */ | |||
| private String containerName; | |||
| /** | |||
| * 执行门栓 线程通信用 | |||
| */ | |||
| private CountDownLatch execLatch; | |||
| public DefaultPodExecListener(String podName, String namespace, String containerName, CountDownLatch execLatch) { | |||
| this.podName = podName; | |||
| this.namespace = namespace; | |||
| this.containerName = containerName; | |||
| this.execLatch = execLatch; | |||
| } | |||
| @Override | |||
| public void onOpen(Response response) { | |||
| log.debug("shell environment in pod '{}', namespace '{}' is opened", podName, namespace); | |||
| log.debug("onOpen: {}", response); | |||
| } | |||
| @Override | |||
| public void onFailure(Throwable t, Response response) { | |||
| log.error("shell environment in pod '{}', namespace '{}' barfed", podName, namespace); | |||
| log.error("onFailure: {} {}", t.getMessage(), response); | |||
| if (execLatch != null) { | |||
| execLatch.countDown(); | |||
| } | |||
| } | |||
| @Override | |||
| public void onClose(int code, String reason) { | |||
| log.debug("shell environment in pod '{}', namespace '{}' closed", podName, namespace); | |||
| log.debug("onClose: {} {}", code, reason); | |||
| if (execLatch != null) { | |||
| execLatch.countDown(); | |||
| } | |||
| } | |||
| } | |||
| @@ -0,0 +1,177 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.api.pod; | |||
| import cn.hutool.core.util.StrUtil; | |||
| import io.fabric8.kubernetes.client.KubernetesClient; | |||
| import io.fabric8.kubernetes.client.dsl.ExecWatch; | |||
| import lombok.extern.slf4j.Slf4j; | |||
| import org.apache.commons.io.FileUtils; | |||
| import org.onebrain.operator.context.KubeContext; | |||
| import org.springframework.beans.factory.annotation.Autowired; | |||
| import org.springframework.stereotype.Component; | |||
| import java.io.File; | |||
| import java.io.IOException; | |||
| import java.io.PipedInputStream; | |||
| import java.io.PipedOutputStream; | |||
| import java.util.concurrent.CountDownLatch; | |||
| import java.util.concurrent.atomic.AtomicBoolean; | |||
| /** | |||
| * | |||
| * @description PodApi 操作pod 里的容器用于上传文件等操作吧 | |||
| * @date 2020-09-23 | |||
| */ | |||
| @Component | |||
| @Slf4j | |||
| public class PodApi { | |||
| private static final Integer DEFAULT_LOG_LINES = 50; | |||
| @Autowired | |||
| private KubeContext kubeContext; | |||
| @Autowired | |||
| private KubernetesClient client; | |||
| /** | |||
| * 从Pod下载单个文件 | |||
| * @return File 临时文件,用后需要及时清理 | |||
| * **/ | |||
| public File copyFileFromPod(String namespace, String podName, String containerName, String filePath){ | |||
| try { | |||
| File tmpFile = File.createTempFile("copy-from-pod-", ""); | |||
| client.pods().inNamespace(namespace).withName(podName) | |||
| .inContainer(containerName) | |||
| .file(filePath) | |||
| .copy(tmpFile.toPath()); | |||
| if(tmpFile.length() == 0){ | |||
| return null; | |||
| } | |||
| return tmpFile; | |||
| } catch (IOException e) { | |||
| log.error(" File copy error : 【{}】",e); | |||
| } | |||
| return null; | |||
| } | |||
| /** | |||
| * 从Pod下载目录 | |||
| * @return File 临时文件,用后需要及时清理 | |||
| * **/ | |||
| public File copyFolderFromPod(String namespace, String podName, String containerName, String folderPath){ | |||
| final PipedInputStream stdoutInput = new PipedInputStream(); | |||
| final PipedOutputStream stdoutOutput = new PipedOutputStream(); | |||
| final PipedInputStream stderrInput = new PipedInputStream(); | |||
| final PipedOutputStream stderrOutput = new PipedOutputStream(); | |||
| final AtomicBoolean failed = new AtomicBoolean(false); | |||
| try { | |||
| stdoutInput.connect(stdoutOutput); | |||
| stderrInput.connect(stderrOutput); | |||
| //去除路径上的/前缀 | |||
| if(folderPath.startsWith(StrUtil.SLASH)){ | |||
| folderPath = StrUtil.removePrefix(folderPath, StrUtil.SLASH); | |||
| } | |||
| //监听器异步执行 | |||
| DefaultPodExecListener defaultPodExecListener = new DefaultPodExecListener(podName, namespace, containerName, null); | |||
| StdPodExecListener stdPodExecListener = new StdPodExecListener(defaultPodExecListener, stdoutOutput, stderrOutput, failed); | |||
| ExecWatch watch = client.pods().inNamespace(namespace) | |||
| .withName(podName).inContainer(containerName) | |||
| .writingOutput(stdoutOutput).writingError(stderrOutput) | |||
| .usingListener(stdPodExecListener) | |||
| .exec("tar", "cf", "-", "-C", folderPath, "."); | |||
| // execLatch.await(); | |||
| } catch (IOException e) { | |||
| log.error("copyFolderFromPod:【{}】",e); | |||
| } | |||
| File tmpFile = null; | |||
| try { | |||
| tmpFile = File.createTempFile("copy-from-pod-", ".tar"); | |||
| int length; | |||
| byte[] buffer = new byte[1024]; | |||
| while (!Thread.currentThread().isInterrupted() | |||
| && (length = stdoutInput.read(buffer)) != -1) { | |||
| byte[] content = new byte[length]; | |||
| System.arraycopy(buffer, 0, content, 0, length); | |||
| FileUtils.writeByteArrayToFile(tmpFile, content, true); | |||
| } | |||
| while (!Thread.currentThread().isInterrupted() | |||
| && (length = stderrInput.read(buffer)) != -1) { | |||
| log.error(new String(buffer, 0, length)); | |||
| } | |||
| } catch (IOException e) { | |||
| if (!Thread.currentThread().isInterrupted()) { | |||
| log.error("Error while pumping stream. 【{}】", e); | |||
| } else { | |||
| log.error("Interrupted while pumping stream. 【{}】", e); | |||
| } | |||
| } | |||
| return tmpFile; | |||
| } | |||
| /** | |||
| * 拷贝文件到pod | |||
| * @param namespace 命名空间 | |||
| * @param podName pod名称 | |||
| * @param containerName 容器名称 | |||
| * @param file 文件 | |||
| * @param targetDir 目标路径 | |||
| */ | |||
| public void copyToPod(String namespace, String podName, String containerName, File file, String targetDir){ | |||
| client.pods().inNamespace(namespace).withName(podName) | |||
| .inContainer(containerName) | |||
| .file(targetDir) | |||
| .upload(file.toPath()); | |||
| } | |||
| /** | |||
| * 同步执行 | |||
| * @param namespace 命名空间 | |||
| * @param podName pod名称 | |||
| * @param containerName 容器名称 | |||
| * @param cmd 命令 | |||
| */ | |||
| public void exec(String namespace, String podName, String containerName, String cmd){ | |||
| try { | |||
| final CountDownLatch execLatch = new CountDownLatch(1); | |||
| ExecWatch execWatch = client.pods().inNamespace(namespace).withName(podName).inContainer(containerName) | |||
| .redirectingOutput() | |||
| .withTTY() //不展示输出 | |||
| .usingListener(new DefaultPodExecListener(namespace, podName, containerName, execLatch)) | |||
| .exec("sh", "-c", cmd); | |||
| execLatch.await(); | |||
| } catch (InterruptedException e) { | |||
| log.error(" PodApi execute cmd error : 【{}】",e); | |||
| } | |||
| } | |||
| } | |||
| @@ -0,0 +1,83 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.api.pod; | |||
| import io.fabric8.kubernetes.client.dsl.ExecListener; | |||
| import lombok.extern.slf4j.Slf4j; | |||
| import okhttp3.Response; | |||
| import java.io.IOException; | |||
| import java.io.PipedOutputStream; | |||
| import java.util.concurrent.atomic.AtomicBoolean; | |||
| /** | |||
| * @description 标准pod执行监听器 | |||
| * @date 2020-09-23 | |||
| */ | |||
| @Slf4j | |||
| public class StdPodExecListener implements ExecListener { | |||
| private ExecListener defaultExecListener; | |||
| private PipedOutputStream stdoutOutput; | |||
| private PipedOutputStream stderrOutput; | |||
| private AtomicBoolean failed; | |||
| public StdPodExecListener(ExecListener defaultExecListener, PipedOutputStream stdoutOutput, PipedOutputStream stderrOutput, AtomicBoolean failed) { | |||
| this.defaultExecListener = defaultExecListener; | |||
| this.stdoutOutput = stdoutOutput; | |||
| this.stderrOutput = stderrOutput; | |||
| this.failed = failed; | |||
| } | |||
| @Override | |||
| public void onOpen(Response response) { | |||
| log.info("onOpen=>response : 【{}】",response); | |||
| defaultExecListener.onOpen(response); | |||
| } | |||
| @Override | |||
| public void onFailure(Throwable t, Response response) { | |||
| log.info("onFailure=> t :【{}】,response : 【{}】",t,response); | |||
| try { | |||
| failed.set(true); | |||
| stdoutOutput.close(); | |||
| stderrOutput.close(); | |||
| } catch (IOException e) { | |||
| log.error("Failed to close stdout and stderr pipes. 【{}】", e); | |||
| } finally { | |||
| defaultExecListener.onFailure(t, response); | |||
| } | |||
| } | |||
| @Override | |||
| public void onClose(int code, String reason) { | |||
| log.info("onClose=>code : 【{}】,reason : 【{}】",code,reason); | |||
| try { | |||
| stdoutOutput.close(); | |||
| stderrOutput.close(); | |||
| } catch (IOException e) { | |||
| log.error("Failed to close stdout and stderr pipes. 【{}】", e); | |||
| } finally { | |||
| defaultExecListener.onClose(code, reason); | |||
| } | |||
| } | |||
| } | |||
| @@ -0,0 +1,66 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.config; | |||
| import cn.hutool.core.util.StrUtil; | |||
| import io.fabric8.kubernetes.client.KubernetesClient; | |||
| import org.onebrain.operator.context.KubeContext; | |||
| import org.onebrain.operator.properties.KubeProperties; | |||
| import org.springframework.beans.factory.annotation.Autowired; | |||
| import org.springframework.boot.context.properties.EnableConfigurationProperties; | |||
| import org.springframework.context.annotation.Bean; | |||
| import org.springframework.context.annotation.Configuration; | |||
| /** | |||
| * @description k8s配置类 | |||
| * @date 2020-09-23 | |||
| */ | |||
| @Configuration | |||
| @EnableConfigurationProperties(KubeProperties.class) | |||
| public class KubeConfig { | |||
| @Autowired | |||
| private KubeProperties kubeProperties; | |||
| /** | |||
| * 注册k8s配置 | |||
| * @return | |||
| */ | |||
| @Bean | |||
| public KubeContext kubeContext() { | |||
| if (kubeProperties == null) { | |||
| return null; | |||
| } | |||
| final String configSource = kubeProperties.getKubeconfig(); | |||
| if(StrUtil.isEmpty(configSource)){ | |||
| return null; | |||
| } | |||
| return new KubeContext(kubeProperties); | |||
| } | |||
| /** | |||
| * 注册k8s客户端 | |||
| * @param kubeContext k8s配置 | |||
| * @return | |||
| */ | |||
| @Bean | |||
| public KubernetesClient kubernetesClient(KubeContext kubeContext){ | |||
| return kubeContext.getClient(); | |||
| } | |||
| } | |||
| @@ -0,0 +1,34 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.constants; | |||
/**
 * @description Constants describing the DistributeTrain custom resource definition
 * @date 2020-09-23
 */
public class CrdConstants {

    /** API group of the custom resource. */
    public static final String CRD_GROUP = "onebrain.oneflow.org";

    /** Version of the custom resource. */
    public static final String CRD_VERSION = "v1alpha1";

    /** Kind of the custom resource. */
    public static final String CRD_KIND = "DistributeTrain";

    /** Singular resource name. */
    public static final String CRD_SINGULAR_NAME = "distributetrain";

    /** Plural resource name. */
    public static final String CRD_PLURAL_NAME = "distributetrains";

    /** Short resource name. */
    public static final String CRD_SHORT_NAME = "dt";

    /** Full CRD name: plural name, a dot, then the group. */
    public static final String CRD_NAME = CRD_PLURAL_NAME + "." + CRD_GROUP;

    /** Scope of the custom resource. */
    public static final String CRD_SCOPE = "Namespaced";

    /** apiVersion value used for the CRD object itself. */
    public static final String CRD_API_VERSION = "apiextensions.k8s.io/v1beta1";
}
| @@ -0,0 +1,40 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.constants; | |||
/**
 * @description Kubernetes related constants
 * @date 2020-09-23
 */
public class KubeConstants {

    /** Label key "dt-name". */
    public static final String DISTRIBUTE_TRAIN_LABEL = "dt-name";

    /** Label key "dt-ss-name". */
    public static final String STATEFULSET_LABEL = "dt-ss-name";

    /** Label key "dt-job-name". */
    public static final String JOB_LABEL = "dt-job-name";

    /** Name of the master container. */
    public static final String MASTER_CONTAINER_NAME = "distribute-train-master";

    /** Name of the slave container. */
    public static final String SLAVE_CONTAINER_NAME = "distribute-train-slave";

    /** Name of the "user.dir" system property. */
    public static final String USER_DIR_SYSTEM_PROPERTY = "user.dir";

    /** Back-off limit: no retries allowed. */
    public static final Integer BACKOFFLIMIT = 0;

    /** Charset name. */
    public static final String CHARSET = "utf-8";

    /** Name of the environment variable "NODE_NUM". */
    public static final String ENV_NODE_NUM = "NODE_NUM";

    /** Volume name "dshm". */
    public static final String VOLUME_SHM = "dshm";
}
| @@ -0,0 +1,43 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.constants; | |||
/**
 * @description Numeric constants
 * @date 2020-6-9
 */
public class NumberConstant {

    // Plain numbers.
    public static final int NUMBER_0 = 0;
    public static final long LONG_NUMBER_0 = 0L;
    public static final int NUMBER_1 = 1;
    public static final int NUMBER_2 = 2;
    public static final int NUMBER_3 = 3;
    public static final int NUMBER_5 = 5;
    public static final int NUMBER_10 = 10;
    public static final int NUMBER_22 = 22;
    public static final int NUMBER_30 = 30;
    public static final int NUMBER_50 = 50;
    public static final int NUMBER_60 = 60;
    public static final long LONG_NUMBER_60 = 60L;

    // Durations expressed in seconds.
    /** Seconds in one hour. */
    public static final int HOUR_SECOND = 3600;
    /** Seconds in one day. */
    public static final int DAY_SECOND = 24 * HOUR_SECOND;
    /** Seconds in one week. */
    public static final int WEEK_SECOND = 7 * DAY_SECOND;

    /** Maximum page size. */
    public static final int MAX_PAGE_SIZE = 2000;
    public static final int NUMBER_30000 = 30000;
}
| @@ -0,0 +1,117 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.context; | |||
| import cn.hutool.core.util.StrUtil; | |||
| import com.fasterxml.jackson.core.JsonProcessingException; | |||
| import io.fabric8.kubernetes.api.model.HasMetadata; | |||
| import io.fabric8.kubernetes.client.Config; | |||
| import io.fabric8.kubernetes.client.DefaultKubernetesClient; | |||
| import io.fabric8.kubernetes.client.KubernetesClient; | |||
| import io.fabric8.kubernetes.client.VersionInfo; | |||
| import io.fabric8.kubernetes.client.internal.SerializationUtils; | |||
| import io.fabric8.kubernetes.client.utils.Utils; | |||
| import lombok.Getter; | |||
| import lombok.extern.slf4j.Slf4j; | |||
| import org.onebrain.operator.properties.KubeProperties; | |||
| import org.springframework.beans.BeansException; | |||
| import org.springframework.context.ApplicationContext; | |||
| import org.springframework.context.ApplicationContextAware; | |||
| /** | |||
| * @description k8s上下文 | |||
| * @date 2020-09-23 | |||
| */ | |||
| @Slf4j | |||
| @Getter | |||
| public class KubeContext implements ApplicationContextAware { | |||
| private static final String AUTO = "auto"; | |||
| private ApplicationContext applicationContext; | |||
| private KubernetesClient client; | |||
| private Config config; | |||
| public KubeContext(KubeProperties kubeProperties) { | |||
| String configSource = kubeProperties.getKubeconfig(); | |||
| try { | |||
| if(AUTO.equals(configSource)){ | |||
| //在集群内部可自动侦测 | |||
| log.info("kubernetes client is in cluster mode"); | |||
| client = new DefaultKubernetesClient(); | |||
| config = client.getConfiguration(); | |||
| }else{ | |||
| if(configSource.startsWith(StrUtil.SLASH)){ | |||
| log.info("read kubeconfig from file system:{}", configSource); | |||
| System.setProperty(Config.KUBERNETES_KUBECONFIG_FILE, configSource); | |||
| }else{ | |||
| log.info("read kubeconfig from classpath:{}", configSource); | |||
| final String testKubeconfigFile = Utils.filePath(getClass().getResource(StrUtil.SLASH + configSource)); | |||
| //修改环境变量,重新指定kubeconfig读取位置 | |||
| System.setProperty(Config.KUBERNETES_KUBECONFIG_FILE, testKubeconfigFile); | |||
| } | |||
| client = new DefaultKubernetesClient(); | |||
| config = client.getConfiguration(); | |||
| } | |||
| //打印集群信息 | |||
| log.info("ApiVersion : {}", client.getApiVersion()); | |||
| log.info("MasterUrl : {}", client.getMasterUrl()); | |||
| if(log.isDebugEnabled()){ | |||
| VersionInfo versionInfo = client.getVersion(); | |||
| log.debug("Version details of this Kubernetes cluster :-"); | |||
| log.debug("Major : {}", versionInfo.getMajor()); | |||
| log.debug("Minor : {}", versionInfo.getMinor()); | |||
| log.debug("GitVersion : {}", versionInfo.getGitVersion()); | |||
| log.debug("GitCommit : {}", versionInfo.getGitCommit()); | |||
| log.debug("BuildDate : {}", versionInfo.getBuildDate()); | |||
| log.debug("GitTreeState : {}", versionInfo.getGitTreeState()); | |||
| log.debug("Platform : {}", versionInfo.getPlatform()); | |||
| log.debug("GoVersion : {}", versionInfo.getGoVersion()); | |||
| } | |||
| }catch (Exception e){ | |||
| client = null; | |||
| log.error("初始化 K8sUtils 失败!", e); | |||
| e.printStackTrace(); | |||
| } | |||
| } | |||
| /** | |||
| * 导出成yaml字符串 | |||
| * @param resource k8s元数据 | |||
| * @return | |||
| */ | |||
| public String convertToYaml(HasMetadata resource) { | |||
| try { | |||
| return SerializationUtils.dumpAsYaml(resource); | |||
| } catch (JsonProcessingException e) { | |||
| e.printStackTrace(); | |||
| throw new RuntimeException("can not transform resource to yaml"); | |||
| } | |||
| } | |||
| @Override | |||
| public void setApplicationContext(ApplicationContext applicationContext) throws BeansException { | |||
| this.applicationContext = applicationContext; | |||
| } | |||
| } | |||
| @@ -0,0 +1,131 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.controller; | |||
| import io.fabric8.kubernetes.client.KubernetesClient; | |||
| import io.fabric8.kubernetes.client.dsl.MixedOperation; | |||
| import io.fabric8.kubernetes.client.dsl.Resource; | |||
| import io.fabric8.kubernetes.client.informers.ResourceEventHandler; | |||
| import io.fabric8.kubernetes.client.informers.SharedIndexInformer; | |||
| import io.fabric8.kubernetes.client.informers.cache.Lister; | |||
| import lombok.extern.slf4j.Slf4j; | |||
| import org.onebrain.operator.action.handler.DistributeTrainActionHandler; | |||
| import org.onebrain.operator.crd.DistributeTrain; | |||
| import org.onebrain.operator.crd.DistributeTrainList; | |||
| import org.onebrain.operator.crd.DoneableDistributeTrain; | |||
| import org.springframework.beans.factory.annotation.Autowired; | |||
| import org.springframework.beans.factory.annotation.Qualifier; | |||
| import org.springframework.scheduling.annotation.Async; | |||
| import java.util.concurrent.TimeUnit; | |||
| /** | |||
| * @description 分布式训练控制器 | |||
| * @date 2020-06-16 | |||
| */ | |||
| @Slf4j | |||
| public class DistributeTrainController { | |||
| @Autowired | |||
| private KubernetesClient client; | |||
| /** | |||
| * 分布式训练informer | |||
| */ | |||
| private SharedIndexInformer<DistributeTrain> distributeTrainSharedIndexInformer; | |||
| /** | |||
| * 分布式训练k8s访问客户端 | |||
| */ | |||
| private MixedOperation<DistributeTrain, DistributeTrainList, DoneableDistributeTrain, Resource<DistributeTrain, DoneableDistributeTrain>> distributeTrainClient; | |||
| /** | |||
| * 分布式训练lister | |||
| */ | |||
| private Lister<DistributeTrain> distributeTrainLister; | |||
| @Autowired | |||
| @Qualifier("addActionHandler") | |||
| private DistributeTrainActionHandler addActionHandler; | |||
| @Autowired | |||
| @Qualifier("deleteActionHandler") | |||
| private DistributeTrainActionHandler deleteActionHandler; | |||
| public DistributeTrainController(MixedOperation<DistributeTrain, DistributeTrainList, DoneableDistributeTrain, Resource<DistributeTrain, DoneableDistributeTrain>> distributeTrainClient, SharedIndexInformer<DistributeTrain> distributeTrainSharedIndexInformer, String namespace) { | |||
| this.distributeTrainSharedIndexInformer = distributeTrainSharedIndexInformer; | |||
| this.distributeTrainClient = distributeTrainClient; | |||
| this.distributeTrainLister = new Lister<>(distributeTrainSharedIndexInformer.getIndexer()); | |||
| } | |||
| /** | |||
| * 添加事件监听器 | |||
| */ | |||
| public void create() { | |||
| distributeTrainSharedIndexInformer.addEventHandler(new ResourceEventHandler<DistributeTrain>() { | |||
| /** | |||
| * 处理添加事件 | |||
| * @param distributeTrain 分布式训练信息 | |||
| */ | |||
| @Override | |||
| public void onAdd(DistributeTrain distributeTrain) { | |||
| log.info("add distributeTrain named 【{}】 in namespace 【{}】", distributeTrain.getMetadata().getName(), distributeTrain.getMetadata().getNamespace()); | |||
| addActionHandler.handlerAction(distributeTrain); | |||
| } | |||
| /** | |||
| * 处理更内心事件 | |||
| * @param distributeTrain 旧的 分布式训练信息 | |||
| * @param newDistributeTrain 新的 分布式训练信息 | |||
| */ | |||
| @Override | |||
| public void onUpdate(DistributeTrain distributeTrain, DistributeTrain newDistributeTrain) { | |||
| log.info("update distributeTrain named 【{}】 in namespace 【{}】", distributeTrain.getMetadata().getName(), distributeTrain.getMetadata().getNamespace()); | |||
| } | |||
| /** | |||
| * 处理删除事件 | |||
| * @param distributeTrain 分布式训练信息 | |||
| * @param b 是否为未知事件 | |||
| */ | |||
| @Override | |||
| public void onDelete(DistributeTrain distributeTrain, boolean b) { | |||
| log.info("delete distributeTrain named 【{}】 in namespace 【{}】", distributeTrain.getMetadata().getName(), distributeTrain.getMetadata().getNamespace()); | |||
| deleteActionHandler.handlerAction(distributeTrain); | |||
| } | |||
| }); | |||
| } | |||
| /** | |||
| * 运行 | |||
| */ | |||
| @Async | |||
| public void run() { | |||
| log.info("Starting DistributeTrain controller"); | |||
| try { | |||
| //分布式训练信息同步 | |||
| while (!distributeTrainSharedIndexInformer.hasSynced()){ | |||
| TimeUnit.SECONDS.sleep(1); | |||
| } | |||
| } catch (InterruptedException e) { | |||
| e.printStackTrace(); | |||
| log.error("run error:【{}】",e); | |||
| } | |||
| log.info("DistributeTrain controller is Running"); | |||
| } | |||
| } | |||
| @@ -0,0 +1,47 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.crd; | |||
| import io.fabric8.kubernetes.api.model.ObjectMeta; | |||
| import io.fabric8.kubernetes.client.CustomResource; | |||
| import lombok.Data; | |||
| import lombok.NoArgsConstructor; | |||
| /** | |||
| * @description 分布式训练 | |||
| * @date 2020-09-24 | |||
| */ | |||
| @Data | |||
| @NoArgsConstructor | |||
| public class DistributeTrain extends CustomResource { | |||
| /** | |||
| * 分布式训练详细规格 | |||
| */ | |||
| private DistributeTrainSpec spec; | |||
| /** | |||
| * 分布式训练状态 | |||
| */ | |||
| private DistributeTrainStatus status; | |||
| public DistributeTrain(ObjectMeta objectMeta, DistributeTrainSpec spec) { | |||
| this.setMetadata(objectMeta); | |||
| this.spec = spec; | |||
| } | |||
| } | |||
| @@ -0,0 +1,27 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.crd; | |||
| import io.fabric8.kubernetes.client.CustomResourceList; | |||
| /** | |||
| * @description CRD资源列表(分布式训练) | |||
| * @date 2020-09-24 | |||
| */ | |||
| public class DistributeTrainList extends CustomResourceList<DistributeTrain> { | |||
| } | |||
| @@ -0,0 +1,108 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.crd; | |||
| import com.fasterxml.jackson.databind.JsonDeserializer; | |||
| import com.fasterxml.jackson.databind.annotation.JsonDeserialize; | |||
| import io.fabric8.kubernetes.api.model.*; | |||
| import lombok.AllArgsConstructor; | |||
| import lombok.Builder; | |||
| import lombok.Data; | |||
| import lombok.NoArgsConstructor; | |||
| import java.util.List; | |||
| import java.util.Map; | |||
| /** | |||
| * @description 分布式训练详细规格 | |||
| * @date 2020-09-23 | |||
| */ | |||
| @JsonDeserialize( | |||
| using = JsonDeserializer.None.class | |||
| ) | |||
| @Data | |||
| @NoArgsConstructor | |||
| @AllArgsConstructor | |||
| @Builder | |||
| public class DistributeTrainSpec implements KubernetesResource { | |||
| /** | |||
| * 镜像 | |||
| */ | |||
| private String image; | |||
| /** | |||
| * 镜像拉取策略 | |||
| */ | |||
| private String imagePullPolicy; | |||
| /** | |||
| * 机器数 | |||
| */ | |||
| private Integer size; | |||
| /** | |||
| * 环境参数 | |||
| */ | |||
| private List<EnvVar> env; | |||
| /** | |||
| * master 命令 | |||
| */ | |||
| private String masterCmd; | |||
| /** | |||
| * slave命令 | |||
| */ | |||
| private String slaveCmd; | |||
| /** | |||
| * master 资源节点限制 | |||
| */ | |||
| private ResourceRequirements masterResources; | |||
| /** | |||
| * slave 资源节点限制 | |||
| */ | |||
| private ResourceRequirements slaveResources; | |||
| /** | |||
| * 节点调度选择器 | |||
| */ | |||
| private Map<String,String> nodeSelector; | |||
| /** | |||
| * 初始化容器 | |||
| */ | |||
| private Container initContainer; | |||
| /** | |||
| * 工作目录挂载 | |||
| */ | |||
| private Volume workspaceStorage; | |||
| /** | |||
| * 数据集目录挂载 | |||
| */ | |||
| private Volume datasetStorage; | |||
| /** | |||
| * 模型目录挂载 | |||
| */ | |||
| private Volume modelStorage; | |||
| } | |||
| @@ -0,0 +1,55 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.crd; | |||
| import com.fasterxml.jackson.databind.JsonDeserializer; | |||
| import com.fasterxml.jackson.databind.annotation.JsonDeserialize; | |||
| import io.fabric8.kubernetes.api.model.KubernetesResource; | |||
| import lombok.Data; | |||
| /** | |||
| * @description 分布式训练状态 | |||
| * @date 2020-09-23 | |||
| */ | |||
| @JsonDeserialize( | |||
| using = JsonDeserializer.None.class | |||
| ) | |||
| @Data | |||
| public class DistributeTrainStatus implements KubernetesResource { | |||
| /** | |||
| * 副本数 | |||
| */ | |||
| private Integer replicas; | |||
| /** | |||
| * 处在ready状态的副本数 | |||
| */ | |||
| private Integer readyReplicas; | |||
| /** | |||
| * 成功数 | |||
| */ | |||
| private Integer success; | |||
| /** | |||
| * 失败数 | |||
| */ | |||
| private Integer failed; | |||
| } | |||
| @@ -0,0 +1,31 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.crd; | |||
| import io.fabric8.kubernetes.api.builder.Function; | |||
| import io.fabric8.kubernetes.client.CustomResourceDoneable; | |||
| /** | |||
| * @description CRD资源的修改Builder | |||
| * @date 2020-09-24 | |||
| */ | |||
| public class DoneableDistributeTrain extends CustomResourceDoneable<DistributeTrain> { | |||
| public DoneableDistributeTrain(DistributeTrain resource, Function<DistributeTrain, DistributeTrain> function) { | |||
| super(resource, function); | |||
| } | |||
| } | |||
| @@ -0,0 +1,56 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.enums; | |||
/**
 * @description Access modes of a PVC
 * @date 2020-09-24
 */
public enum AccessModeEnum {

    /**
     * RWO is the most basic mode: readable and writable, but mountable by a single Pod only.
     */
    RWO("ReadWriteOnce"),

    /**
     * Can be mounted read-only by multiple Pods.
     */
    ROX("ReadOnlyMany"),

    /**
     * Can be shared read-write by multiple Pods.
     * Not every storage type supports all three modes; shared modes are still rarely
     * supported, NFS being the common choice.
     * A PVC is usually bound to a PV based on two criteria: storage size and access mode.
     */
    RWX("ReadWriteMany");

    /**
     * Access mode string.
     */
    private final String mode;

    AccessModeEnum(String modeValue) {
        mode = modeValue;
    }

    /**
     * @return the access mode string of this constant
     */
    public String getMode() {
        return this.mode;
    }
}
| @@ -0,0 +1,49 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.exception; | |||
| import lombok.Getter; | |||
| import lombok.extern.slf4j.Slf4j; | |||
/**
 * @description Custom unchecked exception of the operator.
 *              The message and cause are passed to {@link RuntimeException}, so
 *              {@link #getMessage()}, {@link #getCause()} and stack traces all carry them.
 *              {@link #getMsg()} is kept for callers that used the former Lombok-generated getter.
 * @date 2020-09-24
 */
public class OperatorException extends RuntimeException {

    private static final long serialVersionUID = 1L;

    /**
     * Message text; identical to what {@link #getMessage()} returns.
     */
    private final String msg;

    /**
     * @param msg   description of the failure
     * @param cause underlying throwable, may be {@code null}
     */
    public OperatorException(String msg, Throwable cause) {
        // Chain to the superclass: previously the message was stored only in a private
        // field, so getMessage() returned null and logs showed no description.
        super(msg, cause);
        this.msg = msg;
    }

    /**
     * @param msg description of the failure
     */
    public OperatorException(String msg) {
        super(msg);
        this.msg = msg;
    }

    /**
     * @return the message passed to the constructor
     */
    public String getMsg() {
        return msg;
    }
}
| @@ -0,0 +1,34 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.properties; | |||
| import lombok.Data; | |||
| import org.springframework.boot.context.properties.ConfigurationProperties; | |||
| import org.springframework.stereotype.Component; | |||
| /** | |||
| * @description 属性配置 | |||
| * @date 2020-09-24 | |||
| */ | |||
| @Data | |||
| @ConfigurationProperties("k8s") | |||
| @Component | |||
| public class KubeProperties { | |||
| private String kubeconfig; | |||
| } | |||
| @@ -0,0 +1,65 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.redis; | |||
/**
 * @description Base class for Redis key prefixes. A prefix has a name, which is expanded
 *              into {@code "Operator:<name>"}, and an expiry in seconds.
 * @date 2020-09-23
 */
public abstract class AbstractKeyPrefix {

    /**
     * Template used to build the full prefix.
     */
    private static final String KEY_TEMPLATE = "Operator:%s";

    /**
     * Raw prefix name as passed to the constructor.
     */
    private String prefix;

    /**
     * Expiry in seconds; 0 stands for "never expires".
     */
    private int expireSeconds;

    /**
     * Creates a prefix that never expires (expiry 0).
     *
     * @param prefix raw prefix name
     */
    public AbstractKeyPrefix(String prefix) {
        this(prefix, 0);
    }

    /**
     * @param prefix        raw prefix name
     * @param expireSeconds expiry in seconds
     */
    public AbstractKeyPrefix(String prefix, int expireSeconds) {
        this.prefix = prefix;
        this.expireSeconds = expireSeconds;
    }

    /**
     * Returns the expiry.
     *
     * @return expiry in seconds; the default 0 stands for "never expires"
     */
    public int getExpireSeconds() {
        return this.expireSeconds;
    }

    /**
     * Returns the full prefix.
     *
     * @return the raw prefix name formatted into {@code "Operator:%s"}
     */
    public String getPrefix() {
        final String qualified = String.format(KEY_TEMPLATE, this.prefix);
        return qualified;
    }
}
| @@ -0,0 +1,290 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.redis; | |||
| import org.onebrain.operator.utils.FastjsonUtils; | |||
| import org.onebrain.operator.utils.RedisUtils; | |||
| import org.springframework.beans.factory.annotation.Autowired; | |||
| import org.springframework.stereotype.Service; | |||
| import java.util.Set; | |||
| /** | |||
| * @description redis服务 | |||
| * @date 2020-09-03 | |||
| */ | |||
| @Service | |||
| public class RedisService { | |||
| @Autowired | |||
| private RedisUtils redisUtils; | |||
| /** | |||
| * 真正key模板 | |||
| */ | |||
| private static final String REAL_KEY_TEMPLATE = "%s:%s"; | |||
| /** | |||
| * 获取真正的key | |||
| * @param prefix 前缀 | |||
| * @param key key值 | |||
| * @return 放入redis里的key值 | |||
| */ | |||
| private String getRealKey(AbstractKeyPrefix prefix, String key){ | |||
| return String.format(REAL_KEY_TEMPLATE, prefix.getPrefix(), key); | |||
| } | |||
| /** | |||
| * 实现命令:TTL key,以秒为单位,返回给定 key的剩余生存时间(TTL, time to live)。 | |||
| * @param prefix 前缀 | |||
| * @param key key值 | |||
| * @return 返回过期时间秒数 | |||
| */ | |||
| public long ttl(AbstractKeyPrefix prefix, String key) { | |||
| return redisUtils.ttl(getRealKey(prefix, key)); | |||
| } | |||
| /** | |||
| * 实现命令:expire 设置过期时间,单位秒 | |||
| * @param prefix 前缀 | |||
| * @param key key值 | |||
| * @param timeout 期望过期时间 | |||
| */ | |||
| public void expire(AbstractKeyPrefix prefix, String key, long timeout) { | |||
| redisUtils.expire(getRealKey(prefix, key), timeout); | |||
| } | |||
| /** | |||
| * 实现命令:INCR key,增加key一次 | |||
| * @param prefix 前缀 | |||
| * @param key key值 | |||
| * @param delta 增量 | |||
| * @return 计数值 | |||
| */ | |||
| public long incr(AbstractKeyPrefix prefix, String key, long delta) { | |||
| return redisUtils.incr(getRealKey(prefix, key), delta); | |||
| } | |||
| /** | |||
| * 实现命令: key,减少key一次 | |||
| * @param prefix 前缀 | |||
| * @param key key值 | |||
| * @param delta 增量 | |||
| * @return 计数值 | |||
| */ | |||
| public long decr(AbstractKeyPrefix prefix, String key, long delta) { | |||
| String realKey = getRealKey(prefix, key); | |||
| if(delta < 0){ | |||
| //throw new RuntimeException("递减因子必须大于0"); | |||
| del(realKey); | |||
| return 0; | |||
| } | |||
| return redisUtils.decr(realKey, delta); | |||
| } | |||
| /** | |||
| * 实现命令:KEYS pattern,查找所有符合给定模式 pattern的 key | |||
| * @param prefix key前缀 | |||
| * @return key集合 | |||
| */ | |||
| public Set<String> keys(AbstractKeyPrefix prefix) { | |||
| String pattern = prefix.getPrefix(); | |||
| return redisUtils.keys(pattern + ":*"); | |||
| } | |||
| /** | |||
| * 实现命令:KEYS pattern,查找所有符合给定模式 pattern的 key | |||
| * @param prefix key前缀 | |||
| * @param key key值 | |||
| * @return key集合 | |||
| */ | |||
| public Set<String> keys(AbstractKeyPrefix prefix, String key) { | |||
| String pattern = prefix.getPrefix(); | |||
| return redisUtils.keys(pattern + ":" + key + ":*"); | |||
| } | |||
| /** | |||
| * 实现命令:DEL key,删除一个key | |||
| * @param prefix key前缀 | |||
| * @param key key值 | |||
| */ | |||
| public void del(AbstractKeyPrefix prefix, String key) { | |||
| redisUtils.del(getRealKey(prefix, key)); | |||
| } | |||
| /** | |||
| * 删除一个key | |||
| * @param realKey 真正的key | |||
| */ | |||
| public void del(String realKey) { | |||
| redisUtils.del(realKey); | |||
| } | |||
| /** | |||
| * 实现命令:SET key value,设置一个key-value(将字符串值 value关联到 key) | |||
| * @param prefix key前缀 | |||
| * @param key key值 | |||
| * @param value 值 | |||
| */ | |||
| public void set(AbstractKeyPrefix prefix, String key, String value) { | |||
| if(prefix.getExpireSeconds() <= 0){ | |||
| redisUtils.set(getRealKey(prefix, key), value); | |||
| }else{ | |||
| redisUtils.set(getRealKey(prefix, key), value, prefix.getExpireSeconds()); | |||
| } | |||
| } | |||
| /** | |||
| * 实现命令:SET key value,设置一个key-value(将字符串值 value关联到 key) | |||
| * @param prefix key前缀 | |||
| * @param key key值 | |||
| * @param value 值 | |||
| * @param <T> 指定类型 | |||
| */ | |||
| public <T> void set(AbstractKeyPrefix prefix, String key, T value) { | |||
| if(prefix.getExpireSeconds() <= 0){ | |||
| redisUtils.set(getRealKey(prefix, key), FastjsonUtils.convertObjectToJSON(value)); | |||
| }else{ | |||
| redisUtils.set(getRealKey(prefix, key), FastjsonUtils.convertObjectToJSON(value), prefix.getExpireSeconds()); | |||
| } | |||
| } | |||
| /** | |||
| * 实现命令:SET key value EX seconds,设置key-value和超时时间(秒) | |||
| * @param prefix key前缀 | |||
| * @param key key值 | |||
| * @param value 值 | |||
| * @param timeout 过期时间 | |||
| */ | |||
| public void set(AbstractKeyPrefix prefix, String key, String value, long timeout) { | |||
| redisUtils.set(getRealKey(prefix, key), value, timeout); | |||
| } | |||
| /** | |||
| * 实现命令:SET key value EX seconds,设置key-value和超时时间(秒) | |||
| * @param prefix key前缀 | |||
| * @param key key值 | |||
| * @param value 值 | |||
| * @param timeout 过期时间 | |||
| * @param <T> 指定类型 | |||
| */ | |||
| public <T> void set(AbstractKeyPrefix prefix, String key, T value, long timeout) { | |||
| redisUtils.set(getRealKey(prefix, key), FastjsonUtils.convertObjectToJSON(value), timeout); | |||
| } | |||
| /** | |||
| * 实现命令:SETNX key value,设置一个key-value(将字符串值 value关联到 key) | |||
| * @param prefix key前缀 | |||
| * @param key key值 | |||
| * @param value 值 | |||
| * @return 是否设值成功 | |||
| */ | |||
| public Boolean setnx(AbstractKeyPrefix prefix, String key, String value){ | |||
| if(prefix.getExpireSeconds() <= 0){ | |||
| return redisUtils.setnx(getRealKey(prefix, key), value); | |||
| }else{ | |||
| return redisUtils.setnx(getRealKey(prefix, key), value, prefix.getExpireSeconds()); | |||
| } | |||
| } | |||
| /** | |||
| * 实现命令:SETNX key value,设置一个key-value(将字符串值 value关联到 key) | |||
| * @param prefix key前缀 | |||
| * @param key key值 | |||
| * @param value 值 | |||
| * @param <T> 指定类型 | |||
| * @return 是否设值成功 | |||
| */ | |||
| public <T> Boolean setnx(AbstractKeyPrefix prefix, String key, T value){ | |||
| if(prefix.getExpireSeconds() <= 0){ | |||
| return redisUtils.setnx(getRealKey(prefix, key), FastjsonUtils.convertObjectToJSON(value)); | |||
| }else{ | |||
| return redisUtils.setnx(getRealKey(prefix, key), FastjsonUtils.convertObjectToJSON(value), prefix.getExpireSeconds()); | |||
| } | |||
| } | |||
| /** | |||
| * 实现命令:SETNX key value EX seconds,设置key-value和超时时间(秒) | |||
| * @param prefix key前缀 | |||
| * @param key key值 | |||
| * @param value 值 | |||
| * @param timeout 过期时间 | |||
| * @return 是否设值成功 | |||
| */ | |||
| public Boolean setnx(AbstractKeyPrefix prefix, String key, String value, long timeout) { | |||
| return redisUtils.setnx(getRealKey(prefix, key), value, timeout); | |||
| } | |||
| /** | |||
| * 实现命令:SETNX key value EX seconds,设置key-value和超时时间(秒) | |||
| * @param prefix key前缀 | |||
| * @param key key值 | |||
| * @param value 值 | |||
| * @param timeout 过期时间 | |||
| * @param <T> 指定类型 | |||
| * @return 是否设值成功 | |||
| */ | |||
| public <T> Boolean setnx(AbstractKeyPrefix prefix, String key, T value, long timeout) { | |||
| return redisUtils.setnx(getRealKey(prefix, key), FastjsonUtils.convertObjectToJSON(value), timeout); | |||
| } | |||
| /** | |||
| * 实现命令:GET key,返回 key所关联的字符串值。 | |||
| * @param prefix key前缀 | |||
| * @param key key值 | |||
| * @return 值 | |||
| */ | |||
| public String get(AbstractKeyPrefix prefix, String key) { | |||
| return redisUtils.get(getRealKey(prefix, key)); | |||
| } | |||
| /** | |||
| * 实现命令:GET key,返回 key所关联的字符串值。 | |||
| * @param prefix key前缀 | |||
| * @param key key值 | |||
| * @param <T> 指定类型 | |||
| * @return 值 | |||
| */ | |||
| public <T> T get(AbstractKeyPrefix prefix, String key, Class<T> clazz) { | |||
| return redisUtils.get(getRealKey(prefix, key), clazz); | |||
| } | |||
| /** | |||
| * 根据key获取值 | |||
| * @param lastKey 真正的key | |||
| * @param clazz 类型 | |||
| * @param <T> 泛型 | |||
| * @return | |||
| */ | |||
| public <T> T get(String lastKey, Class<T> clazz) { | |||
| return redisUtils.get(lastKey, clazz); | |||
| } | |||
| /** | |||
| * 实现命令:GET key,返回 key所关联的字符串值。 | |||
| * @param prefix key前缀 | |||
| * @param key key值 | |||
| * @return 是否存在 | |||
| */ | |||
| public Boolean exists(AbstractKeyPrefix prefix, String key) { | |||
| return redisUtils.exists(getRealKey(prefix, key)); | |||
| } | |||
| } | |||
| @@ -0,0 +1,45 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.redis.key; | |||
| import org.onebrain.operator.redis.AbstractKeyPrefix; | |||
| /** | |||
| * @description 由operator产生的cr的唯一标识 | |||
| * @date 2020-09-23 | |||
| */ | |||
| public class OperatorKey extends AbstractKeyPrefix { | |||
| public OperatorKey(String prefix) { | |||
| super(prefix); | |||
| } | |||
| public OperatorKey(String prefix, int expireSeconds) { | |||
| super(prefix, expireSeconds); | |||
| } | |||
| /** | |||
| * 分布式训练 Key | |||
| */ | |||
| public static final OperatorKey CR = new OperatorKey("DistributeTrain"); | |||
| /** | |||
| * 分布式训练Job Key | |||
| */ | |||
| public static final OperatorKey CR_JOB = new OperatorKey("DistributeTrain:Job"); | |||
| } | |||
| @@ -0,0 +1,41 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.utils; | |||
| import io.fabric8.kubernetes.client.dsl.MixedOperation; | |||
| import io.fabric8.kubernetes.client.dsl.Resource; | |||
| import org.onebrain.operator.crd.DistributeTrain; | |||
| import org.onebrain.operator.crd.DistributeTrainList; | |||
| import org.onebrain.operator.crd.DoneableDistributeTrain; | |||
| /** | |||
| * @description 分布式训练客户端持有器 | |||
| * @date 2020-09-23 | |||
| */ | |||
| public class DistributeTrainClientHolder { | |||
| private static MixedOperation<DistributeTrain, DistributeTrainList, DoneableDistributeTrain, Resource<DistributeTrain, DoneableDistributeTrain>> distributeTrainClient; | |||
| public static MixedOperation<DistributeTrain, DistributeTrainList, DoneableDistributeTrain, Resource<DistributeTrain, DoneableDistributeTrain>> getClient(){ | |||
| return distributeTrainClient; | |||
| } | |||
| public static void setDistributeTrainClient(MixedOperation<DistributeTrain, DistributeTrainList, DoneableDistributeTrain, Resource<DistributeTrain, DoneableDistributeTrain>> client){ | |||
| distributeTrainClient = client; | |||
| } | |||
| } | |||
| @@ -0,0 +1,188 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.utils; | |||
| import com.alibaba.fastjson.JSON; | |||
| import com.alibaba.fastjson.JSONObject; | |||
| import com.alibaba.fastjson.serializer.SerializerFeature; | |||
| import java.util.List; | |||
| import java.util.Map; | |||
| /** | |||
| * @description json工具类 | |||
| * @date 2020-09-24 | |||
| */ | |||
| public class FastjsonUtils { | |||
| private static final SerializerFeature[] FEATURES = { | |||
| // 输出空置字段 | |||
| SerializerFeature.WriteMapNullValue, | |||
| //日期类型用日期字符串 yyyy-MM-dd HH:mm:ss | |||
| SerializerFeature.WriteDateUseDateFormat, | |||
| // list字段如果为null,输出为[],而不是null | |||
| SerializerFeature.WriteNullListAsEmpty, | |||
| // 数值字段如果为null,输出为0,而不是null | |||
| SerializerFeature.WriteNullNumberAsZero, | |||
| // Boolean字段如果为null,输出为false,而不是null | |||
| SerializerFeature.WriteNullBooleanAsFalse, | |||
| // 字符类型字段如果为null,输出为"",而不是null | |||
| SerializerFeature.WriteNullStringAsEmpty | |||
| }; | |||
| /** | |||
| * 将对象转为json | |||
| * @param object | |||
| * @return json的String | |||
| */ | |||
| public static String convertObjectToJSON(Object object) { | |||
| return JSON.toJSONString(object, FEATURES); | |||
| } | |||
| /** | |||
| * 将对象转为json(无循环引用) | |||
| * @param object | |||
| * @return json的String | |||
| */ | |||
| public static String toJSONNoFeatures(Object object) { | |||
| return JSON.toJSONString(object, SerializerFeature.DisableCircularReferenceDetect); | |||
| } | |||
| /** | |||
| * 将json转为对象 | |||
| * @param text | |||
| * @return 对象 | |||
| */ | |||
| public static Object toBean(String text) { | |||
| return JSON.parse(text); | |||
| } | |||
| /** | |||
| * 将json转为对象 | |||
| * @param text 文本字符串 | |||
| * @param clazz 类型 | |||
| * @param <T> 泛型 | |||
| * @return 泛型对象 | |||
| */ | |||
| public static <T> T toBean(String text, Class<T> clazz) { | |||
| return JSON.parseObject(text, clazz); | |||
| } | |||
| /** | |||
| * 转换为数组 | |||
| * @param text 文本字符串 | |||
| * @return 泛型对象 | |||
| */ | |||
| public static <T> Object[] toArray(String text) { | |||
| return toArray(text, null); | |||
| } | |||
| /** | |||
| * 转换为数组 | |||
| * @param text 文本字符串 | |||
| * @param clazz 类型 | |||
| * @return | |||
| */ | |||
| public static <T> Object[] toArray(String text, Class<T> clazz) { | |||
| return JSON.parseArray(text, clazz).toArray(); | |||
| } | |||
| /** | |||
| * 转换为List | |||
| * @param text 文本字符串 | |||
| * @param clazz 类型 | |||
| * @return | |||
| */ | |||
| public static <T> List<T> toList(String text, Class<T> clazz) { | |||
| return JSON.parseArray(text, clazz); | |||
| } | |||
| /** | |||
| * 将string转化为序列化的json字符串 | |||
| * @param text 文本字符串 | |||
| * @return json对象 | |||
| */ | |||
| public static Object textToJson(String text) { | |||
| Object objectJson = JSON.parse(text); | |||
| return objectJson; | |||
| } | |||
| /** | |||
| * json字符串转化为map | |||
| * @param text json字符串 | |||
| * @return Map集合 | |||
| */ | |||
| public static <K, V> Map<K, V> stringToCollect(String text) { | |||
| Map<K, V> m = (Map<K, V>) JSONObject.parseObject(text); | |||
| return m; | |||
| } | |||
| /** | |||
| * 转换JSON字符串为对象 | |||
| * @param jsonData json字符串 | |||
| * @param clazz 转换目标对象的类型 | |||
| * @return json对象 | |||
| */ | |||
| public static Object convertJsonToObject(String jsonData, Class<?> clazz) { | |||
| return JSONObject.parseObject(jsonData, clazz); | |||
| } | |||
| /** | |||
| * 将map转化为string | |||
| * @param m Map集合 | |||
| * @return 字符串 | |||
| */ | |||
| public static <K, V> String collectToString(Map<K, V> m) { | |||
| String s = JSONObject.toJSONString(m); | |||
| return s; | |||
| } | |||
| /** | |||
| * json字符串转化为map | |||
| * | |||
| * @param text 字符串 | |||
| * @return Map 对象 | |||
| */ | |||
| public static Map stringToMap(String text) { | |||
| Map m = JSONObject.parseObject(text); | |||
| return m; | |||
| } | |||
| /** | |||
| * 将map转化为string | |||
| * | |||
| * @param m Map集合 | |||
| * @return 字符串 | |||
| */ | |||
| public static String mapToString(Map m) { | |||
| String s = JSONObject.toJSONString(m); | |||
| return s; | |||
| } | |||
| /** | |||
| * 把对象转换为指定对象 | |||
| * @param source 原对象 | |||
| * @param target 目标class | |||
| * @param <T> 泛型 | |||
| * @return 泛型对象 | |||
| */ | |||
| public static <T> T toObjectFromSource(Object source,Class<T> target) { | |||
| return toBean(convertObjectToJSON(source), target); | |||
| } | |||
| } | |||
| @@ -0,0 +1,56 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.utils; | |||
| import lombok.extern.slf4j.Slf4j; | |||
| import java.io.File; | |||
| import java.io.FileOutputStream; | |||
| import java.io.IOException; | |||
| import java.io.InputStream; | |||
| /** | |||
| * @description IO工具类 | |||
| * @date 2020-09-24 | |||
| */ | |||
| @Slf4j | |||
| public class IOUtils { | |||
| /** | |||
| * 将input流转换为文件 | |||
| * | |||
| * @param is 输入流 | |||
| * @param targetFile 目标文件 | |||
| */ | |||
| public static void copy(InputStream is, File targetFile) { | |||
| try (FileOutputStream fos = new FileOutputStream(targetFile)) { | |||
| byte[] b = new byte[1024]; | |||
| int readCount = is.read(b); | |||
| while (readCount != -1) { | |||
| // 写入数据 | |||
| fos.write(b, 0, readCount); | |||
| readCount = is.read(b); | |||
| } | |||
| is.close(); | |||
| fos.flush(); | |||
| } catch (IOException e) { | |||
| log.error("copy file error:【{}】", e); | |||
| } | |||
| } | |||
| } | |||
| @@ -0,0 +1,289 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.utils; | |||
| import org.springframework.beans.factory.annotation.Autowired; | |||
| import org.springframework.data.redis.core.StringRedisTemplate; | |||
| import org.springframework.stereotype.Component; | |||
| import java.util.Map; | |||
| import java.util.Set; | |||
| import java.util.concurrent.TimeUnit; | |||
| /** | |||
| * @description 封装redis简单的key-value操作 | |||
| * @date 2020-09-23 | |||
| */ | |||
| @Component | |||
| public class RedisUtils { | |||
| @Autowired | |||
| private StringRedisTemplate redisTemplate; | |||
| /** | |||
| * 实现命令:TTL key,以秒为单位,返回给定 key的剩余生存时间(TTL, time to live)。 | |||
| * @param key key值 | |||
| * @return 返回过期时间秒数 | |||
| */ | |||
| public long ttl(String key) { | |||
| return redisTemplate.getExpire(key); | |||
| } | |||
| /** | |||
| * 实现命令:expire 设置过期时间,单位秒 | |||
| * @param key key值 | |||
| * @param timeout 期望过期时间 | |||
| */ | |||
| public void expire(String key, long timeout) { | |||
| redisTemplate.expire(key, timeout, TimeUnit.SECONDS); | |||
| } | |||
| /** | |||
| * 实现命令:INCR key,增加key一次 | |||
| * @param key key值 | |||
| * @param delta 增量 | |||
| * @return 计数值 | |||
| */ | |||
| public long incr(String key, long delta) { | |||
| return redisTemplate.opsForValue().increment(key, delta); | |||
| } | |||
| /** | |||
| * 实现命令: key,减少key一次 | |||
| * @param key key值 | |||
| * @param delta 增量 | |||
| * @return 计数值 | |||
| */ | |||
| public long decr(String key, long delta) { | |||
| if(delta < 0){ | |||
| //throw new RuntimeException("递减因子必须大于0"); | |||
| del(key); | |||
| return 0; | |||
| } | |||
| return redisTemplate.opsForValue().increment(key, -delta); | |||
| } | |||
| /** | |||
| * 实现命令:KEYS pattern,查找所有符合给定模式 pattern的 key | |||
| * @return key集合 | |||
| */ | |||
| public Set<String> keys(String pattern) { | |||
| return redisTemplate.keys(pattern); | |||
| } | |||
| /** | |||
| * 实现命令:DEL key,删除一个key | |||
| * @param key key值 | |||
| */ | |||
| public void del(String key) { | |||
| redisTemplate.delete(key); | |||
| } | |||
| /** | |||
| * 实现命令:SET key value,设置一个key-value(将字符串值 value关联到 key) | |||
| * @param key key值 | |||
| * @param value 值 | |||
| */ | |||
| public void set(String key, String value) { | |||
| redisTemplate.opsForValue().set(key, value); | |||
| } | |||
| /** | |||
| * 实现命令:SET key value,设置一个key-value(将字符串值 value关联到 key) | |||
| * @param key key值 | |||
| * @param value 值 | |||
| * @param <T> 指定类型 | |||
| */ | |||
| public <T> void set(String key, T value) { | |||
| redisTemplate.opsForValue().set(key, FastjsonUtils.convertObjectToJSON(value)); | |||
| } | |||
| /** | |||
| * 实现命令:SET key value EX seconds,设置key-value和超时时间(秒) | |||
| * @param key key值 | |||
| * @param value 值 | |||
| * @param timeout 过期时间 | |||
| */ | |||
| public void set(String key, String value, long timeout) { | |||
| redisTemplate.opsForValue().set(key, value, timeout, TimeUnit.SECONDS); | |||
| } | |||
| /** | |||
| * 实现命令:SET key value EX seconds,设置key-value和超时时间(秒) | |||
| * @param key key值 | |||
| * @param value 值 | |||
| * @param timeout 过期时间 | |||
| * @param <T> 指定类型 | |||
| */ | |||
| public <T> void set(String key, T value, long timeout) { | |||
| redisTemplate.opsForValue().set(key, FastjsonUtils.convertObjectToJSON(value), timeout, TimeUnit.SECONDS); | |||
| } | |||
| /** | |||
| * 实现命令:SETNX key value,设置一个key-value(将字符串值 value关联到 key) | |||
| * @param key key值 | |||
| * @param value 值 | |||
| * @return 是否设值成功 | |||
| */ | |||
| public Boolean setnx(String key, String value){ | |||
| return redisTemplate.opsForValue().setIfAbsent(key, value); | |||
| } | |||
| /** | |||
| * 实现命令:SETNX key value,设置一个key-value(将字符串值 value关联到 key) | |||
| * @param key key值 | |||
| * @param value 值 | |||
| * @param <T> 指定类型 | |||
| * @return 是否设值成功 | |||
| */ | |||
| public <T> Boolean setnx(String key, T value){ | |||
| return redisTemplate.opsForValue().setIfAbsent(key, FastjsonUtils.convertObjectToJSON(value)); | |||
| } | |||
| /** | |||
| * 实现命令:SETNX key value EX seconds,设置key-value和超时时间(秒) | |||
| * @param key key值 | |||
| * @param value 值 | |||
| * @param timeout 过期时间 | |||
| * @return 是否设值成功 | |||
| */ | |||
| public Boolean setnx(String key, String value, long timeout) { | |||
| return redisTemplate.opsForValue().setIfAbsent(key, value, timeout, TimeUnit.SECONDS); | |||
| } | |||
| /** | |||
| * 实现命令:SETNX key value EX seconds,设置key-value和超时时间(秒) | |||
| * @param key key值 | |||
| * @param value 值 | |||
| * @param timeout 过期时间 | |||
| * @param <T> 指定类型 | |||
| * @return 是否设值成功 | |||
| */ | |||
| public <T> Boolean setnx(String key, T value, long timeout) { | |||
| return redisTemplate.opsForValue().setIfAbsent(key, FastjsonUtils.convertObjectToJSON(value), timeout, TimeUnit.SECONDS); | |||
| } | |||
| /** | |||
| * 实现命令:GET key,返回 key所关联的字符串值。 | |||
| * @param key key值 | |||
| * @return 值 | |||
| */ | |||
| public String get(String key) { | |||
| return (String) redisTemplate.opsForValue().get(key); | |||
| } | |||
| /** | |||
| * | |||
| * 根据key获取值 | |||
| * @param key 真正的key | |||
| * @param clazz 类型 | |||
| * @param <T> 泛型 | |||
| * @return | |||
| */ | |||
| public <T> T get(String key, Class<T> clazz) { | |||
| String value = (String) redisTemplate.opsForValue().get(key); | |||
| return (T) FastjsonUtils.convertJsonToObject(value, clazz); | |||
| } | |||
| /** | |||
| * 实现命令:GET key,返回 key所关联的字符串值。 | |||
| * @param key key值 | |||
| * @return 是否存在 | |||
| */ | |||
| public Boolean exists(String key) { | |||
| return redisTemplate.hasKey(key); | |||
| } | |||
| /****----------------------------------Hash----------------------------------------****/ | |||
| /** | |||
| * 实现命令:HSET key field value,将哈希表 key中的域 field的值设为 value | |||
| * | |||
| * @param key key | |||
| * @param field 域 | |||
| * @param value 值 | |||
| */ | |||
| public void hset(String key, String field, Object value) { | |||
| redisTemplate.opsForHash().put(key, field, value); | |||
| } | |||
| /** | |||
| * 实现命令:HGET key field,返回哈希表 key中给定域 field的值 | |||
| * | |||
| * @param key key | |||
| * @param field 域 | |||
| * @return | |||
| */ | |||
| public String hget(String key, String field) { | |||
| return (String) redisTemplate.opsForHash().get(key, field); | |||
| } | |||
| /** | |||
| * 实现命令:HDEL key field [field ...],删除哈希表 key 中的一个或多个指定域,不存在的域将被忽略。 | |||
| * | |||
| * @param key key | |||
| * @param fields 域 | |||
| */ | |||
| public void hdel(String key, Object... fields) { | |||
| redisTemplate.opsForHash().delete(key, fields); | |||
| } | |||
| /** | |||
| * 实现命令:HGETALL key,返回哈希表 key中,所有的域和值。 | |||
| * | |||
| * @param key | |||
| * @return 域和值 | |||
| */ | |||
| public Map<Object, Object> hgetall(String key) { | |||
| return redisTemplate.opsForHash().entries(key); | |||
| } | |||
| /****----------------------------------List----------------------------------------****/ | |||
| /** | |||
| * 实现命令:LPUSH key value,将一个值 value插入到列表 key的表头 | |||
| * | |||
| * @param key | |||
| * @param value | |||
| * @return 执行 LPUSH命令后,列表的长度。 | |||
| */ | |||
| public long lpush(String key, String value) { | |||
| return redisTemplate.opsForList().leftPush(key, value); | |||
| } | |||
| /** | |||
| * 实现命令:LPOP key,移除并返回列表 key的头元素。 | |||
| * | |||
| * @param key | |||
| * @return 列表key的头元素。 | |||
| */ | |||
| public String lpop(String key) { | |||
| return (String) redisTemplate.opsForList().leftPop(key); | |||
| } | |||
| /** | |||
| * 实现命令:RPUSH key value,将一个值 value插入到列表 key的表尾(最右边)。 | |||
| * | |||
| * @param key | |||
| * @param value | |||
| * @return 执行 LPUSH命令后,列表的长度。 | |||
| */ | |||
| public long rpush(String key, String value) { | |||
| return redisTemplate.opsForList().rightPush(key, value); | |||
| } | |||
| } | |||
| @@ -0,0 +1,99 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.utils; | |||
| import lombok.extern.slf4j.Slf4j; | |||
| import org.springframework.beans.BeansException; | |||
| import org.springframework.beans.factory.DisposableBean; | |||
| import org.springframework.context.ApplicationContext; | |||
| import org.springframework.context.ApplicationContextAware; | |||
| import org.springframework.stereotype.Component; | |||
| /** | |||
| * @description 上下文工具类 | |||
| * @date 2020-09-24 | |||
| */ | |||
| @Component | |||
| @Slf4j | |||
| public class SpringContextHolder implements ApplicationContextAware, DisposableBean { | |||
| public static ApplicationContext applicationContext = null; | |||
| /** | |||
| * 从静态变量applicationContext中取得Bean, 自动转型为所赋值对象的类型. | |||
| * @param name bean名称 | |||
| * @param <T> 类型 | |||
| * @return bean对象 | |||
| */ | |||
| @SuppressWarnings("unchecked") | |||
| public static <T> T getBean(String name) { | |||
| assertContextInjected(); | |||
| return (T) applicationContext.getBean(name); | |||
| } | |||
| /** | |||
| * 从静态变量applicationContext中取得Bean, 自动转型为所赋值对象的类型. | |||
| * @param requiredType bean类型 class | |||
| * @param <T> 泛型 | |||
| * @return bean对象 | |||
| */ | |||
| public static <T> T getBean(Class<T> requiredType) { | |||
| assertContextInjected(); | |||
| return applicationContext.getBean(requiredType); | |||
| } | |||
| /** | |||
| * 检查ApplicationContext不为空. | |||
| */ | |||
| private static void assertContextInjected() { | |||
| if (applicationContext == null) { | |||
| throw new IllegalStateException("applicaitonContext属性未注入, 请在applicationContext" + | |||
| ".xml中定义SpringContextHolder或在SpringBoot启动类中注册SpringContextHolder."); | |||
| } | |||
| } | |||
| /** | |||
| * 清除SpringContextHolder中的ApplicationContext为Null. | |||
| */ | |||
| private static void clearHolder() { | |||
| log.debug("清除SpringContextHolder中的ApplicationContext:" | |||
| + applicationContext); | |||
| applicationContext = null; | |||
| } | |||
| /** | |||
| * 销毁回调函数 | |||
| */ | |||
| @Override | |||
| public void destroy() { | |||
| SpringContextHolder.clearHolder(); | |||
| } | |||
| /** | |||
| * spring上下文设置 | |||
| * @param applicationContext | |||
| * @throws BeansException | |||
| */ | |||
| @Override | |||
| public void setApplicationContext(ApplicationContext applicationContext) throws BeansException { | |||
| if (SpringContextHolder.applicationContext != null) { | |||
| log.warn("SpringContextHolder中的ApplicationContext被覆盖, 原有ApplicationContext为:" + SpringContextHolder.applicationContext); | |||
| } | |||
| SpringContextHolder.applicationContext = applicationContext; | |||
| } | |||
| } | |||
| @@ -0,0 +1,111 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.watcher; | |||
| import cn.hutool.core.collection.CollectionUtil; | |||
| import cn.hutool.core.util.StrUtil; | |||
| import io.fabric8.kubernetes.api.model.OwnerReference; | |||
| import io.fabric8.kubernetes.api.model.apps.StatefulSet; | |||
| import io.fabric8.kubernetes.api.model.batch.Job; | |||
| import io.fabric8.kubernetes.client.KubernetesClient; | |||
| import lombok.Data; | |||
| import lombok.extern.slf4j.Slf4j; | |||
| import org.onebrain.operator.constants.KubeConstants; | |||
| import org.onebrain.operator.redis.RedisService; | |||
| import org.onebrain.operator.redis.key.OperatorKey; | |||
| import org.springframework.beans.factory.annotation.Autowired; | |||
| import org.springframework.stereotype.Component; | |||
| import java.util.List; | |||
| import static org.onebrain.operator.constants.CrdConstants.CRD_KIND; | |||
| /** | |||
| * @description Job处理器 | |||
| * @date 2020-09-24 | |||
| */ | |||
| @Data | |||
| @Slf4j | |||
| @Component | |||
| public class JobHandler { | |||
| public static final String FINISHED = "finished"; | |||
| public static final String PENDING = "pending"; | |||
| @Autowired | |||
| private RedisService redis; | |||
| @Autowired | |||
| private KubernetesClient client; | |||
| /** | |||
| * 处理Job | |||
| * | |||
| * @param job | |||
| */ | |||
| public void handleJob(Job job) { | |||
| log.info("handleJob=>job : 【{}】", job); | |||
| //筛选出DistributeTrain下的job | |||
| List<OwnerReference> ownerReferences = job.getMetadata().getOwnerReferences(); | |||
| if (CollectionUtil.isEmpty(ownerReferences) || !CRD_KIND.equals(ownerReferences.get(0).getKind())) { | |||
| return; | |||
| } | |||
| String key = job.getMetadata().getUid(); | |||
| if (StrUtil.equals(redis.get(OperatorKey.CR_JOB, key), FINISHED)) { | |||
| return; | |||
| } | |||
| try { | |||
| redis.set(OperatorKey.CR_JOB, key, PENDING); | |||
| final Integer parallelism = job.getSpec().getParallelism(); | |||
| final Integer backoffLimit = job.getSpec().getBackoffLimit(); | |||
| //成功 或者 失败达到最大次数 | |||
| if (job.getStatus() != null | |||
| && ((job.getStatus().getFailed() != null && job.getStatus().getFailed() + 1 >= backoffLimit) | |||
| || (job.getStatus().getSucceeded() != null && parallelism.equals(job.getStatus().getSucceeded())))) { | |||
| //得到DistributeTrain的Statefulset | |||
| String dtName = ownerReferences.get(0).getName(); | |||
| String namespace = job.getMetadata().getNamespace(); | |||
| List<StatefulSet> statefulsetList = client.apps().statefulSets() | |||
| .inNamespace(namespace) | |||
| .withLabel(KubeConstants.DISTRIBUTE_TRAIN_LABEL, dtName) | |||
| .list().getItems(); | |||
| if (CollectionUtil.isEmpty(statefulsetList)) { | |||
| log.info("jobWatcher: statefulset of 【{}】 not exists", dtName); | |||
| return; | |||
| } | |||
| //缩容Statefulset的replica到0 | |||
| StatefulSet statefulSet = statefulsetList.get(0); | |||
| statefulSet.getSpec().setReplicas(0); | |||
| client.resource(statefulSet).createOrReplace(); | |||
| log.info("jobWatcher: reduce replicas of 【{}】 to zero", dtName); | |||
| redis.set(OperatorKey.CR_JOB, key, "finished"); | |||
| } | |||
| } catch (Exception e) { | |||
| redis.set(OperatorKey.CR_JOB, key, "error"); | |||
| log.error("handle job error:【{}】", e); | |||
| } | |||
| } | |||
| } | |||
| @@ -0,0 +1,71 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.watcher; | |||
| import io.fabric8.kubernetes.api.model.batch.Job; | |||
| import io.fabric8.kubernetes.client.KubernetesClientException; | |||
| import io.fabric8.kubernetes.client.Watcher; | |||
| import lombok.Data; | |||
| import lombok.extern.slf4j.Slf4j; | |||
| /** | |||
| * @description Job监视器 | |||
| * @date 2020-09-24 | |||
| */ | |||
| @Data | |||
| @Slf4j | |||
| public class JobWatcher implements Watcher<Job> { | |||
| private String namespace; | |||
| private String jobName; | |||
| private KubeWatcherManager manager; | |||
| private JobHandler jobHandler; | |||
| public JobWatcher(JobHandler jobHandler, KubeWatcherManager manager) { | |||
| this.manager = manager; | |||
| this.jobHandler = jobHandler; | |||
| } | |||
| /** | |||
| * 接收事件进行处理 | |||
| * @param action 事件类型 | |||
| * @param job job信息 | |||
| */ | |||
| @Override | |||
| public void eventReceived(Action action, Job job) { | |||
| log.info("Job Event received: {} at {}", job.getMetadata().getUid(), job.getMetadata().getCreationTimestamp()); | |||
| jobHandler.handleJob(job); | |||
| } | |||
| /** | |||
| * 关闭事件 | |||
| * @param e 客户端异常 | |||
| */ | |||
| @Override | |||
| public void onClose(KubernetesClientException e) { | |||
| log.debug("job watcher close"); | |||
| if (e != null) { | |||
| log.error(e.getMessage()); | |||
| log.info("restart new job watcher thread"); | |||
| manager.putNewWatcher(); | |||
| } | |||
| } | |||
| } | |||
| @@ -0,0 +1,120 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator.watcher; | |||
| import io.fabric8.kubernetes.client.KubernetesClient; | |||
| import lombok.extern.slf4j.Slf4j; | |||
| import org.onebrain.operator.context.KubeContext; | |||
| import org.springframework.beans.factory.annotation.Autowired; | |||
| import org.springframework.stereotype.Component; | |||
| import java.util.concurrent.LinkedBlockingQueue; | |||
| import java.util.concurrent.ThreadFactory; | |||
| import java.util.concurrent.ThreadPoolExecutor; | |||
| import java.util.concurrent.TimeUnit; | |||
| import java.util.concurrent.atomic.AtomicInteger; | |||
| /** | |||
| * @description 监视器的管理器 | |||
| * @date 2020-09-24 | |||
| */ | |||
| @Slf4j | |||
| @Component | |||
| public class KubeWatcherManager { | |||
| /** | |||
| * 监视队列 | |||
| */ | |||
| private static final LinkedBlockingQueue<JobWatcher> watchQueue = new LinkedBlockingQueue<>(1000); | |||
| /** | |||
| * 单例线程池 | |||
| */ | |||
| private ThreadPoolExecutor pool = new ThreadPoolExecutor(1, 1, 1, TimeUnit.SECONDS, new LinkedBlockingQueue<>(1), new ThreadFactory() { | |||
| private final AtomicInteger mThreadNum = new AtomicInteger(1); | |||
| @Override | |||
| public Thread newThread(Runnable r) { | |||
| return new Thread(r, "job-watcher-" + mThreadNum.getAndIncrement()); | |||
| } | |||
| }); | |||
| @Autowired | |||
| private KubeContext kubeContext; | |||
| @Autowired | |||
| private JobHandler jobHandler; | |||
| /** | |||
| * 第一次启动时 | |||
| */ | |||
| public void startWatching(){ | |||
| JobWatchHolder jobWatchHolder = new JobWatchHolder(); | |||
| pool.execute(jobWatchHolder); | |||
| putNewWatcher(); | |||
| } | |||
| /** | |||
| * 监听指定job | |||
| * @param jobWatcher | |||
| */ | |||
| public void watch(JobWatcher jobWatcher){ | |||
| KubernetesClient client = kubeContext.getClient(); | |||
| //监听指定job | |||
| client.batch().jobs() | |||
| .inAnyNamespace().watch(jobWatcher); | |||
| } | |||
| /** | |||
| * 加入新watcher | |||
| */ | |||
| public void putNewWatcher(){ | |||
| try { | |||
| JobWatcher jobWatcher = new JobWatcher(jobHandler, this); | |||
| watchQueue.put(jobWatcher); | |||
| } catch (InterruptedException e) { | |||
| e.printStackTrace(); | |||
| } | |||
| } | |||
| /** | |||
| * Job监视器持有者 | |||
| */ | |||
| class JobWatchHolder implements Runnable { | |||
| @Override | |||
| public void run() { | |||
| while(true){ | |||
| try { | |||
| //无监视器时阻塞 | |||
| JobWatcher jobWatcher = watchQueue.take(); | |||
| //启动监视器 | |||
| try{ | |||
| watch(jobWatcher); | |||
| }catch (Exception e){ | |||
| //出错不影响其他listener | |||
| log.error("JobWatchHolder watch error:【{}】",e); | |||
| } | |||
| } catch (InterruptedException e) { | |||
| log.error("JobWatchHolder run error:【{}】",e); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| @@ -0,0 +1,27 @@ | |||
| -----BEGIN RSA PRIVATE KEY----- | |||
| MIIEogIBAAKCAQEA06ZOLQq4pzBZL+bybsxdl4PzYg3jB4kRVc771nm5Y8JenDAT | |||
| hlOTz6+nGH4EDT63J7oNj4JYLufsONKYhJkya8p0btWeKHqz5LgEfLGwz/FTMRH5 | |||
| WTCZCZUa/3i9gQeKK/CKEned1h4l2w4agrYrnXHpnuNSw6HSlTpX8FgaQGfmTkL3 | |||
| XtzSCeY9F2fXGOm9fMfVmv5I5uP6B4TmKwtWPvx3a/1MDgHbmtoaCqYP/JmzWHyi | |||
| mc9l2ilX3kTPxh57oRtW9N3FATc8/OCYkNt4vDUTRVB4drODaR5TgUbFtkBVGcFR | |||
| f7MrQo4Krd2g8rtEv7PaWN/wlNle5ANXJ/oL3wIDAQABAoIBADiqC8APYMSSMy6Z | |||
| /EohuOT51M1pvmCkF9oLYm1XhYTp4v6Z+IA8HBS8iFYMVvVc1xhxvXOwh/925E2K | |||
| RH8rrM4jE+0gkAlyYHtZsQnZYOcrSwSWNVXlpvNj0iiXoNTMufdtnOm40K8kvynY | |||
| qsxYDXFHsC5z2hK6XnDJgAW+8LhRHCizWwxc0dSN9r33VGry0rgndUZsj2ZBf7u5 | |||
| rdslZKvRzMymXct7CIQQ3s5IUO3qbaj7TIzMIo14bmHgD3zlBQ66ESCX1o5A+hPq | |||
| 1gfUNqUPBtJhsNJg4YYJ/bGgGhBxAxam8jWz3DFZEuYHr6fCDIhLJzL5ulxoQS2z | |||
| vJYBwsECgYEA8JGfw004BxqcBVxqBveestsCVGIWDtb+Zx4OI+uBAmYMXd2WCzxv | |||
| XxgQJ/IrpNx6FAXZ/bFdE0HRZWR6H07wtNgABuBgd0tAfcH8sw2CJkTO/0N2Xr6/ | |||
| O4kh3yHNMy/wAxnktISf1hE/ElEdPI6slhwGDQObRdXxaqBEq+Tjc28CgYEA4TnM | |||
| rCaJ8aMaUE0nvVzrev3VTLp4f1qOcPUOnrHDdyrPs1SjYzmAOC72X/FylJZmtkvh | |||
| coMQUKVQgiBn1dTtnALANq705b1S+0U07m6+dGJ7LWchOY2tFPiIsx3SZvNJeEKJ | |||
| 38PsaFi2eDcDP8cKriNoAoby8TbqjqiyHgDX9pECfxww9IfuhKJQe/gk3Ef0vKQ5 | |||
| BgzdcbhLeYScAQw0jOm7C7f0P6ERc/uw1jPYLUUkkSnHhcQ1BLM9A0zeeXExzwNi | |||
| TJ6BrMxOBUC3euWAr7/MUHWZckWoFMDlURLU4zccZwP2BNcis5hibQG4f7SZA6CT | |||
| qCHeSlPkvmXAYkvChuUCgYEA0DNlL9KkfBqBja/1R4jpKhYSIs7R6zCkMmlm7W54 | |||
| ueV6gVWBgI08KTPIj2KcwBzUsDovG3NrFpHrfY9FTZd7W1fzpdlQDDxaxGryhmMb | |||
| bm1HXu5R+WktkhA6FhJAWOkXhrNDzvXHyaIQc8qvFzsBdX7HfGaRmEhixiPOHAw9 | |||
| l/ECgYEAwNywUARR9HtmgoyrwifrzIkMo6jcmLNEIzi2kJ4OQQxW5eKj5JgSV0ND | |||
| QUoAIWDAhHQd3ygSfbeShcvtcw+zoF92iOVFn0SLiSe1TgA5ggzC/VJUnInO7zx7 | |||
| 8Sj8Zk5tHrVmTlelEA2Nbq5H7/U1Q33c1AWbw8yxqD/JRxudHKA= | |||
| -----END RSA PRIVATE KEY----- | |||
| @@ -0,0 +1 @@ | |||
| ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDTpk4tCrinMFkv5vJuzF2Xg/NiDeMHiRFVzvvWebljwl6cMBOGU5PPr6cYfgQNPrcnug2Pglgu5+w40piEmTJrynRu1Z4oerPkuAR8sbDP8VMxEflZMJkJlRr/eL2BB4or8IoSd53WHiXbDhqCtiudceme41LDodKVOlfwWBpAZ+ZOQvde3NIJ5j0XZ9cY6b18x9Wa/kjm4/oHhOYrC1Y+/Hdr/UwOAdua2hoKpg/8mbNYfKKZz2XaKVfeRM/GHnuhG1b03cUBNzz84JiQ23i8NRNFUHh2s4NpHlOBRsW2QFUZwVF/sytCjgqt3aDyu0S/s9pY3/CU2V7kA1cn+gvf root@{{ip}} | |||
| @@ -0,0 +1,19 @@ | |||
| apiVersion: v1 | |||
| clusters: | |||
| - cluster: | |||
| certificate-authority-data: {} | |||
| server: {} | |||
| name: kubernetes | |||
| contexts: | |||
| - context: | |||
| cluster: kubernetes | |||
| user: kubernetes-admin | |||
| name: kubernetes-admin@kubernetes | |||
| current-context: kubernetes-admin@kubernetes | |||
| kind: Config | |||
| preferences: {} | |||
| users: | |||
| - name: kubernetes-admin | |||
| user: | |||
| client-certificate-data: {} | |||
| client-key-data: {} | |||
| @@ -0,0 +1,46 @@ | |||
#!/bin/bash
# Container pre-treatment: makes sure an SSH server is installed and started, and that
# jq and nslookup are available. Progress plus the stdout/stderr of every command are
# appended to pretreatment.log in the current working directory.

LOG_FILE="pretreatment.log"
APT_INDEX_REFRESHED=0

# Refresh the apt package index at most once per run; installs fail on images that
# ship without package lists.
apt_refresh_once() {
    if [ "$APT_INDEX_REFRESHED" -eq 0 ]; then
        apt update >> "$LOG_FILE" 2>&1
        APT_INDEX_REFRESHED=1
    fi
}

# ---- 1. Install an SSH server when none is present -------------------------------
# The two release files are treated as mutually exclusive so that only one package
# manager is ever invoked.
if [ ! -f "/etc/init.d/ssh" ] && ! command -v sshd >/dev/null 2>&1; then
    if [ -f "/etc/redhat-release" ]; then
        # The yum package that provides sshd is openssh-server.
        echo 'yum install -y openssh-server' >> "$LOG_FILE"
        # makecache only refreshes metadata and never waits for confirmation.
        yum makecache >> "$LOG_FILE" 2>&1
        yum install -y openssh-server >> "$LOG_FILE" 2>&1
    else
        echo 'apt install -y openssh-server' >> "$LOG_FILE"
        apt_refresh_once
        apt install -y openssh-server >> "$LOG_FILE" 2>&1
    fi
fi

# ---- 2. Start the SSH server --------------------------------------------------------
if [ -f "/etc/init.d/ssh" ]; then
    echo '/etc/init.d/ssh start' >> "$LOG_FILE"
    /etc/init.d/ssh start >> "$LOG_FILE" 2>&1
else
    # No Debian-style init script: create missing host keys and launch sshd directly
    # (sshd has to be invoked through its absolute path).
    SSHD_BIN="$(command -v sshd || echo /usr/sbin/sshd)"
    echo "ssh-keygen -A && $SSHD_BIN" >> "$LOG_FILE"
    ssh-keygen -A >> "$LOG_FILE" 2>&1
    "$SSHD_BIN" >> "$LOG_FILE" 2>&1
fi

# ---- 3. Install jq and nslookup -----------------------------------------------------
if [ -f "/etc/redhat-release" ]; then
    if command -v jq >/dev/null 2>&1; then
        echo 'exists jq' >> "$LOG_FILE"
    else
        echo 'yum install jq' >> "$LOG_FILE"
        # On CentOS/RHEL 7 jq comes from EPEL; enable it when the plain install fails.
        yum install -y jq >> "$LOG_FILE" 2>&1 \
            || { yum install -y epel-release >> "$LOG_FILE" 2>&1 \
                 && yum install -y jq >> "$LOG_FILE" 2>&1; }
    fi
    if command -v nslookup >/dev/null 2>&1; then
        echo 'exists nslookup' >> "$LOG_FILE"
    else
        # nslookup is provided by bind-utils on yum-based systems.
        echo 'yum install bind-utils' >> "$LOG_FILE"
        yum install -y bind-utils >> "$LOG_FILE" 2>&1
    fi
elif [ -f "/etc/lsb-release" ] || [ -f "/etc/debian_version" ]; then
    if command -v jq >/dev/null 2>&1; then
        echo 'exists jq' >> "$LOG_FILE"
    else
        echo 'apt install jq' >> "$LOG_FILE"
        apt_refresh_once
        apt install -y jq >> "$LOG_FILE" 2>&1
    fi
    if command -v nslookup >/dev/null 2>&1; then
        echo 'exists nslookup' >> "$LOG_FILE"
    else
        echo 'apt install dnsutils' >> "$LOG_FILE"
        apt_refresh_once
        apt install -y dnsutils >> "$LOG_FILE" 2>&1
    fi
fi
| @@ -0,0 +1,43 @@ | |||
| /** | |||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| * ============================================================= | |||
| */ | |||
| package org.onebrain.operator; | |||
| import org.onebrain.operator.api.pod.PodApi; | |||
| import org.onebrain.operator.constants.KubeConstants; | |||
| import org.springframework.beans.factory.annotation.Autowired; | |||
| import org.springframework.boot.test.context.SpringBootTest; | |||
| import java.io.File; | |||
| import java.net.URISyntaxException; | |||
| import java.net.URL; | |||
| @SpringBootTest | |||
| public class DistributeTrainOperatorApplicationTests { | |||
| @Autowired | |||
| private PodApi podApi; | |||
| // @Test | |||
| public void contextLoads() throws URISyntaxException { | |||
| final URL url = getClass().getClassLoader().getResource("key/id_rsa"); | |||
| File file = new File(url.toURI()); | |||
| podApi.copyToPod("default", "distribute-train-test-job-sv2dj", KubeConstants.MASTER_CONTAINER_NAME, file, "/root/.ssh/id_rsa"); | |||
| } | |||
| } | |||