| @@ -0,0 +1,26 @@ | |||||
| # 之江天枢-分布式训练 operator | |||||
| 该模块是分布式训练CRD的控制器,管理分布式训练容器生命周期,为分布式训练容器注入其他容器ip。 | |||||
| ## 源码部署 | |||||
| ### 准备环境 | |||||
| 安装如下软件环境。 | |||||
| - OpenJDK:1.8+ | |||||
| - Redis: 3.0+ | |||||
| - Maven: 3.0+ | |||||
| ### 下载源码 | |||||
| ``` bash | |||||
| git clone https://codeup.teambition.com/zhejianglab/distribute-train-operator.git | |||||
| # 进入项目根目录 | |||||
| cd distribute-train-operator | |||||
| ``` | |||||
| ### 构建 | |||||
| ``` bash | |||||
| # 构建,生成的 jar 包位于 ./target/distribute-train-operator-1.0.jar | |||||
| mvn clean compile package | |||||
| ``` | |||||
| ### 部署 | |||||
| 部署过程参看文档:[部署 分布式训练operator](http://tianshu.org.cn/?/course/1.html) | |||||
| @@ -0,0 +1,65 @@ | |||||
| apiVersion: onebrain.oneflow.org/v1alpha1 | |||||
| kind: DistributeTrain | |||||
| metadata: | |||||
| name: dt-resnet50 | |||||
| namespace: resnet50 | |||||
| labels: | |||||
| key: value | |||||
| spec: | |||||
| size: 3 | |||||
| image: {{IMAGE}} | |||||
| imagePullPolicy: IfNotPresent | |||||
| masterCmd: export NODE_IPS=`cat /home/hostfile.json |jq -r '.[]|.ip'|paste -d "," -s` && cd /workspace/Classification/cnns && rm -rf core.* && rm -rf ./output/snapshots/* && python3 of_cnn_train_val.py --train_data_dir=$DATA_ROOT/train --train_data_part_num=$TRAIN_DATA_PART_NUM --val_data_dir=$DATA_ROOT/validation --val_data_part_num=$VAL_DATA_PART_NUM --num_nodes=$NODE_NUM --node_ips="$NODE_IPS" --gpu_num_per_node=$GPU_NUM_PER_NODE --model_update="momentum" --learning_rate=0.256 --loss_print_every_n_iter=1 --batch_size_per_device=64 --val_batch_size_per_device=64 --num_epoch=1 --model="resnet50" --model_save_dir=/model | |||||
| masterResources: | |||||
| requests: | |||||
| nvidia.com/gpu: 2 | |||||
| memory: "16Gi" | |||||
| cpu: "2" | |||||
| limits: | |||||
| nvidia.com/gpu: 2 | |||||
| memory: "16Gi" | |||||
| cpu: "2" | |||||
| slaveCmd: export NODE_IPS=`cat /home/hostfile.json |jq -r '.[]|.ip'|paste -d "," -s` && cd /workspace/Classification/cnns && rm -rf core.* && rm -rf ./output/snapshots/* && python3 of_cnn_train_val.py --train_data_dir=$DATA_ROOT/train --train_data_part_num=$TRAIN_DATA_PART_NUM --val_data_dir=$DATA_ROOT/validation --val_data_part_num=$VAL_DATA_PART_NUM --num_nodes=$NODE_NUM --node_ips="$NODE_IPS" --gpu_num_per_node=$GPU_NUM_PER_NODE --model_update="momentum" --learning_rate=0.256 --loss_print_every_n_iter=1 --batch_size_per_device=64 --val_batch_size_per_device=64 --num_epoch=1 --model="resnet50" --model_save_dir=/model | |||||
| slaveResources: | |||||
| requests: | |||||
| nvidia.com/gpu: 2 | |||||
| memory: "16Gi" | |||||
| cpu: "2" | |||||
| limits: | |||||
| nvidia.com/gpu: 2 | |||||
| memory: "16Gi" | |||||
| cpu: "2" | |||||
| nodeSelector: | |||||
| kubernetes.io/hostname: node02 | |||||
| env: | |||||
| - name: ENABLE_USER_OP | |||||
| value: 'True' | |||||
| - name: DATA_ROOT | |||||
| value: '/dataset' | |||||
| - name: NODE_NUM | |||||
| value: 3 | |||||
| - name: GPU_NUM_PER_NODE | |||||
| value: 2 | |||||
| - name: ONEFLOW_DEBUG_MODE | |||||
| value: "" | |||||
| - name: TRAIN_DATA_PART_NUM | |||||
| value: 6 | |||||
| - name: VAL_DATA_PART_NUM | |||||
| value: 6 | |||||
| - name: NCCL_DEBUG | |||||
| value: INFO | |||||
| datasetStorage: | |||||
| name: pvc-dataset | |||||
| nfs: | |||||
| path: {{DATASET}} | |||||
| server: {{NFS}} | |||||
| workspaceStorage: | |||||
| name: pvc-workspace | |||||
| nfs: | |||||
| path: /nfs/resnet50/workspace | |||||
| server: {{WORKSPACE}} | |||||
| modelStorage: | |||||
| name: pvc-model | |||||
| nfs: | |||||
| path: /nfs/resnet50/model | |||||
| server: {{MODEL}} | |||||
| @@ -0,0 +1,61 @@ | |||||
| --- | |||||
| apiVersion: apiextensions.k8s.io/v1beta1 | |||||
| kind: CustomResourceDefinition | |||||
| metadata: | |||||
| name: distributetrains.onebrain.oneflow.org | |||||
| spec: | |||||
| group: onebrain.oneflow.org | |||||
| names: | |||||
| kind: DistributeTrain | |||||
| singular: distributetrain | |||||
| plural: distributetrains | |||||
| shortNames: | |||||
| - dt | |||||
| scope: Namespaced | |||||
| subresources: | |||||
| status: {} | |||||
| version: v1alpha1 | |||||
| validation: | |||||
| openAPIV3Schema: | |||||
| properties: | |||||
| apiVersion: | |||||
| type: string | |||||
| kind: | |||||
| type: string | |||||
| metadata: | |||||
| type: object | |||||
| spec: | |||||
| properties: | |||||
| image: | |||||
| type: string | |||||
| imagePullPolicy: | |||||
| type: string | |||||
| size: | |||||
| format: int32 | |||||
| type: integer | |||||
| masterCmd: | |||||
| type: string | |||||
| slaveCmd: | |||||
| type: string | |||||
| masterResources: | |||||
| type: object | |||||
| slaveResources: | |||||
| type: object | |||||
| nodeSelector: | |||||
| type: object | |||||
| initContainer: | |||||
| type: object | |||||
| datasetStorage: | |||||
| type: object | |||||
| workspaceStorage: | |||||
| type: object | |||||
| modelStorage: | |||||
| type: object | |||||
| required: | |||||
| - image | |||||
| - imagePullPolicy | |||||
| - size | |||||
| - masterCmd | |||||
| - slaveCmd | |||||
| - workspaceStorage | |||||
| type: object | |||||
| @@ -0,0 +1,47 @@ | |||||
| kind: Deployment | |||||
| apiVersion: apps/v1 | |||||
| metadata: | |||||
| name: distribute-train-operator | |||||
| namespace: test-ns | |||||
| labels: | |||||
| name: distribute-train-operator | |||||
| spec: | |||||
| replicas: 1 | |||||
| selector: | |||||
| matchLabels: | |||||
| name: distribute-train-operator | |||||
| template: | |||||
| metadata: | |||||
| labels: | |||||
| name: distribute-train-operator | |||||
| spec: | |||||
| containers: | |||||
| - name: distribute-train-operator | |||||
| image: {{IMAGE}} | |||||
| ports: | |||||
| - containerPort: 8080 | |||||
| protocol: TCP | |||||
| volumeMounts:d | |||||
| - mountPath: /root/config | |||||
| name: config-volume | |||||
| env: | |||||
| - name: JAR_BALL | |||||
| value: "distribute-train-operator-1.0.jar --k8s.kubeconfig=/root/config --spring.redis.host=192.168.1.104" | |||||
| imagePullPolicy: IfNotPresent | |||||
| volumes: | |||||
| - name: config-volume | |||||
| hostPath: | |||||
| path: /root/.kube/config | |||||
| restartPolicy: Always | |||||
| terminationGracePeriodSeconds: 30 | |||||
| securityContext: | |||||
| runAsUser: 0 | |||||
| schedulerName: default-scheduler | |||||
| strategy: | |||||
| type: RollingUpdate | |||||
| rollingUpdate: | |||||
| maxUnavailable: 1 | |||||
| maxSurge: 1 | |||||
| revisionHistoryLimit: 7 | |||||
| progressDeadlineSeconds: 600 | |||||
| @@ -0,0 +1,150 @@ | |||||
| <?xml version="1.0" encoding="UTF-8"?> | |||||
| <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | |||||
| xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd"> | |||||
| <modelVersion>4.0.0</modelVersion> | |||||
| <parent> | |||||
| <groupId>org.springframework.boot</groupId> | |||||
| <artifactId>spring-boot-starter-parent</artifactId> | |||||
| <version>2.2.5.RELEASE</version> | |||||
| </parent> | |||||
| <groupId>org.onebrain</groupId> | |||||
| <artifactId>distribute-train-operator</artifactId> | |||||
| <version>1.0</version> | |||||
| <name>distribute-train-operator</name> | |||||
| <description>distribute-train operatior</description> | |||||
| <properties> | |||||
| <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> | |||||
| <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding> | |||||
| <java.version>1.8</java.version> | |||||
| <fabric.io.version>4.9.0</fabric.io.version> | |||||
| </properties> | |||||
| <dependencies> | |||||
| <!-- web --> | |||||
| <dependency> | |||||
| <groupId>org.springframework.boot</groupId> | |||||
| <artifactId>spring-boot-starter-web</artifactId> | |||||
| </dependency> | |||||
| <!-- k8s --> | |||||
| <dependency> | |||||
| <groupId>io.fabric8</groupId> | |||||
| <artifactId>kubernetes-client</artifactId> | |||||
| <version>${fabric.io.version}</version> | |||||
| </dependency> | |||||
| <dependency> | |||||
| <groupId>io.fabric8</groupId> | |||||
| <artifactId>kubernetes-assertions</artifactId> | |||||
| <version>4.0.0</version> | |||||
| <scope>test</scope> | |||||
| </dependency> | |||||
| <!-- configuration processor --> | |||||
| <dependency> | |||||
| <groupId>org.springframework.boot</groupId> | |||||
| <artifactId>spring-boot-configuration-processor</artifactId> | |||||
| </dependency> | |||||
| <!-- redis --> | |||||
| <dependency> | |||||
| <groupId>org.springframework.boot</groupId> | |||||
| <artifactId>spring-boot-starter-data-redis</artifactId> | |||||
| </dependency> | |||||
| <dependency> | |||||
| <groupId>redis.clients</groupId> | |||||
| <artifactId>jedis</artifactId> | |||||
| </dependency> | |||||
| <!-- common jars --> | |||||
| <dependency> | |||||
| <groupId>commons-io</groupId> | |||||
| <artifactId>commons-io</artifactId> | |||||
| <version>2.6</version> | |||||
| </dependency> | |||||
| <dependency> | |||||
| <groupId>org.apache.commons</groupId> | |||||
| <artifactId>commons-compress</artifactId> | |||||
| <version>1.19</version> | |||||
| </dependency> | |||||
| <dependency> | |||||
| <groupId>commons-codec</groupId> | |||||
| <artifactId>commons-codec</artifactId> | |||||
| </dependency> | |||||
| <!-- tools --> | |||||
| <dependency> | |||||
| <groupId>cn.hutool</groupId> | |||||
| <artifactId>hutool-all</artifactId> | |||||
| <version>5.1.1</version> | |||||
| </dependency> | |||||
| <dependency> | |||||
| <groupId>com.google.guava</groupId> | |||||
| <artifactId>guava</artifactId> | |||||
| <version>27.0.1-jre</version> | |||||
| </dependency> | |||||
| <dependency> | |||||
| <groupId>com.alibaba</groupId> | |||||
| <artifactId>fastjson</artifactId> | |||||
| <version>1.2.54</version> | |||||
| </dependency> | |||||
| <dependency> | |||||
| <groupId>org.projectlombok</groupId> | |||||
| <artifactId>lombok</artifactId> | |||||
| <optional>true</optional> | |||||
| </dependency> | |||||
| <dependency> | |||||
| <groupId>org.springframework.boot</groupId> | |||||
| <artifactId>spring-boot-starter-test</artifactId> | |||||
| <scope>test</scope> | |||||
| </dependency> | |||||
| </dependencies> | |||||
| <build> | |||||
| <plugins> | |||||
| <plugin> | |||||
| <groupId>org.springframework.boot</groupId> | |||||
| <artifactId>spring-boot-maven-plugin</artifactId> | |||||
| </plugin> | |||||
| <!-- 打包时跳过测试 --> | |||||
| <plugin> | |||||
| <groupId>org.apache.maven.plugins</groupId> | |||||
| <artifactId>maven-surefire-plugin</artifactId> | |||||
| <configuration> | |||||
| <skip>true</skip> | |||||
| </configuration> | |||||
| </plugin> | |||||
| </plugins> | |||||
| </build> | |||||
| <repositories> | |||||
| <repository> | |||||
| <id>public</id> | |||||
| <name>aliyun nexus</name> | |||||
| <url>http://maven.aliyun.com/nexus/content/groups/public/</url> | |||||
| <releases> | |||||
| <enabled>true</enabled> | |||||
| </releases> | |||||
| </repository> | |||||
| </repositories> | |||||
| <pluginRepositories> | |||||
| <pluginRepository> | |||||
| <id>public</id> | |||||
| <name>aliyun nexus</name> | |||||
| <url>http://maven.aliyun.com/nexus/content/groups/public/</url> | |||||
| <releases> | |||||
| <enabled>true</enabled> | |||||
| </releases> | |||||
| <snapshots> | |||||
| <enabled>false</enabled> | |||||
| </snapshots> | |||||
| </pluginRepository> | |||||
| </pluginRepositories> | |||||
| </project> | |||||
| @@ -0,0 +1,35 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator; | |||||
| import org.springframework.boot.SpringApplication; | |||||
| import org.springframework.boot.autoconfigure.SpringBootApplication; | |||||
| import org.springframework.scheduling.annotation.EnableAsync; | |||||
| /** | |||||
| * @description Operator启动类 | |||||
| * @date 2020-09-03 | |||||
| */ | |||||
| @SpringBootApplication | |||||
| @EnableAsync | |||||
| public class DistributeTrainOperatorApplication { | |||||
| public static void main(String[] args) { | |||||
| SpringApplication.run(DistributeTrainOperatorApplication.class, args); | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,199 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.action; | |||||
| import cn.hutool.core.util.StrUtil; | |||||
| import com.fasterxml.jackson.core.JsonProcessingException; | |||||
| import com.google.common.collect.Maps; | |||||
| import io.fabric8.kubernetes.api.model.apiextensions.*; | |||||
| import io.fabric8.kubernetes.client.KubernetesClient; | |||||
| import io.fabric8.kubernetes.client.dsl.MixedOperation; | |||||
| import io.fabric8.kubernetes.client.dsl.Resource; | |||||
| import io.fabric8.kubernetes.client.dsl.base.CustomResourceDefinitionContext; | |||||
| import io.fabric8.kubernetes.client.informers.SharedIndexInformer; | |||||
| import io.fabric8.kubernetes.client.informers.SharedInformerFactory; | |||||
| import io.fabric8.kubernetes.client.internal.SerializationUtils; | |||||
| import io.fabric8.kubernetes.internal.KubernetesDeserializer; | |||||
| import lombok.extern.slf4j.Slf4j; | |||||
| import org.onebrain.operator.controller.DistributeTrainController; | |||||
| import org.onebrain.operator.crd.DistributeTrain; | |||||
| import org.onebrain.operator.crd.DistributeTrainList; | |||||
| import org.onebrain.operator.crd.DoneableDistributeTrain; | |||||
| import org.onebrain.operator.utils.DistributeTrainClientHolder; | |||||
| import org.onebrain.operator.utils.SpringContextHolder; | |||||
| import org.springframework.beans.factory.annotation.Autowired; | |||||
| import org.springframework.beans.factory.support.BeanDefinitionBuilder; | |||||
| import org.springframework.beans.factory.support.DefaultListableBeanFactory; | |||||
| import org.springframework.context.ConfigurableApplicationContext; | |||||
| import org.springframework.stereotype.Component; | |||||
| import java.util.Map; | |||||
| import static org.onebrain.operator.constants.CrdConstants.*; | |||||
| /** | |||||
| * @description operator 主控制器 | |||||
| * @date 2020-09-23 | |||||
| */ | |||||
| @Component | |||||
| @Slf4j | |||||
| public class DistributeTrainOperatorManager { | |||||
| public static final String NAMESPACE_DEFAULT = "default"; | |||||
| public static final String TYPE_STRING = "string"; | |||||
| public static final String TYPE_INTEGER = "integer"; | |||||
| public static final String TYPE_OBJECT = "object"; | |||||
| public static final String TYPE_ARRAY = "array"; | |||||
| public static final String FORMAT_INT_32 = "int32"; | |||||
| @Autowired | |||||
| private KubernetesClient client; | |||||
| private CustomResourceDefinition crd; | |||||
| private String namespace; | |||||
| /** | |||||
| * 检查crd是否存在,若不存在则创建 | |||||
| * @throws JsonProcessingException | |||||
| */ | |||||
| public void createCrdIfNotExists() throws JsonProcessingException { | |||||
| String namespace = client.getNamespace(); | |||||
| if (namespace == null) { | |||||
| log.info("No namespace found via config, assuming default."); | |||||
| namespace = NAMESPACE_DEFAULT; | |||||
| } | |||||
| this.namespace = namespace; | |||||
| log.info("Using namespace : {}", namespace); | |||||
| //检查crd是否已存在 | |||||
| CustomResourceDefinition crd = client.customResourceDefinitions().withName(CRD_NAME).get(); | |||||
| if(crd == null){ | |||||
| Map<String, JSONSchemaProps> crdPropsMap = buildCrdProperties(); | |||||
| log.info("crd props map is : 【{}】",crdPropsMap); | |||||
| //如不存在,则创建 | |||||
| CustomResourceDefinition distributeTrainCustomResourceDefinition = new CustomResourceDefinitionBuilder() | |||||
| .withApiVersion(CRD_API_VERSION) | |||||
| .withNewMetadata() | |||||
| .withName(CRD_NAME) | |||||
| .endMetadata() | |||||
| .withNewSpec() | |||||
| .withGroup(CRD_GROUP) | |||||
| .withVersion(CRD_VERSION) | |||||
| .withScope(CRD_SCOPE) | |||||
| .withNewNames() | |||||
| .withKind(CRD_KIND) | |||||
| .withSingular(CRD_SINGULAR_NAME) | |||||
| .withPlural(CRD_PLURAL_NAME) | |||||
| .withShortNames(CRD_SHORT_NAME) | |||||
| .endNames() | |||||
| .withNewValidation() | |||||
| .withNewOpenAPIV3Schema() | |||||
| .addToProperties(crdPropsMap) | |||||
| .endOpenAPIV3Schema() | |||||
| .endValidation() | |||||
| .endSpec() | |||||
| .build(); | |||||
| distributeTrainCustomResourceDefinition = client.customResourceDefinitions().create(distributeTrainCustomResourceDefinition); | |||||
| log.info("create crd successfully : \n{}", SerializationUtils.dumpAsYaml(distributeTrainCustomResourceDefinition)); | |||||
| crd = distributeTrainCustomResourceDefinition; | |||||
| } | |||||
| //注册到k8s反序列化解析器 | |||||
| KubernetesDeserializer.registerCustomKind(CRD_GROUP + StrUtil.SLASH + CRD_VERSION, CRD_KIND, DistributeTrain.class); | |||||
| this.crd = crd; | |||||
| } | |||||
| /** | |||||
| * 初始化informer | |||||
| */ | |||||
| public void initInformer(){ | |||||
| CustomResourceDefinitionContext distributeTrainCustomResourceDefinitionContext = new CustomResourceDefinitionContext.Builder() | |||||
| .withVersion(CRD_VERSION) | |||||
| .withScope(CRD_SCOPE) | |||||
| .withGroup(CRD_GROUP) | |||||
| .withPlural(CRD_PLURAL_NAME) | |||||
| .build(); | |||||
| SharedInformerFactory informerFactory = client.informers(); | |||||
| MixedOperation<DistributeTrain, DistributeTrainList, DoneableDistributeTrain, Resource<DistributeTrain, DoneableDistributeTrain>> distributeTrainClient = client.customResources(this.crd, DistributeTrain.class, DistributeTrainList.class, DoneableDistributeTrain.class); | |||||
| SharedIndexInformer<DistributeTrain> distributeTrainSharedIndexInformer = informerFactory.sharedIndexInformerForCustomResource(distributeTrainCustomResourceDefinitionContext, DistributeTrain.class, DistributeTrainList.class, 10 * 60 * 1000); | |||||
| //使用静态变量维持 | |||||
| DistributeTrainClientHolder.setDistributeTrainClient(distributeTrainClient); | |||||
| //手动注册controller到ioc容器 | |||||
| BeanDefinitionBuilder beanDefinitionBuilder = BeanDefinitionBuilder.genericBeanDefinition(DistributeTrainController.class); | |||||
| DefaultListableBeanFactory beanFactory = (DefaultListableBeanFactory)((ConfigurableApplicationContext) SpringContextHolder.applicationContext).getBeanFactory(); | |||||
| beanDefinitionBuilder.addConstructorArgValue(distributeTrainClient); | |||||
| beanDefinitionBuilder.addConstructorArgValue(distributeTrainSharedIndexInformer); | |||||
| beanDefinitionBuilder.addConstructorArgValue(namespace); | |||||
| beanFactory.registerBeanDefinition("org.onebrain.operator.controller.DistributeTrainController", beanDefinitionBuilder.getRawBeanDefinition()); | |||||
| //取得托管的controller | |||||
| DistributeTrainController controller = SpringContextHolder.getBean(DistributeTrainController.class); | |||||
| //注册informer监听 | |||||
| controller.create(); | |||||
| informerFactory.startAllRegisteredInformers(); | |||||
| //等待就绪 | |||||
| controller.run(); | |||||
| } | |||||
| /** | |||||
| * 生成crd属性 | |||||
| * @return crd属性集合 | |||||
| */ | |||||
| private Map<String, JSONSchemaProps> buildCrdProperties(){ | |||||
| Map<String, JSONSchemaProps> properties = Maps.newHashMap(); | |||||
| JSONSchemaProps stringType = new JSONSchemaPropsBuilder() | |||||
| .withType(TYPE_STRING) | |||||
| .build(); | |||||
| JSONSchemaProps intType = new JSONSchemaPropsBuilder() | |||||
| .withType(TYPE_INTEGER) | |||||
| .withFormat(FORMAT_INT_32) | |||||
| .build(); | |||||
| JSONSchemaProps objectType = new JSONSchemaPropsBuilder() | |||||
| .withType(TYPE_OBJECT) | |||||
| .build(); | |||||
| JSONSchemaProps arrayType = new JSONSchemaPropsBuilder() | |||||
| .withType(TYPE_ARRAY) | |||||
| .withNewItems() | |||||
| .endItems() | |||||
| .build(); | |||||
| //添加属性校验规则 | |||||
| JSONSchemaProps specObjectType = new JSONSchemaPropsBuilder() | |||||
| .addToProperties("image", stringType) | |||||
| .addToProperties("imagePullPolicy", stringType) | |||||
| .addToProperties("size", intType) | |||||
| .addToProperties("env", arrayType) | |||||
| .addToProperties("masterCmd", stringType) | |||||
| .addToProperties("slaveCmd", stringType) | |||||
| .addToProperties("masterResources", objectType) | |||||
| .addToProperties("slaveResources", objectType) | |||||
| .addToProperties("nodeSelector", objectType) | |||||
| .addToProperties("initContainer", objectType) | |||||
| .addToProperties("datasetStorage", objectType) | |||||
| .addToProperties("workspaceStorage", objectType) | |||||
| .addToProperties("modelStorage", objectType) | |||||
| .withType("object") | |||||
| .addToRequired("image", "imagePullPolicy", "size", "masterCmd", "slaveCmd", "workspaceStorage") | |||||
| .build(); | |||||
| properties.put("apiVersion", stringType); | |||||
| properties.put("kind", stringType); | |||||
| properties.put("metadata", objectType); | |||||
| properties.put("spec", specObjectType); | |||||
| return properties; | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,58 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.action; | |||||
| import lombok.extern.slf4j.Slf4j; | |||||
| import org.onebrain.operator.watcher.KubeWatcherManager; | |||||
| import org.springframework.beans.factory.annotation.Autowired; | |||||
| import org.springframework.boot.ApplicationArguments; | |||||
| import org.springframework.boot.ApplicationRunner; | |||||
| import org.springframework.stereotype.Component; | |||||
| /** | |||||
| * @description Operator运行入口 | |||||
| * @date 2020-09-23 | |||||
| */ | |||||
| @Component | |||||
| @Slf4j | |||||
| public class OperatorRunner implements ApplicationRunner { | |||||
| @Autowired | |||||
| private DistributeTrainOperatorManager operatorManager; | |||||
| @Autowired | |||||
| private KubeWatcherManager watcherManager; | |||||
| /** | |||||
| * spring 容器完全启动后 注册operator运行逻辑 | |||||
| * @param args | |||||
| * @throws Exception | |||||
| */ | |||||
| @Override | |||||
| public void run(ApplicationArguments args) throws Exception { | |||||
| //检查crd是否已存在,如果不存在则创建 | |||||
| operatorManager.createCrdIfNotExists(); | |||||
| //job监控者启动 | |||||
| watcherManager.startWatching(); | |||||
| log.info("job watcher is running"); | |||||
| //初始化informer | |||||
| operatorManager.initInformer(); | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,44 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.action; | |||||
| import lombok.AllArgsConstructor; | |||||
| import lombok.Builder; | |||||
| import lombok.Data; | |||||
| import lombok.NoArgsConstructor; | |||||
| /** | |||||
| * @description pod信息类 | |||||
| * @date 2020-09-23 | |||||
| */ | |||||
| @Data | |||||
| @NoArgsConstructor | |||||
| @AllArgsConstructor | |||||
| @Builder | |||||
| public class PodInfo { | |||||
| /** | |||||
| * ip地址 | |||||
| */ | |||||
| private String ip; | |||||
| /** | |||||
| * 角色 | |||||
| */ | |||||
| private String role; | |||||
| } | |||||
| @@ -0,0 +1,41 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.action.deployer; | |||||
| import cn.hutool.core.util.RandomUtil; | |||||
| import lombok.Data; | |||||
| import lombok.experimental.Accessors; | |||||
| /** | |||||
| * @description 创建资源的信息的抽象类 | |||||
| * @date 2020-04-30 | |||||
| */ | |||||
| @Data | |||||
| @Accessors(chain = true) | |||||
| public abstract class AbstractResourceCreateInfo { | |||||
| /** | |||||
| * 生成随机字符串 | |||||
| * @param digits 位数 | |||||
| * @return | |||||
| */ | |||||
| protected static String getRandomStr(Integer digits){ | |||||
| return RandomUtil.randomString(digits); | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,227 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.action.deployer; | |||||
| import cn.hutool.core.collection.CollectionUtil; | |||||
| import cn.hutool.core.util.StrUtil; | |||||
| import io.fabric8.kubernetes.api.model.*; | |||||
| import lombok.Data; | |||||
| import lombok.experimental.Accessors; | |||||
| import org.onebrain.operator.constants.KubeConstants; | |||||
| import org.onebrain.operator.constants.NumberConstant; | |||||
| import org.onebrain.operator.crd.DistributeTrain; | |||||
| import java.util.List; | |||||
| import java.util.Map; | |||||
| import java.util.Optional; | |||||
| import java.util.stream.Collectors; | |||||
| /** | |||||
| * @description 暂存创建子资源所需的信息 | |||||
| * @date 2020-06-16 | |||||
| */ | |||||
| @Data | |||||
| @Accessors(chain = true) | |||||
| public class ChildResourceCreateInfo extends AbstractResourceCreateInfo { | |||||
| public static final String SLAVE_TEMPLATE = "{}-slave-{}"; | |||||
| public static final String MASTER_TEMPLATE = "{}-master-{}"; | |||||
| public static final String SVC_TEMPLATE = "{}-svc"; | |||||
| /** | |||||
| * 父级名称(分布式训练名称) | |||||
| */ | |||||
| private String parentName; | |||||
| /** | |||||
| * job名称 | |||||
| */ | |||||
| private String jobName; | |||||
| /** | |||||
| * statefullSet名称 | |||||
| */ | |||||
| private String statefulSetName; | |||||
| /** | |||||
| * 服务名称 | |||||
| */ | |||||
| private String svcName; | |||||
| /** | |||||
| * 命名空间 | |||||
| */ | |||||
| private String namespace; | |||||
| /** | |||||
| * 镜像 | |||||
| */ | |||||
| private String image; | |||||
| /** | |||||
| * 镜像拉取策略 | |||||
| */ | |||||
| private String imagePullPolicy; | |||||
| /** | |||||
| * 标签 | |||||
| */ | |||||
| private Map<String, String> labels; | |||||
| /** | |||||
| * master副本数 | |||||
| */ | |||||
| private Integer masterReplicas; | |||||
| /** | |||||
| * slave副本数 | |||||
| */ | |||||
| private Integer slaveReplicas; | |||||
| /** | |||||
| * master命令 | |||||
| */ | |||||
| private String masterCmd; | |||||
| /** | |||||
| * slave命令 | |||||
| */ | |||||
| private String slaveCmd; | |||||
| /** | |||||
| * master 资源节点限制 | |||||
| */ | |||||
| private ResourceRequirements masterResources; | |||||
| /** | |||||
| * slave 资源节点限制 | |||||
| */ | |||||
| private ResourceRequirements slaveResources; | |||||
| /** | |||||
| * 节点调度选择器 | |||||
| */ | |||||
| private Map<String, String> nodeSelector; | |||||
| /** | |||||
| * 初始化容器 | |||||
| */ | |||||
| private Container initContainer; | |||||
| /** | |||||
| * 工作目录挂载 | |||||
| */ | |||||
| private Volume workspaceVolume; | |||||
| /** | |||||
| * 数据集目录挂载 | |||||
| */ | |||||
| private Volume datasetVolume; | |||||
| /** | |||||
| * 模型目录挂载 | |||||
| */ | |||||
| private Volume modelVolume; | |||||
| /** | |||||
| * 环境变量 | |||||
| */ | |||||
| private List<EnvVar> env; | |||||
| /** | |||||
| * 拥有者信息 | |||||
| */ | |||||
| private OwnerReference ownerReference; | |||||
| /** | |||||
| * 将分布式训练转换为K8S的资源信息 | |||||
| * @param distributeTrain 分布式训练 | |||||
| * @return ChildResourceCreateInfo | |||||
| */ | |||||
| public static ChildResourceCreateInfo fromCr(DistributeTrain distributeTrain){ | |||||
| ChildResourceCreateInfo info = new ChildResourceCreateInfo(); | |||||
| //ownerReferece信息 | |||||
| info.generateOwnerReference(distributeTrain); | |||||
| //各种资源的名称 | |||||
| info.setNamespace(distributeTrain.getMetadata().getNamespace()); | |||||
| info.setParentName(distributeTrain.getMetadata().getName()); | |||||
| info.generateResoureName(); | |||||
| //标签 | |||||
| info.setLabels(distributeTrain.getMetadata().getLabels()); | |||||
| //镜像 | |||||
| info.setImage(distributeTrain.getSpec().getImage()) | |||||
| .setImagePullPolicy(distributeTrain.getSpec().getImagePullPolicy()); | |||||
| //副本数 | |||||
| Integer size = distributeTrain.getSpec().getSize(); | |||||
| info.setMasterReplicas(NumberConstant.NUMBER_1); | |||||
| info.setSlaveReplicas(size - NumberConstant.NUMBER_1); | |||||
| //命令行 | |||||
| info.setMasterCmd(distributeTrain.getSpec().getMasterCmd()) | |||||
| .setSlaveCmd(distributeTrain.getSpec().getSlaveCmd()); | |||||
| //挂载 | |||||
| Optional.ofNullable(distributeTrain.getSpec().getWorkspaceStorage()) | |||||
| .ifPresent(v -> info.setWorkspaceVolume(v)); | |||||
| Optional.ofNullable(distributeTrain.getSpec().getDatasetStorage()) | |||||
| .ifPresent(v -> info.setDatasetVolume(v)); | |||||
| Optional.ofNullable(distributeTrain.getSpec().getModelStorage()) | |||||
| .ifPresent(v -> info.setModelVolume(v)); | |||||
| //主从两组资源限制 | |||||
| Optional.ofNullable(distributeTrain.getSpec().getMasterResources()) | |||||
| .ifPresent(v -> info.setMasterResources(v)); | |||||
| Optional.ofNullable(distributeTrain.getSpec().getSlaveResources()) | |||||
| .ifPresent(v -> info.setSlaveResources(v)); | |||||
| //环境变量 | |||||
| List<EnvVar> env = distributeTrain.getSpec().getEnv(); | |||||
| if(CollectionUtil.isNotEmpty(env)){ | |||||
| env = env.stream().filter(e -> !KubeConstants.ENV_NODE_NUM.equals(e.getName())).collect(Collectors.toList()); | |||||
| info.setEnv(env); | |||||
| } | |||||
| //node调度 | |||||
| info.setNodeSelector(distributeTrain.getSpec().getNodeSelector()); | |||||
| //init-container | |||||
| info.setInitContainer(distributeTrain.getSpec().getInitContainer()); | |||||
| return info; | |||||
| } | |||||
| /** | |||||
| * 生成资源名称 | |||||
| */ | |||||
| private void generateResoureName(){ | |||||
| String suffix = getRandomStr(NumberConstant.NUMBER_5); | |||||
| this.statefulSetName = StrUtil.format(SLAVE_TEMPLATE, this.parentName, suffix); | |||||
| this.jobName = StrUtil.format(MASTER_TEMPLATE, this.parentName, suffix); | |||||
| this.svcName = StrUtil.format(SVC_TEMPLATE, this.parentName); | |||||
| } | |||||
| /** | |||||
| * 生成所有者信息 | |||||
| * @param distributeTrain 分布式训练 | |||||
| */ | |||||
| private void generateOwnerReference(DistributeTrain distributeTrain){ | |||||
| this.ownerReference = new OwnerReferenceBuilder() | |||||
| .withApiVersion(distributeTrain.getApiVersion()) | |||||
| .withKind(distributeTrain.getKind()) | |||||
| .withName(distributeTrain.getMetadata().getName()) | |||||
| .withNewUid(distributeTrain.getMetadata().getUid()) | |||||
| .build(); | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,35 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.action.deployer; | |||||
| import io.fabric8.kubernetes.api.model.batch.JobBuilder; | |||||
| /** | |||||
| * @description Job部署接口 规范部署方法 | |||||
| * T 必须是AbstractResourceCreateInfo 的子类型 | |||||
| * @date 2020-09-23 | |||||
| */ | |||||
| public interface JobDeployer<T extends AbstractResourceCreateInfo> { | |||||
| /** | |||||
| * 构建 Job信息 | |||||
| * @param info 资源信息 | |||||
| * @return Job构建者 | |||||
| */ | |||||
| JobBuilder deploy(T info); | |||||
| } | |||||
| @@ -0,0 +1,33 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.action.deployer; | |||||
| import io.fabric8.kubernetes.api.model.ServiceBuilder; | |||||
| /** | |||||
| * @description service部署器接口 | |||||
| * @date 2020-09-23 | |||||
| */ | |||||
| public interface ServiceDeployer<T extends AbstractResourceCreateInfo> { | |||||
| /** | |||||
| * 构建service信息 | |||||
| * @param info 资源信息 | |||||
| * @return 服务构建者 | |||||
| */ | |||||
| ServiceBuilder deploy(T info); | |||||
| } | |||||
| @@ -0,0 +1,33 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.action.deployer; | |||||
| import io.fabric8.kubernetes.api.model.apps.StatefulSetBuilder; | |||||
| /** | |||||
| * @description statefulset部署器接口 | |||||
| * @date 2020-09-23 | |||||
| */ | |||||
| public interface StatefulSetDeployer<T extends AbstractResourceCreateInfo> { | |||||
| /** | |||||
| * 构建service信息 | |||||
| * @param info 资源信息 | |||||
| * @return StatefulSet构建者 | |||||
| */ | |||||
| StatefulSetBuilder deploy(T info); | |||||
| } | |||||
| @@ -0,0 +1,246 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.action.deployer.impl; | |||||
| import cn.hutool.core.collection.CollectionUtil; | |||||
| import com.google.common.collect.Lists; | |||||
| import io.fabric8.kubernetes.api.model.CapabilitiesBuilder; | |||||
| import io.fabric8.kubernetes.api.model.Container; | |||||
| import io.fabric8.kubernetes.api.model.ContainerPortBuilder; | |||||
| import io.fabric8.kubernetes.api.model.EnvVar; | |||||
| import io.fabric8.kubernetes.api.model.EnvVarBuilder; | |||||
| import io.fabric8.kubernetes.api.model.SecurityContextBuilder; | |||||
| import io.fabric8.kubernetes.api.model.Volume; | |||||
| import io.fabric8.kubernetes.api.model.VolumeBuilder; | |||||
| import io.fabric8.kubernetes.api.model.VolumeMount; | |||||
| import io.fabric8.kubernetes.api.model.VolumeMountBuilder; | |||||
| import io.fabric8.kubernetes.api.model.batch.JobBuilder; | |||||
| import org.onebrain.operator.action.deployer.ChildResourceCreateInfo; | |||||
| import org.onebrain.operator.action.deployer.JobDeployer; | |||||
| import org.onebrain.operator.constants.KubeConstants; | |||||
| import java.util.*; | |||||
| import static org.onebrain.operator.constants.NumberConstant.LONG_NUMBER_0; | |||||
| import static org.onebrain.operator.constants.NumberConstant.NUMBER_1; | |||||
| import static org.onebrain.operator.constants.NumberConstant.NUMBER_22; | |||||
| /** | |||||
| * @description Job部署器 | |||||
| * @date 2020-09-23 | |||||
| */ | |||||
| public class BaseJobDeployer implements JobDeployer<ChildResourceCreateInfo> { | |||||
| public static final String PVC_WORKSPACE = "pvc-workspace"; | |||||
| public static final String SSH = "ssh"; | |||||
| public static final String WORKSPACE = "/workspace"; | |||||
| public static final String PVC_DATASET = "pvc-dataset"; | |||||
| public static final String DATASET = "/dataset"; | |||||
| public static final String PVC_MODEL = "pvc-model"; | |||||
| public static final String MODEL = "/model"; | |||||
| public static final String MEMORY = "Memory"; | |||||
| public static final String DEV_SHM = "/dev/shm"; | |||||
| public static final String BIN_BASH = "/bin/bash"; | |||||
| public static final String IPC_LOCK = "IPC_LOCK"; | |||||
| public static final String RESTART_POLICY_NEVER = "Never"; | |||||
| /** | |||||
| * 部署Job | |||||
| * @param info 资源信息 | |||||
| * @return | |||||
| */ | |||||
| @Override | |||||
| public JobBuilder deploy(ChildResourceCreateInfo info) { | |||||
| //容器 | |||||
| Container container = buildContainer(info); | |||||
| //存储卷 | |||||
| List<Volume> volumes = buildVolumes(info); | |||||
| //挂载 | |||||
| List<VolumeMount> volumeMounts = buildVolumeMounts(volumes); | |||||
| container.setVolumeMounts(volumeMounts); | |||||
| //启动命令 | |||||
| container.setCommand(Collections.singletonList(BIN_BASH)); | |||||
| //训练等待命令 | |||||
| //一个是等待 pretreatment 文件 通过 podApi 拷贝 到pod上 | |||||
| //另一个是等待 服务(svc)创建成功 | |||||
| List<String> cmdLines = Arrays.asList("while [ ! -f /home/pretreatment ]; do echo pretreatment not exist >> pretreatment.log; sleep 1;done && chmod a+x /home/pretreatment && bash /home/pretreatment ", "until nslookup " + info.getSvcName() + "; do sleep 5; done", info.getMasterCmd()); | |||||
| container.setArgs(Arrays.asList("-c", CollectionUtil.join(cmdLines, " && "))); | |||||
| //权限 | |||||
| container.setSecurityContext(new SecurityContextBuilder() | |||||
| .withAllowPrivilegeEscalation(true) | |||||
| .withCapabilities(new CapabilitiesBuilder() | |||||
| .withAdd(Collections.singletonList(IPC_LOCK)) | |||||
| .build()) | |||||
| .build()); | |||||
| //用户自定义的标签 | |||||
| Map<String,String> customizeLabels = CollectionUtil.isNotEmpty(info.getLabels())? info.getLabels(): new HashMap<>(); | |||||
| JobBuilder builder = new JobBuilder(); | |||||
| builder.withNewMetadata() | |||||
| .withName(info.getJobName()) | |||||
| .withNamespace(info.getNamespace()) | |||||
| .addToLabels(KubeConstants.DISTRIBUTE_TRAIN_LABEL, info.getParentName()) | |||||
| .addToLabels(customizeLabels) | |||||
| .addToOwnerReferences(info.getOwnerReference()) | |||||
| .endMetadata() | |||||
| .withNewSpec() | |||||
| //并行1个 | |||||
| .withParallelism(NUMBER_1) | |||||
| //共计运行1次 | |||||
| .withCompletions(NUMBER_1) | |||||
| //失败重试次数 | |||||
| .withBackoffLimit(KubeConstants.BACKOFFLIMIT) | |||||
| .withNewTemplate() | |||||
| .withNewMetadata() | |||||
| .withName(info.getJobName()) | |||||
| .addToLabels(KubeConstants.DISTRIBUTE_TRAIN_LABEL, info.getParentName()) | |||||
| .addToLabels(KubeConstants.JOB_LABEL, info.getJobName()) | |||||
| .addToLabels(customizeLabels) | |||||
| .endMetadata() | |||||
| .withNewSpec() | |||||
| //关闭指令发出时 立即执行 | |||||
| .withTerminationGracePeriodSeconds(LONG_NUMBER_0) | |||||
| .addToContainers(container) | |||||
| .addToVolumes(volumes.toArray(new Volume[volumes.size()])) | |||||
| .withRestartPolicy(RESTART_POLICY_NEVER) | |||||
| .endSpec() | |||||
| .endTemplate() | |||||
| .endSpec(); | |||||
| //init-container | |||||
| JobBuilder finalBuilder = builder; | |||||
| Optional.ofNullable(info.getInitContainer()) | |||||
| .ifPresent(initContainer -> { | |||||
| finalBuilder.editSpec() | |||||
| .editTemplate() | |||||
| .editSpec() | |||||
| .addToInitContainers(initContainer) | |||||
| .endSpec() | |||||
| .endTemplate() | |||||
| .endSpec(); | |||||
| }); | |||||
| //固定节点调度 | |||||
| if(CollectionUtil.isNotEmpty(info.getNodeSelector())){ | |||||
| builder = builder.editSpec() | |||||
| .editTemplate().editSpec() | |||||
| .addToNodeSelector(info.getNodeSelector()) | |||||
| .endSpec().endTemplate() | |||||
| .endSpec(); | |||||
| } | |||||
| return builder; | |||||
| } | |||||
| /** | |||||
| * 构建容器 | |||||
| * @param info 资源信息 | |||||
| * @return 容器信息 | |||||
| */ | |||||
| private Container buildContainer(ChildResourceCreateInfo info){ | |||||
| //容器 | |||||
| Container container = new Container(); | |||||
| //镜像 | |||||
| container.setName(KubeConstants.MASTER_CONTAINER_NAME); | |||||
| container.setImage(info.getImage()); | |||||
| container.setImagePullPolicy(info.getImagePullPolicy()); | |||||
| //端口映射 | |||||
| container.setPorts(Arrays.asList(new ContainerPortBuilder() | |||||
| .withContainerPort(NUMBER_22) | |||||
| .withName(SSH).build())); | |||||
| //环境变量 | |||||
| List<EnvVar> envVars = Lists.newArrayList(new EnvVarBuilder() | |||||
| .withName(KubeConstants.ENV_NODE_NUM) | |||||
| .withValue(String.valueOf(info.getSlaveReplicas() + info.getMasterReplicas())) | |||||
| .build()); | |||||
| Optional.ofNullable(info.getEnv()).ifPresent(v -> envVars.addAll(v)); | |||||
| container.setEnv(envVars); | |||||
| //资源限制 | |||||
| Optional.ofNullable(info.getMasterResources()).ifPresent(v->container.setResources(v)); | |||||
| return container; | |||||
| } | |||||
| /** | |||||
| * 构建存储卷集合 | |||||
| * @param info 资源信息 | |||||
| * @return 存储卷集合 | |||||
| */ | |||||
| private List<Volume> buildVolumes(ChildResourceCreateInfo info){ | |||||
| //存储卷 | |||||
| List<Volume> volumes = new LinkedList<>(); | |||||
| Optional.ofNullable(info.getWorkspaceVolume()).ifPresent(v-> volumes.add(v)); | |||||
| Optional.ofNullable(info.getDatasetVolume()).ifPresent(v-> volumes.add(v)); | |||||
| Optional.ofNullable(info.getModelVolume()).ifPresent(v-> volumes.add(v)); | |||||
| //shm默认就有 | |||||
| volumes.add(new VolumeBuilder() | |||||
| .withName(KubeConstants.VOLUME_SHM) | |||||
| .withNewEmptyDir() | |||||
| .withMedium(MEMORY) | |||||
| .endEmptyDir() | |||||
| .build()); | |||||
| return volumes; | |||||
| } | |||||
| /** | |||||
| * 构建挂载存储卷集合 | |||||
| * @param volumes 存储卷集合 | |||||
| * @return 构建挂载存储卷集合 | |||||
| */ | |||||
| private List<VolumeMount> buildVolumeMounts(List<Volume> volumes) { | |||||
| List<VolumeMount> volumeMounts = new LinkedList<>(); | |||||
| for (Volume volume : volumes) { | |||||
| if(PVC_WORKSPACE.equals(volume.getName())){ | |||||
| volumeMounts.add(new VolumeMountBuilder() | |||||
| .withName(volume.getName()) | |||||
| .withMountPath(WORKSPACE) | |||||
| .build()); | |||||
| continue; | |||||
| } | |||||
| if(PVC_DATASET.equals(volume.getName())){ | |||||
| volumeMounts.add(new VolumeMountBuilder() | |||||
| .withName(volume.getName()) | |||||
| .withMountPath(DATASET) | |||||
| .build()); | |||||
| continue; | |||||
| } | |||||
| if(PVC_MODEL.equals(volume.getName())){ | |||||
| volumeMounts.add(new VolumeMountBuilder() | |||||
| .withName(volume.getName()) | |||||
| .withMountPath(MODEL) | |||||
| .build()); | |||||
| continue; | |||||
| } | |||||
| } | |||||
| volumeMounts.add(new VolumeMountBuilder() | |||||
| .withName(KubeConstants.VOLUME_SHM) | |||||
| .withMountPath(DEV_SHM) | |||||
| .build()); | |||||
| return volumeMounts; | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,73 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.action.deployer.impl; | |||||
| import cn.hutool.core.collection.CollectionUtil; | |||||
| import io.fabric8.kubernetes.api.model.IntOrString; | |||||
| import io.fabric8.kubernetes.api.model.ServiceBuilder; | |||||
| import org.onebrain.operator.action.deployer.ChildResourceCreateInfo; | |||||
| import org.onebrain.operator.action.deployer.ServiceDeployer; | |||||
| import org.onebrain.operator.constants.KubeConstants; | |||||
| import java.util.Collections; | |||||
| import java.util.HashMap; | |||||
| import java.util.Map; | |||||
| import static org.onebrain.operator.constants.NumberConstant.NUMBER_22; | |||||
| import static org.onebrain.operator.constants.NumberConstant.NUMBER_30000; | |||||
| /** | |||||
| * @description Service部署器 | |||||
| * @date 2020-09-23 | |||||
| */ | |||||
| public class BaseServiceDeployer implements ServiceDeployer<ChildResourceCreateInfo> { | |||||
| public static final String WEB_SSH = "web-ssh"; | |||||
| public static final String NONE = "None"; | |||||
| /** | |||||
| * 构建service信息 | |||||
| * @param info 资源信息 | |||||
| * @return | |||||
| */ | |||||
| @Override | |||||
| public ServiceBuilder deploy(ChildResourceCreateInfo info) { | |||||
| //用户自定义的标签 | |||||
| Map<String,String> customizeLabels = CollectionUtil.isNotEmpty(info.getLabels())? info.getLabels(): new HashMap<>(); | |||||
| return new ServiceBuilder() | |||||
| .withNewMetadata() | |||||
| .withName(info.getSvcName()) | |||||
| .addToLabels(KubeConstants.DISTRIBUTE_TRAIN_LABEL, info.getParentName()) | |||||
| .addToLabels(customizeLabels) | |||||
| .withNamespace(info.getNamespace()) | |||||
| .addToOwnerReferences(info.getOwnerReference()) | |||||
| .endMetadata() | |||||
| .withNewSpec() | |||||
| .addNewPort() | |||||
| .withPort(NUMBER_30000) | |||||
| .withTargetPort(new IntOrString(NUMBER_22)) | |||||
| .withName(WEB_SSH) | |||||
| .endPort() | |||||
| .withClusterIP(NONE) | |||||
| //选择带有分布式训练的节点 | |||||
| .withSelector(Collections.singletonMap(KubeConstants.DISTRIBUTE_TRAIN_LABEL, info.getParentName())) | |||||
| .endSpec(); | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,246 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.action.deployer.impl; | |||||
| import cn.hutool.core.collection.CollectionUtil; | |||||
| import com.google.common.collect.ImmutableMap; | |||||
| import com.google.common.collect.Lists; | |||||
| import io.fabric8.kubernetes.api.model.CapabilitiesBuilder; | |||||
| import io.fabric8.kubernetes.api.model.Container; | |||||
| import io.fabric8.kubernetes.api.model.ContainerPortBuilder; | |||||
| import io.fabric8.kubernetes.api.model.EnvVar; | |||||
| import io.fabric8.kubernetes.api.model.EnvVarBuilder; | |||||
| import io.fabric8.kubernetes.api.model.LabelSelector; | |||||
| import io.fabric8.kubernetes.api.model.SecurityContextBuilder; | |||||
| import io.fabric8.kubernetes.api.model.Volume; | |||||
| import io.fabric8.kubernetes.api.model.VolumeBuilder; | |||||
| import io.fabric8.kubernetes.api.model.VolumeMount; | |||||
| import io.fabric8.kubernetes.api.model.VolumeMountBuilder; | |||||
| import io.fabric8.kubernetes.api.model.apps.StatefulSetBuilder; | |||||
| import org.onebrain.operator.action.deployer.ChildResourceCreateInfo; | |||||
| import org.onebrain.operator.action.deployer.StatefulSetDeployer; | |||||
| import org.onebrain.operator.constants.KubeConstants; | |||||
| import java.util.Arrays; | |||||
| import java.util.Collections; | |||||
| import java.util.HashMap; | |||||
| import java.util.LinkedList; | |||||
| import java.util.List; | |||||
| import java.util.Map; | |||||
| import java.util.Optional; | |||||
| import static org.onebrain.operator.constants.NumberConstant.LONG_NUMBER_0; | |||||
| import static org.onebrain.operator.constants.NumberConstant.LONG_NUMBER_60; | |||||
| import static org.onebrain.operator.constants.NumberConstant.NUMBER_22; | |||||
| /** | |||||
| * @description StatefullSet部署器 | |||||
| * @date 2020-09-23 | |||||
| */ | |||||
| public class BaseStatefulSetDeployer implements StatefulSetDeployer<ChildResourceCreateInfo> { | |||||
| public static final String SSH = "ssh"; | |||||
| public static final String PVC_WORKSPACE = "pvc-workspace"; | |||||
| public static final String WORKSPACE = "/workspace"; | |||||
| public static final String PVC_DATASET = "pvc-dataset"; | |||||
| public static final String DATASET = "/dataset"; | |||||
| public static final String PVC_MODEL = "pvc-model"; | |||||
| public static final String MODEL = "/model"; | |||||
| public static final String MEMORY = "Memory"; | |||||
| public static final String DEV_SHM = "/dev/shm"; | |||||
| public static final String BIN_BASH = "/bin/bash"; | |||||
| public static final String IPC_LOCK = "IPC_LOCK"; | |||||
| /** | |||||
| * 生成 StatefullSet 信息 | |||||
| * @param info 资源信息 | |||||
| * @return | |||||
| */ | |||||
| @Override | |||||
| public StatefulSetBuilder deploy(ChildResourceCreateInfo info) { | |||||
| //标签筛选 | |||||
| LabelSelector labelSelector = new LabelSelector(); | |||||
| labelSelector.setMatchLabels(ImmutableMap.of(KubeConstants.STATEFULSET_LABEL, info.getStatefulSetName())); | |||||
| //存储卷 | |||||
| List<Volume> volumes = buildVolumes(info); | |||||
| //容器 | |||||
| Container container = buildContainer(info); | |||||
| //挂载 | |||||
| List<VolumeMount> volumeMounts = buildVolumeMounts(volumes); | |||||
| container.setVolumeMounts(volumeMounts); | |||||
| //启动命令 | |||||
| List<String> cmdLines = Arrays.asList("while [ ! -f /home/pretreatment ]; do echo pretreatment not exist >> pretreatment.log; sleep 1;done && chmod a+x /home/pretreatment && bash /home/pretreatment ", "until nslookup " + info.getSvcName() + "; do sleep 5; done", info.getSlaveCmd()); | |||||
| container.setCommand(Collections.singletonList(BIN_BASH)); | |||||
| container.setArgs(Arrays.asList("-c", CollectionUtil.join(cmdLines, " && "))); | |||||
| //权限 | |||||
| container.setSecurityContext(new SecurityContextBuilder() | |||||
| .withAllowPrivilegeEscalation(true) | |||||
| // .withPrivileged(true) | |||||
| .withCapabilities(new CapabilitiesBuilder() | |||||
| .withAdd(Collections.singletonList(IPC_LOCK)) | |||||
| .build()) | |||||
| .build()); | |||||
| //用户自定义的标签 | |||||
| Map<String,String> customizeLabels = CollectionUtil.isNotEmpty(info.getLabels())? info.getLabels(): new HashMap<>(); | |||||
| StatefulSetBuilder builder = new StatefulSetBuilder(); | |||||
| builder.withNewMetadata() | |||||
| .withName(info.getStatefulSetName()) | |||||
| .withNamespace(info.getNamespace()) | |||||
| .addToOwnerReferences(info.getOwnerReference()) | |||||
| .addToLabels(KubeConstants.DISTRIBUTE_TRAIN_LABEL, info.getParentName()) | |||||
| .endMetadata() | |||||
| .withNewSpec() | |||||
| .withSelector(labelSelector) | |||||
| .withServiceName(info.getStatefulSetName()) | |||||
| .withReplicas(info.getSlaveReplicas()) | |||||
| .withNewTemplate() | |||||
| .withNewMetadata() | |||||
| .withName(info.getStatefulSetName()) | |||||
| .addToLabels(KubeConstants.DISTRIBUTE_TRAIN_LABEL, info.getParentName()) | |||||
| .addToLabels(KubeConstants.STATEFULSET_LABEL, info.getStatefulSetName()) | |||||
| .addToLabels(customizeLabels) | |||||
| .endMetadata() | |||||
| .withNewSpec() | |||||
| .withTerminationGracePeriodSeconds(LONG_NUMBER_0) | |||||
| .withTerminationGracePeriodSeconds(LONG_NUMBER_60) | |||||
| .addToContainers(container) | |||||
| .addToVolumes(volumes.toArray(new Volume[0])) | |||||
| .endSpec() | |||||
| .endTemplate() | |||||
| .endSpec(); | |||||
| //init-container | |||||
| StatefulSetBuilder finalBuilder = builder; | |||||
| Optional.ofNullable(info.getInitContainer()) | |||||
| .ifPresent(initContainer -> { | |||||
| finalBuilder.editSpec() | |||||
| .editTemplate() | |||||
| .editSpec() | |||||
| .addToInitContainers(initContainer) | |||||
| .endSpec() | |||||
| .endTemplate() | |||||
| .endSpec(); | |||||
| }); | |||||
| //固定节点调度 | |||||
| if(CollectionUtil.isNotEmpty(info.getNodeSelector())){ | |||||
| builder = builder.editSpec() | |||||
| .editTemplate().editSpec() | |||||
| .addToNodeSelector(info.getNodeSelector()) | |||||
| .endSpec().endTemplate() | |||||
| .endSpec(); | |||||
| } | |||||
| return builder; | |||||
| } | |||||
| /** | |||||
| * 构建容器 | |||||
| * @param info 资源信息 | |||||
| * @return 容器信息 | |||||
| */ | |||||
| private Container buildContainer(ChildResourceCreateInfo info) { | |||||
| Container container = new Container(); | |||||
| //镜像 | |||||
| container.setName(KubeConstants.SLAVE_CONTAINER_NAME); | |||||
| container.setImage(info.getImage()); | |||||
| container.setImagePullPolicy(info.getImagePullPolicy()); | |||||
| //端口映射 | |||||
| container.setPorts(Arrays.asList(new ContainerPortBuilder() | |||||
| .withContainerPort(NUMBER_22) | |||||
| .withName(SSH).build())); | |||||
| //环境变量 | |||||
| List<EnvVar> envVars = Lists.newArrayList(new EnvVarBuilder() | |||||
| .withName(KubeConstants.ENV_NODE_NUM) | |||||
| .withValue(String.valueOf(info.getSlaveReplicas() + info.getMasterReplicas())) | |||||
| .build()); | |||||
| Optional.ofNullable(info.getEnv()).ifPresent(v -> envVars.addAll(v)); | |||||
| container.setEnv(envVars); | |||||
| //资源限制 | |||||
| Optional.ofNullable(info.getSlaveResources()).ifPresent(v -> container.setResources(v)); | |||||
| return container; | |||||
| } | |||||
| /** | |||||
| * 构建存储卷集合 | |||||
| * @param info 资源信息 | |||||
| * @return 存储卷集合 | |||||
| */ | |||||
| private List<Volume> buildVolumes(ChildResourceCreateInfo info) { | |||||
| List<Volume> volumes = buildVolumes(info); | |||||
| Optional.ofNullable(info.getWorkspaceVolume()).ifPresent(v-> volumes.add(v)); | |||||
| Optional.ofNullable(info.getDatasetVolume()).ifPresent(v-> volumes.add(v)); | |||||
| Optional.ofNullable(info.getModelVolume()).ifPresent(v-> volumes.add(v)); | |||||
| //shm默认就有 | |||||
| volumes.add(new VolumeBuilder() | |||||
| .withName(KubeConstants.VOLUME_SHM) | |||||
| .withNewEmptyDir() | |||||
| .withMedium(MEMORY) | |||||
| .endEmptyDir() | |||||
| .build()); | |||||
| return volumes; | |||||
| } | |||||
| /** | |||||
| * 构建挂载存储卷集合 | |||||
| * @param volumes 存储卷集合 | |||||
| * @return 构建挂载存储卷集合 | |||||
| */ | |||||
| private List<VolumeMount> buildVolumeMounts(List<Volume> volumes) { | |||||
| List<VolumeMount> volumeMounts=new LinkedList<>(); | |||||
| for (Volume volume : volumes) { | |||||
| if(PVC_WORKSPACE.equals(volume.getName())){ | |||||
| volumeMounts.add(new VolumeMountBuilder() | |||||
| .withName(volume.getName()) | |||||
| .withMountPath(WORKSPACE) | |||||
| .build()); | |||||
| continue; | |||||
| } | |||||
| if(PVC_DATASET.equals(volume.getName())){ | |||||
| volumeMounts.add(new VolumeMountBuilder() | |||||
| .withName(volume.getName()) | |||||
| .withMountPath(DATASET) | |||||
| .build()); | |||||
| continue; | |||||
| } | |||||
| if(PVC_MODEL.equals(volume.getName())){ | |||||
| volumeMounts.add(new VolumeMountBuilder() | |||||
| .withName(volume.getName()) | |||||
| .withMountPath(MODEL) | |||||
| .build()); | |||||
| continue; | |||||
| } | |||||
| } | |||||
| volumeMounts.add(new VolumeMountBuilder() | |||||
| .withName(KubeConstants.VOLUME_SHM) | |||||
| .withMountPath(DEV_SHM) | |||||
| .build()); | |||||
| return volumeMounts; | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,614 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.action.handler; | |||||
| import cn.hutool.core.collection.CollectionUtil; | |||||
| import cn.hutool.core.io.FileUtil; | |||||
| import cn.hutool.core.util.ObjectUtil; | |||||
| import cn.hutool.core.util.StrUtil; | |||||
| import com.alibaba.fastjson.JSONArray; | |||||
| import com.alibaba.fastjson.JSONObject; | |||||
| import com.google.common.collect.Lists; | |||||
| import com.google.common.io.Files; | |||||
| import io.fabric8.kubernetes.api.model.ObjectMeta; | |||||
| import io.fabric8.kubernetes.api.model.Pod; | |||||
| import io.fabric8.kubernetes.api.model.Service; | |||||
| import io.fabric8.kubernetes.api.model.ServiceBuilder; | |||||
| import io.fabric8.kubernetes.api.model.apps.StatefulSet; | |||||
| import io.fabric8.kubernetes.api.model.apps.StatefulSetBuilder; | |||||
| import io.fabric8.kubernetes.api.model.batch.Job; | |||||
| import io.fabric8.kubernetes.api.model.batch.JobBuilder; | |||||
| import io.fabric8.kubernetes.client.KubernetesClient; | |||||
| import lombok.extern.slf4j.Slf4j; | |||||
| import org.onebrain.operator.action.PodInfo; | |||||
| import org.onebrain.operator.action.deployer.ChildResourceCreateInfo; | |||||
| import org.onebrain.operator.action.deployer.JobDeployer; | |||||
| import org.onebrain.operator.action.deployer.ServiceDeployer; | |||||
| import org.onebrain.operator.action.deployer.StatefulSetDeployer; | |||||
| import org.onebrain.operator.action.deployer.impl.BaseJobDeployer; | |||||
| import org.onebrain.operator.action.deployer.impl.BaseServiceDeployer; | |||||
| import org.onebrain.operator.action.deployer.impl.BaseStatefulSetDeployer; | |||||
| import org.onebrain.operator.api.pod.PodApi; | |||||
| import org.onebrain.operator.constants.KubeConstants; | |||||
| import org.onebrain.operator.crd.DistributeTrain; | |||||
| import org.onebrain.operator.crd.DistributeTrainSpec; | |||||
| import org.onebrain.operator.crd.DistributeTrainStatus; | |||||
| import org.onebrain.operator.exception.OperatorException; | |||||
| import org.onebrain.operator.redis.RedisService; | |||||
| import org.onebrain.operator.redis.key.OperatorKey; | |||||
| import org.onebrain.operator.utils.DistributeTrainClientHolder; | |||||
| import org.onebrain.operator.utils.IOUtils; | |||||
| import org.springframework.beans.factory.annotation.Autowired; | |||||
| import org.springframework.core.io.ClassPathResource; | |||||
| import org.springframework.stereotype.Component; | |||||
| import java.io.File; | |||||
| import java.io.InputStream; | |||||
| import java.util.Collections; | |||||
| import java.util.List; | |||||
| import java.util.Map; | |||||
| import java.util.Optional; | |||||
| import java.util.concurrent.ConcurrentHashMap; | |||||
| import java.util.concurrent.LinkedBlockingQueue; | |||||
| import java.util.concurrent.ThreadFactory; | |||||
| import java.util.concurrent.ThreadPoolExecutor; | |||||
| import java.util.concurrent.TimeUnit; | |||||
| import java.util.concurrent.atomic.AtomicInteger; | |||||
| import static org.onebrain.operator.constants.KubeConstants.CHARSET; | |||||
| import static org.onebrain.operator.constants.KubeConstants.JOB_LABEL; | |||||
| import static org.onebrain.operator.constants.KubeConstants.MASTER_CONTAINER_NAME; | |||||
| import static org.onebrain.operator.constants.KubeConstants.SLAVE_CONTAINER_NAME; | |||||
| import static org.onebrain.operator.constants.KubeConstants.STATEFULSET_LABEL; | |||||
| import static org.onebrain.operator.constants.NumberConstant.NUMBER_2; | |||||
| /** | |||||
| * @description 分布式训练添加事件的处理器 | |||||
| * @date 2020-09-23 | |||||
| */ | |||||
| @Component("addActionHandler") | |||||
| @Slf4j | |||||
| public class AddActionHandler implements DistributeTrainActionHandler { | |||||
| public static final String JOB_WATCHER = "job-watcher-"; | |||||
| public static final String PRETREATMENT = "pretreatment"; | |||||
| public static final String JOB_NAME = "job-name"; | |||||
| public static final String RUNNING = "Running"; | |||||
| public static final String MASTER = "master"; | |||||
| public static final String SLAVE = "slave"; | |||||
| public static final String PRETREATMENT_TARGET_DIR = "/home/pretreatment"; | |||||
| public static final String IP = "ip"; | |||||
| public static final String ROLE = "role"; | |||||
| public static final String HOSTFILE_TARGET_DIR = "/home/hostfile.json"; | |||||
| @Autowired | |||||
| private KubernetesClient client; | |||||
| @Autowired | |||||
| private PodApi podApi; | |||||
| /** | |||||
| * String 训练uid List pod信息 | |||||
| */ | |||||
| private Map<String, List<PodInfo>> dtMap = new ConcurrentHashMap(); | |||||
| @Autowired | |||||
| private RedisService redis; | |||||
| /** | |||||
| * 线程池 | |||||
| */ | |||||
| private ThreadPoolExecutor pool = new ThreadPoolExecutor(5, 10, 10, TimeUnit.SECONDS, new LinkedBlockingQueue<>(1), new ThreadFactory() { | |||||
| private final AtomicInteger mThreadNum = new AtomicInteger(1); | |||||
| @Override | |||||
| public Thread newThread(Runnable r) { | |||||
| return new Thread(r, JOB_WATCHER + mThreadNum.getAndIncrement()); | |||||
| } | |||||
| }, new ThreadPoolExecutor.DiscardOldestPolicy()); | |||||
| /** | |||||
| * 处理事件的任务 | |||||
| */ | |||||
| class HandlerActionTask implements Runnable { | |||||
| private DistributeTrain distributeTrain; | |||||
| public HandlerActionTask(DistributeTrain distributeTrain) { | |||||
| this.distributeTrain = distributeTrain; | |||||
| } | |||||
| @Override | |||||
| public void run() { | |||||
| doAction(distributeTrain); | |||||
| } | |||||
| } | |||||
| /** | |||||
| * 执行任务动作 | |||||
| * @param distributeTrain | |||||
| */ | |||||
| public void doAction(DistributeTrain distributeTrain) { | |||||
| log.info("doAction=>distributeTrain : 【{}】", distributeTrain); | |||||
| ChildResourceCreateInfo info = null; | |||||
| try { | |||||
| //redis重复检查 | |||||
| //根据k8s 创建DistributionTrain 的uid去重 | |||||
| if (null != redis.get(OperatorKey.CR, distributeTrain.getMetadata().getUid())) { | |||||
| log.info("distribute train 【{}】 in namespace 【{}】 already exists", distributeTrain.getMetadata().getName(), distributeTrain.getMetadata().getNamespace()); | |||||
| return; | |||||
| } else { | |||||
| //录入redis做消费记录 | |||||
| redis.set(OperatorKey.CR, distributeTrain.getMetadata().getUid(), System.currentTimeMillis()); | |||||
| } | |||||
| //参数检查,提取并生成所需参数 | |||||
| validateParams(distributeTrain); | |||||
| info = ChildResourceCreateInfo.fromCr(distributeTrain); | |||||
| //按照size,创建副本数为size-1的statefulSet | |||||
| createStatefulSet(info); | |||||
| //等待statefulset全部ready | |||||
| waitUntilStatefulSetReady(info); | |||||
| //创建job,job此时在死循环 | |||||
| createJob(info); | |||||
| //等待job ready | |||||
| waitUntilJobReady(info); | |||||
| //复制 /home/pretreatment 到 pod | |||||
| copyPretreatmentShell(info); | |||||
| //收集statefulSet和job的ip | |||||
| validateAndCollectPods(info); | |||||
| //本地生成公私钥、认证文件,并拷贝到所有节点的~/.ssh目录下 | |||||
| sshAuthWithoutPass(info); | |||||
| //本地生成hostfile,并拷贝到所有节点的指定目录下 | |||||
| generateAndUploadHostFile(info); | |||||
| //解锁job的死循环 | |||||
| releaseInterLock(info); | |||||
| //改状态 | |||||
| //updateStatus(info, distributeTrain); | |||||
| //为job注册监听器 | |||||
| registerJobListener(info); | |||||
| log.info("all parts of【{}】 are ready", info.getParentName()); | |||||
| } catch (Exception e) { | |||||
| log.error("doAction error:【{}】", e); | |||||
| //移除缓存 | |||||
| redis.del(OperatorKey.CR, distributeTrain.getMetadata().getUid()); | |||||
| //回收创建的资源 | |||||
| if (info != null) { | |||||
| recycleCr(info); | |||||
| } | |||||
| } | |||||
| } | |||||
| /** | |||||
| * 处理分布式训练 | |||||
| * @param distributeTrain 分布式训练信息 | |||||
| */ | |||||
| @Override | |||||
| public void handlerAction(DistributeTrain distributeTrain) { | |||||
| log.info("handlerAction=>distributeTrain : 【{}】", distributeTrain); | |||||
| HandlerActionTask handlerActionTask = new HandlerActionTask(distributeTrain); | |||||
| pool.getActiveCount(); | |||||
| pool.execute(handlerActionTask); | |||||
| } | |||||
| /** | |||||
| * 校验参数合法性 | |||||
| * @param distributeTrain 分布式训练 | |||||
| */ | |||||
| private void validateParams(DistributeTrain distributeTrain) { | |||||
| log.info("validateParams=>distributeTrain : 【{}】", distributeTrain); | |||||
| Integer size = distributeTrain.getSpec().getSize(); | |||||
| if (size < NUMBER_2) { | |||||
| throw new OperatorException("size must be greater than 1"); | |||||
| } | |||||
| String masterCmd = distributeTrain.getSpec().getMasterCmd(); | |||||
| String slaveCmd = distributeTrain.getSpec().getSlaveCmd(); | |||||
| if (StrUtil.isEmpty(slaveCmd) || StrUtil.isEmpty(masterCmd)) { | |||||
| throw new OperatorException("cmd lines must not be empty"); | |||||
| } | |||||
| } | |||||
| /** | |||||
| * 拷贝文件pretreatment到pod | |||||
| * @param info 资源信息 | |||||
| */ | |||||
| private void copyPretreatmentShell(ChildResourceCreateInfo info) { | |||||
| log.info("start to copy pretreatment for 【{}】 ", info.getParentName()); | |||||
| try { | |||||
| String path = System.getProperty(KubeConstants.USER_DIR_SYSTEM_PROPERTY) + File.separator + PRETREATMENT; | |||||
| if (!FileUtil.exist(path)) { | |||||
| FileUtil.writeFromStream(new ClassPathResource("/shell/pretreatment").getInputStream(), path); | |||||
| } | |||||
| File pretreatment = new File(path); | |||||
| //上传到pod指定目录 | |||||
| List<Pod> pods = getPods(info); | |||||
| for (int i = 0; i < pods.size(); i++) { | |||||
| Pod pod = pods.get(i); | |||||
| //默认第一个为master | |||||
| String containerName = i < 1 ? MASTER_CONTAINER_NAME : SLAVE_CONTAINER_NAME; | |||||
| podApi.copyToPod(info.getNamespace(), pod.getMetadata().getName(), containerName, pretreatment, PRETREATMENT_TARGET_DIR); | |||||
| } | |||||
| } catch (Exception e) { | |||||
| log.error("copy pretreatment shell error: 【{}】",e); | |||||
| throw new OperatorException("exception is thrown when copy pretreatment for 【" + info.getParentName() + "】 : \n" + e.getMessage()); | |||||
| } | |||||
| } | |||||
| /** | |||||
| * 创建statefulSet | |||||
| * @param info 资源信息 | |||||
| */ | |||||
| private void createStatefulSet(ChildResourceCreateInfo info) { | |||||
| log.info("createStatefulSet=>childResourceCreateInfo : 【{}】", info); | |||||
| StatefulSet statefulSet = client.apps().statefulSets() | |||||
| .inNamespace(info.getNamespace()) | |||||
| .withName(info.getStatefulSetName()).get(); | |||||
| //已存在 | |||||
| if (statefulSet != null) { | |||||
| log.info("statefulSet 【{}】 already exists", statefulSet.getMetadata().getName()); | |||||
| return; | |||||
| } | |||||
| //不存在,新建 | |||||
| StatefulSetDeployer deployer = new BaseStatefulSetDeployer(); | |||||
| StatefulSetBuilder builder = deployer.deploy(info); | |||||
| statefulSet = builder.build(); | |||||
| client.apps().statefulSets().create(statefulSet); | |||||
| log.info("create statefulSet【{}】 successfully", statefulSet.getMetadata().getName()); | |||||
| } | |||||
| /** | |||||
| * 等待statefulSet全部ready | |||||
| * @param info 资源信息 | |||||
| */ | |||||
| private void waitUntilStatefulSetReady(ChildResourceCreateInfo info) { | |||||
| log.info("wait for statefulSet 【{}】 in namespace 【{}】 ready", info.getStatefulSetName(), info.getNamespace()); | |||||
| try { | |||||
| client.apps().statefulSets() | |||||
| .inNamespace(info.getNamespace()) | |||||
| .withName(info.getStatefulSetName()) | |||||
| //阻塞 直到全部pod Ready 最长阻塞时间2小时 | |||||
| .waitUntilCondition(c -> | |||||
| c.getStatus().getReplicas() != null | |||||
| && ObjectUtil.equal(c.getStatus().getReplicas(), c.getStatus().getReadyReplicas()), | |||||
| NUMBER_2, TimeUnit.HOURS); | |||||
| log.info("statefulSet 【{}】 in namespace 【{}】 is ready", info.getStatefulSetName(), info.getNamespace()); | |||||
| } catch (Exception e) { | |||||
| log.error("wait until statefulSet ready error:【{}】", e); | |||||
| throw new OperatorException("exception is thrown when waiting for statefulSet 【" + info.getStatefulSetName() + "】 ready : \n" + e.getMessage()); | |||||
| } | |||||
| } | |||||
| /** | |||||
| * 创建job | |||||
| * @param info Job信息 | |||||
| */ | |||||
| private void createJob(ChildResourceCreateInfo info) { | |||||
| log.info("createJob=>childResourceCreateInfo : 【{}】", info); | |||||
| Job job = client.batch().jobs() | |||||
| .inNamespace(info.getNamespace()) | |||||
| .withName(info.getJobName()).get(); | |||||
| //已存在 | |||||
| if (job != null) { | |||||
| log.info("job 【{}】 already exists", job.getMetadata().getName()); | |||||
| return; | |||||
| } | |||||
| //不存在,新建 | |||||
| JobDeployer deployer = new BaseJobDeployer(); | |||||
| JobBuilder builder = deployer.deploy(info); | |||||
| job = builder.build(); | |||||
| log.info("job is : 【{}】", job); | |||||
| client.batch().jobs().create(job); | |||||
| log.info("create job【{}】 successfully", job.getMetadata().getName()); | |||||
| } | |||||
| /** | |||||
| * 等待job全部ready | |||||
| * @param info 资源信息 | |||||
| */ | |||||
| private void waitUntilJobReady(ChildResourceCreateInfo info) { | |||||
| log.info("wait for job 【{}】 in namespace 【{}】 ready", info.getStatefulSetName(), info.getNamespace()); | |||||
| try { | |||||
| List<Pod> podList = client.pods().inNamespace(info.getNamespace()) | |||||
| .withLabel(JOB_NAME, info.getJobName()) | |||||
| .list().getItems(); | |||||
| while (CollectionUtil.isEmpty(podList)) { | |||||
| TimeUnit.SECONDS.sleep(2); | |||||
| podList = client.pods().inNamespace(info.getNamespace()) | |||||
| .withLabel(JOB_NAME, info.getJobName()) | |||||
| .list().getItems(); | |||||
| } | |||||
| Pod pod = podList.get(0); | |||||
| client.pods().inNamespace(info.getNamespace()) | |||||
| .withName(pod.getMetadata().getName()) | |||||
| //等待直到Ready状态 最长2小时 | |||||
| .waitUntilReady(2, TimeUnit.HOURS); | |||||
| log.info("job 【{}】 in namespace 【{}】 is ready", info.getJobName(), info.getNamespace()); | |||||
| } catch (Exception e) { | |||||
| log.info(e.getMessage(), e); | |||||
| throw new OperatorException("exception is thrown when waiting for job 【" + info.getJobName() + "】 ready : \n" + e.getMessage()); | |||||
| } | |||||
| } | |||||
| /** | |||||
| * 收集资源的podInfo | |||||
| * @param info 资源信息 | |||||
| */ | |||||
| private void validateAndCollectPods(ChildResourceCreateInfo info) { | |||||
| //检查是否都在正常运行 | |||||
| log.info("validate pods status for 【{}】", info.getParentName()); | |||||
| boolean isAllSlaveRunning = true; | |||||
| boolean isMasterRunning = true; | |||||
| Pod masterPod = null; | |||||
| List<Pod> slavePods = null; | |||||
| do { | |||||
| //取得主的pod | |||||
| masterPod = getMasterPod(info); | |||||
| //取得从的所有pod | |||||
| slavePods = getSlavePods(info); | |||||
| if (masterPod == null) { | |||||
| log.info("can not find pod belongs to job 【{}】", info.getJobName()); | |||||
| return; | |||||
| } | |||||
| if (CollectionUtil.isEmpty(slavePods)) { | |||||
| log.info("can not find pod belongs to statefulSet 【{}】", info.getStatefulSetName()); | |||||
| return; | |||||
| } | |||||
| isMasterRunning = RUNNING.equals(masterPod.getStatus().getPhase()); | |||||
| isAllSlaveRunning = true; | |||||
| for (Pod slavePod : slavePods) { | |||||
| boolean isSlaveRunning = RUNNING.equals(slavePod.getStatus().getPhase()); | |||||
| if (!isSlaveRunning) { | |||||
| isAllSlaveRunning = false; | |||||
| break; | |||||
| } | |||||
| } | |||||
| } while (!(isMasterRunning && isAllSlaveRunning)); | |||||
| log.info("status checked 【{}】 all right", info.getParentName()); | |||||
| collectChildPodInfo(info, masterPod, slavePods); | |||||
| } | |||||
| /** | |||||
| * 收集pod基本信息 | |||||
| * @param info 资源信息 | |||||
| * @param masterPod | |||||
| * @param slavePods | |||||
| */ | |||||
| private void collectChildPodInfo(ChildResourceCreateInfo info, Pod masterPod, List<Pod> slavePods) { | |||||
| log.info("collectChildPodInfo=>childResourceCreateInfo : 【{}】, masterPod : 【{}】, slavePods : 【{}】", info, masterPod, slavePods); | |||||
| String key = info.getOwnerReference().getUid(); | |||||
| if (dtMap.containsKey(key)) { | |||||
| dtMap.remove(key); | |||||
| } | |||||
| List<PodInfo> podInfos = Lists.newArrayList(); | |||||
| PodInfo masterPodInfo = PodInfo.builder() | |||||
| .ip(masterPod.getStatus().getPodIP()) | |||||
| .role(MASTER) | |||||
| .build(); | |||||
| podInfos.add(masterPodInfo); | |||||
| for (Pod slavePod : slavePods) { | |||||
| PodInfo slavePodInfo = PodInfo.builder() | |||||
| .ip(slavePod.getStatus().getPodIP()) | |||||
| .role(SLAVE) | |||||
| .build(); | |||||
| podInfos.add(slavePodInfo); | |||||
| } | |||||
| dtMap.put(key, podInfos); | |||||
| } | |||||
| /** | |||||
| * ssh免密互通相关配置 | |||||
| * @param info 资源信息 | |||||
| */ | |||||
| private void sshAuthWithoutPass(ChildResourceCreateInfo info) { | |||||
| log.info("start to configure ssh no password environment for 【{}】 ", info.getParentName()); | |||||
| File tempDir = Files.createTempDir(); | |||||
| try ( | |||||
| InputStream isRsa = getClass().getClassLoader().getResourceAsStream("key/id_rsa"); | |||||
| InputStream isRsaPub = getClass().getClassLoader().getResourceAsStream("key/id_rsa.pub") | |||||
| ) { | |||||
| //id_rsa | |||||
| File tempIdRsa = FileUtil.createTempFile(tempDir); | |||||
| IOUtils.copy(isRsa, tempIdRsa); | |||||
| //id_rsa.pub | |||||
| File tempIdRsaPub = FileUtil.createTempFile(tempDir); | |||||
| IOUtils.copy(isRsaPub, tempIdRsaPub); | |||||
| List<String> pubLines = FileUtil.readLines(tempIdRsaPub, CHARSET); | |||||
| String pubKeyContent = pubLines.get(0); | |||||
| //按机器修改id_rsa.pub, 并组装一个大而全的authorized_keys | |||||
| List<File> idRsaPubFiles = Lists.newArrayList(); | |||||
| File tempAuthorizedKeys = FileUtil.createTempFile(tempDir); | |||||
| List<String> pubKeys = Lists.newArrayList(); | |||||
| for (PodInfo podInfo : dtMap.get(info.getOwnerReference().getUid())) { | |||||
| String podPubKeyContent = pubKeyContent.replace("{{ip}}", podInfo.getIp()); | |||||
| File tempIdRsaPubOnPod = FileUtil.createTempFile(tempDir); | |||||
| FileUtil.writeLines(Collections.singletonList(podPubKeyContent), tempIdRsaPubOnPod, CHARSET); | |||||
| idRsaPubFiles.add(tempIdRsaPubOnPod); | |||||
| pubKeys.add(podPubKeyContent); | |||||
| } | |||||
| FileUtil.writeLines(pubKeys, tempAuthorizedKeys, CHARSET); | |||||
| //获得所有pod, 上传三个文件 | |||||
| List<Pod> pods = getPods(info); | |||||
| for (int i = 0; i < pods.size(); i++) { | |||||
| Pod pod = pods.get(i); | |||||
| String containerName = i < 1 ? MASTER_CONTAINER_NAME : SLAVE_CONTAINER_NAME; | |||||
| //上传id_rsa | |||||
| podApi.copyToPod(info.getNamespace(), pod.getMetadata().getName(), containerName, tempIdRsa, "/root/.ssh/id_rsa"); | |||||
| //上传id_rsa.pub | |||||
| File tempIdRsaPubOnPod = idRsaPubFiles.get(i); | |||||
| podApi.copyToPod(info.getNamespace(), pod.getMetadata().getName(), containerName, tempIdRsaPubOnPod, "/root/.ssh/id_rsa.pub"); | |||||
| //上传authorized_keys | |||||
| podApi.copyToPod(info.getNamespace(), pod.getMetadata().getName(), containerName, tempAuthorizedKeys, "/root/.ssh/authorized_keys"); | |||||
| //修改权限 | |||||
| String chmodCmd = StrUtil.format("chmod 644 /root/.ssh/authorized_keys && chmod 600 /root/.ssh/id_rsa && chmod 644 /root/.ssh/id_rsa.pub"); | |||||
| podApi.exec(info.getNamespace(), pod.getMetadata().getName(), containerName, chmodCmd); | |||||
| } | |||||
| log.info("configure ssh no password environment for 【{}】 successfully ", info.getParentName()); | |||||
| } catch (Exception e) { | |||||
| log.error("sshAuthWithoutPass error:【{}】", e); | |||||
| throw new OperatorException("exception is thrown when configure ssh no password environment for 【" + info.getParentName() + "】 : \n" + e.getMessage()); | |||||
| } finally { | |||||
| //清理临时文件 | |||||
| FileUtil.del(tempDir); | |||||
| } | |||||
| } | |||||
| /** | |||||
| * 生成并上传hostfile | |||||
| * @param info 资源信息 | |||||
| */ | |||||
| private void generateAndUploadHostFile(ChildResourceCreateInfo info) { | |||||
| log.info("start to configure hostfile for 【{}】 ", info.getParentName()); | |||||
| File tempDir = Files.createTempDir(); | |||||
| try { | |||||
| //生成hostfile | |||||
| JSONArray jsonArray = new JSONArray(); | |||||
| List<PodInfo> podInfos = dtMap.get(info.getOwnerReference().getUid()); | |||||
| for (PodInfo podInfo : podInfos) { | |||||
| JSONObject podJson = new JSONObject(); | |||||
| podJson.put(IP, podInfo.getIp()); | |||||
| podJson.put(ROLE, podInfo.getRole()); | |||||
| jsonArray.add(podJson); | |||||
| } | |||||
| File tempHostFile = FileUtil.createTempFile(tempDir); | |||||
| FileUtil.writeLines(Collections.singletonList(jsonArray.toJSONString()), tempHostFile, CHARSET); | |||||
| //上传到pod指定目录 | |||||
| List<Pod> pods = getPods(info); | |||||
| for (int i = 0; i < pods.size(); i++) { | |||||
| Pod pod = pods.get(i); | |||||
| String containerName = i < 1 ? MASTER_CONTAINER_NAME : SLAVE_CONTAINER_NAME; | |||||
| podApi.copyToPod(info.getNamespace(), pod.getMetadata().getName(), containerName, tempHostFile, HOSTFILE_TARGET_DIR); | |||||
| } | |||||
| } catch (Exception e) { | |||||
| log.error("generateAndUploadHostFile error:【{}】", e); | |||||
| throw new OperatorException("exception is thrown when generate and upload hostfile for 【" + info.getParentName() + "】 : \n" + e.getMessage()); | |||||
| } finally { | |||||
| //清理临时文件 | |||||
| FileUtil.del(tempDir); | |||||
| } | |||||
| } | |||||
| /** | |||||
| * 创建service 解除闭锁 | |||||
| * @param info | |||||
| */ | |||||
| private void releaseInterLock(ChildResourceCreateInfo info) { | |||||
| log.info("release lock for 【{}】", info.getParentName()); | |||||
| ServiceDeployer deployer = new BaseServiceDeployer(); | |||||
| ServiceBuilder builder = deployer.deploy(info); | |||||
| Service svc = builder.build(); | |||||
| client.services().create(svc); | |||||
| log.info("lock for 【{}】 released", info.getParentName()); | |||||
| } | |||||
| /** | |||||
| * 回收cr | |||||
| * @param info | |||||
| */ | |||||
| private void recycleCr(ChildResourceCreateInfo info) { | |||||
| log.info("recycleCr=>childResourceCreateInfo : 【{}】", info); | |||||
| Optional.ofNullable(DistributeTrainClientHolder.getClient()) | |||||
| .ifPresent(distributeTrainClient -> { | |||||
| ObjectMeta metadata = new ObjectMeta(); | |||||
| metadata.setName(info.getParentName()); | |||||
| metadata.setNamespace(info.getNamespace()); | |||||
| DistributeTrain dt = new DistributeTrain(metadata, DistributeTrainSpec.builder() | |||||
| .build()); | |||||
| distributeTrainClient.delete(dt); | |||||
| log.info("recycle distribute train 【{}】", info.getParentName()); | |||||
| }); | |||||
| } | |||||
| /**更新状态*/ | |||||
| private void updateStatus(ChildResourceCreateInfo info, DistributeTrain distributeTrain) { | |||||
| log.info("updateStatus=>childResourceCreateInfo : 【{}】, distributeTrain : 【{}】", info, distributeTrain); | |||||
| if (distributeTrain.getStatus() == null) { | |||||
| distributeTrain.setStatus(new DistributeTrainStatus()); | |||||
| } | |||||
| Integer size = distributeTrain.getSpec().getSize(); | |||||
| distributeTrain.getStatus().setReplicas(size); | |||||
| distributeTrain.getStatus().setReadyReplicas(size); | |||||
| } | |||||
| /** | |||||
| * 为job注册监听器 | |||||
| * @param info | |||||
| */ | |||||
| private void registerJobListener(ChildResourceCreateInfo info) { | |||||
| log.info("register listener for distribute train 【{}】", info.getParentName()); | |||||
| // client.batch().jobs() | |||||
| // .inNamespace(info.getNamespace()) | |||||
| // .withName(info.getJobName()).watch(null); | |||||
| } | |||||
| /** | |||||
| * 获取所有分布式训练相关的pod | |||||
| * @param info | |||||
| * @return List<Pod> 分布式相关Pod集合 | |||||
| */ | |||||
| private List<Pod> getPods(ChildResourceCreateInfo info) { | |||||
| log.info("getPods=>childResourceCreateInfo : 【{}】", info); | |||||
| List<Pod> pods = Lists.newArrayList(); | |||||
| pods.add(getMasterPod(info)); | |||||
| pods.addAll(getSlavePods(info)); | |||||
| if (CollectionUtil.hasNull(pods) || pods.size() != info.getSlaveReplicas() + 1) { | |||||
| throw new OperatorException("can not get pods in correct numbers"); | |||||
| } | |||||
| return pods; | |||||
| } | |||||
| /** | |||||
| * 获取master信息 | |||||
| * @param info 资源信息 | |||||
| * @return Pod Master节点对应的Pod | |||||
| */ | |||||
| private Pod getMasterPod(ChildResourceCreateInfo info) { | |||||
| log.info("getMasterPod=>childResourceCreateInfo : 【{}】", info); | |||||
| List<Pod> masterPods = client.pods().inNamespace(info.getNamespace()) | |||||
| .withLabel(JOB_LABEL, info.getJobName()) | |||||
| .list().getItems(); | |||||
| if (CollectionUtil.isEmpty(masterPods)) { | |||||
| return null; | |||||
| } | |||||
| return masterPods.get(0); | |||||
| } | |||||
| /** | |||||
| * 取得从的所有pod | |||||
| * @param info 资源信息 | |||||
| * @return List<Pod> Slave节点对应的Pod集合 | |||||
| */ | |||||
| private List<Pod> getSlavePods(ChildResourceCreateInfo info) { | |||||
| log.info("getSlavePods=>childResourceCreateInfo : 【{}】", info); | |||||
| //取得从的所有pod | |||||
| List<Pod> slavePods = client.pods().inNamespace(info.getNamespace()) | |||||
| .withLabel(STATEFULSET_LABEL, info.getStatefulSetName()) | |||||
| .list().getItems(); | |||||
| if (CollectionUtil.isEmpty(slavePods)) { | |||||
| return null; | |||||
| } | |||||
| return slavePods; | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,88 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.action.handler; | |||||
| import cn.hutool.core.collection.CollectionUtil; | |||||
| import io.fabric8.kubernetes.api.model.Service; | |||||
| import io.fabric8.kubernetes.api.model.ServiceList; | |||||
| import io.fabric8.kubernetes.api.model.apps.StatefulSet; | |||||
| import io.fabric8.kubernetes.api.model.apps.StatefulSetList; | |||||
| import io.fabric8.kubernetes.api.model.batch.Job; | |||||
| import io.fabric8.kubernetes.api.model.batch.JobList; | |||||
| import io.fabric8.kubernetes.client.KubernetesClient; | |||||
| import lombok.extern.slf4j.Slf4j; | |||||
| import org.onebrain.operator.constants.KubeConstants; | |||||
| import org.onebrain.operator.crd.DistributeTrain; | |||||
| import org.onebrain.operator.redis.RedisService; | |||||
| import org.onebrain.operator.redis.key.OperatorKey; | |||||
| import org.springframework.beans.factory.annotation.Autowired; | |||||
| import org.springframework.stereotype.Component; | |||||
| /** | |||||
| * @description 删除事件的处理器 | |||||
| * @date 2020-09-23 | |||||
| */ | |||||
| @Component("deleteActionHandler") | |||||
| @Slf4j | |||||
| public class DeleteActionHandler implements DistributeTrainActionHandler { | |||||
| @Autowired | |||||
| private KubernetesClient client; | |||||
| @Autowired | |||||
| private RedisService redis; | |||||
| /** | |||||
| * 处理删除事件 | |||||
| * @param distributeTrain 分布式训练信息 | |||||
| */ | |||||
| @Override | |||||
| public void handlerAction(DistributeTrain distributeTrain) { | |||||
| log.info("handlerAction=>distributeTrain : 【{}】", distributeTrain); | |||||
| String namespace = distributeTrain.getMetadata().getNamespace(); | |||||
| String parentName = distributeTrain.getMetadata().getName(); | |||||
| // namespace+parentName(分布式训练名称) 确定相应的资源 | |||||
| //删除job | |||||
| JobList jobList = client.batch().jobs().inNamespace(namespace).withLabel(KubeConstants.DISTRIBUTE_TRAIN_LABEL, parentName).list(); | |||||
| if(CollectionUtil.isNotEmpty(jobList.getItems())){ | |||||
| for (Job item : jobList.getItems()) { | |||||
| client.batch().jobs().delete(item); | |||||
| } | |||||
| log.info("delete job in distributeTrain 【{}】", parentName); | |||||
| } | |||||
| //删除statefullSete | |||||
| StatefulSetList statefulSetList = client.apps().statefulSets().inNamespace(namespace).withLabel(KubeConstants.DISTRIBUTE_TRAIN_LABEL, parentName).list(); | |||||
| if(CollectionUtil.isNotEmpty(statefulSetList.getItems())){ | |||||
| for (StatefulSet item : statefulSetList.getItems()) { | |||||
| client.apps().statefulSets().delete(item); | |||||
| } | |||||
| log.info("delete statefulSet in distributeTrain 【{}】", parentName); | |||||
| } | |||||
| //删除service | |||||
| ServiceList svcList = client.services().inNamespace(namespace).withLabel(KubeConstants.DISTRIBUTE_TRAIN_LABEL, parentName).list(); | |||||
| if(CollectionUtil.isNotEmpty(svcList.getItems())){ | |||||
| for (Service item : svcList.getItems()) { | |||||
| client.services().delete(item); | |||||
| } | |||||
| log.info("delete svc in distributeTrain 【{}】", parentName); | |||||
| } | |||||
| //删除redis里记录的分布式训练信息 | |||||
| redis.del(OperatorKey.CR, distributeTrain.getMetadata().getUid()); | |||||
| log.info("delete distributeTrain 【{}】 successfully", parentName); | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,33 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.action.handler; | |||||
| import org.onebrain.operator.crd.DistributeTrain; | |||||
| /** | |||||
| * @description 分布式训练的事件处理器 | |||||
| * @date 2020-09-23 | |||||
| */ | |||||
| public interface DistributeTrainActionHandler { | |||||
| /** | |||||
| * 处理相应的事件 | |||||
| * @param distributeTrain 分布式训练信息 | |||||
| */ | |||||
| void handlerAction(DistributeTrain distributeTrain); | |||||
| } | |||||
| @@ -0,0 +1,85 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.api.pod; | |||||
| import io.fabric8.kubernetes.client.dsl.ExecListener; | |||||
| import lombok.Getter; | |||||
| import lombok.extern.slf4j.Slf4j; | |||||
| import okhttp3.Response; | |||||
| import java.util.concurrent.CountDownLatch; | |||||
| /** | |||||
| * @description 默认命令执行监听器 | |||||
| * @date 2020-09-23 | |||||
| */ | |||||
| @Slf4j | |||||
| @Getter | |||||
| public class DefaultPodExecListener implements ExecListener { | |||||
| /** | |||||
| * pod名称 | |||||
| */ | |||||
| private String podName; | |||||
| /** | |||||
| * 命名空间 | |||||
| */ | |||||
| private String namespace; | |||||
| /** | |||||
| * 容器名称 | |||||
| */ | |||||
| private String containerName; | |||||
| /** | |||||
| * 执行门栓 线程通信用 | |||||
| */ | |||||
| private CountDownLatch execLatch; | |||||
| public DefaultPodExecListener(String podName, String namespace, String containerName, CountDownLatch execLatch) { | |||||
| this.podName = podName; | |||||
| this.namespace = namespace; | |||||
| this.containerName = containerName; | |||||
| this.execLatch = execLatch; | |||||
| } | |||||
| @Override | |||||
| public void onOpen(Response response) { | |||||
| log.debug("shell environment in pod '{}', namespace '{}' is opened", podName, namespace); | |||||
| log.debug("onOpen: {}", response); | |||||
| } | |||||
| @Override | |||||
| public void onFailure(Throwable t, Response response) { | |||||
| log.error("shell environment in pod '{}', namespace '{}' barfed", podName, namespace); | |||||
| log.error("onFailure: {} {}", t.getMessage(), response); | |||||
| if (execLatch != null) { | |||||
| execLatch.countDown(); | |||||
| } | |||||
| } | |||||
| @Override | |||||
| public void onClose(int code, String reason) { | |||||
| log.debug("shell environment in pod '{}', namespace '{}' closed", podName, namespace); | |||||
| log.debug("onClose: {} {}", code, reason); | |||||
| if (execLatch != null) { | |||||
| execLatch.countDown(); | |||||
| } | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,177 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.api.pod; | |||||
| import cn.hutool.core.util.StrUtil; | |||||
| import io.fabric8.kubernetes.client.KubernetesClient; | |||||
| import io.fabric8.kubernetes.client.dsl.ExecWatch; | |||||
| import lombok.extern.slf4j.Slf4j; | |||||
| import org.apache.commons.io.FileUtils; | |||||
| import org.onebrain.operator.context.KubeContext; | |||||
| import org.springframework.beans.factory.annotation.Autowired; | |||||
| import org.springframework.stereotype.Component; | |||||
| import java.io.File; | |||||
| import java.io.IOException; | |||||
| import java.io.PipedInputStream; | |||||
| import java.io.PipedOutputStream; | |||||
| import java.util.concurrent.CountDownLatch; | |||||
| import java.util.concurrent.atomic.AtomicBoolean; | |||||
| /** | |||||
| * | |||||
| * @description PodApi 操作pod 里的容器用于上传文件等操作吧 | |||||
| * @date 2020-09-23 | |||||
| */ | |||||
| @Component | |||||
| @Slf4j | |||||
| public class PodApi { | |||||
| private static final Integer DEFAULT_LOG_LINES = 50; | |||||
| @Autowired | |||||
| private KubeContext kubeContext; | |||||
| @Autowired | |||||
| private KubernetesClient client; | |||||
| /** | |||||
| * 从Pod下载单个文件 | |||||
| * @return File 临时文件,用后需要及时清理 | |||||
| * **/ | |||||
| public File copyFileFromPod(String namespace, String podName, String containerName, String filePath){ | |||||
| try { | |||||
| File tmpFile = File.createTempFile("copy-from-pod-", ""); | |||||
| client.pods().inNamespace(namespace).withName(podName) | |||||
| .inContainer(containerName) | |||||
| .file(filePath) | |||||
| .copy(tmpFile.toPath()); | |||||
| if(tmpFile.length() == 0){ | |||||
| return null; | |||||
| } | |||||
| return tmpFile; | |||||
| } catch (IOException e) { | |||||
| log.error(" File copy error : 【{}】",e); | |||||
| } | |||||
| return null; | |||||
| } | |||||
| /** | |||||
| * 从Pod下载目录 | |||||
| * @return File 临时文件,用后需要及时清理 | |||||
| * **/ | |||||
| public File copyFolderFromPod(String namespace, String podName, String containerName, String folderPath){ | |||||
| final PipedInputStream stdoutInput = new PipedInputStream(); | |||||
| final PipedOutputStream stdoutOutput = new PipedOutputStream(); | |||||
| final PipedInputStream stderrInput = new PipedInputStream(); | |||||
| final PipedOutputStream stderrOutput = new PipedOutputStream(); | |||||
| final AtomicBoolean failed = new AtomicBoolean(false); | |||||
| try { | |||||
| stdoutInput.connect(stdoutOutput); | |||||
| stderrInput.connect(stderrOutput); | |||||
| //去除路径上的/前缀 | |||||
| if(folderPath.startsWith(StrUtil.SLASH)){ | |||||
| folderPath = StrUtil.removePrefix(folderPath, StrUtil.SLASH); | |||||
| } | |||||
| //监听器异步执行 | |||||
| DefaultPodExecListener defaultPodExecListener = new DefaultPodExecListener(podName, namespace, containerName, null); | |||||
| StdPodExecListener stdPodExecListener = new StdPodExecListener(defaultPodExecListener, stdoutOutput, stderrOutput, failed); | |||||
| ExecWatch watch = client.pods().inNamespace(namespace) | |||||
| .withName(podName).inContainer(containerName) | |||||
| .writingOutput(stdoutOutput).writingError(stderrOutput) | |||||
| .usingListener(stdPodExecListener) | |||||
| .exec("tar", "cf", "-", "-C", folderPath, "."); | |||||
| // execLatch.await(); | |||||
| } catch (IOException e) { | |||||
| log.error("copyFolderFromPod:【{}】",e); | |||||
| } | |||||
| File tmpFile = null; | |||||
| try { | |||||
| tmpFile = File.createTempFile("copy-from-pod-", ".tar"); | |||||
| int length; | |||||
| byte[] buffer = new byte[1024]; | |||||
| while (!Thread.currentThread().isInterrupted() | |||||
| && (length = stdoutInput.read(buffer)) != -1) { | |||||
| byte[] content = new byte[length]; | |||||
| System.arraycopy(buffer, 0, content, 0, length); | |||||
| FileUtils.writeByteArrayToFile(tmpFile, content, true); | |||||
| } | |||||
| while (!Thread.currentThread().isInterrupted() | |||||
| && (length = stderrInput.read(buffer)) != -1) { | |||||
| log.error(new String(buffer, 0, length)); | |||||
| } | |||||
| } catch (IOException e) { | |||||
| if (!Thread.currentThread().isInterrupted()) { | |||||
| log.error("Error while pumping stream. 【{}】", e); | |||||
| } else { | |||||
| log.error("Interrupted while pumping stream. 【{}】", e); | |||||
| } | |||||
| } | |||||
| return tmpFile; | |||||
| } | |||||
| /** | |||||
| * 拷贝文件到pod | |||||
| * @param namespace 命名空间 | |||||
| * @param podName pod名称 | |||||
| * @param containerName 容器名称 | |||||
| * @param file 文件 | |||||
| * @param targetDir 目标路径 | |||||
| */ | |||||
| public void copyToPod(String namespace, String podName, String containerName, File file, String targetDir){ | |||||
| client.pods().inNamespace(namespace).withName(podName) | |||||
| .inContainer(containerName) | |||||
| .file(targetDir) | |||||
| .upload(file.toPath()); | |||||
| } | |||||
| /** | |||||
| * 同步执行 | |||||
| * @param namespace 命名空间 | |||||
| * @param podName pod名称 | |||||
| * @param containerName 容器名称 | |||||
| * @param cmd 命令 | |||||
| */ | |||||
| public void exec(String namespace, String podName, String containerName, String cmd){ | |||||
| try { | |||||
| final CountDownLatch execLatch = new CountDownLatch(1); | |||||
| ExecWatch execWatch = client.pods().inNamespace(namespace).withName(podName).inContainer(containerName) | |||||
| .redirectingOutput() | |||||
| .withTTY() //不展示输出 | |||||
| .usingListener(new DefaultPodExecListener(namespace, podName, containerName, execLatch)) | |||||
| .exec("sh", "-c", cmd); | |||||
| execLatch.await(); | |||||
| } catch (InterruptedException e) { | |||||
| log.error(" PodApi execute cmd error : 【{}】",e); | |||||
| } | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,83 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.api.pod; | |||||
| import io.fabric8.kubernetes.client.dsl.ExecListener; | |||||
| import lombok.extern.slf4j.Slf4j; | |||||
| import okhttp3.Response; | |||||
| import java.io.IOException; | |||||
| import java.io.PipedOutputStream; | |||||
| import java.util.concurrent.atomic.AtomicBoolean; | |||||
| /** | |||||
| * @description 标准pod执行监听器 | |||||
| * @date 2020-09-23 | |||||
| */ | |||||
| @Slf4j | |||||
| public class StdPodExecListener implements ExecListener { | |||||
| private ExecListener defaultExecListener; | |||||
| private PipedOutputStream stdoutOutput; | |||||
| private PipedOutputStream stderrOutput; | |||||
| private AtomicBoolean failed; | |||||
| public StdPodExecListener(ExecListener defaultExecListener, PipedOutputStream stdoutOutput, PipedOutputStream stderrOutput, AtomicBoolean failed) { | |||||
| this.defaultExecListener = defaultExecListener; | |||||
| this.stdoutOutput = stdoutOutput; | |||||
| this.stderrOutput = stderrOutput; | |||||
| this.failed = failed; | |||||
| } | |||||
| @Override | |||||
| public void onOpen(Response response) { | |||||
| log.info("onOpen=>response : 【{}】",response); | |||||
| defaultExecListener.onOpen(response); | |||||
| } | |||||
| @Override | |||||
| public void onFailure(Throwable t, Response response) { | |||||
| log.info("onFailure=> t :【{}】,response : 【{}】",t,response); | |||||
| try { | |||||
| failed.set(true); | |||||
| stdoutOutput.close(); | |||||
| stderrOutput.close(); | |||||
| } catch (IOException e) { | |||||
| log.error("Failed to close stdout and stderr pipes. 【{}】", e); | |||||
| } finally { | |||||
| defaultExecListener.onFailure(t, response); | |||||
| } | |||||
| } | |||||
| @Override | |||||
| public void onClose(int code, String reason) { | |||||
| log.info("onClose=>code : 【{}】,reason : 【{}】",code,reason); | |||||
| try { | |||||
| stdoutOutput.close(); | |||||
| stderrOutput.close(); | |||||
| } catch (IOException e) { | |||||
| log.error("Failed to close stdout and stderr pipes. 【{}】", e); | |||||
| } finally { | |||||
| defaultExecListener.onClose(code, reason); | |||||
| } | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,66 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.config; | |||||
| import cn.hutool.core.util.StrUtil; | |||||
| import io.fabric8.kubernetes.client.KubernetesClient; | |||||
| import org.onebrain.operator.context.KubeContext; | |||||
| import org.onebrain.operator.properties.KubeProperties; | |||||
| import org.springframework.beans.factory.annotation.Autowired; | |||||
| import org.springframework.boot.context.properties.EnableConfigurationProperties; | |||||
| import org.springframework.context.annotation.Bean; | |||||
| import org.springframework.context.annotation.Configuration; | |||||
| /** | |||||
| * @description k8s配置类 | |||||
| * @date 2020-09-23 | |||||
| */ | |||||
| @Configuration | |||||
| @EnableConfigurationProperties(KubeProperties.class) | |||||
| public class KubeConfig { | |||||
| @Autowired | |||||
| private KubeProperties kubeProperties; | |||||
| /** | |||||
| * 注册k8s配置 | |||||
| * @return | |||||
| */ | |||||
| @Bean | |||||
| public KubeContext kubeContext() { | |||||
| if (kubeProperties == null) { | |||||
| return null; | |||||
| } | |||||
| final String configSource = kubeProperties.getKubeconfig(); | |||||
| if(StrUtil.isEmpty(configSource)){ | |||||
| return null; | |||||
| } | |||||
| return new KubeContext(kubeProperties); | |||||
| } | |||||
| /** | |||||
| * 注册k8s客户端 | |||||
| * @param kubeContext k8s配置 | |||||
| * @return | |||||
| */ | |||||
| @Bean | |||||
| public KubernetesClient kubernetesClient(KubeContext kubeContext){ | |||||
| return kubeContext.getClient(); | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,34 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.constants; | |||||
| /** | |||||
| * @description crd 常量信息 | |||||
| * @date 2020-09-23 | |||||
| */ | |||||
| public class CrdConstants { | |||||
| public static final String CRD_GROUP = "onebrain.oneflow.org"; | |||||
| public static final String CRD_SINGULAR_NAME = "distributetrain"; | |||||
| public static final String CRD_PLURAL_NAME = "distributetrains"; | |||||
| public static final String CRD_NAME = CRD_PLURAL_NAME + "." + CRD_GROUP; | |||||
| public static final String CRD_KIND = "DistributeTrain"; | |||||
| public static final String CRD_SCOPE = "Namespaced"; | |||||
| public static final String CRD_SHORT_NAME = "dt"; | |||||
| public static final String CRD_VERSION = "v1alpha1"; | |||||
| public static final String CRD_API_VERSION = "apiextensions.k8s.io/v1beta1"; | |||||
| } | |||||
| @@ -0,0 +1,40 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.constants; | |||||
| /** | |||||
| * @description k8s常量 | |||||
| * @date 2020-09-23 | |||||
| */ | |||||
| public class KubeConstants { | |||||
| public static final String DISTRIBUTE_TRAIN_LABEL = "dt-name"; | |||||
| public static final String STATEFULSET_LABEL = "dt-ss-name"; | |||||
| public static final String JOB_LABEL = "dt-job-name"; | |||||
| public static final String MASTER_CONTAINER_NAME = "distribute-train-master"; | |||||
| public static final String SLAVE_CONTAINER_NAME = "distribute-train-slave"; | |||||
| public final static String USER_DIR_SYSTEM_PROPERTY = "user.dir"; | |||||
| //不许重试 | |||||
| public static final Integer BACKOFFLIMIT = 0; | |||||
| public static final String CHARSET = "utf-8"; | |||||
| public static final String ENV_NODE_NUM = "NODE_NUM"; | |||||
| public static final String VOLUME_SHM = "dshm"; | |||||
| } | |||||
| @@ -0,0 +1,43 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.constants; | |||||
| /** | |||||
| * @Description 数字常量 | |||||
| * @Date 2020-6-9 | |||||
| */ | |||||
| public class NumberConstant { | |||||
| public final static int NUMBER_0 = 0; | |||||
| public final static long LONG_NUMBER_0 = 0L; | |||||
| public final static int NUMBER_1 = 1; | |||||
| public final static int NUMBER_2 = 2; | |||||
| public final static int NUMBER_3 = 3; | |||||
| public final static int NUMBER_5 = 5; | |||||
| public final static int NUMBER_10 = 10; | |||||
| public final static int NUMBER_22 = 22; | |||||
| public final static int NUMBER_30 = 30; | |||||
| public final static int NUMBER_50 = 50; | |||||
| public final static int NUMBER_60 = 60; | |||||
| public final static long LONG_NUMBER_60 = 60L; | |||||
| public final static int HOUR_SECOND = 60 * 60; | |||||
| public final static int DAY_SECOND = 60 * 60 * 24; | |||||
| public final static int WEEK_SECOND = 60 * 60 * 24 * 7; | |||||
| public final static int MAX_PAGE_SIZE = 2000; | |||||
| public final static int NUMBER_30000 = 30000; | |||||
| } | |||||
| @@ -0,0 +1,117 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.context; | |||||
| import cn.hutool.core.util.StrUtil; | |||||
| import com.fasterxml.jackson.core.JsonProcessingException; | |||||
| import io.fabric8.kubernetes.api.model.HasMetadata; | |||||
| import io.fabric8.kubernetes.client.Config; | |||||
| import io.fabric8.kubernetes.client.DefaultKubernetesClient; | |||||
| import io.fabric8.kubernetes.client.KubernetesClient; | |||||
| import io.fabric8.kubernetes.client.VersionInfo; | |||||
| import io.fabric8.kubernetes.client.internal.SerializationUtils; | |||||
| import io.fabric8.kubernetes.client.utils.Utils; | |||||
| import lombok.Getter; | |||||
| import lombok.extern.slf4j.Slf4j; | |||||
| import org.onebrain.operator.properties.KubeProperties; | |||||
| import org.springframework.beans.BeansException; | |||||
| import org.springframework.context.ApplicationContext; | |||||
| import org.springframework.context.ApplicationContextAware; | |||||
| /** | |||||
| * @description k8s上下文 | |||||
| * @date 2020-09-23 | |||||
| */ | |||||
| @Slf4j | |||||
| @Getter | |||||
| public class KubeContext implements ApplicationContextAware { | |||||
| private static final String AUTO = "auto"; | |||||
| private ApplicationContext applicationContext; | |||||
| private KubernetesClient client; | |||||
| private Config config; | |||||
| public KubeContext(KubeProperties kubeProperties) { | |||||
| String configSource = kubeProperties.getKubeconfig(); | |||||
| try { | |||||
| if(AUTO.equals(configSource)){ | |||||
| //在集群内部可自动侦测 | |||||
| log.info("kubernetes client is in cluster mode"); | |||||
| client = new DefaultKubernetesClient(); | |||||
| config = client.getConfiguration(); | |||||
| }else{ | |||||
| if(configSource.startsWith(StrUtil.SLASH)){ | |||||
| log.info("read kubeconfig from file system:{}", configSource); | |||||
| System.setProperty(Config.KUBERNETES_KUBECONFIG_FILE, configSource); | |||||
| }else{ | |||||
| log.info("read kubeconfig from classpath:{}", configSource); | |||||
| final String testKubeconfigFile = Utils.filePath(getClass().getResource(StrUtil.SLASH + configSource)); | |||||
| //修改环境变量,重新指定kubeconfig读取位置 | |||||
| System.setProperty(Config.KUBERNETES_KUBECONFIG_FILE, testKubeconfigFile); | |||||
| } | |||||
| client = new DefaultKubernetesClient(); | |||||
| config = client.getConfiguration(); | |||||
| } | |||||
| //打印集群信息 | |||||
| log.info("ApiVersion : {}", client.getApiVersion()); | |||||
| log.info("MasterUrl : {}", client.getMasterUrl()); | |||||
| if(log.isDebugEnabled()){ | |||||
| VersionInfo versionInfo = client.getVersion(); | |||||
| log.debug("Version details of this Kubernetes cluster :-"); | |||||
| log.debug("Major : {}", versionInfo.getMajor()); | |||||
| log.debug("Minor : {}", versionInfo.getMinor()); | |||||
| log.debug("GitVersion : {}", versionInfo.getGitVersion()); | |||||
| log.debug("GitCommit : {}", versionInfo.getGitCommit()); | |||||
| log.debug("BuildDate : {}", versionInfo.getBuildDate()); | |||||
| log.debug("GitTreeState : {}", versionInfo.getGitTreeState()); | |||||
| log.debug("Platform : {}", versionInfo.getPlatform()); | |||||
| log.debug("GoVersion : {}", versionInfo.getGoVersion()); | |||||
| } | |||||
| }catch (Exception e){ | |||||
| client = null; | |||||
| log.error("初始化 K8sUtils 失败!", e); | |||||
| e.printStackTrace(); | |||||
| } | |||||
| } | |||||
| /** | |||||
| * 导出成yaml字符串 | |||||
| * @param resource k8s元数据 | |||||
| * @return | |||||
| */ | |||||
| public String convertToYaml(HasMetadata resource) { | |||||
| try { | |||||
| return SerializationUtils.dumpAsYaml(resource); | |||||
| } catch (JsonProcessingException e) { | |||||
| e.printStackTrace(); | |||||
| throw new RuntimeException("can not transform resource to yaml"); | |||||
| } | |||||
| } | |||||
| @Override | |||||
| public void setApplicationContext(ApplicationContext applicationContext) throws BeansException { | |||||
| this.applicationContext = applicationContext; | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,131 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.controller; | |||||
| import io.fabric8.kubernetes.client.KubernetesClient; | |||||
| import io.fabric8.kubernetes.client.dsl.MixedOperation; | |||||
| import io.fabric8.kubernetes.client.dsl.Resource; | |||||
| import io.fabric8.kubernetes.client.informers.ResourceEventHandler; | |||||
| import io.fabric8.kubernetes.client.informers.SharedIndexInformer; | |||||
| import io.fabric8.kubernetes.client.informers.cache.Lister; | |||||
| import lombok.extern.slf4j.Slf4j; | |||||
| import org.onebrain.operator.action.handler.DistributeTrainActionHandler; | |||||
| import org.onebrain.operator.crd.DistributeTrain; | |||||
| import org.onebrain.operator.crd.DistributeTrainList; | |||||
| import org.onebrain.operator.crd.DoneableDistributeTrain; | |||||
| import org.springframework.beans.factory.annotation.Autowired; | |||||
| import org.springframework.beans.factory.annotation.Qualifier; | |||||
| import org.springframework.scheduling.annotation.Async; | |||||
| import java.util.concurrent.TimeUnit; | |||||
| /** | |||||
| * @description 分布式训练控制器 | |||||
| * @date 2020-06-16 | |||||
| */ | |||||
| @Slf4j | |||||
| public class DistributeTrainController { | |||||
| @Autowired | |||||
| private KubernetesClient client; | |||||
| /** | |||||
| * 分布式训练informer | |||||
| */ | |||||
| private SharedIndexInformer<DistributeTrain> distributeTrainSharedIndexInformer; | |||||
| /** | |||||
| * 分布式训练k8s访问客户端 | |||||
| */ | |||||
| private MixedOperation<DistributeTrain, DistributeTrainList, DoneableDistributeTrain, Resource<DistributeTrain, DoneableDistributeTrain>> distributeTrainClient; | |||||
| /** | |||||
| * 分布式训练lister | |||||
| */ | |||||
| private Lister<DistributeTrain> distributeTrainLister; | |||||
| @Autowired | |||||
| @Qualifier("addActionHandler") | |||||
| private DistributeTrainActionHandler addActionHandler; | |||||
| @Autowired | |||||
| @Qualifier("deleteActionHandler") | |||||
| private DistributeTrainActionHandler deleteActionHandler; | |||||
| public DistributeTrainController(MixedOperation<DistributeTrain, DistributeTrainList, DoneableDistributeTrain, Resource<DistributeTrain, DoneableDistributeTrain>> distributeTrainClient, SharedIndexInformer<DistributeTrain> distributeTrainSharedIndexInformer, String namespace) { | |||||
| this.distributeTrainSharedIndexInformer = distributeTrainSharedIndexInformer; | |||||
| this.distributeTrainClient = distributeTrainClient; | |||||
| this.distributeTrainLister = new Lister<>(distributeTrainSharedIndexInformer.getIndexer()); | |||||
| } | |||||
| /** | |||||
| * 添加事件监听器 | |||||
| */ | |||||
| public void create() { | |||||
| distributeTrainSharedIndexInformer.addEventHandler(new ResourceEventHandler<DistributeTrain>() { | |||||
| /** | |||||
| * 处理添加事件 | |||||
| * @param distributeTrain 分布式训练信息 | |||||
| */ | |||||
| @Override | |||||
| public void onAdd(DistributeTrain distributeTrain) { | |||||
| log.info("add distributeTrain named 【{}】 in namespace 【{}】", distributeTrain.getMetadata().getName(), distributeTrain.getMetadata().getNamespace()); | |||||
| addActionHandler.handlerAction(distributeTrain); | |||||
| } | |||||
| /** | |||||
| * 处理更内心事件 | |||||
| * @param distributeTrain 旧的 分布式训练信息 | |||||
| * @param newDistributeTrain 新的 分布式训练信息 | |||||
| */ | |||||
| @Override | |||||
| public void onUpdate(DistributeTrain distributeTrain, DistributeTrain newDistributeTrain) { | |||||
| log.info("update distributeTrain named 【{}】 in namespace 【{}】", distributeTrain.getMetadata().getName(), distributeTrain.getMetadata().getNamespace()); | |||||
| } | |||||
| /** | |||||
| * 处理删除事件 | |||||
| * @param distributeTrain 分布式训练信息 | |||||
| * @param b 是否为未知事件 | |||||
| */ | |||||
| @Override | |||||
| public void onDelete(DistributeTrain distributeTrain, boolean b) { | |||||
| log.info("delete distributeTrain named 【{}】 in namespace 【{}】", distributeTrain.getMetadata().getName(), distributeTrain.getMetadata().getNamespace()); | |||||
| deleteActionHandler.handlerAction(distributeTrain); | |||||
| } | |||||
| }); | |||||
| } | |||||
| /** | |||||
| * 运行 | |||||
| */ | |||||
| @Async | |||||
| public void run() { | |||||
| log.info("Starting DistributeTrain controller"); | |||||
| try { | |||||
| //分布式训练信息同步 | |||||
| while (!distributeTrainSharedIndexInformer.hasSynced()){ | |||||
| TimeUnit.SECONDS.sleep(1); | |||||
| } | |||||
| } catch (InterruptedException e) { | |||||
| e.printStackTrace(); | |||||
| log.error("run error:【{}】",e); | |||||
| } | |||||
| log.info("DistributeTrain controller is Running"); | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,47 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.crd; | |||||
| import io.fabric8.kubernetes.api.model.ObjectMeta; | |||||
| import io.fabric8.kubernetes.client.CustomResource; | |||||
| import lombok.Data; | |||||
| import lombok.NoArgsConstructor; | |||||
| /** | |||||
| * @description 分布式训练 | |||||
| * @date 2020-09-24 | |||||
| */ | |||||
| @Data | |||||
| @NoArgsConstructor | |||||
| public class DistributeTrain extends CustomResource { | |||||
| /** | |||||
| * 分布式训练详细规格 | |||||
| */ | |||||
| private DistributeTrainSpec spec; | |||||
| /** | |||||
| * 分布式训练状态 | |||||
| */ | |||||
| private DistributeTrainStatus status; | |||||
| public DistributeTrain(ObjectMeta objectMeta, DistributeTrainSpec spec) { | |||||
| this.setMetadata(objectMeta); | |||||
| this.spec = spec; | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,27 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.crd; | |||||
| import io.fabric8.kubernetes.client.CustomResourceList; | |||||
| /** | |||||
| * @description CRD资源列表(分布式训练) | |||||
| * @date 2020-09-24 | |||||
| */ | |||||
| public class DistributeTrainList extends CustomResourceList<DistributeTrain> { | |||||
| } | |||||
| @@ -0,0 +1,108 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.crd; | |||||
| import com.fasterxml.jackson.databind.JsonDeserializer; | |||||
| import com.fasterxml.jackson.databind.annotation.JsonDeserialize; | |||||
| import io.fabric8.kubernetes.api.model.*; | |||||
| import lombok.AllArgsConstructor; | |||||
| import lombok.Builder; | |||||
| import lombok.Data; | |||||
| import lombok.NoArgsConstructor; | |||||
| import java.util.List; | |||||
| import java.util.Map; | |||||
| /** | |||||
| * @description 分布式训练详细规格 | |||||
| * @date 2020-09-23 | |||||
| */ | |||||
| @JsonDeserialize( | |||||
| using = JsonDeserializer.None.class | |||||
| ) | |||||
| @Data | |||||
| @NoArgsConstructor | |||||
| @AllArgsConstructor | |||||
| @Builder | |||||
| public class DistributeTrainSpec implements KubernetesResource { | |||||
| /** | |||||
| * 镜像 | |||||
| */ | |||||
| private String image; | |||||
| /** | |||||
| * 镜像拉取策略 | |||||
| */ | |||||
| private String imagePullPolicy; | |||||
| /** | |||||
| * 机器数 | |||||
| */ | |||||
| private Integer size; | |||||
| /** | |||||
| * 环境参数 | |||||
| */ | |||||
| private List<EnvVar> env; | |||||
| /** | |||||
| * master 命令 | |||||
| */ | |||||
| private String masterCmd; | |||||
| /** | |||||
| * slave命令 | |||||
| */ | |||||
| private String slaveCmd; | |||||
| /** | |||||
| * master 资源节点限制 | |||||
| */ | |||||
| private ResourceRequirements masterResources; | |||||
| /** | |||||
| * slave 资源节点限制 | |||||
| */ | |||||
| private ResourceRequirements slaveResources; | |||||
| /** | |||||
| * 节点调度选择器 | |||||
| */ | |||||
| private Map<String,String> nodeSelector; | |||||
| /** | |||||
| * 初始化容器 | |||||
| */ | |||||
| private Container initContainer; | |||||
| /** | |||||
| * 工作目录挂载 | |||||
| */ | |||||
| private Volume workspaceStorage; | |||||
| /** | |||||
| * 数据集目录挂载 | |||||
| */ | |||||
| private Volume datasetStorage; | |||||
| /** | |||||
| * 模型目录挂载 | |||||
| */ | |||||
| private Volume modelStorage; | |||||
| } | |||||
| @@ -0,0 +1,55 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.crd; | |||||
| import com.fasterxml.jackson.databind.JsonDeserializer; | |||||
| import com.fasterxml.jackson.databind.annotation.JsonDeserialize; | |||||
| import io.fabric8.kubernetes.api.model.KubernetesResource; | |||||
| import lombok.Data; | |||||
| /** | |||||
| * @description 分布式训练状态 | |||||
| * @date 2020-09-23 | |||||
| */ | |||||
| @JsonDeserialize( | |||||
| using = JsonDeserializer.None.class | |||||
| ) | |||||
| @Data | |||||
| public class DistributeTrainStatus implements KubernetesResource { | |||||
| /** | |||||
| * 副本数 | |||||
| */ | |||||
| private Integer replicas; | |||||
| /** | |||||
| * 处在ready状态的副本数 | |||||
| */ | |||||
| private Integer readyReplicas; | |||||
| /** | |||||
| * 成功数 | |||||
| */ | |||||
| private Integer success; | |||||
| /** | |||||
| * 失败数 | |||||
| */ | |||||
| private Integer failed; | |||||
| } | |||||
| @@ -0,0 +1,31 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.crd; | |||||
| import io.fabric8.kubernetes.api.builder.Function; | |||||
| import io.fabric8.kubernetes.client.CustomResourceDoneable; | |||||
| /** | |||||
| * @description CRD资源的修改Builder | |||||
| * @date 2020-09-24 | |||||
| */ | |||||
| public class DoneableDistributeTrain extends CustomResourceDoneable<DistributeTrain> { | |||||
| public DoneableDistributeTrain(DistributeTrain resource, Function<DistributeTrain, DistributeTrain> function) { | |||||
| super(resource, function); | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,56 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.enums; | |||||
| /** | |||||
| * @description pvc的访问模式 | |||||
| * @date 2020-09-24 | |||||
| */ | |||||
| public enum AccessModeEnum { | |||||
| /** | |||||
| * RWO是最基本的方式,可读可写,但只支持被单个Pod挂载 | |||||
| */ | |||||
| RWO("ReadWriteOnce"), | |||||
| /** | |||||
| * 可以以只读的方式被多个Pod挂载 | |||||
| */ | |||||
| ROX("ReadOnlyMany"), | |||||
| /****/ | |||||
| /** | |||||
| * 这种存储可以以读写的方式被多个Pod共享。 | |||||
| * 不是每一种存储都支持这三种方式,像共享方式,目前支持的还比较少,比较常用的是NFS。 | |||||
| * 在PVC绑定PV时通常根据两个条件来绑定,一个是存储的大小,另一个就是访问模式。 | |||||
| */ | |||||
| RWX("ReadWriteMany"); | |||||
| /** | |||||
| * 模式 | |||||
| */ | |||||
| private final String mode; | |||||
| AccessModeEnum(String mode) { | |||||
| this.mode = mode; | |||||
| } | |||||
| public String getMode() { | |||||
| return mode; | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,49 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.exception; | |||||
| import lombok.Getter; | |||||
| import lombok.extern.slf4j.Slf4j; | |||||
| /** | |||||
| * @description Operator自定义异常 | |||||
| * @date 2020-09-24 | |||||
| */ | |||||
| @Slf4j | |||||
| @Getter | |||||
| public class OperatorException extends RuntimeException{ | |||||
| /** | |||||
| * 信息 | |||||
| */ | |||||
| private String msg; | |||||
| /** | |||||
| * 原因 | |||||
| */ | |||||
| private Throwable cause; | |||||
| public OperatorException(String msg, Throwable cause) { | |||||
| this.msg = msg; | |||||
| this.cause = cause; | |||||
| } | |||||
| public OperatorException(String msg) { | |||||
| this.msg = msg; | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,34 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.properties; | |||||
| import lombok.Data; | |||||
| import org.springframework.boot.context.properties.ConfigurationProperties; | |||||
| import org.springframework.stereotype.Component; | |||||
| /** | |||||
| * @description 属性配置 | |||||
| * @date 2020-09-24 | |||||
| */ | |||||
| @Data | |||||
| @ConfigurationProperties("k8s") | |||||
| @Component | |||||
| public class KubeProperties { | |||||
| private String kubeconfig; | |||||
| } | |||||
| @@ -0,0 +1,65 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.redis; | |||||
| /** | |||||
| * @description redis Key 前缀 | |||||
| * @date 2020-09-23 | |||||
| */ | |||||
| public abstract class AbstractKeyPrefix { | |||||
| /** | |||||
| * key模板 | |||||
| */ | |||||
| private static final String KEY_TEMPLATE = "Operator:%s"; | |||||
| /** | |||||
| * 过期时间 | |||||
| */ | |||||
| private int expireSeconds; | |||||
| /** | |||||
| * 前缀 | |||||
| */ | |||||
| private String prefix; | |||||
| public AbstractKeyPrefix(String prefix) {//0代表永不过期 | |||||
| this(prefix,0); | |||||
| } | |||||
| public AbstractKeyPrefix(String prefix, int expireSeconds) { | |||||
| this.expireSeconds = expireSeconds; | |||||
| this.prefix = prefix; | |||||
| } | |||||
| /** | |||||
| * 获取过期时间 | |||||
| * @return | |||||
| */ | |||||
| public int getExpireSeconds() {//默认0代表永不过期 | |||||
| return expireSeconds; | |||||
| } | |||||
| /** | |||||
| * 获取前缀 | |||||
| * @return | |||||
| */ | |||||
| public String getPrefix() { | |||||
| return String.format(KEY_TEMPLATE, prefix); | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,290 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.redis; | |||||
| import org.onebrain.operator.utils.FastjsonUtils; | |||||
| import org.onebrain.operator.utils.RedisUtils; | |||||
| import org.springframework.beans.factory.annotation.Autowired; | |||||
| import org.springframework.stereotype.Service; | |||||
| import java.util.Set; | |||||
| /** | |||||
| * @description redis服务 | |||||
| * @date 2020-09-03 | |||||
| */ | |||||
| @Service | |||||
| public class RedisService { | |||||
| @Autowired | |||||
| private RedisUtils redisUtils; | |||||
| /** | |||||
| * 真正key模板 | |||||
| */ | |||||
| private static final String REAL_KEY_TEMPLATE = "%s:%s"; | |||||
| /** | |||||
| * 获取真正的key | |||||
| * @param prefix 前缀 | |||||
| * @param key key值 | |||||
| * @return 放入redis里的key值 | |||||
| */ | |||||
| private String getRealKey(AbstractKeyPrefix prefix, String key){ | |||||
| return String.format(REAL_KEY_TEMPLATE, prefix.getPrefix(), key); | |||||
| } | |||||
| /** | |||||
| * 实现命令:TTL key,以秒为单位,返回给定 key的剩余生存时间(TTL, time to live)。 | |||||
| * @param prefix 前缀 | |||||
| * @param key key值 | |||||
| * @return 返回过期时间秒数 | |||||
| */ | |||||
| public long ttl(AbstractKeyPrefix prefix, String key) { | |||||
| return redisUtils.ttl(getRealKey(prefix, key)); | |||||
| } | |||||
| /** | |||||
| * 实现命令:expire 设置过期时间,单位秒 | |||||
| * @param prefix 前缀 | |||||
| * @param key key值 | |||||
| * @param timeout 期望过期时间 | |||||
| */ | |||||
| public void expire(AbstractKeyPrefix prefix, String key, long timeout) { | |||||
| redisUtils.expire(getRealKey(prefix, key), timeout); | |||||
| } | |||||
| /** | |||||
| * 实现命令:INCR key,增加key一次 | |||||
| * @param prefix 前缀 | |||||
| * @param key key值 | |||||
| * @param delta 增量 | |||||
| * @return 计数值 | |||||
| */ | |||||
| public long incr(AbstractKeyPrefix prefix, String key, long delta) { | |||||
| return redisUtils.incr(getRealKey(prefix, key), delta); | |||||
| } | |||||
| /** | |||||
| * 实现命令: key,减少key一次 | |||||
| * @param prefix 前缀 | |||||
| * @param key key值 | |||||
| * @param delta 增量 | |||||
| * @return 计数值 | |||||
| */ | |||||
| public long decr(AbstractKeyPrefix prefix, String key, long delta) { | |||||
| String realKey = getRealKey(prefix, key); | |||||
| if(delta < 0){ | |||||
| //throw new RuntimeException("递减因子必须大于0"); | |||||
| del(realKey); | |||||
| return 0; | |||||
| } | |||||
| return redisUtils.decr(realKey, delta); | |||||
| } | |||||
| /** | |||||
| * 实现命令:KEYS pattern,查找所有符合给定模式 pattern的 key | |||||
| * @param prefix key前缀 | |||||
| * @return key集合 | |||||
| */ | |||||
| public Set<String> keys(AbstractKeyPrefix prefix) { | |||||
| String pattern = prefix.getPrefix(); | |||||
| return redisUtils.keys(pattern + ":*"); | |||||
| } | |||||
| /** | |||||
| * 实现命令:KEYS pattern,查找所有符合给定模式 pattern的 key | |||||
| * @param prefix key前缀 | |||||
| * @param key key值 | |||||
| * @return key集合 | |||||
| */ | |||||
| public Set<String> keys(AbstractKeyPrefix prefix, String key) { | |||||
| String pattern = prefix.getPrefix(); | |||||
| return redisUtils.keys(pattern + ":" + key + ":*"); | |||||
| } | |||||
| /** | |||||
| * 实现命令:DEL key,删除一个key | |||||
| * @param prefix key前缀 | |||||
| * @param key key值 | |||||
| */ | |||||
| public void del(AbstractKeyPrefix prefix, String key) { | |||||
| redisUtils.del(getRealKey(prefix, key)); | |||||
| } | |||||
| /** | |||||
| * 删除一个key | |||||
| * @param realKey 真正的key | |||||
| */ | |||||
| public void del(String realKey) { | |||||
| redisUtils.del(realKey); | |||||
| } | |||||
| /** | |||||
| * 实现命令:SET key value,设置一个key-value(将字符串值 value关联到 key) | |||||
| * @param prefix key前缀 | |||||
| * @param key key值 | |||||
| * @param value 值 | |||||
| */ | |||||
| public void set(AbstractKeyPrefix prefix, String key, String value) { | |||||
| if(prefix.getExpireSeconds() <= 0){ | |||||
| redisUtils.set(getRealKey(prefix, key), value); | |||||
| }else{ | |||||
| redisUtils.set(getRealKey(prefix, key), value, prefix.getExpireSeconds()); | |||||
| } | |||||
| } | |||||
| /** | |||||
| * 实现命令:SET key value,设置一个key-value(将字符串值 value关联到 key) | |||||
| * @param prefix key前缀 | |||||
| * @param key key值 | |||||
| * @param value 值 | |||||
| * @param <T> 指定类型 | |||||
| */ | |||||
| public <T> void set(AbstractKeyPrefix prefix, String key, T value) { | |||||
| if(prefix.getExpireSeconds() <= 0){ | |||||
| redisUtils.set(getRealKey(prefix, key), FastjsonUtils.convertObjectToJSON(value)); | |||||
| }else{ | |||||
| redisUtils.set(getRealKey(prefix, key), FastjsonUtils.convertObjectToJSON(value), prefix.getExpireSeconds()); | |||||
| } | |||||
| } | |||||
| /** | |||||
| * 实现命令:SET key value EX seconds,设置key-value和超时时间(秒) | |||||
| * @param prefix key前缀 | |||||
| * @param key key值 | |||||
| * @param value 值 | |||||
| * @param timeout 过期时间 | |||||
| */ | |||||
| public void set(AbstractKeyPrefix prefix, String key, String value, long timeout) { | |||||
| redisUtils.set(getRealKey(prefix, key), value, timeout); | |||||
| } | |||||
| /** | |||||
| * 实现命令:SET key value EX seconds,设置key-value和超时时间(秒) | |||||
| * @param prefix key前缀 | |||||
| * @param key key值 | |||||
| * @param value 值 | |||||
| * @param timeout 过期时间 | |||||
| * @param <T> 指定类型 | |||||
| */ | |||||
| public <T> void set(AbstractKeyPrefix prefix, String key, T value, long timeout) { | |||||
| redisUtils.set(getRealKey(prefix, key), FastjsonUtils.convertObjectToJSON(value), timeout); | |||||
| } | |||||
| /** | |||||
| * 实现命令:SETNX key value,设置一个key-value(将字符串值 value关联到 key) | |||||
| * @param prefix key前缀 | |||||
| * @param key key值 | |||||
| * @param value 值 | |||||
| * @return 是否设值成功 | |||||
| */ | |||||
| public Boolean setnx(AbstractKeyPrefix prefix, String key, String value){ | |||||
| if(prefix.getExpireSeconds() <= 0){ | |||||
| return redisUtils.setnx(getRealKey(prefix, key), value); | |||||
| }else{ | |||||
| return redisUtils.setnx(getRealKey(prefix, key), value, prefix.getExpireSeconds()); | |||||
| } | |||||
| } | |||||
| /** | |||||
| * 实现命令:SETNX key value,设置一个key-value(将字符串值 value关联到 key) | |||||
| * @param prefix key前缀 | |||||
| * @param key key值 | |||||
| * @param value 值 | |||||
| * @param <T> 指定类型 | |||||
| * @return 是否设值成功 | |||||
| */ | |||||
| public <T> Boolean setnx(AbstractKeyPrefix prefix, String key, T value){ | |||||
| if(prefix.getExpireSeconds() <= 0){ | |||||
| return redisUtils.setnx(getRealKey(prefix, key), FastjsonUtils.convertObjectToJSON(value)); | |||||
| }else{ | |||||
| return redisUtils.setnx(getRealKey(prefix, key), FastjsonUtils.convertObjectToJSON(value), prefix.getExpireSeconds()); | |||||
| } | |||||
| } | |||||
| /** | |||||
| * 实现命令:SETNX key value EX seconds,设置key-value和超时时间(秒) | |||||
| * @param prefix key前缀 | |||||
| * @param key key值 | |||||
| * @param value 值 | |||||
| * @param timeout 过期时间 | |||||
| * @return 是否设值成功 | |||||
| */ | |||||
| public Boolean setnx(AbstractKeyPrefix prefix, String key, String value, long timeout) { | |||||
| return redisUtils.setnx(getRealKey(prefix, key), value, timeout); | |||||
| } | |||||
| /** | |||||
| * 实现命令:SETNX key value EX seconds,设置key-value和超时时间(秒) | |||||
| * @param prefix key前缀 | |||||
| * @param key key值 | |||||
| * @param value 值 | |||||
| * @param timeout 过期时间 | |||||
| * @param <T> 指定类型 | |||||
| * @return 是否设值成功 | |||||
| */ | |||||
| public <T> Boolean setnx(AbstractKeyPrefix prefix, String key, T value, long timeout) { | |||||
| return redisUtils.setnx(getRealKey(prefix, key), FastjsonUtils.convertObjectToJSON(value), timeout); | |||||
| } | |||||
| /** | |||||
| * 实现命令:GET key,返回 key所关联的字符串值。 | |||||
| * @param prefix key前缀 | |||||
| * @param key key值 | |||||
| * @return 值 | |||||
| */ | |||||
| public String get(AbstractKeyPrefix prefix, String key) { | |||||
| return redisUtils.get(getRealKey(prefix, key)); | |||||
| } | |||||
| /** | |||||
| * 实现命令:GET key,返回 key所关联的字符串值。 | |||||
| * @param prefix key前缀 | |||||
| * @param key key值 | |||||
| * @param <T> 指定类型 | |||||
| * @return 值 | |||||
| */ | |||||
| public <T> T get(AbstractKeyPrefix prefix, String key, Class<T> clazz) { | |||||
| return redisUtils.get(getRealKey(prefix, key), clazz); | |||||
| } | |||||
| /** | |||||
| * 根据key获取值 | |||||
| * @param lastKey 真正的key | |||||
| * @param clazz 类型 | |||||
| * @param <T> 泛型 | |||||
| * @return | |||||
| */ | |||||
| public <T> T get(String lastKey, Class<T> clazz) { | |||||
| return redisUtils.get(lastKey, clazz); | |||||
| } | |||||
| /** | |||||
| * 实现命令:GET key,返回 key所关联的字符串值。 | |||||
| * @param prefix key前缀 | |||||
| * @param key key值 | |||||
| * @return 是否存在 | |||||
| */ | |||||
| public Boolean exists(AbstractKeyPrefix prefix, String key) { | |||||
| return redisUtils.exists(getRealKey(prefix, key)); | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,45 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.redis.key; | |||||
| import org.onebrain.operator.redis.AbstractKeyPrefix; | |||||
| /** | |||||
| * @description 由operator产生的cr的唯一标识 | |||||
| * @date 2020-09-23 | |||||
| */ | |||||
| public class OperatorKey extends AbstractKeyPrefix { | |||||
| public OperatorKey(String prefix) { | |||||
| super(prefix); | |||||
| } | |||||
| public OperatorKey(String prefix, int expireSeconds) { | |||||
| super(prefix, expireSeconds); | |||||
| } | |||||
| /** | |||||
| * 分布式训练 Key | |||||
| */ | |||||
| public static final OperatorKey CR = new OperatorKey("DistributeTrain"); | |||||
| /** | |||||
| * 分布式训练Job Key | |||||
| */ | |||||
| public static final OperatorKey CR_JOB = new OperatorKey("DistributeTrain:Job"); | |||||
| } | |||||
| @@ -0,0 +1,41 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.utils; | |||||
| import io.fabric8.kubernetes.client.dsl.MixedOperation; | |||||
| import io.fabric8.kubernetes.client.dsl.Resource; | |||||
| import org.onebrain.operator.crd.DistributeTrain; | |||||
| import org.onebrain.operator.crd.DistributeTrainList; | |||||
| import org.onebrain.operator.crd.DoneableDistributeTrain; | |||||
| /** | |||||
| * @description 分布式训练客户端持有器 | |||||
| * @date 2020-09-23 | |||||
| */ | |||||
| public class DistributeTrainClientHolder { | |||||
| private static MixedOperation<DistributeTrain, DistributeTrainList, DoneableDistributeTrain, Resource<DistributeTrain, DoneableDistributeTrain>> distributeTrainClient; | |||||
| public static MixedOperation<DistributeTrain, DistributeTrainList, DoneableDistributeTrain, Resource<DistributeTrain, DoneableDistributeTrain>> getClient(){ | |||||
| return distributeTrainClient; | |||||
| } | |||||
| public static void setDistributeTrainClient(MixedOperation<DistributeTrain, DistributeTrainList, DoneableDistributeTrain, Resource<DistributeTrain, DoneableDistributeTrain>> client){ | |||||
| distributeTrainClient = client; | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,188 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.utils; | |||||
| import com.alibaba.fastjson.JSON; | |||||
| import com.alibaba.fastjson.JSONObject; | |||||
| import com.alibaba.fastjson.serializer.SerializerFeature; | |||||
| import java.util.List; | |||||
| import java.util.Map; | |||||
| /** | |||||
| * @description json工具类 | |||||
| * @date 2020-09-24 | |||||
| */ | |||||
| public class FastjsonUtils { | |||||
| private static final SerializerFeature[] FEATURES = { | |||||
| // 输出空置字段 | |||||
| SerializerFeature.WriteMapNullValue, | |||||
| //日期类型用日期字符串 yyyy-MM-dd HH:mm:ss | |||||
| SerializerFeature.WriteDateUseDateFormat, | |||||
| // list字段如果为null,输出为[],而不是null | |||||
| SerializerFeature.WriteNullListAsEmpty, | |||||
| // 数值字段如果为null,输出为0,而不是null | |||||
| SerializerFeature.WriteNullNumberAsZero, | |||||
| // Boolean字段如果为null,输出为false,而不是null | |||||
| SerializerFeature.WriteNullBooleanAsFalse, | |||||
| // 字符类型字段如果为null,输出为"",而不是null | |||||
| SerializerFeature.WriteNullStringAsEmpty | |||||
| }; | |||||
| /** | |||||
| * 将对象转为json | |||||
| * @param object | |||||
| * @return json的String | |||||
| */ | |||||
| public static String convertObjectToJSON(Object object) { | |||||
| return JSON.toJSONString(object, FEATURES); | |||||
| } | |||||
| /** | |||||
| * 将对象转为json(无循环引用) | |||||
| * @param object | |||||
| * @return json的String | |||||
| */ | |||||
| public static String toJSONNoFeatures(Object object) { | |||||
| return JSON.toJSONString(object, SerializerFeature.DisableCircularReferenceDetect); | |||||
| } | |||||
| /** | |||||
| * 将json转为对象 | |||||
| * @param text | |||||
| * @return 对象 | |||||
| */ | |||||
| public static Object toBean(String text) { | |||||
| return JSON.parse(text); | |||||
| } | |||||
| /** | |||||
| * 将json转为对象 | |||||
| * @param text 文本字符串 | |||||
| * @param clazz 类型 | |||||
| * @param <T> 泛型 | |||||
| * @return 泛型对象 | |||||
| */ | |||||
| public static <T> T toBean(String text, Class<T> clazz) { | |||||
| return JSON.parseObject(text, clazz); | |||||
| } | |||||
| /** | |||||
| * 转换为数组 | |||||
| * @param text 文本字符串 | |||||
| * @return 泛型对象 | |||||
| */ | |||||
| public static <T> Object[] toArray(String text) { | |||||
| return toArray(text, null); | |||||
| } | |||||
| /** | |||||
| * 转换为数组 | |||||
| * @param text 文本字符串 | |||||
| * @param clazz 类型 | |||||
| * @return | |||||
| */ | |||||
| public static <T> Object[] toArray(String text, Class<T> clazz) { | |||||
| return JSON.parseArray(text, clazz).toArray(); | |||||
| } | |||||
| /** | |||||
| * 转换为List | |||||
| * @param text 文本字符串 | |||||
| * @param clazz 类型 | |||||
| * @return | |||||
| */ | |||||
| public static <T> List<T> toList(String text, Class<T> clazz) { | |||||
| return JSON.parseArray(text, clazz); | |||||
| } | |||||
| /** | |||||
| * 将string转化为序列化的json字符串 | |||||
| * @param text 文本字符串 | |||||
| * @return json对象 | |||||
| */ | |||||
| public static Object textToJson(String text) { | |||||
| Object objectJson = JSON.parse(text); | |||||
| return objectJson; | |||||
| } | |||||
| /** | |||||
| * json字符串转化为map | |||||
| * @param text json字符串 | |||||
| * @return Map集合 | |||||
| */ | |||||
| public static <K, V> Map<K, V> stringToCollect(String text) { | |||||
| Map<K, V> m = (Map<K, V>) JSONObject.parseObject(text); | |||||
| return m; | |||||
| } | |||||
| /** | |||||
| * 转换JSON字符串为对象 | |||||
| * @param jsonData json字符串 | |||||
| * @param clazz 转换目标对象的类型 | |||||
| * @return json对象 | |||||
| */ | |||||
| public static Object convertJsonToObject(String jsonData, Class<?> clazz) { | |||||
| return JSONObject.parseObject(jsonData, clazz); | |||||
| } | |||||
| /** | |||||
| * 将map转化为string | |||||
| * @param m Map集合 | |||||
| * @return 字符串 | |||||
| */ | |||||
| public static <K, V> String collectToString(Map<K, V> m) { | |||||
| String s = JSONObject.toJSONString(m); | |||||
| return s; | |||||
| } | |||||
| /** | |||||
| * json字符串转化为map | |||||
| * | |||||
| * @param text 字符串 | |||||
| * @return Map 对象 | |||||
| */ | |||||
| public static Map stringToMap(String text) { | |||||
| Map m = JSONObject.parseObject(text); | |||||
| return m; | |||||
| } | |||||
| /** | |||||
| * 将map转化为string | |||||
| * | |||||
| * @param m Map集合 | |||||
| * @return 字符串 | |||||
| */ | |||||
| public static String mapToString(Map m) { | |||||
| String s = JSONObject.toJSONString(m); | |||||
| return s; | |||||
| } | |||||
| /** | |||||
| * 把对象转换为指定对象 | |||||
| * @param source 原对象 | |||||
| * @param target 目标class | |||||
| * @param <T> 泛型 | |||||
| * @return 泛型对象 | |||||
| */ | |||||
| public static <T> T toObjectFromSource(Object source,Class<T> target) { | |||||
| return toBean(convertObjectToJSON(source), target); | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,56 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.utils; | |||||
| import lombok.extern.slf4j.Slf4j; | |||||
| import java.io.File; | |||||
| import java.io.FileOutputStream; | |||||
| import java.io.IOException; | |||||
| import java.io.InputStream; | |||||
| /** | |||||
| * @description IO工具类 | |||||
| * @date 2020-09-24 | |||||
| */ | |||||
| @Slf4j | |||||
| public class IOUtils { | |||||
| /** | |||||
| * 将input流转换为文件 | |||||
| * | |||||
| * @param is 输入流 | |||||
| * @param targetFile 目标文件 | |||||
| */ | |||||
| public static void copy(InputStream is, File targetFile) { | |||||
| try (FileOutputStream fos = new FileOutputStream(targetFile)) { | |||||
| byte[] b = new byte[1024]; | |||||
| int readCount = is.read(b); | |||||
| while (readCount != -1) { | |||||
| // 写入数据 | |||||
| fos.write(b, 0, readCount); | |||||
| readCount = is.read(b); | |||||
| } | |||||
| is.close(); | |||||
| fos.flush(); | |||||
| } catch (IOException e) { | |||||
| log.error("copy file error:【{}】", e); | |||||
| } | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,289 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.utils; | |||||
| import org.springframework.beans.factory.annotation.Autowired; | |||||
| import org.springframework.data.redis.core.StringRedisTemplate; | |||||
| import org.springframework.stereotype.Component; | |||||
| import java.util.Map; | |||||
| import java.util.Set; | |||||
| import java.util.concurrent.TimeUnit; | |||||
| /** | |||||
| * @description 封装redis简单的key-value操作 | |||||
| * @date 2020-09-23 | |||||
| */ | |||||
| @Component | |||||
| public class RedisUtils { | |||||
| @Autowired | |||||
| private StringRedisTemplate redisTemplate; | |||||
| /** | |||||
| * 实现命令:TTL key,以秒为单位,返回给定 key的剩余生存时间(TTL, time to live)。 | |||||
| * @param key key值 | |||||
| * @return 返回过期时间秒数 | |||||
| */ | |||||
| public long ttl(String key) { | |||||
| return redisTemplate.getExpire(key); | |||||
| } | |||||
| /** | |||||
| * 实现命令:expire 设置过期时间,单位秒 | |||||
| * @param key key值 | |||||
| * @param timeout 期望过期时间 | |||||
| */ | |||||
| public void expire(String key, long timeout) { | |||||
| redisTemplate.expire(key, timeout, TimeUnit.SECONDS); | |||||
| } | |||||
| /** | |||||
| * 实现命令:INCR key,增加key一次 | |||||
| * @param key key值 | |||||
| * @param delta 增量 | |||||
| * @return 计数值 | |||||
| */ | |||||
| public long incr(String key, long delta) { | |||||
| return redisTemplate.opsForValue().increment(key, delta); | |||||
| } | |||||
| /** | |||||
| * 实现命令: key,减少key一次 | |||||
| * @param key key值 | |||||
| * @param delta 增量 | |||||
| * @return 计数值 | |||||
| */ | |||||
| public long decr(String key, long delta) { | |||||
| if(delta < 0){ | |||||
| //throw new RuntimeException("递减因子必须大于0"); | |||||
| del(key); | |||||
| return 0; | |||||
| } | |||||
| return redisTemplate.opsForValue().increment(key, -delta); | |||||
| } | |||||
| /** | |||||
| * 实现命令:KEYS pattern,查找所有符合给定模式 pattern的 key | |||||
| * @return key集合 | |||||
| */ | |||||
| public Set<String> keys(String pattern) { | |||||
| return redisTemplate.keys(pattern); | |||||
| } | |||||
| /** | |||||
| * 实现命令:DEL key,删除一个key | |||||
| * @param key key值 | |||||
| */ | |||||
| public void del(String key) { | |||||
| redisTemplate.delete(key); | |||||
| } | |||||
| /** | |||||
| * 实现命令:SET key value,设置一个key-value(将字符串值 value关联到 key) | |||||
| * @param key key值 | |||||
| * @param value 值 | |||||
| */ | |||||
| public void set(String key, String value) { | |||||
| redisTemplate.opsForValue().set(key, value); | |||||
| } | |||||
| /** | |||||
| * 实现命令:SET key value,设置一个key-value(将字符串值 value关联到 key) | |||||
| * @param key key值 | |||||
| * @param value 值 | |||||
| * @param <T> 指定类型 | |||||
| */ | |||||
| public <T> void set(String key, T value) { | |||||
| redisTemplate.opsForValue().set(key, FastjsonUtils.convertObjectToJSON(value)); | |||||
| } | |||||
| /** | |||||
| * 实现命令:SET key value EX seconds,设置key-value和超时时间(秒) | |||||
| * @param key key值 | |||||
| * @param value 值 | |||||
| * @param timeout 过期时间 | |||||
| */ | |||||
| public void set(String key, String value, long timeout) { | |||||
| redisTemplate.opsForValue().set(key, value, timeout, TimeUnit.SECONDS); | |||||
| } | |||||
| /** | |||||
| * 实现命令:SET key value EX seconds,设置key-value和超时时间(秒) | |||||
| * @param key key值 | |||||
| * @param value 值 | |||||
| * @param timeout 过期时间 | |||||
| * @param <T> 指定类型 | |||||
| */ | |||||
| public <T> void set(String key, T value, long timeout) { | |||||
| redisTemplate.opsForValue().set(key, FastjsonUtils.convertObjectToJSON(value), timeout, TimeUnit.SECONDS); | |||||
| } | |||||
| /** | |||||
| * 实现命令:SETNX key value,设置一个key-value(将字符串值 value关联到 key) | |||||
| * @param key key值 | |||||
| * @param value 值 | |||||
| * @return 是否设值成功 | |||||
| */ | |||||
| public Boolean setnx(String key, String value){ | |||||
| return redisTemplate.opsForValue().setIfAbsent(key, value); | |||||
| } | |||||
| /** | |||||
| * 实现命令:SETNX key value,设置一个key-value(将字符串值 value关联到 key) | |||||
| * @param key key值 | |||||
| * @param value 值 | |||||
| * @param <T> 指定类型 | |||||
| * @return 是否设值成功 | |||||
| */ | |||||
| public <T> Boolean setnx(String key, T value){ | |||||
| return redisTemplate.opsForValue().setIfAbsent(key, FastjsonUtils.convertObjectToJSON(value)); | |||||
| } | |||||
| /** | |||||
| * 实现命令:SETNX key value EX seconds,设置key-value和超时时间(秒) | |||||
| * @param key key值 | |||||
| * @param value 值 | |||||
| * @param timeout 过期时间 | |||||
| * @return 是否设值成功 | |||||
| */ | |||||
| public Boolean setnx(String key, String value, long timeout) { | |||||
| return redisTemplate.opsForValue().setIfAbsent(key, value, timeout, TimeUnit.SECONDS); | |||||
| } | |||||
| /** | |||||
| * 实现命令:SETNX key value EX seconds,设置key-value和超时时间(秒) | |||||
| * @param key key值 | |||||
| * @param value 值 | |||||
| * @param timeout 过期时间 | |||||
| * @param <T> 指定类型 | |||||
| * @return 是否设值成功 | |||||
| */ | |||||
| public <T> Boolean setnx(String key, T value, long timeout) { | |||||
| return redisTemplate.opsForValue().setIfAbsent(key, FastjsonUtils.convertObjectToJSON(value), timeout, TimeUnit.SECONDS); | |||||
| } | |||||
| /** | |||||
| * 实现命令:GET key,返回 key所关联的字符串值。 | |||||
| * @param key key值 | |||||
| * @return 值 | |||||
| */ | |||||
| public String get(String key) { | |||||
| return (String) redisTemplate.opsForValue().get(key); | |||||
| } | |||||
| /** | |||||
| * | |||||
| * 根据key获取值 | |||||
| * @param key 真正的key | |||||
| * @param clazz 类型 | |||||
| * @param <T> 泛型 | |||||
| * @return | |||||
| */ | |||||
| public <T> T get(String key, Class<T> clazz) { | |||||
| String value = (String) redisTemplate.opsForValue().get(key); | |||||
| return (T) FastjsonUtils.convertJsonToObject(value, clazz); | |||||
| } | |||||
| /** | |||||
| * 实现命令:GET key,返回 key所关联的字符串值。 | |||||
| * @param key key值 | |||||
| * @return 是否存在 | |||||
| */ | |||||
| public Boolean exists(String key) { | |||||
| return redisTemplate.hasKey(key); | |||||
| } | |||||
| /****----------------------------------Hash----------------------------------------****/ | |||||
| /** | |||||
| * 实现命令:HSET key field value,将哈希表 key中的域 field的值设为 value | |||||
| * | |||||
| * @param key key | |||||
| * @param field 域 | |||||
| * @param value 值 | |||||
| */ | |||||
| public void hset(String key, String field, Object value) { | |||||
| redisTemplate.opsForHash().put(key, field, value); | |||||
| } | |||||
| /** | |||||
| * 实现命令:HGET key field,返回哈希表 key中给定域 field的值 | |||||
| * | |||||
| * @param key key | |||||
| * @param field 域 | |||||
| * @return | |||||
| */ | |||||
| public String hget(String key, String field) { | |||||
| return (String) redisTemplate.opsForHash().get(key, field); | |||||
| } | |||||
| /** | |||||
| * 实现命令:HDEL key field [field ...],删除哈希表 key 中的一个或多个指定域,不存在的域将被忽略。 | |||||
| * | |||||
| * @param key key | |||||
| * @param fields 域 | |||||
| */ | |||||
| public void hdel(String key, Object... fields) { | |||||
| redisTemplate.opsForHash().delete(key, fields); | |||||
| } | |||||
| /** | |||||
| * 实现命令:HGETALL key,返回哈希表 key中,所有的域和值。 | |||||
| * | |||||
| * @param key | |||||
| * @return 域和值 | |||||
| */ | |||||
| public Map<Object, Object> hgetall(String key) { | |||||
| return redisTemplate.opsForHash().entries(key); | |||||
| } | |||||
| /****----------------------------------List----------------------------------------****/ | |||||
| /** | |||||
| * 实现命令:LPUSH key value,将一个值 value插入到列表 key的表头 | |||||
| * | |||||
| * @param key | |||||
| * @param value | |||||
| * @return 执行 LPUSH命令后,列表的长度。 | |||||
| */ | |||||
| public long lpush(String key, String value) { | |||||
| return redisTemplate.opsForList().leftPush(key, value); | |||||
| } | |||||
| /** | |||||
| * 实现命令:LPOP key,移除并返回列表 key的头元素。 | |||||
| * | |||||
| * @param key | |||||
| * @return 列表key的头元素。 | |||||
| */ | |||||
| public String lpop(String key) { | |||||
| return (String) redisTemplate.opsForList().leftPop(key); | |||||
| } | |||||
| /** | |||||
| * 实现命令:RPUSH key value,将一个值 value插入到列表 key的表尾(最右边)。 | |||||
| * | |||||
| * @param key | |||||
| * @param value | |||||
| * @return 执行 LPUSH命令后,列表的长度。 | |||||
| */ | |||||
| public long rpush(String key, String value) { | |||||
| return redisTemplate.opsForList().rightPush(key, value); | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,99 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.utils; | |||||
| import lombok.extern.slf4j.Slf4j; | |||||
| import org.springframework.beans.BeansException; | |||||
| import org.springframework.beans.factory.DisposableBean; | |||||
| import org.springframework.context.ApplicationContext; | |||||
| import org.springframework.context.ApplicationContextAware; | |||||
| import org.springframework.stereotype.Component; | |||||
| /** | |||||
| * @description 上下文工具类 | |||||
| * @date 2020-09-24 | |||||
| */ | |||||
| @Component | |||||
| @Slf4j | |||||
| public class SpringContextHolder implements ApplicationContextAware, DisposableBean { | |||||
| public static ApplicationContext applicationContext = null; | |||||
| /** | |||||
| * 从静态变量applicationContext中取得Bean, 自动转型为所赋值对象的类型. | |||||
| * @param name bean名称 | |||||
| * @param <T> 类型 | |||||
| * @return bean对象 | |||||
| */ | |||||
| @SuppressWarnings("unchecked") | |||||
| public static <T> T getBean(String name) { | |||||
| assertContextInjected(); | |||||
| return (T) applicationContext.getBean(name); | |||||
| } | |||||
| /** | |||||
| * 从静态变量applicationContext中取得Bean, 自动转型为所赋值对象的类型. | |||||
| * @param requiredType bean类型 class | |||||
| * @param <T> 泛型 | |||||
| * @return bean对象 | |||||
| */ | |||||
| public static <T> T getBean(Class<T> requiredType) { | |||||
| assertContextInjected(); | |||||
| return applicationContext.getBean(requiredType); | |||||
| } | |||||
| /** | |||||
| * 检查ApplicationContext不为空. | |||||
| */ | |||||
| private static void assertContextInjected() { | |||||
| if (applicationContext == null) { | |||||
| throw new IllegalStateException("applicaitonContext属性未注入, 请在applicationContext" + | |||||
| ".xml中定义SpringContextHolder或在SpringBoot启动类中注册SpringContextHolder."); | |||||
| } | |||||
| } | |||||
| /** | |||||
| * 清除SpringContextHolder中的ApplicationContext为Null. | |||||
| */ | |||||
| private static void clearHolder() { | |||||
| log.debug("清除SpringContextHolder中的ApplicationContext:" | |||||
| + applicationContext); | |||||
| applicationContext = null; | |||||
| } | |||||
| /** | |||||
| * 销毁回调函数 | |||||
| */ | |||||
| @Override | |||||
| public void destroy() { | |||||
| SpringContextHolder.clearHolder(); | |||||
| } | |||||
| /** | |||||
| * spring上下文设置 | |||||
| * @param applicationContext | |||||
| * @throws BeansException | |||||
| */ | |||||
| @Override | |||||
| public void setApplicationContext(ApplicationContext applicationContext) throws BeansException { | |||||
| if (SpringContextHolder.applicationContext != null) { | |||||
| log.warn("SpringContextHolder中的ApplicationContext被覆盖, 原有ApplicationContext为:" + SpringContextHolder.applicationContext); | |||||
| } | |||||
| SpringContextHolder.applicationContext = applicationContext; | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,111 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.watcher; | |||||
| import cn.hutool.core.collection.CollectionUtil; | |||||
| import cn.hutool.core.util.StrUtil; | |||||
| import io.fabric8.kubernetes.api.model.OwnerReference; | |||||
| import io.fabric8.kubernetes.api.model.apps.StatefulSet; | |||||
| import io.fabric8.kubernetes.api.model.batch.Job; | |||||
| import io.fabric8.kubernetes.client.KubernetesClient; | |||||
| import lombok.Data; | |||||
| import lombok.extern.slf4j.Slf4j; | |||||
| import org.onebrain.operator.constants.KubeConstants; | |||||
| import org.onebrain.operator.redis.RedisService; | |||||
| import org.onebrain.operator.redis.key.OperatorKey; | |||||
| import org.springframework.beans.factory.annotation.Autowired; | |||||
| import org.springframework.stereotype.Component; | |||||
| import java.util.List; | |||||
| import static org.onebrain.operator.constants.CrdConstants.CRD_KIND; | |||||
| /** | |||||
| * @description Job处理器 | |||||
| * @date 2020-09-24 | |||||
| */ | |||||
| @Data | |||||
| @Slf4j | |||||
| @Component | |||||
| public class JobHandler { | |||||
| public static final String FINISHED = "finished"; | |||||
| public static final String PENDING = "pending"; | |||||
| @Autowired | |||||
| private RedisService redis; | |||||
| @Autowired | |||||
| private KubernetesClient client; | |||||
| /** | |||||
| * 处理Job | |||||
| * | |||||
| * @param job | |||||
| */ | |||||
| public void handleJob(Job job) { | |||||
| log.info("handleJob=>job : 【{}】", job); | |||||
| //筛选出DistributeTrain下的job | |||||
| List<OwnerReference> ownerReferences = job.getMetadata().getOwnerReferences(); | |||||
| if (CollectionUtil.isEmpty(ownerReferences) || !CRD_KIND.equals(ownerReferences.get(0).getKind())) { | |||||
| return; | |||||
| } | |||||
| String key = job.getMetadata().getUid(); | |||||
| if (StrUtil.equals(redis.get(OperatorKey.CR_JOB, key), FINISHED)) { | |||||
| return; | |||||
| } | |||||
| try { | |||||
| redis.set(OperatorKey.CR_JOB, key, PENDING); | |||||
| final Integer parallelism = job.getSpec().getParallelism(); | |||||
| final Integer backoffLimit = job.getSpec().getBackoffLimit(); | |||||
| //成功 或者 失败达到最大次数 | |||||
| if (job.getStatus() != null | |||||
| && ((job.getStatus().getFailed() != null && job.getStatus().getFailed() + 1 >= backoffLimit) | |||||
| || (job.getStatus().getSucceeded() != null && parallelism.equals(job.getStatus().getSucceeded())))) { | |||||
| //得到DistributeTrain的Statefulset | |||||
| String dtName = ownerReferences.get(0).getName(); | |||||
| String namespace = job.getMetadata().getNamespace(); | |||||
| List<StatefulSet> statefulsetList = client.apps().statefulSets() | |||||
| .inNamespace(namespace) | |||||
| .withLabel(KubeConstants.DISTRIBUTE_TRAIN_LABEL, dtName) | |||||
| .list().getItems(); | |||||
| if (CollectionUtil.isEmpty(statefulsetList)) { | |||||
| log.info("jobWatcher: statefulset of 【{}】 not exists", dtName); | |||||
| return; | |||||
| } | |||||
| //缩容Statefulset的replica到0 | |||||
| StatefulSet statefulSet = statefulsetList.get(0); | |||||
| statefulSet.getSpec().setReplicas(0); | |||||
| client.resource(statefulSet).createOrReplace(); | |||||
| log.info("jobWatcher: reduce replicas of 【{}】 to zero", dtName); | |||||
| redis.set(OperatorKey.CR_JOB, key, "finished"); | |||||
| } | |||||
| } catch (Exception e) { | |||||
| redis.set(OperatorKey.CR_JOB, key, "error"); | |||||
| log.error("handle job error:【{}】", e); | |||||
| } | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,71 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.watcher; | |||||
| import io.fabric8.kubernetes.api.model.batch.Job; | |||||
| import io.fabric8.kubernetes.client.KubernetesClientException; | |||||
| import io.fabric8.kubernetes.client.Watcher; | |||||
| import lombok.Data; | |||||
| import lombok.extern.slf4j.Slf4j; | |||||
| /** | |||||
| * @description Job监视器 | |||||
| * @date 2020-09-24 | |||||
| */ | |||||
| @Data | |||||
| @Slf4j | |||||
| public class JobWatcher implements Watcher<Job> { | |||||
| private String namespace; | |||||
| private String jobName; | |||||
| private KubeWatcherManager manager; | |||||
| private JobHandler jobHandler; | |||||
| public JobWatcher(JobHandler jobHandler, KubeWatcherManager manager) { | |||||
| this.manager = manager; | |||||
| this.jobHandler = jobHandler; | |||||
| } | |||||
| /** | |||||
| * 接收事件进行处理 | |||||
| * @param action 事件类型 | |||||
| * @param job job信息 | |||||
| */ | |||||
| @Override | |||||
| public void eventReceived(Action action, Job job) { | |||||
| log.info("Job Event received: {} at {}", job.getMetadata().getUid(), job.getMetadata().getCreationTimestamp()); | |||||
| jobHandler.handleJob(job); | |||||
| } | |||||
| /** | |||||
| * 关闭事件 | |||||
| * @param e 客户端异常 | |||||
| */ | |||||
| @Override | |||||
| public void onClose(KubernetesClientException e) { | |||||
| log.debug("job watcher close"); | |||||
| if (e != null) { | |||||
| log.error(e.getMessage()); | |||||
| log.info("restart new job watcher thread"); | |||||
| manager.putNewWatcher(); | |||||
| } | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,120 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator.watcher; | |||||
| import io.fabric8.kubernetes.client.KubernetesClient; | |||||
| import lombok.extern.slf4j.Slf4j; | |||||
| import org.onebrain.operator.context.KubeContext; | |||||
| import org.springframework.beans.factory.annotation.Autowired; | |||||
| import org.springframework.stereotype.Component; | |||||
| import java.util.concurrent.LinkedBlockingQueue; | |||||
| import java.util.concurrent.ThreadFactory; | |||||
| import java.util.concurrent.ThreadPoolExecutor; | |||||
| import java.util.concurrent.TimeUnit; | |||||
| import java.util.concurrent.atomic.AtomicInteger; | |||||
| /** | |||||
| * @description 监视器的管理器 | |||||
| * @date 2020-09-24 | |||||
| */ | |||||
| @Slf4j | |||||
| @Component | |||||
| public class KubeWatcherManager { | |||||
| /** | |||||
| * 监视队列 | |||||
| */ | |||||
| private static final LinkedBlockingQueue<JobWatcher> watchQueue = new LinkedBlockingQueue<>(1000); | |||||
| /** | |||||
| * 单例线程池 | |||||
| */ | |||||
| private ThreadPoolExecutor pool = new ThreadPoolExecutor(1, 1, 1, TimeUnit.SECONDS, new LinkedBlockingQueue<>(1), new ThreadFactory() { | |||||
| private final AtomicInteger mThreadNum = new AtomicInteger(1); | |||||
| @Override | |||||
| public Thread newThread(Runnable r) { | |||||
| return new Thread(r, "job-watcher-" + mThreadNum.getAndIncrement()); | |||||
| } | |||||
| }); | |||||
| @Autowired | |||||
| private KubeContext kubeContext; | |||||
| @Autowired | |||||
| private JobHandler jobHandler; | |||||
| /** | |||||
| * 第一次启动时 | |||||
| */ | |||||
| public void startWatching(){ | |||||
| JobWatchHolder jobWatchHolder = new JobWatchHolder(); | |||||
| pool.execute(jobWatchHolder); | |||||
| putNewWatcher(); | |||||
| } | |||||
| /** | |||||
| * 监听指定job | |||||
| * @param jobWatcher | |||||
| */ | |||||
| public void watch(JobWatcher jobWatcher){ | |||||
| KubernetesClient client = kubeContext.getClient(); | |||||
| //监听指定job | |||||
| client.batch().jobs() | |||||
| .inAnyNamespace().watch(jobWatcher); | |||||
| } | |||||
| /** | |||||
| * 加入新watcher | |||||
| */ | |||||
| public void putNewWatcher(){ | |||||
| try { | |||||
| JobWatcher jobWatcher = new JobWatcher(jobHandler, this); | |||||
| watchQueue.put(jobWatcher); | |||||
| } catch (InterruptedException e) { | |||||
| e.printStackTrace(); | |||||
| } | |||||
| } | |||||
| /** | |||||
| * Job监视器持有者 | |||||
| */ | |||||
| class JobWatchHolder implements Runnable { | |||||
| @Override | |||||
| public void run() { | |||||
| while(true){ | |||||
| try { | |||||
| //无监视器时阻塞 | |||||
| JobWatcher jobWatcher = watchQueue.take(); | |||||
| //启动监视器 | |||||
| try{ | |||||
| watch(jobWatcher); | |||||
| }catch (Exception e){ | |||||
| //出错不影响其他listener | |||||
| log.error("JobWatchHolder watch error:【{}】",e); | |||||
| } | |||||
| } catch (InterruptedException e) { | |||||
| log.error("JobWatchHolder run error:【{}】",e); | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| @@ -0,0 +1,27 @@ | |||||
| -----BEGIN RSA PRIVATE KEY----- | |||||
| MIIEogIBAAKCAQEA06ZOLQq4pzBZL+bybsxdl4PzYg3jB4kRVc771nm5Y8JenDAT | |||||
| hlOTz6+nGH4EDT63J7oNj4JYLufsONKYhJkya8p0btWeKHqz5LgEfLGwz/FTMRH5 | |||||
| WTCZCZUa/3i9gQeKK/CKEned1h4l2w4agrYrnXHpnuNSw6HSlTpX8FgaQGfmTkL3 | |||||
| XtzSCeY9F2fXGOm9fMfVmv5I5uP6B4TmKwtWPvx3a/1MDgHbmtoaCqYP/JmzWHyi | |||||
| mc9l2ilX3kTPxh57oRtW9N3FATc8/OCYkNt4vDUTRVB4drODaR5TgUbFtkBVGcFR | |||||
| f7MrQo4Krd2g8rtEv7PaWN/wlNle5ANXJ/oL3wIDAQABAoIBADiqC8APYMSSMy6Z | |||||
| /EohuOT51M1pvmCkF9oLYm1XhYTp4v6Z+IA8HBS8iFYMVvVc1xhxvXOwh/925E2K | |||||
| RH8rrM4jE+0gkAlyYHtZsQnZYOcrSwSWNVXlpvNj0iiXoNTMufdtnOm40K8kvynY | |||||
| qsxYDXFHsC5z2hK6XnDJgAW+8LhRHCizWwxc0dSN9r33VGry0rgndUZsj2ZBf7u5 | |||||
| rdslZKvRzMymXct7CIQQ3s5IUO3qbaj7TIzMIo14bmHgD3zlBQ66ESCX1o5A+hPq | |||||
| 1gfUNqUPBtJhsNJg4YYJ/bGgGhBxAxam8jWz3DFZEuYHr6fCDIhLJzL5ulxoQS2z | |||||
| vJYBwsECgYEA8JGfw004BxqcBVxqBveestsCVGIWDtb+Zx4OI+uBAmYMXd2WCzxv | |||||
| XxgQJ/IrpNx6FAXZ/bFdE0HRZWR6H07wtNgABuBgd0tAfcH8sw2CJkTO/0N2Xr6/ | |||||
| O4kh3yHNMy/wAxnktISf1hE/ElEdPI6slhwGDQObRdXxaqBEq+Tjc28CgYEA4TnM | |||||
| rCaJ8aMaUE0nvVzrev3VTLp4f1qOcPUOnrHDdyrPs1SjYzmAOC72X/FylJZmtkvh | |||||
| coMQUKVQgiBn1dTtnALANq705b1S+0U07m6+dGJ7LWchOY2tFPiIsx3SZvNJeEKJ | |||||
| 38PsaFi2eDcDP8cKriNoAoby8TbqjqiyHgDX9pECfxww9IfuhKJQe/gk3Ef0vKQ5 | |||||
| BgzdcbhLeYScAQw0jOm7C7f0P6ERc/uw1jPYLUUkkSnHhcQ1BLM9A0zeeXExzwNi | |||||
| TJ6BrMxOBUC3euWAr7/MUHWZckWoFMDlURLU4zccZwP2BNcis5hibQG4f7SZA6CT | |||||
| qCHeSlPkvmXAYkvChuUCgYEA0DNlL9KkfBqBja/1R4jpKhYSIs7R6zCkMmlm7W54 | |||||
| ueV6gVWBgI08KTPIj2KcwBzUsDovG3NrFpHrfY9FTZd7W1fzpdlQDDxaxGryhmMb | |||||
| bm1HXu5R+WktkhA6FhJAWOkXhrNDzvXHyaIQc8qvFzsBdX7HfGaRmEhixiPOHAw9 | |||||
| l/ECgYEAwNywUARR9HtmgoyrwifrzIkMo6jcmLNEIzi2kJ4OQQxW5eKj5JgSV0ND | |||||
| QUoAIWDAhHQd3ygSfbeShcvtcw+zoF92iOVFn0SLiSe1TgA5ggzC/VJUnInO7zx7 | |||||
| 8Sj8Zk5tHrVmTlelEA2Nbq5H7/U1Q33c1AWbw8yxqD/JRxudHKA= | |||||
| -----END RSA PRIVATE KEY----- | |||||
| @@ -0,0 +1 @@ | |||||
| ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDTpk4tCrinMFkv5vJuzF2Xg/NiDeMHiRFVzvvWebljwl6cMBOGU5PPr6cYfgQNPrcnug2Pglgu5+w40piEmTJrynRu1Z4oerPkuAR8sbDP8VMxEflZMJkJlRr/eL2BB4or8IoSd53WHiXbDhqCtiudceme41LDodKVOlfwWBpAZ+ZOQvde3NIJ5j0XZ9cY6b18x9Wa/kjm4/oHhOYrC1Y+/Hdr/UwOAdua2hoKpg/8mbNYfKKZz2XaKVfeRM/GHnuhG1b03cUBNzz84JiQ23i8NRNFUHh2s4NpHlOBRsW2QFUZwVF/sytCjgqt3aDyu0S/s9pY3/CU2V7kA1cn+gvf root@{{ip}} | |||||
| @@ -0,0 +1,19 @@ | |||||
| apiVersion: v1 | |||||
| clusters: | |||||
| - cluster: | |||||
| certificate-authority-data: {} | |||||
| server: {} | |||||
| name: kubernetes | |||||
| contexts: | |||||
| - context: | |||||
| cluster: kubernetes | |||||
| user: kubernetes-admin | |||||
| name: kubernetes-admin@kubernetes | |||||
| current-context: kubernetes-admin@kubernetes | |||||
| kind: Config | |||||
| preferences: {} | |||||
| users: | |||||
| - name: kubernetes-admin | |||||
| user: | |||||
| client-certificate-data: {} | |||||
| client-key-data: {} | |||||
| @@ -0,0 +1,46 @@ | |||||
| #!/bin/bash | |||||
| if [ ! -f "/etc/init.d/ssh" ]; then | |||||
| if [ ! -f "/etc/redhat-release" ]; then | |||||
| echo 'apt install -y openssh-server' >> pretreatment.log | |||||
| apt update >> pretreatment.log | |||||
| apt install -y openssh-server >> pretreatment.log | |||||
| fi | |||||
| if [ ! -f "/etc/lsb-release" ]; then | |||||
| echo 'yum install -y sshd' >> pretreatment.log | |||||
| yum update >> pretreatment.log | |||||
| yum install -y sshd >> pretreatment.log | |||||
| fi | |||||
| fi | |||||
| echo '/etc/init.d/ssh start' >> pretreatment.log | |||||
| /etc/init.d/ssh start >> pretreatment.log | |||||
| if [ -f "/etc/redhat-release" ]; then | |||||
| if command -v nslookup >/dev/null 2>&1; then | |||||
| echo 'exists nslookup' >> pretreatment.log | |||||
| else | |||||
| echo 'yum install dnsutils jq' >> pretreatment.log | |||||
| yum install -y dnsutils >> pretreatment.log | |||||
| yum install -y jq >> pretreatment.log | |||||
| fi | |||||
| if command -v nslookup >/dev/null 2>&1; then | |||||
| echo 'exists nslookup' >> pretreatment.log | |||||
| else | |||||
| echo 'yum install dnsutils jq' >> pretreatment.log | |||||
| yum install -y dnsutils >> pretreatment.log | |||||
| yum install -y jq >> pretreatment.log | |||||
| fi | |||||
| fi | |||||
| if [ -f "/etc/lsb-release" ]; then | |||||
| if command -v jq >/dev/null 2>&1; then | |||||
| echo 'exists jq' >> pretreatment.log | |||||
| else | |||||
| echo 'apt install jq' >> pretreatment.log | |||||
| apt install -y jq >> pretreatment.log | |||||
| fi | |||||
| if command -v nslookup >/dev/null 2>&1; then | |||||
| echo 'exists nslookup' >> pretreatment.log | |||||
| else | |||||
| echo 'apt install dnsutils' >> pretreatment.log | |||||
| apt install -y dnsutils >> pretreatment.log | |||||
| fi | |||||
| fi | |||||
| @@ -0,0 +1,43 @@ | |||||
| /** | |||||
| * Copyright 2020 Zhejiang Lab & The OneFlow Authors. All Rights Reserved. | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| * ============================================================= | |||||
| */ | |||||
| package org.onebrain.operator; | |||||
| import org.onebrain.operator.api.pod.PodApi; | |||||
| import org.onebrain.operator.constants.KubeConstants; | |||||
| import org.springframework.beans.factory.annotation.Autowired; | |||||
| import org.springframework.boot.test.context.SpringBootTest; | |||||
| import java.io.File; | |||||
| import java.net.URISyntaxException; | |||||
| import java.net.URL; | |||||
| @SpringBootTest | |||||
| public class DistributeTrainOperatorApplicationTests { | |||||
| @Autowired | |||||
| private PodApi podApi; | |||||
| // @Test | |||||
| public void contextLoads() throws URISyntaxException { | |||||
| final URL url = getClass().getClassLoader().getResource("key/id_rsa"); | |||||
| File file = new File(url.toURI()); | |||||
| podApi.copyToPod("default", "distribute-train-test-job-sv2dj", KubeConstants.MASTER_CONTAINER_NAME, file, "/root/.ssh/id_rsa"); | |||||
| } | |||||
| } | |||||