
update data process

tags/v0.4.0
之江天枢 3 years ago
parent
commit
69f765d45a
72 changed files with 3943 additions and 750 deletions
1. +9 -0 dataset-util/.gitignore
2. +53 -29 dataset-util/README.md
3. +65 -16 dataset-util/pom.xml
4. +82 -16 dataset-util/src/main/java/org/dubhe/datasetutil/DatasetUtilApplication.java
5. +0 -76 dataset-util/src/main/java/org/dubhe/datasetutil/common/aspect/LogAspect.java
6. +2 -0 dataset-util/src/main/java/org/dubhe/datasetutil/common/base/MagicNumConstant.java
7. +63 -0 dataset-util/src/main/java/org/dubhe/datasetutil/common/config/ImageConfig.java
8. +8 -4 dataset-util/src/main/java/org/dubhe/datasetutil/common/config/MinioConfig.java
9. +93 -0 dataset-util/src/main/java/org/dubhe/datasetutil/common/constant/AnnotateTypeEnum.java
10. +62 -0 dataset-util/src/main/java/org/dubhe/datasetutil/common/constant/BusinessConstant.java
11. +88 -0 dataset-util/src/main/java/org/dubhe/datasetutil/common/constant/DatatypeEnum.java
12. +58 -0 dataset-util/src/main/java/org/dubhe/datasetutil/common/enums/DatatypeEnum.java
13. +77 -0 dataset-util/src/main/java/org/dubhe/datasetutil/common/enums/PresetDatasetEnum.java
14. +91 -0 dataset-util/src/main/java/org/dubhe/datasetutil/common/util/EsConfiguration.java
15. +43 -0 dataset-util/src/main/java/org/dubhe/datasetutil/common/util/FileUtil.java
16. +13 -46 dataset-util/src/main/java/org/dubhe/datasetutil/common/util/GeneratorKeyUtil.java
17. +24 -0 dataset-util/src/main/java/org/dubhe/datasetutil/common/util/HandleFileUtil.java
18. +45 -0 dataset-util/src/main/java/org/dubhe/datasetutil/common/util/IOUtil.java
19. +1 -9 dataset-util/src/main/java/org/dubhe/datasetutil/common/util/LogUtil.java
20. +102 -10 dataset-util/src/main/java/org/dubhe/datasetutil/common/util/MinioUtil.java
21. +21 -8 dataset-util/src/main/java/org/dubhe/datasetutil/common/util/MyPreciseShardingAlgorithm.java
22. +32 -0 dataset-util/src/main/java/org/dubhe/datasetutil/common/util/ProcessBarUtil.java
23. +13 -7 dataset-util/src/main/java/org/dubhe/datasetutil/common/util/ThreadUtils.java
24. +46 -0 dataset-util/src/main/java/org/dubhe/datasetutil/dao/DataFileAnnotationMapper.java
25. +32 -1 dataset-util/src/main/java/org/dubhe/datasetutil/dao/DataFileMapper.java
26. +21 -0 dataset-util/src/main/java/org/dubhe/datasetutil/dao/DataLabelMapper.java
27. +13 -4 dataset-util/src/main/java/org/dubhe/datasetutil/dao/DataSequenceMapper.java
28. +42 -2 dataset-util/src/main/java/org/dubhe/datasetutil/dao/DataVersionFileMapper.java
29. +9 -0 dataset-util/src/main/java/org/dubhe/datasetutil/dao/DatasetDataLabelMapper.java
30. +28 -0 dataset-util/src/main/java/org/dubhe/datasetutil/dao/DatasetMapper.java
31. +11 -0 dataset-util/src/main/java/org/dubhe/datasetutil/dao/DatasetVersionMapper.java
32. +40 -0 dataset-util/src/main/java/org/dubhe/datasetutil/domain/dto/AnnotationDTO.java
33. +95 -0 dataset-util/src/main/java/org/dubhe/datasetutil/domain/dto/EsTransportDTO.java
34. +46 -0 dataset-util/src/main/java/org/dubhe/datasetutil/domain/dto/FileAnnotationDTO.java
35. +28 -16 dataset-util/src/main/java/org/dubhe/datasetutil/domain/dto/IdAlloc.java
36. +70 -9 dataset-util/src/main/java/org/dubhe/datasetutil/domain/entity/DataFile.java
37. +88 -0 dataset-util/src/main/java/org/dubhe/datasetutil/domain/entity/DataFileAnnotation.java
38. +5 -0 dataset-util/src/main/java/org/dubhe/datasetutil/domain/entity/DataLabelGroup.java
39. +15 -6 dataset-util/src/main/java/org/dubhe/datasetutil/domain/entity/DataVersionFile.java
40. +4 -0 dataset-util/src/main/java/org/dubhe/datasetutil/domain/entity/Dataset.java
41. +53 -0 dataset-util/src/main/java/org/dubhe/datasetutil/domain/entity/DatasetVersion.java
42. +1 -1 dataset-util/src/main/java/org/dubhe/datasetutil/domain/entity/LogInfo.java
43. +171 -0 dataset-util/src/main/java/org/dubhe/datasetutil/handle/CustomDatasetImportHandle.java
44. +253 -34 dataset-util/src/main/java/org/dubhe/datasetutil/handle/DatasetImageUploadHandle.java
45. +317 -102 dataset-util/src/main/java/org/dubhe/datasetutil/handle/DatasetImportHandle.java
46. +938 -0 dataset-util/src/main/java/org/dubhe/datasetutil/handle/PresetDatasetImportHandle.java
47. +43 -0 dataset-util/src/main/java/org/dubhe/datasetutil/service/DataFileAnnotationService.java
48. +13 -0 dataset-util/src/main/java/org/dubhe/datasetutil/service/DataFileService.java
49. +18 -0 dataset-util/src/main/java/org/dubhe/datasetutil/service/DataLabelService.java
50. +10 -1 dataset-util/src/main/java/org/dubhe/datasetutil/service/DataSequenceService.java
51. +16 -1 dataset-util/src/main/java/org/dubhe/datasetutil/service/DataVersionFileService.java
52. +7 -0 dataset-util/src/main/java/org/dubhe/datasetutil/service/DatasetDataLabelService.java
53. +49 -0 dataset-util/src/main/java/org/dubhe/datasetutil/service/DatasetService.java
54. +15 -0 dataset-util/src/main/java/org/dubhe/datasetutil/service/DatasetVersionService.java
55. +53 -0 dataset-util/src/main/java/org/dubhe/datasetutil/service/impl/DataFileAnnotationServiceImpl.java
56. +30 -0 dataset-util/src/main/java/org/dubhe/datasetutil/service/impl/DataFileServiceImpl.java
57. +36 -1 dataset-util/src/main/java/org/dubhe/datasetutil/service/impl/DataLabelServiceImpl.java
58. +24 -2 dataset-util/src/main/java/org/dubhe/datasetutil/service/impl/DataSequenceServiceImpl.java
59. +36 -3 dataset-util/src/main/java/org/dubhe/datasetutil/service/impl/DataVersionFileServiceImpl.java
60. +13 -0 dataset-util/src/main/java/org/dubhe/datasetutil/service/impl/DatasetDataLabelServiceImpl.java
61. +70 -0 dataset-util/src/main/java/org/dubhe/datasetutil/service/impl/DatasetServiceImpl.java
62. +30 -0 dataset-util/src/main/java/org/dubhe/datasetutil/service/impl/DatasetVersionServiceImpl.java
63. +0 -32 dataset-util/src/main/resources/application-dev.yml
64. +41 -0 dataset-util/src/main/resources/application-prod.yml
65. +0 -32 dataset-util/src/main/resources/application-test.yml
66. +36 -29 dataset-util/src/main/resources/application.yml
67. +0 -0 dataset-util/src/main/resources/logback-spring-prod.xml
68. +0 -248 dataset-util/src/main/resources/logback-spring-test.xml
69. +13 -0 dataset-util/src/main/resources/mapper/DataFileAnnotationMapper.xml
70. +2 -2 dataset-util/src/main/resources/mapper/DataFileMapper.xml
71. +14 -0 dataset-util/src/main/resources/mapper/DataLabelMapper.xml
72. +3 -3 dataset-util/src/main/resources/mapper/DataVersionFileMapper.xml

+9 -0 dataset-util/.gitignore

@@ -4,3 +4,12 @@
logs
HELP.md
target/

# vscode
.classpath
.settings/org.eclipse.core.resources.prefs
.settings/org.eclipse.m2e.core.prefs
.settings/org.eclipse.jdt.apt.core.prefs
.settings/org.eclipse.jdt.core.prefs
.project
.factorypath

+53 -29 dataset-util/README.md

@@ -1,50 +1,58 @@
# 之江天枢-Dataset Import Script
# 之江天枢 - Dataset Import Script

**之江天枢 One-Stop AI Open-Source Platform** ("之江天枢" for short): so that datasets already annotated on other platforms can be developed on the one-stop development platform, we added a dataset import feature covering the full range of dataset operations
**之江天枢 One-Stop AI Open-Source Platform** ("之江天枢" for short): so that datasets already annotated on other platforms can be developed on the one-stop development platform, we added a dataset import feature for importing existing local dataset files

## Source Deployment
## Environment Dependencies

Install the following software environment.
- OpenJDK: 1.8+

## Download the Scripts

- Dataset template: http://tianshu.org.cn/static/upload/file/dubhe-dataset-template.zip
- Dataset upload script: http://tianshu.org.cn/static/upload/file/upload_dataset.zip
- Dataset import template: http://tianshu.org.cn/static/upload/file/dubhe-dataset-template.zip
- Dataset import script: http://tianshu.org.cn/static/upload/file/upload_dataset.zip


## Script Usage
## Create a Dataset

- Log in to the Tianshu deep learning platform and create a dataset under Data Management > Dataset Management; note the dataset ID
- Prepare your own image files, annotation files, and label files
- First deploy the one-stop platform by following the [deployment guide](http://docs.dubhe.ai/docs/setup/deploy-guide)
- Prepare the local dataset files to import, including image, annotation, and label files; for the file format, see the [directory guide](http://docs.dubhe.ai/docs/module/dataset/import-dataset#%E7%9B%AE%E5%BD%95%E8%AF%B4%E6%98%8E)
- Log in to the Tianshu deep learning platform and create a dataset in the Data Management module; see the [user guide](http://docs.dubhe.ai/docs/module/dataset/create-dataset)
## Run the Script:

1. Unzip the downloaded zip file; configure the data source and MinIO settings yourself
1. Download and unzip the import script package (upload_dataset). `application-{env}` is the script's configuration file; `env` defaults to `dev`. Configure the data source and MinIO settings yourself.

2. Run the script: on Windows run run.bat; on macOS/Linux run run.sh
2. Run the script: on Windows execute `run.bat`; on macOS/Linux run run.sh.

Note: you may supply your own 'application-{env}.xml' file; running 'run.bat {env}' executes the corresponding application-{env}.xml; then follow the prompts
3. For different environments you may supply your own `application-{env}.yml` file; `run.bat {env}` then runs with the corresponding `application-{env}.yml` configuration. Make sure the configuration file exists before running the script.

3. Enter the dataset ID
3. Enter the dataset ID when prompted.
4. Enter the absolute path of the dataset to import when prompted.

4. Enter the absolute path of the dataset to import
5. The import completes.

## Directory Structure:

```
Label file: label_{name}.json ({name} is the label-group name and is user-defined; only the first label-group file in the folder is read; label files must be .json, case-insensitive; the content is a JSON string)
Image directory: origin (image files need an extension; four formats are supported: .jpg, .png, .bmp, .jpeg, case-insensitive)
Annotation directory: annotation (annotation files need an extension; only .json is supported, case-insensitive; the content is a JSON string)
```
[Directory guide](http://docs.dubhe.ai/img/data/import-data9.png)

- Image directory: origin (four image formats are supported: .jpg, .png, .bmp, .jpeg)
- Annotation directory: annotation (annotation files must be .json)
- Label file: named `label_{name}.json`, where `name` is the label-group name and must not duplicate an existing label-group name
## File Format

- Sample label file content:
### Label File:

> Format:
```
name: label name
color: color (hex code)
```

Full example:
```
[{
@@ -61,23 +69,39 @@
}]
```

- Sample annotation file content:
### Annotation File:

1. Image classification

> Format:
```
name: label name
name: corresponding label name
score: confidence score (0-1)
```
Full example:
```
[{"name":"wheaten_terrier","score":1}]
```

2. Object detection

> Format:
```
name: corresponding label name
bbox: annotation position
score: score
score: confidence score (0-1)
```
Full example:
```
[{
"name": "行人",
"bbox": [321.6755762696266, 171.32076993584633, 185.67924201488495, 145.02639323472977],
"score": 0.6922634840011597
"name": "行人",
"bbox": [321.6755762696266, 171.32076993584633, 185.67924201488495, 145.02639323472977],
"score": 0.6922634840011597
},
{
"name": "自行车",
"bbox": [40.88740050792694, 22.707078605890274, 451.21362805366516, 326.0102793574333],
"score": 0.6069411635398865
"name": "自行车",
"bbox": [40.88740050792694, 22.707078605890274, 451.21362805366516, 326.0102793574333],
"score": 0.6069411635398865
}]
```


+65 -16 dataset-util/pom.xml

@@ -4,7 +4,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>org.dubhe</groupId>
<artifactId>dataset-util</artifactId>
<version>0.0.1-SNAPSHOT</version>
<version>0.1.0-SNAPSHOT</version>
<name>dataset-util</name>
<description>Data processing module utilities</description>

@@ -48,19 +48,49 @@
<dependency>
<groupId>io.minio</groupId>
<artifactId>minio</artifactId>
<version>7.0.2</version>
<version>8.2.1</version>
<exclusions>
<exclusion>
<groupId>stax</groupId>
<artifactId>stax-api</artifactId>
</exclusion>
<exclusion>
<groupId>stax</groupId>
<artifactId>stax</artifactId>
</exclusion>
<exclusion>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</exclusion>
<exclusion>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-annotations</artifactId>
</exclusion>
<exclusion>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>com.xiaoleilu</groupId>
<artifactId>hutool-all</artifactId>
<version>3.0.1</version>
</dependency>

<!-- for spring boot -->
<dependency>
<groupId>org.apache.shardingsphere</groupId>
<artifactId>sharding-jdbc-spring-boot-starter</artifactId>
<version>4.0.0-RC1</version>
<version>4.1.1</version>
<exclusions>
<exclusion>
<groupId>org.apache.shardingsphere</groupId>
<artifactId>shardingsphere-sql-parser-oracle</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.shardingsphere</groupId>
<artifactId>shardingsphere-sql-parser-postgresql</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.shardingsphere</groupId>
<artifactId>shardingsphere-sql-parser-sqlserver</artifactId>
</exclusion>
</exclusions>
</dependency>

<dependency>
@@ -74,12 +104,6 @@
<version>1.3.2</version>
</dependency>

<dependency>
<groupId>org.aspectj</groupId>
<artifactId>aspectjweaver</artifactId>
<version>1.8.9</version>
</dependency>

<!-- SQL log monitoring -->
<dependency>
<groupId>org.bgee.log4jdbc-log4j2</groupId>
@@ -90,10 +114,29 @@
<!-- Utility library -->
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<artifactId>hutool-core</artifactId>
<version>5.0.6</version>
</dependency>

<dependency>
<groupId>com.twelvemonkeys.imageio</groupId>
<artifactId>imageio-jpeg</artifactId>
<version>3.4.1</version>
</dependency>
<dependency>
<groupId>me.tongfei</groupId>
<artifactId>progressbar</artifactId>
<version>0.9.1</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-aop</artifactId>
</dependency>
<!-- elasticsearch -->
<dependency>
<groupId>org.elasticsearch.client</groupId>
<artifactId>transport</artifactId>
</dependency>
</dependencies>

<dependencyManagement>
@@ -104,6 +147,12 @@
<version>${spring-boot.version}</version>
<type>pom</type>
<scope>import</scope>
<exclusions>
<exclusion>
<groupId>com.zaxxer</groupId>
<artifactId>HikariCP</artifactId>
</exclusion>
</exclusions>
</dependency>
</dependencies>
</dependencyManagement>


+82 -16 dataset-util/src/main/java/org/dubhe/datasetutil/DatasetUtilApplication.java

@@ -17,15 +17,19 @@
package org.dubhe.datasetutil;

import lombok.extern.slf4j.Slf4j;
import org.dubhe.datasetutil.common.util.PrintUtils;
import org.dubhe.datasetutil.common.util.SpringContextHolder;
import org.dubhe.datasetutil.handle.CustomDatasetImportHandle;
import org.dubhe.datasetutil.handle.DatasetImageUploadHandle;
import org.dubhe.datasetutil.handle.DatasetImportHandle;
import org.dubhe.datasetutil.common.util.PrintUtils;
import org.dubhe.datasetutil.handle.PresetDatasetImportHandle;
import org.mybatis.spring.annotation.MapperScan;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.context.ApplicationContext;

import java.time.Duration;
import java.time.LocalDateTime;
import java.util.Scanner;

/**
@@ -43,7 +47,7 @@ public class DatasetUtilApplication {
* @param args input arguments
*/
public static void main(String[] args) {
ApplicationContext applicationContext = SpringApplication.run(DatasetUtilApplication.class, args);
ApplicationContext applicationContext = SpringApplication.run(org.dubhe.datasetutil.DatasetUtilApplication.class, args);
SpringContextHolder springContextHolder = new SpringContextHolder();
springContextHolder.setApplicationContext(applicationContext);
execute(applicationContext);
@@ -57,18 +61,19 @@ public class DatasetUtilApplication {
public static void execute(ApplicationContext applicationContext) {
while (true) {
Scanner scanner = new Scanner(System.in);
log.warn("###################请输入需要执行的任务#############");
log.warn("# 输入1.执行上传图片 ");
log.warn("# 输入2.执行导入数据集 ");
log.warn("# 输入命令 :exit 退出 ");
log.warn("################################################");
System.out.println(" ");
System.out.println("###请输入需要执行的任务###");
System.out.println("# 输入1:上传文件 ");
System.out.println("# 输入2:导入数据集 ");
System.out.println("# 输入exit:退出 ");
System.out.println("##########################");
String a = scanner.nextLine();
switch (a) {
case "1":
uploadDatasetImage(scanner, applicationContext);
break;
case "2":
importDataset(scanner, applicationContext);
executeImportDataset(applicationContext);
break;
case "exit":
default:
@@ -78,6 +83,47 @@ public class DatasetUtilApplication {
}
}

public static void executeImportDataset(ApplicationContext applicationContext) {
Boolean importFlag = true;
while (importFlag) {
Scanner scanner = new Scanner(System.in);
System.out.println(" ");
System.out.println("###请输入导入数据集类型###");
System.out.println("# 输入1: 导入普通数据集 ");
System.out.println("# 输入2: 导入预置数据集 ");
System.out.println("# 输入3: 导入自定义数据集 ");
System.out.println("# 输入命令:exit 返回 ");
System.out.println("##########################");

switch (scanner.nextLine()) {
case "1":
importDataset(scanner, applicationContext);
break;
case "2":
importPresetDataset(scanner, applicationContext);
break;
case "3":
importCustomDataset(scanner, applicationContext);
break;
case "exit":
default:
importFlag = false;
break;
}
}
}

/**
* Import a preset dataset
*
* @param scanner console input
* @param applicationContext application context
*/
private static void importPresetDataset(Scanner scanner, ApplicationContext applicationContext) {
PresetDatasetImportHandle datasetImportHandle = (PresetDatasetImportHandle) applicationContext.getBean("presetDatasetImportHandle");
datasetImportHandle.importPresetDataset(scanner);
}

/**
* Import images
*
@@ -85,17 +131,13 @@ public class DatasetUtilApplication {
* @param applicationContext application context
*/
public static void uploadDatasetImage(Scanner scanner, ApplicationContext applicationContext) {
log.warn("# 请输入数据集ID #");
String datasetIdStr = scanner.nextLine();
Long datasetId = Long.parseLong(datasetIdStr);
log.warn("# 请输入要上传的图片地址 #");
String filePath = scanner.nextLine();
DatasetImageUploadHandle datasetImageUploadHandle = (DatasetImageUploadHandle) applicationContext.getBean("datasetImageUploadHandle");
try {
datasetImageUploadHandle.execute(filePath, datasetId);
datasetImageUploadHandle.importPicture(scanner);
} catch (Exception e) {
log.error("", e);
log.error("# 数据集上传失败,请重新尝试.....");
log.error("");
PrintUtils.printLine(" Error:" + e.getMessage(), PrintUtils.RED);
log.error("");
}
}

@@ -116,4 +158,28 @@ public class DatasetUtilApplication {
}
}

/**
* Import a custom dataset
*
* @param scanner console input
* @param applicationContext application context
*/
public static void importCustomDataset(Scanner scanner, ApplicationContext applicationContext) {
System.out.println(" ");
System.out.println("# 请输入数据集ID #");
String datasetIdStr = scanner.nextLine();
Long datasetId = Long.parseLong(datasetIdStr);
System.out.println(" ");
System.out.println("# 请输入待上传本地文件的绝对路径 #");
String filePath = scanner.nextLine();
CustomDatasetImportHandle customDatasetImportHandle = (CustomDatasetImportHandle) applicationContext.getBean("customDatasetImportHandle");
try {
customDatasetImportHandle.execute(new Object[]{datasetId, filePath});
} catch (Exception e) {
log.error("");
PrintUtils.printLine(" Error:" + e.getMessage(), PrintUtils.RED);
log.error("");
}
}

}

+0 -76 dataset-util/src/main/java/org/dubhe/datasetutil/common/aspect/LogAspect.java

@@ -1,76 +0,0 @@
/**
* Copyright 2020 Zhejiang Lab. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.dubhe.datasetutil.common.aspect;

import lombok.extern.slf4j.Slf4j;
import org.aspectj.lang.JoinPoint;
import org.aspectj.lang.ProceedingJoinPoint;
import org.aspectj.lang.annotation.Around;
import org.aspectj.lang.annotation.Aspect;
import org.aspectj.lang.annotation.Pointcut;
import org.dubhe.datasetutil.common.enums.LogEnum;
import org.dubhe.datasetutil.common.util.LogUtil;
import org.slf4j.MDC;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;

import java.util.UUID;

/**
* @description Logging aspect
* @date 2020-04-10
*/
@Component
@Aspect
@Slf4j
public class LogAspect {

public static final String TRACE_ID = "traceId";

@Pointcut("execution(* org.dubhe..service..*.*(..))) ")
public void serviceAspect() {
}

@Pointcut(" serviceAspect() ")
public void aroundAspect() {
}

@Around("aroundAspect()")
public Object around(JoinPoint joinPoint) throws Throwable {
if (StringUtils.isEmpty(MDC.get(TRACE_ID))) {
MDC.put(TRACE_ID, UUID.randomUUID().toString());
}
return combineLogInfo(joinPoint);
}

/**
* Return the invocation result for the join point
*
* @param joinPoint join point
* @return Object invocation result
*/
private Object combineLogInfo(JoinPoint joinPoint) throws Throwable {
Object[] param = joinPoint.getArgs();
LogUtil.info(LogEnum.REST_REQ, "uri:{},input:{},==>begin", joinPoint.getSignature(), param);
long start = System.currentTimeMillis();
Object result = ((ProceedingJoinPoint) joinPoint).proceed();
long end = System.currentTimeMillis();
LogUtil.info(LogEnum.REST_REQ, "uri:{},output:{},proc_time:{},<==end", joinPoint.getSignature().toString(),
result, end - start);
return result;
}

}

+2 -0 dataset-util/src/main/java/org/dubhe/datasetutil/common/base/MagicNumConstant.java

@@ -47,6 +47,7 @@ public final class MagicNumConstant {
public static final int ONE_HUNDRED = 100;
public static final int ONE_HUNDRED_TWENTY_EIGHT = 128;
public static final int TWO_HUNDRED = 200;
public static final int FOUR_HUNDRED = 400;
public static final int FIVE_HUNDRED = 500;
public static final int FIVE_HUNDRED_AND_SIXTEEN = 516;
public static final int ONE_THOUSAND = 1000;
@@ -89,6 +90,7 @@ public final class MagicNumConstant {
public static final long TWELVE_LONG = 12L;
public static final long SIXTY_LONG = 60L;
public static final long FIFTY_LONG = 50L;
public static final long HUNDRED_LONG = 100L;
public static final long THOUSAND_LONG = 1000L;
public static final long TEN_THOUSAND_LONG = 10000L;
public static final long ONE_ZERO_ONE_ZERO_ONE_ZERO_LONG = 101010L;


+63 -0 dataset-util/src/main/java/org/dubhe/datasetutil/common/config/ImageConfig.java

@@ -0,0 +1,63 @@
/**
* Copyright 2020 Zhejiang Lab. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* =============================================================
*/
package org.dubhe.datasetutil.common.config;

import lombok.Data;
import org.dubhe.datasetutil.common.constant.BusinessConstant;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.stereotype.Component;

import java.util.Arrays;
import java.util.List;

/**
* @description Image format configuration
* @date 2020-10-29
*/
@Data
@Component
@ConfigurationProperties(prefix = "suffix")
public class ImageConfig {
/**
* Comma-separated image format string
*/
private String imageFormat;

/**
* Comma-separated text format string
*/
private String txtFormat;

/**
* Build the image format list
*
* @return List<String>
*/
public List<String> buildImageFormat() {
return Arrays.asList(imageFormat.split(BusinessConstant.COMMA));
}

/**
* Build the text format list
*
* @return List<String>
*/
public List<String> buildTxtFormat() {
return Arrays.asList(txtFormat.split(BusinessConstant.COMMA));
}

}
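
For context, a minimal usage sketch of this config. It assumes `application.yml` carries comma-separated suffix lists under the `suffix` prefix; the exact values below are illustrative, not taken from this commit:

```java
// Hypothetical application.yml entries (illustrative values only):
//   suffix:
//     imageFormat: .jpg,.png,.bmp,.jpeg
//     txtFormat: .txt

// ImageConfig is a Spring bean, so it can be fetched via the
// SpringContextHolder already used elsewhere in this module:
ImageConfig imageConfig = SpringContextHolder.getBean(ImageConfig.class);
List<String> imageFormats = imageConfig.buildImageFormat(); // [".jpg", ".png", ".bmp", ".jpeg"]
```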

+8 -4 dataset-util/src/main/java/org/dubhe/datasetutil/common/config/MinioConfig.java

@@ -17,8 +17,6 @@
package org.dubhe.datasetutil.common.config;

import io.minio.MinioClient;
import io.minio.errors.InvalidEndpointException;
import io.minio.errors.InvalidPortException;
import lombok.Data;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Bean;
@@ -45,14 +43,20 @@ public class MinioConfig {

private String bucketName;

private String nfsRootPath;

private String serverUserName;

private double blockingCoefficient;

/**
* Build the MinIO client
*
* @return MinIO client
*/
@Bean
public MinioClient getMinioClient() throws InvalidEndpointException, InvalidPortException {
return new MinioClient(endpoint, port, accessKey, secretKey,secure);
public MinioClient getMinioClient() {
return MinioClient.builder().endpoint("http://" + endpoint + ":" + port).credentials(accessKey, secretKey).build();
}

}

+93 -0 dataset-util/src/main/java/org/dubhe/datasetutil/common/constant/AnnotateTypeEnum.java

@@ -0,0 +1,93 @@
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* =============================================================
*/

package org.dubhe.datasetutil.common.constant;

import lombok.Getter;

/**
* @description Annotation type enum
* @date 2020-05-21
*/
@Getter
public enum AnnotateTypeEnum {

/**
 * Image classification
 */
CLASSIFICATION(2, "图像分类"),
/**
 * Object detection
 */
OBJECT_DETECTION(1, "目标检测"),
/**
 * Object tracking
 */
OBJECT_TRACK(5, "目标跟踪"),
/**
 * Semantic segmentation
 */
SEMANTIC_CUP(7, "语义分割"),
/**
 * Text classification
 */
TEXT_CLASSIFICATION(6, "文本分类"),
/**
 * Custom import
 */
AUTO_IMPORT(100, "自定义导入");


AnnotateTypeEnum(Integer value, String msg) {
this.value = value;
this.msg = msg;
}

private Integer value;
private String msg;

/**
* Validate an annotation type value (parameter validation for web API calls)
*
* @param value annotation type integer value
* @return whether the value is valid
*/
public static boolean isValid(Integer value) {
for (AnnotateTypeEnum annotateTypeEnum : AnnotateTypeEnum.values()) {
if (annotateTypeEnum.value.equals(value)) {
return true;
}
}
return false;
}

/**
* Get the code value for an annotation type name
*
* @param annotate annotation type name
* @return code value, or null if there is no match
*/
public static Integer getConvertAnnotateType(String annotate) {
for (AnnotateTypeEnum annotateTypeEnum : AnnotateTypeEnum.values()) {
if (annotateTypeEnum.msg.equals(annotate)) {
return annotateTypeEnum.value;
}
}
return null;
}

}

+62 -0 dataset-util/src/main/java/org/dubhe/datasetutil/common/constant/BusinessConstant.java

@@ -35,6 +35,12 @@ public class BusinessConstant {
* Sharding business code - file version relation table
*/
public static final String DATA_VERSION_FILE = "DATA_VERSION_FILE";

/**
* Sharding business code - file version annotation relation table
*/
public static final String DATA_FILE_ANNOTATION = "DATA_FILE_ANNOTATION";

/**
* Image file path
*/
@@ -43,6 +49,26 @@ public class BusinessConstant {
* Annotation file path
*/
public static final String ANNOTATION = "annotation";
/**
 * Version file path
 */
public static final String VERSION_FILE = "versionFile";
/**
 * Video file path
 */
public static final String VIDEO = "video";
/**
 * Version file V0001 path
 */
public static final String V0001 = "V0001";
/**
 * Version file ofrecord path
 */
public static final String OFRECORD = "ofrecord";
/**
 * Version file train path
 */
public static final String TRAIN = "train";
/**
 * Label file path
 */
@@ -55,11 +81,20 @@ public class BusinessConstant {
* Suffix "."
*/
public static final String SPOT = ".";
/**
 * Comma ","
 */
public static final String COMMA = ",";

/**
 * JSON file extension
 */
public static final String SUFFIX_JSON = ".JSON";

/**
 * SQL file extension
 */
public static final String SUFFIX_SQL = ".sql";
/**
 * MinIO root directory
*/
@@ -83,4 +118,31 @@ public class BusinessConstant {
*/
public static final String Y = "Y";

public static final String DEFAULT_VERSION = "V0001";

/**
 * Version file table
 */
public static final String DATA_DATASET_VERSION_FILE = "data_dataset_version_file";
/**
 * Dataset file table
 */
public static final String DATASET_FILE = "data_file";

/**
 * Text abstract
 */
public static final String ABSTRACT = "abstract_";

/**
 * Table suffix
 */
public static final String TABLE_SUFFIX = "_1";

/**
 * Delete invalid (large) files on the server
 * Example: rsync --delete-before -d /empty-dir /source-dir-to-reclaim
*/
public static final String DEL_COMMAND = "ssh %s@%s \"mkdir -p %s; rsync --delete-before -d %s %s; rmdir %s %s\"";

}
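
To make `DEL_COMMAND` concrete, here is how its seven `%s` placeholders expand; the user, host, and paths are made up for illustration:

```java
// Illustrative only: user, host, empty dir, rsync source (empty dir),
// rsync target, then the two directories removed by rmdir.
String cmd = String.format(BusinessConstant.DEL_COMMAND,
        "root", "10.5.29.100",
        "/tmp/empty", "/tmp/empty/", "/nfs/dataset/12/",
        "/tmp/empty", "/nfs/dataset/12");
// -> ssh root@10.5.29.100 "mkdir -p /tmp/empty; rsync --delete-before -d /tmp/empty/ /nfs/dataset/12/; rmdir /tmp/empty /nfs/dataset/12"
```

Syncing an empty directory with `--delete-before` empties a huge target directory much faster than `rm -rf`; the trailing `rmdir` then removes the two now-empty directories.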

+88 -0 dataset-util/src/main/java/org/dubhe/datasetutil/common/constant/DatatypeEnum.java

@@ -0,0 +1,88 @@
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* =============================================================
*/

package org.dubhe.datasetutil.common.constant;

import lombok.Getter;

/**
* @description Data type enum
* @date 2020-05-21
*/
@Getter
public enum DatatypeEnum {

/**
 * Image
 */
IMAGE(0, "图片"),
/**
 * Video
 */
VIDEO(1, "视频"),
/**
 * Text
 */
TEXT(2, "文本"),
/**
 * Custom import
 */
AUTO_IMPORT(100, "自定义导入");

DatatypeEnum(Integer value, String msg) {
this.value = value;
this.msg = msg;
}

private Integer value;
private String msg;

/**
* Validate a data type value (parameter validation for web API calls)
*
* @param value data type value
* @return whether the value is valid
*/
public static boolean isValid(Integer value) {
for (DatatypeEnum datatypeEnum : DatatypeEnum.values()) {
if (datatypeEnum.value.equals(value)) {
return true;
}
}
return false;
}

/**
* Get the data type enum for a value
*
* @param value data type value
* @return data type enum
*/
public static DatatypeEnum getEnumValue(Integer value) {
switch (value) {
case 0:
return IMAGE;
case 1:
return VIDEO;
case 2:
return TEXT;
default:
return IMAGE;
}
}

}

+58 -0 dataset-util/src/main/java/org/dubhe/datasetutil/common/enums/DatatypeEnum.java

@@ -0,0 +1,58 @@
/**
* Copyright 2020 Zhejiang Lab. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* =============================================================
*/

package org.dubhe.datasetutil.common.enums;

import lombok.Getter;

/**
* @description Data type enum
* @date 2020-11-23
*/
@Getter
public enum DatatypeEnum {

/**
 * Image
 */
IMAGE(0, "图片"),
/**
 * Video
 */
VIDEO(1, "视频"),
/**
 * Text
 */
TXT(2, "文本");

DatatypeEnum(Integer value, String msg) {
this.value = value;
this.msg = msg;
}

/**
* Data type value
*/
private Integer value;

/**
* Data type description
*/
private String msg;


}

+77 -0 dataset-util/src/main/java/org/dubhe/datasetutil/common/enums/PresetDatasetEnum.java

@@ -0,0 +1,77 @@
/**
* Copyright 2020 Zhejiang Lab. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* =============================================================
*/

package org.dubhe.datasetutil.common.enums;

import lombok.Getter;
import lombok.ToString;

/**
* @Description Preset dataset type enum
* @Date 2020-11-03
*/
@ToString
@Getter
public enum PresetDatasetEnum {
/**
* COCO2017-val
*/
COCO2017Val("1", "COCO2017-val"),

/**
* Caltech-256
*/
Caltech256("2", "Caltech-256"),

/**
* COCO2017-train
*/
COCO2017Train("3", "COCO2017-train"),

/**
* Object-Tracking
*/
ObjectTracking("4", "Object-Tracking"),

/**
* Data-Augment
*/
DataAugment("5", "Data-Augment"),

/**
* IMDB_DATASET
*/
ImdbDataset("101", "NLP_IMDB"),

;

/**
* Preset dataset type
*/
private String type;

/**
* Type description
*/
private String desc;

PresetDatasetEnum(String type, String desc) {
this.type = type;
this.desc = desc;
}

}

+91 -0 dataset-util/src/main/java/org/dubhe/datasetutil/common/util/EsConfiguration.java

@@ -0,0 +1,91 @@
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* =============================================================
*/

package org.dubhe.datasetutil.common.util;

import org.elasticsearch.action.bulk.BackoffPolicy;
import org.elasticsearch.action.bulk.BulkProcessor;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.common.unit.ByteSizeUnit;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.transport.client.PreBuiltTransportClient;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

import java.net.InetAddress;
import java.net.UnknownHostException;

/**
* @description ES bulk data sync configuration
* @date 2021-03-24
*/
@Configuration
public class EsConfiguration {

/**
* ES server host
*/
@Value("${es.host}")
private String esServerHost;

/**
* ES transport port
*/
@Value("${es.transportPort}")
private String estransportPort;

/**
* ES cluster name
*/
@Value("${es.clusterName}")
private String clusterName;

@Bean(name = "bulkProcessor")
public BulkProcessor bulkProcessor() throws UnknownHostException {
Settings settings = Settings.builder().put("cluster.name", clusterName).build();
Client client = new PreBuiltTransportClient(settings)
.addTransportAddress(new TransportAddress(InetAddress.getByName(esServerHost), Integer.parseInt(estransportPort)));
return BulkProcessor.builder(client, new BulkProcessor.Listener() {
@Override
public void beforeBulk(long l, BulkRequest bulkRequest) {

}

@Override
public void afterBulk(long l, BulkRequest bulkRequest, BulkResponse bulkResponse) {

}

@Override
public void afterBulk(long l, BulkRequest bulkRequest, Throwable throwable) {

}

}).setBulkActions(1000)
.setBulkSize(new ByteSizeValue(5, ByteSizeUnit.MB))
.setFlushInterval(TimeValue.timeValueSeconds(5))
.setConcurrentRequests(1)
.setBackoffPolicy(BackoffPolicy.exponentialBackoff(TimeValue.timeValueMillis(100), 3))
.build();
}
}
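
A hedged usage sketch of the bean above: the processor batches documents and flushes at 1,000 actions, 5 MB, or every 5 seconds, retrying with exponential backoff. The index name, type, and JSON body are assumptions for illustration, not part of this commit:

```java
import org.elasticsearch.action.bulk.BulkProcessor;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.common.xcontent.XContentType;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

@Component
public class EsSyncSketch {

    @Autowired
    private BulkProcessor bulkProcessor;

    /** Queue one document; the processor flushes per the thresholds above. */
    public void syncOne(String docId, String json) {
        bulkProcessor.add(new IndexRequest("text_dataset", "_doc", docId)
                .source(json, XContentType.JSON));
    }
}
```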

+43 -0 dataset-util/src/main/java/org/dubhe/datasetutil/common/util/FileUtil.java

@@ -0,0 +1,43 @@
package org.dubhe.datasetutil.common.util;

import java.io.File;
import java.util.ArrayList;
import java.util.List;

/**
* @description File traversal utility
* @date 2021-03-23
*/
public class FileUtil {

/**
* Recursively traverse a directory and collect file paths
*
* @param path directory path
* @return List<String> absolute paths of all files found
*/
public static List<String> traverseFolder(String path) {
List<String> filePaths = new ArrayList<>();
// create a File object for the given path
File file = new File(path);
// proceed only if the path exists
if (file.exists()) {
// list all files and subdirectories under the directory
File[] files = file.listFiles();
// guard against empty or unreadable directories
if (files != null && files.length > 0) {
// iterate over the entries
for (File f : files) {
// directory or regular file?
if (f.isDirectory()) {
// recurse into the subdirectory
filePaths.addAll(traverseFolder(f.getPath()));
} else {
filePaths.add(f.getAbsolutePath());
}
}
}
}
return filePaths;
}

}
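
Usage is a one-liner; the path below is illustrative:

```java
// Collect every file under a dataset directory, recursing into subfolders
List<String> files = FileUtil.traverseFolder("/data/upload_dataset/origin");
files.forEach(System.out::println);
```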

+13 -46 dataset-util/src/main/java/org/dubhe/datasetutil/common/util/GeneratorKeyUtil.java

@@ -18,16 +18,14 @@ package org.dubhe.datasetutil.common.util;

import cn.hutool.core.util.ObjectUtil;
import org.dubhe.datasetutil.common.base.MagicNumConstant;
import org.dubhe.datasetutil.common.enums.LogEnum;
import org.dubhe.datasetutil.common.exception.DataSequenceException;
import org.dubhe.datasetutil.domain.dto.IdAlloc;
import org.dubhe.datasetutil.domain.entity.DataSequence;
import org.dubhe.datasetutil.service.DataSequenceService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.util.StringUtils;
import java.util.Queue;
import java.util.concurrent.ConcurrentHashMap;

/**
@@ -49,8 +47,7 @@ public class GeneratorKeyUtil {
* @param number count of ids requested
* @return Queue<Long> allocated ids
*/
@Transactional(rollbackFor = Exception.class)
public synchronized Long getSequenceByBusinessCode(String businessCode, int number) {
public synchronized Queue<Long> getSequenceByBusinessCode(String businessCode, int number) {
if (StringUtils.isEmpty(businessCode)) {
throw new DataSequenceException("业务编码不可为空");
}
@@ -63,56 +60,26 @@ public class GeneratorKeyUtil {
idAllocConcurrentHashMap.put(businessCode, idAlloc);
}

if (idAlloc.getUsedNumber() == MagicNumConstant.ZERO) {
DataSequence dataSequence = getDataSequence(businessCode);
updateDataSequence(businessCode);
idAlloc.setStartNumber(dataSequence.getStart());
idAlloc.setEndNumber(dataSequence.getStart() + dataSequence.getStep() - MagicNumConstant.ONE);
idAlloc.setUsedNumber(idAlloc.getEndNumber() - idAlloc.getStartNumber() + MagicNumConstant.ONE);
}
if (idAlloc.getUsedNumber() <= number) {
if (idAlloc.getUnUsed() < number) {
// expand the id pool
expansionUsedNumber(businessCode, number);
}
long returnStartNumber = idAlloc.getStartNumber();
idAlloc.setStartNumber(idAlloc.getStartNumber() + number);
idAlloc.setUsedNumber(idAlloc.getUsedNumber() - number);
return returnStartNumber;
}

/**
* Get sequence configuration by business code
* @param businessCode business code
* @return DataSequence sequence record
*/
private DataSequence getDataSequence(String businessCode) {
DataSequence dataSequence = dataSequenceService.getSequence(businessCode);
if (dataSequence == null || dataSequence.getStart() == null || dataSequence.getStep() == null) {
throw new DataSequenceException("配置出错,请检查data_sequence表配置");
}
return dataSequence;
// hand out the ids
return idAlloc.poll(number);
}

/**
* Advance the sequence start value by business code
* @param businessCode business code
*/
private void updateDataSequence(String businessCode) {
dataSequenceService.updateSequenceStart(businessCode);
}

/**
* Expand repeatedly
* Expand the id pool
* @param businessCode business code
* @param number count of ids required
*/
private void expansionUsedNumber(String businessCode, int number) {
protected void expansionUsedNumber(String businessCode, int number) {
IdAlloc idAlloc = idAllocConcurrentHashMap.get(businessCode);
updateDataSequence(businessCode);
DataSequence dataSequenceNew = getDataSequence(businessCode);
idAlloc.setEndNumber(idAlloc.getEndNumber() + dataSequenceNew.getStep());
idAlloc.setUsedNumber(idAlloc.getEndNumber() - idAlloc.getStartNumber() + MagicNumConstant.ONE);
if (idAlloc.getUsedNumber() <= number) {
DataSequence dataSequenceNew = dataSequenceService.expansionUsedNumber(businessCode);
idAlloc.add(dataSequenceNew);
if(idAlloc.getUnUsed() < number) {
expansionUsedNumber(businessCode, number);
}
}
}

}
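
The rewritten utility leans on the reworked `IdAlloc` (also changed in this commit, +28/-16, but not shown in this diff). A minimal sketch of the shape the code above implies, assuming ids are buffered in a queue; only `add`, `getUnUsed`, and `poll` are implied by the calls above, everything else is hypothetical:

```java
import java.util.LinkedList;
import java.util.Queue;

// Hypothetical queue-backed IdAlloc; the real class is
// domain/dto/IdAlloc.java and may differ in detail.
public class IdAllocSketch {

    private final Queue<Long> ids = new LinkedList<>();

    /** Buffer a freshly allocated [start, start + step) id range. */
    public void add(DataSequence sequence) {
        for (long i = sequence.getStart(); i < sequence.getStart() + sequence.getStep(); i++) {
            ids.offer(i);
        }
    }

    /** Number of buffered, not-yet-handed-out ids. */
    public int getUnUsed() {
        return ids.size();
    }

    /** Hand out the next {@code number} ids. */
    public Queue<Long> poll(int number) {
        Queue<Long> result = new LinkedList<>();
        for (int i = 0; i < number; i++) {
            result.offer(ids.poll());
        }
        return result;
    }
}
```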

+24 -0 dataset-util/src/main/java/org/dubhe/datasetutil/common/util/HandleFileUtil.java

@@ -19,9 +19,12 @@ package org.dubhe.datasetutil.common.util;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;
import org.dubhe.datasetutil.common.constant.BusinessConstant;
import org.springframework.util.ObjectUtils;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
* @description File utility
@@ -45,6 +48,27 @@ public class HandleFileUtil {
}


/**
* Read a file's contents line by line, skipping empty lines
*
* @param file file to read
* @return List<String> the non-empty lines of the file
*/
public static List<String> readFileInfo(File file) throws IOException{

List<String> datasetList = new ArrayList<>();
LineIterator fileContext = FileUtils.lineIterator(file,"UTF-8");
while (fileContext.hasNext()) {
String line = fileContext.nextLine();
if(!ObjectUtils.isEmpty(line)){
datasetList.add(line);
}

}
return datasetList;
}


/**
* Get the file name extension
*


+45 -0 dataset-util/src/main/java/org/dubhe/datasetutil/common/util/IOUtil.java

@@ -0,0 +1,45 @@
/**
* Copyright 2020 Zhejiang Lab. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* =============================================================
*/
package org.dubhe.datasetutil.common.util;

import org.dubhe.datasetutil.common.exception.ImportDatasetException;
import java.io.Closeable;
import java.io.IOException;

/**
* @description IO stream utility
* @date 2020-11-14
*/
public class IOUtil {

/**
* Close streams one by one
*
* @param closeableList the streams to close
*/
public static void close(Closeable... closeableList) {
for (Closeable closeable : closeableList) {
try {
if (closeable != null) {
closeable.close();
}
} catch (IOException e) {
throw new ImportDatasetException(" 流关闭异常 ");
}
}
}
}
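
Typical call site, closing several streams in one go (paths are illustrative):

```java
public void copyOnce() throws IOException {
    InputStream in = null;
    OutputStream out = null;
    try {
        in = new FileInputStream("origin/cat.jpg");
        out = new FileOutputStream("copy/cat.jpg");
        // ... copy bytes ...
    } finally {
        // null-safe; a failing close surfaces as ImportDatasetException
        IOUtil.close(in, out);
    }
}
```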

+1 -9 dataset-util/src/main/java/org/dubhe/datasetutil/common/util/LogUtil.java

@@ -22,15 +22,12 @@ import com.alibaba.fastjson.JSON;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.exception.ExceptionUtils;
import org.dubhe.datasetutil.common.aspect.LogAspect;
import org.dubhe.datasetutil.common.enums.LogEnum;
import org.dubhe.datasetutil.domain.entity.LogInfo;
import org.slf4j.MDC;
import org.slf4j.MarkerFactory;
import org.slf4j.helpers.MessageFormatter;

import java.util.Arrays;
import java.util.UUID;

/**
* @description Log utility
@@ -179,14 +176,9 @@ public class LogUtil {
logType = LogEnum.SYS_ERR;
}

// fetch the trace_id
if (StringUtils.isEmpty(MDC.get(LogAspect.TRACE_ID))) {
MDC.put(LogAspect.TRACE_ID, UUID.randomUUID().toString());
}
// set the logInfo level, type and traceId attributes
logInfo.setLevel(level.levelStr)
.setType(logType.toString())
.setTraceId(MDC.get(LogAspect.TRACE_ID));
.setType(logType.toString());


// custom log level


+102 -10 dataset-util/src/main/java/org/dubhe/datasetutil/common/util/MinioUtil.java

@@ -16,14 +16,20 @@
*/
package org.dubhe.datasetutil.common.util;

import io.minio.MinioClient;
import io.minio.PutObjectOptions;
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.io.IoUtil;
import io.minio.*;
import org.dubhe.datasetutil.common.base.MagicNumConstant;
import org.dubhe.datasetutil.common.config.MinioConfig;
import org.dubhe.datasetutil.common.constant.BusinessConstant;
import org.dubhe.datasetutil.common.enums.LogEnum;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;

/**
* @description MinIO utility
@@ -41,24 +47,110 @@ public class MinioUtil {
/**
* Upload a file
*
* @param objectName object name
* @param inputStream file stream
* @param sourceFilePath absolute path of the source file
* @param targetFilePath target file path
* @throws Exception upload exception
*/
public void upLoadFile(String sourceFilePath, String targetFilePath) throws Exception {
LogUtil.info(LogEnum.BIZ_DATASET, "源文件目录: 【" + sourceFilePath + "】" + " 目标目录: 【" + targetFilePath + "】");
try {
ObjectWriteResponse objectWriteResponse = minioClient.uploadObject(UploadObjectArgs
.builder()
.bucket(minioConfig.getBucketName())
.object(targetFilePath)
.filename(sourceFilePath)
.contentType(contentType(FileUtil.getName(sourceFilePath)))
.build()
);
} catch (IOException e) {
LogUtil.error(LogEnum.BIZ_DATASET, "上传文件失败: {} ", e);
}
}


/**
* Upload a file (the source stream is consumed and closed)
*
* @param targetFilePath target file path
* @param filePath absolute path of the source file
* @throws Exception upload exception
*/
public void upLoadFile(String objectName, InputStream inputStream) throws Exception {
LogUtil.info(LogEnum.BIZ_DATASET,"文件上传名称为: 【" + objectName + "】");
PutObjectOptions options = new PutObjectOptions(inputStream.available(), -1);
minioClient.putObject(minioConfig.getBucketName(), objectName, inputStream, options);
public void upLoadFileByInputStream(String targetFilePath, String filePath) throws Exception {
try {
minioClient.uploadObject(UploadObjectArgs
.builder()
.bucket(minioConfig.getBucketName())
.object(targetFilePath)
.filename(filePath)
.contentType(
contentType(
FileUtil.getName(filePath)
)
).build()
);
} catch (Exception e) {
LogUtil.error(LogEnum.BIZ_DATASET, "上传文件失败: {} ", e);
}
}

/**
* Get the file URL
*
* @param objectName object name
* @return String file path
*/
public String getUrl(String objectName) {
return minioConfig.getBucketName() + "/" + objectName;
return minioConfig.getBucketName() + BusinessConstant.FILE_SEPARATOR + objectName;
}

/**
* Read a file as a string
*
* @param bucketName bucket
* @param fullFilePath full storage path including the file name, not starting with '/', e.g. dataset/12/annotation/test.txt
* @return String
*/
public String readString(String bucketName, String fullFilePath) {
try (InputStream is = minioClient.getObject(GetObjectArgs
.builder()
.bucket(bucketName)
.object(fullFilePath)
.build()
)) {
return IoUtil.read(is, Charset.defaultCharset());
} catch (Exception e) {
LogUtil.error(LogEnum.BIZ_DATASET, "读取文本content失败: {} ", e);
return null;
}
}

private String contentType(String fileName) {
if (fileName.endsWith("xml")) {
return "text/xml";
} else if (fileName.endsWith("jpg") || fileName.endsWith("jpe") || fileName.endsWith("jpeg")) {
return "image/jpg";
} else if (fileName.endsWith("png")) {
return "image/png";
} else if (fileName.endsWith("pic")) {
return "image/pict";
} else if (fileName.endsWith("avi")) {
return "video/x-msvideo";
} else if (fileName.endsWith("mp4")) {
return "video/mp4";
} else if (fileName.endsWith("ogg")) {
return "video/ogg";
} else if (fileName.endsWith("webm")) {
return "video/webm";
} else if (fileName.endsWith("HTML") || fileName.endsWith("html")) {
return "text/html";
} else if (fileName.endsWith("DOCX") || fileName.endsWith("docx") || fileName.endsWith("DOC")
|| fileName.endsWith("doc")) {
return "application/msword";
} else if (fileName.endsWith("XML") || fileName.endsWith("xml")) {
return "text/xml";
} else if (fileName.endsWith("pdf")) {
return "application/pdf";
}
return "image/jpeg";
}

}
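
A hedged usage sketch of the new API; the bucket name and paths are illustrative:

```java
public void roundTrip() throws Exception {
    MinioUtil minioUtil = SpringContextHolder.getBean(MinioUtil.class);
    // push a local image into the configured bucket...
    minioUtil.upLoadFile("/data/origin/cat.jpg", "dataset/12/origin/cat.jpg");
    // ...and read an annotation JSON back out
    String annotation = minioUtil.readString("dubhe-dev", "dataset/12/annotation/cat.json");
}
```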

+21 -8 dataset-util/src/main/java/org/dubhe/datasetutil/common/util/MyPreciseShardingAlgorithm.java

@@ -16,6 +16,7 @@
*/
package org.dubhe.datasetutil.common.util;

import lombok.extern.slf4j.Slf4j;
import org.apache.shardingsphere.api.sharding.standard.PreciseShardingAlgorithm;
import org.apache.shardingsphere.api.sharding.standard.PreciseShardingValue;
import org.dubhe.datasetutil.common.base.MagicNumConstant;
@@ -24,16 +25,21 @@ import org.dubhe.datasetutil.service.DataSequenceService;
import org.springframework.beans.factory.annotation.Autowired;

import java.util.Collection;
import java.util.HashSet;
import java.util.Set;

/**
* @description Data sharding
* @date 2020-09-21
*/
public class MyPreciseShardingAlgorithm implements PreciseShardingAlgorithm<Long>{
@Slf4j
public class MyPreciseShardingAlgorithm implements PreciseShardingAlgorithm<Long> {

@Autowired
private DataSequenceService dataSequenceService;

private static Set<String> tableNames = new HashSet<>();

/**
* Table sharding
*
@@ -45,10 +51,17 @@ public class MyPreciseShardingAlgorithm implements PreciseShardingAlgorithm<Long
public String doSharding(Collection<String> collection, PreciseShardingValue<Long> preciseShardingValue) {
long startIndex = MagicNumConstant.ONE;
long endIndex = MagicNumConstant.FIFTY;
dataSequenceService = SpringContextHolder.getBean(DataSequenceService.class);
String tableName = preciseShardingValue.getLogicTableName()+ BusinessConstant.UNDERLINE + preciseSharding(preciseShardingValue.getValue(),startIndex ,endIndex);
if(!dataSequenceService.checkTableExist(tableName)){
dataSequenceService.createTable(tableName);
String tableName = preciseShardingValue.getLogicTableName() + BusinessConstant.UNDERLINE + preciseSharding(preciseShardingValue.getValue(), startIndex, endIndex);
if (!tableNames.contains(tableName)) {
dataSequenceService = SpringContextHolder.getBean(DataSequenceService.class);
if (!dataSequenceService.checkTableExist(tableName)) {
try {
dataSequenceService.createTable(tableName);
} catch (Exception e) {
log.error("table name repeat {}", tableName);
}
}
tableNames.add(tableName);
}
return tableName;
}
@@ -61,11 +74,11 @@ public class MyPreciseShardingAlgorithm implements PreciseShardingAlgorithm<Long
* @param endIndex end value
* @return long shard table suffix
*/
public long preciseSharding(long indexId,long startIndex , long endIndex){
if(indexId > endIndex){
public long preciseSharding(long indexId, long startIndex, long endIndex) {
if (indexId > endIndex) {
startIndex = startIndex + BusinessConstant.INTERVAL_NUMBER;
endIndex = endIndex + BusinessConstant.INTERVAL_NUMBER;
return preciseSharding(indexId,startIndex,endIndex);
return preciseSharding(indexId, startIndex, endIndex);
}
return endIndex / BusinessConstant.INTERVAL_NUMBER;
}
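
A worked example of `preciseSharding`, assuming `BusinessConstant.INTERVAL_NUMBER` is 50 (consistent with the `startIndex = 1`, `endIndex = 50` seed in `doSharding`; the constant's actual value is not shown in this diff):

```java
MyPreciseShardingAlgorithm algorithm = new MyPreciseShardingAlgorithm();
// id 7: 7 <= 50, so the suffix is 50 / 50 = 1 -> data_file_1
long suffixA = algorithm.preciseSharding(7L, 1L, 50L);
// id 120: recurse through (51, 100) and (101, 150); 120 <= 150 -> 150 / 50 = 3 -> data_file_3
long suffixB = algorithm.preciseSharding(120L, 1L, 50L);
```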


+32 -0 dataset-util/src/main/java/org/dubhe/datasetutil/common/util/ProcessBarUtil.java

@@ -0,0 +1,32 @@
package org.dubhe.datasetutil.common.util;

import me.tongfei.progressbar.ProgressBar;

/**
* @description Progress bar utility
* @date 2021-03-23
*/
public class ProcessBarUtil {

public static ProgressBar pb = null;

/**
* Initialize the progress bar
*
* @param task task name
* @param maxValue total number of steps
*/
public static void initProcess(String task, Long maxValue) {
pb = new ProgressBar(task, maxValue);
}

/**
* Advance the progress bar
*
* @param step number of steps to advance
*/
public static void processBar01(Long step) {
pb.stepBy(step);
}

}
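
Illustrative usage: one bar for a batch of 1,000 files, stepping once per upload:

```java
ProcessBarUtil.initProcess("import", 1000L);
for (int i = 0; i < 1000; i++) {
    // ... upload one file ...
    ProcessBarUtil.processBar01(1L);
}
```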

+13 -7 dataset-util/src/main/java/org/dubhe/datasetutil/common/util/ThreadUtils.java

@@ -18,6 +18,9 @@ package org.dubhe.datasetutil.common.util;


import lombok.extern.slf4j.Slf4j;
import org.dubhe.datasetutil.common.base.MagicNumConstant;
import org.dubhe.datasetutil.common.config.MinioConfig;
import org.springframework.util.CollectionUtils;

import java.util.List;
import java.util.concurrent.*;
@@ -29,7 +32,8 @@ import java.util.concurrent.*;
@Slf4j
public class ThreadUtils {

private ThreadUtils(){}
private ThreadUtils() {
}

/**
* Compute the per-thread batch size for the number of items to process
@@ -38,7 +42,7 @@ public class ThreadUtils {
* @return int batch size
*/
public static int createThread(int listSize) {
return listSize / getNeedThreadNumber() == 0 ? 1 : listSize / getNeedThreadNumber();
return listSize / getNeedThreadNumber() == MagicNumConstant.ZERO ? MagicNumConstant.ONE : listSize / getNeedThreadNumber();
}


@@ -49,24 +53,26 @@ public class ThreadUtils {
*/
public static int getNeedThreadNumber() {
final int numOfCores = Runtime.getRuntime().availableProcessors();
final double blockingCoefficient = 0.8;
return (int) (numOfCores / (1 - blockingCoefficient));
MinioConfig minioConfig = (MinioConfig) SpringContextHolder.getBean("minioConfig");
final double blockingCoefficient = minioConfig.getBlockingCoefficient();
return (int) (numOfCores / (MagicNumConstant.ONE - blockingCoefficient));
}

/**
* Run the given task partitions across multiple threads
*
* @param partitions task partitions
* @throws Exception execution exception
*/
public static void runMultiThread(List<Callable<Integer>> partitions) throws Exception {
final ExecutorService executorService = Executors.newFixedThreadPool(ThreadUtils.getNeedThreadNumber());
final List<Future<Integer>> valueOfStocks = executorService.invokeAll(partitions);
Integer endCount = 0;
Integer endCount = MagicNumConstant.ZERO;
for (final Future<Integer> value : valueOfStocks) {
endCount += value.get();
}
log.warn("#-------------处理结束,成功处理文件 【" + endCount + "】个-------------#");
executorService.shutdown();
Thread.sleep(1000);
}

}
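
The pool sizing follows the classic `threads = cores / (1 - blockingCoefficient)` rule for IO-bound work, with the coefficient now read from `MinioConfig` instead of being hard-coded at 0.8. A worked example, assuming an 8-core machine and a configured coefficient of 0.8:

```java
int threads = ThreadUtils.getNeedThreadNumber(); // 8 / (1 - 0.8) = 40 threads
int batchSize = ThreadUtils.createThread(1000);  // 1000 / 40 = 25 items per partition
```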

+46 -0 dataset-util/src/main/java/org/dubhe/datasetutil/dao/DataFileAnnotationMapper.java

@@ -0,0 +1,46 @@
/**
* Copyright 2020 Zhejiang Lab. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* =============================================================
*/
package org.dubhe.datasetutil.dao;

import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import org.apache.ibatis.annotations.Delete;
import org.apache.ibatis.annotations.Param;
import org.dubhe.datasetutil.domain.entity.DataFileAnnotation;

import java.util.List;

/**
* @description NLP file annotation mapper
* @date 2021-01-07
*/
public interface DataFileAnnotationMapper extends BaseMapper<DataFileAnnotation> {

/**
* Batch-insert into the NLP relation table
*
* @param dataFileAnnotations NLP annotation records
*/
void saveDataFileAnnotation(@Param("dataFileAnnotations") List<DataFileAnnotation> dataFileAnnotations);

/**
* Delete dataset file annotation data by dataset ID
*
* @param datasetId dataset ID
*/
@Delete("delete from data_file_annotation where dataset_id = #{datasetId}")
void delDataFileAnnotationById(long datasetId);
}

+32 -1 dataset-util/src/main/java/org/dubhe/datasetutil/dao/DataFileMapper.java

@@ -17,7 +17,7 @@
package org.dubhe.datasetutil.dao;

import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import org.apache.ibatis.annotations.Param;
import org.apache.ibatis.annotations.*;
import org.dubhe.datasetutil.domain.entity.DataFile;
import java.util.List;

@@ -33,4 +33,35 @@ public interface DataFileMapper extends BaseMapper<DataFile> {
* @param listDataFile file records
*/
void saveBatchDataFile(@Param("listDataFile") List<DataFile> listDataFile);


/**
* Create new table data_file_1
*/
@Update("CREATE TABLE data_file_1 LIKE data_file")
void createNewTableOne();


/**
* Create new table data_file_2
*/
@Update("CREATE TABLE data_file_2 LIKE data_file")
void createNewTableTwo();

/**
* Count tables with the given name
*
* @param tableName table name
* @return table count
*/
@Select("select count(*) from information_schema.TABLES where table_name = #{tableName}")
int selectCountByTableName(@Param("tableName") String tableName);

/**
* Delete dataset files by dataset ID
*
* @param datasetId dataset ID
*/
@Delete("delete from data_file where dataset_id = #{datasetId}")
void deleteFileByDatasetId(@Param("datasetId") long datasetId);
}

+21 -0 dataset-util/src/main/java/org/dubhe/datasetutil/dao/DataLabelMapper.java

@@ -17,6 +17,7 @@
package org.dubhe.datasetutil.dao;

import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import org.apache.ibatis.annotations.Delete;
import org.apache.ibatis.annotations.Param;
import org.dubhe.datasetutil.domain.entity.DataLabel;

@@ -33,4 +34,24 @@ public interface DataLabelMapper extends BaseMapper<DataLabel> {
* @param listDataLabel label records
*/
void saveBatchDataLabel(@Param("listDataLabel") List<DataLabel> listDataLabel);



/**
* Get preset labels by preset label group IDs
*
* @param groupIds preset label group IDs
* @return List<DataLabel> preset labels
*/
List<DataLabel> getPresetLabelList(@Param("groupIds") List<Long> groupIds);

/**
* Delete the labels bound to a dataset
*
* @param datasetId dataset ID
*/
@Delete("delete from data_label where id IN (\n" +
" select * from ( select label_id from data_dataset_label where dataset_id = #{datasetId}) t\n" +
")")
void deleteLabelByDatasetId(@Param("datasetId") long datasetId);
}

+13 -4 dataset-util/src/main/java/org/dubhe/datasetutil/dao/DataSequenceMapper.java

@@ -34,7 +34,16 @@ public interface DataSequenceMapper extends BaseMapper<DataSequence> {
* @return DataSequence the sequence for the business code
*/
@Select("select id, business_code ,start, step from data_sequence where business_code = #{businessCode}")
DataSequence selectByBusiness(String businessCode);
DataSequence selectByBusiness(@Param("businessCode") String businessCode);

/**
* Query by ID (locks the row via "for update")
*
* @param id sequence ID
* @return DataSequence the sequence with the given ID
*/
@Select("select id, business_code ,start, step from data_sequence where id = #{id} for update")
DataSequence selectDataSequenceById(@Param("id") Long id);

/**
* Advance the sequence start value by business code
@@ -43,7 +52,7 @@ public interface DataSequenceMapper extends BaseMapper<DataSequence> {
* @return int number of rows updated
*/
@Update("update data_sequence set start = start + step where business_code = #{businessCode} ")
int updateStartByBusinessCode(String businessCode);
int updateStartByBusinessCode(@Param("businessCode") String businessCode);

/**
* Query the count of existing tables
@@ -60,7 +69,7 @@ public interface DataSequenceMapper extends BaseMapper<DataSequence> {
* @param tableName new table name
* @param oldTableName old table name
*/
@Update({"CREATE TABLE ${tableName} AS select * from ${oldTableName} "})
@Update({"CREATE TABLE ${tableName} like ${oldTableName}"})
void createNewTable(@Param("tableName") String tableName, @Param("oldTableName") String oldTableName);

}
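
The new selectDataSequenceById appends FOR UPDATE, so the sequence row stays locked for the duration of the enclosing transaction and two importers cannot be handed the same ID block. A sketch of the allocation pattern this enables (method name and transaction wiring are assumptions; GeneratorKeyUtil holds the real implementation):

// Must run inside a transaction: the FOR UPDATE lock is released on commit.
@Transactional(rollbackFor = Exception.class)
public DataSequence allocateBlock(Long sequenceId) {
// Lock the row and read the current start/step.
DataSequence sequence = dataSequenceMapper.selectDataSequenceById(sequenceId);
// Advance start so the next transaction receives the following block.
dataSequenceMapper.updateStartByBusinessCode(sequence.getBusinessCode());
// The caller now owns [start, start + step) exclusively.
return sequence;
}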

+ 42
- 2
dataset-util/src/main/java/org/dubhe/datasetutil/dao/DataVersionFileMapper.java View File

@@ -17,8 +17,8 @@
package org.dubhe.datasetutil.dao;

import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import org.apache.ibatis.annotations.Param;
import org.dubhe.datasetutil.domain.dto.DataVersionFile;
import org.apache.ibatis.annotations.*;
import org.dubhe.datasetutil.domain.entity.DataVersionFile;

import java.util.List;

@@ -34,4 +34,44 @@ public interface DataVersionFileMapper extends BaseMapper<DataVersionFile> {
* @param listDataVersionFile collection of dataset version-file relation records
*/
void saveBatchDataFileVersion(@Param("listDataVersionFile") List<DataVersionFile> listDataVersionFile);


/**
* Create new table data_dataset_version_file_1
*/
@Update("create table data_dataset_version_file_1 like data_dataset_version_file")
void createNewTableOne();


/**
* Create new table data_dataset_version_file_2
*/
@Update("create table data_dataset_version_file_2 like data_dataset_version_file")
void createNewTableTwo();


/**
* Count tables matching a table name
*
* @param tableName table name
* @return number of matching tables
*/
@Select("select count(*) from information_schema.TABLES where table_name = #{tableName}")
int selectCountByTableName(@Param("tableName") String tableName);

/**
* Delete dataset version files by dataset ID
*
* @param datasetId dataset ID
*/
@Delete("delete from data_dataset_version_file where dataset_id = #{datasetId}")
void deleteVersionFileByDatasetId(@Param("datasetId") long datasetId);

/**
* Delete dataset versions by dataset ID
*
* @param datasetId dataset ID
*/
@Delete("delete from data_dataset_version where dataset_id = #{datasetId}")
void deleteVersionByDatasetId(@Param("datasetId") long datasetId);
}

+ 9
- 0
dataset-util/src/main/java/org/dubhe/datasetutil/dao/DatasetDataLabelMapper.java View File

@@ -17,6 +17,7 @@
package org.dubhe.datasetutil.dao;

import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import org.apache.ibatis.annotations.Delete;
import org.apache.ibatis.annotations.Param;
import org.dubhe.datasetutil.domain.entity.DatasetDataLabel;

@@ -34,4 +35,12 @@ public interface DatasetDataLabelMapper extends BaseMapper<DatasetDataLabel> {
* @param datasetDataLabelList dataset-label relation records
*/
void saveBatchDatasetDataLabel(@Param("datasetDataLabelList") List<DatasetDataLabel> datasetDataLabelList);

/**
* Delete dataset-label relations by dataset ID
*
* @param datasetId dataset ID
*/
@Delete("delete from data_dataset_label where dataset_id = #{datasetId}")
void deleteDatasetLabelByDatasetId(@Param("datasetId") long datasetId);
}

+ 28
- 0
dataset-util/src/main/java/org/dubhe/datasetutil/dao/DatasetMapper.java View File

@@ -17,6 +17,8 @@
package org.dubhe.datasetutil.dao;

import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import org.apache.ibatis.annotations.Delete;
import org.apache.ibatis.annotations.Insert;
import org.apache.ibatis.annotations.Param;
import org.apache.ibatis.annotations.Select;
import org.dubhe.datasetutil.domain.entity.Dataset;
@@ -53,4 +55,30 @@ public interface DatasetMapper extends BaseMapper<Dataset> {
*/
@Select("select count(1) from data_file where dataset_id = #{datasetId}")
int findDataFileById(@Param("datasetId") Long datasetId);

/**
* Query a dataset by dataset ID
*
* @param datasetId dataset id
* @return Dataset the dataset with the given ID
*/
@Select("select * from data_dataset where id = #{datasetId}")
Dataset findDatasetByIdNormal(@Param("datasetId") Long datasetId);


/**
* Insert datasets via a caller-built SQL statement
*
* @param insertSql complete INSERT statement (interpolated verbatim)
*/
@Insert("${insertSql}")
void saveBatch(@Param("insertSql") String insertSql);

/**
* Delete a dataset by dataset ID
*
* @param datasetId dataset ID
*/
@Delete("delete from data_dataset where id = #{datasetId}")
void deleteDatasetById(@Param("datasetId") long datasetId);
}
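
Note that saveBatch interpolates ${insertSql} verbatim; unlike #{} parameters, ${} is plain string substitution with no escaping, so the caller must assemble (and sanitize) the full statement itself. A hedged sketch of the calling convention this implies (the column list is an assumption):

// The complete INSERT is built by the caller and passed through as-is.
String insertSql = "insert into data_dataset (id, name) values (1, 'preset-dataset-1'), (2, 'preset-dataset-2')";
datasetMapper.saveBatch(insertSql);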

+ 11
- 0
dataset-util/src/main/java/org/dubhe/datasetutil/dao/DatasetVersionMapper.java View File

@@ -0,0 +1,11 @@
package org.dubhe.datasetutil.dao;

import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import org.dubhe.datasetutil.domain.entity.DatasetVersion;

/**
* @description Dataset version mapper
* @date 2021-03-23
*/
public interface DatasetVersionMapper extends BaseMapper<DatasetVersion> {
}

+ 40
- 0
dataset-util/src/main/java/org/dubhe/datasetutil/domain/dto/AnnotationDTO.java View File

@@ -0,0 +1,40 @@
/**
* Copyright 2020 Zhejiang Lab. All Rights Reserved.
* <p>
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* =============================================================
*/
package org.dubhe.datasetutil.domain.dto;

import lombok.Data;

import java.io.Serializable;

/**
* @description Annotation DTO
* @date 2021-04-14
*/
@Data
public class AnnotationDTO implements Serializable {

/**
* Label ID
*/
private Long categoryId;

/**
* Predicted score
*/
private Double score;
}


+ 95
- 0
dataset-util/src/main/java/org/dubhe/datasetutil/domain/dto/EsTransportDTO.java View File

@@ -0,0 +1,95 @@
/**
* Copyright 2020 Tianshu AI Platform. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* =============================================================
*/

package org.dubhe.datasetutil.domain.dto;

import lombok.Data;

import java.sql.Timestamp;

/**
* @description ES data synchronization DTO
* @date 2020-03-24
*/
@Data
public class EsTransportDTO {

/**
* Status
*/
private Integer annotationStatus;

/**
* File name
*/
private String fileName;

/**
* File URL
*/
private String url;

/**
* Creator ID
*/
private Long createUserId;

/**
* Creation time
*/
private Timestamp createTime;

/**
* Updater ID
*/
private Long updateUserId;

/**
* Update time
*/
private Timestamp updateTime;

/**
* File type
*/
private Integer fileType;

/**
* Enhancement type
*/
private Integer enhanceType;

/**
* User ID
*/
private Long originUserId;

/**
* Prediction value
*/
private Double prediction;

/**
* File ID
*/
private Long id;

/**
* Label ID
*/
private Long labelId;
}
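
The import handlers later push exactly this field set to Elasticsearch by filling a HashMap by hand; since fastjson is already a dependency of this commit, serializing the DTO directly would be an equivalent way to build the document source (illustrative sketch, values assumed):

EsTransportDTO dto = new EsTransportDTO();
dto.setFileName("sample.txt");
dto.setAnnotationStatus(101);
// fastjson renders the DTO as the JSON source of an index request
String source = JSON.toJSONString(dto);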

+ 46
- 0
dataset-util/src/main/java/org/dubhe/datasetutil/domain/dto/FileAnnotationDTO.java View File

@@ -0,0 +1,46 @@
/**
* Copyright 2020 Zhejiang Lab. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* =============================================================
*/
package org.dubhe.datasetutil.domain.dto;

import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;

import java.io.Serializable;

/**
* @description File annotation DTO
* @date 2020-01-07
*/
@Data
@NoArgsConstructor
@AllArgsConstructor
@Builder
public class FileAnnotationDTO implements Serializable {

/**
* Label ID
*/
private Long categoryId;

/**
* Score
*/
private String score;

}

+ 28
- 16
dataset-util/src/main/java/org/dubhe/datasetutil/domain/dto/IdAlloc.java View File

@@ -17,7 +17,10 @@
package org.dubhe.datasetutil.domain.dto;

import lombok.Data;
import org.dubhe.datasetutil.common.base.MagicNumConstant;
import org.dubhe.datasetutil.domain.entity.DataSequence;

import java.util.LinkedList;
import java.util.Queue;

/**
* @description ID allocation strategy entity
@@ -26,25 +29,34 @@ import org.dubhe.datasetutil.common.base.MagicNumConstant;
@Data
public class IdAlloc {

/**
* Start position
*/
private long startNumber;
private Queue<Long> ids;

/**
* End position
*/
private long endNumber;
private Long unUsed;

public IdAlloc() {
ids = new LinkedList<>();
unUsed = 0L;
}

/**
* Available count
* Replenish IDs from a sequence block
*
* @param dataSequence sequence block supplying [start, start + step)
*/
private long usedNumber;
public void add(DataSequence dataSequence) {
for (Long i = dataSequence.getStart(); i < dataSequence.getStart() + dataSequence.getStep(); i++) {
ids.add(i);
unUsed++;
}
}

public IdAlloc() {
this.startNumber = MagicNumConstant.ZERO;
this.endNumber = MagicNumConstant.ZERO;
this.usedNumber = MagicNumConstant.ZERO;
public Queue<Long> poll(int number) {
Queue<Long> result = new LinkedList<>();
for (int i = 0; i < number; i++) {
result.add(ids.poll());
unUsed--;
}
return result;
}

}
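
A short usage sketch of the reworked IdAlloc (values illustrative; the usual Lombok accessors on DataSequence are assumed):

IdAlloc idAlloc = new IdAlloc();
DataSequence sequence = new DataSequence();
sequence.setStart(1000L);
sequence.setStep(500L);
idAlloc.add(sequence); // queues 1000..1499, unUsed == 500
Queue<Long> ids = idAlloc.poll(3); // hands out 1000, 1001, 1002; unUsed == 497

Note that poll does not refill the queue on its own: callers are expected to check unUsed and add a fresh DataSequence block first, otherwise poll returns nulls.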

+ 70
- 9
dataset-util/src/main/java/org/dubhe/datasetutil/domain/entity/DataFile.java View File

@@ -16,10 +16,16 @@
*/
package org.dubhe.datasetutil.domain.entity;

import com.baomidou.mybatisplus.annotation.IdType;
import com.baomidou.mybatisplus.annotation.TableId;
import com.baomidou.mybatisplus.annotation.TableName;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import org.apache.commons.lang3.StringUtils;
import org.dubhe.datasetutil.common.base.BaseEntity;
import org.dubhe.datasetutil.common.base.MagicNumConstant;
import org.dubhe.datasetutil.common.constant.BusinessConstant;
import org.dubhe.datasetutil.common.constant.FileStateCodeConstant;

import java.awt.image.BufferedImage;
import java.io.Serializable;
@@ -28,13 +34,16 @@ import java.io.Serializable;
* @description File entity
* @date 2020-09-17
*/
@AllArgsConstructor
@EqualsAndHashCode(callSuper = false)
@Builder
@TableName("data_file")
@Data
public class DataFile extends BaseEntity implements Serializable {

/**
* id
*/
@TableId(type = IdType.AUTO)
private Long id;

/**
@@ -92,7 +101,8 @@ public class DataFile extends BaseEntity implements Serializable {
*/
private Long originUserId;

public DataFile() {}
public DataFile() {
}

/**
* Insert into the file table
@@ -101,18 +111,69 @@ public class DataFile extends BaseEntity implements Serializable {
* @param datasetId dataset id
* @param url file path
* @param createUserId creator id
* @param read image used to derive width and height
* @return DataFile file object
*/
public DataFile(String name, Long datasetId, String url, Long createUserId, BufferedImage read) {
this.name = name.substring(0, name.lastIndexOf("."));
*/
public DataFile(String name, Long datasetId, String url, Long createUserId, int status) {
this.name = name;
this.datasetId = datasetId;
this.url = url;
this.status = 101;
this.status = status;
this.setDeleted(false);
this.originUserId = createUserId;
}


/**
* Insert into the file table
*
* @param name file name
* @param datasetId dataset id
* @param url file path
* @param createUserId creator id
* @param status status
* @param fileType file type
* @param pid parent file ID
* @param originUserId resource owner ID
* @return DataFile file object
*/
public DataFile(String name, Long datasetId, String url, Long createUserId, int status, int fileType, long pid, long originUserId) {
this.name = name;
this.datasetId = datasetId;
this.url = url;
this.status = status;
this.setDeleted(false);
this.setCreateUserId(createUserId);
this.fileType = fileType;
this.pid = pid;
this.originUserId = originUserId;
}

/**
* Insert into the file table
*
* @param name file name
* @param datasetId dataset id
* @param url file path
* @param createUserId creator id
* @param read image used to derive width and height
* @param status status
* @param fileType file type
* @param pid parent file ID
* @param originUserId resource owner ID
* @return DataFile file object
*/
public DataFile(String name, Long datasetId, String url, Long createUserId, BufferedImage read, int status, int fileType, long pid, long originUserId) {
this.name = name;
this.datasetId = datasetId;
this.url = url;
this.status = status;
this.setDeleted(false);
this.setCreateUserId(createUserId);
this.width = read.getWidth();
this.height = read.getHeight();
this.fileType = fileType;
this.pid = pid;
this.originUserId = originUserId;
}

}

+ 88
- 0
dataset-util/src/main/java/org/dubhe/datasetutil/domain/entity/DataFileAnnotation.java View File

@@ -0,0 +1,88 @@
/**
* Copyright 2020 Zhejiang Lab. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* =============================================================
*/
package org.dubhe.datasetutil.domain.entity;

import com.baomidou.mybatisplus.annotation.IdType;
import com.baomidou.mybatisplus.annotation.TableId;
import com.baomidou.mybatisplus.annotation.TableName;
import lombok.*;
import org.dubhe.datasetutil.common.base.BaseEntity;

import java.io.Serializable;

/**
* @description NLP intermediate table entity
* @date 2020-01-07
*/
@Data
@NoArgsConstructor
@AllArgsConstructor
@EqualsAndHashCode(callSuper = false)
@Builder
@TableName("data_file_annotation")
public class DataFileAnnotation extends BaseEntity implements Serializable {

/**
* id
*/
private Long id;

/**
* Dataset ID
*/
private Long datasetId;

/**
* Label ID
*/
private Long labelId;

/**
* Dataset version file ID
*/
private Long versionFileId;

/**
* Prediction value (stored value = actual value * 100)
*/
private Double prediction;

/**
* File name
*/
private String fileName;

/**
* Insert into the NLP intermediate table
*
* @param datasetId dataset id
* @param labelId label id
* @param versionFileId dataset version file id
* @param prediction prediction value
* @param createUserId creator id
* @param fileName file name
* @return DataFileAnnotation NLP intermediate record
*/
public DataFileAnnotation(Long datasetId, Long labelId, Long versionFileId, Double prediction, Long createUserId, String fileName) {
this.datasetId = datasetId;
this.labelId = labelId;
this.versionFileId = versionFileId;
this.prediction = prediction;
this.setCreateUserId(createUserId);
this.fileName = fileName;
}
}

+ 5
- 0
dataset-util/src/main/java/org/dubhe/datasetutil/domain/entity/DataLabelGroup.java View File

@@ -67,4 +67,9 @@ public class DataLabelGroup extends BaseEntity implements Serializable {
@TableField(value = "origin_user_id")
private Long originUserId;

/**
* Label group data type (0: visual, 1: text)
*/
private Integer labelGroupType;

}

dataset-util/src/main/java/org/dubhe/datasetutil/domain/dto/DataVersionFile.java → dataset-util/src/main/java/org/dubhe/datasetutil/domain/entity/DataVersionFile.java View File

@@ -14,11 +14,10 @@
* limitations under the License.
* =============================================================
*/
package org.dubhe.datasetutil.domain.dto;
package org.dubhe.datasetutil.domain.entity;

import com.baomidou.mybatisplus.annotation.IdType;
import com.baomidou.mybatisplus.annotation.TableId;
import lombok.Data;
import com.baomidou.mybatisplus.annotation.TableName;
import lombok.*;
import org.dubhe.datasetutil.common.base.BaseEntity;

import java.io.Serializable;
@@ -28,13 +27,16 @@ import java.io.Serializable;
* @description Dataset version-file relation entity
* @date 2020-9-17
*/
@AllArgsConstructor
@EqualsAndHashCode(callSuper = false)
@Builder
@TableName("data_dataset_version_file")
@Data
public class DataVersionFile extends BaseEntity implements Serializable {

/**
* id
*/
@TableId(type = IdType.AUTO)
private Long id;

/**
@@ -72,6 +74,12 @@ public class DataVersionFile extends BaseEntity implements Serializable {
*/
private Integer changed;

/**
* File name
*/
private String fileName;


public DataVersionFile() {
}

@@ -84,10 +92,11 @@ public class DataVersionFile extends BaseEntity implements Serializable {
* @param status status
* @param fileName file name
* @return DataVersionFile the dataset version file record
*/
public DataVersionFile(Long datasetId, Long fileId,Integer annotationStatus,Integer status) {
public DataVersionFile(Long datasetId, Long fileId,Integer annotationStatus,Integer status,String fileName) {
this.datasetId = datasetId;
this.fileId = fileId;
this.annotationStatus = annotationStatus;
this.status = status;
this.fileName = fileName;
}
}

+ 4
- 0
dataset-util/src/main/java/org/dubhe/datasetutil/domain/entity/Dataset.java View File

@@ -140,6 +140,10 @@ public class Dataset extends BaseEntity implements Serializable {
*/
private Long originUserId;


/**
* Label group ID
*/
private Long labelGroupId;


public Dataset() {}

}

+ 53
- 0
dataset-util/src/main/java/org/dubhe/datasetutil/domain/entity/DatasetVersion.java View File

@@ -0,0 +1,53 @@
package org.dubhe.datasetutil.domain.entity;

import com.baomidou.mybatisplus.annotation.*;
import lombok.Data;
import org.dubhe.datasetutil.common.base.BaseEntity;

import java.sql.Timestamp;

/**
* @description Dataset version entity
* @date 2021-03-23
*/
@Data
@TableName("data_dataset_version")
public class DatasetVersion extends BaseEntity {

@TableId(type = IdType.AUTO)
private Long id;

private Long datasetId;

private Long teamId;

private String versionName;

private String versionNote;

private String versionSource;

private String versionUrl;

private Integer dataConversion;

@TableField(value = "deleted", fill = FieldFill.INSERT)
private Boolean deleted = false;

private Long originUserId;

public DatasetVersion() {}

public DatasetVersion(Long datasetId, String versionName, String versionNote) {
this.datasetId = datasetId;
this.versionName = versionName;
this.setCreateUserId(0L);
this.setCreateTime(new Timestamp(System.currentTimeMillis()));
this.versionUrl = "dataset/" + datasetId + "/versionFile/" + versionName;
this.dataConversion = 2;
this.originUserId = 0L;
this.versionNote = versionNote;
}

}
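
The three-argument constructor hard-codes the storage convention for version files, so a caller only supplies the identity of the version. For example:

DatasetVersion version = new DatasetVersion(42L, "V0001", "import");
// version.getVersionUrl() -> "dataset/42/versionFile/V0001"
// version.getDataConversion() -> 2; creator and origin user are fixed to 0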

+ 1
- 1
dataset-util/src/main/java/org/dubhe/datasetutil/domain/entity/LogInfo.java View File

@@ -17,8 +17,8 @@

package org.dubhe.datasetutil.domain.entity;

import cn.hutool.core.date.DateUtil;
import com.alibaba.fastjson.annotation.JSONField;
import com.xiaoleilu.hutool.date.DateUtil;
import lombok.Data;
import lombok.experimental.Accessors;



+ 171
- 0
dataset-util/src/main/java/org/dubhe/datasetutil/handle/CustomDatasetImportHandle.java View File

@@ -0,0 +1,171 @@
package org.dubhe.datasetutil.handle;

import lombok.extern.slf4j.Slf4j;
import org.dubhe.datasetutil.common.base.MagicNumConstant;
import org.dubhe.datasetutil.common.constant.AnnotateTypeEnum;
import org.dubhe.datasetutil.common.constant.BusinessConstant;
import org.dubhe.datasetutil.common.constant.DataStateCodeConstant;
import org.dubhe.datasetutil.common.constant.DatatypeEnum;
import org.dubhe.datasetutil.common.exception.BusinessException;
import org.dubhe.datasetutil.common.util.*;
import org.dubhe.datasetutil.domain.entity.Dataset;
import org.dubhe.datasetutil.service.DatasetService;
import org.dubhe.datasetutil.service.DatasetVersionService;
import org.springframework.aop.framework.AopContext;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.EnableAspectJAutoProxy;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Transactional;

import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Scanner;
import java.util.concurrent.Callable;
import java.util.concurrent.atomic.AtomicInteger;

/**
* @description Custom dataset import
* @date 2021-03-23
*/
@Slf4j
@Component
@EnableAspectJAutoProxy(exposeProxy = true)
public class CustomDatasetImportHandle {

@Autowired
DatasetService datasetService;
@Autowired
DatasetVersionService datasetVersionService;
@Autowired
MinioUtil minioUtil;

/**
* Custom dataset import:
* 1. mark the dataset as annotation-complete
* 2. create the version record
* 3. upload files into the version directory
*
* @param args arguments: (1) dataset ID (2) file path
*/
public void execute(Object[] args) throws Exception {
valid(args);
((CustomDatasetImportHandle) AopContext.currentProxy()).sqlExecute(args);
fileExecute(args);
log.warn("");
PrintUtils.printLine(" Success: 执行成功 ", PrintUtils.GREEN);
log.warn("");
System.out.println("# 是否结束? Y / N #");
Scanner scannerExit = new Scanner(System.in);
if (BusinessConstant.Y.toLowerCase().equals(scannerExit.nextLine().toLowerCase())) {
System.exit(MagicNumConstant.ZERO);
}
}

/**
* Database work:
* 1. mark the dataset as annotation-complete
* 2. insert the dataset version record
* Existing records are not duplicated.
*
* @param args arguments: (1) dataset ID (2) file path
*/
@Transactional(rollbackFor = Exception.class)
public void sqlExecute(Object[] args) {
Dataset dataset = datasetService.findDatasetById((long)args[0]);
if (Objects.isNull(dataset)) {
throw new BusinessException("数据集不存在");
}
//mark the dataset as annotation-complete
if (!DataStateCodeConstant.ANNOTATION_COMPLETE_STATE.equals(dataset.getStatus())) {
dataset.setStatus(DataStateCodeConstant.ANNOTATION_COMPLETE_STATE);
dataset.setCurrentVersionName(BusinessConstant.DEFAULT_VERSION);
datasetService.updateDataset(dataset);
}
//create version info; only V0001 is ever generated
if (Objects.isNull(datasetVersionService.getByDatasetIdAndVersionNum(dataset.getId(), BusinessConstant.DEFAULT_VERSION))) {
datasetVersionService.insertVersion(dataset.getId(), BusinessConstant.DEFAULT_VERSION, "自定义");
}

}

/**
* Traverse the user directory and upload every file in it
*
* @param args arguments: (1) dataset ID (2) file path
*/
public void fileExecute(Object[] args) throws Exception {
List<String> filePaths = FileUtil.traverseFolder((String) args[1]);
List<Callable<Integer>> partitions = new ArrayList<>();
int oneSize = ThreadUtils.createThread(filePaths.size());
List<String> need = new ArrayList<>();
int index = 0;
//initialize the progress bar
ProcessBarUtil.initProcess("自定义导入", (long) filePaths.size());
for (String filePath : filePaths) {
need.add(filePath);
if (need.size() == oneSize || index == filePaths.size() - 1) {
List<String> now = new ArrayList<>(need);
need.clear();
partitions.add(() -> run(now, args));
}
index++;
}
ThreadUtils.runMultiThread(partitions);
}

public Integer run(List<String> filePaths, Object[] args) {
log.info("#-------------开始处理,时间[" + DateUtil.getNowStr() + "]-------------#");
log.info("#-------------文件数量[" + filePaths.size() + "]------------------------");
Integer success = 0;
for (String str : filePaths) {
try {
String objectName = "dataset/" + (long) args[0] + "/versionFile/V0001" + str.replace((String)args[1], "").replaceAll("\\\\", "/");
minioUtil.upLoadFileByInputStream(objectName, str);
ProcessBarUtil.processBar01(1L);
success++;
} catch (Exception e) {
log.error(str + " upload error {}", e);
}
}
return success;
}

/**
* Data validation:
* 1. argument check
* 2. the referenced dataset exists
* 3. the user-specified directory contains files
* 4. the dataset's annotation type and data type are both custom import
*
* @param args arguments: (1) dataset ID (2) file path
*/
public void valid(Object[] args) {
if (args == null || args.length != 2) {
throw new BusinessException("参数数量不匹配");
}
Long datasetId = null;
try {
datasetId = (long) args[0];
} catch (Exception e) {
log.error("数据集ID输入不正确, {}", e);
throw new BusinessException("数据集ID输入不正确");
}
Dataset dataset = datasetService.findDatasetById(datasetId);
if (Objects.isNull(dataset)) {
throw new BusinessException("输入数据集不存在");
}
if (!AnnotateTypeEnum.AUTO_IMPORT.getValue().equals(dataset.getAnnotateType()) || !DatatypeEnum.AUTO_IMPORT.getValue().equals(dataset.getDataType())) {
throw new BusinessException("请确认该数据及的标注类型以及数据类型都是自定义导入");
}
String filePath = (String) args[1];
if (!cn.hutool.core.io.FileUtil.exist(filePath) || !cn.hutool.core.io.FileUtil.isDirectory(filePath)) {
throw new BusinessException("请确保您输入的数据集路径是否正确");
}
}

}
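
One detail worth calling out in execute above: sqlExecute is invoked through AopContext.currentProxy() rather than this, because a plain self-invocation would bypass the Spring proxy and silently skip @Transactional; exposeProxy = true on the class is what makes the proxy available. In miniature:

// Inside a bean annotated @EnableAspectJAutoProxy(exposeProxy = true):
this.sqlExecute(args); // direct call: no proxy, @Transactional is ignored
((CustomDatasetImportHandle) AopContext.currentProxy()).sqlExecute(args); // proxied call: transaction applies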

+ 253
- 34
dataset-util/src/main/java/org/dubhe/datasetutil/handle/DatasetImageUploadHandle.java View File

@@ -16,29 +16,39 @@
*/
package org.dubhe.datasetutil.handle;

import com.xiaoleilu.hutool.io.FileUtil;
import cn.hutool.core.io.FileUtil;
import com.google.common.collect.Lists;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.dubhe.datasetutil.common.base.MagicNumConstant;
import org.dubhe.datasetutil.common.config.ImageConfig;
import org.dubhe.datasetutil.common.constant.BusinessConstant;
import org.dubhe.datasetutil.common.util.DateUtil;
import org.dubhe.datasetutil.common.util.GeneratorKeyUtil;
import org.dubhe.datasetutil.common.util.MinioUtil;
import org.dubhe.datasetutil.common.util.ThreadUtils;
import org.dubhe.datasetutil.common.enums.LogEnum;
import org.dubhe.datasetutil.common.util.*;
import org.dubhe.datasetutil.common.constant.DataStateCodeConstant;
import org.dubhe.datasetutil.common.constant.FileStateCodeConstant;
import org.dubhe.datasetutil.common.enums.DatatypeEnum;
import org.dubhe.datasetutil.common.exception.ImportDatasetException;
import org.dubhe.datasetutil.domain.entity.DataFile;
import org.dubhe.datasetutil.domain.dto.DataVersionFile;
import org.dubhe.datasetutil.domain.entity.DataVersionFile;
import org.dubhe.datasetutil.domain.entity.Dataset;
import org.dubhe.datasetutil.service.DataFileService;
import org.dubhe.datasetutil.service.DataVersionFileService;
import org.dubhe.datasetutil.service.DatasetService;
import org.elasticsearch.action.bulk.BulkProcessor;
import org.elasticsearch.action.index.IndexRequest;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import org.springframework.util.CollectionUtils;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.*;
import java.util.concurrent.Callable;
import java.util.concurrent.atomic.AtomicInteger;

/**
* @description Image upload handler
@@ -48,6 +58,12 @@ import java.util.concurrent.atomic.AtomicInteger;
@Component
public class DatasetImageUploadHandle {

/**
* Elasticsearch index name
*/
@Value("${es.index}")
private String esIndex;

@Autowired
private MinioUtil minioUtil;

@@ -63,6 +79,34 @@ public class DatasetImageUploadHandle {
@Autowired
private GeneratorKeyUtil generatorKeyUtil;

@Autowired
private ImageConfig imageConfig;

@Autowired
private BulkProcessor bulkProcessor;

/**
* Defective files collected during import (shared across worker threads, so synchronized)
*/
public final List<String> defectsFile = Collections.synchronizedList(new ArrayList<>());

/**
* Upload images
*
* @param scanner console input
*/
public void importPicture(Scanner scanner) throws Exception {
Dataset dataset = verificationDatasetId(scanner);
String imagePath = verificationFilePath(scanner,dataset);
try{
execute(imagePath, dataset.getId());
} catch (Exception e) {
log.error("");
PrintUtils.printLine(" Error:" + e.getMessage(), PrintUtils.RED);
log.error("");
}
}

/**
* Start worker threads
*
@@ -70,27 +114,89 @@ public class DatasetImageUploadHandle {
* @param datasetId dataset ID
*/
public void execute(String imagePath, Long datasetId) throws Exception {
log.info("#-------------开始处理,时间[" + DateUtil.getNowStr() + "]-------------#");
List<String> fileNames = FileUtil.listFileNames(imagePath);
log.info("#-------------文件数量[" + fileNames.size() + "]------------------------");
log.warn("需要处理文件:【" + fileNames.size() + "】个文件");
String fileBaseDir = BusinessConstant.MINIO_ROOT_PATH + BusinessConstant.FILE_SEPARATOR + datasetId
+ BusinessConstant.FILE_SEPARATOR + BusinessConstant.IMAGE_ORIGIN + BusinessConstant.FILE_SEPARATOR;
List<Callable<Integer>> partitions = new ArrayList<>();
int oneSize = ThreadUtils.createThread(fileNames.size());
int batchNumber = MagicNumConstant.ZERO;
//initialize the progress bar
ProcessBarUtil.initProcess("图片导入", (long) fileNames.size());
if (fileNames.size() > MagicNumConstant.TEN_THOUSAND) {
log.warn("........系统处理中.........");
List<List<String>> partitionList = Lists.partition(fileNames, MagicNumConstant.FIVE_THOUSAND);
for (List<String> imageFileNameList1 : partitionList) {
batchNumber++;
dealFileList(imageFileNameList1, oneSize, fileBaseDir, imagePath, datasetId, batchNumber);
}
} else {
log.warn("........系统处理中.........");
batchNumber++;
dealFileList(fileNames, oneSize, fileBaseDir, imagePath, datasetId, batchNumber);
}
log.warn("");
PrintUtils.printLine(" Success: 执行成功 ", PrintUtils.GREEN);
log.warn("");
System.out.println("# 是否结束? Y / N #");
Scanner scannerExit = new Scanner(System.in);
if (BusinessConstant.Y.toLowerCase().equals(scannerExit.nextLine().toLowerCase())) {
System.exit(MagicNumConstant.ZERO);
}
}


/**
* @param fileNames image file names
* @param oneSize batch size handled per thread
* @param fileBaseDir file root directory
* @param imagePath image directory path
* @param datasetId dataset ID
* @param batchNumber batch number
* @throws Exception on upload or persistence failure
*/
public void dealFileList(List<String> fileNames, int oneSize, String fileBaseDir, String imagePath, Long datasetId, int batchNumber) throws Exception {
int dealSize = MagicNumConstant.ZERO;
Dataset dataset = datasetService.queryDatasetById(datasetId);
List<String> need = new ArrayList<>();
AtomicInteger atomicInteger = new AtomicInteger(0);
for (String fileName : fileNames) {
need.add(fileName);
if (need.size() == oneSize || atomicInteger.intValue() == fileNames.size() - 1) {
List<String> now = new ArrayList<>(need);
need.clear();
partitions.add(() -> run(datasetId, now, fileBaseDir, imagePath));
List<Callable<Integer>> partitions = new ArrayList<>();
//partition the files into per-thread batches
for (int i = 0; i < fileNames.size(); i++) {
String suffixFileName = fileNames.get(i).substring(fileNames.get(i).lastIndexOf(BusinessConstant.SPOT));
if(dataset.getDataType().equals(DatatypeEnum.TXT.getValue())){
if (imageConfig.getTxtFormat().contains(suffixFileName.toLowerCase())) {
need.add(fileNames.get(i));
if (need.size() == oneSize || i == fileNames.size() - MagicNumConstant.ONE) {
List<String> now = new ArrayList<>(need);
dealSize += now.size();
need.clear();
partitions.add(() -> run(datasetId, now, fileBaseDir, imagePath));
}
}
} else {
if (imageConfig.getImageFormat().contains(suffixFileName.toLowerCase())) {
need.add(fileNames.get(i));
if (need.size() == oneSize || i == fileNames.size() - MagicNumConstant.ONE) {
List<String> now = new ArrayList<>(need);
dealSize += now.size();
need.clear();
partitions.add(() -> run(datasetId, now, fileBaseDir, imagePath));
}
}
}
atomicInteger.getAndIncrement();
}
ThreadUtils.runMultiThread(partitions);
if (!CollectionUtils.isEmpty(defectsFile)) {
log.error("");
log.warn("#-------------系统共排查出缺陷文件【" + defectsFile.size() + "】个-------------#");
log.error("");
log.warn("缺陷文件列表 " + defectsFile.toString() + "");
log.error("");
defectsFile.clear();
}


}


/**
* Upload files and insert database records
*
@@ -98,43 +204,156 @@ public class DatasetImageUploadHandle {
* @param fileNames file names
* @param fileBaseDir file root directory
* @param imagePath file directory path
* @return java.lang.Integer success count
* @return Integer success count
*/
public Integer run(Long datasetId, List<String> fileNames, String fileBaseDir, String imagePath) {
Integer success = 0;
Integer success = MagicNumConstant.ZERO;
Dataset dataset = datasetService.findCreateUserIdById(datasetId);
List<DataFile> dataFiles = new ArrayList<>();
List<DataVersionFile> dataVersionFiles = new ArrayList<>();
for (int i = 0; i < fileNames.size(); i++) {
try {
minioUtil.upLoadFile(fileBaseDir + fileNames.get(i), FileUtil.getInputStream(imagePath + BusinessConstant.FILE_SEPARATOR + fileNames.get(i)));
BufferedImage read = ImageIO.read(new File(imagePath + BusinessConstant.FILE_SEPARATOR + fileNames.get(i)));
success++;
dataFiles.add(new DataFile(fileNames.get(i), datasetId, minioUtil.getUrl(fileBaseDir + fileNames.get(i)), dataset.getCreateUserId(), read));
String fileName = StringUtils.substring(fileNames.get(i), MagicNumConstant.ZERO, fileNames.get(i).lastIndexOf(BusinessConstant.SPOT)) + System.nanoTime();
String suffixFileName = fileNames.get(i).substring(fileNames.get(i).lastIndexOf(BusinessConstant.SPOT));
minioUtil.upLoadFile(imagePath + BusinessConstant.FILE_SEPARATOR + fileNames.get(i), fileBaseDir + fileName + suffixFileName);
DataFile dataFile = new DataFile(fileName, datasetId, minioUtil.getUrl(fileBaseDir + fileName + suffixFileName),
dataset.getCreateUserId(), FileStateCodeConstant.NOT_ANNOTATION_FILE_STATE);

if (dataFiles.size() % 500 == 0 || i == fileNames.size() - 1) {
long startDataFileIndex = generatorKeyUtil.getSequenceByBusinessCode(BusinessConstant.DATA_FILE, dataFiles.size());
if (dataset.getDataType().compareTo(DatatypeEnum.IMAGE.getValue()) == 0) {
BufferedImage read;
try {
read = ImageIO.read(new File(imagePath + BusinessConstant.FILE_SEPARATOR + fileNames.get(i)));
} catch (ArrayIndexOutOfBoundsException e) {
defectsFile.add(fileNames.get(i));
throw new ImportDatasetException("该图片文件内部错误 " + fileNames.get(i) + ",请重新审核后再去上传此图片,当前已经跳过此图片");
}
dataFile.setWidth(read.getWidth());
dataFile.setHeight(read.getHeight());
}
success++;
dataFiles.add(dataFile);
if (dataFiles.size() % MagicNumConstant.FIVE_HUNDRED == MagicNumConstant.ZERO || i == fileNames.size() - MagicNumConstant.ONE) {
Queue<Long> dataFileIds = generatorKeyUtil.getSequenceByBusinessCode(BusinessConstant.DATA_FILE, dataFiles.size());
for (DataFile dataFileEntity : dataFiles) {
dataFileEntity.setId(startDataFileIndex++);
dataFileEntity.setId(dataFileIds.poll());
}

dataFileService.saveBatchDataFile(dataFiles);
for (DataFile file : dataFiles) {
dataVersionFiles.add(new DataVersionFile(datasetId, file.getId(), 101, 0));
dataVersionFiles.add(new DataVersionFile(datasetId, file.getId(), DataStateCodeConstant.NOT_ANNOTATION_STATE, MagicNumConstant.ZERO, file.getName()));
if(dataset.getDataType().equals(DatatypeEnum.TXT.getValue())){
try{
String bucketName = StringUtils.substringBefore(file.getUrl(),"/");
String fullFilePath = StringUtils.substringAfter(file.getUrl(), "/");
String content = minioUtil.readString(bucketName, fullFilePath);
Map<String, String> jsonMap = new HashMap<>();
jsonMap.put("content",content);
jsonMap.put("name", file.getName());
jsonMap.put("status",FileStateCodeConstant.NOT_ANNOTATION_FILE_STATE.toString());
jsonMap.put("datasetId",dataset.getId().toString());
jsonMap.put("createUserId",file.getCreateUserId()==null?null:file.getCreateUserId().toString());
jsonMap.put("createTime",file.getCreateTime()==null?null:file.getCreateTime().toString());
jsonMap.put("updateUserId",file.getUpdateUserId()==null?null:file.getUpdateUserId().toString());
jsonMap.put("updateTime",file.getUpdateTime()==null?null:file.getUpdateTime().toString());
jsonMap.put("fileType",file.getFileType()==null?null:file.getFileType().toString());
jsonMap.put("enhanceType",file.getEnhanceType()==null?null:file.getEnhanceType().toString());
jsonMap.put("originUserId",file.getOriginUserId().toString());
jsonMap.put("versionName", StringUtils.isEmpty(dataset.getCurrentVersionName())?"V0000" : dataset.getCurrentVersionName());
bulkProcessor.add(new IndexRequest(esIndex, "_doc", file.getId().toString()).source(jsonMap));
} catch (Exception e){
LogUtil.error(LogEnum.BIZ_DATASET, "上传es失败: {} ", e);
}
}
}
if(dataset.getDataType().equals(DatatypeEnum.TXT.getValue())){
bulkProcessor.flush();
}
long startDataFileVersionIndex = generatorKeyUtil.getSequenceByBusinessCode(BusinessConstant.DATA_VERSION_FILE, dataVersionFiles.size());
Queue<Long> dataFileVersionIds = generatorKeyUtil.getSequenceByBusinessCode(BusinessConstant.DATA_VERSION_FILE, dataVersionFiles.size());
for (DataVersionFile dataVersionFile : dataVersionFiles) {
dataVersionFile.setId(startDataFileVersionIndex++);
dataVersionFile.setId(dataFileVersionIds.poll());
}
dataVersionFileService.saveBatchDataFileVersion(dataVersionFiles);
ProcessBarUtil.processBar01((long) dataVersionFiles.size());
dataVersionFiles.clear();
dataFiles.clear();
}
} catch (Exception e) {
log.error("{}", e);
log.error(fileNames.get(i) + "{}", e);
log.error("运行异常: {}", e.getMessage());
}
}
return success;
}


/**
* Validate the dataset ID
*
* @param scanner console input
* @return Dataset the dataset
*/
public Dataset verificationDatasetId(Scanner scanner) {
boolean flag = false;
Dataset dataset = new Dataset();
while (!flag) {
System.out.println(" ");
System.out.println("# 请输入数据集ID #");
String datasetIdStr = scanner.nextLine();
long datasetId;
try {
datasetId = Long.parseLong(datasetIdStr.trim());
} catch (Exception e) {
log.error("");
PrintUtils.printLine(" Error: 数据集ID非法,请重新输入", PrintUtils.RED);
log.error("");
continue;
}
dataset = datasetService.findDatasetByIdNormal(datasetId);
if (dataset == null) {
log.error("");
PrintUtils.printLine(" Error: 数据集ID不存在,请重新输入", PrintUtils.RED);
log.error("");
continue;
} else {
flag = true;
}
}
return dataset;
}

/**
* Validate the file path and format
*
* @param scanner console input
* @param dataset dataset
* @return String the validated path
*/
public String verificationFilePath(Scanner scanner,Dataset dataset) {
boolean flag = false;
String filePath = "";
while (!flag) {
System.out.println(" ");
System.out.println("# 请输入待上传本地文件的绝对路径 #");
filePath = scanner.nextLine();
File file = new File(filePath.trim());
if (!file.exists()) {
log.error("");
PrintUtils.printLine(" 【" + filePath + "】文件路径不存在,请重新输入", PrintUtils.RED);
log.error("");
continue;
}
File fileNames = new File(filePath);
File[] imageFiles = fileNames.listFiles();
if (imageFiles == null || imageFiles.length == MagicNumConstant.ZERO) {
log.error("");
PrintUtils.printLine(" 【" + filePath + "】目录下不存在文件 ", PrintUtils.RED);
log.error("");
continue;
} else {
flag = true;
}
}

return filePath;
}

}
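
Both upload paths above index text files through the injected BulkProcessor, which accumulates IndexRequests and flushes them in batches; the bean itself is built in EsConfiguration, listed in this commit but not shown here. A hedged sketch of how such a bean is typically assembled with the high-level REST client (tuning values and names are assumptions, not the project's actual configuration):

@Bean
public BulkProcessor bulkProcessor(RestHighLevelClient client) {
return BulkProcessor.builder(
// Forward each accumulated batch to Elasticsearch asynchronously.
(request, listener) -> client.bulkAsync(request, RequestOptions.DEFAULT, listener),
new BulkProcessor.Listener() {
@Override public void beforeBulk(long id, BulkRequest request) { }
@Override public void afterBulk(long id, BulkRequest request, BulkResponse response) { }
@Override public void afterBulk(long id, BulkRequest request, Throwable failure) { }
})
.setBulkActions(500) // flush every 500 queued documents
.setFlushInterval(TimeValue.timeValueSeconds(5)) // or every 5 seconds
.build();
}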

+ 317
- 102
dataset-util/src/main/java/org/dubhe/datasetutil/handle/DatasetImportHandle.java View File

@@ -1,12 +1,12 @@
/**
* Copyright 2020 Zhejiang Lab. All Rights Reserved.
*
* <p>
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
*
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -16,35 +16,45 @@
*/
package org.dubhe.datasetutil.handle;

import cn.hutool.core.io.FileUtil;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.alibaba.fastjson.TypeReference;
import com.xiaoleilu.hutool.io.FileUtil;
import com.google.common.collect.Lists;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.dubhe.datasetutil.common.base.MagicNumConstant;
import org.dubhe.datasetutil.common.config.ImageConfig;
import org.dubhe.datasetutil.common.config.MinioConfig;
import org.dubhe.datasetutil.common.constant.AnnotateTypeEnum;
import org.dubhe.datasetutil.common.constant.BusinessConstant;
import org.dubhe.datasetutil.common.constant.FileStateCodeConstant;
import org.dubhe.datasetutil.common.enums.DatatypeEnum;
import org.dubhe.datasetutil.common.enums.LogEnum;
import org.dubhe.datasetutil.common.exception.ImportDatasetException;
import org.dubhe.datasetutil.common.util.*;
import org.dubhe.datasetutil.domain.dto.AnnotationDTO;
import org.dubhe.datasetutil.domain.entity.*;
import org.dubhe.datasetutil.domain.dto.DataVersionFile;
import org.dubhe.datasetutil.domain.entity.DataVersionFile;
import org.dubhe.datasetutil.service.*;
import org.elasticsearch.action.bulk.BulkProcessor;
import org.elasticsearch.action.index.IndexRequest;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;

import org.springframework.transaction.annotation.Transactional;
import org.springframework.util.CollectionUtils;
import javax.annotation.Resource;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.sql.Timestamp;
import java.util.*;
import java.util.concurrent.Callable;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;

/**
@@ -55,6 +65,12 @@ import java.util.stream.Collectors;
@Component
public class DatasetImportHandle {

/**
* Elasticsearch index name
*/
@Value("${es.index}")
private String esIndex;

@Autowired
private DatasetService datasetService;

@@ -76,6 +92,9 @@ public class DatasetImportHandle {
@Autowired
private DataVersionFileService dataVersionFileService;

@Autowired
private DataFileAnnotationService dataFileAnnotationService;

@Autowired
private MinioUtil minioUtil;

@@ -85,10 +104,11 @@ public class DatasetImportHandle {
@Autowired
private GeneratorKeyUtil generatorKeyUtil;

/**
* Supported image format suffixes
*/
private static final List<String> SUFFIX_LIST = new ArrayList<>();
@Autowired
private ImageConfig imageConfig;

@Resource
private BulkProcessor bulkProcessor;

/**
* JSON keys expected in annotation files
@@ -99,11 +119,6 @@ public class DatasetImportHandle {
* Load static collection data
*/
static {
SUFFIX_LIST.add(".jpg");
SUFFIX_LIST.add(".png");
SUFFIX_LIST.add(".bmp");
SUFFIX_LIST.add(".jpeg");

annotationFileContextKey.add("score");
annotationFileContextKey.add("area");
annotationFileContextKey.add("name");
@@ -120,19 +135,21 @@ public class DatasetImportHandle {
public void importDataset(Scanner scanner) throws Exception {
Dataset dataset = verificationDatasetId(scanner);
String filePath = verificationFilePath(scanner);
File labelJsonFile = verificationFile(filePath);
File labelJsonFile = verificationFile(filePath, dataset);
DataLabelGroup dataLabelGroup = saveDataLabelGroup(HandleFileUtil.getLabelGroupName(labelJsonFile.getName()), dataset);
List<DataLabel> dataLabelList = readLabelContext(labelJsonFile);
saveDataLabel(dataset, dataLabelList, dataLabelGroup.getId());
log.info("........数据校验完成,即将执行下一步操作,请勿关闭窗口.................");
executeUploadAndSave(dataLabelList, filePath, dataset);
dataset.setLabelGroupId(dataLabelGroup.getId());
datasetService.updateDatasetStatus(dataset);
log.warn("");
PrintUtils.printLine(" Success: 执行成功 ", PrintUtils.GREEN);
log.warn("");
log.warn("# 是否结束? Y / N #");
System.out.println("# 是否结束? Y / N #");
Scanner scannerExit = new Scanner(System.in);
if (BusinessConstant.Y.toLowerCase().equals(scannerExit.nextLine().toLowerCase())) {
System.exit(0);
System.exit(MagicNumConstant.ZERO);
}
}

@@ -140,14 +157,15 @@ public class DatasetImportHandle {
* Check file structure and types
*
* @param globalFilePath file path
* @param dataset dataset
* @return File the label group file
*/
public File verificationFile(String globalFilePath) throws IOException {
public File verificationFile(String globalFilePath, Dataset dataset) throws IOException {
File labelRootFiles = new File(globalFilePath);
File imageRootFiles = new File(globalFilePath + HandleFileUtil.generateFilePath(BusinessConstant.IMAGE_ORIGIN));
File annotationRootFiles = new File(globalFilePath + HandleFileUtil.generateFilePath(BusinessConstant.ANNOTATION));
if (imageRootFiles.list() == null || annotationRootFiles.listFiles() == null) {
throw new ImportDatasetException(" 【" + globalFilePath + "】目录中的图片目录(origin)或者标注文件目录(annotation)的文件夹为空 ");
throw new ImportDatasetException("【" + globalFilePath + "】目录中的文件目录(origin)或者标注文件目录(annotation)的文件夹为空 ");
}
File labelJsonFile = null;
for (File file : Objects.requireNonNull(labelRootFiles.listFiles())) {
@@ -158,35 +176,49 @@ public class DatasetImportHandle {
}
}
if (labelJsonFile == null) {
throw new ImportDatasetException(" 【" + globalFilePath + "】目录中未找到标签组文件");
throw new ImportDatasetException("【" + globalFilePath + "】目录中未找到标签组文件");
}
dealLabelGroup(labelJsonFile.getName());
List<DataLabel> dataLabelList = readLabelContext(labelJsonFile);
Map<String, List<DataLabel>> dataLabelMap = dataLabelList.stream().collect(Collectors.groupingBy(DataLabel::getName));
for (Map.Entry<String, List<DataLabel>> entry : dataLabelMap.entrySet()) {
if (entry.getValue().size() > 1) {
if (entry.getValue().size() > MagicNumConstant.ONE) {
throw new ImportDatasetException(" 标签组中标签存在重复标签:【" + entry.getKey() + "】");
}
}
File[] imageFiles = imageRootFiles.listFiles();
if (imageFiles == null || imageFiles.length == 0) {
throw new ImportDatasetException(" 图片文件下不存在图片文件 ");
if (imageFiles == null || imageFiles.length == MagicNumConstant.ZERO) {
throw new ImportDatasetException(" 文件下不存在文件 ");
}
log.info("........校验文件格式,请勿关闭窗口..............");
for (File imageFile : imageFiles) {
String suffixFileName = imageFile.getName().substring(imageFile.getName().lastIndexOf(BusinessConstant.SPOT));
if (!SUFFIX_LIST.contains(suffixFileName.toLowerCase())) {
throw new ImportDatasetException(" 图片文件文件夹中存在非法格式 ");
if (dataset.getDataType().compareTo(DatatypeEnum.IMAGE.getValue()) == 0) {
if (!imageConfig.getImageFormat().contains(suffixFileName.toLowerCase())) {
throw new ImportDatasetException(" 图片文件文件夹中存在非法格式 ");
}
} else {
if (!imageConfig.getTxtFormat().contains(suffixFileName.toLowerCase())) {
throw new ImportDatasetException(" 文本文件文件夹中存在非法格式 ");
}
}

}
File[] annotationFiles = annotationRootFiles.listFiles();
if (annotationFiles == null || annotationFiles.length == 0) {
throw new ImportDatasetException(" 图片文件下不存在标注文件 ");
if (annotationFiles == null || annotationFiles.length == MagicNumConstant.ZERO) {
throw new ImportDatasetException(" 文件下不存在标注文件 ");
}
log.info("........校验文件格式完成,即将执行下一步操作,请勿关闭窗口.........");
log.info("........校验标注文件格式,请勿关闭窗口..............");
for (File annotationFile : annotationFiles) {
if (!annotationFile.getName().toLowerCase().endsWith(BusinessConstant.SUFFIX_JSON.toLowerCase())) {
throw new ImportDatasetException(" 标注文件文件夹中存在非法格式 ");
}
if (!containsJsonKey(annotationFile)) {
throw new ImportDatasetException(" 标注文件【" + annotationFile.getName() + "】 未包含'name'节点 ");
}
}
log.info("........校验标注文件格式完成,即将执行下一步操作,请勿关闭窗口..............");
return labelJsonFile;
}

@@ -199,20 +231,44 @@ public class DatasetImportHandle {
public void executeUploadAndSave(List<DataLabel> dataLabelList, String filePath, Dataset dataset) throws Exception {
String localImageFilePath = filePath + HandleFileUtil.generateFilePath(BusinessConstant.IMAGE_ORIGIN);
List<String> imageFileNameList = FileUtil.listFileNames(localImageFilePath);
log.info("需要处理: 【" + imageFileNameList.size() + "】张图片");
log.warn("........系统需要处理:【" + imageFileNameList.size() + "】个文件,请勿关闭窗口.........");
int batchNumber = MagicNumConstant.ZERO;
int oneSize = ThreadUtils.createThread(imageFileNameList.size());
log.info("需要创建线程数: 【" + oneSize + "】 条");
ProcessBarUtil.initProcess("数据集导入", (long) imageFileNameList.size());
if (imageFileNameList.size() > MagicNumConstant.TEN_THOUSAND) {
log.warn("........系统处理中.........");
List<List<String>> partitionList = Lists.partition(imageFileNameList, MagicNumConstant.FIVE_THOUSAND);
for (List<String> imageFileNameList1 : partitionList) {
batchNumber++;
dealFileList(imageFileNameList1, oneSize, dataLabelList, filePath, dataset, batchNumber);
}
} else {
log.warn("........系统处理中.........");
batchNumber++;
dealFileList(imageFileNameList, oneSize, dataLabelList, filePath, dataset, batchNumber);
}
}

/**
* @param imageFileNameList image file names
* @param oneSize batch size handled per thread
* @param dataLabelList dataset labels
* @param filePath file path
* @param dataset dataset
* @param batchNumber batch number
* @throws Exception on upload or persistence failure
*/
public void dealFileList(List<String> imageFileNameList, int oneSize, List<DataLabel> dataLabelList, String filePath, Dataset dataset, int batchNumber) throws Exception {
int dealSize = MagicNumConstant.ZERO;
List<Callable<Integer>> partitions = new ArrayList<>();
List<String> need = new ArrayList<>();
AtomicInteger atomicInteger = new AtomicInteger(MagicNumConstant.ZERO);
for (String fileName : imageFileNameList) {
need.add(fileName);
if (need.size() == oneSize || atomicInteger.intValue() == imageFileNameList.size() - MagicNumConstant.ONE) {
for (int i = 0; i < imageFileNameList.size(); i++) {
need.add(imageFileNameList.get(i));
if (need.size() == oneSize || i == imageFileNameList.size() - MagicNumConstant.ONE) {
List<String> fileNameList = new ArrayList<>(need);
dealSize += fileNameList.size();
need.clear();
partitions.add(() -> runTask(dataLabelList, dataset, fileNameList, filePath));
}
atomicInteger.getAndIncrement();
}
ThreadUtils.runMultiThread(partitions);
}
@@ -220,71 +276,157 @@ public class DatasetImportHandle {
/**
* Perform the actual import task
*
* @param dataLabelList dataset labels
* @param dataset dataset
* @param fileNameList file names
* @param dataSetRootFilePath dataset root file path
* @return Integer success count
*/
private Integer runTask(List<DataLabel> dataLabelList, Dataset dataset, List<String> fileNameList, String dataSetRootFilePath) throws Exception {
Integer success = MagicNumConstant.ZERO;
List<DataFile> dataFilesList = new ArrayList<>();
AtomicInteger atomicInteger = new AtomicInteger(MagicNumConstant.ZERO);
String imageFileBaseDir = BusinessConstant.MINIO_ROOT_PATH + BusinessConstant.FILE_SEPARATOR + dataset.getId()
+ BusinessConstant.FILE_SEPARATOR + BusinessConstant.IMAGE_ORIGIN + BusinessConstant.FILE_SEPARATOR;
String annotationFileBaseDir = BusinessConstant.MINIO_ROOT_PATH + BusinessConstant.FILE_SEPARATOR + dataset.getId()
+ BusinessConstant.FILE_SEPARATOR + BusinessConstant.ANNOTATION + BusinessConstant.FILE_SEPARATOR;
for (String fileName : fileNameList) {
String imageUploadFile = imageFileBaseDir + fileName;
String annotationFileName = HandleFileUtil.readFileName(fileName);
File annotationFile = new File(dataSetRootFilePath + HandleFileUtil.generateFilePath(BusinessConstant.ANNOTATION) + BusinessConstant.FILE_SEPARATOR + annotationFileName + BusinessConstant.SUFFIX_JSON.toLowerCase());
for (int i = 0; i < fileNameList.size(); i++) {
String imageUploadFile = imageFileBaseDir + fileNameList.get(i);
String annotationFileName = HandleFileUtil.readFileName(fileNameList.get(i));
File annotationFile = new File(dataSetRootFilePath + HandleFileUtil.generateFilePath(BusinessConstant.ANNOTATION) + BusinessConstant.FILE_SEPARATOR + annotationFileName + BusinessConstant.SUFFIX_JSON.toLowerCase());
JSONArray jsonArray = replaceJsonNode(annotationFile, dataLabelList);
minioUtil.upLoadFile(imageUploadFile, FileUtil.getInputStream(dataSetRootFilePath + HandleFileUtil.generateFilePath(BusinessConstant.IMAGE_ORIGIN) + BusinessConstant.FILE_SEPARATOR + fileName));
minioUtil.upLoadFile(annotationFileBaseDir + annotationFileName, IOUtils.toInputStream(jsonArray.toString(), StandardCharsets.UTF_8.name()));
DataFile dataFile = new DataFile();
dataFile.setName(annotationFileName);
dataFile.setUrl(minioConfig.getBucketName() + BusinessConstant.FILE_SEPARATOR + imageUploadFile);
dataFile.setStatus(FileStateCodeConstant.ANNOTATION_COMPLETE_FILE_STATE);
dataFile.setDatasetId(dataset.getId());
dataFile.setFileType(MagicNumConstant.ZERO);
dataFile.setPid(MagicNumConstant.ZERO_LONG);
dataFile.setCreateUserId(dataset.getCreateUserId());
try {
BufferedImage image = ImageIO.read(new File(dataSetRootFilePath + HandleFileUtil.generateFilePath(BusinessConstant.IMAGE_ORIGIN) + BusinessConstant.FILE_SEPARATOR + fileName));
minioUtil.upLoadFile(dataSetRootFilePath + HandleFileUtil.generateFilePath(BusinessConstant.IMAGE_ORIGIN) + BusinessConstant.FILE_SEPARATOR + fileNameList.get(i), imageUploadFile);
String tempFilePath = annotationFile.getAbsolutePath() + "_temp.json";
FileUtil.appendString(jsonArray.toJSONString(), tempFilePath, "UTF-8");
minioUtil.upLoadFileByInputStream(annotationFileBaseDir + annotationFileName, tempFilePath);
FileUtil.del(tempFilePath);
datasetService.updateDatasetStatusIsImport(dataset);
DataFile dataFile = new DataFile(annotationFileName, dataset.getId(), minioConfig.getBucketName() + BusinessConstant.FILE_SEPARATOR + imageUploadFile, dataset.getCreateUserId(),
FileStateCodeConstant.ANNOTATION_COMPLETE_FILE_STATE, MagicNumConstant.ZERO, MagicNumConstant.ZERO_LONG, dataset.getCreateUserId());
if (dataset.getDataType().compareTo(DatatypeEnum.IMAGE.getValue()) == 0) {
BufferedImage image;
try {
image = ImageIO.read(new File(dataSetRootFilePath + HandleFileUtil.generateFilePath(BusinessConstant.IMAGE_ORIGIN) + BusinessConstant.FILE_SEPARATOR + fileNameList.get(i)));
} catch (IOException e) {
throw new ImportDatasetException(" 读取图片高和宽失败 ");
}
dataFile.setWidth(image.getWidth());
dataFile.setHeight(image.getHeight());
} catch (IOException e) {
throw new ImportDatasetException(" 读取图片高和宽失败 ");
}
dataFile.setOriginUserId(dataset.getCreateUserId());
dataFilesList.add(dataFile);
if (dataFilesList.size() % MagicNumConstant.FIVE_HUNDRED == MagicNumConstant.ZERO || atomicInteger.intValue() == fileNameList.size() - MagicNumConstant.ONE) {
long startDataFileIndex = generatorKeyUtil.getSequenceByBusinessCode(BusinessConstant.DATA_FILE, dataFilesList.size());
for (DataFile dataFileEntity : dataFilesList) {
dataFileEntity.setId(startDataFileIndex++);
if (dataFilesList.size() % MagicNumConstant.FIVE_HUNDRED == MagicNumConstant.ZERO || i == fileNameList.size() - MagicNumConstant.ONE) {
if(!CollectionUtils.isEmpty(dataFilesList)){
Queue<Long> dataFileIds = generatorKeyUtil.getSequenceByBusinessCode(BusinessConstant.DATA_FILE, dataFilesList.size());
for (DataFile dataFileEntity : dataFilesList) {
dataFileEntity.setId(dataFileIds.poll());
}
saveDataFile(dataFilesList);
}
saveDataFile(dataFilesList);
List<DataVersionFile> dataVersionFileList = new ArrayList<>();
for (DataFile file : dataFilesList) {
DataVersionFile dataVersionFile = new DataVersionFile();
dataVersionFile.setDatasetId(dataset.getId());
dataVersionFile.setFileId(file.getId());
dataVersionFile.setStatus(MagicNumConstant.ZERO);
dataVersionFile.setAnnotationStatus(FileStateCodeConstant.ANNOTATION_COMPLETE_FILE_STATE);
File annotationFileTxt = new File(dataSetRootFilePath + HandleFileUtil.generateFilePath(BusinessConstant.ANNOTATION) + BusinessConstant.FILE_SEPARATOR + file.getName() + BusinessConstant.SUFFIX_JSON.toLowerCase());
JSONArray jsonArrayTxt = replaceJsonNode(annotationFileTxt, dataLabelList);
DataVersionFile dataVersionFile = new DataVersionFile(dataset.getId(), file.getId(), FileStateCodeConstant.ANNOTATION_COMPLETE_FILE_STATE, MagicNumConstant.ZERO, file.getName());
dataVersionFileList.add(dataVersionFile);
if (DatatypeEnum.TXT.getValue().equals(dataset.getDataType())) {
try {
String bucketName = StringUtils.substringBefore(file.getUrl(), "/");
String fullFilePath = StringUtils.substringAfter(file.getUrl(), "/");
String content = minioUtil.readString(bucketName, fullFilePath);
Map<String, String> jsonMap = new HashMap<>();
jsonMap.put("content", content);
jsonMap.put("name", file.getName());
jsonMap.put("status", FileStateCodeConstant.ANNOTATION_COMPLETE_FILE_STATE.toString());
jsonMap.put("datasetId", dataset.getId().toString());
jsonMap.put("createUserId", file.getCreateUserId() == null ? null : file.getCreateUserId().toString());
jsonMap.put("createTime", file.getCreateTime() == null ? null : file.getCreateTime().toString());
jsonMap.put("updateUserId", file.getUpdateUserId() == null ? null : file.getUpdateUserId().toString());
jsonMap.put("updateTime", file.getUpdateTime() == null ? null : file.getUpdateTime().toString());
jsonMap.put("fileType", file.getFileType() == null ? null : file.getFileType().toString());
jsonMap.put("enhanceType", file.getEnhanceType() == null ? null : file.getEnhanceType().toString());
jsonMap.put("originUserId", file.getOriginUserId().toString());
jsonMap.put("prediction", jsonArrayTxt.getJSONObject(0).get("score").toString());
jsonMap.put("labelId", jsonArrayTxt.getJSONObject(0).get("category_id").toString());
jsonMap.put("versionName", StringUtils.isEmpty(dataset.getCurrentVersionName())?"V0000" : dataset.getCurrentVersionName());
bulkProcessor.add(new IndexRequest(esIndex, "_doc", file.getId().toString()).source(jsonMap));
} catch (Exception e) {
LogUtil.error(LogEnum.BIZ_DATASET, "上传es失败: {} ", e);
}
}
}
long startDataFileVersionIndex = generatorKeyUtil.getSequenceByBusinessCode(BusinessConstant.DATA_VERSION_FILE, dataVersionFileList.size());
for (DataVersionFile dataVersionFile : dataVersionFileList) {
dataVersionFile.setId(startDataFileVersionIndex++);
if(!CollectionUtils.isEmpty(dataVersionFileList)){
Queue<Long> dataFileVersionIds = generatorKeyUtil.getSequenceByBusinessCode(BusinessConstant.DATA_VERSION_FILE, dataVersionFileList.size());
for (DataVersionFile dataVersionFile : dataVersionFileList) {
dataVersionFile.setId(dataFileVersionIds.poll());
}
saveDataVersionFile(dataVersionFileList);
}
saveDataVersionFile(dataVersionFileList);
List<DataFileAnnotation> dataFileAnnotations = new ArrayList<>();
for (DataVersionFile dataVersionFile : dataVersionFileList) {
File annotationFileDb = new File(dataSetRootFilePath + HandleFileUtil.generateFilePath(BusinessConstant.ANNOTATION) + BusinessConstant.FILE_SEPARATOR + dataVersionFile.getFileName() + BusinessConstant.SUFFIX_JSON.toLowerCase());
JSONArray jsonArrayDb = replaceJsonNode(annotationFileDb, dataLabelList);
List<AnnotationDTO> annotationDTOSDb = JSONObject.parseArray(jsonArrayDb.toJSONString(), AnnotationDTO.class);
if(!CollectionUtils.isEmpty(jsonArrayDb)){
if (AnnotateTypeEnum.CLASSIFICATION.getValue().equals(dataset.getAnnotateType()) || AnnotateTypeEnum.TEXT_CLASSIFICATION.getValue().equals(dataset.getAnnotateType())) {
AnnotationDTO annotationDTO = annotationDTOSDb.stream().max(Comparator.comparingDouble(AnnotationDTO::getScore)).get();
Long labelId1 = annotationDTO.getCategoryId();
Double prediction = annotationDTO.getScore();
dataFileAnnotations.add(new DataFileAnnotation(dataset.getId(), labelId1, dataVersionFile.getId(), prediction, dataset.getCreateUserId(), dataVersionFile.getFileName()));
}
if (AnnotateTypeEnum.OBJECT_DETECTION.getValue().equals(dataset.getAnnotateType()) || AnnotateTypeEnum.OBJECT_TRACK.getValue().equals(dataset.getAnnotateType())
|| AnnotateTypeEnum.SEMANTIC_CUP.getValue().equals(dataset.getAnnotateType())) {
for (int j = 0; j < jsonArrayDb.size(); j++) {
Object predictionObject = jsonArrayDb.getJSONObject(j).get("score");
Double prediction = null;
if (!Objects.isNull(predictionObject)) {
prediction = Double.parseDouble(String.valueOf(predictionObject));
}
Long labelId = (Long) jsonArrayDb.getJSONObject(j).get("category_id");
DataFileAnnotation dataFileAnnotation = new DataFileAnnotation(dataset.getId(), labelId, dataVersionFile.getId(), prediction, dataset.getCreateUserId(), dataVersionFile.getFileName());
dataFileAnnotations.add(dataFileAnnotation);
}
}

}
if(!CollectionUtils.isEmpty(dataFileAnnotations)){
Queue<Long> dataFileAnnotationIds = generatorKeyUtil.getSequenceByBusinessCode(BusinessConstant.DATA_FILE_ANNOTATION, dataFileAnnotations.size());
for (DataFileAnnotation dataFileAnnotation : dataFileAnnotations) {
dataFileAnnotation.setId(dataFileAnnotationIds.poll());
}
saveDataFileAnnotation(dataFileAnnotations);
}
dataFileAnnotations.clear();
}
ProcessBarUtil.processBar01((long) dataVersionFileList.size());
dataVersionFileList.clear();
dataFilesList.clear();
}
success++;
}
return success;

}
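
Throughout this import path, generatorKeyUtil.getSequenceByBusinessCode(businessCode, n) acts as a block ID allocator: it reserves n consecutive primary keys for one business table and hands them back as a queue, so rows can be batch-inserted with pre-assigned IDs across sharded tables. The utility's body is not part of this hunk; below is a minimal method-level sketch of the assumed contract, built on the data_sequence start/step columns that DataSequenceServiceImpl.expansionUsedNumber works with further down (the dataSequenceService field and java.util.LinkedList usage are assumptions):

    // Sketch only (assumed behavior, not the committed implementation):
    // pull whole start/step blocks from data_sequence until `count` IDs are queued.
    public synchronized Queue<Long> getSequenceByBusinessCode(String businessCode, int count) {
        Queue<Long> ids = new LinkedList<>();
        while (ids.size() < count) {
            // reads the current block and advances `start` for the next caller
            DataSequence sequence = dataSequenceService.expansionUsedNumber(businessCode);
            long next = sequence.getStart();
            while (next < sequence.getStart() + sequence.getStep() && ids.size() < count) {
                ids.add(next++);
            }
        }
        return ids;
    }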

/**
* 截取文本摘要信息
*
* @param file 文本file
* @return String 文本摘要信息
*/
public String InterceptingText(File file) {
StringBuilder result = new StringBuilder();
try (BufferedReader br = new BufferedReader(new FileReader(file))) {
String s;
while ((s = br.readLine()) != null) {
result.append(s);
}
} catch (Exception e) {
LogUtil.error(LogEnum.BIZ_DATASET, "读取文本摘要失败: {} ", e);
}
return StringUtils.substring(result.toString(), MagicNumConstant.ZERO, MagicNumConstant.FOUR_HUNDRED);
}
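
A quick usage sketch (the path is hypothetical; the 400-character cap comes from MagicNumConstant.FOUR_HUNDRED above):

    // at most the first 400 characters of the text file, used as its abstract
    String abstractTxt = InterceptingText(new File("/data/import/origin/sample_0001.txt"));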

/**
* 检查并且替换JSON中的节点
*
@@ -350,6 +492,34 @@ public class DatasetImportHandle {
return listKey.stream().distinct().collect(Collectors.toList());
}

/**
* 校验json文件中是否包含name
*
* @param file 标注文件
* @return true/false true 包含 false 不包含
*/
public boolean containsJsonKey(File file) {
boolean flag = true;
String annotationFileContext;
try {
annotationFileContext = HandleFileUtil.readFile(file);
} catch (IOException e) {
throw new ImportDatasetException(" 解析【" + file.getName() + "】文件出错,请确认内容是否正确");
}
if (!StringUtils.isEmpty(annotationFileContext)) {
JSONArray jsonArray = JSONArray.parseArray(annotationFileContext);
for (Object object : jsonArray) {
LinkedHashMap<String, String> jsonMap = JSON.parseObject(object.toString(), new TypeReference<LinkedHashMap<String, String>>() {
});
if (!jsonMap.containsKey("name")) {
flag = false;
}
}
}
return flag;
}
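
For reference, a hypothetical annotation array that passes this check — every element carries a "name" key, matching the pre-import format whose name/score fields are consumed elsewhere in this class:

    [{"name": "dog", "score": 0.92}, {"name": "cat", "score": 0.88}]

If any element omits "name" the method returns false, which the import flow can then treat as a malformed annotation file.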


/**
* 读取标签文件中标签数据
*
@@ -376,7 +546,7 @@ public class DatasetImportHandle {
String groupName = HandleFileUtil.getLabelGroupName(labelGroupName);
int count = dataLabelGroupService.selectByLabelGroupName(groupName);
if (count > MagicNumConstant.ZERO) {
throw new ImportDatasetException(" 标签组名称【" + groupName + "】已存在,请修改label_xxx.json文件名 ");
throw new ImportDatasetException(" 标签组名称【" + groupName + "】已存在,请修改label_{name}.json文件名 ");
}
}

@@ -390,6 +560,11 @@ public class DatasetImportHandle {
public DataLabelGroup saveDataLabelGroup(String labelGroupName, Dataset dataset) {
long timeStamp = System.currentTimeMillis();
DataLabelGroup dataLabelGroup = new DataLabelGroup();
if (dataset.getDataType().equals(DatatypeEnum.TXT.getValue())) {
dataLabelGroup.setLabelGroupType(MagicNumConstant.ONE);
} else {
dataLabelGroup.setLabelGroupType(MagicNumConstant.ZERO);
}
dataLabelGroup.setName(labelGroupName);
dataLabelGroup.setOriginUserId(dataset.getCreateUserId());
dataLabelGroup.setType(MagicNumConstant.ZERO_LONG);
@@ -422,7 +597,7 @@ public class DatasetImportHandle {
List<DataGroupLabel> listDataGroupLabel = new ArrayList<>();
for (DatasetDataLabel datasetDataLabel : listDatasetDataLabel) {
DataGroupLabel dataGroupLabel = new DataGroupLabel();
dataGroupLabel.setLabelId(datasetDataLabel.getLabelId());
dataGroupLabel.setLabelGroupId(dataLabelGroupId);
listDataGroupLabel.add(dataGroupLabel);
}
@@ -435,6 +610,7 @@ public class DatasetImportHandle {
*
* @param listDataFile file集合
*/
@Transactional(rollbackFor = Exception.class)
public void saveDataFile(List<DataFile> listDataFile) {
dataFileService.saveBatchDataFile(listDataFile);
}
@@ -444,6 +620,7 @@ public class DatasetImportHandle {
*
* @param listDataVersionFile 文件版本数据
*/
@Transactional(rollbackFor = Exception.class)
public void saveDataVersionFile(List<DataVersionFile> listDataVersionFile) {
dataVersionFileService.saveBatchDataFileVersion(listDataVersionFile);
}
@@ -454,6 +631,7 @@ public class DatasetImportHandle {
*
* @param listDatasetDataLabel 标签与数据集关系表
*/
@Transactional(rollbackFor = Exception.class)
public void saveDatasetDataLabel(List<DatasetDataLabel> listDatasetDataLabel) {
datasetDataLabelService.saveBatchDatasetDataLabel(listDatasetDataLabel);
}
@@ -461,12 +639,23 @@ public class DatasetImportHandle {
/**
* 批量保存标签与标签组的关系
*
* @param listDataGroupLabel 标签与标签组集合
*/
@Transactional(rollbackFor = Exception.class)
public void saveDatasetDataGroupLabel(List<DataGroupLabel> listDataGroupLabel) {
dataGroupLabelService.saveDataGroupLabel(listDataGroupLabel);
}

/**
* 批量保存nlp中间表
*
* @param dataFileAnnotations nlp集合
*/
@Transactional(rollbackFor = Exception.class)
public void saveDataFileAnnotation(List<DataFileAnnotation> dataFileAnnotations) {
dataFileAnnotationService.saveDataFileAnnotation(dataFileAnnotations);
}

/**
* 查询数据集
*
@@ -480,26 +669,42 @@ public class DatasetImportHandle {
/**
* 校验数据集ID
*
* @param scanner 控制台输入参数
* @return Dataset 数据集
*/
public Dataset verificationDatasetId(Scanner scanner) {
boolean flag = false;
Dataset dataset = new Dataset();
while (!flag) {
System.out.println(" ");
System.out.println("# 请输入数据集ID #");
String datasetIdStr = scanner.nextLine();
long datasetId = 0;
try {
datasetId = Long.parseLong(datasetIdStr.trim());
} catch (Exception e) {
log.error("");
PrintUtils.printLine(" Error: 数据集ID非法,请重新输入", PrintUtils.RED);
log.error("");
continue;
}
dataset = findDataset(datasetId);
if (dataset == null) {
log.error("");
PrintUtils.printLine(" Error: 数据集ID不存在,请重新输入", PrintUtils.RED);
log.error("");
continue;
}
int countDataLabel = datasetService.findDataLabelById(dataset.getId());
int countDataFile = datasetService.findDataFileById(dataset.getId());
if (countDataLabel > MagicNumConstant.ZERO || countDataFile > MagicNumConstant.ZERO) {
log.error("");
PrintUtils.printLine(" Error: 当前数据集文件已存在,请勿重新导入 ", PrintUtils.RED);
log.error("");
continue;
} else {
flag = true;
}
}
return dataset;
}
@@ -511,11 +716,21 @@ public class DatasetImportHandle {
* @return String 待导入数据集的本地绝对路径
*/
public String verificationFilePath(Scanner scanner) {
boolean flag = false;
String filePath = "";
while (!flag) {
System.out.println(" ");
System.out.println("# 请输入待导入本地数据集绝对路径 #");
filePath = scanner.nextLine();
File file = new File(filePath.trim());
if (!file.exists()) {
log.error("");
PrintUtils.printLine(" 【" + filePath + "】文件路径不存在,请重新输入", PrintUtils.RED);
log.error("");
continue;
} else {
flag = true;
}
}
return filePath;
}


+ 938
- 0
dataset-util/src/main/java/org/dubhe/datasetutil/handle/PresetDatasetImportHandle.java View File

@@ -0,0 +1,938 @@
/**
* Copyright 2020 Zhejiang Lab. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* =============================================================
*/
package org.dubhe.datasetutil.handle;

import cn.hutool.core.io.FileUtil;
import cn.hutool.core.util.RandomUtil;
import cn.hutool.core.util.StrUtil;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.google.common.collect.Lists;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.dubhe.datasetutil.common.base.MagicNumConstant;
import org.dubhe.datasetutil.common.config.MinioConfig;
import org.dubhe.datasetutil.common.constant.BusinessConstant;
import org.dubhe.datasetutil.common.constant.FileStateCodeConstant;
import org.dubhe.datasetutil.common.enums.DatatypeEnum;
import org.dubhe.datasetutil.common.enums.LogEnum;
import org.dubhe.datasetutil.common.enums.PresetDatasetEnum;
import org.dubhe.datasetutil.common.exception.ImportDatasetException;
import org.dubhe.datasetutil.common.util.*;
import org.dubhe.datasetutil.domain.dto.FileAnnotationDTO;
import org.dubhe.datasetutil.domain.entity.*;
import org.dubhe.datasetutil.service.*;
import org.elasticsearch.action.bulk.BulkProcessor;
import org.elasticsearch.action.index.IndexRequest;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.util.CollectionUtils;
import org.springframework.util.ObjectUtils;

import javax.annotation.Resource;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.time.LocalDateTime;
import java.util.*;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;

import static org.dubhe.datasetutil.common.constant.BusinessConstant.FILE_SEPARATOR;

/**
* @description 导入预置数据集工具类
* @date 2020-10-12
*/
@Slf4j
@Component
public class PresetDatasetImportHandle {

/**
* esSearch索引
*/
@Value("${es.index}")
private String esIndex;

@Autowired
private DatasetService datasetService;

@Autowired
private DataFileService dataFileService;

@Autowired
private DataLabelService dataLabelService;

@Autowired
private DatasetDataLabelService datasetDataLabelService;

@Autowired
private DataVersionFileService dataVersionFileService;

@Autowired
private DataFileAnnotationService dataFileAnnotationService;

@Autowired
private MinioUtil minioUtil;

@Autowired
private MinioConfig minioConfig;

@Autowired
private GeneratorKeyUtil generatorKeyUtil;

@Resource
private BulkProcessor bulkProcessor;

private final AtomicInteger fileCount = new AtomicInteger();


private final List<File> annotationFiles = new LinkedList<>();

private final List<File> originFiles = new LinkedList<>();

private final Map<String, FileAnnotationDTO> fileAnnotationMap = new ConcurrentHashMap<>();


@Value("${minio.dosAddress}")
private String dosAddress;

private final static Set<String> datasetIds = new HashSet<>();


private volatile List<DataLabel> labels = new ArrayList<>();

static {
PresetDatasetEnum[] values = PresetDatasetEnum.values();
for (PresetDatasetEnum datasetEnum : values) {
datasetIds.add(datasetEnum.getType());
}
}

/**
* 导入预置数据集
*
* @param scanner 控制台输入数据
*/
public synchronized void importPresetDataset(Scanner scanner) {
//校验数据集信息
long datasetId = verificationDatasetId(scanner);
try {
LocalDateTime startTime = LocalDateTime.now();
//校验文件目录并保存sql文件信息
String rootPath = verificationFilePathAndSaveSqlData(scanner, datasetId);
//构建上传文件路径数据
Dataset dataset = findDataset(datasetId);
if (Objects.isNull(dataset)) {
throw new ImportDatasetException("数据集ID: " + datasetId + "不存在!");
}
//上传文件到 minio
executeUploadToMinio(dataset, rootPath);
executeUploadToDB(dataset);
LocalDateTime endTime = LocalDateTime.now();
Duration between = Duration.between(startTime, endTime);
log.warn("");
PrintUtils.printLine(" Success: 执行成功 ", PrintUtils.GREEN);
PrintUtils.printLine(" 执行开始时间:{" + startTime + "} 执行结束时间:{" + endTime + "} 执行总时长(分钟){" + between.toMinutes() + "}", PrintUtils.YELLOW);
log.warn("");
System.out.println("# 是否结束? Y / N #");
Scanner scannerExit = new Scanner(System.in);
if (BusinessConstant.Y.toLowerCase().equals(scannerExit.nextLine().toLowerCase())) {
System.exit(MagicNumConstant.ZERO);
}
} catch (Exception e) {
log.error("");
PrintUtils.printLine(" Error:" + e.getMessage(), PrintUtils.RED);
log.error("");
Dataset dataset = findDataset(datasetId);
if (!Objects.isNull(dataset)) {
PrintUtils.printLine(" 执行异常,正在清理异常数据,请勿关闭窗口 ", PrintUtils.RED);
//删除minio数据
delDatasetMinioInfo(dataset.getUri());
//删除数据集信息
delDatasetInfoById(datasetId, dataset.getDataType());
}
} finally {
originFiles.clear();
annotationFiles.clear();
labels.clear();
}

}


/**
* 实际上传文件到Minio
*
* @param dataset 数据集实体
* @param rootPath 文件根路径
* @throws Exception 上传异常
*/
private void executeUploadToMinio(Dataset dataset, String rootPath) throws Exception {
List<File> allFileList = new LinkedList<>(annotationFiles);
allFileList.addAll(originFiles);
log.warn("........系统需要处理:【" + allFileList.size() + "】份文件,请勿关闭窗口.........");
int batchNumber = MagicNumConstant.ZERO;
int oneSize = ThreadUtils.createThread(allFileList.size());
ProcessBarUtil.initProcess("预置数据集导入", (long) allFileList.size());
if (allFileList.size() > MagicNumConstant.TEN_THOUSAND) {
log.warn("........系统处理中.........");
List<List<File>> partitionList = Lists.partition(allFileList, MagicNumConstant.FIVE_THOUSAND);
for (List<File> imageFileNameList1 : partitionList) {
batchNumber++;
dealFileListToMinio(imageFileNameList1, oneSize, dataset, batchNumber, rootPath);
}
} else {
log.warn("........系统处理中.........");
batchNumber++;
dealFileListToMinio(allFileList, oneSize, dataset, batchNumber, rootPath);
}

}


/**
* 实际上传文件到数据库
*
* @param dataset 数据集实体
* @throws Exception 上传异常
*/
private void executeUploadToDB(Dataset dataset) throws Exception {
log.warn("........系统需要处理:【" + originFiles.size() + "】份文件到数据库,请勿关闭窗口.........");
int batchNumber = MagicNumConstant.ZERO;
int oneSize = ThreadUtils.createThread(originFiles.size());
//视频数据导入单线程顺序处理
if (DatatypeEnum.VIDEO.getValue().compareTo(dataset.getDataType()) == 0) {
sortByName(originFiles);
runTaskSql(originFiles, dataset);
log.warn("#-------------系统已总共成功处理文件 【" + oneSize + "】个-------------#");
return;
}
if (originFiles.size() > MagicNumConstant.TEN_THOUSAND) {
List<List<File>> partitionList = Lists.partition(originFiles, MagicNumConstant.FIVE_THOUSAND);
for (List<File> imageFileNameList1 : partitionList) {
batchNumber++;
LogUtil.info(LogEnum.BIZ_DATASET, "第: 【" + batchNumber + "】批次,需要处理:【" + imageFileNameList1.size() + "】 文件: ");
dealFileListToSql(imageFileNameList1, oneSize, dataset, batchNumber);
}
} else {
batchNumber++;
dealFileListToSql(originFiles, oneSize, dataset, batchNumber);
}

}


/**
* 多线程上传数据到minio
*
* @param allFileList 文件数据
* @param oneSize 每次处理次数
* @param dataset 数据集实体
* @param batchNumber 上传批次
* @param rootPath 根路径
* @throws Exception 上传异常
*/
public void dealFileListToMinio(List<File> allFileList, int oneSize, Dataset dataset, int batchNumber, String rootPath) throws Exception {
List<Callable<Integer>> partitions = new LinkedList<>();
List<File> need = new LinkedList<>();
for (int i = 0; i < allFileList.size(); i++) {
need.add(allFileList.get(i));
if (need.size() == oneSize || i == allFileList.size() - MagicNumConstant.ONE) {
List<File> fileNameList = new LinkedList<>(need);

need.clear();
partitions.add(() -> runTask(fileNameList, dataset));
}
}
ThreadUtils.runMultiThread(partitions);
}


/**
* 多线程上传数据到sql
*
* @param allFileList 文件数据
* @param oneSize 每次处理次数
* @param dataset 数据集实体
* @param batchNumber 上传批次
* @throws Exception 上传异常
*/
public void dealFileListToSql(List<File> allFileList, int oneSize, Dataset dataset, int batchNumber) throws Exception {
int dealSize = MagicNumConstant.ZERO;
List<Callable<Integer>> partitions = new LinkedList<>();
List<File> need = new LinkedList<>();
for (int i = 0; i < allFileList.size(); i++) {
need.add(allFileList.get(i));
if (need.size() == oneSize || i == allFileList.size() - MagicNumConstant.ONE) {
List<File> fileNameList = new LinkedList<>(need);
dealSize += fileNameList.size();
LogUtil.info(LogEnum.BIZ_DATASET, "系统将处理第: 【" + batchNumber + "】批次,需要处理:【" + dealSize + "】个文件至数据库");
need.clear();
partitions.add(() -> runTaskSql(fileNameList, dataset));
}
}
ThreadUtils.runMultiThread(partitions);
}
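
Both deal* methods share one fan-out pattern: ThreadUtils.createThread(total) decides the chunk size, the file list is cut into chunks of that size, and each chunk becomes one Callable handed to ThreadUtils.runMultiThread. The utility itself is outside this hunk; a minimal sketch of the assumed semantics (class name and pool sizing are illustrative only):

    import java.util.List;
    import java.util.concurrent.Callable;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.Future;

    public class ThreadUtilsSketch {
        // run every chunk task on a fixed pool and wait for all of them (assumption)
        public static void runMultiThread(List<Callable<Integer>> tasks) throws Exception {
            ExecutorService pool = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());
            try {
                for (Future<Integer> future : pool.invokeAll(tasks)) {
                    future.get(); // surfaces the first task failure, if any
                }
            } finally {
                pool.shutdown();
            }
        }
    }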


/**
* 实际实际上传执行方法
*
* @param files 上传文件
* @param dataset 数据集实体
* @return 执行次数
*/
private Integer runTaskSql(List<File> files, Dataset dataset) {
Integer success = MagicNumConstant.ZERO;
List<DataFile> dataFilesList = new LinkedList<>();
for (int i = 0; i < files.size(); i++) {
File file = files.get(i);
//绝对路径
String absolutePath = file.getAbsolutePath();
//根目录 /${datasetID}/
String rootName = BusinessConstant.FILE_SEPARATOR + dataset.getId() + BusinessConstant.FILE_SEPARATOR;
// dubhe-dev/dataset/${datasetID}/origin/${a.jpg}
String fileName = minioConfig.getBucketName() + File.separator + BusinessConstant.MINIO_ROOT_PATH + rootName +
StringUtils.substringAfter(absolutePath, File.separator + dataset.getId() + File.separator);
//转换 Linux 斜杠
String targetFilePath = StringUtils.replaceChars(fileName, "\\", "/");
//构建 dataset对象
DataFile dataFile = new DataFile();
dataFile.setName(HandleFileUtil.readFileName(file.getName()));
dataFile.setUrl(targetFilePath);
dataFile.setStatus(FileStateCodeConstant.ANNOTATION_COMPLETE_FILE_STATE);
dataFile.setDatasetId(dataset.getId());
dataFile.setFileType(MagicNumConstant.ZERO);
dataFile.setPid(MagicNumConstant.ZERO_LONG);
dataFile.setCreateUserId(dataset.getCreateUserId());
dataFile.setOriginUserId(MagicNumConstant.ZERO_LONG);
if (dataset.getDataType().compareTo(DatatypeEnum.IMAGE.getValue()) == 0) {
try {
BufferedImage image = ImageIO.read(file);
dataFile.setWidth(image.getWidth());
dataFile.setHeight(image.getHeight());
} catch (IOException e) {
throw new ImportDatasetException(" 读取图片高和宽失败 ");
}
}
dataFile.setOriginUserId(MagicNumConstant.ZERO_LONG);
dataFilesList.add(dataFile);
// 500 写一次库 或者最后写一次库
if (dataFilesList.size() % MagicNumConstant.FIVE_HUNDRED == MagicNumConstant.ZERO || i == files.size() - MagicNumConstant.ONE) {
Queue<Long> dataFileIds = generatorKeyUtil.getSequenceByBusinessCode(BusinessConstant.DATA_FILE, dataFilesList.size());
for (DataFile dataFileEntity : dataFilesList) {
dataFileEntity.setId(dataFileIds.poll());
}
//写 dataset_file 表
dataFileService.saveBatchDataFile(dataFilesList);
//构建 DatasetVersionFile对象
List<DataVersionFile> dataVersionFileList = new ArrayList<>();
for (DataFile datasetFile : dataFilesList) {
DataVersionFile dataVersionFile = new DataVersionFile();
dataVersionFile.setDatasetId(dataset.getId());
dataVersionFile.setFileId(datasetFile.getId());
dataVersionFile.setStatus(MagicNumConstant.TWO);
dataVersionFile.setVersionName(dataset.getDataType().compareTo(DatatypeEnum.TXT.getValue()) == 0 ? null : BusinessConstant.V0001);
dataVersionFile.setAnnotationStatus(FileStateCodeConstant.ANNOTATION_COMPLETE_FILE_STATE);
dataVersionFile.setFileName(datasetFile.getName());
dataVersionFileList.add(dataVersionFile);
}
Queue<Long> dataFileVersionIds = generatorKeyUtil.getSequenceByBusinessCode(BusinessConstant.DATA_VERSION_FILE, dataVersionFileList.size());
for (DataVersionFile dataVersionFile : dataVersionFileList) {
dataVersionFile.setId(dataFileVersionIds.poll());
}
//写 dataset_version_file 表
dataVersionFileService.saveBatchDataFileVersion(dataVersionFileList);

List<DataFileAnnotation> dataFileAnnotations = dataVersionFileList.stream().map(dataVersionFile -> {

FileAnnotationDTO fileAnnotationDTO = null;
try {
fileAnnotationDTO = fileAnnotationMap.get(dataVersionFile.getFileName());
//构建 datasetFileAnnotation 对象
DataFileAnnotation dataFileAnnotation = DataFileAnnotation.builder()
.datasetId(dataset.getId())
.LabelId(ObjectUtils.isEmpty(fileAnnotationDTO) ? null : fileAnnotationDTO.getCategoryId())
.prediction(1D)
.versionFileId(dataVersionFile.getId())
.build();
if (DatatypeEnum.TXT.getValue().equals(dataset.getDataType())) {
try {
String bucketName = StringUtils.substringBefore(dataFile.getUrl(), "/");
String fullFilePath = StringUtils.substringAfter(dataFile.getUrl(), "/");
String content = minioUtil.readString(bucketName, fullFilePath);
Map<String, String> jsonMap = new HashMap<>();
jsonMap.put("content", content);
jsonMap.put("name", dataFile.getName());
jsonMap.put("status", FileStateCodeConstant.ANNOTATION_COMPLETE_FILE_STATE.toString());
jsonMap.put("datasetId", dataset.getId().toString());
jsonMap.put("createUserId", dataFile.getCreateUserId() == null ? null : dataFile.getCreateUserId().toString());
jsonMap.put("createTime", dataFile.getCreateTime() == null ? null : dataFile.getCreateTime().toString());
jsonMap.put("updateUserId", dataFile.getUpdateUserId() == null ? null : dataFile.getUpdateUserId().toString());
jsonMap.put("updateTime", dataFile.getUpdateTime() == null ? null : dataFile.getUpdateTime().toString());
jsonMap.put("fileType", dataFile.getFileType() == null ? null : dataFile.getFileType().toString());
jsonMap.put("enhanceType", dataFile.getEnhanceType() == null ? null : dataFile.getEnhanceType().toString());
jsonMap.put("originUserId", dataFile.getOriginUserId().toString());
jsonMap.put("prediction", "1");
jsonMap.put("labelId", dataFileAnnotation.getLabelId().toString());
jsonMap.put("versionName", StringUtils.isEmpty(dataset.getCurrentVersionName())?"V0000" : dataset.getCurrentVersionName());
IndexRequest request = new IndexRequest(esIndex);
request.source(jsonMap);
request.id(dataVersionFile.getFileId().toString());
bulkProcessor.add(request);
} catch (Exception e) {
LogUtil.error(LogEnum.BIZ_DATASET, "上传es失败: {} ", e);
}
}
return ObjectUtils.isEmpty(dataFileAnnotation.getLabelId()) ? null : dataFileAnnotation;
} catch (Exception e) {
e.printStackTrace();
}
return null;
}

).filter(dataVersionFile -> !ObjectUtils.isEmpty(dataVersionFile)).collect(Collectors.toList());
Queue<Long> dataFileAnnotationIds = generatorKeyUtil.getSequenceByBusinessCode(BusinessConstant.DATA_FILE_ANNOTATION, dataFileAnnotations.size());
for (DataFileAnnotation dataFileAnnotation : dataFileAnnotations) {
dataFileAnnotation.setId(dataFileAnnotationIds.poll());
}
//写 dataset_file_annotation 表
dataFileAnnotationService.saveDataFileAnnotation(dataFileAnnotations);

dataFileAnnotations.clear();

dataVersionFileList.clear();
dataFilesList.clear();
}
success++;
}
bulkProcessor.flush();
return success;

}

/**
* 实际执行任务
*
* @param files 上传文件
* @param dataset 数据集
* @return Integer 执行次数
*/
private Integer runTask(List<File> files, Dataset dataset) throws Exception {
Integer success = MagicNumConstant.ZERO;

for (int i = 0; i < files.size(); i++) {
File file = files.get(i);
File parentFile = file.getParentFile();
String absolutePath = file.getAbsolutePath();
String rootName = BusinessConstant.FILE_SEPARATOR + dataset.getId() + BusinessConstant.FILE_SEPARATOR;
String fileName = StringUtils.substringAfter(absolutePath, File.separator + dataset.getId() + File.separator);
String targetFilePath = StringUtils.replaceChars(BusinessConstant.MINIO_ROOT_PATH + rootName + fileName, "\\", "/");

if (BusinessConstant.ANNOTATION.equals(parentFile.getName()) || (
BusinessConstant.ANNOTATION.equals(parentFile.getParentFile().getName()) &&
BusinessConstant.V0001.equals(parentFile.getName())
)) {
targetFilePath = buildFileName(targetFilePath);
JSONArray jsonArray = replaceJsonNode(file, labels, dataset);
String tempFilePath = absolutePath + "_temp.json";
FileUtil.appendString(jsonArray.toJSONString(), tempFilePath, "UTF-8");
minioUtil.upLoadFileByInputStream(targetFilePath, tempFilePath);
FileUtil.del(tempFilePath);
} else {
minioUtil.upLoadFile(absolutePath, targetFilePath);
}
ProcessBarUtil.processBar01(1L);
success++;
}

return success;

}


/**
* 构建文件名称
*
* @param fileName 文件名称
* @return 构建后文件名称
*/
public String buildFileName(String fileName) {
if (fileName.toLowerCase().endsWith(BusinessConstant.SUFFIX_JSON.toLowerCase())) {
fileName = StringUtils.substringBefore(fileName, BusinessConstant.SUFFIX_JSON.toLowerCase());
}
return fileName;
}
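
In effect this strips one trailing .json so an annotation object is stored in minio under the bare image name; e.g. (hypothetical object key):

    // "dataset/25/annotation/V0001/000000139.jpg.json" -> "dataset/25/annotation/V0001/000000139.jpg"
    String objectKey = buildFileName("dataset/25/annotation/V0001/000000139.jpg.json");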

/**
* 校验数据集ID
*
* @param scanner 控制台输入参数
*/
public long verificationDatasetId(Scanner scanner) {
boolean flag = false;
long datasetId = 0;
while (!flag) {
System.out.println("");
System.out.println("# 请选择预置数据集 (参考文档: " + dosAddress + ") #");
System.out.println("");
for (PresetDatasetEnum presetDatasetEnum : PresetDatasetEnum.values()) {
StringBuffer sb = new StringBuffer();
sb.append("# ").append(presetDatasetEnum.getType()).append(":").append(presetDatasetEnum.getDesc()).append(" ");
System.out.println(sb.toString());
}
String datasetIdStr = scanner.nextLine();

try {
datasetId = Long.parseLong(datasetIdStr.trim());
} catch (Exception e) {
log.error("");
PrintUtils.printLine(" Error: 数据集ID非法,请重新输入", PrintUtils.RED);
log.error("");
continue;
}

long finalDatasetId = datasetId;
Optional<PresetDatasetEnum> datasetEnum = Arrays.stream(PresetDatasetEnum.values()).filter(a -> a.getType().equals(String.valueOf(finalDatasetId))).findAny();
if (!datasetEnum.isPresent()) {
log.error("");
PrintUtils.printLine(" Error: 数据集ID不属于预置数据集ID", PrintUtils.RED);
log.error("");
continue;
}


Dataset dataset = findDataset(datasetId);
if (!Objects.isNull(dataset)) {
log.error("");
PrintUtils.printLine(" Error: 数据集已存在,请重新选择", PrintUtils.RED);
log.error("");
continue;
}


flag = true;
}

return datasetId;
}

/**
* 读取标签文件中标签数据
*
* @param file 标签文件
* @return List<DataLabel> 标签数据集合
*/
public List<DataLabel> readLabelContext(File file) throws IOException {
String fileContext = HandleFileUtil.readFile(file);
List<DataLabel> dataLabelList = JSONArray.parseArray(fileContext, DataLabel.class);
for (DataLabel dataLabel : dataLabelList) {
if (StringUtils.isEmpty(dataLabel.getName()) || StringUtils.isEmpty(dataLabel.getColor())) {
throw new ImportDatasetException(" 标签文件不规范,未能读到 'name' 或者 'color' ");
}
}
return dataLabelList;
}


/**
* 查询数据集
*
* @param datasetId 数据集Id
* @return Dataset 根据数据集ID查询返回的数据集
*/
private Dataset findDataset(Long datasetId) {
return datasetService.findDatasetByIdNormal(datasetId);
}


/**
* 校验文件路径
*
* @param scanner 输入控制台
* @param datasetId 数据集ID
* @return String 字符串
*/
public String verificationFilePathAndSaveSqlData(Scanner scanner, Long datasetId) throws Exception {
boolean flag = false;
String filePath = "";
while (!flag) {
System.out.println(" ");
System.out.println("# 请输入待上传本地预置数据集的完整路径 #");
filePath = scanner.nextLine();
File file = new File(filePath.trim());

if (!file.exists()) {
log.error("");
PrintUtils.printLine(" 【" + filePath + "】 文件路径不存在,请重新输入", PrintUtils.RED);
log.error("");
continue;
} else {
//校验文件目录是否合法并保存sql文件数据
log.info("........数据校验开始,请勿关闭窗口.................");
checkFileDirectoryAndSaveSqlData(filePath, datasetId);
log.info("........数据校验完成,即将执行下一步操作,请勿关闭窗口.................");
flag = true;
}
}
return filePath;
}


/**
* 读取并保存sql文件中数据
*
* @param file sql文件
*/
@Transactional(rollbackFor = Exception.class)
public void readAndSaveSqlData(File file) throws Exception {
List<String> list = HandleFileUtil.readFileInfo(file);
if (!CollectionUtils.isEmpty(list)) {
datasetService.saveBatch(list);
}
}


/**
* 检查并且替换JSON中的节点
*
* @param annotationFile 标注文件
* @param dataLabelList 数据集集合
* @param dataset 数据集实体
* @return 标签json数据
* @throws IOException
*/
public JSONArray replaceJsonNode(File annotationFile, List<DataLabel> dataLabelList, Dataset dataset) throws IOException {
JSONArray jsonArray = new JSONArray();
if (annotationFile.exists()) {
String annotationFileContext = HandleFileUtil.readFile(annotationFile);
jsonArray = JSONArray.parseArray(annotationFileContext);
if (!jsonArray.isEmpty()) {
replaceAllNode(jsonArray, dataLabelList, dataset, annotationFile.getName());
}
}
return jsonArray;
}

/**
* 替换节点值
*
* @param jsonArray 标注文件集合
* @param dataLabelList 标签集合
* @param dataset 数据集实体
* @param fileName 文件名称
*/
public void replaceAllNode(JSONArray jsonArray, List<DataLabel> dataLabelList, Dataset dataset, String fileName) {
for (int i = MagicNumConstant.ZERO; i < jsonArray.size(); i++) {
JSONObject jsonObject = jsonArray.getJSONObject(i);
jsonObject.put("category_id", findDataLabelId(dataLabelList, jsonObject.get("name").toString()));
FileAnnotationDTO annotationDTO = jsonObject.toJavaObject(FileAnnotationDTO.class);
fileAnnotationMap.put(buildFileName(fileName), annotationDTO);
jsonObject.put("category_id",jsonObject.get("name"));
jsonObject.remove("name");
}
}

/**
* 查询需要替换的节点
*
* @param dataLabelList 标签集合
* @param objectValue 替换的节点值
* @return long 替换标签的Id
*/
public long findDataLabelId(List<DataLabel> dataLabelList, String objectValue) {
Optional<DataLabel> matchedDataLabel = dataLabelList.stream().filter(dataLabel -> objectValue.equals(dataLabel.getName())).findAny();
if (!matchedDataLabel.isPresent()) {
throw new ImportDatasetException(" 标注文件中name的值不存在于标签中!");
}
return matchedDataLabel.get().getId();
}


/**
* 校验文件目录
*
* @param strPath 文件地址
* @param datasetId 数据集ID
*/
public void checkFileDirectoryAndSaveSqlData(String strPath, Long datasetId) throws Exception {
File f = new File(strPath);
if (f.isDirectory()) {
File[] files = f.listFiles();
if (files == null || Objects.requireNonNull(files).length == 0) {
throw new ImportDatasetException(" 文件目录 【" + strPath + "】下不存在文件 ");
}
for (File file : files) {
//是文件夹则一层剥一层的去校验
if (file.isDirectory()) {
//校验文件目录
checkoutDirectoryName(file);
checkFileDirectoryAndSaveSqlData(file.getPath(), datasetId);
// /Downloads/COCO2017-val/1/ 在此目录文件夹下
// annotation dataset.sql label_COCO2017-val.json origin versionFile
} else if (datasetIds.contains(file.getParentFile().getName())) {
//读取并保存 sql文件
if (file.getName().toLowerCase().endsWith(BusinessConstant.SUFFIX_SQL.toLowerCase())) {
readAndSaveSqlData(file);
}
// 判断是否为 .json 结尾的标签文件
if (file.getName().toLowerCase().endsWith(BusinessConstant.SUFFIX_JSON.toLowerCase())) {
labels = readLabelContext(file);
if (!CollectionUtils.isEmpty(labels)) {
dataLabelService.saveBatchDataLabel(labels);
List<DatasetDataLabel> dataLabels = labels.stream().map(a ->
DatasetDataLabel.builder().datasetId(datasetId).labelId(a.getId()).build()).collect(Collectors.toList());
datasetDataLabelService.saveBatchDatasetDataLabel(dataLabels);
}
}
// /Downloads/COCO2017-val/1/ 不在此目录文件夹下(在/1/目录下的子文件夹中)
} else if (!datasetIds.contains(file.getParentFile().getName())) {
///Downloads/COCO2017-val/1/origin/
File parentFile = file.getParentFile();
// 在 origin 目录中
if (
BusinessConstant.IMAGE_ORIGIN.equals(parentFile.getName()) &&
String.valueOf(datasetId).equals(parentFile.getParentFile().getName())
) {
originFiles.add(file);
} else {
annotationFiles.add(file);
}
//文件计数
fileCount.getAndIncrement();
}


}
}
}


/**
* 校验文件目录名称
*
* @param file 文件
*/
public void checkoutDirectoryName(File file) {
//获取文件名
String fileName = file.getName();
//获取文件路径
String path = file.getPath();
//获取当前文件所在文件夹的名称
String parentFileName = file.getParentFile().getName();
//筛选出当前文件夹中符合预置数据集名称的文件
Optional<PresetDatasetEnum> optional = Arrays.stream(PresetDatasetEnum.values()).filter(a -> a.getType().equals(parentFileName)).findAny();
//文件路径如果输入 /Downloads/COCO2017-val/1/xxx/xxx 则错误
//以下均为文件路径的校验
if (optional.isPresent() &&
!(BusinessConstant.IMAGE_ORIGIN.equals(fileName) || BusinessConstant.VERSION_FILE.equals(fileName)
|| BusinessConstant.ANNOTATION.equals(fileName) || BusinessConstant.VIDEO.equals(fileName))
) {
log.error("");
PrintUtils.printLine(" 【" + path + "】 文件路径不合法,请重新输入", PrintUtils.RED);
log.error("");
} else if (BusinessConstant.ANNOTATION.equals(parentFileName) && !(BusinessConstant.V0001.equals(fileName))) {
log.error("");
PrintUtils.printLine(" 【" + path + "】 文件路径不合法,请重新输入", PrintUtils.RED);
log.error("");

} else if (BusinessConstant.VERSION_FILE.equals(parentFileName) && !(BusinessConstant.V0001.equals(fileName))) {
log.error("");
PrintUtils.printLine(" 【" + path + "】 文件路径不合法,请重新输入", PrintUtils.RED);
log.error("");

} else if (BusinessConstant.OFRECORD.equals(parentFileName) && !(BusinessConstant.TRAIN.equals(fileName))) {
log.error("");
PrintUtils.printLine(" 【" + path + "】 文件路径不合法,请重新输入", PrintUtils.RED);
log.error("");

} else if (BusinessConstant.V0001.equals(parentFileName) &&
!(BusinessConstant.IMAGE_ORIGIN.equals(fileName) || BusinessConstant.ANNOTATION.equals(fileName) || BusinessConstant.OFRECORD.equals(fileName))
) {
log.error("");
PrintUtils.printLine(" 【" + path + "】 文件路径不合法,请重新输入", PrintUtils.RED);
log.error("");

}
}
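
Read together, these checks imply a preset-dataset layout like the following, reconstructed from the constants above and the COCO2017-val comments in checkFileDirectoryAndSaveSqlData (treat it as an illustrative sketch; a video/ directory may appear instead of origin/ for video datasets):

    /Downloads/COCO2017-val/1/
        dataset.sql
        label_COCO2017-val.json
        origin/                  原始文件
        annotation/V0001/        标注文件
        versionFile/V0001/
            origin/
            annotation/
            ofrecord/train/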


/**
* 根据文件路径删除minio文件数据
*
* @param uri 文件路径
*/
private void delDatasetMinioInfo(String uri) {
if (!Objects.isNull(uri)) {
String path = minioConfig.getNfsRootPath() + minioConfig.getBucketName() + StrUtil.SLASH + uri;
deleteFileByCMD(path);
}

}

/**
* 删除数据集信息
*
* @param datasetId 数据集ID
* @param dataType 数据类型
*/
@Transactional(rollbackFor = Exception.class)
public void delDatasetInfoById(long datasetId, Integer dataType) {
datasetService.deleteDatasetById(datasetId);
dataFileService.deleteFileByDatasetId(datasetId);
dataVersionFileService.deleteVersionByDatasetId(datasetId);
dataLabelService.deleteLabelByDatasetId(datasetId);
datasetDataLabelService.deleteDatasetLabelByDatasetId(datasetId);
if (DatatypeEnum.TXT.getValue().compareTo(dataType) == 0) {
dataFileAnnotationService.delDataFileAnnotationById(datasetId);
}
}

/**
* 按名称排序
*
* @param list 文件集合
*/
private void sortByName(List<File> list) {
for (int i = 0; i < list.size() - 1; i++) {
for (int j = 1; j < list.size() - i; j++) {
File a;
if (compareByName(list.get(j - 1), list.get(j)) > 0) {
a = list.get(j - 1);
list.set((j - 1), list.get(j));
list.set(j, a);
}
}
}
}

/**
* 文件名称排序
*
* @param fileOne 文件名称
* @param fileTwo 文件名称
* @return 排序大小
*/
private int compareByName(File fileOne, File fileTwo) {
return buildImgName(fileOne).compareTo(buildImgName(fileTwo));
}


/**
* 构建图片名称
*
* @param file 文件
* @return 图片名称
*/
private Integer buildImgName(File file) {
int value = MagicNumConstant.ZERO;
try {
value = Integer.parseInt(StringUtils.substringBefore(StringUtils.substringAfterLast(file.getName(), "_"), "."));
} catch (Exception e) {
LogUtil.error(LogEnum.BIZ_DATASET, "文件: 【" + file.getName() + "】名称格式错误");
}
return value;
}
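
buildImgName parses the number between the last '_' and the extension, so video frames sort numerically: capture_2.jpg sorts before capture_10.jpg, where a plain lexicographic sort would invert them (file names hypothetical). The hand-rolled bubble sort in sortByName could equally be delegated to the JDK:

    // same ordering as sortByName, via java.util.Comparator and List.sort
    list.sort(Comparator.comparing(this::buildImgName));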

/**
* 文件删除
*
* @param path 删除路径
*/
public void deleteFileByCMD(String path) {
String sourcePath = formatPath(path);
//判断该路径是否存在文件或文件夹
String emptyDir = "";
String nfsBucket = minioConfig.getNfsRootPath() + minioConfig.getBucketName() + StrUtil.SLASH;
sourcePath = sourcePath.endsWith(StrUtil.SLASH) ? sourcePath : sourcePath + StrUtil.SLASH;
//校验回收文件是否存在以及回收文件必须至少在当前环境目录下还有一层目录,如:/nfs/dubhe-test/xxxx/
try {
if (sourcePath.startsWith((nfsBucket))
&& sourcePath.length() > nfsBucket.length()) {
emptyDir = "/tmp/empty_" + RandomUtil.randomNumbers(10) + StrUtil.SLASH;
LogUtil.info(LogEnum.BIZ_DATASET, "recycle task sourcePath:{},emptyDir:{}", sourcePath, emptyDir);
String exec = "/bin/sh";
String c = "-c";
if (System.getProperty("os.name").toLowerCase().contains("windows")) {
exec = "cmd.exe";
c = "/C";
}
Process process = Runtime.getRuntime().exec(new String[]{exec, c,
String.format(BusinessConstant.DEL_COMMAND, minioConfig.getServerUserName(), minioConfig.getEndpoint(), emptyDir, emptyDir, sourcePath, emptyDir, sourcePath)});
recycleSourceIsOk(process);
}
} catch (Exception e) {
LogUtil.error(LogEnum.BIZ_DATASET, "minio 文件流删除文件失败: {} ", e);
}
}
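
The empty directory created here points at the usual rsync fast-delete trick. BusinessConstant.DEL_COMMAND itself is not shown in this diff; judging by its seven format arguments (user, endpoint, emptyDir three times, sourcePath twice), it is presumably a template along these lines — a guess, not the committed constant:

    // hypothetical shape of BusinessConstant.DEL_COMMAND (the real template is not in this hunk)
    String DEL_COMMAND = "ssh %s@%s \"mkdir -p %s && rsync --delete-before -a -d %s %s && rm -rf %s %s\"";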

/**
* 判断执行服务器命名是否成功退出
*
* @param process Process对象
* @return boolean linux命令是否执行成功正常退出
*/
public boolean recycleSourceIsOk(Process process) {
InputStreamReader stream = new InputStreamReader(process.getErrorStream());
BufferedReader reader = new BufferedReader(stream);
StringBuffer errMessage = new StringBuffer();
boolean recycleIsOk = true;
try {
String line;
while ((line = reader.readLine()) != null) {
errMessage.append(line);
}
int status = process.waitFor();
if (status != 0) {
LogUtil.error(LogEnum.BIZ_DATASET, "文件流删除文件失败: {} ", errMessage.toString());
recycleIsOk = false;
}
} catch (Exception e) {
LogUtil.error(LogEnum.BIZ_DATASET, "文件流删除文件失败: {} ", e);
recycleIsOk = false;
} finally {
IOUtil.close(reader, stream);
}
return recycleIsOk;
}


/**
* 替换路径中多余的 "/"
*
* @param path 路径
* @return String
*/
public String formatPath(String path) {
if (!StringUtils.isEmpty(path)) {
return path.replaceAll("///*", FILE_SEPARATOR);
}
return path;
}
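
A quick check of the regex (inputs hypothetical): runs of two or more slashes collapse to a single separator, while single slashes pass through unchanged.

    formatPath("/nfs//dubhe-prod///25/");  // -> "/nfs/dubhe-prod/25/"
    formatPath("/nfs/dubhe-prod/25/");     // unchanged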


}

+ 43
- 0
dataset-util/src/main/java/org/dubhe/datasetutil/service/DataFileAnnotationService.java View File

@@ -0,0 +1,43 @@
/**
* Copyright 2020 Zhejiang Lab. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* =============================================================
*/
package org.dubhe.datasetutil.service;

import org.dubhe.datasetutil.domain.entity.DataFileAnnotation;
import org.springframework.transaction.annotation.Transactional;

import java.util.List;

/**
* @description nlp文件 服务实现类
* @date 2021-01-07
*/
public interface DataFileAnnotationService {

/**
* 批量保存nlp中间表
*
* @param dataFileAnnotations nlp集合
*/
void saveDataFileAnnotation(List<DataFileAnnotation> dataFileAnnotations);

/**
* 删除数据集文件标注数据通过数据集ID
*
* @param datasetId 数据集ID
*/
void delDataFileAnnotationById(long datasetId);
}

+ 13
- 0
dataset-util/src/main/java/org/dubhe/datasetutil/service/DataFileService.java View File

@@ -32,4 +32,17 @@ public interface DataFileService {
*/
void saveBatchDataFile(List<DataFile> dataFiles);

/**
* 创建新表
*
* @param tableName 表名称
*/
void createNewTable(String tableName);

/**
* 删除数据集文件通过数据集ID
*
* @param datasetId 数据集ID
*/
void deleteFileByDatasetId(long datasetId);
}

+ 18
- 0
dataset-util/src/main/java/org/dubhe/datasetutil/service/DataLabelService.java View File

@@ -19,6 +19,7 @@ package org.dubhe.datasetutil.service;
import org.dubhe.datasetutil.domain.entity.DataLabel;

import java.util.List;
import java.util.Map;

/**
* @description 数据集标签服务接口
@@ -31,4 +32,21 @@ public interface DataLabelService {
* @param listDataLabel 数据集标签集合
*/
void saveBatchDataLabel(List<DataLabel> listDataLabel);


/**
* 根据预置标签组获取预置标签
*
* @param groupIds 预置标签组IDS
* @return 预置标签map key: 预置标签名称 value:预置标签ID
*/
Map<String, Long> getPresetLabelList(List<Long> groupIds);


/**
* 删除标签
*
* @param datasetId 数据集ID
*/
void deleteLabelByDatasetId(long datasetId);
}

+ 10
- 1
dataset-util/src/main/java/org/dubhe/datasetutil/service/DataSequenceService.java View File

@@ -53,4 +53,13 @@ public interface DataSequenceService {
* @param tableId 表ID
*/
void createTable(String tableId);

/**
* 扩容可用数量
*
* @param businessCode 业务编码
* @return DataSequence 数据ID序列
*/
DataSequence expansionUsedNumber(String businessCode);

}

+ 16
- 1
dataset-util/src/main/java/org/dubhe/datasetutil/service/DataVersionFileService.java View File

@@ -16,7 +16,7 @@
*/
package org.dubhe.datasetutil.service;

import org.dubhe.datasetutil.domain.entity.DataVersionFile;

import java.util.List;

@@ -31,4 +31,19 @@ public interface DataVersionFileService {
* @param dataVersionFiles 数据集文件数据集合
*/
void saveBatchDataFileVersion(List<DataVersionFile> dataVersionFiles);


/**
* 创建新表
*
* @param tableName 表名称
*/
void createNewTable(String tableName);

/**
* 删除数据集版本通过数据集ID
*
* @param datasetId 数据集ID
*/
void deleteVersionByDatasetId(long datasetId);
}

+ 7
- 0
dataset-util/src/main/java/org/dubhe/datasetutil/service/DatasetDataLabelService.java View File

@@ -31,4 +31,11 @@ public interface DatasetDataLabelService {
* @param listDatasetDataLabel 数据集标签集合
*/
void saveBatchDatasetDataLabel(List<DatasetDataLabel> listDatasetDataLabel);

/**
* 删除数据集标签关系通过数据集ID
*
* @param datasetId 数据集ID
*/
void deleteDatasetLabelByDatasetId(long datasetId);
}

+ 49
- 0
dataset-util/src/main/java/org/dubhe/datasetutil/service/DatasetService.java View File

@@ -18,6 +18,8 @@ package org.dubhe.datasetutil.service;

import org.dubhe.datasetutil.domain.entity.Dataset;

import java.util.List;

/**
* @description 数据集服务
* @date 2020-9-17
@@ -39,6 +41,14 @@ public interface DatasetService {
*/
Dataset findDatasetById(Long datasetId);

/**
* 根据ID查询数据集
*
* @param datasetId 数据集Id
* @return Dataset 数据集
*/
Dataset queryDatasetById(Long datasetId);

/**
* 更新数据集状态
*
@@ -62,4 +72,43 @@ public interface DatasetService {
* @return int 数量
*/
int findDataFileById(Long datasetId);

/**
* 根据Id查询数据集
*
* @param datasetId 数据集ID
* @return Dataset 数据集
*/
Dataset findDatasetByIdNormal(Long datasetId);


/**
* 新增数据集
*
* @param insertSql sql语句
*/
void saveBatch(List<String> insertSql);

/**
* 删除数据集通过数据集ID
*
* @param datasetId 数据集ID
*/
void deleteDatasetById(long datasetId);

/**
* 更新数据集状态
*
* @param dataset 数据集
*/
void updateDatasetStatusIsImport(Dataset dataset);

/**
* 更新数据集
*
* @param dataset 数据集信息
* @return int 数量
*/
int updateDataset(Dataset dataset);

}

+ 15
- 0
dataset-util/src/main/java/org/dubhe/datasetutil/service/DatasetVersionService.java View File

@@ -0,0 +1,15 @@
package org.dubhe.datasetutil.service;

import org.dubhe.datasetutil.domain.entity.DatasetVersion;

/**
* @description 数据集版本服务
* @date 2021-03-23
*/
public interface DatasetVersionService {

DatasetVersion getByDatasetIdAndVersionNum(Long datasetId, String versionNum);

void insertVersion(Long datasetId, String versionNum, String versionNote);

}

+ 53
- 0
dataset-util/src/main/java/org/dubhe/datasetutil/service/impl/DataFileAnnotationServiceImpl.java View File

@@ -0,0 +1,53 @@
/**
* Copyright 2020 Zhejiang Lab. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* =============================================================
*/
package org.dubhe.datasetutil.service.impl;

import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import org.dubhe.datasetutil.dao.DataFileAnnotationMapper;
import org.dubhe.datasetutil.domain.entity.DataFileAnnotation;
import org.dubhe.datasetutil.service.DataFileAnnotationService;
import org.springframework.stereotype.Service;

import java.util.List;

/**
* @description nlp文件 服务实现类
* @date 2021-01-07
*/
@Service
public class DataFileAnnotationServiceImpl extends ServiceImpl<DataFileAnnotationMapper, DataFileAnnotation> implements DataFileAnnotationService {

/**
* 批量保存nlp中间表
*
* @param dataFileAnnotations nlp集合
*/
@Override
public void saveDataFileAnnotation(List<DataFileAnnotation> dataFileAnnotations) {
baseMapper.saveDataFileAnnotation(dataFileAnnotations);
}

/**
* 删除数据集文件标注数据通过数据集ID
*
* @param datasetId 数据集ID
*/
@Override
public void delDataFileAnnotationById(long datasetId) {
baseMapper.delDataFileAnnotationById(datasetId);
}
}

+ 30
- 0
dataset-util/src/main/java/org/dubhe/datasetutil/service/impl/DataFileServiceImpl.java View File

@@ -57,4 +57,34 @@ public class DataFileServiceImpl extends ServiceImpl<DataFileMapper, DataFile> i
}
}



/**
* 创建新表
*
* @param tableName 表名称
*/
@Override
public void createNewTable(String tableName){
int count = baseMapper.selectCountByTableName(tableName);
if(count == 0 ){
if((BusinessConstant.DATASET_FILE+BusinessConstant.TABLE_SUFFIX).equals(tableName)){
baseMapper.createNewTableOne();
}else {
baseMapper.createNewTableTwo();
}
}
}


/**
* 删除数据集文件通过数据集ID
*
* @param datasetId 数据集ID
*/
@Override
public void deleteFileByDatasetId(long datasetId) {
baseMapper.deleteFileByDatasetId(datasetId);
}

}

+ 36
- 1
dataset-util/src/main/java/org/dubhe/datasetutil/service/impl/DataLabelServiceImpl.java View File

@@ -23,15 +23,18 @@ import org.dubhe.datasetutil.domain.entity.DataLabel;
import org.dubhe.datasetutil.service.DataLabelService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.springframework.util.CollectionUtils;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
* @description 数据集标签服务接口实现
* @date 2020-10-14
*/
@Service
public class DataLabelServiceImpl implements DataLabelService {

@Autowired
private DataLabelMapper dataLabelMapper;
@@ -53,4 +56,36 @@ public class DataLabelServiceImpl implements DataLabelService {
dataLabelMapper.saveBatchDataLabel(listDataLabel);
}
}



/**
* 根据预置标签组获取预置标签
*
* @param groupIds 预置标签组IDS
* @return 预置标签 key: 预置标签名称 value:预置标签ID
*/
@Override
public Map<String, Long> getPresetLabelList(List<Long> groupIds) {
List<DataLabel> labels = dataLabelMapper.getPresetLabelList(groupIds);
Map<String, Long> map = new HashMap<>(labels.size());
if(!CollectionUtils.isEmpty(labels)){
labels.forEach(a->{
map.put(a.getName(),a.getId());
});
}
return map;
}

/**
* 删除标签关系通过数据集ID
*
* @param datasetId 数据集ID
*/
@Override
public void deleteLabelByDatasetId(long datasetId) {
dataLabelMapper.deleteLabelByDatasetId(datasetId);
}


}

+ 24
- 2
dataset-util/src/main/java/org/dubhe/datasetutil/service/impl/DataSequenceServiceImpl.java View File

@@ -16,11 +16,13 @@
*/
package org.dubhe.datasetutil.service.impl;

import org.dubhe.datasetutil.common.exception.DataSequenceException;
import org.dubhe.datasetutil.dao.DataSequenceMapper;
import org.dubhe.datasetutil.domain.entity.DataSequence;
import org.dubhe.datasetutil.service.DataSequenceService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;


/**
@@ -35,7 +37,7 @@ public class DataSequenceServiceImpl implements DataSequenceService {

@Override
public DataSequence getSequence(String businessCode) {
return dataSequenceMapper.selectDataSequenceById(dataSequenceMapper.selectByBusiness(businessCode).getId());
}

/**
@@ -45,6 +47,7 @@ public class DataSequenceServiceImpl implements DataSequenceService {
* @return int 数量
*/
@Override
@Transactional(rollbackFor = Exception.class)
public int updateSequenceStart(String businessCode) {
return dataSequenceMapper.updateStartByBusinessCode(businessCode);
}
@@ -75,4 +78,23 @@ public class DataSequenceServiceImpl implements DataSequenceService {
String oldTableName = tableName.substring(0,tableName.lastIndexOf("_"));
dataSequenceMapper.createNewTable(tableName,oldTableName);
}


/**
* 扩容可用数量
*
* @param businessCode 业务编码
* @return DataSequence 数据ID序列
*/
@Override
@Transactional(rollbackFor = Exception.class)
public DataSequence expansionUsedNumber(String businessCode) {
DataSequence dataSequenceNew = getSequence(businessCode);
if (dataSequenceNew == null || dataSequenceNew.getStart() == null || dataSequenceNew.getStep() == null) {
throw new DataSequenceException("配置出错,请检查data_sequence表配置");
}
updateSequenceStart(businessCode);
return dataSequenceNew;
}

}

+ 36
- 3
dataset-util/src/main/java/org/dubhe/datasetutil/service/impl/DataVersionFileServiceImpl.java View File

@@ -17,10 +17,12 @@
package org.dubhe.datasetutil.service.impl;

import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import org.dubhe.datasetutil.common.constant.BusinessConstant;
import org.dubhe.datasetutil.dao.DataVersionFileMapper;
import org.dubhe.datasetutil.domain.entity.DataVersionFile;
import org.dubhe.datasetutil.service.DataVersionFileService;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;

import java.util.List;

@@ -29,7 +31,7 @@ import java.util.List;
* @date 2020-09-17
*/
@Service
public class DataVersionFileServiceImpl extends ServiceImpl<DataVersionFileMapper, DataVersionFile> implements DataVersionFileService {


/**
@@ -38,7 +40,38 @@ public class DataVersionFileServiceImpl extends ServiceImpl <DataVersionFileMap
* @param listDataVersionFile 数据集文件数据集合
*/
@Override
@Transactional(rollbackFor = Exception.class)
public void saveBatchDataFileVersion(List<DataVersionFile> listDataVersionFile) {
baseMapper.saveBatchDataFileVersion(listDataVersionFile);
baseMapper.saveBatchDataFileVersion(listDataVersionFile);
}


/**
* 创建新表
*
* @param tableName 表名称
*/
@Override
public void createNewTable(String tableName){
int count = baseMapper.selectCountByTableName(tableName);
if(count == 0){
if((BusinessConstant.DATA_DATASET_VERSION_FILE+BusinessConstant.TABLE_SUFFIX).equals(tableName)){
baseMapper.createNewTableOne();
}else {
baseMapper.createNewTableTwo();
}

}
}

/**
* 删除数据集版本通过数据集ID
*
* @param datasetId 数据集ID
*/
@Override
public void deleteVersionByDatasetId(long datasetId) {
baseMapper.deleteVersionByDatasetId(datasetId);
baseMapper.deleteVersionFileByDatasetId(datasetId);
}
}

+ 13
- 0
dataset-util/src/main/java/org/dubhe/datasetutil/service/impl/DatasetDataLabelServiceImpl.java View File

@@ -54,4 +54,17 @@ public class DatasetDataLabelServiceImpl implements DatasetDataLabelService {
datasetDataLabelMapper.saveBatchDatasetDataLabel(listDatasetDataLabel);
}
}


/**
* 删除数据集标签关系通过数据集ID
*
* @param datasetId 数据集ID
*/
@Override
public void deleteDatasetLabelByDatasetId(long datasetId) {
datasetDataLabelMapper.deleteDatasetLabelByDatasetId(datasetId);
}


}

+ 70
- 0
dataset-util/src/main/java/org/dubhe/datasetutil/service/impl/DatasetServiceImpl.java View File

@@ -23,6 +23,10 @@ import org.dubhe.datasetutil.domain.entity.Dataset;
import org.dubhe.datasetutil.service.DatasetService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.springframework.util.CollectionUtils;

import java.util.List;
import java.util.Objects;

/**
* @description 数据集 服务实现类
@@ -56,6 +60,11 @@ public class DatasetServiceImpl extends ServiceImpl<DatasetMapper, Dataset> impl
return datasetMapper.findDatasetById(datasetId);
}

@Override
public Dataset queryDatasetById(Long datasetId) {
return baseMapper.selectById(datasetId);
}

/**
* 更新数据集状态
*
@@ -89,4 +98,65 @@ public class DatasetServiceImpl extends ServiceImpl<DatasetMapper, Dataset> impl
public int findDataFileById(Long datasetId) {
return datasetMapper.findDataFileById(datasetId);
}

/**
* 根据ID 查询数据集
*
* @param datasetId 数据集ID
* @return Dataset 数据集
*/
@Override
public Dataset findDatasetByIdNormal(Long datasetId) {
return datasetMapper.findDatasetByIdNormal(datasetId);
}

/**
* 新增数据集
*
* @param insertSql sql语句
*/
@Override
public void saveBatch(List<String> insertSql) {
if(!CollectionUtils.isEmpty(insertSql)){
insertSql.forEach(sql->{
if(!Objects.isNull(sql)){
baseMapper.saveBatch(sql);
}
});
}

}

/**
* 删除数据集通过数据集ID
*
* @param datasetId 数据集ID
*/
@Override
public void deleteDatasetById(long datasetId) {
baseMapper.deleteDatasetById(datasetId);
}

/**
* 更新数据集状态
*
* @param dataset 数据集
*/
@Override
public void updateDatasetStatusIsImport(Dataset dataset) {
dataset.setStatus(DataStateCodeConstant.IN_THE_IMPORT_STATE);
datasetMapper.updateById(dataset);
}

/**
* 更新数据集
*
* @param dataset 数据集信息
* @return int 数量
*/
@Override
public int updateDataset(Dataset dataset) {
return datasetMapper.updateById(dataset);
}

}

+ 30
- 0
dataset-util/src/main/java/org/dubhe/datasetutil/service/impl/DatasetVersionServiceImpl.java View File

@@ -0,0 +1,30 @@
package org.dubhe.datasetutil.service.impl;

import com.baomidou.mybatisplus.core.conditions.query.QueryWrapper;
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import org.dubhe.datasetutil.dao.DatasetVersionMapper;
import org.dubhe.datasetutil.domain.entity.DatasetVersion;
import org.dubhe.datasetutil.service.DatasetVersionService;
import org.springframework.stereotype.Service;

/**
* @description 版本数据处理
* @date 2021-03-23
*/
@Service
public class DatasetVersionServiceImpl extends ServiceImpl<DatasetVersionMapper, DatasetVersion> implements DatasetVersionService {

@Override
public DatasetVersion getByDatasetIdAndVersionNum(Long datasetId, String versionNum) {
QueryWrapper<DatasetVersion> queryWrapper = new QueryWrapper<DatasetVersion>();
queryWrapper.eq("dataset_id", datasetId);
queryWrapper.eq("version_name", versionNum);
return baseMapper.selectOne(queryWrapper);
}

@Override
public void insertVersion(Long datasetId, String versionNum, String versionNote) {
baseMapper.insert(new DatasetVersion(datasetId, versionNum, versionNote));
}

}

+ 0
- 32
dataset-util/src/main/resources/application-dev.yml View File

@@ -1,32 +0,0 @@
#应用名称
spring:
datasource:
#数据源URL
url: jdbc:mysql://127.0.0.1:3306/dubhe-dev?serverTimezone=Asia/Shanghai&characterEncoding=utf8&useSSL=false&allowMultiQueries=true&useInformationSchema=true
#数据源用户名
username: *
#数据源密码
password: *

#配置Sharding-JDBC数据源
shardingsphere:
datasource:
master:
#数据源URL
url: jdbc:log4jdbc:mysql://127.0.0.1:3306/dubhe-dev?serverTimezone=Asia/Shanghai&characterEncoding=utf8&useSSL=false&allowMultiQueries=true&useInformationSchema=true
#数据源用户名
username: *
#数据源密码
password: *

minio:
#minio地址
endpoint: 127.0.0.1
#minio端口号
port: 9000
#minio accessKey
accessKey: abcd
#minio secretKey
secretKey: abcd
#minio bucketName
bucketName: dubhe-dev

+ 41
- 0
dataset-util/src/main/resources/application-prod.yml View File

@@ -0,0 +1,41 @@
#应用名称
spring:
#配置Sharding-JDBC数据源
shardingsphere:
datasource:
master:
#数据源URL
url: jdbc:log4jdbc:mysql://127.0.0.1:3306/dubhe-cloud-prod?serverTimezone=Asia/Shanghai&characterEncoding=utf8&useSSL=false&allowMultiQueries=true&useInformationSchema=true
#数据源用户名
username: root
#数据源密码
password: 123456

minio:
#minio地址
endpoint: 127.0.0.1
#minio端口号
port: 9000
#minio accessKey
accessKey: admin
#minio secretKey
secretKey: 123@abc.com
#minio bucketName
bucketName: dubhe-prod
#minio nfsRootPath
nfsRootPath: /nfs/
# 文件存储服务器用户名
serverUserName: root
# 数据集文档说明地址
dosAddress: http://docs.dubhe.ai/docs/module/dataset/preset-dataset/
#图片后缀名
suffix:
imageFormat: .jpg,.png,.bmp,.jpeg
txtFormat: .txt
# ES服务地址及端口
es:
host: 127.0.0.1
serverPort: 9200
transportPort: 9300
clusterName: kubernetes-logging
index: dataset_text

+ 0
- 32
dataset-util/src/main/resources/application-test.yml View File

@@ -1,32 +0,0 @@
# Application name
spring:
  datasource:
    # Data source URL
    url: jdbc:mysql://127.0.0.1:3306/dubhe-test?serverTimezone=Asia/Shanghai&characterEncoding=utf8&useSSL=false&allowMultiQueries=true&useInformationSchema=true
    # Data source username
    username: *
    # Data source password
    password: *

  # Sharding-JDBC data source configuration
  shardingsphere:
    datasource:
      master:
        # Data source URL
        url: jdbc:log4jdbc:mysql://127.0.0.1:3306/dubhe-test?serverTimezone=Asia/Shanghai&characterEncoding=utf8&useSSL=false&allowMultiQueries=true&useInformationSchema=true
        # Data source username
        username: *
        # Data source password
        password: *

minio:
  # MinIO endpoint
  endpoint: 127.0.0.1
  # MinIO port
  port: 9000
  # MinIO accessKey
  accessKey: abcd
  # MinIO secretKey
  secretKey: abcd
  # MinIO bucketName
  bucketName: dubhe-test

+ 36
- 29
dataset-util/src/main/resources/application.yml

@@ -1,33 +1,11 @@
spring:
  main:
    allow-bean-definition-overriding: true
  profiles:
    active: dev
    active: prod

  application:
    name: dataset-util
  datasource:
    type: com.alibaba.druid.pool.DruidDataSource
    driver-class-name: com.mysql.jdbc.Driver

    # Initial pool size
    initial-size: 3
    # Minimum idle connections
    min-idle: 3
    # Maximum active connections
    max-active: 15
    # Connection acquisition timeout (ms)
    max-wait: 5000
    # Interval between connection validity checks (ms)
    time-between-eviction-runs-millis: 90000
    # Maximum idle time before eviction (ms)
    min-evictable-idle-time-millis: 1800000
    test-while-idle: true
    test-on-borrow: false
    test-on-return: false

    validation-query: select 1
  # Allow later beans to override earlier beans with the same name
  main:
    allow-bean-definition-overriding: true
  # Sharding-JDBC data source names; multiple may be configured
  shardingsphere:
    datasource:
@@ -37,19 +15,40 @@ spring:
        driver-class-name: net.sf.log4jdbc.sql.jdbcapi.DriverSpy
        test-while-idle: true
        validation-query: select 1
        # Initial pool size
        initial-size: 3
        # Minimum idle connections
        min-idle: 3
        # Maximum active connections
        max-active: 15
        # Connection acquisition timeout (ms)
        max-wait: 5000
        # Interval between connection validity checks (ms)
        time-between-eviction-runs-millis: 90000
        # Maximum idle time before eviction (ms)
        min-evictable-idle-time-millis: 1800000
        test-on-borrow: false
        test-on-return: false

    # Primary key id generation strategy for the data_dataset tables
    sharding:
      tables:
        data_file:
          actual-data-nodes: master.data_file_$->{1..100000}
          actual-data-nodes: master.data_file_$->{1..300}
          table-strategy:
            standard:
              sharding-column: dataset_id
              precise-algorithm-class-name: org.dubhe.datasetutil.common.util.MyPreciseShardingAlgorithm

        data_dataset_version_file:
          actual-data-nodes: master.data_dataset_version_file_$->{1..100000}
          actual-data-nodes: master.data_dataset_version_file_$->{1..300}
          table-strategy:
            standard:
              sharding-column: dataset_id
              precise-algorithm-class-name: org.dubhe.datasetutil.common.util.MyPreciseShardingAlgorithm

        data_file_annotation:
          actual-data-nodes: master.data_file_annotation_$->{1..300}
          table-strategy:
            standard:
              sharding-column: dataset_id
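The actual-data-nodes range $->{1..300} means MyPreciseShardingAlgorithm has to map each dataset_id onto one of 300 physical tables. A minimal sketch of such an algorithm, assuming ShardingSphere 4.x's PreciseShardingAlgorithm SPI and a simple modulo mapping (the real implementation in this repository may differ):

package org.dubhe.datasetutil.common.util;

import java.util.Collection;

import org.apache.shardingsphere.api.sharding.standard.PreciseShardingAlgorithm;
import org.apache.shardingsphere.api.sharding.standard.PreciseShardingValue;

/**
 * Illustrative sketch only; MyPreciseShardingAlgorithm may use a different mapping.
 */
public class DatasetIdShardingSketch implements PreciseShardingAlgorithm<Long> {

    private static final long TABLE_COUNT = 300L; // matches $->{1..300} above

    @Override
    public String doSharding(Collection<String> availableTargetNames,
                             PreciseShardingValue<Long> shardingValue) {
        // e.g. dataset_id 17 -> data_file_18 under this assumed mapping
        long suffix = shardingValue.getValue() % TABLE_COUNT + 1;
        String target = shardingValue.getLogicTableName() + "_" + suffix;
        for (String tableName : availableTargetNames) {
            if (tableName.equals(target)) {
                return tableName;
            }
        }
        throw new IllegalStateException("No physical table matches " + target);
    }
}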
@@ -60,11 +59,19 @@ spring:

minio:
  secure: false
  blockingCoefficient: 0.5

# logback
logging.config:
  classpath:logback-spring-dev.xml
  classpath:logback-spring-${spring.profiles.active}.xml

mybatis-plus:
  global-config:
    banner: false
    banner: false

# ES server address and ports
es:
  host: 127.0.0.1
  serverPort: 9200
  transportPort: 9300
  index: dataset_text_test
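The es block (a host plus separate server and transport ports) is shaped for Elasticsearch's transport client, which EsConfiguration presumably builds; a sketch assuming the ES 6.x transport-client API (client version and bean wiring are assumptions):

import java.net.InetAddress;

import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.transport.client.PreBuiltTransportClient;

// Sketch: connect via the transport port using the cluster name from the prod profile.
Settings settings = Settings.builder()
        .put("cluster.name", "kubernetes-logging")  // es.clusterName (prod profile)
        .build();
TransportClient client = new PreBuiltTransportClient(settings)
        .addTransportAddress(new TransportAddress(
                InetAddress.getByName("127.0.0.1"), 9300));  // es.host / es.transportPort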

dataset-util/src/main/resources/logback-spring-dev.xml → dataset-util/src/main/resources/logback-spring-prod.xml


+ 0
- 248
dataset-util/src/main/resources/logback-spring-test.xml

@@ -1,248 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<configuration scan="true" scanPeriod="30 seconds"
debug="false">
<contextName>dubhe</contextName>
<property name="log.charset" value="utf-8" />
<springProperty scope="context" name="app.active"
source="spring.profiles.active" />
<property name="log.path" value="dubhe-${app.active}" />
<property name="log.pattern"
value="%black(%contextName-) %red(%d{yyyy-MM-dd HH:mm:ss}) %green([%thread]) %highlight(%-5level) %boldMagenta(%logger{36}) [%X{traceId}] - %gray(%msg%n)" />

<!-- Console output -->
<appender name="console"
class="ch.qos.logback.core.ConsoleAppender">
<encoder>
<pattern>${log.pattern}</pattern>
</encoder>
</appender>

<!-- Rolling file appender: logs go to the named file first and roll over to dated files once the conditions are met -->
<appender name="info_file"
class="ch.qos.logback.core.rolling.RollingFileAppender">
<file>logs/${log.path}/info/dubhe-info.log</file>
<rollingPolicy
class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy">
<fileNamePattern>logs/${log.path}/info/dubhe-${app.active}-info-%d{yyyy-MM-dd}.%i.log
</fileNamePattern>
<!-- At most 50MB per log file, 7 days of history, total size capped at 250MB -->
<maxFileSize>50MB</maxFileSize>
<maxHistory>7</maxHistory>
<totalSizeCap>250MB</totalSizeCap>
</rollingPolicy>
<encoder>
<pattern>%m%n</pattern>
</encoder>
<!-- true (default) appends to the end of the file; false truncates the existing file -->
<append>true</append>
<filter class="org.dubhe.datasetutil.common.filter.BaseLogFilter">
<level>INFO</level>
<name>INFO,K8S_CALLBACK</name>
<onMatch>ACCEPT</onMatch>
<onMismatch>DENY</onMismatch>
</filter>
</appender>

<!-- Rolling file appender: logs go to the named file first and roll over to dated files once the conditions are met -->
<appender name="debug_info"
class="ch.qos.logback.core.rolling.RollingFileAppender">
<file>logs/${log.path}/debug/dubhe-debug.log</file>
<rollingPolicy
class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy">
<fileNamePattern>logs/${log.path}/debug/dubhe-${app.active}-debug-%d{yyyy-MM-dd}.%i.log
</fileNamePattern>
<!-- At most 50MB per log file, 7 days of history, total size capped at 250MB -->
<maxFileSize>50MB</maxFileSize>
<maxHistory>7</maxHistory>
<totalSizeCap>250MB</totalSizeCap>
</rollingPolicy>
<encoder>
<pattern>%m%n</pattern>
</encoder>
<!-- true (default) appends to the end of the file; false truncates the existing file -->
<append>true</append>
<filter class="org.dubhe.datasetutil.common.filter.BaseLogFilter">
<level>DEBUG</level>
<name>DEBUG</name>
<onMatch>ACCEPT</onMatch>
<onMismatch>DENY</onMismatch>
</filter>
</appender>

<!-- Rolling file appender: logs go to the named file first and roll over to dated files once the conditions are met -->
<appender name="error_file"
class="ch.qos.logback.core.rolling.RollingFileAppender">
<file>logs/${log.path}/error/dubhe-error.log</file>
<rollingPolicy
class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy">
<fileNamePattern>logs/${log.path}/error/dubhe-${app.active}-error-%d{yyyy-MM-dd}.%i.log
</fileNamePattern>
<!-- At most 50MB per log file, 7 days of history, total size capped at 250MB -->
<maxFileSize>50MB</maxFileSize>
<maxHistory>7</maxHistory>
<totalSizeCap>250MB</totalSizeCap>
</rollingPolicy>
<encoder>
<pattern>%m%n</pattern>
</encoder>
<!-- true (default) appends to the end of the file; false truncates the existing file -->
<append>true</append>
<filter class="org.dubhe.datasetutil.common.filter.BaseLogFilter">
<level>ERROR</level>
<name>ERROR</name>
<onMatch>ACCEPT</onMatch>
<onMismatch>DENY</onMismatch>
</filter>
</appender>

<!-- Rolling file appender: logs go to the named file first and roll over to dated files once the conditions are met -->
<appender name="warn_file"
class="ch.qos.logback.core.rolling.RollingFileAppender">
<file>logs/${log.path}/warn/dubhe-warn.log</file>
<rollingPolicy
class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy">
<fileNamePattern>logs/${log.path}/warn/dubhe-${app.active}-warn-%d{yyyy-MM-dd}.%i.log
</fileNamePattern>
<!-- At most 50MB per log file, 7 days of history, total size capped at 250MB -->
<maxFileSize>50MB</maxFileSize>
<maxHistory>7</maxHistory>
<totalSizeCap>250MB</totalSizeCap>
</rollingPolicy>
<encoder>
<pattern>%m%n</pattern>
</encoder>
<!-- true (default) appends to the end of the file; false truncates the existing file -->
<append>true</append>

<filter class="org.dubhe.datasetutil.common.filter.BaseLogFilter">
<level>WARN</level>
<name>WARN</name>
<onMatch>ACCEPT</onMatch>
<onMismatch>DENY</onMismatch>
</filter>
</appender>

<!-- Rolling file appender: logs go to the named file first and roll over to dated files once the conditions are met -->
<appender name="trace_file"
class="ch.qos.logback.core.rolling.RollingFileAppender">
<file>logs/${log.path}/trace/dubhe-trace.log</file>
<rollingPolicy
class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy">
<fileNamePattern>logs/${log.path}/trace/dubhe-${app.active}-trace-%d{yyyy-MM-dd}.%i.log
</fileNamePattern>
<!-- At most 50MB per log file, 7 days of history, total size capped at 250MB -->
<maxFileSize>50MB</maxFileSize>
<maxHistory>7</maxHistory>
<totalSizeCap>250MB</totalSizeCap>
</rollingPolicy>
<encoder>
<pattern>%m%n</pattern>
</encoder>
<!-- true (default) appends to the end of the file; false truncates the existing file -->
<append>true</append>
<filter class="org.dubhe.datasetutil.common.filter.BaseLogFilter">
<level>TRACE</level>
<name>TRACE</name>
<onMatch>ACCEPT</onMatch>
<onMismatch>DENY</onMismatch>
</filter>
</appender>


<!-- Rolling file appender: logs go to the named file first and roll over to dated files once the conditions are met -->
<appender name="schedule_file"
class="ch.qos.logback.core.rolling.RollingFileAppender">
<file>logs/${log.path}/info/dubhe-schedule.log</file>
<rollingPolicy
class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy">
<fileNamePattern>logs/${log.path}/info/dubhe-${app.active}-schedule-%d{yyyy-MM-dd}.%i.log
</fileNamePattern>
<!-- At most 50MB per log file, 7 days of history, total size capped at 250MB -->
<maxFileSize>50MB</maxFileSize>
<maxHistory>7</maxHistory>
<totalSizeCap>250MB</totalSizeCap>
</rollingPolicy>
<encoder>
<pattern>%m%n</pattern>
</encoder>
<!-- true (default) appends to the end of the file; false truncates the existing file -->
<append>true</append>
<filter class="org.dubhe.datasetutil.common.filter.BaseLogFilter">
<level>INFO</level>
<name>SCHEDULE</name>
<onMatch>ACCEPT</onMatch>
<onMismatch>DENY</onMismatch>
</filter>
</appender>

<!-- Rolling file appender: logs go to the named file first and roll over to dated files once the conditions are met -->
<appender name="global_request_file"
class="ch.qos.logback.core.rolling.RollingFileAppender">
<file>logs/${log.path}/info/dubhe-request.log</file>
<rollingPolicy
class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy">
<fileNamePattern>logs/${log.path}/info/dubhe-${app.active}-request-%d{yyyy-MM-dd}.%i.log
</fileNamePattern>
<!-- At most 50MB per log file, 7 days of history, total size capped at 250MB -->
<maxFileSize>50MB</maxFileSize>
<maxHistory>7</maxHistory>
<totalSizeCap>250MB</totalSizeCap>
</rollingPolicy>
<encoder>
<pattern>%m%n</pattern>
</encoder>
<!-- true (default) appends to the end of the file; false truncates the existing file -->
<append>true</append>
<filter class="org.dubhe.datasetutil.common.filter.GlobalRequestLogFilter">
<level>INFO</level>
<!-- name must be configured, otherwise nothing is logged; see GlobalRequestLogFilter -->
<name>GLOBAL_REQUEST</name>
<onMatch>ACCEPT</onMatch>
<onMismatch>DENY</onMismatch>
</filter>
</appender>

<!-- Route ordinary logs to the console and the rolling file appenders -->
<root level="info">
<appender-ref ref="console" />
<appender-ref ref="info_file" />
<appender-ref ref="schedule_file"/>
<appender-ref ref="global_request_file"/>
<appender-ref ref="debug_info" />
<appender-ref ref="error_file" />
<appender-ref ref="warn_file" />
<appender-ref ref="trace_file" />
</root>

<!-- SQL log output (log4jdbc) -->
<!-- <logger name="jdbc.sqlonly" level="OFF" additivity="false"> -->
<logger name="jdbc.sqlonly" level="INFO" additivity="false">
<appender-ref ref="console" />
</logger>

<logger name="jdbc.resultset" level="ERROR" additivity="false">
<appender-ref ref="console" />
</logger>

<!-- To see result-set data rendered as tables, change OFF to INFO -->
<logger name="jdbc.resultsettable" level="OFF"
additivity="false">
<appender-ref ref="console" />
</logger>

<logger name="jdbc.connection" level="OFF" additivity="false">
<appender-ref ref="console" />
</logger>

<logger name="jdbc.sqltiming" level="OFF" additivity="false">
<appender-ref ref="console" />
</logger>

<logger name="jdbc.audit" level="OFF" additivity="false">
<appender-ref ref="console" />
</logger>

<logger name="org.dubhe.k8s" level="DEBUG" additivity="false">
<appender-ref ref="console" />
</logger>
</configuration>

+ 13
- 0
dataset-util/src/main/resources/mapper/DataFileAnnotationMapper.xml

@@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd" >

<mapper namespace="org.dubhe.datasetutil.dao.DataFileAnnotationMapper">
<insert id="saveDataFileAnnotation" parameterType="java.util.List">
insert into data_file_annotation (id,dataset_id,label_id,version_file_id,prediction,create_user_id,file_name)
values
<foreach collection="dataFileAnnotations" item="file" separator=",">
(#{file.id},#{file.datasetId},#{file.labelId},#{file.versionFileId},
#{file.prediction},#{file.createUserId},#{file.fileName})
</foreach>
</insert>
</mapper>
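The foreach binds a collection parameter named dataFileAnnotations, so the Java side of DataFileAnnotationMapper would plausibly declare the following (a sketch, not the verbatim interface):

// import java.util.List;
// import org.apache.ibatis.annotations.Param;
void saveDataFileAnnotation(@Param("dataFileAnnotations") List<DataFileAnnotation> dataFileAnnotations);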

+ 2
- 2
dataset-util/src/main/resources/mapper/DataFileMapper.xml

@@ -2,8 +2,8 @@
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd" >

<mapper namespace="org.dubhe.datasetutil.dao.DataFileMapper">
<insert id="saveBatchDataFile" parameterType="java.util.List" useGeneratedKeys="true" keyProperty="id">
insert into data_file (id,`name`,dataset_id,status,url,enhance_type,width,height,origin_user_id,create_user_id,pid)
<insert id="saveBatchDataFile" parameterType="java.util.List">
insert into data_file (id,name,dataset_id,status,url,enhance_type,width,height,origin_user_id,create_user_id,pid)
values
<foreach collection="listDataFile" item="file" separator=",">
(#{file.id},#{file.name},#{file.datasetId},#{file.status},


+ 14
- 0
dataset-util/src/main/resources/mapper/DataLabelMapper.xml

@@ -9,4 +9,18 @@
( #{dataLabel.name},#{dataLabel.color},#{dataLabel.createUserId})
</foreach>
</insert>

<select id="getPresetLabelList" parameterType="java.util.List"
resultType="org.dubhe.datasetutil.domain.entity.DataLabel">
select dl.id,dl.name from data_label dl
left join data_group_label dgl on dl.id = dgl.label_id
left join data_label_group dlg on dgl.label_group_id = dlg.id
where dl.deleted = 0
<if test="groupIds != null and groupIds.size() &gt; 0">
<foreach collection="groupIds" item="item" separator="," open="and dlg.id in(" close=")">
#{item}
</foreach>
</if>
</select>

</mapper>
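For the new select, the matching mapper method would plausibly be (illustrative only; the actual signature lives in DataLabelMapper.java):

// import java.util.List;
// import org.apache.ibatis.annotations.Param;
List<DataLabel> getPresetLabelList(@Param("groupIds") List<Long> groupIds);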

+ 3
- 3
dataset-util/src/main/resources/mapper/DataVersionFileMapper.xml

@@ -2,11 +2,11 @@
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd" >

<mapper namespace="org.dubhe.datasetutil.dao.DataVersionFileMapper">
<insert id="saveBatchDataFileVersion" parameterType="java.util.List" useGeneratedKeys="true" keyProperty="id">
insert into data_dataset_version_file (id,dataset_id,file_id,annotation_status,status)
<insert id="saveBatchDataFileVersion" parameterType="java.util.List">
insert into data_dataset_version_file (id,dataset_id,file_id,annotation_status,status,version_name,file_name)
values
<foreach collection="listDataVersionFile" item="temp" separator=",">
(#{temp.id},#{temp.datasetId},#{temp.fileId},#{temp.annotationStatus},#{temp.status})
(#{temp.id},#{temp.datasetId},#{temp.fileId},#{temp.annotationStatus},#{temp.status},#{temp.versionName},#{temp.fileName})
</foreach>
</insert>
</mapper>
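Both batch inserts in this commit drop useGeneratedKeys in favor of ids supplied by the caller, consistent with the GeneratorKeyUtil changes elsewhere in the commit. A hypothetical sketch of the resulting calling pattern (the GeneratorKeyUtil method name and signature below are assumptions, not the verbatim API):

// import java.util.Queue;
// Pre-allocate an id block, assign ids, then batch insert.
Queue<Long> ids = generatorKeyUtil.getSequenceByBusinessCode("DATA_VERSION_FILE", listDataVersionFile.size());
for (DataVersionFile versionFile : listDataVersionFile) {
    versionFile.setId(ids.poll());
}
dataVersionFileMapper.saveBatchDataFileVersion(listDataVersionFile);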
