@@ -0,0 +1,5 @@ | |||||
.idea | |||||
/model_compress/distil/data/ | |||||
/model_compress/distil/glove/ | |||||
/model_compress/distil/models/ | |||||
/model_compress/distil/outputs/ |
@@ -0,0 +1,155 @@ | |||||
# Oneflow-Model-Compression | |||||
## 概述 | |||||
炼知技术平台是一个模型压缩平台,包含剪枝、量化、知识蒸馏等一系列模型压缩策略。 | |||||
提供完整的模型压缩解决方案,可用于各种类型的自然语言和计算机视觉场景,如文本分类、推理,图像分类等。 | |||||
另外,平台在不断完善各种压缩策略在经典开源任务的Benchmark,以便用户参考。 | |||||
同时,平台也提供各种压缩策略的功能算子,方便用户使用、复现最新的论文方法,以及利用压缩算子进行二次开发。 | |||||
<p align="center"> | |||||
<br> | |||||
<img src="./docs/imgs/overview.png" width="600"/> | |||||
<br> | |||||
</p>
## 功能 | |||||
<table style="width:100%;" cellpadding="2" cellspacing="0" border="1" bordercolor="#000000"> | |||||
<tbody> | |||||
<tr> | |||||
<td style="text-align:center;"> | |||||
<span style="font-size:18px;">功能模块</span> | |||||
</td> | |||||
<td style="text-align:center;"> | |||||
<span style="font-size:18px;">算法</span> | |||||
</td> | |||||
<td style="text-align:center;"> | |||||
<span style="font-size:18px;">相关文档</span> | |||||
</td> | |||||
</tr> | |||||
<tr> | |||||
<td style="text-align:center;"> | |||||
量化 | |||||
</td> | |||||
<td> | |||||
<ul> | |||||
<li> | |||||
<span>deep compression</span>: <a href="https://arxiv.org/pdf/1510.00149.pdf" target="_blank"><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;">Han S, Mao H, Dally W J. "Deep compression: Compressing deep neural networks with pruning, trained quantization and huffman coding" </span><i>arXiv preprint arXiv:1510.00149</i><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;"> (2017).</span></a> | |||||
</li> | |||||
<li> | |||||
<span>NVIDIA TensorRT</span>: <a href="https://github.com/NVIDIA/TensorRT" target="_blank"><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;">a C++ library for high performance inference on NVIDIA GPUs and deep learning accelerators. </span><i></i><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;"> </span></a> | |||||
</li> | |||||
</ul> | |||||
</td> | |||||
<td> | |||||
<ul> | |||||
<li> | |||||
<a href="./docs/API_quant.md" target="_blank">量化API文档</a> | |||||
</li> | |||||
</ul> | |||||
</td> | |||||
</tr> | |||||
<tr> | |||||
<td style="text-align:center;"> | |||||
<span style="font-size:12px;">剪枝</span><span style="font-size:12px;"></span><br /> | |||||
</td> | |||||
<td> | |||||
<ul> | |||||
<li> | |||||
<span>bn channel slimming</span>: <a href="https://arxiv.org/abs/1708.06519" target="_blank"><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;">Zhuang Liu, Jianguo Li, Zhiqiang Shen. "Learning Efficient Convolutional Networks through Network Slimming" </span><i>arXiv preprint arXiv:1708.06519</i><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;"> (2017).</span></a> | |||||
</li> | |||||
<li> | |||||
<span>conv channel slimming</span>: <a href="https://arxiv.org/abs/1608.08710" target="_blank"><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;">Hao Li, Asim Kadav, Igor Durdanovic. "Pruning Filters for Efficient ConvNets" </span><i>arXiv preprint arXiv:1608.08710</i><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;"> (2016).</span></a> | |||||
</li> | |||||
<li> | |||||
<span>conv channel slimming</span>: <a href="http://cn.arxiv.org/abs/1607.03250" target="_blank"><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;">Hengyuan Hu, Rui Peng, Yu-Wing Tai. "Network Trimming: A Data-Driven Neuron Pruning Approach towards Efficient Deep Architectures" </span><i>arXiv preprint arXiv:1607.03250</i><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;"> (2016).</span></a> | |||||
</li> | |||||
</ul> | |||||
</td> | |||||
<td> | |||||
<ul> | |||||
<li> | |||||
<a href="./docs/API_prune.md" target="_blank">剪枝API文档</a> | |||||
</li> | |||||
</ul> | |||||
<ul> | |||||
<li> | |||||
<a href="./model_compress/ChannelSlimming" target="_blank">剪枝快速上手</a> | |||||
</li> | |||||
</ul> | |||||
</td> | |||||
</tr> | |||||
<tr> | |||||
<td style="text-align:center;"> | |||||
知识蒸馏 | |||||
</td> | |||||
<td> | |||||
<ul> | |||||
<li> | |||||
<span>Knowledge Distillation</span>: <a href="https://arxiv.org/abs/1503.02531" target="_blank"><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;">Hinton, Geoffrey, Oriol Vinyals, and Jeff Dean. "Distilling the knowledge in a neural network." </span><i>arXiv preprint arXiv:1503.02531</i><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;"> (2015).</span></a> | |||||
</li> | |||||
<li> | |||||
Distilled-BiLSTM: <a href="https://arxiv.org/abs/1903.12136" target="_blank"><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;">Tang, Raphael, et al. "Distilling task-specific knowledge from bert into simple neural networks." arXiv preprint arXiv:1903.12136 (2019).</span></a> | |||||
</li> | |||||
<li> | |||||
BERT-PKD: <a href="https://arxiv.org/abs/1908.09355" target="_blank"><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;">Sun, Siqi, et al. "Patient knowledge distillation for bert model compression." arXiv preprint arXiv:1908.09355 (2019).</span></a> | |||||
</li> | |||||
<li> | |||||
TinyBERT: <a href="https://arxiv.org/abs/1909.10351" target="_blank"><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;">Jiao, Xiaoqi, et al. "Tinybert: Distilling bert for natural language understanding." arXiv preprint arXiv:1909.10351 (2019).</span></a> | |||||
</li> | |||||
<li> | |||||
MobileBERT: <a href="https://arxiv.org/abs/2004.02984" target="_blank"><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;">Sun, Zhiqing, et al. "Mobilebert: a compact task-agnostic bert for resource-limited devices." arXiv preprint arXiv:2004.02984 (2020).</span></a> | |||||
</li> | |||||
<li> | |||||
BERT-Theseus: <a href="https://arxiv.org/abs/2002.02925" target="_blank"><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;">Xu, Canwen, et al. "Bert-of-theseus: Compressing bert by progressive module replacing." arXiv preprint arXiv:2002.02925 (2020).</span></a> | |||||
</li> | |||||
<li> | |||||
改进版的BERT-Theseus: <a href="https://arxiv.org/abs/2002.02925" target="_blank"><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;">Xu, Canwen, et al. "Bert-of-theseus: Compressing bert by progressive module replacing." arXiv preprint arXiv:2002.02925 (2020).</span></a> | |||||
</li> | |||||
</ul> | |||||
</td> | |||||
<td> | |||||
<ul> | |||||
<li> | |||||
<a href="./docs/API_knowledge_distill.md" target="_blank">知识蒸馏API文档</a> | |||||
</li> | |||||
<li> | |||||
<a href="./model_compress/distil" target="_blank">知识蒸馏快速上手</a> | |||||
</li> | |||||
<li> | |||||
<a href="./model_compress/distil/examples/knowledge_distillation/README.md" target="_blank">Knowledge Distillation算法文档</a> | |||||
</li> | |||||
<li> | |||||
<a href="./model_compress/distil/examples/distilled-bilstm/README.md" target="_blank">Distilled-BiLSTM算法文档</a> | |||||
</li> | |||||
<li> | |||||
<a href="./model_compress/distil/examples/bert-pkd/README.md" target="_blank">BERT-PKD算法文档</a> | |||||
</li> | |||||
<li> | |||||
<a href="./model_compress/distil/examples/tinybert/README.md" target="_blank">TinyBERT算法文档</a> | |||||
</li> | |||||
<li> | |||||
<a href="model_compress/distil/theseus/README.md" target="_blank">BERT-Theseus算法文档</a> | |||||
</li> | |||||
</ul> | |||||
</td> | |||||
</tr> | |||||
</tbody> | |||||
</table>
<br /> | |||||
## 使用 | |||||
- Oneflow介绍: 深度学习框架Oneflow[介绍以及环境安装说明](https://github.com/Oneflow-Inc/oneflow)。 | |||||
- Oneflow快速开始:通过[简单示例](http://docs.oneflow.org/quick_start/quickstart_in_3_min.html)介绍如何快速3分钟上手使用Oneflow。 | |||||
- 模型压缩API文档:用户接口文档,包含以下功能 | |||||
- [量化](./docs/API_quant.md) | |||||
- [剪枝](./docs/API_prune.md) | |||||
- [知识蒸馏](./docs/API_knowledge_distill.md) | |||||
- 高阶教程:包括在CV和NLP等应用场景任务的使用示例、算法使用步骤,高级特性的使用教程。 | |||||
- 量化功能文档: 介绍量化功能使用示例,主要包含int8量化。 | |||||
- 剪枝功能文档: 介绍通道剪枝实现和[使用示例](./model_compress/ChannelSlimming/readme.md),主要包括CNN模型、DNN模型的不同剪枝算子。
- [知识蒸馏功能](./model_compress/distil)文档: 介绍知识蒸馏功能相关论文实现和使用示例,主要包含[KD](./model_compress/distil/examples/knowledge_distillation/README.md), [Distilled-BiLSTM](./model_compress/distil/examples/distilled-bilstm/README.md), [BERT-PKD](./model_compress/distil/examples/bert-pkd/README.md), [TinyBERT](./model_compress/distil/examples/tinybert/README.md), [BERT-Theseus](model_compress/distil/theseus/README.md)等算法。 | |||||
- [TensorRT量化部署](./docs/API_quant.md): 介绍如何使用TensorRT部署量化得到的Oneflow模型。 | |||||
- [模型库](./docs/model_zoo.md):各个压缩算法在文本分类、推理,图像分类等数据集上的实验结果,包括模型精度、模型尺寸和推理速度。 |
@@ -0,0 +1,63 @@ | |||||
知识蒸馏 | |||||
========= | |||||
"软标签蒸馏"算子: pred_distill | |||||
--------- | |||||
`knowledge_distill_util.pred_distill(args, student_logits, teacher_logits):` | |||||
[源代码](../model_compress/distil/src/knowledge_distill_util.py#L381) | |||||
`pred_distill`为teacher和student模型添加软标签损失,使得student模型可以学习教师模型的输出,达到student模型模仿teacher模型在预测层的表现的目的。 | |||||
采用[soft_cross_entropy](../model_compress/distil/src/knowledge_distill_util.py#L336)来计算损失。 | |||||
**参数:** | |||||
- **args**: 一些超参,如teacher_temperature和student_temperature,对student和teacher模型进行soft操作的温度值。 | |||||
- **student_logits**: student模型预测出的logits。 | |||||
- **teacher_logits**: teacher模型预测出的logits。 | |||||
**返回:** 由teacher模型和student模型组合得到的软标签损失。 | |||||
--- | |||||
"层与层蒸馏"算子: layer_distill | |||||
--------- | |||||
`knowledge_distill_util.layer_distill(args, student_reps, teacher_reps):` | |||||
[源代码](../model_compress/distil/src/knowledge_distill_util.py#L346) | |||||
`layer_distill`为teacher和student模型添加层与层损失,使得student模型可以学习教师模型的隐藏层特征,达到用teacher模型的暗知识(Dark Knowledge)指导student模型学习的目的,将teacher模型中的知识更好的蒸馏到student模型中。通过[MSE](../model_compress/distil/src/knowledge_distill_util.py#L343)来计算student模型和teacher模型中间层的距离。 | |||||
**参数:** | |||||
- **args**: 一些超参,暂未用到,仅留出接口。 | |||||
- **student_reps**: student模型的所有中间层表示。 | |||||
- **teacher_reps**: teacher模型的所有中间层表示。 | |||||
**返回:** 由teacher模型和student模型组合得到的层与层蒸馏损失。 | |||||
>注:该算子仅适用于BERT类的student和teacher模型。 | |||||
--- | |||||
"注意力蒸馏"算子: att_distill | |||||
--------- | |||||
`knowledge_distill_util.att_distill(args, student_atts, teacher_atts):` | |||||
[源代码](../model_compress/distil/src/knowledge_distill_util.py#L363) | |||||
`att_distill`为teacher和student模型添加注意力损失,使得student模型可以学习教师模型的attention score矩阵,学习到其中包含语义知识,例如语法和相互关系等。通过[MSE](../model_compress/distil/src/knowledge_distill_util.py#L343)来计算损失。 | |||||
**参数:** | |||||
- **args**: 一些超参,暂未用到,仅留出接口。 | |||||
- **student_atts**: student模型的所有的attention score矩阵。
- **teacher_atts**: teacher模型的所有的attention score矩阵。
**返回:** 由teacher模型和student模型组合得到的注意力蒸馏损失。 | |||||
>注:该算子仅适用于BERT类的student和teacher模型。 |
@@ -0,0 +1,86 @@ | |||||
1. 通道剪枝算子 | |||||
========= | |||||
## 1.1 "bn"剪枝算子 | |||||
- `get_pruneThre_bn():`卷积层对应的BN层的gamma参数作为缩放因子,获得剪枝对应阈值 | |||||
- [源代码](../model_compress/ChannelSlimming/prune/util/prune_algorithm.py#L120) | |||||
- **返回**:剪枝对应的阈值 | |||||
- `get_removeIndex_bn(a, thre):`根据阈值获得当前卷积层需要剪枝的通道index | |||||
- [源代码](../model_compress/ChannelSlimming/prune/util/prune_algorithm.py#L182) | |||||
- **参数**: | |||||
- **a**:当前卷积层的参数 | |||||
- **thre**:`get_pruneThre_bn()`返回的阈值 | |||||
1.2 "conv_avg"剪枝算子 | |||||
--------- | |||||
- `get_pruneThre_conv_avg():`卷积层参数的平均值作为缩放因子,获得剪枝对应阈值 | |||||
- [源代码](../model_compress/ChannelSlimming/prune/util/prune_algorithm.py#L54) | |||||
- **返回**:剪枝对应的阈值 | |||||
- `get_removeIndex_conv_avg(a, shape, thre):`根据阈值获得当前卷积层需要剪枝的通道index | |||||
- [源代码](../model_compress/ChannelSlimming/prune/util/prune_algorithm.py#L187) | |||||
- **参数**: | |||||
- **a**:当前卷积层的参数 | |||||
- **shape**:当前卷积层的shape信息 | |||||
- **thre**:`get_pruneThre_conv_avg()`返回的阈值 | |||||
## 1.3 "conv_max"剪枝算子 | |||||
- 同"conv_avg"剪枝算子 | |||||
## 1.4 "conv_all"剪枝算子 | |||||
- 同"conv_avg"剪枝算子 | |||||
1.5 "random"剪枝算子 | |||||
--------- | |||||
- `get_removeIndex_conv_avg(shape):`随机选择需要剪枝的通道index | |||||
- [源代码](../model_compress/ChannelSlimming/prune/util/prune_algorithm.py#L220) | |||||
- **参数**: | |||||
- **shape**:当前卷积层的shape信息 | |||||
1.6 "dnn"剪枝算子 | |||||
--------- | |||||
- `get_pruneThre_fc():`全连接层的神经元的参数的平均值作为缩放因子,获得剪枝对应阈值 | |||||
- [源代码](../model_compress/ChannelSlimming/prune/util/prune_algorithm.py#L137)
- **返回**:剪枝对应的阈值 | |||||
- `get_removeIndex_fc(a, shape, thre):`根据阈值获得当前全连接层需要剪枝的神经元index | |||||
- [源代码](../model_compress/ChannelSlimming/prune/util/prune_algorithm.py#L171) | |||||
- **参数**: | |||||
- **a**:当前全连接层的参数 | |||||
- **shape**:当前全连接层的shape信息 | |||||
- **thre**:`get_pruneThre_fc()`返回的阈值 | |||||
2. 模型调用算子 | |||||
========= | |||||
## 2.1 pruneDnn.py | |||||
- DNN模型剪枝,可调用1.6剪枝算子 | |||||
- [文件](../model_compress/ChannelSlimming/prune/pruneDnn.py) | |||||
## 2.2 pruneLenet.py | |||||
- CNN模型的lenet模型剪枝,可调用1.1-1.5剪枝算子 | |||||
- [文件](../model_compress/ChannelSlimming/prune/pruneLenet.py) | |||||
## 2.3 pruneAlexnet.py | |||||
- CNN模型的alexnet模型剪枝,可调用1.1-1.5剪枝算子
- [文件](../model_compress/ChannelSlimming/prune/pruneAlexnet.py) | |||||
## 2.4 pruneVggnet.py | |||||
- CNN模型的vggnet模型剪枝,可调用1.1-1.5剪枝算子
- [文件](../model_compress/ChannelSlimming/prune/pruneVggnet.py) | |||||
## 2.5 pruneResnet.py | |||||
- CNN模型的resnet模型剪枝,可调用1.1-1.5剪枝算子
- [文件](../model_compress/ChannelSlimming/prune/pruneResnet.py) |
@@ -0,0 +1,126 @@ | |||||
# OneFlow 量化推理 | |||||
## OneFlow 中的 XRT | |||||
XRT 是一个同时支持多个计算引擎的运行时加速库,目前已经集成了 TensorFlow XLA 和 NVIDIA | |||||
TensorRT 两个后端引擎。其中 XLA 全面支持训练和预测,TensorRT 支持预测以及部分算子支持训练。对于同一个计算图,XRT 允许多个计算引擎联合使用,以获得更好的加速效果,其中 TensorRT 具有 Int8 量化功能。 | |||||
由于 TensorRT 中官方支持的 op 并没有那么全面,其余自定义 op 有可能受接口限制,因此 OneFlow 后续会采用 plug-in 形式添加,支持更多算子。 | |||||
## 在 OneFlow 中使用 TensorRT | |||||
* 前期准备 | |||||
* 数据集:以测试 ResNet50 为例,需要提前准备 ImageNet 的 OFRecord 格式数据集。 | |||||
* 下载 TensorRT:编译时需要链接 TensorRT 的头文件和动态库,因此用户需要根据自己系统和已安装的 CUDA 版本选择相应版本的 TensorRT,同时满足 TensorRT 的其他依赖。 | |||||
* 下载 OneFlow-Benchmark:OneFlow-Benchmark 是 OneFlow 的模型基准仓库,提供了一系列完备实现的网络模型,本次测试选择的是其中的ResNet50。 | |||||
* 编译:编译时开启 -DWITH_TENSORRT 选项,并指定 TensorRT 源码解压后的所在路径 | |||||
``` | |||||
cmake .. -DWITH_TENSORRT=ON -DTENSORRT_ROOT=/home/${user}/TensorRT-6.0.1.8 && make -j 24 | |||||
``` | |||||
或者可以在 cmake 前使用环境变量指定 | |||||
``` | |||||
export TENSORRT_ROOT=/home/${user}/TensorRT-6.0.1.8 | |||||
``` | |||||
编译成功后即可安装支持 TensorRT 的 OneFlow。
* 运行 | |||||
目前 OneFlow 中的 TensorRT 仅支持单卡推理。编译成功后切换到 dev_trt_infer 分支,在 config.py 中 | |||||
* 添加 --use\_tensorrt,可使用 TensorRT 推理。
* 添加 --use\_tensorrt 和 use\_int8,可开启 TensorRT 的 int8 量化。
## 环境 | |||||
硬件环境 | |||||
* CPU:Intel(R) Xeon(R) CPU E5-2650 v4 @ 2.20GHz x 6 | |||||
* GPU:[GeForce GTX 1080] x 4 | |||||
软件环境 | |||||
* 系统:Ubuntu 18.04.4 LTS | |||||
* NVIDIA Driver Version:440.44 | |||||
* CUDA:10.2 | |||||
* GCC:7.5 | |||||
* Cmake:3.14.4 | |||||
* Make:4.1 | |||||
测试结果 | |||||
测试模型为 ResNet 50(以下称 rn50),使用在线量化,分别进行单机单卡和单机多卡推理,batch_size 取 64 和可运行的最大 batch_size。 | |||||
若正常运行,log 打印如下: | |||||
``` | |||||
================================================================== | |||||
Running resnet50: num_gpu_per_node = 1, num_nodes = 1. | |||||
================================================================== | |||||
dtype = float32 | |||||
gpu_num_per_node = 1 | |||||
num_nodes = 1 | |||||
node_ips = ['127.0.0.1'] | |||||
ctrl_port = 50051 | |||||
model = resnet50 | |||||
use_fp16 = None | |||||
use_xla = None | |||||
channel_last = None | |||||
pad_output = None | |||||
num_epochs = 1 | |||||
model_load_dir = resnet_v15_of_best_model_val_top1_77318 | |||||
batch_size_per_device = 64 | |||||
val_batch_size_per_device = 256 | |||||
nccl_fusion_threshold_mb = 0 | |||||
nccl_fusion_max_ops = 0 | |||||
fuse_bn_relu = False | |||||
fuse_bn_add_relu = False | |||||
gpu_image_decoder = False | |||||
image_path = test_img/tiger.jpg | |||||
num_classes = 1000 | |||||
num_examples = 1281167 | |||||
num_val_examples = 50000 | |||||
rgb_mean = [123.68, 116.779, 103.939] | |||||
rgb_std = [58.393, 57.12, 57.375] | |||||
image_shape = [3, 224, 224] | |||||
label_smoothing = 0.1 | |||||
model_save_dir = ./output/snapshots/model_save-20201123172206 | |||||
log_dir = ./output | |||||
loss_print_every_n_iter = 1 | |||||
image_size = 224 | |||||
resize_shorter = 256 | |||||
train_data_dir = None | |||||
train_data_part_num = 256 | |||||
val_data_dir = /dataset/ImageNet/ofrecord/validation | |||||
val_data_part_num = 256 | |||||
optimizer = sgd | |||||
learning_rate = 0.256 | |||||
wd = 3.0517578125e-05 | |||||
momentum = 0.875 | |||||
lr_decay = cosine | |||||
lr_decay_rate = 0.94 | |||||
lr_decay_epochs = 2 | |||||
warmup_epochs = 5 | |||||
decay_rate = 0.9 | |||||
epsilon = 1.0 | |||||
gradient_clipping = 0.0 | |||||
------------------------------------------------------------------ | |||||
Time stamp: 2020-11-23-17:22:06 | |||||
Restoring model from resnet_v15_of_best_model_val_top1_77318. | |||||
Loading data from /dataset/ImageNet/ofrecord/validation | |||||
W1123 17:23:41.120939 31217 trt_executable.cpp:146] Rebuild engine since the maximum batch size 1 is less than the input batch size 256 | |||||
W1123 17:24:25.756124 33076 trt_logger.cpp:35] TensorRT Logging: Explicit batch network detected and batch size specified, use execute without batch size instead. | |||||
W1123 17:24:31.005220 33076 trt_logger.cpp:35] TensorRT Logging: Explicit batch network detected and batch size specified, use execute without batch size instead. | |||||
W1123 17:24:36.085610 33076 trt_logger.cpp:35] TensorRT Logging: Explicit batch network detected and batch size specified, use execute without batch size instead. | |||||
W1123 17:24:41.073289 33076 trt_logger.cpp:35] TensorRT Logging: Explicit batch network detected and batch size specified, use execute without batch size instead. | |||||
W1123 17:24:45.920917 33076 trt_logger.cpp:35] TensorRT Logging: Explicit batch network detected and batch size specified, use execute without batch size instead. | |||||
W1123 17:24:50.633805 33076 trt_logger.cpp:35] TensorRT Logging: Explicit batch network detected and batch size specified, use execute without batch size instead. | |||||
W1123 17:24:55.354147 33076 trt_logger.cpp:35] TensorRT Logging: Explicit batch network detected and batch size specified, use execute without batch size instead. | |||||
W1123 17:24:59.904863 33076 trt_logger.cpp:35] TensorRT Logging: Explicit batch network detected and batch size specified, use execute without batch size instead. | |||||
validation: epoch 0, iter 195, top_1: 0.772155, top_k: 0.934856, samples/s: 181.038 1606123666.3968866 | |||||
``` | |||||
### 单机单卡 | |||||
@@ -0,0 +1,44 @@ | |||||
# 模型库 | |||||
# 1. 图像分类 | |||||
## 1.1 量化 | |||||
## 1.2 剪枝 | |||||
数据集:Cifar10 | |||||
模型:Alexnet、Lenet | |||||
设置:剪枝率为0.5、0.7 | |||||
| 模型 - 剪枝算子 | 测试次数 | Acc | 剪枝率 | 压缩比例 | 推理耗时samples/s | | |||||
| :---------------------: | :------: | :----: | :----: | :------: | :---------------: | | |||||
| Alexnet - 无剪枝 | 5 | 94.89% | - | 1x | 5409 | | |||||
| Alexnet - bn | 5 | 98.81% | 50% | 1.4x | 5968 | | |||||
| Alexnet - conv_all | 5 | 93.95% | 50% | 1.3x | 5969 | | |||||
| Alexnet - conv_avg | 5 | 98.56% | 50% | 1.3x | 5865 | | |||||
| Alexnet - conv_max | 5 | 97.44% | 50% | 1.3x | 5555 | | |||||
| Alexnet - random | 5 | 97.32% | 50% | 1.3x | 5580 | | |||||
| Alexnet - conv_threshold | 5 | 98.03% | 50% | 1.3x | 5567 |
| Lenet - 无剪枝 | 5 | 75.72% | - | 1x | 5821 | | |||||
| Lenet - bn | 5 | 64.89% | 70% | 3x | 1923 | | |||||
# 2. 文本分类 | |||||
## 2.1 知识蒸馏 | |||||
数据集:SST-2 | |||||
环境:单卡2080Ti | |||||
设置:BERT类模型最大序列长度设为128,LSTM类模型最大序列长度设为32,词表大小为10000 | |||||
| 模型 | 测试次数 | Acc | 层数 | 隐藏层维度/前馈层维度 | 模型尺寸 | 压缩比例 | 推理耗时 | 推理加速 | | |||||
|:--:|:---:|:--:|:--:|:--:|:--:|:--:|:--:|:--:| | |||||
| BERT_base(Teacher) | 5 | 92.2% | 12 | 768/3072 | 110M | 1x | 4.04s | 1x | | |||||
| KD | 5 | 80.5% | 3 | 312/1200 | 14.5M | 7.5x | 0.81s | 5.0x | | |||||
| BiLSTM | 5 | 80.4% | 1 | 300/400 | 15.3M | 7.2x | 0.83s | 4.8x | | |||||
| Distilled-BiLSTM | 5 | 82.9% | 1 | 300/400 | 15.3M | 7.2x | 0.83s | 4.8x | | |||||
| BERT-PKD(from scratch) | 5 | 81.5% | 3 | 768/3072 | 45.7M | 2.4x | 1.69s | 2.4x | | |||||
| BERT-PKD | 5 | 88.4% | 3 | 768/3072 | 45.7M | 2.4x | 1.69s | 2.4x | | |||||
| TinyBERT | 5 | 91.3% | 4 | 312/1200 | 14.5M | 7.5x | 0.65s | 6.2x | | |||||
| BERT-of-Theseus | 5 | 87.2% | 4 | 768/3072 | 53.7M | 2.05x | 2.05s | 2.0x | | |||||
注:层数不包含embedding和prediction层。 |
@@ -0,0 +1,4 @@ | |||||
# 训练的日志文件 | |||||
- 日志文件夹,存储不同模型和数据的日志log文件,记录每个epoch在test数据集上的top1准确率、topk准确率、运行速度。 | |||||
- 如"log_vgg_cifar10_base_model.txt":vgg模型-cifar10数据集-baseline模型训练的log记录。 |
@@ -0,0 +1,263 @@ | |||||
""" | |||||
Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
from __future__ import absolute_import | |||||
from __future__ import division | |||||
from __future__ import print_function | |||||
import oneflow as flow | |||||
from util.model_weights import modelWeight | |||||
def _batch_norm(inputs, name=None, trainable=True):
    """Apply OneFlow batch normalization along axis 1 of ``inputs``.

    Centering and scaling are both enabled, and an L1 regularizer with
    coefficient 1e-4 is attached to the scale (gamma) parameter.
    NOTE(review): the L1 penalty is presumably there to drive gamma towards
    sparsity for BN-based channel pruning -- confirm against the pruning code.

    Args:
        inputs: Tensor to normalize; axis 1 is treated as the feature axis.
        name: Optional layer name forwarded to OneFlow.
        trainable: Forwarded unchanged to the layer's ``trainable`` argument.

    Returns:
        The result of ``flow.layers.batch_normalization`` for ``inputs``.
    """
    bn_options = dict(
        axis=1,
        momentum=0.997,
        epsilon=1.001e-5,
        center=True,
        scale=True,
        gamma_regularizer=flow.regularizers.l1(1e-4),
        trainable=trainable,
        name=name,
    )
    return flow.layers.batch_normalization(inputs=inputs, **bn_options)
def conv2d_layer(
    name,
    input,
    filters,
    kernel_size=1,
    strides=1,
    padding="VALID",
    data_format="NCHW",
    dilation_rate=1,
    activation="Relu",
    use_bias=True,
    weight_initializer=flow.variance_scaling_initializer(2, 'fan_out', 'random_normal', data_format="NCHW"),
    bias_initializer=flow.zeros_initializer(),
    bn=True,
):
    """Create a 2-D convolution with optional bias, batch norm and ReLU.

    Variables are created as ``<name>_weight`` and, when ``use_bias`` is
    truthy, ``<name>_bias``. The kernel is square
    (``kernel_size`` x ``kernel_size``) and its input-channel count is read
    from ``input.shape[1]``.

    Args:
        name: Base name for the convolution op and its variables.
        input: Tensor to convolve; axis 1 supplies the input channel count.
        filters: Number of output channels.
        kernel_size: Side length of the square kernel.
        strides: Forwarded to ``flow.nn.conv2d``.
        padding: Forwarded to ``flow.nn.conv2d``.
        data_format: Forwarded to ``flow.nn.conv2d`` and ``flow.nn.bias_add``.
        dilation_rate: Forwarded to ``flow.nn.conv2d``.
        activation: ``"Relu"`` or ``None``. With ``None`` the function returns
            right after the (optional) bias add, so batch norm is skipped too.
        use_bias: Whether to create and add a bias variable.
        weight_initializer: Initializer for the kernel variable.
        bias_initializer: Initializer for the bias variable.
        bn: When truthy (and ``activation == "Relu"``), batch normalization
            named ``<name>_bn`` is applied before the ReLU.

    Returns:
        The output tensor of the layer.

    Raises:
        NotImplementedError: If ``activation`` is neither ``None`` nor
            ``"Relu"``.
    """
    in_channels = input.shape[1]
    kernel = flow.get_variable(
        name + "_weight",
        shape=(filters, in_channels, kernel_size, kernel_size),
        dtype=input.dtype,
        initializer=weight_initializer,
    )
    output = flow.nn.conv2d(
        input, kernel, strides, padding, data_format, dilation_rate, name=name
    )

    if use_bias:
        offset = flow.get_variable(
            name + "_bias",
            shape=(filters,),
            dtype=input.dtype,
            initializer=bias_initializer,
        )
        output = flow.nn.bias_add(output, offset, data_format)

    # No activation requested: the raw (biased) convolution is the result.
    if activation is None:
        return output
    if activation != "Relu":
        raise NotImplementedError

    # ReLU path: batch norm, when enabled, runs before the non-linearity.
    if bn:
        output = _batch_norm(output, name + "_bn")
    return flow.nn.relu(output)
def alexnet(images, cfg, optimizer, trainable=True, need_transpose=False,
            training=True, wd=1.0/32768, model_weight=True, bn=True):
    """Build an AlexNet-style graph: five conv layers, then three dense layers.

    Layer widths come from ``cfg``: ``cfg[0]``..``cfg[4]`` are the conv filter
    counts and ``cfg[5]``..``cfg[7]`` the dense unit counts. Max pooling
    (window 3, stride 2, "VALID") follows conv0, conv1 and conv4.

    Args:
        images: Input tensor; its channel count is read from axis 1 after the
            optional transpose.
        cfg: Indexable holding at least eight layer widths (see above).
        optimizer: Passed through to ``modelWeight.addConv`` / ``addDense``.
        trainable: Forwarded to the three dense layers.
        need_transpose: When truthy, ``images`` is first permuted with
            ``perm=[0, 3, 1, 2]``.
        training: Not used by this function.
        wd: Not used by this function.
        model_weight: Only when this compares equal to ``True`` are the layer
            dtypes and weight shapes registered with ``modelWeight``.
        bn: Forwarded to every ``conv2d_layer`` call.

    Returns:
        The output of the last dense layer (``dense2``), which is created
        without an explicit activation.
    """
    def _conv(tag, source, width, ksize, pad):
        # All convolutions in this network use stride 1.
        return conv2d_layer(name=tag, input=source, filters=width,
                            kernel_size=ksize, padding=pad, strides=1, bn=bn)

    def _pool(tag, source):
        return flow.nn.max_pool2d(source, 3, 2, "VALID", "NCHW", name=tag)

    def _dense(tag, source, width, **extra):
        return flow.layers.dense(
            inputs=source,
            units=width,
            use_bias=True,
            kernel_initializer=flow.random_normal_initializer(mean=0, stddev=0.1),
            trainable=trainable,
            name=tag,
            **extra
        )

    def _dense_weight_spec(tensor, units):
        # Weight shape is (units, in_features); inputs with more than two
        # axes are flattened to 2-D first, exactly as a dense layer would.
        if len(tensor.shape) > 2:
            tensor = flow.reshape(tensor, (-1, tensor.shape[-1]))
        return (units, tensor.shape[1]), tensor.dtype

    if need_transpose:
        images = flow.transpose(images, name="transpose", perm=[0, 3, 1, 2])

    # Convolutional feature extractor.
    conv0 = _conv("conv0", images, cfg[0], 11, "VALID")
    pool0 = _pool("pool0", conv0)
    conv1 = _conv("conv1", pool0, cfg[1], 5, "SAME")
    pool1 = _pool("pool1", conv1)
    conv2 = _conv("conv2", pool1, cfg[2], 3, "SAME")
    conv3 = _conv("conv3", conv2, cfg[3], 3, "SAME")
    conv4 = _conv("conv4", conv3, cfg[4], 3, "SAME")
    pool2 = _pool("pool2", conv4)

    # Flatten to (batch, features) for the classifier head.
    pool2 = flow.reshape(pool2, [pool2.shape[0], -1])

    dense0 = _dense("dense0", pool2, cfg[5], activation=flow.nn.relu)
    dense1 = _dense("dense1", dense0, cfg[6], activation=flow.nn.relu)
    dense2 = _dense("dense2", dense1, cfg[7])

    # The explicit comparison with True is kept on purpose: truthy values
    # that are not equal to True must not trigger registration.
    if model_weight == True:
        # (conv output, tensor whose axis 1 gives the input channels, kernel side)
        conv_records = (
            (conv0, images, 11),
            (conv1, conv0, 5),
            (conv2, conv1, 3),
            (conv3, conv2, 3),
            (conv4, conv3, 3),
        )
        for idx, (layer, source, ksize) in enumerate(conv_records):
            modelWeight.addConv(index=idx, dtype=layer.dtype,
                                shape1=(cfg[idx], source.shape[1], ksize, ksize),
                                shape2=(cfg[idx],),
                                optimizer=optimizer)

        dense_records = ((pool2, cfg[5]), (dense0, cfg[6]), (dense1, cfg[7]))
        shape_list = []
        dtype_list = []
        for tensor, units in dense_records:
            weight_shape, weight_dtype = _dense_weight_spec(tensor, units)
            shape_list.append(weight_shape)
            dtype_list.append(weight_dtype)
        modelWeight.addDense(dtype_old=dtype_list, shape=shape_list,
                             optimizer=optimizer, dense_num=3)

    return dense2
def alexnet_simple(images, cfg, optimizer, trainable=True, need_transpose=False,
                   training=True, wd=1.0/32768, model_weight=True, bn=True):
    """Build a reduced AlexNet-style graph in which every conv kernel is 3x3.

    Layer widths come from ``cfg``: ``cfg[0]``..``cfg[4]`` are the conv filter
    counts and ``cfg[5]``..``cfg[7]`` the dense unit counts. Max pooling
    (window 3, stride 2, "VALID") follows conv0, conv1 and conv4.

    Args:
        images: Input tensor; its channel count is read from axis 1 after the
            optional transpose.
        cfg: Indexable holding at least eight layer widths (see above).
        optimizer: Passed through to ``modelWeight.addConv`` / ``addDense``.
        trainable: Forwarded to the three dense layers.
        need_transpose: When truthy, ``images`` is first permuted with
            ``perm=[0, 3, 1, 2]``.
        training: Not used by this function.
        wd: Not used by this function.
        model_weight: Only when this compares equal to ``True`` are the layer
            dtypes and weight shapes registered with ``modelWeight``.
        bn: Forwarded to every ``conv2d_layer`` call.

    Returns:
        The output of the last dense layer (``dense2``), which is created
        without an explicit activation.
    """
    kernel_side = 3  # shared by all five convolutions in this variant

    def _conv(tag, source, width, pad):
        # All convolutions in this network use stride 1.
        return conv2d_layer(name=tag, input=source, filters=width,
                            kernel_size=kernel_side, padding=pad, strides=1, bn=bn)

    def _pool(tag, source):
        return flow.nn.max_pool2d(source, 3, 2, "VALID", "NCHW", name=tag)

    def _dense(tag, source, width, **extra):
        return flow.layers.dense(
            inputs=source,
            units=width,
            use_bias=True,
            kernel_initializer=flow.random_normal_initializer(mean=0, stddev=0.1),
            trainable=trainable,
            name=tag,
            **extra
        )

    def _dense_weight_spec(tensor, units):
        # Weight shape is (units, in_features); inputs with more than two
        # axes are flattened to 2-D first, exactly as a dense layer would.
        if len(tensor.shape) > 2:
            tensor = flow.reshape(tensor, (-1, tensor.shape[-1]))
        return (units, tensor.shape[1]), tensor.dtype

    if need_transpose:
        images = flow.transpose(images, name="transpose", perm=[0, 3, 1, 2])

    # Convolutional feature extractor.
    conv0 = _conv("conv0", images, cfg[0], "VALID")
    pool0 = _pool("pool0", conv0)
    conv1 = _conv("conv1", pool0, cfg[1], "SAME")
    pool1 = _pool("pool1", conv1)
    conv2 = _conv("conv2", pool1, cfg[2], "SAME")
    conv3 = _conv("conv3", conv2, cfg[3], "SAME")
    conv4 = _conv("conv4", conv3, cfg[4], "SAME")
    pool2 = _pool("pool2", conv4)

    # Flatten to (batch, features) for the classifier head.
    pool2 = flow.reshape(pool2, [pool2.shape[0], -1])

    dense0 = _dense("dense0", pool2, cfg[5], activation=flow.nn.relu)
    dense1 = _dense("dense1", dense0, cfg[6], activation=flow.nn.relu)
    dense2 = _dense("dense2", dense1, cfg[7])

    # The explicit comparison with True is kept on purpose: truthy values
    # that are not equal to True must not trigger registration.
    if model_weight == True:
        # (conv output, tensor whose axis 1 gives the input channel count)
        conv_records = (
            (conv0, images),
            (conv1, conv0),
            (conv2, conv1),
            (conv3, conv2),
            (conv4, conv3),
        )
        for idx, (layer, source) in enumerate(conv_records):
            modelWeight.addConv(index=idx, dtype=layer.dtype,
                                shape1=(cfg[idx], source.shape[1], 3, 3),
                                shape2=(cfg[idx],),
                                optimizer=optimizer)

        dense_records = ((pool2, cfg[5]), (dense0, cfg[6]), (dense1, cfg[7]))
        shape_list = []
        dtype_list = []
        for tensor, units in dense_records:
            weight_shape, weight_dtype = _dense_weight_spec(tensor, units)
            shape_list.append(weight_shape)
            dtype_list.append(weight_dtype)
        modelWeight.addDense(dtype_old=dtype_list, shape=shape_list,
                             optimizer=optimizer, dense_num=3)

    return dense2
@@ -0,0 +1,152 @@ | |||||
""" | |||||
Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
from __future__ import absolute_import | |||||
from __future__ import division | |||||
from __future__ import print_function | |||||
import oneflow as flow | |||||
from util.model_weights import modelWeight | |||||
def _batch_norm(inputs, name=None, trainable=True):
    """Apply batch normalization along axis 1 with an L1 penalty on gamma.

    Args:
        inputs: Blob to normalize.
        name: Optional name forwarded to the batch-normalization layer.
        trainable: Forwarded to the layer's ``trainable`` argument.

    Returns:
        The result of ``flow.layers.batch_normalization``.
    """
    bn_options = dict(
        axis=1,
        momentum=0.997,
        epsilon=1.001e-5,
        center=True,
        scale=True,
        # L1 regularization on the scale factors (gamma).
        gamma_regularizer=flow.regularizers.l1(1e-4),
        trainable=trainable,
        name=name,
    )
    return flow.layers.batch_normalization(inputs=inputs, **bn_options)
def conv2d_layer(
    name,
    input,
    filters,
    kernel_size=1,
    strides=1,
    padding="VALID",
    data_format="NCHW",
    dilation_rate=1,
    activation="Relu",
    use_bias=True,
    weight_initializer=flow.variance_scaling_initializer(2, 'fan_out', 'random_normal', data_format="NCHW"),
    bias_initializer=flow.zeros_initializer(),
    bn=True,
):
    """Build a 2-D convolution with optional bias, batch norm and ReLU.

    Variables are created as ``<name>_weight`` and ``<name>_bias``; the
    batch-norm layer (if any) is named ``<name>_bn``.

    Args:
        name: Base name for the op and its variables.
        input: Input blob; ``input.shape[1]`` is used as the kernel's
            input-channel dimension.
        filters: Number of output channels.
        kernel_size: Side length of the square kernel.
        strides, padding, data_format, dilation_rate: Forwarded to
            ``flow.nn.conv2d``.
        activation: ``"Relu"`` or ``None``. With ``None`` the conv (+bias)
            output is returned as is and batch norm is NOT applied.
        use_bias: Whether to add a bias variable.
        weight_initializer: Initializer for the kernel variable.
        bias_initializer: Initializer for the bias variable.
        bn: Whether to apply batch norm before the ReLU.

    Returns:
        The output blob.

    Raises:
        NotImplementedError: If ``activation`` is neither ``None`` nor
            ``"Relu"``.
    """
    blob_dtype = input.dtype
    kernel = flow.get_variable(
        name + "_weight",
        shape=(filters, input.shape[1], kernel_size, kernel_size),
        dtype=blob_dtype,
        initializer=weight_initializer,
    )
    result = flow.nn.conv2d(
        input, kernel, strides, padding, data_format, dilation_rate, name=name
    )

    if use_bias:
        offset = flow.get_variable(
            name + "_bias",
            shape=(filters,),
            dtype=blob_dtype,
            initializer=bias_initializer,
        )
        result = flow.nn.bias_add(result, offset, data_format)

    if activation is None:
        return result
    if activation != "Relu":
        raise NotImplementedError

    # Batch norm is only ever applied on the ReLU path.
    if bn:
        result = _batch_norm(result, name + "_bn")
    return flow.nn.relu(result)
def lenet(images, cfg, optimizer, trainable=True, need_transpose=False,
          training=True, wd=1.0/32768, model_weight=True, bn=True):
    """Build a LeNet-style network: two conv/pool stages and three dense layers.

    Args:
        images: Input blob. Transposed with perm ``[0, 3, 1, 2]`` first when
            ``need_transpose`` is set.
        cfg: Layer widths: ``cfg[0]``/``cfg[1]`` are the conv filter counts,
            ``cfg[2]``..``cfg[4]`` the dense unit counts.
        optimizer: Forwarded to ``modelWeight.addConv`` / ``addDense``.
        trainable: Forwarded to the dense layers.
        need_transpose: Whether to transpose ``images`` before the first conv.
        training: Not used by this function.
        wd: Not used by this function.
        model_weight: When equal to ``True``, layer shapes/dtypes are
            registered through ``modelWeight``.
        bn: Whether the conv layers apply batch norm.

    Returns:
        The output blob of the last dense layer (``dense2``).
    """
    if need_transpose:
        images = flow.transpose(images, name="transpose", perm=[0, 3, 1, 2])

    conv0 = conv2d_layer(name="conv0", input=images, filters=cfg[0], kernel_size=5,
                         padding="VALID", strides=1, bn=bn)
    pool0 = flow.nn.max_pool2d(conv0, 2, 2, "VALID", "NCHW", name="pool0")
    conv1 = conv2d_layer(name="conv1", input=pool0, filters=cfg[1], kernel_size=5,
                         padding="VALID", strides=1, bn=bn)
    pool1 = flow.nn.max_pool2d(conv1, 2, 2, "VALID", "NCHW", name="pool1")
    flat = flow.reshape(pool1, [pool1.shape[0], -1])

    def _dense(layer_name, layer_input, units, activation):
        # All dense layers share the same initializer and bias settings.
        return flow.layers.dense(
            inputs=layer_input,
            units=units,
            activation=activation,
            use_bias=True,
            kernel_initializer=flow.random_normal_initializer(mean=0, stddev=0.1),
            trainable=trainable,
            name=layer_name)

    dense0 = _dense("dense0", flat, cfg[2], flow.nn.relu)
    dense1 = _dense("dense1", dense0, cfg[3], flow.nn.relu)
    dense2 = _dense("dense2", dense1, cfg[4], None)

    def _dense_weight_spec(blob, units):
        # Blobs with more than two axes are flattened onto their last axis
        # before the (units, in_features) weight shape is derived.
        if len(blob.shape) > 2:
            blob = flow.reshape(blob, (-1, blob.shape[-1]))
        return (units, blob.shape[1]), blob.dtype

    if model_weight == True:
        modelWeight.addConv(index=0, dtype=conv0.dtype,
                            shape1=(cfg[0], images.shape[1], 5, 5), shape2=(cfg[0],),
                            optimizer=optimizer)
        modelWeight.addConv(index=1, dtype=conv1.dtype,
                            shape1=(cfg[1], conv0.shape[1], 5, 5), shape2=(cfg[1],),
                            optimizer=optimizer)

        shape_list = []
        dtype_list = []
        for source, units in ((flat, cfg[2]), (dense0, cfg[3]), (dense1, cfg[4])):
            weight_shape, weight_dtype = _dense_weight_spec(source, units)
            shape_list.append(weight_shape)
            dtype_list.append(weight_dtype)
        modelWeight.addDense(dtype_old=dtype_list, shape=shape_list,
                             optimizer=optimizer, dense_num=3)
    return dense2
@@ -0,0 +1,218 @@ | |||||
""" | |||||
Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
from __future__ import absolute_import | |||||
from __future__ import division | |||||
from __future__ import print_function | |||||
import oneflow as flow | |||||
from util.model_weights import modelWeight | |||||
# Number of residual blocks in each of the four stages (zipped with cfg in
# resnet_conv_x_body).
BLOCK_COUNTS = [3, 4, 6, 3]
# Running index used to name the bottleneck convs ("conv<N>"); reset to 0 by
# resnet50 and incremented by bottleneck_transformation.
NAME_NUMBER = 0
# A single conv layer
def _conv2d(name,
            input,
            filters,
            kernel_size,
            strides=1,
            padding="SAME",
            data_format="NCHW",
            dilations=1,
            use_bias=True,
            trainable=True,
            weight_initializer=flow.variance_scaling_initializer(data_format="NCHW"),
            bias_initializer=flow.zeros_initializer()):
    """Create a 2-D convolution with an optional bias.

    Variables are created as ``<name>_weight`` and ``<name>_bias``.

    Args:
        name: Base name for the op and its variables.
        input: Input blob; ``input.shape[1]`` is used as the kernel's
            input-channel dimension.
        filters: Number of output channels.
        kernel_size: Side length of the square kernel.
        strides, padding, data_format, dilations: Forwarded to
            ``flow.nn.conv2d``.
        use_bias: Whether to add a bias variable.
        trainable: Applied to both the weight and the bias variable.
        weight_initializer: Initializer for the kernel variable.
        bias_initializer: Initializer for the bias variable.

    Returns:
        The output blob of the convolution (after bias, if enabled).
    """
    weight = flow.get_variable(name + "_weight",
                               shape=(filters, input.shape[1], kernel_size, kernel_size),
                               dtype=input.dtype,
                               initializer=weight_initializer,
                               trainable=trainable)
    output = flow.nn.conv2d(input, weight, strides, padding, data_format, dilations, name=name)
    if use_bias:
        # The bias must honor `trainable` as well; previously only the weight
        # did, so trainable=False still left the bias trainable.
        bias = flow.get_variable(name + "_bias",
                                 shape=(filters,),
                                 dtype=input.dtype,
                                 initializer=bias_initializer,
                                 trainable=trainable)
        output = flow.nn.bias_add(output, bias, data_format)
    return output
# A single batch-norm layer
def _batch_norm(inputs, name=None, trainable=True):
    """Apply batch normalization along axis 1.

    Args:
        inputs: Blob to normalize.
        name: Optional name forwarded to the batch-normalization layer.
        trainable: Forwarded to the layer's ``trainable`` argument.

    Returns:
        The result of ``flow.layers.batch_normalization``.
    """
    bn_options = dict(
        axis=1,
        momentum=0.997,
        epsilon=1.001e-5,
        center=True,
        scale=True,
        trainable=trainable,
        name=name,
    )
    return flow.layers.batch_normalization(inputs=inputs, **bn_options)
# conv -> (bn) -> (relu)
def conv2d_affine(input, name, filters, kernel_size, strides, bn, activation=None):
    """Convolution followed by optional batch norm and optional ReLU.

    Args:
        input: Input blob (data format must be NCHW; this is not checked).
        name: Name of the conv; the batch norm is named ``<name>_bn``.
        filters: Number of output channels.
        kernel_size: Side length of the square kernel.
        strides: Convolution stride.
        bn: Whether to apply batch norm after the conv.
        activation: ``"Relu"`` applies a ReLU; any other value applies none.

    Returns:
        The output blob.
    """
    # Only a 1x1, stride-1 conv uses VALID padding; everything else uses SAME.
    if strides > 1 or kernel_size > 1:
        padding = "SAME"
    else:
        padding = "VALID"

    result = _conv2d(name, input, filters, kernel_size, strides, padding)
    if bn:
        result = _batch_norm(result, name + "_bn")
    if activation == "Relu":
        result = flow.nn.relu(result)
    return result
# Three conv2d_affine layers (conv, bn, relu)
def bottleneck_transformation(input, filter1, filter2, filter3,
                              strides, bn, model_weight, optimizer):
    """Build the 1x1 -> 3x3 -> 1x1 bottleneck branch of a residual block.

    Each conv is named ``conv<NAME_NUMBER>`` and the module-level
    ``NAME_NUMBER`` counter is incremented after every conv. The first two
    convs are followed by a ReLU; the last one is not.

    Args:
        input: Input blob.
        filter1, filter2, filter3: Output channels of the three convs.
        strides: Stride of the middle (3x3) conv; the 1x1 convs use stride 1.
        bn: Whether each conv is followed by batch norm.
        model_weight: When equal to ``True``, each conv is registered through
            ``modelWeight.addConv``.
        optimizer: Forwarded to ``modelWeight.addConv``.

    Returns:
        The output blob of the third conv.
    """
    global NAME_NUMBER

    # (filters, kernel size, stride, activation) for the three stacked convs.
    layer_specs = (
        (filter1, 1, 1, "Relu"),
        (filter2, 3, strides, "Relu"),
        (filter3, 1, 1, None),
    )

    current = input
    for filters, ksize, stride, act in layer_specs:
        produced = conv2d_affine(current, "conv"+str(NAME_NUMBER), filters, ksize,
                                 stride, bn, activation=act)
        # Register this conv's weight shapes, derived from the layer's input.
        if model_weight == True:
            modelWeight.addConv(index=NAME_NUMBER,
                                dtype=current.dtype,
                                shape1=(filters, current.shape[1], ksize, ksize),
                                shape2=(filters,),
                                optimizer=optimizer)
        NAME_NUMBER += 1
        current = produced
    return current
def residual_block(input, index, i, filter1, filter2, filter3,
                   strides_init, bn, model_weight, optimizer):
    """Build one residual block: relu(bottleneck(input) + shortcut(input)).

    The shortcut is always a 3x3 conv with ``filter3`` output channels and
    stride ``strides_init``, so that its output shape matches the bottleneck
    branch and the two can be added.

    Args:
        input: Input blob.
        index: Stage index, used in the shortcut's name.
        i: Block index within the stage, used in the shortcut's name.
        filter1, filter2, filter3: Channels of the three bottleneck convs.
        strides_init: Stride of the shortcut conv and of the 3x3 bottleneck conv.
        bn: Whether convs are followed by batch norm.
        model_weight: When equal to ``True``, convs are registered through
            ``modelWeight.addConv``.
        optimizer: Forwarded to ``modelWeight.addConv``.

    Returns:
        The output blob of the block.
    """
    shortcut_tag = "_shortcut"+str(index)+"_"+str(i)

    # Project the input so it can be added to the bottleneck output.
    shortcut = conv2d_affine(input, "conv"+shortcut_tag, filter3, 3,
                             strides_init, bn)
    # Register the shortcut conv.
    if model_weight == True:
        modelWeight.addConv(index=shortcut_tag,
                            dtype=input.dtype,
                            shape1=(filter3, input.shape[1], 3, 3),
                            shape2=(filter3,),
                            optimizer=optimizer)

    # Main branch: three conv2d_affine layers.
    main_branch = bottleneck_transformation(input, filter1, filter2, filter3,
                                            strides_init, bn, model_weight, optimizer)
    return flow.nn.relu(main_branch + shortcut)
def residual_stage(input, index, counts, cfg, bn, model_weight, optimizer, stride_init=2):
    """Chain ``counts`` residual blocks into one stage.

    Args:
        input: Input blob of the stage.
        index: Stage index, forwarded to each block for naming.
        counts: Number of residual blocks in the stage.
        cfg: Flat sequence of filter counts, three consecutive entries per block.
        bn: Whether convs are followed by batch norm.
        model_weight: Forwarded to each residual block.
        optimizer: Forwarded to each residual block.
        stride_init: Stride used by the first block; later blocks use stride 1.

    Returns:
        The output blob of the last block.
    """
    blob = input
    for block_no in range(counts):
        base = block_no * 3
        stride = stride_init if block_no == 0 else 1
        blob = residual_block(blob, index, block_no,
                              cfg[base+0], cfg[base+1], cfg[base+2],
                              stride, bn, model_weight, optimizer)
    return blob
# Main body of resnet50
def resnet_conv_x_body(input, cfg, bn, model_weight, optimizer, on_stage_end=lambda x: x):
    """Stack the residual stages described by ``BLOCK_COUNTS`` and ``cfg``.

    Args:
        input: Input blob (output of the stem).
        cfg: Per-stage filter configurations; paired with ``BLOCK_COUNTS``
            via ``zip``, so surplus entries are ignored.
        bn: Whether convs are followed by batch norm.
        model_weight: Forwarded to every stage.
        optimizer: Forwarded to every stage.
        on_stage_end: Callback invoked with each stage's output; its return
            value is ignored.

    Returns:
        The output blob of the last stage.
    """
    blob = input
    stage_no = 0
    for block_count, stage_cfg in zip(BLOCK_COUNTS, cfg):
        # The first stage keeps the resolution; later stages use stride 2.
        first_stride = 1 if stage_no == 0 else 2
        blob = residual_stage(blob, stage_no, block_count, stage_cfg, bn,
                              model_weight, optimizer, first_stride)
        on_stage_end(blob)
        stage_no += 1
    return blob
# Initial conv layer
def resnet_stem(input, bn, model_weight, optimizer):
    """Build the stem: 7x7/2 conv with 64 filters, optional BN, ReLU, 3x3/2 max-pool.

    Args:
        input: Input image blob.
        bn: Whether to apply batch norm after the conv.
        model_weight: When equal to ``True``, the stem conv is registered
            through ``modelWeight.addConv`` under index ``"_stem"``.
        optimizer: Forwarded to ``modelWeight.addConv``.

    Returns:
        The output blob of the max-pool.
    """
    stem_filters = 64
    stem_kernel = 7

    features = _conv2d("conv_stem", input, stem_filters, stem_kernel, 2)
    if bn:
        features = _batch_norm(features, "conv_stem_bn")
    features = flow.nn.relu(features)
    pooled = flow.nn.max_pool2d(
        features, ksize=3, strides=2, padding="VALID", data_format="NCHW", name="pool1",
    )

    # Register the stem conv.
    if model_weight == True:
        modelWeight.addConv(index="_stem", dtype=input.dtype,
                            shape1=(stem_filters, input.shape[1], stem_kernel, stem_kernel),
                            shape2=(stem_filters,),
                            optimizer=optimizer)
    return pooled
def resnet50(images, cfg, optimizer, trainable=True, need_transpose=False,
             model_weight=True, bn=True):
    """Build the ResNet-50 style network: stem, residual stages, avg-pool, dense.

    Resets the module-level ``NAME_NUMBER`` counter to 0 before building.

    Args:
        images: Input blob. Transposed with perm ``[0, 3, 1, 2]`` first when
            ``need_transpose`` is set.
        cfg: Configuration; the leading entries are the per-stage filter
            lists and ``cfg[4]`` is the number of units of the dense layer.
        optimizer: Forwarded to the ``modelWeight`` registration calls.
        trainable: Forwarded to the dense layer.
        need_transpose: Whether to transpose ``images`` first.
        model_weight: When equal to ``True``, layers are registered through
            ``modelWeight``.
        bn: Whether convs are followed by batch norm.

    Returns:
        The output blob of the dense layer (``dense0``).
    """
    global NAME_NUMBER

    if need_transpose:
        images = flow.transpose(images, name="transpose", perm=[0, 3, 1, 2])
    NAME_NUMBER = 0

    stem = resnet_stem(images, bn, model_weight, optimizer)
    body = resnet_conv_x_body(stem, cfg, bn, model_weight, optimizer, lambda x: x)
    pooled = flow.nn.avg_pool2d(
        body, ksize=7, strides=1, padding="VALID", data_format="NCHW", name="pool5",
    )
    flat = flow.reshape(pooled, [pooled.shape[0], -1])

    dense0 = flow.layers.dense(
        inputs=flat,
        units=cfg[4],
        use_bias=True,
        kernel_initializer=flow.xavier_uniform_initializer(),
        bias_initializer=flow.zeros_initializer(),
        trainable=trainable,
        name="dense0",)

    def _dense_weight_spec(blob, units):
        # Blobs with more than two axes are flattened onto their last axis
        # before the (units, in_features) weight shape is derived.
        if len(blob.shape) > 2:
            blob = flow.reshape(blob, (-1, blob.shape[-1]))
        return (units, blob.shape[1]), blob.dtype

    # Register the dense layer.
    if model_weight == True:
        weight_shape, weight_dtype = _dense_weight_spec(flat, cfg[4])
        modelWeight.addDense(dtype_old=[weight_dtype], shape=[weight_shape],
                             optimizer=optimizer, dense_num=1)
    return dense0
@@ -0,0 +1,200 @@ | |||||
""" | |||||
Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
from __future__ import absolute_import | |||||
from __future__ import division | |||||
from __future__ import print_function | |||||
import oneflow as flow | |||||
from util.model_weights import modelWeight | |||||
def _batch_norm(inputs, name=None, trainable=True):
    """Apply batch normalization along axis 1 with an L1 penalty on gamma.

    Args:
        inputs: Blob to normalize.
        name: Optional name forwarded to the batch-normalization layer.
        trainable: Forwarded to the layer's ``trainable`` argument.

    Returns:
        The result of ``flow.layers.batch_normalization``.
    """
    bn_options = dict(
        axis=1,
        momentum=0.997,
        epsilon=1.001e-5,
        center=True,
        scale=True,
        # L1 regularization on the scale factors (gamma).
        gamma_regularizer=flow.regularizers.l1(1e-4),
        trainable=trainable,
        name=name,
    )
    return flow.layers.batch_normalization(inputs=inputs, **bn_options)
def conv2d_layer(
    name,
    input,
    filters,
    kernel_size=3,
    strides=1,
    padding="SAME",
    data_format="NCHW",
    dilation_rate=1,
    activation="Relu",
    use_bias=True,
    weight_initializer=flow.variance_scaling_initializer(2, 'fan_out', 'random_normal', data_format="NCHW"),
    bias_initializer=flow.zeros_initializer(),
    bn=True,
):
    """Build a 2-D convolution with optional bias, batch norm and ReLU.

    Variables are created as ``<name>_weight`` and ``<name>_bias``; the
    batch-norm layer (if any) is named ``<name>_bn``.

    Args:
        name: Base name for the op and its variables.
        input: Input blob; ``input.shape[1]`` is used as the kernel's
            input-channel dimension.
        filters: Number of output channels.
        kernel_size: Side length of the square kernel.
        strides, padding, data_format, dilation_rate: Forwarded to
            ``flow.nn.conv2d``.
        activation: ``"Relu"`` or ``None``. With ``None`` the conv (+bias)
            output is returned as is and batch norm is NOT applied.
        use_bias: Whether to add a bias variable.
        weight_initializer: Initializer for the kernel variable.
        bias_initializer: Initializer for the bias variable.
        bn: Whether to apply batch norm before the ReLU.

    Returns:
        The output blob.

    Raises:
        NotImplementedError: If ``activation`` is neither ``None`` nor
            ``"Relu"``.
    """
    blob_dtype = input.dtype
    kernel = flow.get_variable(
        name + "_weight",
        shape=(filters, input.shape[1], kernel_size, kernel_size),
        dtype=blob_dtype,
        initializer=weight_initializer,
    )
    result = flow.nn.conv2d(
        input, kernel, strides, padding, data_format, dilation_rate, name=name
    )

    if use_bias:
        offset = flow.get_variable(
            name + "_bias",
            shape=(filters,),
            dtype=blob_dtype,
            initializer=bias_initializer,
        )
        result = flow.nn.bias_add(result, offset, data_format)

    if activation is None:
        return result
    if activation != "Relu":
        raise NotImplementedError

    # Batch norm is only ever applied on the ReLU path.
    if bn:
        result = _batch_norm(result, name + "_bn")
    return flow.nn.relu(result)
def _conv_block(in_blob, index, filters, conv_times, optimizer, model_weight, bn=True):
    """Stack ``conv_times`` 3x3 convs, named ``conv<index>``, ``conv<index+1>``, ...

    Args:
        in_blob: Input blob of the first conv.
        index: Global index of the first conv; selects both its name and its
            entry in ``filters``.
        filters: Sequence of filter counts indexed by global conv index.
        conv_times: Number of convs to stack.
        optimizer: Forwarded to ``modelWeight.addConv``.
        model_weight: When equal to ``True``, each conv is registered through
            ``modelWeight.addConv``.
        bn: Whether each conv applies batch norm.

    Returns:
        A list ``[in_blob, conv_out_1, ..., conv_out_n]``; the last element
        is the output of the block.
    """
    blobs = [in_blob]
    for layer_index in range(index, index + conv_times):
        source = blobs[-1]
        out_channels = filters[layer_index]
        conv_out = conv2d_layer(
            name="conv{}".format(layer_index),
            input=source,
            filters=out_channels,
            kernel_size=3,
            strides=1,
            bn=bn,
        )
        if model_weight == True:
            modelWeight.addConv(index=layer_index,
                                dtype=source.dtype,
                                shape1=(out_channels, source.shape[1], 3, 3),
                                shape2=(out_channels,),
                                optimizer=optimizer)
        blobs.append(conv_out)
    return blobs
def vgg(images, cfg, optimizer, trainable=True, need_transpose=False,
        training=True, wd=1.0/32768, model_weight=True, bn=True):
    """Build a VGG-style network: five conv/pool stages and three dense layers.

    The conv stages hold 2, 2, 3, 3 and 3 convs (global conv indices 0-12),
    each stage followed by a 2x2 max-pool named ``pool1``..``pool5``.

    Args:
        images: Input blob. Transposed with perm ``[0, 3, 1, 2]`` first when
            ``need_transpose`` is set.
        cfg: Layer widths: ``cfg[0]``..``cfg[12]`` are the conv filter counts,
            ``cfg[13]``..``cfg[15]`` the dense unit counts.
        optimizer: Forwarded to the ``modelWeight`` registration calls.
        trainable: Forwarded to the dense layers.
        need_transpose: Whether to transpose ``images`` first.
        training: Not used by this function.
        wd: Not used by this function.
        model_weight: When equal to ``True``, layers are registered through
            ``modelWeight``.
        bn: Whether the conv layers apply batch norm.

    Returns:
        The output blob of the last dense layer (``dense2``).
    """
    if need_transpose:
        images = flow.transpose(images, name="transpose", perm=[0, 3, 1, 2])

    # (index of the first conv, number of convs) for each stage.
    stage_layout = ((0, 2), (2, 2), (4, 3), (7, 3), (10, 3))
    features = images
    for stage_no, (first_conv, repeats) in enumerate(stage_layout, start=1):
        stage_blobs = _conv_block(features, first_conv, cfg, repeats, optimizer,
                                  model_weight, bn=bn)
        features = flow.nn.max_pool2d(stage_blobs[-1], 2, 2, "VALID", "NCHW",
                                      name="pool{}".format(stage_no))

    flat = flow.reshape(features, [features.shape[0], -1])

    def _dense(layer_name, layer_input, units, activation):
        # All dense layers share the same initializer and bias settings.
        return flow.layers.dense(
            inputs=layer_input,
            units=units,
            activation=activation,
            use_bias=True,
            kernel_initializer=flow.random_normal_initializer(mean=0, stddev=0.1),
            trainable=trainable,
            name=layer_name,
        )

    dense0 = _dense("dense0", flat, cfg[13], flow.nn.relu)
    dense1 = _dense("dense1", dense0, cfg[14], flow.nn.relu)
    dense2 = _dense("dense2", dense1, cfg[15], None)

    def _dense_weight_spec(blob, units):
        # Blobs with more than two axes are flattened onto their last axis
        # before the (units, in_features) weight shape is derived.
        if len(blob.shape) > 2:
            blob = flow.reshape(blob, (-1, blob.shape[-1]))
        return (units, blob.shape[1]), blob.dtype

    if model_weight == True:
        shape_list = []
        dtype_list = []
        for source, units in ((flat, cfg[13]), (dense0, cfg[14]), (dense1, cfg[15])):
            weight_shape, weight_dtype = _dense_weight_spec(source, units)
            shape_list.append(weight_shape)
            dtype_list.append(weight_dtype)
        modelWeight.addDense(dtype_old=dtype_list, shape=shape_list,
                             optimizer=optimizer, dense_num=3)
    return dense2
@@ -0,0 +1,118 @@ | |||||
""" | |||||
Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
import oneflow as flow | |||||
from util.model_weights import modelWeight | |||||
# A DNN with 2 dense layers: the first uses a ReLU activation, the second
# uses no activation.
def dnn_2(input_tensor, cfg, optimizer, model_weight=True, trainable=True):
    """Build a two-layer fully connected network.

    Args:
        input_tensor: Input blob; flattened to ``[batch, -1]`` first.
        cfg: ``cfg[0]`` and ``cfg[1]`` are the unit counts of the two layers.
        optimizer: Forwarded to ``modelWeight.addDense``.
        model_weight: When equal to ``True``, the dense layers are registered
            through ``modelWeight.addDense``.
        trainable: Forwarded to the dense layers.

    Returns:
        The output blob of the second dense layer (``dense1``).
    """
    input_tensor = flow.reshape(input_tensor, [input_tensor.shape[0], -1])

    def _dense(layer_name, layer_input, units, activation):
        # Both layers share the same initializer and bias settings.
        return flow.layers.dense(
            inputs=layer_input,
            units=units,
            activation=activation,
            use_bias=True,
            kernel_initializer=flow.random_normal_initializer(mean=0, stddev=0.1),
            trainable=trainable,
            name=layer_name)

    dense0 = _dense("dense0", input_tensor, cfg[0], flow.nn.relu)
    dense1 = _dense("dense1", dense0, cfg[1], None)

    def _dense_weight_spec(blob, units):
        # Blobs with more than two axes are flattened onto their last axis
        # before the (units, in_features) weight shape is derived.
        if len(blob.shape) > 2:
            blob = flow.reshape(blob, (-1, blob.shape[-1]))
        return (units, blob.shape[1]), blob.dtype

    if model_weight == True:
        shape_list = []
        dtype_list = []
        for source, units in ((input_tensor, cfg[0]), (dense0, cfg[1])):
            weight_shape, weight_dtype = _dense_weight_spec(source, units)
            shape_list.append(weight_shape)
            dtype_list.append(weight_dtype)
        modelWeight.addDense(dtype_old=dtype_list, shape=shape_list,
                             optimizer=optimizer, dense_num=2)
    return dense1
def dnn_4(input_tensor, cfg, optimizer, model_weight=True, trainable=True):
    """Build a four-layer fully connected network.

    The first three layers use a ReLU activation; the last one uses none.

    Args:
        input_tensor: Input blob; flattened to ``[batch, -1]`` first.
        cfg: ``cfg[0]``..``cfg[3]`` are the unit counts of the four layers.
        optimizer: Forwarded to ``modelWeight.addDense``.
        model_weight: When equal to ``True``, the dense layers are registered
            through ``modelWeight.addDense``.
        trainable: Forwarded to the dense layers.

    Returns:
        The output blob of the last dense layer (``dense3``).
    """
    input_tensor = flow.reshape(input_tensor, [input_tensor.shape[0], -1])

    activations = (flow.nn.relu, flow.nn.relu, flow.nn.relu, None)
    # blobs[k] is the input of layer k; blobs[-1] is the network output.
    blobs = [input_tensor]
    for layer_no, activation in enumerate(activations):
        blobs.append(flow.layers.dense(
            inputs=blobs[-1],
            units=cfg[layer_no],
            activation=activation,
            use_bias=True,
            kernel_initializer=flow.random_normal_initializer(mean=0, stddev=0.1),
            trainable=trainable,
            name="dense{}".format(layer_no)))

    def _dense_weight_spec(blob, units):
        # Blobs with more than two axes are flattened onto their last axis
        # before the (units, in_features) weight shape is derived.
        if len(blob.shape) > 2:
            blob = flow.reshape(blob, (-1, blob.shape[-1]))
        return (units, blob.shape[1]), blob.dtype

    if model_weight == True:
        shape_list = []
        dtype_list = []
        for layer_no in range(len(activations)):
            weight_shape, weight_dtype = _dense_weight_spec(blobs[layer_no], cfg[layer_no])
            shape_list.append(weight_shape)
            dtype_list.append(weight_dtype)
        modelWeight.addDense(dtype_old=dtype_list, shape=shape_list,
                             optimizer=optimizer, dense_num=4)
    return blobs[-1]
@@ -0,0 +1 @@ | |||||
# Oneflow格式数据集 |
@@ -0,0 +1,202 @@ | |||||
""" | |||||
Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
import oneflow.core.record.record_pb2 as ofrecord | |||||
import six | |||||
import struct | |||||
import numpy as np | |||||
import json | |||||
import os | |||||
import argparse | |||||
# Command-line options. Arguments are parsed at import time, so `args` is
# available as a module-level name (read by main()).
parser = argparse.ArgumentParser()
# Name of the dataset folder; data2of() reads ./myData/<dataName>/ and writes
# ./ofData/<dataName>/.
parser.add_argument("--dataName", default="randomData1",
                    type=str, help="my data name")
args = parser.parse_args()
#%% | |||||
def int32_feature(value):
    """Wrap ``value`` in an OFRecord ``Feature`` holding an ``Int32List``.

    A value that is not a list or tuple is wrapped in a one-element list.
    """
    values = value if isinstance(value, (list, tuple)) else [value]
    return ofrecord.Feature(int32_list=ofrecord.Int32List(value=values))
def int64_feature(value):
    """Wrap ``value`` in an OFRecord ``Feature`` holding an ``Int64List``.

    A value that is not a list or tuple is wrapped in a one-element list.
    """
    values = value if isinstance(value, (list, tuple)) else [value]
    return ofrecord.Feature(int64_list=ofrecord.Int64List(value=values))
def float_feature(value):
    """Wrap ``value`` in an OFRecord ``Feature`` holding a ``FloatList``.

    A value that is not a list or tuple is wrapped in a one-element list.
    """
    values = value if isinstance(value, (list, tuple)) else [value]
    return ofrecord.Feature(float_list=ofrecord.FloatList(value=values))
def double_feature(value):
    """Wrap ``value`` in an OFRecord ``Feature`` holding a ``DoubleList``.

    A value that is not a list or tuple is wrapped in a one-element list.
    """
    values = value if isinstance(value, (list, tuple)) else [value]
    return ofrecord.Feature(double_list=ofrecord.DoubleList(value=values))
def bytes_feature(value):
    """Wrap ``value`` in an OFRecord ``Feature`` holding a ``BytesList``.

    A value that is not a list or tuple is wrapped in a one-element list.
    On Python 3, every ``str`` element is encoded with ``str.encode()``;
    elements that are already bytes are passed through unchanged.

    An empty list or tuple is accepted and produces an empty ``BytesList``
    (previously it raised ``IndexError`` on Python 3).
    """
    if not isinstance(value, (list, tuple)):
        value = [value]
    if not six.PY2:
        # Decide per element instead of looking only at value[0]: this copes
        # with empty input and with lists that mix str and bytes.
        value = [x.encode() if isinstance(x, str) else x for x in value]
    return ofrecord.Feature(bytes_list=ofrecord.BytesList(value=value))
#%% Randomly generate a 3*32*32 training set (1000 samples) and test set
#   (200 samples) with values in [0, 1)
def createRandomData_1(num_train=1000, num_test=200, save_dir="./myData/randomData1"):
    """Write random float image data to ``train.json`` and ``test.json``.

    Each JSON file holds ``"data"`` (one flat list of 3*32*32 floats per
    sample, rounded to 4 decimals), ``"label"`` (random ints in [0, 10)) and
    ``"shape"`` (``[3, 32, 32]``).

    Args:
        num_train: Number of training samples.
        num_test: Number of test samples.
        save_dir: Output directory; created if it does not exist.
    """
    # The output folder was previously assumed to exist.
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    for split, count in (("train", num_train), ("test", num_test)):
        # np.around returns a new array; the original discarded the result,
        # so the data was never actually rounded.
        data = np.around(np.random.random((count, 3*32*32)), 4)
        label = np.random.randint(0, 10, (count))
        payload = {
            "data": data.tolist(),
            "label": label.tolist(),
            "shape": [3, 32, 32],
        }
        with open(os.path.join(save_dir, split + ".json"), "w") as f_out:
            json.dump(payload, f_out, indent=4)
#%% Randomly generate a 3*32*32 training set (1000 samples) and test set
#   (200 samples) with integer values in 1-255
def createRandomData_255(num_train=1000, num_test=200, save_dir="./myData/randomData255_small"):
    """Write random integer image data to ``train.json`` and ``test.json``.

    Each JSON file holds ``"data"`` (one flat list of 3*32*32 ints in
    [1, 255] per sample), ``"label"`` (random ints in [0, 10)) and
    ``"shape"`` (``[3, 32, 32]``).

    Args:
        num_train: Number of training samples.
        num_test: Number of test samples.
        save_dir: Output directory; created if it does not exist.
    """
    # The output folder was previously assumed to exist.
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    for split, count in (("train", num_train), ("test", num_test)):
        # randint's upper bound is exclusive: 256 is needed to include 255,
        # matching the documented 1-255 range. (The former np.around call on
        # integer data was a discarded no-op and has been dropped.)
        data = np.random.randint(1, 256, (count, 3*32*32))
        label = np.random.randint(0, 10, (count))
        payload = {
            "data": data.tolist(),
            "label": label.tolist(),
            "shape": [3, 32, 32],
        }
        with open(os.path.join(save_dir, split + ".json"), "w") as f_out:
            json.dump(payload, f_out, indent=4)
#%% compute per-channel mean and std
def mean_std(data, shape):
    """Return per-channel mean and standard deviation, rounded to 2 decimals.

    Args:
        data: Array reshaped to ``(-1, shape[0], shape[1], shape[2])``.
        shape: Per-sample shape; ``shape[0]`` is the number of channels.

    Returns:
        ``(mean_list, std_list)``, one entry per channel. When a channel's
        mean is <= 1, both its mean and std are multiplied by 255 before
        rounding.
    """
    samples = data.reshape(-1, shape[0], shape[1], shape[2])
    mean_list = []
    std_list = []
    for channel in range(shape[0]):
        plane = samples[:, channel, :, :]
        channel_mean = np.mean(plane)
        channel_std = np.std(plane)
        if channel_mean > 1:
            mean_list.append(np.around(channel_mean, 2))
            std_list.append(np.around(channel_std, 2))
        else:
            # A mean of at most 1 is rescaled to the 0-255 range.
            mean_list.append(np.around(channel_mean*255, 2))
            std_list.append(np.around(channel_std*255, 2))
    return mean_list, std_list
#%% convert data to ofData
def data2of_part(datas, labels, save_path):
    """Serialize samples as OFRecords into a single file.

    For every label, a record with an ``'images'`` float feature (from
    ``datas[i].tolist()``) and a ``'labels'`` int32 feature is written as a
    ``struct.pack("q", ...)`` length prefix followed by the serialized bytes.

    Args:
        datas: Indexable collection of arrays, one per sample.
        labels: Sequence of labels; its length determines how many records
            are written.
        save_path: Path of the output file (opened in binary write mode).
    """
    # `with` guarantees the file is closed even if serialization raises.
    with open(save_path, "wb") as f:
        for loop, label_value in enumerate(labels):
            topack = {
                'images': float_feature(datas[loop].tolist()),
                'labels': int32_feature([label_value]),
            }
            ofrecord_features = ofrecord.OFRecord(feature=topack)
            serilizedBytes = ofrecord_features.SerializeToString()
            length = ofrecord_features.ByteSize()
            f.write(struct.pack("q", length))
            f.write(serilizedBytes)
    print("Write ofData to", save_path)
#%% load mydata and write ofData
def data2of(dataName):
    """Convert ``./myData/<dataName>`` JSON data into ``./ofData/<dataName>``.

    Reads ``train.json`` and ``test.json``, writes one OFRecord part file per
    split (``train/part-00000`` and ``test/part-00000``) and a ``meta.json``
    with the class count, image shape, per-channel mean/std (computed over
    train and test together) and the number of train/test examples.

    Args:
        dataName: Name of the dataset folder under ``./myData/``.
    """
    source_dir = "./myData/" + dataName
    target_dir = "./ofData/" + dataName
    save_path_train = target_dir + "/train/"
    save_path_test = target_dir + "/test/"

    # Create the output folders on first use.
    for folder in (save_path_train, save_path_test):
        if not os.path.exists(folder):
            os.makedirs(folder)
            print("create folder", folder)

    # Load the JSON dumps.
    with open(source_dir + "/train.json") as f_train:
        train_dict = json.load(f_train)
    with open(source_dir + "/test.json") as f_test:
        test_dict = json.load(f_test)

    data_train = np.array(train_dict["data"])
    label_train = np.array(train_dict["label"])
    data_test = np.array(test_dict["data"])
    label_test = np.array(test_dict["label"])

    # Write the OFRecord part files.
    data2of_part(data_train, label_train, save_path_train+"part-00000")
    data2of_part(data_test, label_test, save_path_test+"part-00000")

    # Meta information is computed over train and test combined.
    all_data = np.append(data_train, data_test, axis=0)
    all_labels = np.append(label_train, label_test)
    shape = train_dict["shape"]
    mean_list, std_list = mean_std(all_data, shape)

    dict_meta = {
        "num_classes": len(set(all_labels)),
        "image_shape": shape,
        "rgb_mean": mean_list,
        "rgb_std": std_list,
        "num_examples": data_train.shape[0],
        "num_val_examples": data_test.shape[0],
    }
    meta_path = target_dir + "/meta.json"
    with open(meta_path, "w") as f_meta:
        json.dump(dict_meta, f_meta, indent=4)
    print("Write meta infomation to", meta_path)
def main():
    """Script entry point: convert the dataset selected by ``args.dataName``."""
    data2of(args.dataName)


if __name__ == "__main__":
    main()
@@ -0,0 +1,4 @@ | |||||
# 模型输出文件夹 | |||||
- 模型输出文件夹,模型文件存储在snapshots文件夹下 | |||||
- 按照模型分别存放,各自模型下按照数据集存放,各自数据集下分为基模型model_base、剪枝模型model_prune、微调模型model_refine。 |
@@ -0,0 +1,225 @@ | |||||
""" | |||||
Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
from __future__ import absolute_import | |||||
from __future__ import division | |||||
from __future__ import print_function | |||||
import argparse | |||||
import numpy as np | |||||
import os | |||||
from util.model_weights import modelWeight | |||||
import util.prune_algorithm as pa | |||||
parser = argparse.ArgumentParser()

# Integer dtype codes (as found under the "dtype" key of a weights-profile
# entry) mapped to the NumPy dtype used to decode the stored tensor.
dtype_dict = {
    2: np.float32,
    3: np.float64,
    4: np.int8,
    5: np.int32,
    6: np.int64,
    9: np.float16,
}

# The default must be a string: argparse does not apply ``type`` to non-string
# defaults, and this script calls ``args.bn.lower()``, which fails on a bool.
parser.add_argument("--bn", default="False", type=str,
                    help="Whether to use bn layer (yes/true/t/y/1 to enable)")
parser.add_argument("--prune_method", default='bn', type=str,
                    help="method of prune (bn, conv_avg, conv_all, conv_max, random, "
                         "conv_similarity, bn_similarity, conv_threshold)")
parser.add_argument("--model_load_dir", default='./output/snapshots/model_base/snapshot_last',
                    type=str, required=False, help="Path of base oneflow model")
parser.add_argument("--model_save_dir", default='./output/snapshots/model_prune', type=str,
                    required=False, help="Path to the output OneFlow model.")
parser.add_argument("--percent", default=0.7, type=float, required=False,
                    help="scale sparse rate (default: 0.7)")
parser.add_argument("--optimizer", type=str, default="momentum", required=False,
                    help="sgd, adam, momentum")
args = parser.parse_args()
def _SaveWeightBlob2File(blob, folder, var):
    """Write the raw bytes of ``blob`` to the file ``folder/var``.

    Args:
        blob: Object exposing ``tobytes()`` (e.g. a NumPy array).
        folder: Destination directory; created when it does not exist.
        var: Name of the file to write inside ``folder``.
    """
    if not os.path.exists(folder):
        os.makedirs(folder)
    # The context manager closes the handle even when the write raises.
    with open(os.path.join(folder, var), 'wb') as f:
        f.write(blob.tobytes())
def _LoadWeightBlob2Numpy(shape, folder, dtype):
    """Read the binary file ``folder/out`` and return it as an array of ``shape``.

    Args:
        shape: Target shape passed to ``reshape``.
        folder: Directory that contains the ``out`` file.
        dtype: NumPy dtype used to decode the raw bytes.

    Returns:
        The decoded ``numpy.ndarray`` reshaped to ``shape``.

    Raises:
        IOError / OSError: if ``folder/out`` cannot be opened.
        ValueError: if the number of stored elements does not fit ``shape``.
    """
    if not os.path.exists(folder):
        print('fail to find', folder)
    filename = os.path.join(folder, 'out')
    # The file holds raw tensor bytes, so open it in binary mode; the context
    # manager releases the handle even if decoding fails.
    with open(filename, 'rb') as f:
        n = np.fromfile(f, dtype=dtype)
    return n.reshape(shape)
def name2array(name, weights_dict):
    """Load the stored tensor of variable ``name`` from ``args.model_load_dir``.

    Args:
        name: Variable name; also the sub-folder holding its ``out`` file.
        weights_dict: Mapping from variable name to a profile entry with the
            keys ``"shape"`` and ``"dtype"`` (an integer code of ``dtype_dict``).

    Returns:
        Tuple ``(array, dtype, shape)``: the loaded array, its NumPy dtype and
        the shape recorded in the profile.
    """
    var_folder = os.path.join(args.model_load_dir, name)
    entry = weights_dict[name]
    var_shape = entry["shape"]
    np_dtype = dtype_dict[entry["dtype"]]
    return _LoadWeightBlob2Numpy(var_shape, var_folder, np_dtype), np_dtype, var_shape
#制作待剪枝的namelist | |||||
def makeNameList(pruneName, nameList, name):
    """Append to ``nameList`` every variable name of layer ``name`` that must be pruned.

    The list covers the layer's weight and bias, its batch-norm variables when
    ``args.bn`` is enabled, and the optimizer state variables that belong to
    ``args.optimizer`` (``adam`` or ``momentum``; ``sgd`` adds none).

    Args:
        pruneName: Suffix of the variable the pruning decision is based on
            (``"_bn-gamma"`` or ``"_weight"``).
        nameList: List that is extended in place.
        name: Layer name prefix, e.g. ``"conv0"``.

    Returns:
        The same ``nameList`` object.
    """
    # str() keeps this working when args.bn is not a string (e.g. a bool
    # default), where calling .lower() directly would raise AttributeError.
    use_bn = str(args.bn).lower() in ('yes', 'true', 't', 'y', '1')

    if pruneName == '_bn-gamma':
        nameList.append(name + "_weight")
    elif pruneName == "_weight" and use_bn:
        nameList.append(name + "_bn-gamma")
    nameList.append(name + pruneName)
    nameList.append(name + "_bias")
    # Remaining batch-norm variables of the layer.
    if use_bn:
        nameList.append(name + "_bn-beta")
        nameList.append(name + "_bn-moving_variance")
        nameList.append(name + "_bn-moving_mean")

    if args.optimizer == 'adam':
        # Extra state variables kept by adam.
        nameList.append(name + "_weight-v")
        nameList.append(name + "_weight-m")
        nameList.append(name + "_bias-v")
        nameList.append(name + "_bias-m")
        if use_bn:
            nameList.append(name + "_bn-beta-v")
            nameList.append(name + "_bn-beta-m")
            nameList.append(name + "_bn-gamma-v")
            nameList.append(name + "_bn-gamma-m")
    elif args.optimizer == 'momentum':
        # Extra state variables kept by momentum.
        nameList.append(name + "_weight-momentum")
        nameList.append(name + "_bias-momentum")
        if use_bn:
            nameList.append(name + "_bn-beta-momentum")
            nameList.append(name + "_bn-gamma-momentum")
    elif args.optimizer != 'sgd':
        print('Error: optimizer!')
    return nameList
def prune():
    """Channel-prune the conv layers of the model stored in ``args.model_load_dir``.

    Loads the weights profile located next to the snapshot directory, asks the
    selected pruning method for the channel indexes to remove from every
    ``conv*`` variable ending in the method's suffix, deletes those channels
    from the layer's tensors (and the previous layer's removed channels from
    axis 1 of the weights), shrinks the input axis of ``dense0`` accordingly,
    and copies every other ``dense*`` variable unchanged.  Pruned tensors are
    written below ``args.model_save_dir/model`` and a new weights profile is
    saved to ``args.model_save_dir/weights_profile_path``.
    """
    import subprocess  # local import: only needed for the final copy step

    # Threshold for the selected pruning method (only these four use one).
    thre = None
    if args.prune_method == 'bn':
        thre = pa.get_pruneThre_bn()
    elif args.prune_method == 'conv_avg':
        thre = pa.get_pruneThre_conv_avg()
    elif args.prune_method == 'conv_all':
        thre = pa.get_pruneThre_conv_all()
    elif args.prune_method == 'conv_max':
        thre = pa.get_pruneThre_conv_max()

    of_weight_path = args.model_load_dir.rsplit("/", 1)[0] + "/weights_profile_path"
    weights_dict = modelWeight.load(of_weight_path)
    # Presumably clears the profile so that it only holds what
    # modelWeight.add() records below -- confirm against util.model_weights.
    modelWeight.weights_dict = {}

    fcRemoveIndexs = []
    fcDivideNum = 0
    removeIndexs = []
    lastRemoveIndexs = []
    beforePrune = 0
    afterPrune = 0

    # Suffix of the variable the pruning decision is based on.
    pruneName = ''
    if "bn" in args.prune_method:
        pruneName = "_bn-gamma"
    elif "conv" in args.prune_method or args.prune_method == "random":
        pruneName = "_weight"

    # Reverse lookup tables: NumPy dtype -> integer dtype code of the profile.
    dtypeCodes = list(dtype_dict.keys())
    dtypeValues = list(dtype_dict.values())
    weightSuffixes = ("weight", "weight-v", "weight-m", "weight-momentum")
    dense0Weights = ('dense0-weight', 'dense0-weight-v',
                     'dense0-weight-m', 'dense0-weight-momentum')

    for name, profile_dict in weights_dict.items():
        if name.startswith("conv") and name.endswith(pruneName):
            a, dtype, shape = name2array(name, weights_dict)
            lastRemoveIndexs = removeIndexs
            # Channel indexes to remove according to the pruning method.
            if args.prune_method == 'bn':
                removeIndexs = pa.get_removeIndex_bn(a, thre)
            elif args.prune_method == "conv_avg":
                removeIndexs = pa.get_removeIndex_conv_avg(a, shape, thre)
            elif args.prune_method == "conv_all":
                removeIndexs = pa.get_removeIndex_conv_all(a, shape, thre)
            elif args.prune_method == "conv_max":
                removeIndexs = pa.get_removeIndex_conv_max(a, shape, thre)
            elif args.prune_method == "random":
                removeIndexs = pa.get_removeIndex_random(shape)
            elif args.prune_method == "conv_similarity":
                removeIndexs = pa.get_removeIndex_conv_similarity(a, shape)
            elif args.prune_method == "bn_similarity":
                removeIndexs = pa.get_removeIndex_bn_similarity(a, shape)
            elif args.prune_method == "conv_threshold":
                removeIndexs = pa.get_removeIndex_conv_threshold(a, shape, threSet=0.06)

            # Never remove every channel: keep one by dropping the first index.
            if len(removeIndexs) == len(a):
                removeIndexs = np.delete(removeIndexs, 0)

            # Remember the channels removed from conv4; they are used to shrink
            # the input axis of dense0.  NOTE(review): 256 is presumably the
            # channel count of conv4 -- confirm against the model definition.
            if name == "conv4" + pruneName:
                fcRemoveIndexs = removeIndexs
                fcDivideNum = 256

            # All variable names that belong to this layer.
            layerName = name.split("_")[0].split("-")[0]
            nameList = makeNameList(pruneName, [], layerName)

            # Actual pruning.
            for varName in nameList:
                a, dtype, shape = name2array(varName, weights_dict)
                if varName.endswith(weightSuffixes):
                    b = np.delete(a, removeIndexs, 0)
                    b = np.delete(b, lastRemoveIndexs, 1)
                    if varName.endswith("weight"):
                        beforePrune += a.shape[0]
                        afterPrune += b.shape[0]
                else:
                    b = np.delete(a, removeIndexs)

                print(varName+" pruned: shape from", a.shape, "-->", b.shape)
                if args.model_save_dir:
                    folder = os.path.join(args.model_save_dir, "model", varName)
                    _SaveWeightBlob2File(b, folder, 'out')
                    modelWeight.add(varName, dtypeCodes[dtypeValues.index(dtype)], b.shape)

        # Dense layers: only the input axis of dense0 is pruned.
        elif name.startswith("dense"):
            a, dtype, shape = name2array(name, weights_dict)
            # fcDivideNum stays 0 when conv4 was never pruned; skip the index
            # expansion then instead of dividing by zero.
            if name in dense0Weights and fcDivideNum:
                fcRemoveIndexsNew = []
                num = int(a.shape[1]/fcDivideNum)
                for index in fcRemoveIndexs:
                    fcRemoveIndexsNew += [index+fcDivideNum*i for i in range(num)]
                b = np.delete(a, fcRemoveIndexsNew, 1)
            else:
                b = a

            print(name+" pruned: shape from", a.shape, "-->", b.shape)
            if args.model_save_dir:
                folder = os.path.join(args.model_save_dir, "model", name)
                _SaveWeightBlob2File(b, folder, 'out')
                modelWeight.add(name, dtypeCodes[dtypeValues.index(dtype)], b.shape)

    print("Pruning done! Number of channel from", beforePrune, "-->", afterPrune)
    if beforePrune:
        print("Real Pruning rate:", 100*(beforePrune-afterPrune)/beforePrune, "%")
    else:
        # No conv weight was processed; report 0 instead of dividing by zero.
        print("Real Pruning rate:", 0, "%")

    weights_profile_path = os.path.join(args.model_save_dir, "weights_profile_path")
    modelWeight.save(weights_profile_path)
    # Argument list instead of a shell string, so paths containing spaces or
    # shell metacharacters are passed to cp unchanged.
    subprocess.call(['cp', '-r',
                     '{0}/System-Train-TrainStep-TrainNet'.format(args.model_load_dir),
                     '{0}/System-Train-TrainStep-TrainNet'.format(
                         os.path.join(args.model_save_dir, "model"))])
def main():
    """Script entry point: run the pruning pass."""
    prune()


if __name__ == "__main__":
    main()
@@ -0,0 +1,158 @@ | |||||
""" | |||||
Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
from __future__ import absolute_import | |||||
from __future__ import division | |||||
from __future__ import print_function | |||||
import argparse | |||||
import numpy as np | |||||
import os | |||||
from util.model_weights import modelWeight | |||||
import util.prune_algorithm as pa | |||||
# Integer dtype codes (as found under the "dtype" key of a weights-profile
# entry) mapped to the NumPy dtype used to decode the stored tensor.
dtype_dict = {
    2: np.float32,
    3: np.float64,
    4: np.int8,
    5: np.int32,
    6: np.int64,
    9: np.float16,
}

# Command-line options of the dense-layer pruning script.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--model_load_dir",
    type=str,
    default='./output/snapshots/model_base/snapshot_last',
    help="Path of base oneflow model",
)
parser.add_argument(
    "--model_save_dir",
    type=str,
    default='./output/snapshots/model_prune',
    help="Path to the output OneFlow model.",
)
parser.add_argument(
    "--percent",
    type=float,
    default=0.7,
    help="scale sparse rate (default: 0.7)",
)
parser.add_argument(
    "--optimizer",
    type=str,
    default="momentum",
    help="sgd, adam, momentum",
)
args = parser.parse_args()
def _SaveWeightBlob2File(blob, folder, var):
    """Write the raw bytes of ``blob`` to the file ``folder/var``.

    Args:
        blob: Object exposing ``tobytes()`` (e.g. a NumPy array).
        folder: Destination directory; created when it does not exist.
        var: Name of the file to write inside ``folder``.
    """
    if not os.path.exists(folder):
        os.makedirs(folder)
    # The context manager closes the handle even when the write raises.
    with open(os.path.join(folder, var), 'wb') as f:
        f.write(blob.tobytes())
def _LoadWeightBlob2Numpy(shape, folder, dtype):
    """Read the binary file ``folder/out`` and return it as an array of ``shape``.

    Args:
        shape: Target shape passed to ``reshape``.
        folder: Directory that contains the ``out`` file.
        dtype: NumPy dtype used to decode the raw bytes.

    Returns:
        The decoded ``numpy.ndarray`` reshaped to ``shape``.

    Raises:
        IOError / OSError: if ``folder/out`` cannot be opened.
        ValueError: if the number of stored elements does not fit ``shape``.
    """
    if not os.path.exists(folder):
        print('fail to find', folder)
    filename = os.path.join(folder, 'out')
    # The file holds raw tensor bytes, so open it in binary mode; the context
    # manager releases the handle even if decoding fails.
    with open(filename, 'rb') as f:
        n = np.fromfile(f, dtype=dtype)
    return n.reshape(shape)
def name2array(name, weights_dict):
    """Load the stored tensor of variable ``name`` from ``args.model_load_dir``.

    Args:
        name: Variable name; also the sub-folder holding its ``out`` file.
        weights_dict: Mapping from variable name to a profile entry with the
            keys ``"shape"`` and ``"dtype"`` (an integer code of ``dtype_dict``).

    Returns:
        Tuple ``(array, dtype, shape)``: the loaded array, its NumPy dtype and
        the shape recorded in the profile.
    """
    var_folder = os.path.join(args.model_load_dir, name)
    entry = weights_dict[name]
    var_shape = entry["shape"]
    np_dtype = dtype_dict[entry["dtype"]]
    return _LoadWeightBlob2Numpy(var_shape, var_folder, np_dtype), np_dtype, var_shape
#制作待剪枝的namelist | |||||
def makeNameList(nameList, name):
    """Append to ``nameList`` every variable name of dense layer ``name``.

    The weight and bias are always added; ``args.optimizer`` decides which
    optimizer state variables follow (``adam`` or ``momentum``; ``sgd`` adds
    none, any other value prints an error).

    Args:
        nameList: List that is extended in place.
        name: Layer name prefix, e.g. ``"dense0"``.

    Returns:
        The same ``nameList`` object.
    """
    # Optimizer-specific state variables stored next to weight and bias.
    optimizer_suffixes = {
        'adam': ("-weight-v", "-weight-m", "-bias-v", "-bias-m"),
        'momentum': ("-weight-momentum", "-bias-momentum"),
    }
    suffixes = ["-weight", "-bias"]
    if args.optimizer in optimizer_suffixes:
        suffixes.extend(optimizer_suffixes[args.optimizer])
    elif args.optimizer != 'sgd':
        print('Error: optimizer!')
    for suffix in suffixes:
        nameList.append(name + suffix)
    return nameList
def prune():
    """Prune the units of the dense layers of the model in ``args.model_load_dir``.

    Loads the weights profile located next to the snapshot directory and, for
    every ``dense*-weight`` variable, asks ``pa.get_removeIndex_fc`` for the
    indexes to remove along axis 0.  The layer whose name starts with
    ``dense<len(profile)/numDiv - 1>`` gets no axis-0 removal; only the
    previous layer's removed indexes are deleted from axis 1 of its weights.
    Pruned tensors are written below ``args.model_save_dir/model`` and a new
    weights profile is saved to ``args.model_save_dir/weights_profile_path``.
    """
    import subprocess  # local import: only needed for the final copy step

    # Pruning threshold.
    thre = pa.get_pruneThre_fc()

    of_weight_path = args.model_load_dir.rsplit("/", 1)[0] + "/weights_profile_path"
    weights_dict = modelWeight.load(of_weight_path)
    # Presumably clears the profile so that it only holds what
    # modelWeight.add() records below -- confirm against util.model_weights.
    modelWeight.weights_dict = {}

    removeIndexs = []
    lastRemoveIndexs = []
    beforePrune = 0
    afterPrune = 0

    # NOTE(review): numDiv looks like the number of stored variables per dense
    # layer for the optimizer (weight/bias plus optimizer state), which would
    # make the prefix below the name of the final dense layer -- confirm.
    if args.optimizer == 'adam':
        numDiv = 6
    elif args.optimizer == 'momentum':
        numDiv = 4
    else:
        numDiv = 2
    keepLayerPrefix = "dense" + str(int(len(weights_dict)/numDiv) - 1)

    # Reverse lookup tables: NumPy dtype -> integer dtype code of the profile.
    dtypeCodes = list(dtype_dict.keys())
    dtypeValues = list(dtype_dict.values())

    for name, profile_dict in weights_dict.items():
        if name.startswith("dense") and name.endswith("-weight"):
            if name.startswith(keepLayerPrefix):
                # No axis-0 removal for this layer; only its input axis shrinks.
                lastRemoveIndexs = removeIndexs
                removeIndexs = []
            else:
                a, dtype, shape = name2array(name, weights_dict)
                lastRemoveIndexs = removeIndexs
                # Indexes to remove according to the pruning method.
                removeIndexs = pa.get_removeIndex_fc(a, shape, thre)
                # Never remove everything: keep one by dropping the first index.
                if len(removeIndexs) == len(a):
                    removeIndexs = np.delete(removeIndexs, 0)

            # All variable names that belong to this layer.
            layerName = name.split("_")[0].split("-")[0]
            nameList = makeNameList([], layerName)

            # Actual pruning.
            for i, varName in enumerate(nameList):
                a, dtype, shape = name2array(varName, weights_dict)
                if "weight" in varName:
                    b = np.delete(a, removeIndexs, 0)
                    b = np.delete(b, lastRemoveIndexs, 1)
                else:
                    b = np.delete(a, removeIndexs)
                # Only the first variable of a layer counts towards the statistics.
                if i == 0:
                    beforePrune += a.shape[0]
                    afterPrune += b.shape[0]

                print(varName+" pruned: shape from", a.shape, "-->", b.shape)
                if args.model_save_dir:
                    folder = os.path.join(args.model_save_dir, "model", varName)
                    _SaveWeightBlob2File(b, folder, 'out')
                    modelWeight.add(varName, dtypeCodes[dtypeValues.index(dtype)], b.shape)

    print("Pruning done! Number of channel from", beforePrune, "-->", afterPrune)
    if beforePrune:
        print("Real Pruning rate:", 100*(beforePrune-afterPrune)/beforePrune, "%")
    else:
        # No dense weight was processed; report 0 instead of dividing by zero.
        print("Real Pruning rate:", 0, "%")

    weights_profile_path = os.path.join(args.model_save_dir, "weights_profile_path")
    modelWeight.save(weights_profile_path)
    # Argument list instead of a shell string, so paths containing spaces or
    # shell metacharacters are passed to cp unchanged.
    subprocess.call(['cp', '-r',
                     '{0}/System-Train-TrainStep-TrainNet'.format(args.model_load_dir),
                     '{0}/System-Train-TrainStep-TrainNet'.format(
                         os.path.join(args.model_save_dir, "model"))])
def main():
    """Script entry point: run the pruning pass."""
    prune()


if __name__ == "__main__":
    main()
@@ -0,0 +1,226 @@ | |||||
""" | |||||
Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
from __future__ import absolute_import | |||||
from __future__ import division | |||||
from __future__ import print_function | |||||
import argparse | |||||
import numpy as np | |||||
import os | |||||
from util.model_weights import modelWeight | |||||
import util.prune_algorithm as pa | |||||
parser = argparse.ArgumentParser()

# Integer dtype codes (as found under the "dtype" key of a weights-profile
# entry) mapped to the NumPy dtype used to decode the stored tensor.
dtype_dict = {
    2: np.float32,
    3: np.float64,
    4: np.int8,
    5: np.int32,
    6: np.int64,
    9: np.float16,
}

# The default must be a string: argparse does not apply ``type`` to non-string
# defaults, and this script calls ``args.bn.lower()``, which fails on a bool.
parser.add_argument("--bn", default="False", type=str,
                    help="Whether to use bn layer (yes/true/t/y/1 to enable)")
parser.add_argument("--prune_method", default='bn', type=str,
                    help="method of prune (bn, conv_avg, conv_all, conv_max, random, "
                         "conv_similarity, bn_similarity, conv_threshold)")
parser.add_argument("--model_load_dir", default='./output/snapshots/model_base/snapshot_last',
                    type=str, required=False, help="Path of base oneflow model")
parser.add_argument("--model_save_dir", default='./output/snapshots/model_prune', type=str,
                    required=False, help="Path to the output OneFlow model.")
parser.add_argument("--percent", default=0.7, type=float, required=False,
                    help="scale sparse rate (default: 0.7)")
parser.add_argument("--optimizer", type=str, default="momentum", required=False,
                    help="sgd, adam, momentum")
args = parser.parse_args()
def _SaveWeightBlob2File(blob, folder, var):
    """Write the raw bytes of ``blob`` to the file ``folder/var``.

    Args:
        blob: Object exposing ``tobytes()`` (e.g. a NumPy array).
        folder: Destination directory; created when it does not exist.
        var: Name of the file to write inside ``folder``.
    """
    if not os.path.exists(folder):
        os.makedirs(folder)
    # The context manager closes the handle even when the write raises.
    with open(os.path.join(folder, var), 'wb') as f:
        f.write(blob.tobytes())
def _LoadWeightBlob2Numpy(shape, folder, dtype):
    """Read the binary file ``folder/out`` and return it as an array of ``shape``.

    Args:
        shape: Target shape passed to ``reshape``.
        folder: Directory that contains the ``out`` file.
        dtype: NumPy dtype used to decode the raw bytes.

    Returns:
        The decoded ``numpy.ndarray`` reshaped to ``shape``.

    Raises:
        IOError / OSError: if ``folder/out`` cannot be opened.
        ValueError: if the number of stored elements does not fit ``shape``.
    """
    if not os.path.exists(folder):
        print('fail to find', folder)
    filename = os.path.join(folder, 'out')
    # The file holds raw tensor bytes, so open it in binary mode; the context
    # manager releases the handle even if decoding fails.
    with open(filename, 'rb') as f:
        n = np.fromfile(f, dtype=dtype)
    return n.reshape(shape)
def name2array(name, weights_dict):
    """Load the stored tensor of variable ``name`` from ``args.model_load_dir``.

    Args:
        name: Variable name; also the sub-folder holding its ``out`` file.
        weights_dict: Mapping from variable name to a profile entry with the
            keys ``"shape"`` and ``"dtype"`` (an integer code of ``dtype_dict``).

    Returns:
        Tuple ``(array, dtype, shape)``: the loaded array, its NumPy dtype and
        the shape recorded in the profile.
    """
    var_folder = os.path.join(args.model_load_dir, name)
    entry = weights_dict[name]
    var_shape = entry["shape"]
    np_dtype = dtype_dict[entry["dtype"]]
    return _LoadWeightBlob2Numpy(var_shape, var_folder, np_dtype), np_dtype, var_shape
#制作待剪枝的namelist | |||||
def makeNameList(pruneName, nameList, name):
    """Append to ``nameList`` every variable name of layer ``name`` that must be pruned.

    The list covers the layer's weight and bias, its batch-norm variables when
    ``args.bn`` is enabled, and the optimizer state variables that belong to
    ``args.optimizer`` (``adam`` or ``momentum``; ``sgd`` adds none).

    Args:
        pruneName: Suffix of the variable the pruning decision is based on
            (``"_bn-gamma"`` or ``"_weight"``).
        nameList: List that is extended in place.
        name: Layer name prefix, e.g. ``"conv0"``.

    Returns:
        The same ``nameList`` object.
    """
    # str() keeps this working when args.bn is not a string (e.g. a bool
    # default), where calling .lower() directly would raise AttributeError.
    use_bn = str(args.bn).lower() in ('yes', 'true', 't', 'y', '1')

    if pruneName == '_bn-gamma':
        nameList.append(name + "_weight")
    elif pruneName == "_weight" and use_bn:
        nameList.append(name + "_bn-gamma")
    nameList.append(name + pruneName)
    nameList.append(name + "_bias")
    # Remaining batch-norm variables of the layer.
    if use_bn:
        nameList.append(name + "_bn-beta")
        nameList.append(name + "_bn-moving_variance")
        nameList.append(name + "_bn-moving_mean")

    if args.optimizer == 'adam':
        # Extra state variables kept by adam.
        nameList.append(name + "_weight-v")
        nameList.append(name + "_weight-m")
        nameList.append(name + "_bias-v")
        nameList.append(name + "_bias-m")
        if use_bn:
            nameList.append(name + "_bn-beta-v")
            nameList.append(name + "_bn-beta-m")
            nameList.append(name + "_bn-gamma-v")
            nameList.append(name + "_bn-gamma-m")
    elif args.optimizer == 'momentum':
        # Extra state variables kept by momentum.
        nameList.append(name + "_weight-momentum")
        nameList.append(name + "_bias-momentum")
        if use_bn:
            nameList.append(name + "_bn-beta-momentum")
            nameList.append(name + "_bn-gamma-momentum")
    elif args.optimizer != 'sgd':
        print('Error: optimizer!')
    return nameList
def prune():
    """Channel-prune the conv layers of the model stored in ``args.model_load_dir``.

    Loads the weights profile located next to the snapshot directory, asks the
    selected pruning method for the channel indexes to remove from every
    ``conv*`` variable ending in the method's suffix, deletes those channels
    from the layer's tensors (and the previous layer's removed channels from
    axis 1 of the weights), shrinks the input axis of ``dense0`` accordingly,
    and copies every other ``dense*`` variable unchanged.  Pruned tensors are
    written below ``args.model_save_dir/model`` and a new weights profile is
    saved to ``args.model_save_dir/weights_profile_path``.
    """
    import subprocess  # local import: only needed for the final copy step

    # Threshold for the selected pruning method (only these four use one).
    thre = None
    if args.prune_method == 'bn':
        thre = pa.get_pruneThre_bn()
    elif args.prune_method == 'conv_avg':
        thre = pa.get_pruneThre_conv_avg()
    elif args.prune_method == 'conv_all':
        thre = pa.get_pruneThre_conv_all()
    elif args.prune_method == 'conv_max':
        thre = pa.get_pruneThre_conv_max()

    of_weight_path = args.model_load_dir.rsplit("/", 1)[0] + "/weights_profile_path"
    weights_dict = modelWeight.load(of_weight_path)
    # Presumably clears the profile so that it only holds what
    # modelWeight.add() records below -- confirm against util.model_weights.
    modelWeight.weights_dict = {}

    fcRemoveIndexs = []
    fcDivideNum = 0
    removeIndexs = []
    lastRemoveIndexs = []
    beforePrune = 0
    afterPrune = 0

    # Suffix of the variable the pruning decision is based on.
    pruneName = ''
    if "bn" in args.prune_method:
        pruneName = "_bn-gamma"
    elif "conv" in args.prune_method or args.prune_method == "random":
        pruneName = "_weight"

    # Reverse lookup tables: NumPy dtype -> integer dtype code of the profile.
    dtypeCodes = list(dtype_dict.keys())
    dtypeValues = list(dtype_dict.values())
    weightSuffixes = ("weight", "weight-v", "weight-m", "weight-momentum")
    dense0Weights = ('dense0-weight', 'dense0-weight-v',
                     'dense0-weight-m', 'dense0-weight-momentum')

    for name, profile_dict in weights_dict.items():
        if name.startswith("conv") and name.endswith(pruneName):
            a, dtype, shape = name2array(name, weights_dict)
            lastRemoveIndexs = removeIndexs
            # Channel indexes to remove according to the pruning method.
            if args.prune_method == 'bn':
                removeIndexs = pa.get_removeIndex_bn(a, thre)
            elif args.prune_method == "conv_avg":
                removeIndexs = pa.get_removeIndex_conv_avg(a, shape, thre)
            elif args.prune_method == "conv_all":
                removeIndexs = pa.get_removeIndex_conv_all(a, shape, thre)
            elif args.prune_method == "conv_max":
                removeIndexs = pa.get_removeIndex_conv_max(a, shape, thre)
            elif args.prune_method == "random":
                removeIndexs = pa.get_removeIndex_random(shape)
            elif args.prune_method == "conv_similarity":
                removeIndexs = pa.get_removeIndex_conv_similarity(a, shape)
            elif args.prune_method == "bn_similarity":
                removeIndexs = pa.get_removeIndex_bn_similarity(a, shape)
            elif args.prune_method == "conv_threshold":
                removeIndexs = pa.get_removeIndex_conv_threshold(a, shape, threSet=0.06)

            # Never remove every channel: keep one by dropping the first index.
            if len(removeIndexs) == len(a):
                removeIndexs = np.delete(removeIndexs, 0)

            # Remember the channels removed from conv1; they are used to shrink
            # the input axis of dense0.  NOTE(review): 16 is presumably the
            # channel count of conv1 -- confirm against the model definition.
            if name == "conv1" + pruneName:
                fcRemoveIndexs = removeIndexs
                fcDivideNum = 16

            # All variable names that belong to this layer.
            layerName = name.split("_")[0].split("-")[0]
            nameList = makeNameList(pruneName, [], layerName)

            # Actual pruning.
            for varName in nameList:
                a, dtype, shape = name2array(varName, weights_dict)
                if varName.endswith(weightSuffixes):
                    b = np.delete(a, removeIndexs, 0)
                    b = np.delete(b, lastRemoveIndexs, 1)
                    if varName.endswith("weight"):
                        beforePrune += a.shape[0]
                        afterPrune += b.shape[0]
                else:
                    b = np.delete(a, removeIndexs)

                print(varName+" pruned: shape from", a.shape, "-->", b.shape)
                if args.model_save_dir:
                    folder = os.path.join(args.model_save_dir, "model", varName)
                    _SaveWeightBlob2File(b, folder, 'out')
                    modelWeight.add(varName, dtypeCodes[dtypeValues.index(dtype)], b.shape)

        # Dense layers: only the input axis of dense0 is pruned.
        elif name.startswith("dense"):
            a, dtype, shape = name2array(name, weights_dict)
            # fcDivideNum stays 0 when conv1 was never pruned; skip the index
            # expansion then instead of dividing by zero.
            if name in dense0Weights and fcDivideNum:
                fcRemoveIndexsNew = []
                num = int(a.shape[1]/fcDivideNum)
                for index in fcRemoveIndexs:
                    fcRemoveIndexsNew += [index+fcDivideNum*i for i in range(num)]
                b = np.delete(a, fcRemoveIndexsNew, 1)
            else:
                b = a

            print(name+" pruned: shape from", a.shape, "-->", b.shape)
            if args.model_save_dir:
                folder = os.path.join(args.model_save_dir, "model", name)
                _SaveWeightBlob2File(b, folder, 'out')
                modelWeight.add(name, dtypeCodes[dtypeValues.index(dtype)], b.shape)

    print("Pruning done! Number of channel from", beforePrune, "-->", afterPrune)
    if beforePrune:
        print("Real Pruning rate:", 100*(beforePrune-afterPrune)/beforePrune, "%")
    else:
        # No conv weight was processed; report 0 instead of dividing by zero.
        print("Real Pruning rate:", 0, "%")

    weights_profile_path = os.path.join(args.model_save_dir, "weights_profile_path")
    modelWeight.save(weights_profile_path)
    # Argument list instead of a shell string, so paths containing spaces or
    # shell metacharacters are passed to cp unchanged.
    subprocess.call(['cp', '-r',
                     '{0}/System-Train-TrainStep-TrainNet'.format(args.model_load_dir),
                     '{0}/System-Train-TrainStep-TrainNet'.format(
                         os.path.join(args.model_save_dir, "model"))])
def main():
    """Script entry point: run the pruning pass."""
    prune()


if __name__ == "__main__":
    main()
@@ -0,0 +1,271 @@ | |||||
""" | |||||
Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
from __future__ import absolute_import | |||||
from __future__ import division | |||||
from __future__ import print_function | |||||
import argparse | |||||
import numpy as np | |||||
import os | |||||
from util.model_weights import modelWeight | |||||
import util.prune_algorithm as pa | |||||
parser = argparse.ArgumentParser()

# Integer dtype codes (as found under the "dtype" key of a weights-profile
# entry) mapped to the NumPy dtype used to decode the stored tensor.
dtype_dict = {
    2: np.float32,
    3: np.float64,
    4: np.int8,
    5: np.int32,
    6: np.int64,
    9: np.float16,
}

# The default must be a string: argparse does not apply ``type`` to non-string
# defaults, and this script calls ``args.bn.lower()``, which fails on a bool.
parser.add_argument("--bn", default="False", type=str,
                    help="Whether to use bn layer (yes/true/t/y/1 to enable)")
parser.add_argument("--prune_method", default='bn', type=str,
                    help="method of prune (bn, conv_avg, conv_all, conv_max, random, "
                         "conv_similarity, bn_similarity, conv_threshold)")
parser.add_argument("--model_load_dir", default='./output/snapshots/model_base/snapshot_last',
                    type=str, required=False, help="Path of base oneflow model")
parser.add_argument("--model_save_dir", default='./output/snapshots/model_prune', type=str,
                    required=False, help="Path to the output OneFlow model.")
parser.add_argument("--percent", default=0.7, type=float, required=False,
                    help="scale sparse rate (default: 0.7)")
parser.add_argument("--optimizer", type=str, default="momentum", required=False,
                    help="sgd, adam, momentum")
args = parser.parse_args()
def _SaveWeightBlob2File(blob, folder, var):
    """Write the raw bytes of ``blob`` to the file ``folder/var``.

    Args:
        blob: Object exposing ``tobytes()`` (e.g. a NumPy array).
        folder: Destination directory; created when it does not exist.
        var: Name of the file to write inside ``folder``.
    """
    if not os.path.exists(folder):
        os.makedirs(folder)
    # The context manager closes the handle even when the write raises.
    with open(os.path.join(folder, var), 'wb') as f:
        f.write(blob.tobytes())
def _LoadWeightBlob2Numpy(shape, folder, dtype):
    """Read the binary file ``folder/out`` and return it as an array of ``shape``.

    Args:
        shape: Target shape passed to ``reshape``.
        folder: Directory that contains the ``out`` file.
        dtype: NumPy dtype used to decode the raw bytes.

    Returns:
        The decoded ``numpy.ndarray`` reshaped to ``shape``.

    Raises:
        IOError / OSError: if ``folder/out`` cannot be opened.
        ValueError: if the number of stored elements does not fit ``shape``.
    """
    if not os.path.exists(folder):
        print('fail to find', folder)
    filename = os.path.join(folder, 'out')
    # The file holds raw tensor bytes, so open it in binary mode; the context
    # manager releases the handle even if decoding fails.
    with open(filename, 'rb') as f:
        n = np.fromfile(f, dtype=dtype)
    return n.reshape(shape)
def name2array(name, weights_dict):
    """Load the stored tensor of variable ``name`` from ``args.model_load_dir``.

    Args:
        name: Variable name; also the sub-folder holding its ``out`` file.
        weights_dict: Mapping from variable name to a profile entry with the
            keys ``"shape"`` and ``"dtype"`` (an integer code of ``dtype_dict``).

    Returns:
        Tuple ``(array, dtype, shape)``: the loaded array, its NumPy dtype and
        the shape recorded in the profile.
    """
    var_folder = os.path.join(args.model_load_dir, name)
    entry = weights_dict[name]
    var_shape = entry["shape"]
    np_dtype = dtype_dict[entry["dtype"]]
    return _LoadWeightBlob2Numpy(var_shape, var_folder, np_dtype), np_dtype, var_shape
#制作待剪枝的namelist | |||||
def makeNameList(pruneName, nameList, name):
    """Append to ``nameList`` every variable name of layer ``name`` that must be pruned.

    The list covers the layer's weight and bias, its batch-norm variables when
    ``args.bn`` is enabled, and the optimizer state variables that belong to
    ``args.optimizer`` (``adam`` or ``momentum``; ``sgd`` adds none).

    Args:
        pruneName: Suffix of the variable the pruning decision is based on
            (``"_bn-gamma"`` or ``"_weight"``).
        nameList: List that is extended in place.
        name: Layer name prefix, e.g. ``"conv0"``.

    Returns:
        The same ``nameList`` object.
    """
    # str() keeps this working when args.bn is not a string (e.g. a bool
    # default), where calling .lower() directly would raise AttributeError.
    use_bn = str(args.bn).lower() in ('yes', 'true', 't', 'y', '1')

    if pruneName == '_bn-gamma':
        nameList.append(name + "_weight")
    elif pruneName == "_weight" and use_bn:
        nameList.append(name + "_bn-gamma")
    nameList.append(name + pruneName)
    nameList.append(name + "_bias")
    # Remaining batch-norm variables of the layer.
    if use_bn:
        nameList.append(name + "_bn-beta")
        nameList.append(name + "_bn-moving_variance")
        nameList.append(name + "_bn-moving_mean")

    if args.optimizer == 'adam':
        # Extra state variables kept by adam.
        nameList.append(name + "_weight-v")
        nameList.append(name + "_weight-m")
        nameList.append(name + "_bias-v")
        nameList.append(name + "_bias-m")
        if use_bn:
            nameList.append(name + "_bn-beta-v")
            nameList.append(name + "_bn-beta-m")
            nameList.append(name + "_bn-gamma-v")
            nameList.append(name + "_bn-gamma-m")
    elif args.optimizer == 'momentum':
        # Extra state variables kept by momentum.
        nameList.append(name + "_weight-momentum")
        nameList.append(name + "_bias-momentum")
        if use_bn:
            nameList.append(name + "_bn-beta-momentum")
            nameList.append(name + "_bn-gamma-momentum")
    elif args.optimizer != 'sgd':
        print('Error: optimizer!')
    return nameList
def prune(): | |||||
# 获取对应剪枝方法的thre阈值 | |||||
if args.prune_method == 'bn': | |||||
thre = pa.get_pruneThre_bn() | |||||
elif args.prune_method == 'conv_avg': | |||||
thre = pa.get_pruneThre_conv_avg() | |||||
elif args.prune_method == 'conv_all': | |||||
thre = pa.get_pruneThre_conv_all() | |||||
elif args.prune_method == 'conv_max': | |||||
thre = pa.get_pruneThre_conv_max() | |||||
of_weight_path = args.model_load_dir.rsplit("/",1)[0] + "/weights_profile_path" | |||||
weights_dict = modelWeight.load(of_weight_path) | |||||
modelWeight.weights_dict = {} | |||||
fcRemoveIndexs = [] | |||||
fcDivideNum = 0 | |||||
removeIndexs = [] | |||||
lastRemoveIndexs = [] | |||||
lastRemoveIndexs_shortcut = [] | |||||
beforePrune = 0 | |||||
afterPrune = 0 | |||||
pruneName = '' | |||||
if "bn" in args.prune_method: | |||||
pruneName = "_bn-gamma" | |||||
elif "conv" in args.prune_method or args.prune_method=="random": | |||||
pruneName = "_weight" | |||||
for name, profile_dict in weights_dict.items(): | |||||
if name.startswith("conv") and name.endswith(pruneName) and \ | |||||
"stem" not in name and "shortcut" not in name: | |||||
a, dtype, shape = name2array(name, weights_dict) | |||||
lastRemoveIndexs = removeIndexs | |||||
#获取对应剪枝方法removeIndexs | |||||
if args.prune_method == 'bn': | |||||
removeIndexs = pa.get_removeIndex_bn(a, thre) | |||||
elif args.prune_method == "conv_avg": | |||||
removeIndexs = pa.get_removeIndex_conv_avg(a, shape, thre) | |||||
elif args.prune_method == "conv_all": | |||||
removeIndexs = pa.get_removeIndex_conv_all(a, shape, thre) | |||||
elif args.prune_method == "conv_max": | |||||
removeIndexs = pa.get_removeIndex_conv_max(a, shape, thre) | |||||
elif args.prune_method == "random": | |||||
removeIndexs = pa.get_removeIndex_random(shape) | |||||
elif args.prune_method == "conv_similarity": | |||||
removeIndexs = pa.get_removeIndex_conv_similarity(a, shape) | |||||
elif args.prune_method == "bn_similarity": | |||||
removeIndexs = pa.get_removeIndex_bn_similarity(a, shape) | |||||
elif args.prune_method == "conv_threshold": | |||||
removeIndexs = pa.get_removeIndex_conv_threshold(a, shape, threSet=0.06) | |||||
if len(removeIndexs) == len(a): | |||||
removeIndexs = np.delete(removeIndexs, 0) | |||||
if name == "conv47"+pruneName: | |||||
fcRemoveIndexs = removeIndexs | |||||
fcDivideNum = 2048 | |||||
#待剪枝层的名字列表 | |||||
name = name.split("_")[0].split("-")[0] | |||||
nameList = [] | |||||
nameList = makeNameList(pruneName, nameList, name) | |||||
#除了shortcut层的真正剪枝 | |||||
for name in nameList: | |||||
a, dtype, shape = name2array(name, weights_dict) | |||||
if name.endswith("weight") or name.endswith("weight-v") or \ | |||||
name.endswith("weight-m") or name.endswith("weight-momentum"): | |||||
b = np.delete(a, removeIndexs, 0) | |||||
b = np.delete(b, lastRemoveIndexs, 1) | |||||
if name.endswith("weight"): | |||||
beforePrune += a.shape[0] | |||||
afterPrune += b.shape[0] | |||||
else: | |||||
b = np.delete(a, removeIndexs) | |||||
print(name+" pruned: shape from", a.shape, "-->", b.shape) | |||||
if args.model_save_dir: | |||||
folder = os.path.join(args.model_save_dir, "model", name) | |||||
_SaveWeightBlob2File(b, folder, 'out') | |||||
modelWeight.add(name, list(dtype_dict.keys())[list(dtype_dict.values()).index(dtype)], b.shape) | |||||
#resnet模型剪枝shortcut | |||||
#addName是shortcut层的数字后缀 | |||||
addName = "" | |||||
#获取conv层name中的编号数字 | |||||
n = int(name.split("_")[0].split("-")[0].replace("conv", "")) | |||||
if (n+1)%3 == 0: | |||||
n = int((n+1)/3) | |||||
if n <= 3: | |||||
addName = "0_" + str(n-1) | |||||
elif n <= 7: | |||||
addName = "1_" + str(n-4) | |||||
elif n <= 13: | |||||
addName = "2_" + str(n-8) | |||||
elif n <= 16: | |||||
addName = "3_" + str(n-14) | |||||
name = "conv_shortcut" + addName | |||||
#shortcut的conv层待剪枝层的名字列表 | |||||
#nameList_shortcut是裁剪所有的名字列表 | |||||
nameList_shortcut = [] | |||||
nameList_shortcut = makeNameList(pruneName, nameList_shortcut, name) | |||||
#resnet模型的shortcut真正剪枝 | |||||
for name in nameList_shortcut: | |||||
a, dtype, shape = name2array(name, weights_dict) | |||||
if name.endswith("weight") or name.endswith("weight-v") or \ | |||||
name.endswith("weight-m") or name.endswith("weight-momentum"): | |||||
b = np.delete(a, removeIndexs, 0) | |||||
b = np.delete(b, lastRemoveIndexs_shortcut, 1) | |||||
else: | |||||
b = np.delete(a, removeIndexs) | |||||
print(name+" pruned: shape from", a.shape, "-->", b.shape) | |||||
if args.model_save_dir: | |||||
folder = os.path.join(args.model_save_dir, "model", name) | |||||
_SaveWeightBlob2File(b, folder, 'out') | |||||
modelWeight.add(name, list(dtype_dict.keys())[list(dtype_dict.values()).index(dtype)], b.shape) | |||||
lastRemoveIndexs_shortcut = removeIndexs | |||||
#复制stem层 | |||||
elif "stem" in name: | |||||
a, dtype, shape = name2array(name, weights_dict) | |||||
b = a | |||||
print(name+" copy") | |||||
if args.model_save_dir: | |||||
folder = os.path.join(args.model_save_dir, "model", name) | |||||
_SaveWeightBlob2File(b, folder, 'out') | |||||
modelWeight.add(name, list(dtype_dict.keys())[list(dtype_dict.values()).index(dtype)], b.shape) | |||||
#第一个dense0层剪枝 | |||||
elif name.startswith("dense"): | |||||
if name in ['dense0-weight', 'dense0-weight-v', | |||||
'dense0-weight-m', 'dense0-weight-momentum']: | |||||
fcRemoveIndexsNew = [] | |||||
a, dtype, shape = name2array(name, weights_dict) | |||||
num = int(a.shape[1]/fcDivideNum) | |||||
for index in fcRemoveIndexs: | |||||
fcRemoveIndexsNew += [index+fcDivideNum*i for i in range(num)] | |||||
b = np.delete(a, fcRemoveIndexsNew, 1) | |||||
else: | |||||
a, dtype, shape = name2array(name, weights_dict) | |||||
b = a | |||||
print(name+" pruned: shape from", a.shape, "-->", b.shape) | |||||
if args.model_save_dir: | |||||
folder = os.path.join(args.model_save_dir, "model", name) | |||||
_SaveWeightBlob2File(b, folder, 'out') | |||||
modelWeight.add(name, list(dtype_dict.keys())[list(dtype_dict.values()).index(dtype)], b.shape) | |||||
print("Pruning done! Number of channel from", beforePrune, "-->", afterPrune) | |||||
print("Real Pruning rate:", 100*(beforePrune-afterPrune)/beforePrune, "%") | |||||
weights_profile_path = os.path.join(args.model_save_dir, "weights_profile_path") | |||||
modelWeight.save(weights_profile_path) | |||||
os.system('cp -r {0}/System-Train-TrainStep-TrainNet {1}/System-Train-TrainStep-TrainNet '.format(args.model_load_dir, os.path.join(args.model_save_dir, "model"))) | |||||
def main():
    """Script entry point: run prune()."""
    prune()


# Only run the pruning when this file is executed as a script.
if __name__ == "__main__":
    main()
@@ -0,0 +1,228 @@ | |||||
""" | |||||
Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
from __future__ import absolute_import | |||||
from __future__ import division | |||||
from __future__ import print_function | |||||
import argparse | |||||
import numpy as np | |||||
import os | |||||
from util.model_weights import modelWeight | |||||
import util.prune_algorithm as pa | |||||
# Command-line interface of the pruning script.
parser = argparse.ArgumentParser()
# Maps the integer dtype codes stored in the weights profile to numpy dtypes.
dtype_dict={2:np.float32,
            3:np.float64,
            4:np.int8,
            5:np.int32,
            6:np.int64,
            9:np.float16}
# NOTE: argparse applies `type` only to string defaults, so args.bn is the
# bool False (not a str) when --bn is omitted.
parser.add_argument("--bn", default=False,
                    type=str, help="Whether to use use bn layer")
parser.add_argument("--prune_method", default='bn', type=str,
                    help="method of prune(bn, conv_avg, random...)")
parser.add_argument("--model_load_dir", default = './output/snapshots/model_base/snapshot_last',
                    type = str, required = False, help = "Path of base oneflow model")
parser.add_argument("--model_save_dir", default = './output/snapshots/model_prune', type = str,
                    required = False, help = "Path to the output OneFlow model.")
parser.add_argument("--percent", default = 0.7, type = float, required = False,
                    help = "scale sparse rate (default: 0.7)")
parser.add_argument("--optimizer", type=str, default="momentum", required=False,
                    help="sgd, adam, momentum")
# Parsed once at import time; the functions below read this module-level `args`.
args = parser.parse_args()
def _SaveWeightBlob2File(blob, folder, var): | |||||
if not os.path.exists(folder): | |||||
os.makedirs(folder) | |||||
filename = os.path.join(folder, var) | |||||
f = open(filename, 'wb') | |||||
f.write(blob.tobytes()) | |||||
f.close() | |||||
def _LoadWeightBlob2Numpy(shape, folder, dtype): | |||||
if not os.path.exists(folder): | |||||
print('fail to find', folder) | |||||
filename = os.path.join(folder, 'out') | |||||
f = open(filename, 'r') | |||||
n = np.fromfile(f, dtype=dtype) | |||||
n = n.reshape(shape) | |||||
f.close() | |||||
return n | |||||
def name2array(name, weights_dict):
    """Load the stored tensor of variable ``name``.

    The blob is read from ``<args.model_load_dir>/<name>``; its shape and
    dtype code come from ``weights_dict[name]`` and the code is mapped to a
    numpy dtype through ``dtype_dict``.

    Returns:
        Tuple ``(array, numpy dtype, shape)``.
    """
    blob_dir = os.path.join(args.model_load_dir, name)
    entry = weights_dict[name]
    tensor_shape = entry["shape"]
    np_dtype = dtype_dict[entry["dtype"]]
    tensor = _LoadWeightBlob2Numpy(tensor_shape, blob_dir, np_dtype)
    return tensor, np_dtype, tensor_shape
# Build the list of variable names that are pruned together for one layer.
def makeNameList(pruneName, nameList, name):
    """Append to ``nameList`` the variable names of layer ``name`` to prune.

    Args:
        pruneName: suffix of the variable that drives the pruning
            (``"_bn-gamma"`` or ``"_weight"`` in the visible callers).
        nameList: list that is extended in place and returned.
        name: layer prefix, e.g. ``"conv3"``.

    Returns:
        ``nameList`` with the weight/bias names, the bn names when ``--bn``
        is enabled, and the optimizer state names selected by
        ``args.optimizer`` (``adam`` or ``momentum``) appended.
    """
    # args.bn is the bool False when --bn is omitted (argparse does not apply
    # `type` to non-string defaults); going through str() keeps .lower() from
    # raising AttributeError in that case.
    use_bn = str(args.bn).lower() in ('yes', 'true', 't', 'y', '1')

    if pruneName == '_bn-gamma':
        nameList.append(name+"_weight")
    elif pruneName == "_weight":
        if use_bn:
            nameList.append(name+"_bn-gamma")
    nameList.append(name+pruneName)
    nameList.append(name+"_bias")
    # bn parameters of the layer
    if use_bn:
        nameList.append(name+"_bn-beta")
        nameList.append(name+"_bn-moving_variance")
        nameList.append(name+"_bn-moving_mean")
    # extra state kept by the adam optimizer
    if args.optimizer == 'adam':
        nameList.append(name+"_weight-v")
        nameList.append(name+"_weight-m")
        nameList.append(name+"_bias-v")
        nameList.append(name+"_bias-m")
        # adam state of the bn parameters
        if use_bn:
            nameList.append(name+"_bn-beta-v")
            nameList.append(name+"_bn-beta-m")
            nameList.append(name+"_bn-gamma-v")
            nameList.append(name+"_bn-gamma-m")
    # extra state kept by the momentum optimizer
    elif args.optimizer == 'momentum':
        nameList.append(name+"_weight-momentum")
        nameList.append(name+"_bias-momentum")
        # momentum state of the bn parameters
        if use_bn:
            nameList.append(name+"_bn-beta-momentum")
            nameList.append(name+"_bn-gamma-momentum")
    elif args.optimizer != 'sgd':
        print('Error: optimizer!')
    return nameList
def prune():
    """Prune the conv channels of the loaded model and save the result.

    Steps:
      1. Compute the global threshold for the threshold-based methods
         (``bn``, ``conv_avg``, ``conv_all``, ``conv_max``).
      2. Walk the loaded weights profile in order. For every conv variable
         ending in the pruning suffix, choose the output channels to remove
         and delete them from every tensor listed by ``makeNameList``; weight
         tensors additionally lose, on axis 1, the channels removed from the
         previously processed conv layer.
      3. For the ``dense0`` weight tensors, delete the input columns that
         correspond to the channels removed from ``conv12`` (hard-coded, with
         512 channels); every other dense tensor is copied unchanged.
      4. Write the blobs and the new weights profile under
         ``args.model_save_dir`` and copy the
         ``System-Train-TrainStep-TrainNet`` folder next to them.
    """
    import subprocess  # local import: only needed for the final folder copy

    # Global threshold; stays None for methods that do not use one.
    thre = None
    if args.prune_method == 'bn':
        thre = pa.get_pruneThre_bn()
    elif args.prune_method == 'conv_avg':
        thre = pa.get_pruneThre_conv_avg()
    elif args.prune_method == 'conv_all':
        thre = pa.get_pruneThre_conv_all()
    elif args.prune_method == 'conv_max':
        thre = pa.get_pruneThre_conv_max()

    of_weight_path = args.model_load_dir.rsplit("/",1)[0] + "/weights_profile_path"
    weights_dict = modelWeight.load(of_weight_path)
    # Rebind the holder to a fresh dict: `weights_dict` keeps the loaded
    # profile while modelWeight collects the profile of the pruned model.
    modelWeight.weights_dict = {}

    # numpy dtype -> profile dtype code (reverse of dtype_dict)
    dtype_code = {np_type: code for code, np_type in dtype_dict.items()}
    weight_suffixes = ("weight", "weight-v", "weight-m", "weight-momentum")
    dense0_weights = ('dense0-weight', 'dense0-weight-v',
                      'dense0-weight-m', 'dense0-weight-momentum')

    def _store(var_name, array, np_dtype):
        # Write the blob when a save dir is configured and record the tensor
        # in the new profile.
        # NOTE(review): the source indentation was lost; the profile entry is
        # recorded regardless of model_save_dir here - confirm.
        if args.model_save_dir:
            folder = os.path.join(args.model_save_dir, "model", var_name)
            _SaveWeightBlob2File(array, folder, 'out')
        modelWeight.add(var_name, dtype_code[np_dtype], array.shape)

    fcRemoveIndexs = []
    fcDivideNum = 0
    removeIndexs = []
    lastRemoveIndexs = []
    beforePrune = 0
    afterPrune = 0
    pruneName = ''
    if "bn" in args.prune_method:
        pruneName = "_bn-gamma"
    elif "conv" in args.prune_method or args.prune_method=="random":
        pruneName = "_weight"

    for name in weights_dict:
        if name.startswith("conv") and name.endswith(pruneName):
            a, dtype, shape = name2array(name, weights_dict)
            # Output channels removed from the previous conv layer become
            # input channels to remove from this one.
            lastRemoveIndexs = removeIndexs
            # Select the channels to remove with the configured method.
            if args.prune_method == 'bn':
                removeIndexs = pa.get_removeIndex_bn(a, thre)
            elif args.prune_method == "conv_avg":
                removeIndexs = pa.get_removeIndex_conv_avg(a, shape, thre)
            elif args.prune_method == "conv_all":
                removeIndexs = pa.get_removeIndex_conv_all(a, shape, thre)
            elif args.prune_method == "conv_max":
                removeIndexs = pa.get_removeIndex_conv_max(a, shape, thre)
            elif args.prune_method == "random":
                removeIndexs = pa.get_removeIndex_random(shape)
            elif args.prune_method == "conv_similarity":
                removeIndexs = pa.get_removeIndex_conv_similarity(a, shape)
            elif args.prune_method == "bn_similarity":
                removeIndexs = pa.get_removeIndex_bn_similarity(a, shape)
            elif args.prune_method == "conv_threshold":
                removeIndexs = pa.get_removeIndex_conv_threshold(a, shape, threSet=0.06)
            # Never remove every channel of a layer: keep the first candidate.
            if len(removeIndexs) == len(a):
                removeIndexs = np.delete(removeIndexs, 0)
            if name == "conv12"+pruneName:
                fcRemoveIndexs = removeIndexs
                fcDivideNum = 512
            # Names of all variables of this layer that must be pruned.
            layer = name.split("_")[0].split("-")[0]
            for var_name in makeNameList(pruneName, [], layer):
                a, dtype, shape = name2array(var_name, weights_dict)
                if var_name.endswith(weight_suffixes):
                    b = np.delete(a, removeIndexs, 0)
                    b = np.delete(b, lastRemoveIndexs, 1)
                    if var_name.endswith("weight"):
                        beforePrune += a.shape[0]
                        afterPrune += b.shape[0]
                else:
                    b = np.delete(a, removeIndexs)
                print(var_name+" pruned: shape from", a.shape, "-->", b.shape)
                _store(var_name, b, dtype)
        # Dense layers: only the dense0 weight tensors are pruned.
        elif name.startswith("dense"):
            a, dtype, shape = name2array(name, weights_dict)
            if name in dense0_weights:
                num = int(a.shape[1]/fcDivideNum)
                fcRemoveIndexsNew = []
                for index in fcRemoveIndexs:
                    fcRemoveIndexsNew += [index+fcDivideNum*i for i in range(num)]
                b = np.delete(a, fcRemoveIndexsNew, 1)
            else:
                b = a
            print(name+" pruned: shape from", a.shape, "-->", b.shape)
            _store(name, b, dtype)

    print("Pruning done! Number of channel from", beforePrune, "-->", afterPrune)
    # Guard the ratio: with no conv weight visited the division would raise
    # and abort before the new profile is written.
    rate = 100*(beforePrune-afterPrune)/beforePrune if beforePrune else 0.0
    print("Real Pruning rate:", rate, "%")
    weights_profile_path = os.path.join(args.model_save_dir, "weights_profile_path")
    modelWeight.save(weights_profile_path)
    # Argument list instead of a shell string, so paths containing spaces or
    # shell metacharacters are passed to cp verbatim.
    src = '{0}/System-Train-TrainStep-TrainNet'.format(args.model_load_dir)
    dst = '{0}/System-Train-TrainStep-TrainNet'.format(os.path.join(args.model_save_dir, "model"))
    subprocess.call(['cp', '-r', src, dst])
def main():
    """Script entry point: run prune()."""
    prune()


# Only run the pruning when this file is executed as a script.
if __name__ == "__main__":
    main()
@@ -0,0 +1,97 @@ | |||||
""" | |||||
Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
import json | |||||
import oneflow as flow | |||||
# mysingle.py | |||||
class ModelWeights:
    """Registry of variable profiles (dtype code + shape) keyed by variable name.

    The registry can be written to and read from a text file with one
    ``<name>__<json profile>`` entry per line.
    """

    # NOTE: class-level dict, shared by every instance until an instance
    # rebinds its own `weights_dict` attribute.
    weights_dict={}
    # Maps oneflow dtypes (and already-converted integer codes) to the integer
    # codes stored in the profile.
    dtype_dict={flow.float32:2,
                flow.float64:3,
                flow.int8:4,
                flow.int32:5,
                flow.int64:6,
                flow.float16:9,
                2:2, 3:3, 4:4, 5:5, 6:6, 9:9}

    @staticmethod
    def _optimizer_slots(optimizer):
        """Return the state-variable suffixes kept by ``optimizer``."""
        if optimizer == 'adam':
            return ('-v', '-m')
        if optimizer == 'momentum':
            return ('-momentum',)
        return ()

    def add(self, variable_name, dtype, shape):
        """Register ``variable_name`` and return the whole registry.

        Raises:
            AssertionError: if the name is already registered.
        """
        # Explicit raise instead of `assert`, so the duplicate check is not
        # stripped when Python runs with -O.
        if variable_name in self.weights_dict:
            raise AssertionError(
                "variable '{}' is already registered".format(variable_name))
        profile_dict={}
        profile_dict["dtype"]=dtype
        profile_dict["shape"]=shape
        self.weights_dict[variable_name]=profile_dict
        return self.weights_dict

    def addConv(self, index, dtype, shape1, shape2, optimizer):
        """Register the variables of conv layer ``conv<index>``.

        ``shape1`` is recorded for the weight tensors, ``shape2`` for the bias
        and bn tensors. Optimizer state variables are added for ``adam`` and
        ``momentum``.
        """
        dtype = self.dtype_dict[dtype]
        prefix = "conv{}".format(index)
        slots = self._optimizer_slots(optimizer)
        self.add(prefix+'_weight', dtype, shape1)
        for suffix in ('_bias', '_bn-gamma', '_bn-beta',
                       '_bn-moving_variance', '_bn-moving_mean'):
            self.add(prefix+suffix, dtype, shape2)
        # Optimizer state, in the same registration order as before.
        for slot in slots:
            self.add(prefix+'_weight'+slot, dtype, shape1)
        for base in ('_bias', '_bn-gamma', '_bn-beta'):
            for slot in slots:
                self.add(prefix+base+slot, dtype, shape2)

    def addDense(self, dtype_old, shape, optimizer, dense_num):
        """Register the variables of dense layers ``dense0 .. dense<dense_num-1>``.

        ``dtype_old[i]`` and ``shape[i]`` describe the weight of layer ``i``;
        the bias is recorded with shape ``(shape[i][0],)``.
        """
        dtype = [self.dtype_dict[old] for old in dtype_old]
        slots = self._optimizer_slots(optimizer)
        for i in range(0, dense_num):
            prefix = 'dense'+str(i)
            bias_shape = (shape[i][0],)
            self.add(prefix+'-weight', dtype[i], shape[i])
            self.add(prefix+'-bias', dtype[i], bias_shape)
            for slot in slots:
                self.add(prefix+'-weight'+slot, dtype[i], shape[i])
            for slot in slots:
                self.add(prefix+'-bias'+slot, dtype[i], bias_shape)

    def save(self,path):
        """Write the registry to ``path`` and return it."""
        print('Saving weights_profile_path to {}'.format(path))
        with open(path,"w") as f:
            for k,v in self.weights_dict.items():
                f.write(k+'__'+ json.dumps(v) +'\n')
        return self.weights_dict

    def load(self,path):
        """Return the registry, reading it from ``path`` only if it is empty.

        A non-empty registry is returned as is and ``path`` is ignored.
        """
        if len(self.weights_dict)!=0:
            return self.weights_dict
        with open(path,'r') as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline).
                if not line.strip():
                    continue
                # Split on the last separator: the JSON written by save()
                # contains no "__", but a variable name may.
                variable_name, profile_json = line.rsplit('__', 1)
                self.weights_dict[variable_name] = json.loads(profile_json)
        return self.weights_dict


# Module-level instance imported by the other modules.
modelWeight = ModelWeights()
@@ -0,0 +1,315 @@ | |||||
""" | |||||
Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
from __future__ import absolute_import | |||||
from __future__ import division | |||||
from __future__ import print_function | |||||
import argparse | |||||
import numpy as np | |||||
import os | |||||
from .model_weights import modelWeight | |||||
import random | |||||
# Command-line options read by the threshold / index helpers below.
parser = argparse.ArgumentParser()
# Maps the integer dtype codes stored in the weights profile to numpy dtypes.
dtype_dict={2:np.float32,
            3:np.float64,
            4:np.int8,
            5:np.int32,
            6:np.int64,
            9:np.float16}
# NOTE: argparse applies `type` only to string defaults, so args.bn is the
# bool False (not a str) when --bn is omitted.
parser.add_argument("--bn", default=False,
                    type=str, help="Whether to use use bn layer")
parser.add_argument("--prune_method", default='bn',
                    type=str, help="method of prune(bn, conv_avg, random...)")
parser.add_argument("--model_load_dir", default = './output/snapshots/model_base/snapshot_last',
                    type = str, required = False, help = "Path of base oneflow model")
parser.add_argument("--model_save_dir", default = './output/snapshots/model_prune', type = str,
                    required = False, help = "Path to the output OneFlow model.")
parser.add_argument("--percent", default = 0.7, type = float, required = False,
                    help = "scale sparse rate (default: 0.7)")
parser.add_argument("--optimizer", type=str, default="momentum", required=False,
                    help="sgd, adam, momentum")
# Parsed once at import time; the functions below read this module-level `args`.
args = parser.parse_args()
def _LoadWeightBlob2Numpy(shape, folder, dtype): | |||||
if not os.path.exists(folder): | |||||
print('fail to find', folder) | |||||
filename = os.path.join(folder, 'out') | |||||
f = open(filename, 'r') | |||||
n = np.fromfile(f, dtype=dtype) | |||||
n = n.reshape(shape) | |||||
f.close() | |||||
return n | |||||
def name2array(name, weights_dict):
    """Load the stored tensor of variable ``name``.

    The blob is read from ``<args.model_load_dir>/<name>``; its shape and
    dtype code come from ``weights_dict[name]`` and the code is mapped to a
    numpy dtype through ``dtype_dict``.

    Returns:
        Tuple ``(array, numpy dtype, shape)``.
    """
    blob_dir = os.path.join(args.model_load_dir, name)
    entry = weights_dict[name]
    tensor_shape = entry["shape"]
    np_dtype = dtype_dict[entry["dtype"]]
    tensor = _LoadWeightBlob2Numpy(tensor_shape, blob_dir, np_dtype)
    return tensor, np_dtype, tensor_shape
# conv_avg pruning: the mean absolute weight of each conv filter is its
# scaling factor; derive the global threshold from those factors.
def get_pruneThre_conv_avg():
    """Return the global pruning threshold for the ``conv_avg`` method.

    Every output filter of every variable whose name ends in ``_weight``
    (names containing "stem" or "shortcut" are skipped) is scored by the mean
    of its absolute weights. The scores are sorted and the one at position
    ``int(len(scores) * args.percent)`` is printed and returned.

    Raises:
        IndexError: if no matching variable exists.
    """
    of_weight_path = args.model_load_dir.rsplit("/",1)[0] + "/weights_profile_path"
    weights_dict = modelWeight.load(of_weight_path)
    totalArray = []
    for name in weights_dict:
        if name.endswith("_weight") and "stem" not in name and "shortcut" not in name:
            array, dtype, shape = name2array(name, weights_dict)
            array = array.tolist()
            for i in range(0, shape[0]):
                array_i_flatten = [abs(m3) for m1 in array[i] for m2 in m1 for m3 in m2]
                # append in place instead of rebuilding the list per layer
                totalArray.append(sum(array_i_flatten)/(shape[1]*shape[2]*shape[3]))
    totalArray.sort()
    # Clamp the position so that percent == 1.0 selects the largest score
    # instead of indexing one past the end.
    threIndex = min(int(len(totalArray) * args.percent), len(totalArray) - 1)
    thre = totalArray[threIndex]
    print("threshold:", thre)
    return thre
# conv_all pruning: the sum of absolute weights of each conv filter is its
# scaling factor; derive the global threshold from those factors.
def get_pruneThre_conv_all():
    """Return the global pruning threshold for the ``conv_all`` method.

    Every output filter of every variable whose name ends in ``_weight``
    (names containing "stem" or "shortcut" are skipped) is scored by the sum
    of its absolute weights. The scores are sorted and the one at position
    ``int(len(scores) * args.percent)`` is printed and returned.

    Raises:
        IndexError: if no matching variable exists.
    """
    of_weight_path = args.model_load_dir.rsplit("/",1)[0] + "/weights_profile_path"
    weights_dict = modelWeight.load(of_weight_path)
    totalArray = []
    for name in weights_dict:
        if name.endswith("_weight") and "stem" not in name and "shortcut" not in name:
            array, dtype, shape = name2array(name, weights_dict)
            array = array.tolist()
            for i in range(0, shape[0]):
                array_i_flatten = [abs(m3) for m1 in array[i] for m2 in m1 for m3 in m2]
                # append in place instead of rebuilding the list per layer
                totalArray.append(sum(array_i_flatten))
    totalArray.sort()
    # Clamp the position so that percent == 1.0 selects the largest score
    # instead of indexing one past the end.
    threIndex = min(int(len(totalArray) * args.percent), len(totalArray) - 1)
    thre = totalArray[threIndex]
    print("threshold:", thre)
    return thre
# conv_max pruning: the largest absolute weight of each conv filter is its
# scaling factor; derive the global threshold from those factors.
def get_pruneThre_conv_max():
    """Return the global pruning threshold for the ``conv_max`` method.

    Every output filter of every variable whose name ends in ``_weight``
    (names containing "stem" or "shortcut" are skipped) is scored by the
    maximum of its absolute weights. The scores are sorted and the one at
    position ``int(len(scores) * args.percent)`` is printed and returned.

    Raises:
        IndexError: if no matching variable exists.
    """
    of_weight_path = args.model_load_dir.rsplit("/",1)[0] + "/weights_profile_path"
    weights_dict = modelWeight.load(of_weight_path)
    totalArray = []
    for name in weights_dict:
        if name.endswith("_weight") and "stem" not in name and "shortcut" not in name:
            array, dtype, shape = name2array(name, weights_dict)
            array = array.tolist()
            for i in range(0, shape[0]):
                array_i_flatten = [abs(m3) for m1 in array[i] for m2 in m1 for m3 in m2]
                # append in place instead of rebuilding the list per layer
                totalArray.append(max(array_i_flatten))
    totalArray.sort()
    # Clamp the position so that percent == 1.0 selects the largest score
    # instead of indexing one past the end.
    threIndex = min(int(len(totalArray) * args.percent), len(totalArray) - 1)
    thre = totalArray[threIndex]
    print("threshold:", thre)
    return thre
# bn pruning: the bn gamma values are the scaling factors; derive the global
# threshold from them.
def get_pruneThre_bn():
    """Return the global pruning threshold for the ``bn`` method.

    The values of every variable whose name ends in ``_bn-gamma`` (names
    containing "stem" or "shortcut" are skipped) are collected and sorted;
    the one at position ``int(len(values) * args.percent)`` is printed and
    returned.

    Raises:
        IndexError: if no matching variable exists.
    """
    of_weight_path = args.model_load_dir.rsplit("/",1)[0] + "/weights_profile_path"
    weights_dict = modelWeight.load(of_weight_path)
    totalArray = []
    for name in weights_dict:
        if name.endswith("_bn-gamma") and "stem" not in name and "shortcut" not in name:
            array, dtype, shape = name2array(name, weights_dict)
            # extend in place instead of rebuilding the list per layer
            totalArray.extend(array.tolist())
    totalArray.sort()
    # Clamp the position so that percent == 1.0 selects the largest value
    # instead of indexing one past the end.
    threIndex = min(int(len(totalArray) * args.percent), len(totalArray) - 1)
    thre = totalArray[threIndex]
    print("threshold:", thre)
    return thre
# Threshold used when pruning the weights of dense (dnn) layers.
def get_pruneThre_fc():
    """Return the global pruning threshold for dense-layer pruning.

    Each row of every variable whose name ends in ``-weight`` is scored by
    the mean of its absolute values. Variables whose name starts with
    ``dense<k>``, where ``k = int(len(weights_dict)/numDiv) - 1`` and
    ``numDiv`` is 6 for adam, 4 for momentum and 2 otherwise, are skipped.
    The scores are sorted and the one at position
    ``int(len(scores) * args.percent)`` is printed and returned.

    Raises:
        IndexError: if no matching variable exists.
    """
    of_weight_path = args.model_load_dir.rsplit("/",1)[0] + "/weights_profile_path"
    weights_dict=modelWeight.load(of_weight_path)
    dictLen = len(weights_dict)
    if args.optimizer == 'adam':
        numDiv = 6
    elif args.optimizer == 'momentum':
        numDiv = 4
    else:
        numDiv = 2
    # Loop-invariant prefix of the layer that is excluded from the ranking.
    skipped_prefix = "dense"+str(int(dictLen/numDiv)-1)
    totalArray = []
    for name in weights_dict:
        if name.startswith(skipped_prefix):
            continue
        if name.endswith("-weight"):
            array, dtype, shape = name2array(name, weights_dict)
            array = array.tolist()
            for i in range(0, shape[0]):
                array_i_flatten = [abs(m1) for m1 in array[i]]
                # append in place instead of rebuilding the list per layer
                totalArray.append(sum(array_i_flatten)/shape[1])
    totalArray.sort()
    # Clamp the position so that percent == 1.0 selects the largest score
    # instead of indexing one past the end.
    threIndex = min(int(len(totalArray) * args.percent), len(totalArray) - 1)
    thre = totalArray[threIndex]
    print("threshold:", thre)
    return thre
# Indices to remove for the fc pruning method.
def get_removeIndex_fc(a, shape, thre):
    """Return the row indices of ``a`` whose mean absolute value is below ``thre``.

    Args:
        a: 2-D weights, indexed as ``a[row][col]``.
        shape: shape of ``a``; ``shape[0]`` rows are scored and the row sum
            is divided by ``shape[1]``.
        thre: threshold compared against each row score.
    """
    row_scores = []
    for row_idx in range(shape[0]):
        magnitudes = [abs(value) for value in a[row_idx]]
        row_scores.append(sum(magnitudes)/shape[1])
    below = np.array(row_scores) < thre
    return np.where(below)[0]
# Indices to remove for the bn pruning method.
def get_removeIndex_bn(a, thre):
    """Return the indices (along the first axis) of the entries of ``a`` below ``thre``."""
    below = a < thre
    return np.where(below)[0]
# Indices to remove for the conv_avg pruning method.
def get_removeIndex_conv_avg(a, shape, thre):
    """Return the filter indices whose mean absolute weight is below ``thre``.

    Args:
        a: 4-D conv weights, indexed as ``a[filter][..][..][..]``.
        shape: shape of ``a``; ``shape[0]`` filters are scored and each sum
            is divided by ``shape[1]*shape[2]*shape[3]``.
        thre: threshold compared against each filter score.
    """
    scores = []
    for filter_idx in range(shape[0]):
        magnitudes = [abs(w)
                      for plane in a[filter_idx]
                      for row in plane
                      for w in row]
        # mean absolute weight of this filter
        scores.append(sum(magnitudes)/(shape[1]*shape[2]*shape[3]))
    below = np.array(scores) < thre
    return np.where(below)[0]
# Indices to remove for the conv_all pruning method.
def get_removeIndex_conv_all(a, shape, thre):
    """Return the filter indices whose summed absolute weight is below ``thre``.

    Args:
        a: 4-D conv weights, indexed as ``a[filter][..][..][..]``.
        shape: shape of ``a``; only ``shape[0]`` (number of filters) is read.
        thre: threshold compared against each filter score.
    """
    scores = []
    for filter_idx in range(shape[0]):
        magnitudes = [abs(w)
                      for plane in a[filter_idx]
                      for row in plane
                      for w in row]
        # total absolute weight of this filter
        scores.append(sum(magnitudes))
    below = np.array(scores) < thre
    return np.where(below)[0]
# Indices to remove for the conv_max pruning method.
def get_removeIndex_conv_max(a, shape, thre):
    """Return the filter indices whose largest absolute weight is below ``thre``.

    Args:
        a: 4-D conv weights, indexed as ``a[filter][..][..][..]``.
        shape: shape of ``a``; only ``shape[0]`` (number of filters) is read.
        thre: threshold compared against each filter score.
    """
    scores = []
    for filter_idx in range(shape[0]):
        magnitudes = [abs(w)
                      for plane in a[filter_idx]
                      for row in plane
                      for w in row]
        # largest absolute weight of this filter
        scores.append(max(magnitudes))
    below = np.array(scores) < thre
    return np.where(below)[0]
# Pick the indices to remove at random.
def get_removeIndex_random(shape):
    """Return a sorted random sample of ``int(shape[0]*args.percent)`` indices
    drawn from ``range(shape[0])``."""
    channel_count = shape[0]
    sample_size = int(channel_count*args.percent)
    picked = random.sample(range(channel_count), sample_size)
    picked.sort()
    return picked
# Indices to remove for the conv_similarity pruning method.
def get_removeIndex_conv_similarity(a, shape):
    """Return the filters to remove, chosen greedily by similarity.

    The distance between two filters is the sum of squared differences of
    their absolute weights. In every round the remaining filter with the
    smallest distance to any other remaining filter is removed (the lowest
    index wins ties). Rounds continue while the number of removed filters is
    ``<= shape[0]*args.percent`` and filters remain, so
    ``floor(shape[0]*args.percent) + 1`` filters are selected, capped at
    ``shape[0]``.

    Args:
        a: 4-D conv weights, indexed as ``a[filter][..][..][..]``.
        shape: shape of ``a``; only ``shape[0]`` (number of filters) is read.

    Returns:
        Sorted list of distinct filter indices.
    """
    n_filters = shape[0]
    # Flatten every filter once instead of once per compared pair.
    flat = [[abs(m3) for m1 in a[i] for m2 in m1 for m3 in m2]
            for i in range(n_filters)]
    # The distance is symmetric and does not change between rounds, so each
    # pair is computed a single time.
    dist = [[0] * n_filters for _ in range(n_filters)]
    for i in range(n_filters):
        for j in range(i + 1, n_filters):
            d = sum([(n1-n2)**2 for n1, n2 in zip(flat[i], flat[j])])
            dist[i][j] = d
            dist[j][i] = d

    removed = set()
    removeIndexs = []
    # The second condition stops the loop once every filter has been taken
    # (e.g. percent >= 1), where min() over no candidates used to raise.
    while len(removeIndexs) <= n_filters*args.percent and len(removeIndexs) < n_filters:
        candidates = [i for i in range(n_filters) if i not in removed]
        best_idx = None
        best_rank = None
        for i in candidates:
            rank = float("inf")
            for j in candidates:
                if j != i and dist[i][j] < rank:
                    rank = dist[i][j]
            # Track the real filter index: a position in the compacted list
            # of candidates is not a filter index once something was removed.
            if best_rank is None or rank < best_rank:
                best_idx, best_rank = i, rank
        removed.add(best_idx)
        removeIndexs.append(best_idx)
    return sorted(removeIndexs)
# Indices to remove for the bn_similarity pruning method.
def get_removeIndex_bn_similarity(a, shape):
    """Return the channels to remove, chosen greedily by similarity.

    The distance between two channels is the squared difference of their
    values in ``a``. In every round the remaining channel with the smallest
    distance to any other remaining channel is removed (the lowest index wins
    ties). Rounds continue while the number of removed channels is
    ``<= shape[0]*args.percent`` and channels remain, so
    ``floor(shape[0]*args.percent) + 1`` channels are selected, capped at
    ``shape[0]``.

    Args:
        a: 1-D per-channel values, indexed as ``a[channel]``.
        shape: shape of ``a``; only ``shape[0]`` is read.

    Returns:
        Sorted list of distinct channel indices.
    """
    n_channels = shape[0]
    removed = set()
    removeIndexs = []
    # The second condition stops the loop once every channel has been taken
    # (e.g. percent >= 1), where min() over no candidates used to raise.
    while len(removeIndexs) <= n_channels*args.percent and len(removeIndexs) < n_channels:
        candidates = [i for i in range(n_channels) if i not in removed]
        best_idx = None
        best_rank = None
        for i in candidates:
            rank = float("inf")
            for j in candidates:
                if j == i:
                    continue
                similarity = (a[i]-a[j])**2
                if similarity < rank:
                    rank = similarity
            # Track the real channel index: a position in the compacted list
            # of candidates is not a channel index once something was removed.
            if best_rank is None or rank < best_rank:
                best_idx, best_rank = i, rank
        removed.add(best_idx)
        removeIndexs.append(best_idx)
    return sorted(removeIndexs)
# Indices to remove for the conv_threshold pruning method.
# threSet is chosen by the caller; it does not come from a get_pruneThre_* function.
def get_removeIndex_conv_threshold(a, shape, threSet):
    """Return the filters holding the fewest weights larger than ``threSet``.

    Each filter is ranked by the number of its weights whose absolute value
    exceeds ``threSet``. The rank at position
    ``int(shape[0] * args.percent)`` of the sorted ranks is the cut-off, and
    every filter ranked strictly below it is returned.

    The rank used to count the weights *below* ``threSet``, which removed the
    filters with the most significant weights - the opposite of the
    documented method.

    Args:
        a: 4-D conv weights, indexed as ``a[filter][..][..][..]``.
        shape: shape of ``a``; only ``shape[0]`` (number of filters) is read.
        threSet: magnitude above which a weight counts as significant.

    Raises:
        IndexError: if ``shape[0]`` is 0.
    """
    a_rank = []
    for i in range(0, shape[0]):
        magnitudes = [abs(m3) for m1 in a[i] for m2 in m1 for m3 in m2]
        a_rank.append(sum(1 for n in magnitudes if n > threSet))
    # Clamp the position so that percent == 1.0 selects the largest rank
    # instead of indexing one past the end.
    threIndex = min(int(len(a_rank) * args.percent), len(a_rank) - 1)
    thre = sorted(a_rank)[threIndex]
    removeIndexs = np.where(np.array(a_rank)<thre)[0]
    return removeIndexs
def main():
    """Compute the bn pruning threshold and print it."""
    thre = get_pruneThre_bn()
    print(thre)


# Only run when this file is executed as a script.
if __name__ == "__main__":
    main()
@@ -0,0 +1,251 @@ | |||||
# 通道剪枝快速上手 | |||||
## 1. 简介 | |||||
通道剪枝:剪去DNN模型或者CNN模型的一些冗余的参数通道,来获得更小的模型和更快的结果 | |||||
炼知技术平台提供了7个通道剪枝相关算子,以及众多基于Oneflow算子复现的通道剪枝模型和使用示例。 | |||||
| 类型    | 通道剪枝算子         | 算子介绍                                                     |
| ------- | -------------------- | ------------------------------------------------------------ |
| DNN剪枝 | 神经元权重剪枝       | 以DNN神经网络的神经元训练参数的平均值作为剪枝权重,根据用户设置的剪枝率,剪去权重较小的神经元 |
| CNN剪枝 | BN层剪枝             | 以CNN神经网络的BN层gamma参数作为剪枝权重,根据用户设置的剪枝率,剪去权重较小的卷积通道(卷积核) |
| CNN剪枝 | 卷积层权重平均剪枝   | 以CNN神经网络的卷积层参数的平均值作为剪枝权重,根据用户设置的剪枝率,剪去权重较小的卷积通道(卷积核) |
| CNN剪枝 | 卷积层权重总和剪枝   | 以CNN神经网络的卷积层参数的总和作为剪枝权重,根据用户设置的剪枝率,剪去权重较小的卷积通道(卷积核) |
| CNN剪枝 | 卷积层权重最大值剪枝 | 以CNN神经网络的卷积层参数的最大值作为剪枝权重,根据用户设置的剪枝率,剪去权重较小的卷积通道(卷积核) |
| CNN剪枝 | 随机剪枝             | 根据用户设置的剪枝率,随机选取卷积通道(卷积核)进行剪枝       |
| CNN剪枝 | 卷积层阈值剪枝       | 计算CNN神经网络的卷积层参数中大于阈值的个数,将此个数作为剪枝的权重,根据用户设置的剪枝率,剪去权重较小的卷积通道(卷积核) |
## 2. 使用 | |||||
### 2.1 依赖及安装 | |||||
- CUDA Version 10.1.243 | |||||
- CUDA Driver Version: 418.56 | |||||
- oneflow_cu101 | |||||
- numpy > 1.17.0 | |||||
- 可通过以下命令安装 | |||||
``` | |||||
python3 -m pip install --find-links https://oneflow-inc.github.io/nightly oneflow_cu101 --user | |||||
# 若找不到对应版本,升级pip | |||||
python3 -m pip install --upgrade --user pip | |||||
# 运行关于numpy的报错,例如module 'numpy.random' has no attribute 'default_rng' | |||||
# 由于numpy版本低于1.17.0,升级numpy | |||||
python3 -m pip install --upgrade --user numpy | |||||
``` | |||||
### 2.2 数据获取 | |||||
- 数据集需转换成oneflow格式,存放在**ofData文件夹**下 | |||||
- 通道剪枝主要针对CV相关的任务,数据集需处理成oneflow格式,此任务提供了默认的数据集:Cifar10、Cifar100、mnist分类数据集。可从以下链接直接下载of格式数据集,并放至ofData数据集中:https://pan.baidu.com/s/1fj0DuQM6342CWx2DrMJGhQ(提取码:r8qx) | |||||
- 若要使用自定义数据集,使用方法见**2.3 运行**下的**使用自己数据集**
### 2.3 运行 | |||||
- **默认运行(训练基模型、剪枝、微调)** | |||||
``` | |||||
# cifar10数据集、alexnet模型 | |||||
python run.py | |||||
``` | |||||
- 运行结果见**output文件夹**,文件夹结构说明见**2.4 文件说明** | |||||
- 运行过程的日志见**log文件夹**,文件夹结构说明见**2.4 文件说明** | |||||
- **改变默认数据集(可选mnist、cifar10、cifar100)** | |||||
``` | |||||
python run.py --data_type=mnist | |||||
``` | |||||
- **改变默认模型(可选dnn_2、dnn_4、lenet、alexnet、vgg、resnet)** | |||||
``` | |||||
python run.py --model=lenet | |||||
``` | |||||
- **改变剪枝率** | |||||
``` | |||||
python run.py --percent=0.5 | |||||
``` | |||||
- **改变剪枝算子** | |||||
``` | |||||
# dnn剪枝不需要此参数,默认权重剪枝 | |||||
# cnn剪枝算子有bn、conv_avg、conv_all、conv_max、random、conv_threshold | |||||
python run.py --prune_method=bn | |||||
``` | |||||
- **改变更多参数** | |||||
- 见下面**2.4 文件说明**中**train_val.py文件** | |||||
- **使用自己数据集(以randomData255为例)** | |||||
- 数据集示例见myData下的randomData255,里面train.json包含了2张3\*32\*32大小的图片,test.json包含了2张3\*32\*32大小的图片 | |||||
- 创建自己的数据集文件夹在**myData**文件夹下,文件夹名为数据集的名字**randomData255** | |||||
- randomData255文件夹中有两个文件:train.json和test.json,介绍如下 | |||||
- **train.json** | |||||
- 存储为一个字典,字段有data、label、shape | |||||
- data为二维数组,如randomData255数据集为2张3\*32\*32大小的图片,则data维度为2\*3072,3072是3\*32\*32的展开,图片的像素值范围为 [0, 255]
- label为一维数组,代表每张图片的类别,randomData255数据集中长度为2,是一个2维度的向量 | |||||
- shape为一维数组,长度为3,第一个是通道数,第二三个为像素(需相等),如 [3, 32, 32] | |||||
- **test.json** | |||||
- 存储为一个字典,和train.json相似,字段有data、label,没有shape字段 | |||||
- 在ofrecordMake.py所在的文件夹下运行:
``` | |||||
# randomData255换成自己的数据集名称,制作完成的数据集见ofData文件夹 | |||||
python ofrecordMake.py --dataName=randomData255 | |||||
``` | |||||
- 基模型训练、剪枝、微调,运行: | |||||
``` | |||||
# randomData255换成自己的数据集名称 | |||||
python run.py --data_type=randomData255 | |||||
``` | |||||
- **自定义步骤step** | |||||
- 1代表训练基模型;2代表剪枝、3代表微调,默认step=123 | |||||
- 只运行训练基模型 | |||||
``` | |||||
python run.py --step=1 | |||||
``` | |||||
- 只运行剪枝: | |||||
``` | |||||
# 在output/snapshots中对应位置需要有model_base,位置介绍见下面output文件夹介绍 | |||||
python run.py --step=2 | |||||
``` | |||||
- 只运行微调: | |||||
``` | |||||
# 在./output/snapshots中对应位置需要有model_prune | |||||
python run.py --step=3 | |||||
``` | |||||
- 运行训练基模型、剪枝 | |||||
``` | |||||
python run.py --step=12 | |||||
``` | |||||
### 2.4 文件说明 | |||||
- **py文件说明** | |||||
- **run.py** | |||||
- 自动化调用train_val.py来进行训练和微调,并调用prune文件夹下的脚本进行剪枝
- 大部分参数设置为默认值,可以自动调整部分参数 | |||||
- 部分参数包括:model、data_type、prune_method、percent | |||||
- 示例 | |||||
``` | |||||
python run.py --model alexnet --data_type=cifar10 --prune_method=bn --step=123 | |||||
``` | |||||
- **train_val.py** | |||||
- 训练train和微调refine模型的主函数 | |||||
- 可以自己调整所有参数,参数列表见**2.5 config参数** | |||||
- 示例见run_dnn2_cifar10.sh、run_alexnet_cifar10.sh(bn剪枝算法) | |||||
- **ofrecordMake.py** | |||||
- 制作自定义数据集 | |||||
- **文件夹说明** | |||||
- **log文件夹** | |||||
- 日志文件夹,存储不同模型和数据的日志log文件,记录每个epoch在test数据集上的top1准确率、topk准确率、运行速度。 | |||||
- 如"log_vgg_cifar10_base_model.txt":vgg模型-cifar10数据集-baseline模型训练的log记录。 | |||||
- **model文件夹** | |||||
- **cnn文件夹** | |||||
- lenet_model.py:LeNet模型 | |||||
- alexnet_model.py:AlexNet模型 | |||||
- vgg_model.py:VggNet模型 | |||||
- resnet_model.py:ResNet模型 | |||||
- **dnn文件夹** | |||||
- dnn_model:Dnn模型,包括两层Dnn模型dnn_2、四层Dnn模型dnn_4 | |||||
- **util文件夹** | |||||
- config.py:命令行参数配置文件 | |||||
- job_function_util.py:job function相关config | |||||
- model_weights.py:模型加载、保存等相关函数
- ofrecord_util.py:数据集加载函数 | |||||
- optimizer_util.py:model相关config | |||||
- util.py:加载cfg,data,snapshot,summary等函数 | |||||
- **prune文件夹** | |||||
- util文件夹 | |||||
- 存放model_weight.py文件,模型加载、保存等相关函数 | |||||
- 存放prune_algorithm.py文件,剪枝的不同算法 | |||||
- 不同模型下的剪枝算法 | |||||
- **ofData文件夹** | |||||
- 存放of格式的数据 | |||||
- **output文件夹** | |||||
- 模型输出文件夹,模型文件存储在snapshots文件夹下 | |||||
- 按照模型分别存放,各自模型下按照数据集存放,各自数据集下分为基模型model_base、剪枝模型model_prune、微调模型model_refine。 | |||||
### 2.5 config参数 | |||||
- --dtype=float32:训练过程中参数的类型 | |||||
- --gpu_num_per_node=1:每个训练节点的GPU数量 | |||||
- --num_nodes=1:训练节点个数
- --model=vgg:训练中的模型(vgg、lenet、alexnet、dnn_2、dnn_4) | |||||
- --data_type='imageNet':加载的数据集(imageNet / cifar10) | |||||
- --log_type=base_model:写log日志的类型(base_model / prune_model)
- --default_dir=train:使用默认地址来加载和保存模型(推荐)'train'或者'refine' | |||||
- --model_load_dir='xxxxxx':自己指定模型加载地址(使用default_dir则不需要此项) | |||||
- --model_save_dir='xxxxxx':自己指定模型保存地址(使用default_dir则不需要此项) | |||||
- --batch_size_per_device=32:train中每个设备的batch_size(乘上gpu_num_per_node和num_nodes就是train_batch_size) | |||||
- --val_batch_size_per_device=32:test中每个设备的batch_size(乘上gpu_num_per_node和num_nodes就是test_batch_size) | |||||
- --num_classes=1000:分类数据集的类别个数 | |||||
- --num_epochs=1:epoch的个数 | |||||
- --num_examples=64000:决定train中的iter个数(除以train_batch_size就是iter个数) | |||||
- --num_val_examples=50000:决定test中的iter个数(除以test_batch_size就是iter个数) | |||||
- --rgb_mean=[123.68, 116.779, 103.939]:图片归一预处理时的均值 | |||||
- --rgb_std=[58.393, 57.12, 57.375]:图片归一预处理时的标准差
- --image_shape=[3, 224, 224]:图片的channel、height、width | |||||
- --log_dir='./output':log信息的保存地址 | |||||
- --result_dir='./output': results json保存地址。results json文件名格式为:args.result_dir, "results_"+args.model+'_'+args.data_type+'_'+args.log_type+"_{}.json".format(self.desc)) | |||||
- --loss_print_every_n_iter=1:每n个iter输出一次loss、accuracy、speed信息 | |||||
- --model_save_every_n_epoch=10:每n个epoch保存一次模型 | |||||
- --image_size=224:图片大小 | |||||
- --train_data_dir='./ofData/imageNet/train':训练数据集的目录 | |||||
- --train_data_part_num=30:训练数据集的part数(part0000-part00xx) | |||||
- --val_data_dir='./ofData/imageNet/test':测试数据集的目录 | |||||
- --val_data_part_num=2:测试数据集的part数(part0000-part00xx) | |||||
- --model_update='momentum':训练的优化器('momentum' / 'adam' / 'sgd') | |||||
- --learning_rate=0.01:学习率 | |||||
- --prune_method=bn:剪枝算法(bn、conv_avg、conv_all、conv_max、random、conv_similarity、bn_similarity、conv_threshold、以及不需要此参数的dnn剪枝) | |||||
- --step=123:剪枝步骤,1代表训练基模型;2代表剪枝、3代表微调,默认step=123 | |||||
@@ -0,0 +1,240 @@ | |||||
""" | |||||
Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
from __future__ import absolute_import | |||||
from __future__ import division | |||||
from __future__ import print_function | |||||
import os | |||||
import argparse | |||||
import json | |||||
from datetime import datetime | |||||
def str2bool(v):
    """Translate a textual command-line flag into a bool.

    The comparison is case-insensitive. Accepted spellings are
    yes/true/t/y/1 for True and no/false/f/n/0 for False.

    Raises:
        argparse.ArgumentTypeError: for any other spelling.
    """
    lowered = v.lower()
    if lowered in ('yes', 'true', 't', 'y', '1'):
        return True
    if lowered in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Unsupported value encountered.')
# Command-line interface of the pipeline driver. Values are kept as strings
# because they are only spliced into the shell commands built by getCommand().
parser = argparse.ArgumentParser()
parser.add_argument("--model", default="alexnet",
                    type=str, help="Model")
parser.add_argument("--data_type", default="cifar10",
                    type=str, help="Dataset name")
parser.add_argument("--bn", type=str2bool,
                    default=True, help="Whether to use bn layer")
# The help text previously advertised a default of 0.7 while the real default
# is 0.5.
parser.add_argument("--percent", default="0.5",
                    type=str, help="scale sparse rate (default: 0.5)")
parser.add_argument("--prune_method", default='bn',
                    type=str, help="method of prune(bn, conv_avg, random...)")
# Any combination of the digits 1 (train), 2 (prune) and 3 (refine).
parser.add_argument("--step", default='123',
                    type=str, help="choose steps from train, prune, refine")
parser.add_argument("--dataset_dir", type=str, default="./ofData/cifar10", help="dataset info load directory")
# snapshot
parser.add_argument("--model_save_dir", type=str, default="./models", help="model save directory", )
# log, save and loss print
# NOTE(review): --model_dir is parsed but is not read by getCommand() or
# main() in this file.
parser.add_argument("--model_dir", type=str, default="./model", help="model info save directory")
parser.add_argument("--log_dir", type=str, default="./log", help="log info save directory")
parser.add_argument("--before_result_dir", type=str, default="./result/before", help="the save directory of results")
parser.add_argument("--after_result_dir", type=str, default="./result/after", help="the save directory of results")
# Parsed at import time; getCommand() and main() read this module-level value.
args = parser.parse_args()
def _dataset_settings(data_type, dataset_dir):
    """Return the dataset-dependent train_val.py flag values as a dict."""
    fields = ("num_classes", "train_data_part_num", "val_data_part_num",
              "image_shape", "image_size", "resize_shorter",
              "rgb_mean", "rgb_std", "num_examples", "num_val_examples")
    presets = {
        "cifar10": ("10", "5", "1", "3,32,32", "32", "32",
                    "124.95,122.65,114.75", "61.252,60.767,65.852",
                    "50000", "10000"),
        "cifar100": ("100", "5", "1", "3,32,32", "32", "32",
                     "124.95,122.65,114.75", "61.252,60.767,65.852",
                     "50000", "10000"),
        "mnist": ("10", "6", "1", "1,28,28", "28", "32",
                  "33.3285", "78.5655",
                  "60000", "10000"),
        # NOTE(review): svhn is the only preset whose image_shape is written
        # channels-last ("32,32,3"); the value is kept unchanged -- confirm
        # it matches what train_val.py expects.
        "svhn": ("10", "1", "1", "32,32,3", "32", "32",
                 "111.61,113.16,120.57", "50.50,51.26,50.24",
                 "73257", "26032"),
        "imageNet": ("1000", "30", "2", "3,224,224", "224", "256",
                     "123.68,116.779,103.939", "58.393,57.12,57.375",
                     "64000", "6400"),
    }
    if data_type in presets:
        return dict(zip(fields, presets[data_type]))

    # Any other dataset name is treated as a custom dataset described by
    # <dataset_dir>/meta.json.
    with open(os.path.join(dataset_dir, "meta.json")) as f_meta:
        dict_meta = json.load(f_meta)
    shape = dict_meta["image_shape"]
    return {
        "num_classes": str(dict_meta["num_classes"]),
        "train_data_part_num": "1",
        "val_data_part_num": "1",
        "image_shape": ",".join([str(shape[0]), str(shape[1]), str(shape[2])]),
        "image_size": str(shape[1]),
        "resize_shorter": str(shape[1]),
        "rgb_mean": ",".join(str(mean) for mean in dict_meta["rgb_mean"]),
        "rgb_std": ",".join(str(std) for std in dict_meta["rgb_std"]),
        "num_examples": dict_meta["num_examples"],
        "num_val_examples": dict_meta["num_val_examples"],
    }


def _train_val_command(settings, model, data_type, log_type, bn,
                       dataset_dir, extra_flags):
    """Build one train_val.py invocation (shared by training and refining)."""
    flags = [
        "--model={}".format(model),
        "--data_type={}".format(data_type),
        "--log_type={}".format(log_type),
        "--model_update=adam",
        "--num_classes={}".format(settings["num_classes"]),
        "--train_data_dir={}/train".format(dataset_dir),
        "--train_data_part_num={}".format(settings["train_data_part_num"]),
        "--val_data_dir={}/test".format(dataset_dir),
        "--val_data_part_num={}".format(settings["val_data_part_num"]),
        "--num_nodes=1",
        "--gpu_num_per_node=1",
        "--loss_print_every_n_iter=1",
        "--label_smoothing=0",
        "--warmup_epochs=0",
        "--lr_decay=None",
        "--image_shape={}".format(settings["image_shape"]),
        "--image_size={}".format(settings["image_size"]),
        "--resize_shorter={}".format(settings["resize_shorter"]),
        "--rgb_mean={}".format(settings["rgb_mean"]),
        "--rgb_std={}".format(settings["rgb_std"]),
        "--num_examples={}".format(settings["num_examples"]),
        "--num_val_examples={}".format(settings["num_val_examples"]),
        "--batch_size_per_device=32",
        "--val_batch_size_per_device=32",
        "--learning_rate=0.001",
        "--bn={}".format(bn),
        "--num_epochs=2",
        "--model_save_every_n_epoch=10",
    ]
    return " ".join(["python3 ./train_val.py"] + flags + list(extra_flags))


def getCommand():
    """Build the shell commands for the three pipeline stages.

    Reads the module-level ``args`` namespace.

    Returns:
        Tuple ``(command1, command2, command3)`` of command strings:
        base-model training, pruning and refining of the pruned model.
        Flags are separated by single spaces; the flag set and order match
        the previous templates.

    Raises:
        IOError/OSError, KeyError, ValueError: when ``args.data_type`` is not
        a built-in dataset and ``<dataset_dir>/meta.json`` is missing or
        incomplete.
    """
    model = args.model
    data_type = args.data_type
    dataset_dir = args.dataset_dir
    model_save_dir = args.model_save_dir
    log_dir = args.log_dir

    # Select the prune script suffix (./prune/prune<suffix>.py). An
    # unrecognised model keeps the empty suffix, as before.
    bn = args.bn
    prune = ""
    if "dnn" in model:
        # The DNN models are always driven with bn disabled.
        bn = "False"
        prune = "Dnn"
    elif model == "lenet":
        prune = "Lenet"
    elif "alexnet" in model:
        prune = "Alexnet"
    elif model == "vgg":
        prune = "Vggnet"
    elif model == "resnet":
        prune = "Resnet"

    settings = _dataset_settings(data_type, dataset_dir)

    command1 = _train_val_command(
        settings, model, data_type, "base_model", bn, dataset_dir,
        ["--model_save_dir={}/model_base".format(model_save_dir),
         "--log_dir={}".format(log_dir),
         "--before_result_dir={}".format(args.before_result_dir)])

    prune_flags = ["--percent={}".format(args.percent), "--optimizer=adam"]
    if "dnn" not in model:
        # The DNN prune script is invoked without a method or bn flag.
        prune_flags.append("--prune_method={}".format(args.prune_method))
        prune_flags.append("--bn={}".format(bn))
    prune_flags.append(
        "--model_load_dir={}/model_base/snapshot_last".format(model_save_dir))
    prune_flags.append("--model_save_dir={}/model_prune".format(model_save_dir))
    command2 = " ".join(
        ["python3 ./prune/prune{}.py".format(prune)] + prune_flags)

    command3 = _train_val_command(
        settings, model, data_type, "prune_model", bn, dataset_dir,
        ["--model_save_dir={}/model_refine".format(model_save_dir),
         "--model_load_dir={}/model_prune/model".format(model_save_dir),
         "--log_dir={}".format(log_dir),
         "--after_result_dir={}".format(args.after_result_dir)])

    return command1, command2, command3
def main():
    """Run the pipeline stages selected by ``args.step`` in order.

    ``args.step`` is a string of digits: 1 trains the base model, 2 prunes
    it and 3 refines the pruned model. Later stages consume the output of
    the earlier ones, so the run stops at the first stage whose command
    reports a non-zero status instead of continuing with stale or missing
    inputs.

    Raises:
        SystemExit: when a selected stage fails.
    """
    command1, command2, command3 = getCommand()
    step = args.step
    stages = (
        ("1", "train", command1),
        ("2", "prune", command2),
        ("3", "refine", command3),
    )
    for key, label, command in stages:
        if key not in step:
            continue
        status = os.system(command)
        if status != 0:
            raise SystemExit(
                "Step {} ({}) failed with status {}; "
                "skipping the remaining steps.".format(key, label, status))


if __name__ == "__main__":
    main()
@@ -0,0 +1,80 @@ | |||||
# Example pipeline: train AlexNet on CIFAR-10, prune it with the BN method,
# then fine-tune the pruned model.
export ENABLE_USER_OP=True
# NOTE(review): CUDA itself reads CUDA_VISIBLE_DEVICES; confirm that
# VISIBLE_DEVICES is the variable the framework expects.
export VISIBLE_DEVICES=3

# Step 1: train the base model (30 epochs); snapshots go to model_base.
# NOTE(review): run.py launches ./train_val.py; confirm of_cnn_train_val.py
# is the intended entry point for this script.
python3 of_cnn_train_val.py \
    --model=alexnet \
    --data_type=cifar10 \
    --log_type=base_model \
    --model_update=adam \
    --num_classes=10 \
    --train_data_dir=./ofData/cifar10/train \
    --train_data_part_num=5 \
    --val_data_dir=./ofData/cifar10/test \
    --val_data_part_num=1 \
    --num_nodes=1 \
    --gpu_num_per_node=1 \
    --loss_print_every_n_iter=1 \
    --label_smoothing=0 \
    --warmup_epochs=0 \
    --lr_decay=None \
    --image_shape=3,32,32 \
    --image_size=32 \
    --resize_shorter=32 \
    --rgb_mean=124.95,122.65,114.75 \
    --rgb_std=61.252,60.767,65.852 \
    --num_examples=50000 \
    --num_val_examples=10000 \
    --batch_size_per_device=32 \
    --val_batch_size_per_device=32 \
    --learning_rate=0.001 \
    --bn=True \
    --num_epochs=30 \
    --model_save_every_n_epoch=10 \
    --model_save_dir=./output/snapshots/alexnet/cifar10/model_base

# Step 2: prune the last base-model snapshot (ratio 0.7, BN method) and write
# the result to model_prune.
python3 ./prune/pruneAlexnet.py \
    --percent=0.7 \
    --optimizer=adam \
    --prune_method=bn \
    --bn=True \
    --model_load_dir=./output/snapshots/alexnet/cifar10/model_base/snapshot_last \
    --model_save_dir=./output/snapshots/alexnet/cifar10/model_prune

# Step 3: fine-tune the pruned model (100 epochs), loading model_prune/model
# and saving to model_refine.
python3 of_cnn_train_val.py \
    --model=alexnet \
    --data_type=cifar10 \
    --model_update=adam \
    --log_type=prune_model \
    --num_classes=10 \
    --train_data_dir=./ofData/cifar10/train \
    --train_data_part_num=5 \
    --val_data_dir=./ofData/cifar10/test \
    --val_data_part_num=1 \
    --num_nodes=1 \
    --gpu_num_per_node=1 \
    --loss_print_every_n_iter=1 \
    --label_smoothing=0 \
    --warmup_epochs=0 \
    --lr_decay=None \
    --image_shape=3,32,32 \
    --image_size=32 \
    --resize_shorter=32 \
    --rgb_mean=124.95,122.65,114.75 \
    --rgb_std=61.252,60.767,65.852 \
    --num_examples=50000 \
    --num_val_examples=10000 \
    --batch_size_per_device=32 \
    --val_batch_size_per_device=32 \
    --learning_rate=0.001 \
    --bn=True \
    --num_epochs=100 \
    --model_save_every_n_epoch=10 \
    --model_save_dir=./output/snapshots/alexnet/cifar10/model_refine \
    --model_load_dir=./output/snapshots/alexnet/cifar10/model_prune/model
@@ -0,0 +1,78 @@ | |||||
# Example pipeline: train the two-layer DNN (dnn_2) on CIFAR-10, prune it,
# then fine-tune the pruned model.
export ENABLE_USER_OP=True
# NOTE(review): CUDA itself reads CUDA_VISIBLE_DEVICES; confirm that
# VISIBLE_DEVICES is the variable the framework expects.
export VISIBLE_DEVICES=3

# Step 1: train the base model (30 epochs); snapshots go to model_base.
# NOTE(review): run.py launches ./train_val.py and forces bn to False for dnn
# models, while this script passes --bn=True to of_cnn_train_val.py; confirm
# both are intended.
python3 of_cnn_train_val.py \
    --model=dnn_2 \
    --data_type=cifar10 \
    --log_type=base_model \
    --model_update=adam \
    --num_classes=10 \
    --train_data_dir=./ofData/cifar10/train \
    --train_data_part_num=5 \
    --val_data_dir=./ofData/cifar10/test \
    --val_data_part_num=1 \
    --num_nodes=1 \
    --gpu_num_per_node=1 \
    --loss_print_every_n_iter=1 \
    --label_smoothing=0 \
    --warmup_epochs=0 \
    --lr_decay=None \
    --image_shape=3,32,32 \
    --image_size=32 \
    --resize_shorter=32 \
    --rgb_mean=124.95,122.65,114.75 \
    --rgb_std=61.252,60.767,65.852 \
    --num_examples=50000 \
    --num_val_examples=10000 \
    --batch_size_per_device=32 \
    --val_batch_size_per_device=32 \
    --learning_rate=0.001 \
    --bn=True \
    --num_epochs=30 \
    --model_save_every_n_epoch=10 \
    --model_save_dir=./output/snapshots/dnn_2/cifar10/model_base

# Step 2: prune the last base-model snapshot (ratio 0.5); the DNN prune script
# takes no --prune_method or --bn flag here.
python3 ./prune/pruneDnn.py \
    --percent=0.5 \
    --optimizer=adam \
    --model_load_dir=./output/snapshots/dnn_2/cifar10/model_base/snapshot_last \
    --model_save_dir=./output/snapshots/dnn_2/cifar10/model_prune

# Step 3: fine-tune the pruned model (100 epochs), loading model_prune/model
# and saving to model_refine.
python3 of_cnn_train_val.py \
    --model=dnn_2 \
    --data_type=cifar10 \
    --model_update=adam \
    --log_type=prune_model \
    --num_classes=10 \
    --train_data_dir=./ofData/cifar10/train \
    --train_data_part_num=5 \
    --val_data_dir=./ofData/cifar10/test \
    --val_data_part_num=1 \
    --num_nodes=1 \
    --gpu_num_per_node=1 \
    --loss_print_every_n_iter=1 \
    --label_smoothing=0 \
    --warmup_epochs=0 \
    --lr_decay=None \
    --image_shape=3,32,32 \
    --image_size=32 \
    --resize_shorter=32 \
    --rgb_mean=124.95,122.65,114.75 \
    --rgb_std=61.252,60.767,65.852 \
    --num_examples=50000 \
    --num_val_examples=10000 \
    --batch_size_per_device=32 \
    --val_batch_size_per_device=32 \
    --learning_rate=0.001 \
    --bn=True \
    --num_epochs=100 \
    --model_save_every_n_epoch=10 \
    --model_save_dir=./output/snapshots/dnn_2/cifar10/model_refine \
    --model_load_dir=./output/snapshots/dnn_2/cifar10/model_prune/model
@@ -0,0 +1,153 @@ | |||||
""" | |||||
Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
from __future__ import absolute_import | |||||
from __future__ import division | |||||
from __future__ import print_function | |||||
import os | |||||
import math | |||||
import oneflow as flow | |||||
import util.config as configs | |||||
from util.util import Snapshot, Summary, InitNodes, Metric, LoadCfg, LoadData | |||||
from util.job_function_util import get_train_config, get_val_config | |||||
import model.cnn.resnet_model as resnet_model | |||||
import model.cnn.vgg_model as vgg_model | |||||
import model.cnn.alexnet_model as alexnet_model | |||||
import model.cnn.lenet_model as lenet_model | |||||
import model.dnn.dnn_model as dnn_model | |||||
from util.model_weights import modelWeight | |||||
# Module-level setup: parse the command line once at import time; the job
# functions and main() below read these globals.
parser = configs.get_parser()
args = parser.parse_args()
configs.print_args(args)
# Global batch sizes = per-device batch size * devices per node * nodes.
total_device_num = args.num_nodes * args.gpu_num_per_node
train_batch_size = total_device_num * args.batch_size_per_device
val_batch_size = total_device_num * args.val_batch_size_per_device
# Unpacks exactly three values from --image_shape (ValueError otherwise).
(C, H, W) = args.image_shape
# Training iterations per epoch, rounded up.
epoch_size = math.ceil(args.num_examples / train_batch_size)
# Validation iterations per epoch; int() truncates the quotient.
num_val_steps = int(args.num_val_examples / val_batch_size)
# Maps the --model value to the function that builds the network.
model_dict = {"resnet": resnet_model.resnet50,
              "vgg": vgg_model.vgg,
              "alexnet": alexnet_model.alexnet,
              "alexnet_simple": alexnet_model.alexnet_simple,
              "lenet": lenet_model.lenet,
              "dnn_2": dnn_model.dnn_2,
              "dnn_4": dnn_model.dnn_4,}
flow.config.gpu_device_num(args.gpu_num_per_node)
# NOTE(review): debug mode is switched on unconditionally -- confirm this is
# wanted outside of development runs.
flow.config.enable_debug_mode(True)
# Extra NCCL settings applied only when --use_boxing_v2 is set.
if args.use_boxing_v2:
    flow.config.collective_boxing.nccl_fusion_threshold_mb(8)
    flow.config.collective_boxing.nccl_fusion_all_reduce_use_buffer(False)
def label_smoothing(labels, classes, eta, dtype):
    """Return smoothed one-hot targets built with ``flow.one_hot``.

    The "on" entry receives ``1 - eta + eta / classes`` and every other entry
    receives ``eta / classes``.

    Args:
        labels: class indices passed to ``flow.one_hot``.
        classes: number of classes; must be positive.
        eta: smoothing factor; must satisfy ``0 <= eta < 1``.
        dtype: data type of the produced tensor.

    Raises:
        AssertionError: if ``classes`` or ``eta`` is out of range.
    """
    assert classes > 0
    assert 0.0 <= eta < 1.0
    off_value = eta / classes
    on_value = 1 - eta + off_value
    return flow.one_hot(labels, depth=classes, dtype=dtype,
                        on_value=on_value, off_value=off_value)
@flow.global_function("train", get_train_config(args))
def TrainNet():
    """Training job: build the selected model and register its loss.

    Returns:
        Dict with the keys "loss", "predictions" (softmax of the logits) and
        "labels".
    """
    cfg = LoadCfg(args=args, model_load_dir=args.model_load_dir, load_type='train')
    labels, images = LoadData(args, 'train')

    # The CNN builders take two extra keyword arguments that the DNN
    # builders are not given.
    model_kwargs = {"optimizer": args.model_update}
    if args.model in ("resnet", "vgg", "alexnet", "alexnet_simple", "lenet"):
        model_kwargs["need_transpose"] = not args.train_data_dir
        model_kwargs["bn"] = args.bn
    logits = model_dict[args.model](images, cfg, **model_kwargs)

    if args.label_smoothing > 0:
        # Dense (smoothed one-hot) targets.
        one_hot_labels = label_smoothing(labels, args.num_classes,
                                         args.label_smoothing, logits.dtype)
        loss = flow.nn.softmax_cross_entropy_with_logits(
            one_hot_labels, logits, name="softmax_loss")
    else:
        # Sparse integer targets.
        loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
            labels, logits, name="softmax_loss")

    flow.losses.add_loss(loss)
    predictions = flow.nn.softmax(logits)
    return {"loss": loss, "predictions": predictions, "labels": labels}
@flow.global_function("predict", get_val_config(args))
def InferenceNet():
    """Validation job: run the selected model on the test data.

    Returns:
        Dict with the keys "predictions" (softmax of the logits) and
        "labels".
    """
    cfg = LoadCfg(args=args, model_load_dir=args.model_load_dir, load_type='test')
    labels, images = LoadData(args, 'test')

    # Every builder is called with model_weight=False here; the CNN builders
    # additionally receive need_transpose and bn.
    model_kwargs = {"optimizer": args.model_update, "model_weight": False}
    if args.model in ("resnet", "vgg", "alexnet", "alexnet_simple", "lenet"):
        model_kwargs["need_transpose"] = not args.train_data_dir
        model_kwargs["bn"] = args.bn
    logits = model_dict[args.model](images, cfg, **model_kwargs)

    return {"predictions": flow.nn.softmax(logits), "labels": labels}
def main():
    """Train for ``args.num_epochs`` epochs with validation and snapshots.

    Per epoch: run ``epoch_size`` training iterations, then (when
    ``args.val_data_dir`` is set) ``num_val_steps`` validation iterations
    whose metrics are written to the log file, and save a snapshot every
    ``args.model_save_every_n_epoch`` epochs. After the last epoch the
    "last" snapshot and the model weight profile are saved.
    """
    InitNodes(args)
    flow.env.grpc_use_no_signal()
    flow.env.log_dir(args.log_dir)

    summary = Summary(args.log_dir, args)
    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

    if not args.before_result_dir:
        args.before_result_dir = "./log/before"
    if not args.after_result_dir:
        args.after_result_dir = "./log/after"

    # The log file lives in a fixed ./log directory (independent of
    # args.log_dir); create it so a direct invocation does not fail on open.
    if not os.path.isdir("./log"):
        os.makedirs("./log")
    log_path = "./log/log_"+args.model+"_"+args.data_type+"_"+args.log_type+".txt"

    # The context manager closes the log file, which was previously left
    # open for the lifetime of the process.
    with open(log_path, "w") as log_file:
        for epoch in range(args.num_epochs):
            # Callback configuration for the training iterations.
            metric = Metric(desc='train', calculate_batches=args.loss_print_every_n_iter,
                            summary=summary, save_summary_steps=epoch_size,
                            batch_size=train_batch_size, loss_key='loss')
            for i in range(epoch_size):
                TrainNet().async_get(metric.metric_cb(epoch, i))

            if args.val_data_dir:
                # Callback configuration for the validation iterations.
                metric = Metric(desc='validation', calculate_batches=num_val_steps, summary=summary,
                                save_summary_steps=num_val_steps, batch_size=val_batch_size)
                for i in range(num_val_steps):
                    InferenceNet().async_get(metric.metric_cb(epoch, i, args=args, log_file=log_file))

            if epoch % args.model_save_every_n_epoch == 0:
                snapshot.save('epoch_{}'.format(epoch))
                flow.sync_default_session()

        # Save the final snapshot.
        snapshot.save('last')
        # NOTE(review): the log file is closed right after this sync on the
        # assumption that it waits for the pending async callbacks -- confirm.
        flow.sync_default_session()

    weights_profile_path = os.path.join(args.model_save_dir, "weights_profile_path")
    modelWeight.save(weights_profile_path)
if __name__ == "__main__":
    import shutil

    # Start from an empty snapshot directory. shutil.rmtree replaces the
    # former `rm -rf` shell command, which broke on paths containing spaces
    # or glob characters; a missing directory is ignored, as with `rm -f`.
    shutil.rmtree(args.model_save_dir, ignore_errors=True)
    main()
@@ -0,0 +1,144 @@ | |||||
""" | |||||
Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
from __future__ import absolute_import | |||||
from __future__ import division | |||||
from __future__ import print_function | |||||
import argparse | |||||
from datetime import datetime | |||||
from util.optimizer_util import add_optimizer_args | |||||
from util.ofrecord_util import add_ofrecord_args | |||||
def get_parser(parser=None):
    """Return an argument parser with the training/validation flags.

    Args:
        parser: existing ``argparse.ArgumentParser`` to extend; a new one is
            created when omitted.

    Returns:
        The parser, after the flags below plus those registered by
        ``add_ofrecord_args`` and ``add_optimizer_args`` have been added.
    """
    def str_list(x):
        """Split a comma-separated string into a list of strings."""
        return x.split(',')

    def int_list(x):
        """Parse a comma-separated string into a list of ints."""
        return [int(item) for item in x.split(',')]

    def float_list(x):
        """Parse a comma-separated string into a list of floats."""
        return [float(item) for item in x.split(',')]

    def str2bool(v):
        """Map yes/true/t/y/1 and no/false/f/n/0 (any case) to a bool."""
        lowered = v.lower()
        if lowered in ('yes', 'true', 't', 'y', '1'):
            return True
        if lowered in ('no', 'false', 'f', 'n', '0'):
            return False
        raise argparse.ArgumentTypeError('Unsupported value encountered.')

    if parser is None:
        parser = argparse.ArgumentParser("flags for cnn benchmark")
    add = parser.add_argument

    add("--dtype", type=str, default='float32', help="float16 float32")

    # resource
    add("--gpu_num_per_node", type=int, default=1)
    add('--num_nodes', type=int, default=1,
        help='node/machine number for training')
    add('--node_ips', type=str_list,
        default=['192.168.1.13', '192.168.1.14'],
        help='nodes ip list for training, devided by ",", length >= num_nodes')
    add("--model", type=str, default="vgg", help="vgg, alexnet, lenet")
    add('--use_fp16', type=str2bool, nargs='?', const=True,
        help='Whether to use use fp16')
    add('--use_boxing_v2', type=str2bool, nargs='?', const=True,
        help='Whether to use boxing v2')

    # train and validation
    # NOTE: a --default_dir flag is present in the original only as
    # commented-out code; it is not registered.
    add("--bn", type=str2bool, default=False,
        help="Whether to use use bn layer")
    add("--data_type", type=str, default='imageNet',
        help="type of dataser (imageNet / cifar10...)")
    add("--log_type", type=str, default='base_model',
        help="type of log (base_model/prune_model)")
    add('--num_epochs', type=int, default=90, help='number of epochs')
    add("--model_load_dir", type=str, default=None,
        help="model load directory if need")
    add("--batch_size_per_device", type=int, default=64)
    add("--val_batch_size_per_device", type=int, default=8)

    # inference
    add("--image_path", type=str, default='tiger.jpg', help="image path")

    # for data process
    add("--num_classes", type=int, default=1000, help="num of pic classes")
    add("--num_examples", type=int, default=300000, help="train pic number")
    add("--num_val_examples", type=int, default=50000,
        help="validation pic number")
    add('--rgb_mean', type=float_list, default=[123.68, 116.779, 103.939],
        help='a tuple of size 3 for the mean rgb')
    add('--rgb_std', type=float_list, default=[58.393, 57.12, 57.375],
        help='a tuple of size 3 for the std rgb')
    add("--input_layout", type=str, default='NHWC', help="NCHW or NHWC")
    add('--image_shape', type=int_list, default=[3, 224, 224],
        help='the image shape feed into the network')
    add('--label_smoothing', type=float, default=0.1,
        help='label smoothing factor')

    # snapshot: the default directory name carries the parser-creation time
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    add("--model_save_dir", type=str,
        default="./output/snapshots/model_save-{}".format(timestamp),
        help="model save directory")

    # log, save and loss print
    add("--log_dir", type=str, default="./output",
        help="log info save directory")
    add("--before_result_dir", type=str, default="",
        help="the save directory of results")
    add("--after_result_dir", type=str, default="",
        help="the save directory of results")
    add("--loss_print_every_n_iter", type=int, default=1,
        help="print loss every n iteration")
    add("--model_save_every_n_epoch", type=int, default=10,
        help="save model every n epoch")

    add_ofrecord_args(parser)
    add_optimizer_args(parser)
    return parser
def print_args(args):
    """Print a banner, every parsed argument and a time stamp to stdout.

    Args:
        args: parsed argument namespace. It must expose ``model``,
            ``gpu_num_per_node`` and ``num_nodes``; every attribute found in
            ``vars(args)`` is echoed as ``name = value``.
    """
    heavy_rule = "=" * 66
    print(heavy_rule)
    print("Running {}: num_gpu_per_node = {}, num_nodes = {}.".format(
        args.model, args.gpu_num_per_node, args.num_nodes))
    print(heavy_rule)

    # One "name = value" line per argument, in namespace order.
    for key, value in vars(args).items():
        print("{} = {}".format(key, value))

    print("-" * 66)
    stamp = datetime.now().strftime("%Y-%m-%d-%H:%M:%S")
    print("Time stamp: {}".format(stamp))
if __name__ == '__main__':
    # Script entry point: parse the command line and echo the resulting
    # configuration.
    cli_parser = get_parser()
    print_args(cli_parser.parse_args())
@@ -0,0 +1,53 @@ | |||||
""" | |||||
Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
from __future__ import absolute_import | |||||
from __future__ import division | |||||
from __future__ import print_function | |||||
import oneflow as flow | |||||
from util.optimizer_util import gen_model_update_conf | |||||
def _default_config(args):
    """Build the function config shared by the training and validation jobs.

    The config uses the consistent logical view and ``flow.float`` as the
    default data type. Automatic mixed precision is switched on only when
    ``args.use_fp16`` is truthy.

    Args:
        args: parsed arguments; only ``use_fp16`` is read here.

    Returns:
        The configured ``flow.function_config()`` object.
    """
    func_cfg = flow.function_config()
    func_cfg.default_logical_view(flow.scope.consistent_view())
    func_cfg.default_data_type(flow.float)
    if not args.use_fp16:
        return func_cfg
    func_cfg.enable_auto_mixed_precision(True)
    return func_cfg
def get_train_config(args):
    """Return the function config used by the training job.

    Starts from ``_default_config(args)`` and then sets the primary learning
    rate, optional boxing v2, parallel-cast pruning, the model update
    (optimizer / schedule) configuration and in-place execution.

    Args:
        args: parsed arguments; reads ``learning_rate`` and ``use_boxing_v2``
            and forwards ``args`` to ``gen_model_update_conf``.

    Returns:
        The configured function config object.
    """
    cfg = _default_config(args)
    cfg.train.primary_lr(args.learning_rate)

    # Tuning knobs that are currently left disabled in this file:
    #   disable_all_reduce_sequence(False), cudnn_conv_enable_pseudo_half(True),
    #   all_reduce_group_min_mbyte(8), all_reduce_group_num(128),
    #   all_reduce_lazy_ratio(0), enable_nccl_hierarchical_all_reduce(True),
    #   cudnn_buf_limit_mbyte(2048), concurrency_width(2)

    if args.use_boxing_v2:
        cfg.use_boxing_v2(True)

    cfg.prune_parallel_cast_ops(True)
    update_conf = gen_model_update_conf(args)
    cfg.train.model_update_conf(update_conf)
    cfg.enable_inplace(True)
    return cfg
def get_val_config(args):
    """Return the function config for validation jobs.

    Validation uses the shared defaults from ``_default_config`` unchanged.
    """
    val_config = _default_config(args)
    return val_config
@@ -0,0 +1,97 @@ | |||||
""" | |||||
Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
import json | |||||
import oneflow as flow | |||||
# mysingle.py | |||||
class ModelWeights:
    """Registry of variable profiles (dtype code and shape) keyed by name.

    Each entry maps a variable name to ``{"dtype": <code>, "shape": <shape>}``.
    The registry can be written to / read from a text file with one
    ``<name>__<json>`` record per line.  Insertion order is preserved and is
    the order in which records are written by ``save``.
    """

    # NOTE: class-level dict, i.e. shared by every instance until an instance
    # rebinds ``weights_dict`` to its own dict.
    weights_dict = {}

    # Maps oneflow dtypes to integer codes; integer codes map to themselves so
    # an already-encoded dtype passes through unchanged.
    # NOTE(review): the codes presumably mirror OneFlow's DataType enum values
    # -- confirm.
    dtype_dict = {flow.float32: 2,
                  flow.float64: 3,
                  flow.int8: 4,
                  flow.int32: 5,
                  flow.int64: 6,
                  flow.float16: 9,
                  2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 9: 9}

    # Per-parameter optimizer state suffixes, in registration order.
    _OPTIMIZER_SLOTS = {"adam": ("-v", "-m"), "momentum": ("-momentum",)}

    def add(self, variable_name, dtype, shape):
        """Register one variable profile and return the whole registry.

        Raises:
            AssertionError: if ``variable_name`` is already registered.
        """
        assert variable_name not in self.weights_dict, \
            "variable {!r} is already registered".format(variable_name)
        self.weights_dict[variable_name] = {"dtype": dtype, "shape": shape}
        return self.weights_dict

    def addConv(self, index, dtype, shape1, shape2, optimizer):
        """Register the variables of convolution block ``conv<index>``.

        Args:
            index: block index used in the ``conv{index}`` name prefix.
            dtype: key of ``dtype_dict`` (oneflow dtype or integer code).
            shape1: shape recorded for the convolution weight.
            shape2: shape recorded for the bias and all batch-norm variables.
            optimizer: ``'adam'`` adds ``-v``/``-m`` entries and ``'momentum'``
                adds ``-momentum`` entries for weight, bias, bn-gamma and
                bn-beta; any other value adds no optimizer entries.
        """
        dtype = self.dtype_dict[dtype]
        prefix = "conv{}".format(index)
        trainable = (
            ("_weight", shape1),
            ("_bias", shape2),
            ("_bn-gamma", shape2),
            ("_bn-beta", shape2),
        )
        for suffix, shape in trainable:
            self.add(prefix + suffix, dtype, shape)
        # Batch-norm running statistics get no optimizer state entries.
        self.add(prefix + "_bn-moving_variance", dtype, shape2)
        self.add(prefix + "_bn-moving_mean", dtype, shape2)

        slots = self._OPTIMIZER_SLOTS.get(optimizer, ())
        for suffix, shape in trainable:
            for slot in slots:
                self.add(prefix + suffix + slot, dtype, shape)

    def addDense(self, dtype_old, shape, optimizer, dense_num):
        """Register the variables of ``dense_num`` dense layers.

        Args:
            dtype_old: per-layer sequence of ``dtype_dict`` keys.
            shape: per-layer sequence of weight shapes; the bias shape of
                layer ``i`` is recorded as ``(shape[i][0],)``.
            optimizer: ``'adam'`` / ``'momentum'`` add the matching optimizer
                state entries; any other value adds none.
            dense_num: number of layers to register (``dense0`` ...).
        """
        dtype = [self.dtype_dict[old] for old in dtype_old]
        slots = self._OPTIMIZER_SLOTS.get(optimizer, ())
        for i in range(dense_num):
            prefix = 'dense' + str(i)
            bias_shape = (shape[i][0],)
            params = (('-weight', shape[i]), ('-bias', bias_shape))
            for suffix, param_shape in params:
                self.add(prefix + suffix, dtype[i], param_shape)
            for suffix, param_shape in params:
                for slot in slots:
                    self.add(prefix + suffix + slot, dtype[i], param_shape)

    def save(self, path):
        """Write the registry to ``path`` and return it.

        One ``<name>__<json profile>`` record is written per line.
        """
        print('Saving weights_profile_path to {}'.format(path))
        with open(path, "w") as f:
            for name, profile in self.weights_dict.items():
                f.write(name + '__' + json.dumps(profile) + '\n')
        return self.weights_dict

    def load(self, path):
        """Return the registry, reading it from ``path`` if it is empty.

        If the registry already holds entries they are returned as-is and
        ``path`` is not opened.

        Raises:
            ValueError: if a non-blank line has no ``__`` separator or its
                profile is not valid JSON.
        """
        if self.weights_dict:
            return self.weights_dict
        with open(path, 'r') as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                if not line.strip():
                    continue
                # Split on the LAST separator: the JSON profile written by
                # save() never contains "__", while a variable name might.
                variable_name, profile_json = line.rsplit('__', 1)
                self.weights_dict[variable_name] = json.loads(profile_json)
        return self.weights_dict
# Module-level shared registry instance; other modules obtain it with
# `from util.model_weights import modelWeight`.
modelWeight = ModelWeights()
@@ -0,0 +1,335 @@ | |||||
""" | |||||
Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
from __future__ import absolute_import | |||||
from __future__ import division | |||||
from __future__ import print_function | |||||
import oneflow as flow | |||||
def add_ofrecord_args(parser):
    """Register the OFRecord dataset options on ``parser`` and return it.

    Adds ``--image_size``, ``--resize_shorter``, ``--train_data_dir``,
    ``--train_data_part_num``, ``--val_data_dir`` and ``--val_data_part_num``;
    all of them are optional.
    """
    # (flag, type, default, help) in the order they appear in --help.
    option_table = (
        ("--image_size", int, 224, "image size"),
        ("--resize_shorter", int, 256, "resize shorter for validation"),
        ("--train_data_dir", str, None, "train dataset directory"),
        ("--train_data_part_num", int, 256, "train data part num"),
        ("--val_data_dir", str, None, "val dataset directory"),
        ("--val_data_part_num", int, 256, "val data part num"),
    )
    for flag, kind, default, description in option_table:
        parser.add_argument(flag, type=kind, default=default,
                            required=False, help=description)
    return parser
# Old version, deprecated.
def load_imagenet(args, batch_size, data_dir, data_part_num, codec):
    """Decode ImageNet OFRecords through the ``decode_ofrecord`` API.

    Args:
        args: parsed arguments; reads ``image_size``, ``rgb_mean``, ``rgb_std``.
        batch_size: batch size passed to the decoder.
        data_dir: directory holding the OFRecord parts.
        data_part_num: number of OFRecord parts.
        codec: codec used for the ``"encoded"`` image field.

    Returns:
        Whatever ``flow.data.decode_ofrecord`` returns for the
        ``(label, image)`` blob configuration pair.
    """
    side = args.image_size
    # Mean/std are passed in reversed channel order.
    # NOTE(review): presumably because the decoded image is BGR while
    # rgb_mean/rgb_std are given as RGB -- confirm.
    normalizer = flow.data.NormByChannelPreprocessor(args.rgb_mean[::-1],
                                                     args.rgb_std[::-1])
    image_conf = flow.data.BlobConf(
        "encoded",
        shape=(side, side, 3),
        dtype=flow.float,
        codec=codec,
        preprocessors=[normalizer],
    )
    label_conf = flow.data.BlobConf(
        "class/label", shape=(), dtype=flow.int32, codec=flow.data.RawCodec()
    )
    blob_confs = (label_conf, image_conf)
    return flow.data.decode_ofrecord(
        data_dir,
        blob_confs,
        batch_size=batch_size,
        data_part_num=data_part_num,
        part_name_suffix_length=5,
        name="decode",
    )
# Old version, deprecated.
def load_cifar10(data_dir, batch_size, data_part_num, image_size=32):
    """Decode CIFAR-10 OFRecords through the ``decode_ofrecord`` API.

    Args:
        data_dir: directory holding the OFRecord parts.
        batch_size: batch size passed to the decoder.
        data_part_num: number of OFRecord parts.
        image_size: side length of the square images (default 32).

    Returns:
        Whatever ``flow.data.decode_ofrecord`` returns for the
        ``(label, image)`` blob configuration pair.
    """
    channel_mean = (125.31, 122.96, 113.86)
    channel_std = (61.252, 60.767, 65.852)
    image_conf = flow.data.BlobConf(
        "images",
        shape=(image_size, image_size, 3),
        dtype=flow.float,
        codec=flow.data.RawCodec(),
        preprocessors=[flow.data.NormByChannelPreprocessor(channel_mean, channel_std)],
    )
    label_conf = flow.data.BlobConf(
        "labels", shape=(), dtype=flow.int32, codec=flow.data.RawCodec()
    )
    blob_confs = (label_conf, image_conf)
    return flow.data.decode_ofrecord(
        data_dir,
        blob_confs,
        batch_size=batch_size,
        data_part_num=data_part_num,
        name="decode",
    )
def load_synthetic(args):
    """Create synthetic label and image blobs with ``decode_random``.

    The batch size is ``num_nodes * gpu_num_per_node * batch_size_per_device``.
    Labels are int32 scalars initialised with zeros; images are float blobs
    of per-sample shape ``(image_size, image_size, 3)``.

    Returns:
        Tuple ``(label, image)``.
    """
    batch = args.num_nodes * args.gpu_num_per_node * args.batch_size_per_device
    fake_labels = flow.data.decode_random(
        shape=(),
        dtype=flow.int32,
        batch_size=batch,
        initializer=flow.zeros_initializer(flow.int32),
    )
    image_shape = (args.image_size, args.image_size, 3)
    fake_images = flow.data.decode_random(
        shape=image_shape, dtype=flow.float, batch_size=batch
    )
    return fake_labels, fake_images
def load_imagenet_for_training(args):
    """Build the ImageNet training input pipeline.

    Reads shuffled OFRecords from ``args.train_data_dir``, decodes the
    ``"encoded"`` field with a random-crop image decoder, resizes to
    ``image_size`` x ``image_size`` and applies ``CropMirrorNormalize`` with
    a ``CoinFlip`` mirror blob and ``args.rgb_mean`` / ``args.rgb_std``.

    Returns:
        Tuple ``(label, normalized_image)``.
    """
    batch = args.num_nodes * args.gpu_num_per_node * args.batch_size_per_device
    rgb = 'RGB'
    reader_options = dict(
        batch_size=batch,
        data_part_num=args.train_data_part_num,
        part_name_suffix_length=5,
        random_shuffle=True,
        shuffle_after_epoch=True,
    )
    records = flow.data.ofrecord_reader(args.train_data_dir, **reader_options)
    cropped = flow.data.OFRecordImageDecoderRandomCrop(records, "encoded",
                                                       color_space=rgb)
    labels = flow.data.OFRecordRawDecoder(records, "class/label",
                                          shape=(), dtype=flow.int32)
    resized = flow.image.Resize(cropped, resize_x=args.image_size,
                                resize_y=args.image_size, color_space=rgb)
    flip = flow.random.CoinFlip(batch_size=batch)
    normalized = flow.image.CropMirrorNormalize(
        resized,
        mirror_blob=flip,
        color_space=rgb,
        mean=args.rgb_mean,
        std=args.rgb_std,
        output_dtype=flow.float,
    )
    return labels, normalized
def load_imagenet_for_validation(args):
    """Build the ImageNet validation input pipeline.

    Reads OFRecords from ``args.val_data_dir`` without per-epoch shuffling,
    decodes the ``"encoded"`` field, resizes with
    ``resize_shorter=args.resize_shorter`` and applies ``CropMirrorNormalize``
    with an ``image_size`` x ``image_size`` crop positioned at (0.5, 0.5).

    Returns:
        Tuple ``(label, normalized_image)``.
    """
    batch = args.num_nodes * args.gpu_num_per_node * args.val_batch_size_per_device
    rgb = 'RGB'
    records = flow.data.ofrecord_reader(
        args.val_data_dir,
        batch_size=batch,
        data_part_num=args.val_data_part_num,
        part_name_suffix_length=5,
        shuffle_after_epoch=False,
    )
    decoded = flow.data.OFRecordImageDecoder(records, "encoded", color_space=rgb)
    labels = flow.data.OFRecordRawDecoder(records, "class/label",
                                          shape=(), dtype=flow.int32)
    resized = flow.image.Resize(decoded, resize_shorter=args.resize_shorter,
                                color_space=rgb)
    crop_options = dict(crop_h=args.image_size, crop_w=args.image_size,
                        crop_pos_y=0.5, crop_pos_x=0.5)
    normalized = flow.image.CropMirrorNormalize(
        resized,
        color_space=rgb,
        mean=args.rgb_mean,
        std=args.rgb_std,
        output_dtype=flow.float,
        **crop_options
    )
    return labels, normalized
def load_cifar_for_training(args):
    """Build the CIFAR training input pipeline.

    Reads shuffled OFRecords from ``args.train_data_dir``, raw-decodes
    ``"labels"`` (int32 scalar) and ``"images"`` (float, per-sample shape
    ``(3, image_size, image_size)``), moves the channel axis last, casts to
    uint8 and applies ``CropMirrorNormalize`` with a ``CoinFlip`` mirror blob.

    Returns:
        Tuple ``(label, normalized_image)``.
    """
    batch = args.num_nodes * args.gpu_num_per_node * args.batch_size_per_device
    reader_options = dict(
        batch_size=batch,
        data_part_num=args.train_data_part_num,
        part_name_suffix_length=5,
        random_shuffle=True,
        shuffle_after_epoch=True,
    )
    records = flow.data.ofrecord_reader(args.train_data_dir, **reader_options)
    labels = flow.data.OFRecordRawDecoder(records, "labels",
                                          shape=(), dtype=flow.int32)
    chw_shape = (3, args.image_size, args.image_size)
    pixels = flow.data.OFRecordRawDecoder(records, "images",
                                          shape=chw_shape, dtype=flow.float)
    # (N, C, H, W) -> (N, H, W, C)
    pixels = flow.transpose(pixels, perm=[0, 2, 3, 1])
    pixels_u8 = flow.cast(pixels, flow.uint8)
    flip = flow.random.CoinFlip(batch_size=batch)
    normalized = flow.image.CropMirrorNormalize(
        pixels_u8, mirror_blob=flip, mean=args.rgb_mean, std=args.rgb_std
    )
    return labels, normalized
def load_cifar_for_validation(args):
    """Build the CIFAR validation input pipeline.

    Reads OFRecords from ``args.val_data_dir`` without per-epoch shuffling,
    raw-decodes ``"labels"`` and ``"images"`` (per-sample shape
    ``(3, image_size, image_size)``), moves the channel axis last, casts to
    uint8 and applies ``CropMirrorNormalize`` with an ``image_size`` square
    crop positioned at (0.5, 0.5).

    Returns:
        Tuple ``(label, normalized_image)``.
    """
    batch = args.num_nodes * args.gpu_num_per_node * args.val_batch_size_per_device
    records = flow.data.ofrecord_reader(
        args.val_data_dir,
        batch_size=batch,
        data_part_num=args.val_data_part_num,
        part_name_suffix_length=5,
        shuffle_after_epoch=False,
    )
    labels = flow.data.OFRecordRawDecoder(records, "labels",
                                          shape=(), dtype=flow.int32)
    chw_shape = (3, args.image_size, args.image_size)
    pixels = flow.data.OFRecordRawDecoder(records, "images",
                                          shape=chw_shape, dtype=flow.float)
    # (N, C, H, W) -> (N, H, W, C)
    pixels = flow.transpose(pixels, perm=[0, 2, 3, 1])
    pixels_u8 = flow.cast(pixels, flow.uint8)
    normalized = flow.image.CropMirrorNormalize(
        pixels_u8,
        crop_h=args.image_size,
        crop_w=args.image_size,
        crop_pos_y=0.5,
        crop_pos_x=0.5,
        mean=args.rgb_mean,
        std=args.rgb_std,
        output_dtype=flow.float,
    )
    return labels, normalized
def load_mnist_for_training(args):
    """Build the MNIST training input pipeline.

    Reads shuffled OFRecords from ``args.train_data_dir``, raw-decodes
    ``"labels"`` and single-channel ``"images"`` (per-sample shape
    ``(1, image_size, image_size)``), moves the channel axis last, casts to
    uint8 and applies ``CropMirrorNormalize`` in the ``"GRAY"`` colour space
    with a ``CoinFlip`` mirror blob.

    Returns:
        Tuple ``(label, normalized_image)``.
    """
    batch = args.num_nodes * args.gpu_num_per_node * args.batch_size_per_device
    reader_options = dict(
        batch_size=batch,
        data_part_num=args.train_data_part_num,
        part_name_suffix_length=5,
        random_shuffle=True,
        shuffle_after_epoch=True,
    )
    records = flow.data.ofrecord_reader(args.train_data_dir, **reader_options)
    labels = flow.data.OFRecordRawDecoder(records, "labels",
                                          shape=(), dtype=flow.int32)
    chw_shape = (1, args.image_size, args.image_size)
    pixels = flow.data.OFRecordRawDecoder(records, "images",
                                          shape=chw_shape, dtype=flow.float)
    # (N, C, H, W) -> (N, H, W, C)
    pixels = flow.transpose(pixels, perm=[0, 2, 3, 1])
    pixels_u8 = flow.cast(pixels, flow.uint8)
    flip = flow.random.CoinFlip(batch_size=batch)
    normalized = flow.image.CropMirrorNormalize(
        pixels_u8,
        mirror_blob=flip,
        color_space="GRAY",
        mean=args.rgb_mean,
        std=args.rgb_std,
    )
    return labels, normalized
def load_mnist_for_validation(args):
    """Build the MNIST validation input pipeline.

    Reads OFRecords from ``args.val_data_dir`` without per-epoch shuffling,
    raw-decodes ``"labels"`` and single-channel ``"images"``, moves the
    channel axis last, casts to uint8 and applies ``CropMirrorNormalize`` in
    the ``"GRAY"`` colour space with an ``image_size`` square crop positioned
    at (0.5, 0.5).

    Returns:
        Tuple ``(label, normalized_image)``.
    """
    batch = args.num_nodes * args.gpu_num_per_node * args.val_batch_size_per_device
    records = flow.data.ofrecord_reader(
        args.val_data_dir,
        batch_size=batch,
        data_part_num=args.val_data_part_num,
        part_name_suffix_length=5,
        shuffle_after_epoch=False,
    )
    labels = flow.data.OFRecordRawDecoder(records, "labels",
                                          shape=(), dtype=flow.int32)
    chw_shape = (1, args.image_size, args.image_size)
    pixels = flow.data.OFRecordRawDecoder(records, "images",
                                          shape=chw_shape, dtype=flow.float)
    # (N, C, H, W) -> (N, H, W, C)
    pixels = flow.transpose(pixels, perm=[0, 2, 3, 1])
    pixels_u8 = flow.cast(pixels, flow.uint8)
    normalized = flow.image.CropMirrorNormalize(
        pixels_u8,
        crop_h=args.image_size,
        crop_w=args.image_size,
        crop_pos_y=0.5,
        crop_pos_x=0.5,
        color_space="GRAY",
        mean=args.rgb_mean,
        std=args.rgb_std,
        output_dtype=flow.float,
    )
    return labels, normalized
def load_svhn_for_training(args):
    """Build the SVHN training input pipeline.

    Reads shuffled OFRecords from ``args.train_data_dir``, raw-decodes
    ``"labels"`` and ``"images"`` (per-sample shape
    ``(image_size, image_size, 3)``, so no transpose is applied), casts the
    images to uint8 and applies ``CropMirrorNormalize`` with a ``CoinFlip``
    mirror blob.

    Returns:
        Tuple ``(label, normalized_image)``.
    """
    batch = args.num_nodes * args.gpu_num_per_node * args.batch_size_per_device
    reader_options = dict(
        batch_size=batch,
        data_part_num=args.train_data_part_num,
        part_name_suffix_length=5,
        random_shuffle=True,
        shuffle_after_epoch=True,
    )
    records = flow.data.ofrecord_reader(args.train_data_dir, **reader_options)
    labels = flow.data.OFRecordRawDecoder(records, "labels",
                                          shape=(), dtype=flow.int32)
    hwc_shape = (args.image_size, args.image_size, 3)
    pixels = flow.data.OFRecordRawDecoder(records, "images",
                                          shape=hwc_shape, dtype=flow.float)
    pixels_u8 = flow.cast(pixels, flow.uint8)
    flip = flow.random.CoinFlip(batch_size=batch)
    normalized = flow.image.CropMirrorNormalize(
        pixels_u8, mirror_blob=flip, mean=args.rgb_mean, std=args.rgb_std
    )
    return labels, normalized
def load_svhn_for_validation(args):
    """Build the SVHN validation input pipeline.

    Reads OFRecords from ``args.val_data_dir`` without per-epoch shuffling,
    raw-decodes ``"labels"`` and ``"images"`` (per-sample shape
    ``(image_size, image_size, 3)``), casts the images to uint8 and applies
    ``CropMirrorNormalize`` with an ``image_size`` square crop positioned at
    (0.5, 0.5).

    Returns:
        Tuple ``(label, normalized_image)``.
    """
    batch = args.num_nodes * args.gpu_num_per_node * args.val_batch_size_per_device
    records = flow.data.ofrecord_reader(
        args.val_data_dir,
        batch_size=batch,
        data_part_num=args.val_data_part_num,
        part_name_suffix_length=5,
        shuffle_after_epoch=False,
    )
    labels = flow.data.OFRecordRawDecoder(records, "labels",
                                          shape=(), dtype=flow.int32)
    hwc_shape = (args.image_size, args.image_size, 3)
    pixels = flow.data.OFRecordRawDecoder(records, "images",
                                          shape=hwc_shape, dtype=flow.float)
    pixels_u8 = flow.cast(pixels, flow.uint8)
    normalized = flow.image.CropMirrorNormalize(
        pixels_u8,
        crop_h=args.image_size,
        crop_w=args.image_size,
        crop_pos_y=0.5,
        crop_pos_x=0.5,
        mean=args.rgb_mean,
        std=args.rgb_std,
        output_dtype=flow.float,
    )
    return labels, normalized
def load_mydata_for_training(args):
    """Build the training input pipeline for a user-supplied dataset.

    Reads shuffled OFRecords from ``args.train_data_dir``, raw-decodes
    ``"labels"`` and ``"images"`` (per-sample shape
    ``(3, image_size, image_size)``), moves the channel axis last, casts to
    uint8 and applies ``CropMirrorNormalize`` with a ``CoinFlip`` mirror blob.

    Returns:
        Tuple ``(label, normalized_image)``.
    """
    batch = args.num_nodes * args.gpu_num_per_node * args.batch_size_per_device
    reader_options = dict(
        batch_size=batch,
        data_part_num=args.train_data_part_num,
        part_name_suffix_length=5,
        random_shuffle=True,
        shuffle_after_epoch=True,
    )
    records = flow.data.ofrecord_reader(args.train_data_dir, **reader_options)
    labels = flow.data.OFRecordRawDecoder(records, "labels",
                                          shape=(), dtype=flow.int32)
    chw_shape = (3, args.image_size, args.image_size)
    pixels = flow.data.OFRecordRawDecoder(records, "images",
                                          shape=chw_shape, dtype=flow.float)
    # (N, C, H, W) -> (N, H, W, C)
    pixels = flow.transpose(pixels, perm=[0, 2, 3, 1])
    pixels_u8 = flow.cast(pixels, flow.uint8)
    flip = flow.random.CoinFlip(batch_size=batch)
    normalized = flow.image.CropMirrorNormalize(
        pixels_u8, mirror_blob=flip, mean=args.rgb_mean, std=args.rgb_std
    )
    return labels, normalized
def load_mydata_for_validation(args):
    """Build the validation input pipeline for a user-supplied dataset.

    Reads OFRecords from ``args.val_data_dir`` without per-epoch shuffling,
    raw-decodes ``"labels"`` and ``"images"`` (per-sample shape
    ``(3, image_size, image_size)``), moves the channel axis last, casts to
    uint8 and applies ``CropMirrorNormalize`` with an ``image_size`` square
    crop positioned at (0.5, 0.5).

    Returns:
        Tuple ``(label, normalized_image)``.
    """
    batch = args.num_nodes * args.gpu_num_per_node * args.val_batch_size_per_device
    records = flow.data.ofrecord_reader(
        args.val_data_dir,
        batch_size=batch,
        data_part_num=args.val_data_part_num,
        part_name_suffix_length=5,
        shuffle_after_epoch=False,
    )
    labels = flow.data.OFRecordRawDecoder(records, "labels",
                                          shape=(), dtype=flow.int32)
    chw_shape = (3, args.image_size, args.image_size)
    pixels = flow.data.OFRecordRawDecoder(records, "images",
                                          shape=chw_shape, dtype=flow.float)
    # (N, C, H, W) -> (N, H, W, C)
    pixels = flow.transpose(pixels, perm=[0, 2, 3, 1])
    pixels_u8 = flow.cast(pixels, flow.uint8)
    normalized = flow.image.CropMirrorNormalize(
        pixels_u8,
        crop_h=args.image_size,
        crop_w=args.image_size,
        crop_pos_y=0.5,
        crop_pos_x=0.5,
        mean=args.rgb_mean,
        std=args.rgb_std,
        output_dtype=flow.float,
    )
    return labels, normalized
if __name__ == "__main__":
    # Stand-alone I/O smoke test: runs the data-loading job 1000 times and
    # hands every result to a Metric callback that logs to io_test.csv.
    import os
    import config as configs
    from util import Summary, Metric
    from job_function_util import get_val_config

    args = configs.get_parser().parse_args()
    configs.print_args(args)

    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.config.enable_debug_mode(True)

    @flow.global_function(get_val_config(args))
    def IOTest():
        # Real ImageNet data when a training directory is given, synthetic
        # blobs otherwise.
        data_dir = args.train_data_dir
        if not data_dir:
            print("Loading synthetic data.")
            labels, images = load_synthetic(args)
        else:
            assert os.path.exists(data_dir)
            print("Loading data from {}".format(data_dir))
            labels, images = load_imagenet_for_training(args)
        return {"images": images, "labels": labels}

    batch = args.num_nodes * args.gpu_num_per_node * args.batch_size_per_device
    report_every = args.loss_print_every_n_iter
    io_summary = Summary(args.log_dir, args, filename='io_test.csv')
    io_metric = Metric(desc='io_test', calculate_batches=report_every,
                       summary=io_summary, save_summary_steps=report_every,
                       batch_size=batch, prediction_key=None)
    for step in range(1000):
        IOTest().async_get(io_metric.metric_cb(0, step))
@@ -0,0 +1,93 @@ | |||||
""" | |||||
Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
from __future__ import absolute_import | |||||
from __future__ import division | |||||
from __future__ import print_function | |||||
import math | |||||
def add_optimizer_args(parser):
    """Register the optimizer options on ``parser`` and return it.

    The options (``--model_update``, ``--learning_rate``, ``--wd``, ``--mom``,
    ``--lr_decay``, ``--warmup_epochs``) are placed in an argument group
    titled ``optimizer parameters``.
    """
    optimizer_group = parser.add_argument_group(
        'optimizer parameters',
        'entire group applies only to optimizer parameters')
    # (flag, add_argument keyword arguments) in --help order.
    option_specs = (
        ("--model_update", dict(type=str, default="momentum",
                                help="sgd, adam, momentum")),
        ("--learning_rate", dict(type=float, default=0.256)),
        ("--wd", dict(type=float, default=1.0/32768, help="weight decay")),
        ("--mom", dict(type=float, default=0.875, help="momentum")),
        ('--lr_decay', dict(type=str, default='cosine',
                            help='cosine, step, polynomial, None')),
        ('--warmup_epochs', dict(
            type=int, default=5,
            help='the epochs to ramp-up lr to scaled large-batch value')),
    )
    for flag, keywords in option_specs:
        optimizer_group.add_argument(flag, **keywords)
    return parser
def gen_model_update_conf(args):
    """Build the model-update (optimizer / LR schedule) configuration dict.

    The dict is also pretty-printed to stdout before it is returned.

    Args:
        args: parsed arguments; reads ``num_nodes``, ``gpu_num_per_node``,
            ``batch_size_per_device``, ``num_examples``, ``num_epochs``,
            ``warmup_epochs``, ``model_update``, ``lr_decay`` and, for the
            momentum optimizer, ``mom``.

    Returns:
        dict with one optimizer section (``naive_conf``, ``adam_conf`` or
        ``momentum_conf``), a ``warmup_conf`` section when
        ``warmup_epochs > 0`` and a ``learning_rate_decay`` section when
        ``lr_decay`` is ``'cosine'``, ``'step'`` or ``'polynomial'``.  Any
        other ``lr_decay`` value (e.g. ``'None'``) adds no decay section.

    Raises:
        AssertionError: if ``model_update`` is not ``'sgd'``, ``'adam'`` or
            ``'momentum'``, or if ``mom`` is not strictly between 0 and 1.
    """
    import pprint

    total_device_num = args.num_nodes * args.gpu_num_per_node
    train_batch_size = total_device_num * args.batch_size_per_device
    epoch_size = math.ceil(args.num_examples / train_batch_size)
    num_train_batches = epoch_size * args.num_epochs
    num_warmup_batches = epoch_size * args.warmup_epochs
    decay_batches = num_train_batches - num_warmup_batches

    model_update_conf = {}

    # Basic model update.  The checks raise AssertionError explicitly (the
    # type callers saw before) rather than using ``assert`` statements, which
    # ``python -O`` strips -- that would let an unsupported optimizer fall
    # through and yield a config without any optimizer section.
    if args.model_update == 'sgd':
        model_update_conf["naive_conf"] = {}
    elif args.model_update == 'adam':
        model_update_conf["adam_conf"] = {"beta1": 0.9}
    elif args.model_update == 'momentum':
        if not 0.0 < args.mom < 1.0:
            raise AssertionError(
                "mom must be in the open interval (0, 1), got {!r}".format(args.mom))
        model_update_conf["momentum_conf"] = {"beta": args.mom}
    else:
        raise AssertionError(
            "unsupported model_update {!r}; expected 'sgd', 'adam' or "
            "'momentum'".format(args.model_update))

    # Learning rate warmup (linear only).
    if args.warmup_epochs > 0:
        model_update_conf['warmup_conf'] = {"linear_conf": {
            "warmup_batches": num_warmup_batches,
            "start_multiplier": 0,
        }}

    # Learning rate decay.
    if args.lr_decay == 'cosine':
        model_update_conf['learning_rate_decay'] = {
            "cosine_conf": {"decay_batches": decay_batches}}
    elif args.lr_decay == 'step':
        # Boundaries are fixed at epochs 30, 60 and 80.
        boundaries = [x * epoch_size for x in [30, 60, 80]]
        scales = [1, 0.1, 0.01, 0.001]
        model_update_conf['learning_rate_decay'] = {"piecewise_scaling_conf": {
            "boundaries": boundaries,
            "scales": scales,
        }}
    elif args.lr_decay == 'polynomial':
        model_update_conf['learning_rate_decay'] = {"polynomial_conf": {
            "decay_batches": decay_batches,
            "end_learning_rate": 0.00001,
        }}

    # Weight decay is currently disabled; args.wd is not used here.
    # if args.wd > 0:
    #     assert args.wd < 1.0
    #     model_update_conf['weight_decay_conf'] = {
    #         "weight_decay_rate": args.wd,
    #         "excludes": {"pattern": ['_bn-']}
    #     }

    pprint.pprint(model_update_conf)
    return model_update_conf
if __name__ == '__main__':
    # Script entry point: parse the command line, echo it, then build (and
    # thereby pretty-print) the model update configuration.
    import config as configs

    cli_args = configs.get_parser().parse_args()
    configs.print_args(cli_args)
    gen_model_update_conf(cli_args)
@@ -0,0 +1,374 @@ | |||||
""" | |||||
Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
from __future__ import absolute_import | |||||
from __future__ import division | |||||
from __future__ import print_function | |||||
import os | |||||
import time | |||||
import numpy as np | |||||
import pandas as pd | |||||
import oneflow as flow | |||||
import util.ofrecord_util as ofrecord_util | |||||
from util.model_weights import modelWeight | |||||
import json | |||||
def InitNodes(args):
    """Register the worker machines with OneFlow for a multi-node run.

    Does nothing when ``args.num_nodes`` is 1 or less.  Otherwise sets the
    control port to 12138 and registers the first ``args.num_nodes``
    addresses of ``args.node_ips`` as machines.

    Args:
        args: parsed arguments; reads ``num_nodes`` and ``node_ips``.

    Raises:
        AssertionError: if fewer IPs than ``num_nodes`` were supplied.
    """
    if args.num_nodes <= 1:
        return

    assert args.num_nodes <= len(args.node_ips), \
        "num_nodes = {} but only {} node_ips were given".format(
            args.num_nodes, len(args.node_ips))
    flow.env.ctrl_port(12138)

    # Only the first num_nodes addresses take part in the job: batch sizes
    # elsewhere are derived from num_nodes, so registering surplus IPs would
    # make the cluster larger than the configuration assumes.
    nodes = [{"addr": ip} for ip in args.node_ips[:args.num_nodes]]
    flow.env.machine(nodes)
# load cfg (model structure)
def LoadCfg(args, model_load_dir, load_type):
    """Return the layer-width configuration ("cfg") of the selected model.

    When ``model_load_dir`` is truthy the widths are taken from the
    ``weights_profile_path`` file next to that directory: the first shape
    dimension of every variable whose name ends with ``"weight"``.  For
    ``resnet`` the ``stem`` and ``shortcut`` variables are skipped and the
    flat list is split into four stage lists plus the final width.
    Otherwise a built-in default layout for ``args.model`` is returned
    (an empty list for unknown models).

    Args:
        args: parsed arguments; reads ``model`` and, for built-in layouts,
            ``num_classes``.
        model_load_dir: checkpoint directory, or a falsy value.
        load_type: when ``'train'``, the shared ``modelWeight`` registry is
            reset after loading and the resulting cfg is printed.

    Returns:
        list describing the model structure.
    """
    if model_load_dir:
        is_resnet = args.model == "resnet"
        assert os.path.isdir(model_load_dir)
        profile_path = model_load_dir.rsplit("/", 1)[0] + "/weights_profile_path"

        widths = []
        for name, profile in modelWeight.load(profile_path).items():
            if not name.endswith("weight"):
                continue
            if is_resnet and ("stem" in name or "shortcut" in name):
                continue
            widths.append(profile["shape"][0])

        if is_resnet:
            # Four residual stages (9, 12, 18 and 9 entries) followed by the
            # single width at index 48.
            cfg = [widths[0:9], widths[9:21], widths[21:39], widths[39:48],
                   widths[48]]
        else:
            cfg = widths

        # Clear the shared registry so training can register weights afresh.
        if load_type == 'train':
            modelWeight.weights_dict = {}
    else:
        # Built-in layouts without the trailing num_classes entry.
        # (A commented-out alternative for vgg used 4096, 4096 instead of
        # 512, 128 for the last two hidden widths.)
        builtin_layouts = {
            'vgg': [64, 64, 128, 128, 256, 256, 256,
                    512, 512, 512, 512, 512, 512, 512, 128],
            'alexnet': [96, 256, 384, 384, 256, 4096, 4096],
            'alexnet_simple': [24, 96, 192, 192, 96, 1024, 1024],
            'lenet': [6, 16, 120, 84],
            'resnet': [[64, 64, 256] * 3,
                       [128, 128, 512] * 4,
                       [256, 256, 1024] * 6,
                       [512, 512, 2048] * 3],
            'dnn_2': [128],
            'dnn_4': [4096, 256, 128],
        }
        layout = builtin_layouts.get(args.model)
        cfg = [] if layout is None else layout + [args.num_classes]

    if load_type == 'train':
        print('Model structure:', cfg)
    return cfg
# load data (training / validation input pipeline)
def LoadData(args, load_type):
    """Build the input pipeline for the requested split.

    Args:
        args: parsed arguments; reads ``train_data_dir`` or ``val_data_dir``
            (depending on ``load_type``) and ``data_type``.
        load_type: ``'train'`` selects the training loaders, ``'test'`` the
            validation loaders; any other value yields synthetic data.

    Returns:
        Tuple ``(labels, images)``.

    Raises:
        AssertionError: if the selected data directory does not exist.

    Synthetic data is used when the relevant data directory is unset or when
    ``data_type`` is ``'random'``.  Unknown ``data_type`` values fall back to
    the ``mydata`` loaders.
    """
    if load_type == 'train':
        data_dir = args.train_data_dir
        loaders = {
            'imageNet': ofrecord_util.load_imagenet_for_training,
            'cifar10': ofrecord_util.load_cifar_for_training,
            'cifar100': ofrecord_util.load_cifar_for_training,
            'mnist': ofrecord_util.load_mnist_for_training,
            'mnist_32': ofrecord_util.load_mnist_for_training,
            'svhn': ofrecord_util.load_svhn_for_training,
            'random': ofrecord_util.load_synthetic,
        }
        fallback = ofrecord_util.load_mydata_for_training
    elif load_type == 'test':
        data_dir = args.val_data_dir
        # The cifar and mydata entries previously pointed at the *training*
        # loaders, which read train_data_dir with shuffling and random
        # mirroring; evaluation must use the validation loaders.
        loaders = {
            'imageNet': ofrecord_util.load_imagenet_for_validation,
            'cifar10': ofrecord_util.load_cifar_for_validation,
            'cifar100': ofrecord_util.load_cifar_for_validation,
            'mnist': ofrecord_util.load_mnist_for_validation,
            'mnist_32': ofrecord_util.load_mnist_for_validation,
            'svhn': ofrecord_util.load_svhn_for_validation,
            'random': ofrecord_util.load_synthetic,
        }
        fallback = ofrecord_util.load_mydata_for_validation
    else:
        data_dir = None
        loaders = {}
        fallback = ofrecord_util.load_synthetic

    if not data_dir:
        print("Loading synthetic data.")
        (labels, images) = ofrecord_util.load_synthetic(args)
        return labels, images

    assert os.path.exists(data_dir)
    print("Loading data from {}".format(data_dir))
    loader = loaders.get(args.data_type, fallback)
    (labels, images) = loader(args)
    return labels, images
#get save path and load path of model | |||||
#def getSaveLoadDir(args): | |||||
# if args.default_dir == 'train': | |||||
# model_save_dir = './output/snapshots/model_base' | |||||
# if args.data_type == 'imageNet': | |||||
# if args.model == 'vgg': | |||||
# model_load_dir = './model_init/vgg/model_init_imageNet/of_init_model' | |||||
# elif args.model == 'alexnet': | |||||
# model_load_dir = './model_init/alexnet/model_init_imageNet/of_init_model' | |||||
# elif args.model == 'lenet': | |||||
# model_load_dir = './model_init/lenet/model_init_imageNet/of_init_model' | |||||
# elif args.data_type == 'cifar10': | |||||
# if args.model == 'vgg': | |||||
# model_load_dir = './model_init/vgg/model_init_cifar10/of_init_model' | |||||
# elif args.model == 'alexnet': | |||||
# model_load_dir = './model_init/alexnet/model_init_cifar10/of_init_model' | |||||
# elif args.model == 'lenet': | |||||
# model_load_dir = './model_init/lenet/model_init_cifar10/of_init_model' | |||||
# elif args.default_dir == 'refine': | |||||
# model_save_dir = './output/snapshots/model_refine' | |||||
# model_load_dir = './output/snapshots/model_prune/model' | |||||
# else: | |||||
# model_save_dir = args.model_save_dir | |||||
# model_load_dir = args.model_load_dir | |||||
# return model_save_dir, model_load_dir | |||||
class Snapshot(object):
    """Restore or initialize a model through ``flow.train.CheckPoint`` and write snapshots.

    Snapshots are written to ``<model_save_dir>/snapshot_<name>``.
    """

    def __init__(self, model_save_dir, model_load_dir):
        """Load the checkpoint from ``model_load_dir`` or, if it is empty, initialize a new one.

        Args:
            model_save_dir: directory under which snapshots are written.
            model_load_dir: directory to restore from; a falsy value means
                "initialize instead", in which case an ``initial_model``
                snapshot is saved right away.
        """
        self._model_save_dir = model_save_dir
        self._check_point = flow.train.CheckPoint()
        if not model_load_dir:
            # Nothing to restore: initialize and keep a copy of the starting point.
            self._check_point.init()
            self.save('initial_model')
            print("Init model on demand.")
            return
        assert os.path.isdir(model_load_dir)
        print("Restoring model from {}.".format(model_load_dir))
        self._check_point.load(model_load_dir)

    def save(self, name):
        """Save the checkpoint into the ``snapshot_<name>`` sub-directory, creating it if needed."""
        target_dir = os.path.join(self._model_save_dir, "snapshot_{}".format(name))
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        print("Saving model to {}.".format(target_dir))
        self._check_point.save(target_dir)
class Summary(object):
    """Collect scalar metrics in a pandas DataFrame and dump them to a CSV file."""

    def __init__(self, log_dir, config, filename='summary.csv'):
        """Create ``log_dir`` if missing and start the table with a row holding ``str(config)``.

        Args:
            log_dir: directory the CSV file is written to.
            config: run configuration; stored as text in the ``note`` column of the first row.
            filename: name of the CSV file inside ``log_dir``.
        """
        self._filename = filename
        self._log_dir = log_dir
        if not os.path.exists(log_dir):
            os.makedirs(log_dir)
        config_row = {"epoch": 0, "iter": 0, "legend": "cfg", "note": str(config)}
        self._metrics = pd.DataFrame(config_row, index=[0])

    def scalar(self, legend, value, epoch, step=-1):
        """Append one ``(epoch, step, legend, value)`` record to the in-memory table."""
        # TODO: support rank(which device/gpu)
        record = {"epoch": epoch, "iter": step, "legend": legend, "value": value, "rank": 0}
        row = pd.DataFrame(record, index=[0])
        self._metrics = pd.concat([self._metrics, row], axis=0, sort=False)

    def save(self):
        """Write the whole table to ``<log_dir>/<filename>`` without the index column."""
        csv_path = os.path.join(self._log_dir, self._filename)
        self._metrics.to_csv(csv_path, index=False)
class StopWatch(object):
    """Minimal wall-clock timer based on ``time.time()`` with lap ("split") support.

    ``start()`` must be called before ``split()``, and ``stop()`` before
    ``duration()``; the timing attributes do not exist until then.
    """

    def __init__(self):
        pass

    def start(self):
        """Record the start time and reset the lap marker to it."""
        began = time.time()
        self.start_time = began
        self.last_split = began

    def split(self):
        """Return the seconds elapsed since the previous split (or start) and begin a new lap."""
        previous, self.last_split = self.last_split, time.time()
        return self.last_split - previous

    def stop(self):
        """Record the stop time."""
        self.stop_time = time.time()

    def duration(self):
        """Return the seconds between ``start()`` and ``stop()``."""
        elapsed = self.stop_time - self.start_time
        return elapsed
def match_top_k(predictions, labels, top_k=1):
    """Count the rows whose label is among the ``top_k`` highest-scoring columns.

    Args:
        predictions: object exposing ``.numpy()`` that yields a 2-D score array
            (it is indexed as ``[:, -top_k:]``).
        labels: array-like supporting ``.reshape((-1, 1))``; one label per row.
        top_k: how many of the highest scores count as a hit.

    Returns:
        ``(num_matched, num_rows)``.
    """
    scores = predictions.numpy()
    # argpartition leaves the indices of the top_k largest scores in the last top_k columns.
    top_indices = np.argpartition(scores, -top_k)[:, -top_k:]
    label_column = labels.reshape((-1, 1))
    hits = np.logical_or.reduce(top_indices == label_column, axis=1)
    return hits.sum(), hits.shape[0]
class Metric(object):
    """Accumulate top-1 / top-k accuracy and throughput over batches and report them.

    ``metric_cb(epoch, step, ...)`` returns a callback that consumes the outputs
    of one batch.  Counters are accumulated across callbacks and, every
    ``calculate_batches`` steps, a report line is printed, optional summary
    scalars are recorded and the counters are reset.
    """

    def __init__(self, summary=None, save_summary_steps=-1, desc='train', calculate_batches=-1,
                 batch_size=256, top_k=6, prediction_key='predictions', label_key='labels',
                 loss_key=None):
        """
        Args:
            summary: a ``Summary`` instance to record scalars into; any other
                value disables summary recording.
            save_summary_steps: the summary is saved whenever
                ``(step + 1) % save_summary_steps == 0`` (with the default -1
                that is every step).
            desc: label used in printed lines and summary legends; the value
                ``'validation'`` additionally triggers the JSON result export.
            calculate_batches: metrics are reported whenever
                ``(step + 1) % calculate_batches == 0`` (with the default -1
                that is every step).
            batch_size: accepted for interface compatibility; not used here.
            top_k: k used for the top-k accuracy.
            prediction_key: key of the predictions in the outputs; a falsy
                value disables accuracy computation.
            label_key: key of the labels in the outputs.
            loss_key: key of the loss in the outputs; a falsy value selects the
                report format without a loss column.
        """
        self.summary = summary
        self.save_summary = isinstance(self.summary, Summary)
        self.save_summary_steps = save_summary_steps
        self.desc = desc
        self.calculate_batches = calculate_batches
        self.top_k = top_k
        self.prediction_key = prediction_key
        self.label_key = label_key
        self.loss_key = loss_key
        self.teacher_model_size = 0
        self.student_model_size = 0
        if loss_key:
            self.fmt = "{}: epoch {}, iter {}, loss: {:.6f}, accuracy(top1): {:.6f}, accuracy(topk): {:.6f}, samples/s: {:.3f}"
        else:
            self.fmt = "{}: epoch {}, iter {}, accuracy(top1): {:.6f}, accuracy(topk): {:.6f}, samples/s: {:.3f}"
        self.timer = StopWatch()
        self.timer.start()
        self._clear()

    def _clear(self):
        """Reset the accumulated match and sample counters."""
        self.top_1_num_matched = 0
        self.top_k_num_matched = 0
        self.num_samples = 0.0

    @staticmethod
    def _dir_size(path):
        """Return the total size in bytes of the files under ``path``.

        Files located directly in a directory whose name ends in ``-v`` or
        ``-m`` are not counted (presumably optimizer state variables -- TODO confirm).
        """
        size = 0
        for root, _dirs, files in os.walk(path):
            if root[-2:] in ('-v', '-m'):
                continue
            for name in files:
                size += os.path.getsize(os.path.join(root, name))
        return size

    def _export_validation_result(self, args, top_1_accuracy, top_k_accuracy, throughput):
        """Measure the model size on disk and write ``results_eval.json`` for a validation run."""
        model_size = 0
        if args.log_type == 'base_model':
            # Teacher model size: first existing snapshot folder under model_save_dir.
            for snapshot in ('snapshot_initial_model', 'snapshot_last', 'snapshot_epoch_0'):
                candidate = os.path.join(args.model_save_dir, snapshot)
                if os.path.exists(candidate):
                    self.teacher_model_size = self._dir_size(candidate)
                    break
            else:
                print('Error, not find {}'.format(args.model_save_dir))
            model_size = self.teacher_model_size
        elif args.log_type == 'prune_model':
            # Student model size: size of the folder the pruned model was loaded from.
            if os.path.exists(args.model_load_dir):
                self.student_model_size = self._dir_size(args.model_load_dir)
            else:
                print('Error, not find {}'.format(args.model_load_dir))
            model_size = self.student_model_size

        save_dict = {"accuracy": "%.2f" % top_1_accuracy,
                     "top_k_accuracy": "%.2f" % top_k_accuracy,
                     "top_k": "%d" % self.top_k,
                     "modelSize": "%d" % (model_size / 1024 / 1024),
                     "reasoningTime": "%.2f" % throughput
                     }    # samples/second

        if args.log_type == 'base_model':
            result_dir = args.before_result_dir
        elif args.log_type == 'prune_model':
            result_dir = args.after_result_dir
        else:
            # Unknown log type: nothing is written.
            return
        os.makedirs(result_dir, exist_ok=True)
        with open(os.path.join(result_dir, "results_eval.json"), "w") as f:
            json.dump(save_dict, f)

    def metric_cb(self, epoch, step, args=None, log_file=None):
        """Build the callback that processes the outputs of batch ``step`` of ``epoch``.

        Args:
            epoch: epoch index used in reports.
            step: batch index within the epoch; step 0 resets the counters.
            args: parsed command-line namespace; its ``log_type``,
                ``model_save_dir``, ``model_load_dir``, ``before_result_dir``
                and ``after_result_dir`` attributes are read for the
                validation export.  When ``None`` the export is skipped.
            log_file: optional writable file object that receives one text
                line per report in the no-loss format.

        Returns:
            A function taking the batch ``outputs`` mapping.
        """
        def callback(outputs):
            if step == 0:
                self._clear()
            if self.prediction_key:
                num_matched, num_samples = match_top_k(outputs[self.prediction_key],
                                                       outputs[self.label_key])
                self.top_1_num_matched += num_matched
                num_matched, _ = match_top_k(outputs[self.prediction_key],
                                             outputs[self.label_key], self.top_k)
                self.top_k_num_matched += num_matched
            else:
                num_samples = outputs[self.label_key].shape[0]

            self.num_samples += num_samples

            if (step + 1) % self.calculate_batches == 0:
                throughput = self.num_samples / self.timer.split()
                if self.prediction_key:
                    top_1_accuracy = self.top_1_num_matched / self.num_samples
                    top_k_accuracy = self.top_k_num_matched / self.num_samples
                else:
                    top_1_accuracy = 0.0
                    top_k_accuracy = 0.0

                if self.loss_key:
                    loss = outputs[self.loss_key].mean()
                    print(self.fmt.format(self.desc, epoch, step + 1, loss, top_1_accuracy,
                                          top_k_accuracy, throughput))
                    if self.save_summary:
                        self.summary.scalar(self.desc + "_" + self.loss_key, loss, epoch, step)
                else:
                    print('*' * 106)
                    print(self.fmt.format(self.desc, epoch, step + 1, top_1_accuracy,
                                          top_k_accuracy, throughput))
                    # Without ``args`` there is no place to read the result
                    # directories from, so the export is skipped instead of
                    # failing on ``args.log_type``.
                    if self.desc == 'validation' and args is not None:
                        self._export_validation_result(args, top_1_accuracy,
                                                       top_k_accuracy, throughput)
                    # NOTE(review): assumed to apply to every no-loss report,
                    # not only to validation -- confirm against the original file.
                    if log_file:
                        log_file.write("epoch" + str(epoch) + " top_1_accuracy: " + str(top_1_accuracy) +
                                       "; top_k_accuracy: " + str(top_k_accuracy) + "; " + str(throughput) + "samples/s\n")
                    print('*' * 106)

                self._clear()
                if self.save_summary:
                    self.summary.scalar(self.desc + "_throughput", throughput, epoch, step)
                    if self.prediction_key:
                        self.summary.scalar(self.desc + "_top_1", top_1_accuracy, epoch, step)
                        self.summary.scalar(self.desc + "_top_{}".format(self.top_k),
                                            top_k_accuracy, epoch, step)

            if self.save_summary:
                if (step + 1) % self.save_summary_steps == 0:
                    self.summary.save()

        return callback
@@ -0,0 +1,196 @@ | |||||
# 知识蒸馏快速上手 | |||||
## 1. 简介 | |||||
知识蒸馏:通过一些优化目标从大型、知识丰富的teacher模型学习一个小型的student模型 | |||||
炼知技术平台提供了4个知识蒸馏相关算子,以及众多基于Oneflow算子复现的知识蒸馏模型和使用示例。 | |||||
<table> | |||||
<thead> | |||||
<tr> | |||||
<th>类型</th> | |||||
<th>知识蒸馏模型</th> | |||||
<th><a href="../../docs/API_knowledge_distill.md" target="_blank">主要算子</a></th> | |||||
<th>使用文档</th> | |||||
</tr> | |||||
</thead> | |||||
<tbody> | |||||
<tr> | |||||
<td rowspan="2">软标签蒸馏</td> | |||||
<td>KD</td> | |||||
<td>软标签蒸馏</td> | |||||
<td><a href="./examples/knowledge_distillation/README.md" target="_blank">链接</a></td> | |||||
</tr> | |||||
<tr> | |||||
<td>Distilled-BiLSTM</td> | |||||
<td>软标签蒸馏,将BERT蒸馏到BiLSTM</td> | |||||
<td><a href="./examples/distilled-bilstm/README.md" target="_blank">链接</a></td> | |||||
</tr> | |||||
<tr> | |||||
<td rowspan="2">从其他知识蒸馏</td> | |||||
<td>BERT-PKD</td> | |||||
<td>软标签蒸馏+层与层蒸馏</td> | |||||
<td><a href="./examples/bert-pkd/README.md" target="_blank">链接</a></td> | |||||
</tr> | |||||
<tr> | |||||
<td>TinyBERT</td> | |||||
<td>软标签蒸馏+层与层蒸馏+注意力蒸馏</td> | |||||
<td><a href="./examples/tinybert/README.md" target="_blank">链接</a></td> | |||||
</tr> | |||||
<tr> | |||||
<td>模块替换</td> | |||||
<td>BERT-Theseus</td> | |||||
<td>依照概率替换原有的BERT模块和Theseus的模块组成新的模型来训练</td> | |||||
<td><a href="theseus/README.md" target="_blank">链接</a></td> | |||||
</tr> | |||||
</tbody> | |||||
</table> | |||||
## 2. 使用 | |||||
### 2.1 依赖 | |||||
- Python 3.6 | |||||
- oneflow-cu101 0.1.10 | |||||
- numpy 1.19.2 | |||||
完整的环境可以通过以下命令安装: | |||||
```bash | |||||
conda create -n distil python=3.6 | |||||
``` | |||||
``` | |||||
python3 -m pip install --find-links https://oneflow-inc.github.io/nightly oneflow_cu101 --user | |||||
``` | |||||
### 2.2 数据获取 | |||||
知识蒸馏主要针对NLP相关的任务,炼知平台在GLUE任务的数据集上对不同算法进行了测试。 | |||||
可以通过执行以下脚本下载GLUE任务的所有数据集,将会自动下载并解压到'--data_dir=data'目录下。 | |||||
``` | |||||
bash run_download_glue_data.sh | |||||
``` | |||||
或者 | |||||
```bash | |||||
python ../src/download_glue_data.py --data_dir data/glue_data --tasks all | |||||
``` | |||||
TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "SNLI", "QNLI", "RTE", "WNLI", "diagnostic"] | |||||
以上脚本默认下载所有GLUE任务数据集;也可以通过 `--tasks=TASKS` 指定只下载其中的部分数据集。
也可以在这里下载GLUE任务数据集,并放置到相关目录(data/glue_data)下 | |||||
链接: https://pan.baidu.com/s/1Im0uQM_V_zXkmQuHsFae0A 提取码: u64u | |||||
参考[加载与准备OneFlow数据集](https://github.com/Oneflow-Inc/oneflow-documentation/blob/master/cn/docs/extended_topics/how_to_make_ofdataset.md),制作OFRecords数据集。或者执行以下命令,生成OFRecords数据集: | |||||
``` | |||||
bash glue_process.sh | |||||
``` | |||||
**或者直接下载转换后的OFRecords GLUE数据集,并放置到相关目录(data/glue_ofrecord)下:** | |||||
链接: https://pan.baidu.com/s/1CY2BfCGBZEeo1EgY5JQcuA 提取码: v2h4 | |||||
### 2.3 微调教师模型 | |||||
预训练BERT模型下载地址: | |||||
链接: https://pan.baidu.com/s/1jfTUY7ygcZZOJzjfrgUL8Q 提取码: 6b87 | |||||
下载后放置在`./models/uncased_L-12_H-768_A-12_oneflow` | |||||
#### 2.3.1 训练 | |||||
- 执行以下脚本进行微调教师模型: | |||||
- DATA_ROOT: GLUE数据集总路径 | |||||
- dataset: 任务名 | |||||
- MODEL_SAVE_DIR: 模型保存路径 | |||||
- RESULT_DIR: 测试结果json文件保存路径 (如果为RESULT_DIR="",则默认保存到模型保存路径下,results_eval.json) | |||||
- SERVE_FOR_ONLINE: 模型是否用于上线 (默认SERVE_FOR_ONLINE='False',如果SERVE_FOR_ONLINE='True',则删除清理模型保存路径中的无关变量,如教师模型参数和优化器参数等等) | |||||
```bash | |||||
bash run_train_teacher.sh | |||||
``` | |||||
- 我们微调过的教师模型可以在这里下载: 链接: https://pan.baidu.com/s/1jiOTSPBmmBoij0UwPO6UKw 提取码: 9xkp | |||||
- 已在SST-2,QQP,MRPC,RTE,CoLA数据集上微调 | |||||
- 并放置到`"model_compress/distil/models/finetuned_teacher/"`。 | |||||
- 在上述数据集的dev集上性能为SST-2: 92.2%, QQP: 91.1%, MRPC: 89.2%, RTE: 69.8%, CoLA: 58.5% | |||||
- 评价指标: | |||||
- Accuracy: SST-2, MRPC, QQP, RTE | |||||
- MCC (Matthews correlation coefficient): CoLA | |||||
#### 2.3.2 测试 | |||||
- 微调后,可以执行以下脚本对教师模型进行测试: | |||||
- DATA_ROOT: GLUE数据集总路径 | |||||
- dataset: 任务名 | |||||
- TEACHER_MODEL_DIR: 教师模型路径 | |||||
```bash | |||||
bash run_eval_teacher.sh | |||||
``` | |||||
### 2.4 蒸馏到学生模型 | |||||
#### 2.4.1 训练 | |||||
执行以下脚本将教师模型蒸馏到学生模型: | |||||
- DATA_ROOT: GLUE数据集总路径 | |||||
- dataset: 任务名 | |||||
- FT_BERT_BASE_DIR: 在特定任务上微调过的教师模型路径 | |||||
- TMP_STUDENT_DIR: 临时学生模型路径(如果需要的话,不需要则设为TMP_STUDENT_DIR="") | |||||
- STUDENT_DIR: 学生模型保存路径 | |||||
- RESULT_DIR: 测试结果json文件保存路径 (如果RESULT_DIR="",则默认保存到模型保存路径下,results_eval.json) | |||||
- SERVE_FOR_ONLINE: 模型是否用于上线 (默认SERVE_FOR_ONLINE='False',如果SERVE_FOR_ONLINE='True',则删除清理模型保存路径中的无关变量,如教师模型参数和优化器参数等等) | |||||
- 不同知识蒸馏算法: | |||||
- KD | |||||
```bash | |||||
bash run_train_student_kd.sh | |||||
``` | |||||
- Distilled-BiLSTM | |||||
```bash | |||||
bash run_train_student_distilled_lstm.sh | |||||
``` | |||||
- BERT-PKD | |||||
```bash | |||||
bash run_train_student_bert_pkd.sh | |||||
``` | |||||
>注:BERT-PKD可以随机初始化,也可以选择根据教师BERT中间层进行初始化,详细步骤请查阅[这里](./examples/bert-pkd/README.md#41-教师模型中间层保存与转换) | |||||
> 临时学生模型下载链接(SST-2, RTE, MRPC, CoLA, QQP数据集) 链接: https://pan.baidu.com/s/17F8KVsLd_lMODLaVLc7yrQ 提取码: 95ir | |||||
> 下载并解压,将相应的模型放置到`"./models/student_model/bert_pkd_3"`路径下 | |||||
- TinyBERT | |||||
```bash | |||||
bash run_train_student_tinybert.sh | |||||
``` | |||||
> 临时学生模型(通用TinyBERT)下载链接 链接: https://pan.baidu.com/s/1vZDILxXi-uxo2v3zFlWL3A 提取码: kpia | |||||
> BERT类模型最大序列长度设为128; LSTM类模型最大序列长度设为32,词表大小为10000 | |||||
#### 2.4.2 测试 | |||||
执行以下脚本进行测试: | |||||
- DATA_ROOT: GLUE数据集总路径 | |||||
- dataset: 任务名 | |||||
- STUDENT_DIR: 学生模型保存路径,蒸馏过的学生模型下载链接如下(SST-2数据集) | |||||
- RESULT_DIR: 测试结果json文件保存路径 (如果RESULT_DIR="",则默认保存到模型保存路径下,results_eval.json) | |||||
- 不同知识蒸馏算法: | |||||
- KD | |||||
下载链接: https://pan.baidu.com/s/1EgQyQgxAcFAG8Ch3-4VPaw 提取码: 5k9p | |||||
```bash | |||||
bash run_eval_student_kd.sh | |||||
``` | |||||
- Distilled-BiLSTM | |||||
下载链接: https://pan.baidu.com/s/1M4XzB2DnLikglxVFvhnYpw 提取码: hqhj | |||||
```bash | |||||
bash run_eval_student_distilled_lstm.sh | |||||
``` | |||||
- BERT-PKD | |||||
- 从教师模型中间层初始化,下载链接: https://pan.baidu.com/s/1l7vXn-3U05Hzl0RXCJPiLg 提取码: 33dk | |||||
- 随机初始化,下载链接: https://pan.baidu.com/s/1m46j57Tova_yaGLabAqUIw 提取码: pdx4 | |||||
```bash | |||||
bash run_eval_student_bert_pkd.sh | |||||
``` | |||||
- TinyBERT | |||||
下载链接: https://pan.baidu.com/s/1nOAZHd3wLmyVw2vTJB7KfQ 提取码: ma65 | |||||
```bash | |||||
bash run_eval_student_tinybert.sh | |||||
``` | |||||
@@ -0,0 +1,89 @@ | |||||
# BERT-PKD | |||||
["Patient knowledge distillation for bert model compression"](https://arxiv.org/abs/1908.09355)的论文实现。 | |||||
传统的KD会导致学生模型在学习的时候只是学到了教师模型最终预测的概率分布,而完全忽略了中间隐藏层的表示,从而导致学生模型过拟合,泛化能力不足。 | |||||
BERT-PKD除了进行软标签蒸馏外,还对教师模型的中间层进行蒸馏。 | |||||
## 1. 依赖 | |||||
- Python 3.6 | |||||
- oneflow-cu101 0.1.10 | |||||
完整的环境可以通过以下命令安装: | |||||
```bash | |||||
conda create -n distil python=3.6
``` | |||||
```bash | |||||
python3 -m pip install --find-links https://oneflow-inc.github.io/nightly oneflow_cu101 --user | |||||
``` | |||||
> 注:以下操作时,根目录为`model_compress/distil` | |||||
## 2. 数据获取 | |||||
如何获取数据请查阅[这里](../../README.md#22-数据获取) | |||||
## 3. 微调教师模型 | |||||
如何微调教师模型请查阅[这里](../../README.md#23-微调教师模型) | |||||
## 4. 蒸馏到学生模型 | |||||
### 4.1 教师模型中间层保存与转换 | |||||
为了初始化一个更好的学生模型,我们可以利用教师模型的中间层参数来初始化学生模型,而不是随机初始化一个学生模型。 | |||||
执行以下命令将教师模型的某些中间层参数提取并保存,用于初始化学生模型: | |||||
- FT_BERT_BASE_DIR: 在特定任务上微调过的教师模型路径 | |||||
- TMP_STUDENT_DIR: 临时学生模型路径 | |||||
- LAYER_LIST: 保存的层数,如"2,6,10"是保存教师模型的第2,6,10层,用来初始化学生模型的第1,2,3层参数 | |||||
```bash | |||||
FT_BERT_BASE_DIR="./models/finetuned_teacher/SST-2_epoch-3_lr-2e-5_wd-0.0001/snapshot_best" | |||||
#FT_BERT_BASE_DIR="./models/finetuned_teacher/RTE_epoch-5_lr-3e-5_wd-0.0001/snapshot_best" | |||||
#FT_BERT_BASE_DIR="./models/finetuned_teacher/MRPC_epoch-5_lr-1e-5_wd-0.001/snapshot_best" | |||||
#FT_BERT_BASE_DIR="./models/finetuned_teacher/CoLA_epoch-5_lr-1e-5_wd-0.01/snapshot_best" | |||||
#FT_BERT_BASE_DIR="./models/finetuned_teacher/QQP_epoch-5_lr-2e-5_wd-0.0001/snapshot_best" | |||||
TMP_STUDENT_DIR='./models/student_model/bert_pkd_3/SST-2' | |||||
LAYER_LIST="2,6,10" | |||||
python3 examples/bert-pkd/bert-pkd_generate_student_model.py \ | |||||
--teacher_model=${FT_BERT_BASE_DIR} \ | |||||
--student_model=${TMP_STUDENT_DIR} \ | |||||
--layer_list=${LAYER_LIST} | |||||
``` | |||||
临时学生模型下载链接(SST-2, RTE, MRPC, CoLA, QQP数据集) | |||||
链接: https://pan.baidu.com/s/17F8KVsLd_lMODLaVLc7yrQ 提取码: 95ir | |||||
下载并解压,将相应的模型放置到`"./models/student_model/bert_pkd_3"`路径下 | |||||
### 4.2 训练 | |||||
执行以下脚本将教师模型蒸馏到学生模型: | |||||
- DATA_ROOT: GLUE数据集总路径 | |||||
- dataset: 任务名 | |||||
- FT_BERT_BASE_DIR: 在特定任务上微调过的教师模型路径 | |||||
- TMP_STUDENT_DIR: 临时学生模型路径(从教师模型中间层初始化时需要指定) | |||||
- STUDENT_DIR: 学生模型保存路径 | |||||
- RESULT_DIR: 测试结果json文件保存路径 (如果为RESULT_DIR="",则默认保存到模型保存路径下,results_eval.json) | |||||
- SERVE_FOR_ONLINE: 模型是否用于上线 (默认SERVE_FOR_ONLINE='False',如果SERVE_FOR_ONLINE='True',则删除清理模型保存路径中的无关变量,如教师模型参数和优化器参数等等) | |||||
```bash | |||||
bash run_train_student_bert_pkd.sh | |||||
``` | |||||
### 4.3 测试 | |||||
执行以下脚本进行测试: | |||||
- DATA_ROOT: GLUE数据集总路径 | |||||
- dataset: 任务名 | |||||
- STUDENT_DIR: 学生模型保存路径 | |||||
- RESULT_DIR: 测试结果json文件保存路径 (如果为RESULT_DIR="",则默认保存到模型保存路径下,results_eval.json) | |||||
蒸馏过的学生模型下载链接如下(SST-2数据集): | |||||
- 从教师模型中间层初始化,下载链接: https://pan.baidu.com/s/1l7vXn-3U05Hzl0RXCJPiLg 提取码: 33dk | |||||
- 随机初始化,下载链接: https://pan.baidu.com/s/1m46j57Tova_yaGLabAqUIw 提取码: pdx4 | |||||
```bash | |||||
bash run_eval_student_bert_pkd.sh | |||||
``` | |||||
### 4.4 结果 | |||||
在SST-2 DEV数据集上: | |||||
- 模型精度:教师模型 acc 92.2% -> 学生模型 acc 88.4%
- 模型尺寸:教师模型 110M -> 学生模型 45.7M (↓2.4x)
- 推理耗时:教师模型 4.04s -> 学生模型 1.69s (↓2.4x)
@@ -0,0 +1,76 @@ | |||||
""" | |||||
Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
import os | |||||
import argparse | |||||
import shutil | |||||
import re | |||||
def str2bool(v):
    """Convert a command-line style truth value to ``bool``.

    Args:
        v: a ``bool`` (returned unchanged) or a string; the strings
            ``yes/true/t/y/1`` and ``no/false/f/n/0`` are accepted in any case.

    Returns:
        The corresponding boolean.

    Raises:
        argparse.ArgumentTypeError: if the string is not one of the accepted values.
    """
    # A real boolean has no .lower(); pass it through so the function can also
    # be called directly with an already-parsed value.
    if isinstance(v, bool):
        return v
    lowered = v.lower()
    if lowered in ('yes', 'true', 't', 'y', '1'):
        return True
    if lowered in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Unsupported value encountered.')
# Command-line options: where the teacher checkpoint lives, where the student
# checkpoint is written, and which teacher layers are kept.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--teacher_model",
    default=None,
    type=str,
    help="The teacher model dir.",
)
parser.add_argument(
    "--student_model",
    default=None,
    type=str,
    help="The student model dir.",
)
parser.add_argument(
    "--layer_list",
    default="2,6,10",
    type=str,
    help="the set of intermediate layers to distill knowledge from",
)
args = parser.parse_args()

# "2,6,10" -> [2, 6, 10]
args.layer_list = [int(layer) for layer in args.layer_list.split(',')]
args.layer_num = len(args.layer_list)

student_filelist = []
def subString(template):
    """Return every capture of ``bert-encoder-layer_<X>-`` found in ``template``.

    The captured ``<X>`` values are returned as strings; the list is empty when
    the name contains no such segment.
    """
    return re.findall(r'bert-encoder-layer_(.*?)-', template)
def CopyFile(filepath, newPath):
    """Recursively copy the contents of directory ``filepath`` into ``newPath``.

    ``newPath`` and any needed sub-directories are created when missing.
    Regular files are copied with ``shutil.copyfile``; every other entry is
    treated as a directory and descended into.
    """
    if not os.path.exists(newPath):
        os.makedirs(newPath)
    for entry in os.listdir(filepath):
        source = os.path.join(filepath, entry)
        target = os.path.join(newPath, entry)
        if os.path.isfile(source):
            shutil.copyfile(source, target)
            continue
        # Not a regular file: make sure the destination exists, then descend.
        if not os.path.exists(target):
            os.makedirs(target)
        CopyFile(source, target)
# Build the student checkpoint from the teacher checkpoint: every entry of
# the teacher directory that is itself a directory is copied under the name
# "student-<name>", except that encoder layers are kept only when listed in
# --layer_list and are renumbered by their position in that list.
os.makedirs(args.student_model, exist_ok=True)

# Only the immediate children of the teacher directory are visited: CopyFile
# already copies each one recursively, and descending further (as os.walk
# does) would build paths relative to the top-level directory that do not exist.
for subdir in os.listdir(args.teacher_model):
    teacher_dir = os.path.join(args.teacher_model, subdir)
    if not os.path.isdir(teacher_dir):
        continue
    # Directories ending in "-v" / "-m" are not carried over to the student
    # (presumably optimizer state -- TODO confirm).
    if str(subdir[-2:]) == '-v' or str(subdir[-2:]) == '-m':
        continue
    teacher_layer_num = subString(subdir)
    x = 'student-' + subdir
    if len(teacher_layer_num) == 0:
        # Not an encoder-layer entry: copy as is.
        CopyFile(teacher_dir, os.path.join(args.student_model, x))
    else:
        teacher_layer_num = int(teacher_layer_num[0])
        if teacher_layer_num in args.layer_list:
            # The student layer index is the position of the teacher layer in --layer_list.
            student_layer_num = args.layer_list.index(teacher_layer_num)
            rule = r'bert-encoder-layer_(.*?)-'
            x = re.sub(rule, 'bert-encoder-layer_{}-'.format(str(student_layer_num)), x)
            CopyFile(teacher_dir, os.path.join(args.student_model, x))
@@ -0,0 +1,8 @@ | |||||
# Extract selected layers of a fine-tuned teacher BERT to initialize a BERT-PKD student.
#   FT_BERT_BASE_DIR: directory of the teacher model fine-tuned on the task
#   TMP_STUDENT_DIR:  directory the temporary student model is written to
#   LAYER_LIST:       comma-separated teacher layers to keep
FT_BERT_BASE_DIR="./models/finetuned_teacher/SST-2_epoch-3_lr-2e-5_wd-0.0001/snapshot_last_snapshot"
TMP_STUDENT_DIR='./models/student_model/bert_pkd_3/SST-2'
LAYER_LIST="2,6,10"
# Expansions are quoted so that paths containing spaces or glob characters
# are passed through as a single argument.
python3 bert-pkd_generate_student_model.py \
  --teacher_model="${FT_BERT_BASE_DIR}" \
  --student_model="${TMP_STUDENT_DIR}" \
  --layer_list="${LAYER_LIST}"
@@ -0,0 +1,491 @@ | |||||
""" | |||||
Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
import os | |||||
import math | |||||
import numpy as np | |||||
import sys | |||||
curPath = os.path.abspath(os.path.dirname(__file__)) | |||||
rootPath = os.path.split(curPath)[0] | |||||
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "./src"))) | |||||
import oneflow as flow | |||||
from classifier import GlueBERT | |||||
from util import Snapshot, Summary, InitNodes, Metric, CreateOptimizer, GetFunctionConfig, getdirsize, remove_optimizer_params, remove_teacher_params | |||||
import config as configs | |||||
from sklearn.metrics import accuracy_score, matthews_corrcoef, precision_score, recall_score, f1_score | |||||
import argparse | |||||
import shutil | |||||
import tempfile | |||||
from knowledge_distill_util import BertForSequenceClassification, BertStudentForSequenceClassification, soft_cross_entropy, mseloss, layer_distill, att_distill, pred_distill | |||||
import time | |||||
import json | |||||
def str2bool(v):
    """Map a textual truth value to ``bool`` for use as an argparse ``type``.

    Accepts ``yes/true/t/y/1`` and ``no/false/f/n/0`` in any letter case and
    raises ``argparse.ArgumentTypeError`` for anything else.
    """
    truthy = ('yes', 'true', 't', 'y', '1')
    falsy = ('no', 'false', 'f', 'n', '0')
    lowered = v.lower()
    if lowered in truthy:
        return True
    if lowered in falsy:
        return False
    raise argparse.ArgumentTypeError('Unsupported value encountered.')
# Extend the parser returned by configs.get_parser() with the options of this
# script: task and checkpoint locations, data sets, student/teacher model
# sizes and distillation hyper-parameters.
parser = configs.get_parser()
parser.add_argument("--task_name", type=str, default='CoLA')
parser.add_argument("--teacher_model", default=None, type=str, help="The teacher model dir.")
parser.add_argument("--student_model", default=None, type=str, help="The student model dir.")
# NOTE(review): the help text below is identical to --student_model's; looks copy-pasted -- confirm what --total_model is.
parser.add_argument("--total_model", default=None, type=str, help="The student model dir.")

parser.add_argument('--num_epochs', type=int, default=3, help='number of epochs')
# Training data.
parser.add_argument("--train_data_dir", type=str, default=None)
parser.add_argument("--train_data_prefix", type=str, default='train.of_record-')
parser.add_argument("--train_example_num", type=int, default=88614,
                    help="example number in dataset")
parser.add_argument("--batch_size_per_device", type=int, default=32)
parser.add_argument("--train_data_part_num", type=int, default=1,
                    help="data part number in dataset")
# Evaluation data.
parser.add_argument("--eval_data_dir", type=str, default=None)
parser.add_argument("--eval_data_prefix", type=str, default='eval.of_record-')
parser.add_argument("--eval_example_num", type=int, default=10833,
                    help="example number in dataset")
parser.add_argument("--eval_batch_size_per_device", type=int, default=64)
parser.add_argument("--eval_data_part_num", type=int, default=1,
                    help="data part number in dataset")
parser.add_argument("--result_dir", type=str, default="", help="the save directory of results")

# Student network hyper-parameters.
parser.add_argument("--student_num_hidden_layers", type=int, default=24)
parser.add_argument("--student_num_attention_heads", type=int, default=16)
parser.add_argument("--student_max_position_embeddings", type=int, default=512)
parser.add_argument("--student_type_vocab_size", type=int, default=2)
parser.add_argument("--student_vocab_size", type=int, default=30522)
parser.add_argument("--student_attention_probs_dropout_prob", type=float, default=0.1)
parser.add_argument("--student_hidden_dropout_prob", type=float, default=0.1)
parser.add_argument("--student_hidden_size_per_head", type=int, default=64)
parser.add_argument("--student_hidden_size", type=int, default=768)

# Teacher network hyper-parameters.
parser.add_argument("--teacher_num_hidden_layers", type=int, default=24)
parser.add_argument("--teacher_num_attention_heads", type=int, default=16)
parser.add_argument("--teacher_max_position_embeddings", type=int, default=512)
parser.add_argument("--teacher_type_vocab_size", type=int, default=2)
parser.add_argument("--teacher_vocab_size", type=int, default=30522)
parser.add_argument("--teacher_attention_probs_dropout_prob", type=float, default=0.1)
parser.add_argument("--teacher_hidden_dropout_prob", type=float, default=0.1)
parser.add_argument("--teacher_hidden_size_per_head", type=int, default=64)
parser.add_argument("--teacher_hidden_size", type=int, default=768)

# Distillation options.
parser.add_argument("--kd_alpha", type=float, default=0.5, help='the usual Distillation loss {0.2,0.5,0.7}')
parser.add_argument("--kd_beta", type=float, default=10, help='the proposed loss {10,100,500,1000}')
# NOTE(review): for the three str2bool flags below, nargs='?' with const=False
# means the bare flag (no value) yields False, and omitting the flag yields
# None because no default is set -- confirm this is intended.
parser.add_argument('--from_scratch', type=str2bool, nargs='?', const=False, help='train the student model from scratch or initialize from teacher layers')
parser.add_argument('--temperature', type=float, default=1.)
parser.add_argument('--aug_train', type=str2bool, nargs='?', const=False, help='using augmented training set?')
parser.add_argument('--serve_for_online', type=str2bool, nargs='?', const=False, help='if serve for online, then after training, will delete the teacher params and optimizer parmas from model_save_dir')

args = parser.parse_args()

task_name = args.task_name.lower()

if args.aug_train:
    # Every occurrence of 'train' in the path is rewritten to 'train_aug';
    # this fails if --train_data_dir was not given (it defaults to None).
    args.train_data_dir = args.train_data_dir.replace('train','train_aug')

# Global batch sizes: nodes * GPUs per node * per-device batch size.
# (num_nodes / gpu_num_per_node are not defined here; presumably they come from configs.get_parser().)
batch_size = args.num_nodes * args.gpu_num_per_node * args.batch_size_per_device
eval_batch_size = args.num_nodes * args.gpu_num_per_node * args.eval_batch_size_per_device

# Steps per epoch and per evaluation pass, rounded up so a final partial batch is counted.
epoch_size = math.ceil(args.train_example_num / batch_size)
num_eval_steps = math.ceil(args.eval_example_num / eval_batch_size)
args.iter_num = epoch_size * args.num_epochs
configs.print_args(args)

# Output type of each GLUE task, keyed by lower-cased task name.
glue_output_modes = {
    "cola": "classification",
    "mnli": "classification",
    "mnli-mm": "classification",
    "mrpc": "classification",
    "sst-2": "classification",
    "sts-b": "regression",
    "qqp": "classification",
    "qnli": "classification",
    "rte": "classification",
    "wnli": "classification",
}

# Task groups by name (presumably grouped by evaluation metric: accuracy / correlation / MCC -- TODO confirm).
acc_tasks = ["mnli", "mrpc", "sst-2", "qqp", "qnli", "rte"]
corr_tasks = ["sts-b"]
mcc_tasks = ["cola"]

# Raises KeyError for a task name that is not listed in glue_output_modes.
output_mode = glue_output_modes[args.task_name.lower()]
def BertDecoder(
    data_dir, batch_size, data_part_num, seq_length, part_name_prefix, shuffle=True
):
    """Read an OFRecord data set and decode the BERT input fields.

    Args:
        data_dir: directory passed to ``flow.data.ofrecord_reader``.
        batch_size: batch size of the reader.
        data_part_num: number of data parts of the data set.
        seq_length: length of the ``input_ids``, ``input_mask`` and
            ``segment_ids`` fields.
        part_name_prefix: file-name prefix of the data parts.
        shuffle: used for both ``random_shuffle`` and ``shuffle_after_epoch``.

    Returns:
        dict mapping each field name to its decoded int32 blob.
    """
    # (field name, per-example shape), in decoding order.
    field_shapes = (
        ("input_ids", [seq_length]),
        ("input_mask", [seq_length]),
        ("segment_ids", [seq_length]),
        ("label_ids", [1]),
        ("is_real_example", [1]),
    )
    with flow.scope.placement("cpu", "0:0"):
        records = flow.data.ofrecord_reader(
            data_dir,
            batch_size=batch_size,
            data_part_num=data_part_num,
            part_name_prefix=part_name_prefix,
            random_shuffle=shuffle,
            shuffle_after_epoch=shuffle,
        )
        decoded = {}
        for field, shape in field_shapes:
            decoded[field] = flow.data.OFRecordRawDecoder(
                records, field, shape=shape, dtype=flow.int32
            )
        return decoded
def get_tensor_data(
    batch_size,
    data_part_num,
    data_dir,
    part_name_prefix,
    shuffle=True
):
    """Return the decoded input blobs for a data set, using ``args.seq_length`` as sequence length.

    Thin wrapper around ``BertDecoder`` with a different parameter order.
    """
    return BertDecoder(
        data_dir,
        batch_size,
        data_part_num,
        args.seq_length,
        part_name_prefix,
        shuffle=shuffle,
    )
def BuildBert(
    batch_size,
    data_part_num,
    data_dir,
    part_name_prefix,
    shuffle=True
):
    """Build the ``GlueBERT`` classifier on top of the decoded data set.

    Model sizes are read from the module-level ``args``; as a side effect
    ``args.hidden_size_per_head`` is set to ``hidden_size / num_attention_heads``
    (true division, so the value is a float).

    Returns:
        ``(loss, logits, label_ids)``.
    """
    width = args.hidden_size
    args.hidden_size_per_head = width / args.num_attention_heads

    blobs = BertDecoder(
        data_dir, batch_size, data_part_num, args.seq_length, part_name_prefix, shuffle=shuffle
    )

    # Keyword configuration of the network; the feed-forward width is 4x the hidden size.
    bert_config = dict(
        seq_length=args.seq_length,
        hidden_size=width,
        num_hidden_layers=args.num_hidden_layers,
        num_attention_heads=args.num_attention_heads,
        intermediate_size=width * 4,
        hidden_act="gelu",
        hidden_dropout_prob=args.hidden_dropout_prob,
        attention_probs_dropout_prob=args.attention_probs_dropout_prob,
        max_position_embeddings=args.max_position_embeddings,
        type_vocab_size=args.type_vocab_size,
        initializer_range=0.02,
    )
    loss, logits = GlueBERT(
        blobs['input_ids'],
        blobs['input_mask'],
        blobs['segment_ids'],
        blobs['label_ids'],
        args.vocab_size,
        **bert_config
    )
    return loss, logits, blobs['label_ids']
def student_model(input_ids, input_mask, segment_ids,is_train=True):
    """Build the BERT student classifier on the given input blobs.

    All student dimensions come from the ``args.student_*`` options. As a
    side effect the per-head width is stored in
    ``args.student_hidden_size_per_head``.

    Args:
        input_ids: Token-id blob.
        input_mask: Attention-mask blob.
        segment_ids: Token-type-id blob.
        is_train: Forwarded to the model builder as ``is_train``.

    Returns:
        tuple: ``(logits, reps, atts)`` as produced by
        ``BertStudentForSequenceClassification``.
    """
    width = args.student_hidden_size
    # NOTE(review): true division -- the stored per-head size is a float.
    args.student_hidden_size_per_head = width / args.student_num_attention_heads

    model_kwargs = dict(
        input_ids_blob=input_ids,
        input_mask_blob=input_mask,
        token_type_ids_blob=segment_ids,
        label_blob=None,
        vocab_size=args.student_vocab_size,
        seq_length=args.seq_length,
        hidden_size=width,
        num_hidden_layers=args.student_num_hidden_layers,
        num_attention_heads=args.student_num_attention_heads,
        intermediate_size=width * 4,  # feed-forward width is 4x the hidden width
        hidden_act="gelu",
        hidden_dropout_prob=args.student_hidden_dropout_prob,
        attention_probs_dropout_prob=args.student_attention_probs_dropout_prob,
        max_position_embeddings=args.student_max_position_embeddings,
        type_vocab_size=args.student_type_vocab_size,
        initializer_range=0.02,
        is_student=True,
        # fit_size is set to the teacher's hidden width
        fit_size=args.teacher_hidden_size,
        is_train=is_train,
    )
    logits, reps, atts = BertStudentForSequenceClassification(**model_kwargs)
    return logits, reps, atts
def teacher_model(input_ids,input_mask,segment_ids,is_train):
    """Build the BERT teacher classifier on the given input blobs.

    Teacher dimensions come from the ``args.teacher_*`` options (the
    vocabulary size comes from ``args.vocab_size``). As a side effect the
    per-head width is stored in ``args.teacher_hidden_size_per_head``.

    Args:
        input_ids: Token-id blob.
        input_mask: Attention-mask blob.
        segment_ids: Token-type-id blob.
        is_train: Forwarded to the model builder as ``is_train``.

    Returns:
        tuple: ``(logits, reps, atts)`` as produced by
        ``BertForSequenceClassification``.
    """
    width = args.teacher_hidden_size
    # NOTE(review): true division -- the stored per-head size is a float.
    args.teacher_hidden_size_per_head = width / args.teacher_num_attention_heads

    model_kwargs = dict(
        input_ids_blob=input_ids,
        input_mask_blob=input_mask,
        token_type_ids_blob=segment_ids,
        label_blob=None,
        vocab_size=args.vocab_size,
        seq_length=args.seq_length,
        hidden_size=width,
        num_hidden_layers=args.teacher_num_hidden_layers,
        num_attention_heads=args.teacher_num_attention_heads,
        intermediate_size=width * 4,  # feed-forward width is 4x the hidden width
        hidden_act="gelu",
        hidden_dropout_prob=args.teacher_hidden_dropout_prob,
        attention_probs_dropout_prob=args.teacher_attention_probs_dropout_prob,
        max_position_embeddings=args.teacher_max_position_embeddings,
        type_vocab_size=args.teacher_type_vocab_size,
        initializer_range=0.02,
        is_student=False,
        is_train=is_train,
    )
    logits, reps, atts = BertForSequenceClassification(**model_kwargs)
    return logits, reps, atts
@flow.global_function(type='train', function_config=GetFunctionConfig(args))
def DistilJob():
    """Train job: distil the teacher into the student on the training split.

    The optimised objective is::

        (1 - kd_alpha) * cross_entropy(student_logits, labels)
        + kd_alpha * pred_distill(student_logits, teacher_logits)
        + kd_beta  * layer_distill(student_reps, teacher_reps)

    Returns:
        dict: ``{'loss': <combined loss blob>}``.

    Raises:
        NotImplementedError: If the task's output mode is not
            ``"classification"``. The prediction-distillation term was never
            implemented for regression, and the job previously died with a
            ``NameError`` on the undefined ``ds_loss``.
    """
    if output_mode != "classification":
        # TODO: regression (e.g. STS-B) needs an MSE-based objective between
        # the student logits and the labels; fail fast until that exists.
        raise NotImplementedError(
            "DistilJob only supports classification tasks, got output mode "
            "{!r}".format(output_mode)
        )

    train_dataset = get_tensor_data(
        batch_size,
        args.train_data_part_num,
        args.train_data_dir,
        args.train_data_prefix,
    )
    input_ids = train_dataset['input_ids']
    input_mask = train_dataset['input_mask']
    segment_ids = train_dataset['segment_ids']

    student_logits, student_reps, _ = student_model(input_ids, input_mask, segment_ids, is_train=True)
    # The teacher is built with is_train=False.
    teacher_logits, teacher_reps, _ = teacher_model(input_ids, input_mask, segment_ids, is_train=False)

    pt_loss = layer_distill(args, student_reps, teacher_reps)
    ds_loss = pred_distill(args, student_logits, teacher_logits)
    loss_ce = flow.nn.sparse_softmax_cross_entropy_with_logits(
        logits=student_logits, labels=train_dataset['label_ids']
    )

    loss_pkd = loss_ce * (1 - args.kd_alpha) + args.kd_alpha * ds_loss + args.kd_beta * pt_loss
    flow.losses.add_loss(loss_pkd)

    opt = CreateOptimizer(args)
    opt.minimize(loss_pkd)

    return {'loss': loss_pkd}
# | |||||
@flow.global_function(type='predict', function_config=GetFunctionConfig(args))
def StudentBertGlueEvalTrainJob():
    """Predict job: run the student over the un-shuffled training split.

    Returns:
        tuple: ``(student_logits, label_ids)`` for one batch.
    """
    batch = get_tensor_data(
        batch_size,
        args.train_data_part_num,
        args.train_data_dir,
        args.train_data_prefix,
        shuffle=False,
    )
    # Only the logits are needed for evaluation; the other outputs are unused.
    logits, _reps, _atts = student_model(
        batch['input_ids'], batch['input_mask'], batch['segment_ids'], is_train=False
    )
    return logits, batch['label_ids']
@flow.global_function(type='predict', function_config=GetFunctionConfig(args))
def StudentBertGlueEvalValJob():
    """Predict job: run the student over the un-shuffled evaluation split.

    Returns:
        tuple: ``(student_logits, label_ids)`` for one batch.
    """
    batch = get_tensor_data(
        eval_batch_size,
        args.eval_data_part_num,
        args.eval_data_dir,
        args.eval_data_prefix,
        shuffle=False,
    )
    # Only the logits are needed for evaluation; the other outputs are unused.
    logits, _reps, _atts = student_model(
        batch['input_ids'], batch['input_mask'], batch['segment_ids'], is_train=False
    )
    return logits, batch['label_ids']
def run_eval_job(eval_job_func, num_steps, desc='train'):
    """Run a predict job for ``num_steps`` batches and report the metrics.

    Besides printing, a summary is written as JSON to
    ``<args.result_dir>/results_<desc>.json``. When ``args.result_dir`` is the
    empty string it is first set to ``args.model_save_dir``.

    Args:
        eval_job_func: Job whose call result has ``.get()`` returning
            ``(logits, label)`` for one batch.
        num_steps: Number of batches to pull.
        desc: Tag used in the printed line and in the result file name.

    Returns:
        dict: accuracy, matthews_corrcoef, precision, recall and f1.
    """
    labels, predictions = [], []

    started = time.time()
    for _ in range(num_steps):
        logits, label = eval_job_func().get()
        predictions.extend(list(logits.numpy().argmax(axis=1)))
        labels.extend(list(label))
    cost_time = time.time() - started
    print('cost time: {} s'.format(cost_time))

    model_size = getdirsize(args.model_save_dir)
    size_mb = model_size / 1024 / 1024
    print('model_size: %d Mbytes' % size_mb)  # Mbytes

    metric_dict = {
        "accuracy": accuracy_score(labels, predictions),
        "matthews_corrcoef": matthews_corrcoef(labels, predictions),
        "precision": precision_score(labels, predictions),
        "recall": recall_score(labels, predictions),
        "f1": f1_score(labels, predictions),
    }

    # NOTE(review): the throughput always divides args.eval_example_num by the
    # elapsed time, also when this function is run on the training split.
    save_dict = {
        "accuracy": "%.2f" % metric_dict["accuracy"],
        "MCC": "%.2f" % metric_dict["matthews_corrcoef"],
        "precision": "%.2f" % metric_dict["precision"],
        "recall": "%.2f" % metric_dict["recall"],
        "f_1": "%.2f" % metric_dict["f1"],
        "modelSize": "%d" % size_mb,
        "reasoningTime": "%.2f" % (args.eval_example_num / cost_time),  # sample/second
    }

    if args.result_dir == "":
        args.result_dir = args.model_save_dir
    if not os.path.exists(args.result_dir):
        os.makedirs(args.result_dir)
    result_file = os.path.join(args.result_dir, 'results_{}.json'.format(desc))
    with open(result_file, "w") as f:
        json.dump(save_dict, f)

    print(desc, ', '.join('{}: {:.3f}'.format(k, v) for k, v in metric_dict.items()))
    return metric_dict
def CopyFile(filepath, newPath):
    """Recursively copy the contents of ``filepath`` into ``newPath``.

    Existing files in the destination are overwritten and existing
    directories are merged into, so several source trees can be combined
    into one destination by calling this repeatedly.

    Args:
        filepath: Source directory whose entries are copied.
        newPath: Destination directory. It is created (including missing
            parents) when it does not exist yet.
    """
    # Creating the destination up front lets callers pass a fresh path and
    # also covers every sub-directory reached through the recursion below.
    os.makedirs(newPath, exist_ok=True)
    for name in os.listdir(filepath):
        src = os.path.join(filepath, name)
        dst = os.path.join(newPath, name)
        if os.path.isfile(src):
            shutil.copyfile(src, dst)
        else:
            CopyFile(src, dst)
def main():
    """Entry point: distil the student (``args.do_train``) and/or evaluate it
    (``args.do_eval``).

    Training loads the teacher alone (``args.from_scratch``) or a temporary
    directory combining the student and teacher checkpoints, then runs
    ``args.num_epochs`` epochs. After every epoch the student is evaluated on
    both splits, and the checkpoint is written to ``args.model_save_dir``
    whenever the dev metric improves.
    """
    # NOTE(review): the source this was reconstructed from had lost its
    # indentation; the nesting below was inferred from the control flow.
    import shutil

    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)
    InitNodes(args)

    check_point = flow.train.CheckPoint()
    summary = Summary(args.log_dir, args)

    if not os.path.exists(args.model_save_dir):
        os.makedirs(args.model_save_dir)

    if args.do_train:
        print('Loading model...')
        if not args.from_scratch:
            print('Combining two models into one dir')
            if not os.path.exists('./tmp'):
                os.makedirs('./tmp')
            # Merge both checkpoints into one directory and load that.
            args.total_model = tempfile.mkdtemp(dir='./tmp')
            CopyFile(args.student_model, args.total_model)
            CopyFile(args.teacher_model, args.total_model)
            print('Initialize the student model from the teacher model...')
            check_point.load(args.total_model)
        else:
            print('Train the student model from scratch...')
            check_point.load(args.teacher_model)

        print('Start training...')
        global_step = 0
        best_dev_acc = 0.0
        # (task list, metric key) pairs used to decide whether the dev score improved.
        selection_rules = ((acc_tasks, 'accuracy'), (mcc_tasks, 'matthews_corrcoef'))

        for epoch in range(args.num_epochs):
            metric = Metric(desc='finetune', print_steps=args.loss_print_every_n_iter, summary=summary,
                            batch_size=batch_size, keys=['loss'])
            for step in range(epoch_size):
                DistilJob().async_get(metric.metric_cb(step, epoch=epoch))
                global_step += 1

            print('EvalTrainJob...')
            run_eval_job(StudentBertGlueEvalTrainJob, epoch_size, desc='train')
            print('EvalValJob...')
            result = run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')

            save_model = False
            for tasks, key in selection_rules:
                if task_name in tasks and result[key] > best_dev_acc:
                    best_dev_acc = result[key]
                    save_model = True
            # NOTE(review): this prints the current epoch's result, which is
            # not necessarily the best one so far.
            print('Best result:', result)

            if save_model:
                # Start from an empty directory before writing the snapshot.
                if os.path.exists(args.model_save_dir):
                    shutil.rmtree(args.model_save_dir)
                if not os.path.exists(args.model_save_dir):
                    os.makedirs(args.model_save_dir)
                snapshot_save_path = os.path.join(args.model_save_dir)
                print("Saving best model to {}".format(snapshot_save_path))
                check_point.save(snapshot_save_path)
                flow.sync_default_session()

        if args.save_last_snapshot:
            # Overwrites whatever is in model_save_dir with the final weights.
            snapshot_save_path = args.model_save_dir
            if os.path.exists(args.model_save_dir):
                shutil.rmtree(args.model_save_dir)
            print("Saving model to {}".format(snapshot_save_path))
            check_point.save(snapshot_save_path)
            flow.sync_default_session()

        # The combined temp checkpoint is only removed after at least 100 steps.
        if not args.from_scratch and global_step >= 100:
            print('Removing the tmp models...')
            shutil.rmtree(args.total_model)

        if args.serve_for_online:
            print('Deleting the teacher params and the optimizer parmas from model_save_dir...')
            remove_teacher_params(args.model_save_dir)
            # NOTE(review): this message is printed but nothing is removed here.
            print('Removing the tmp models...')

    if args.do_eval:
        print('Loading model...')
        print(args.model_save_dir)
        # After training the weights are already in the session.
        if not args.do_train:
            check_point.load(args.model_save_dir)
        print('Evaluation...')
        run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')


if __name__ == "__main__":
    main()
@@ -0,0 +1,61 @@ | |||||
# Distilled-BiLSTM | |||||
["Distilling task-specific knowledge from bert into simple neural networks"](https://arxiv.org/abs/1903.12136)论文的实现 | |||||
Distilled BiLSTM的教师模型采用微调过的BERT,学生模型采用简单神经网络LSTM。 | |||||
蒸馏的目标是KD loss,即仅使用软标签进行蒸馏,将BERT中的知识蒸馏到LSTM中。 | |||||
## 1. 依赖 | |||||
- Python 3.6 | |||||
- oneflow-cu101 0.1.10 | |||||
完整的环境可以通过以下命令安装: | |||||
```bash | |||||
conda create -n tinybert python=3.6 | |||||
``` | |||||
```bash | |||||
python3 -m pip install --find-links https://oneflow-inc.github.io/nightly oneflow_cu101 --user | |||||
``` | |||||
> 注:以下操作时,根目录为`model_compress/distil` | |||||
## 2. 数据获取 | |||||
如何获取数据请查阅[这里](../../README.md#22-数据获取) | |||||
## 3. 微调教师模型 | |||||
如何微调教师模型请查阅[这里](../../README.md#23-微调教师模型) | |||||
## 4. 蒸馏到学生模型 | |||||
### 4.1 训练 | |||||
执行以下脚本将教师模型蒸馏到学生模型: | |||||
- DATA_ROOT: GLUE数据集总路径 | |||||
- dataset: 任务名 | |||||
- FT_BERT_BASE_DIR: 在特定任务上微调过的教师模型路径 | |||||
- STUDENT_DIR: 学生模型保存路径 | |||||
- RESULT_DIR: 测试结果json文件保存路径 (如果RESULT_DIR="",则默认保存为模型保存路径下的results_eval.json)
- SERVE_FOR_ONLINE: 模型是否用于上线 (默认SERVE_FOR_ONLINE='False',如果SERVE_FOR_ONLINE='True',则删除清理模型保存路径中的无关变量,如教师模型参数和优化器参数等等) | |||||
> 最大序列长度为32,词表大小为10000 | |||||
```bash | |||||
bash run_train_student_distilled_lstm.sh | |||||
``` | |||||
### 4.2 测试 | |||||
蒸馏过的学生模型下载链接如下(SST-2数据集): | |||||
下载链接: https://pan.baidu.com/s/1M4XzB2DnLikglxVFvhnYpw 提取码: hqhj | |||||
执行以下脚本进行测试: | |||||
- DATA_ROOT: GLUE数据集总路径 | |||||
- dataset: 任务名 | |||||
- STUDENT_DIR: 学生模型保存路径 | |||||
- RESULT_DIR: 测试结果json文件保存路径 (如果RESULT_DIR="",则默认保存为模型保存路径下的results_eval.json)
```bash | |||||
bash run_eval_student_distilled_lstm.sh | |||||
``` | |||||
### 4.3 结果 | |||||
在SST-2 DEV数据集上: | |||||
- 模型精度:教师模型acc 92.2% ->学生模型acc 82.9% | |||||
- 模型尺寸:教师模型110M -> 学生模型 15.3M (↓7.5x) | |||||
- 推理耗时:教师模型4.04s -> 学生模型0.83s (↓4.8x)
@@ -0,0 +1,118 @@ | |||||
""" | |||||
Copyright 2020 The OneFlow Authors. All rights reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
# Train (and then evaluate) the LSTM student on one GLUE task.
# Set `dataset` below to choose the task; the per-task hyper-parameters are
# picked from it.
nvidia-smi

dataset=SST-2

# ofrecord dataset dir
DATA_ROOT=./data/glue_ofrecord

# Per-task settings. The task is selected by $dataset; $DATA_ROOT is a path
# and can never equal a task name.
if [ "$dataset" = "CoLA" ]; then
  train_example_num=8551
  eval_example_num=1043
  test_example_num=1063
  learning_rate=2e-5
  wd=0.0001
  epoch=70
elif [ "$dataset" = "MRPC" ]; then
  train_example_num=3668
  eval_example_num=408
  test_example_num=1725
  learning_rate=5e-6
  epoch=20
  wd=0.001
elif [ "$dataset" = "SST-2" ]; then
  train_example_num=67349
  eval_example_num=872
  test_example_num=1821
  learning_rate=3e-5
  epoch=4
  wd=0.0001
elif [ "$dataset" = "QQP" ]; then
  train_example_num=363849
  eval_example_num=40430
  test_example_num=0
  learning_rate=2e-5
  epoch=5
  wd=0.0001
elif [ "$dataset" = "MNLI" ]; then
  train_example_num=392702
  eval_example_num=9815
  test_example_num=0
  learning_rate=2e-5
  # No tuned epoch count was given for this task; 3 matches the default of
  # task_lstm.py's --num_epochs and keeps the argument from being empty.
  epoch=3
  wd=0.0001
elif [ "$dataset" = "WNLI" ]; then
  train_example_num=635
  eval_example_num=71
  test_example_num=0
  learning_rate=2e-5
  epoch=3  # see the MNLI note
  wd=0.0001
elif [ "$dataset" = "RTE" ]; then
  train_example_num=2490
  eval_example_num=277
  test_example_num=0
  learning_rate=2e-5
  epoch=3  # see the MNLI note
  wd=0.0001
else
  echo "dataset must be one of 'CoLA','MRPC','SST-2','QQP','MNLI','WNLI','RTE'"
  exit 1
fi

STUDENT_DIR="./models/student_model/${dataset}/lstm_32_epoch-${epoch}_lr-${learning_rate}_wd-${wd}"

train_data_dir=$DATA_ROOT/${dataset}/train
train_data_dir_lstm=$DATA_ROOT/${dataset}_lstm_32/train

eval_data_dir=$DATA_ROOT/${dataset}/eval
eval_data_dir_lstm=$DATA_ROOT/${dataset}_lstm_32/eval

GPU=0

# --model / --task_name take the task from $dataset; the variable used here
# before was never defined, so both were passed empty.
CUDA_VISIBLE_DEVICES=$GPU python3 task_lstm.py \
  --do_train='True' \
  --do_eval='True' \
  --model="Glue_${dataset}" \
  --task_name="${dataset}" \
  --gpu_num_per_node=1 \
  --num_epochs="${epoch}" \
  --train_data_dir="$train_data_dir" \
  --train_data_dir_lstm="${train_data_dir_lstm}" \
  --train_example_num="$train_example_num" \
  --eval_data_dir="$eval_data_dir" \
  --eval_data_dir_lstm="$eval_data_dir_lstm" \
  --eval_example_num="$eval_example_num" \
  --batch_size_per_device=32 \
  --eval_batch_size_per_device=32 \
  --loss_print_every_n_iter 1 \
  --log_dir=./log \
  --model_save_dir="${STUDENT_DIR}" \
  --seq_length=32 \
  --student_num_hidden_layers=4 \
  --student_num_attention_heads=12 \
  --student_max_position_embeddings=512 \
  --student_type_vocab_size=2 \
  --student_vocab_size=10002 \
  --student_attention_probs_dropout_prob=0.1 \
  --student_hidden_dropout_prob=0.1 \
  --student_hidden_size_per_head=26 \
  --student_hidden_size=300 \
  --learning_rate="$learning_rate" \
  --model_save_every_n_iter=50000 \
  --weight_decay_rate="$wd"
@@ -0,0 +1,338 @@ | |||||
""" | |||||
Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
import os | |||||
import math | |||||
import numpy as np | |||||
import sys | |||||
curPath = os.path.abspath(os.path.dirname(__file__)) | |||||
rootPath = os.path.split(curPath)[0] | |||||
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "./src"))) | |||||
import oneflow as flow | |||||
import oneflow.typing as tp | |||||
from util import Snapshot, Summary, InitNodes, Metric, CreateOptimizer, GetFunctionConfig | |||||
import config as configs | |||||
from sklearn.metrics import accuracy_score, matthews_corrcoef, precision_score, recall_score, f1_score | |||||
import argparse | |||||
import shutil | |||||
import tempfile | |||||
from knowledge_distill_util import LSTMStudentForSequenceClassification, BertForSequenceClassification, BertStudentForSequenceClassification, soft_cross_entropy, mseloss, layer_distill, att_distill, pred_distill | |||||
import time | |||||
import json | |||||
def str2bool(v):
    """Convert a command-line truthy/falsy token into a ``bool``.

    Intended as an ``argparse`` ``type=`` callable. Matching is
    case-insensitive.

    Args:
        v: One of ``yes/true/t/y/1`` or ``no/false/f/n/0``. A real ``bool``
            is returned unchanged.

    Returns:
        bool: The parsed value.

    Raises:
        argparse.ArgumentTypeError: If ``v`` is not a recognised token.
    """
    # A bool has no .lower(); pass it through instead of crashing.
    if isinstance(v, bool):
        return v
    token = v.lower()
    if token in ('yes', 'true', 't', 'y', '1'):
        return True
    if token in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Unsupported value encountered.')
# ---------------------------------------------------------------------------
# Command-line interface: the shared parser from ``config`` extended with the
# options used by this script.
# ---------------------------------------------------------------------------
parser = configs.get_parser()
parser.add_argument("--task_name", type=str, default='CoLA')
parser.add_argument("--student_model", default=None, type=str, help="The student model dir.")
parser.add_argument("--total_model", default=None, type=str, help="The student model dir.")

parser.add_argument('--num_epochs', type=int, default=3, help='number of epochs')
parser.add_argument("--train_data_dir", type=str, default=None)
parser.add_argument("--train_data_dir_lstm", type=str, default=None)
parser.add_argument("--train_data_prefix", type=str, default='train.of_record-')
parser.add_argument("--train_example_num", type=int, default=88614,
                    help="example number in dataset")
parser.add_argument("--batch_size_per_device", type=int, default=32)
parser.add_argument("--train_data_part_num", type=int, default=1,
                    help="data part number in dataset")
parser.add_argument("--eval_data_dir", type=str, default=None)
parser.add_argument("--eval_data_dir_lstm", type=str, default=None)
parser.add_argument("--eval_data_prefix", type=str, default='eval.of_record-')
parser.add_argument("--eval_example_num", type=int, default=10833,
                    help="example number in dataset")
parser.add_argument("--eval_batch_size_per_device", type=int, default=64)
parser.add_argument("--eval_data_part_num", type=int, default=1,
                    help="data part number in dataset")

# Student model dimensions.
parser.add_argument("--student_num_hidden_layers", type=int, default=24)
parser.add_argument("--student_num_attention_heads", type=int, default=16)
parser.add_argument("--student_max_position_embeddings", type=int, default=512)
parser.add_argument("--student_type_vocab_size", type=int, default=2)
parser.add_argument("--student_vocab_size", type=int, default=30522)
parser.add_argument("--student_attention_probs_dropout_prob", type=float, default=0.1)
parser.add_argument("--student_hidden_dropout_prob", type=float, default=0.1)
parser.add_argument("--student_hidden_size_per_head", type=int, default=64)
parser.add_argument("--student_hidden_size", type=int, default=768)

parser.add_argument("--kd_alpha", type=float, default=0.1)
parser.add_argument('--temperature', type=float, default=1.)
# A bare ``--aug_train`` must switch augmentation ON. With const=False the
# flag could only be enabled by spelling out ``--aug_train=true``.
parser.add_argument('--aug_train', type=str2bool, nargs='?', const=True, default=False,
                    help='using augmented training set?')

args = parser.parse_args()

task_name = args.task_name.lower()

if args.aug_train:
    # NOTE(review): this rewrites every occurrence of 'train' in the path and
    # requires --train_data_dir to be set.
    args.train_data_dir = args.train_data_dir.replace('train', 'train_aug')

# Global batch sizes across all nodes and devices.
batch_size = args.num_nodes * args.gpu_num_per_node * args.batch_size_per_device
eval_batch_size = args.num_nodes * args.gpu_num_per_node * args.eval_batch_size_per_device

# Steps per pass; ceil so that a trailing partial batch still gets a step.
epoch_size = math.ceil(args.train_example_num / batch_size)
num_eval_steps = math.ceil(args.eval_example_num / eval_batch_size)
args.iter_num = epoch_size * args.num_epochs
configs.print_args(args)

glue_output_modes = {
    "cola": "classification",
    "mnli": "classification",
    "mnli-mm": "classification",
    "mrpc": "classification",
    "sst-2": "classification",
    "sts-b": "regression",
    "qqp": "classification",
    "qnli": "classification",
    "rte": "classification",
    "wnli": "classification",
}

# Which dev metric decides whether a checkpoint is the best so far.
acc_tasks = ["mnli", "mrpc", "sst-2", "qqp", "qnli", "rte"]
corr_tasks = ["sts-b"]
mcc_tasks = ["cola"]

# Raises KeyError for a task name that is not listed above.
output_mode = glue_output_modes[task_name]
def BertDecoder(
        data_dir, batch_size, data_part_num, seq_length, part_name_prefix, shuffle=True
):
    """Read OFRecord parts and decode the BERT input fields.

    The reader and the decoders are placed on CPU ``"0:0"``.

    Args:
        data_dir: Directory holding the OFRecord parts.
        batch_size: Number of examples per batch.
        data_part_num: Number of part files.
        seq_length: Length of the three per-token fields.
        part_name_prefix: File-name prefix of the parts.
        shuffle: Used for both ``random_shuffle`` and ``shuffle_after_epoch``.

    Returns:
        dict: Decoded int32 blobs keyed by ``input_ids``, ``input_mask``,
        ``segment_ids``, ``label_ids`` and ``is_real_example``.
    """
    # (field name, per-example shape), decoded in this order.
    field_shapes = (
        ("input_ids", [seq_length]),
        ("input_mask", [seq_length]),
        ("segment_ids", [seq_length]),
        ("label_ids", [1]),
        ("is_real_example", [1]),
    )
    with flow.scope.placement("cpu", "0:0"):
        ofrecord = flow.data.ofrecord_reader(
            data_dir,
            batch_size=batch_size,
            data_part_num=data_part_num,
            part_name_prefix=part_name_prefix,
            random_shuffle=shuffle,
            shuffle_after_epoch=shuffle,
        )
        return {
            name: flow.data.OFRecordRawDecoder(ofrecord, name, shape=shape, dtype=flow.int32)
            for name, shape in field_shapes
        }
def get_tensor_data(
        batch_size,
        data_part_num,
        data_dir,
        part_name_prefix,
        shuffle=True
):
    """Build the decoded OFRecord input blobs for one dataset split.

    Thin wrapper around ``BertDecoder`` that fills in the sequence length
    from the global ``args.seq_length``.

    Args:
        batch_size: Number of examples per batch.
        data_part_num: Number of OFRecord part files in ``data_dir``.
        data_dir: Directory holding the OFRecord parts.
        part_name_prefix: File-name prefix of the parts.
        shuffle: Forwarded to ``BertDecoder`` as its ``shuffle`` flag.

    Returns:
        dict: The blobs returned by ``BertDecoder``.
    """
    return BertDecoder(
        data_dir,
        batch_size,
        data_part_num,
        args.seq_length,
        part_name_prefix,
        shuffle=shuffle,
    )
def student_model(input_ids, input_mask, segment_ids,is_train=True):
    """Build the LSTM student classifier on the given input blobs.

    The hidden width, vocabulary size and layer count come from the
    ``args.student_*`` options; the intermediate size is fixed at 400.

    Args:
        input_ids: Token-id blob.
        input_mask: Attention-mask blob.
        segment_ids: Token-type-id blob.
        is_train: Forwarded to the model builder as ``is_train``.

    Returns:
        The value returned by ``LSTMStudentForSequenceClassification``
        (used as the logits by the callers).
    """
    model_kwargs = dict(
        input_ids_blob=input_ids,
        input_mask_blob=input_mask,
        token_type_ids_blob=segment_ids,
        label_blob=None,
        vocab_size=args.student_vocab_size,
        seq_length=args.seq_length,
        hidden_size=args.student_hidden_size,
        intermediate_size=400,
        num_hidden_layers=args.student_num_hidden_layers,
        is_student=True,
        is_train=is_train,
    )
    return LSTMStudentForSequenceClassification(**model_kwargs)
def watch_handler(y: tp.Numpy):
    """Debug callback: print the watched value with an ``out:`` prefix."""
    print("out:", y)
def watch_diff_handler(blob: tp.Numpy):
    """Debug callback: print the watched value with its shape and dtype."""
    shape, dtype = blob.shape, blob.dtype
    print("watch_diff_handler:", blob, shape, dtype)
@flow.global_function(type='train', function_config=GetFunctionConfig(args))
def DistilJob():
    """Train job: fit the LSTM student on the shuffled LSTM training split.

    The loss is the sparse softmax cross-entropy between the student logits
    and the labels; no teacher term is involved in this job.

    Returns:
        dict: ``{'loss': <cross-entropy blob>}``.
    """
    batch = get_tensor_data(
        batch_size,
        args.train_data_part_num,
        args.train_data_dir_lstm,
        args.train_data_prefix,
        True,  # shuffle
    )
    logits = student_model(
        batch['input_ids'], batch['input_mask'], batch['segment_ids'], is_train=True
    )
    loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=batch['label_ids']
    )

    flow.losses.add_loss(loss)
    optimizer = CreateOptimizer(args)
    optimizer.minimize(loss)

    return {'loss': loss}
# | |||||
@flow.global_function(type='predict', function_config=GetFunctionConfig(args))
def StudentBertGlueEvalTrainJob():
    """Predict job: run the student over the un-shuffled LSTM training split.

    Returns:
        tuple: ``(student_logits, label_ids)`` for one batch.
    """
    batch = get_tensor_data(
        batch_size,
        args.train_data_part_num,
        args.train_data_dir_lstm,
        args.train_data_prefix,
        shuffle=False,
    )
    logits = student_model(
        batch['input_ids'], batch['input_mask'], batch['segment_ids'], is_train=False
    )
    return logits, batch['label_ids']
@flow.global_function(type='predict', function_config=GetFunctionConfig(args))
def StudentBertGlueEvalValJob():
    """Predict job: run the student over the un-shuffled LSTM evaluation split.

    Returns:
        tuple: ``(student_logits, label_ids)`` for one batch.
    """
    batch = get_tensor_data(
        eval_batch_size,
        args.eval_data_part_num,
        args.eval_data_dir_lstm,
        args.eval_data_prefix,
        shuffle=False,
    )
    logits = student_model(
        batch['input_ids'], batch['input_mask'], batch['segment_ids'], is_train=False
    )
    return logits, batch['label_ids']
# | |||||
def run_eval_job(eval_job_func, num_steps, desc='train'):
    """Run a predict job for ``num_steps`` batches and print the metrics.

    Args:
        eval_job_func: Job whose call result has ``.get()`` returning
            ``(logits, label)`` for one batch.
        num_steps: Number of batches to pull.
        desc: Tag printed in front of the metrics line.

    Returns:
        dict: accuracy, matthews_corrcoef, precision, recall and f1.
    """
    labels, predictions = [], []

    started = time.time()
    for _ in range(num_steps):
        logits, label = eval_job_func().get()
        # The predicted class is the arg-max over axis 1 of the logits.
        predictions.extend(list(logits.numpy().argmax(axis=1)))
        labels.extend(list(label))
    finished = time.time()
    print('cost time: {} s'.format(finished - started))

    metric_dict = {
        "accuracy": accuracy_score(labels, predictions),
        "matthews_corrcoef": matthews_corrcoef(labels, predictions),
        "precision": precision_score(labels, predictions),
        "recall": recall_score(labels, predictions),
        "f1": f1_score(labels, predictions),
    }
    print(desc, ', '.join('{}: {:.3f}'.format(k, v) for k, v in metric_dict.items()))
    return metric_dict
def getdirsize(dir):
    """Return the total size in bytes of all files below ``dir``.

    The tree is walked recursively with ``os.walk``; a directory without
    files yields 0.
    """
    return sum(
        os.path.getsize(os.path.join(root, name))
        for root, _subdirs, files in os.walk(dir)
        for name in files
    )
def main():
    """Entry point: train the LSTM student (``args.do_train``) and/or
    evaluate it (``args.do_eval``).

    Training runs ``args.num_epochs`` epochs. After every epoch the student
    is evaluated on both splits, and the checkpoint is written to
    ``args.model_save_dir`` whenever the dev metric improves.
    """
    # NOTE(review): the source this was reconstructed from had lost its
    # indentation; the nesting below was inferred from the control flow.
    flow.config.enable_debug_mode(True)
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)
    InitNodes(args)

    check_point = flow.train.CheckPoint()
    check_point.init()
    summary = Summary(args.log_dir, args)

    if not os.path.exists(args.model_save_dir):
        os.makedirs(args.model_save_dir)

    if args.do_train:
        print('Start training...')
        global_step = 0
        best_dev_acc = 0.0
        print('epoch_size:',epoch_size)
        print('args.iter_num:',args.iter_num)
        # (task list, metric key) pairs used to decide whether the dev score improved.
        selection_rules = ((acc_tasks, 'accuracy'), (mcc_tasks, 'matthews_corrcoef'))

        for epoch in range(args.num_epochs):
            # Constructed as in the original; the synchronous loop below does not use it.
            metric = Metric(desc='finetune', print_steps=args.loss_print_every_n_iter, summary=summary,
                            batch_size=batch_size, keys=['loss'])

            for step in range(epoch_size):
                loss = DistilJob().get()
                if step % 10 == 0:
                    print('step/epoch_size:{}/{}   epoch:{}'.format(step,epoch_size,epoch))
                    print('loss:',loss['loss'].mean())

            print('EvalTrainJob...')
            run_eval_job(StudentBertGlueEvalTrainJob, epoch_size, desc='train')
            print('EvalValJob...')
            result = run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')

            save_model = False
            for tasks, key in selection_rules:
                if task_name in tasks and result[key] > best_dev_acc:
                    best_dev_acc = result[key]
                    save_model = True
            # NOTE(review): this prints the current epoch's result, which is
            # not necessarily the best one so far.
            print('Best result:', result)

            if save_model:
                if not os.path.exists(args.model_save_dir):
                    os.makedirs(args.model_save_dir)
                snapshot_save_path = os.path.join(args.model_save_dir)
                print("Saving best model to {}".format(snapshot_save_path))
                check_point.save(snapshot_save_path)

        if args.save_last_snapshot:
            snapshot_save_path = args.model_save_dir
            print("Saving model to {}".format(snapshot_save_path))
            check_point.save(snapshot_save_path)

    if args.do_eval:
        print('Loading model...')
        print(args.model_save_dir)
        # After training the weights are already in the session.
        if not args.do_train:
            check_point.load(args.model_save_dir)
        print('Evaluation...')
        run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')


if __name__ == "__main__":
    main()
@@ -0,0 +1,439 @@ | |||||
""" | |||||
Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
import os | |||||
import math | |||||
import numpy as np | |||||
import sys | |||||
curPath = os.path.abspath(os.path.dirname(__file__)) | |||||
rootPath = os.path.split(curPath)[0] | |||||
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "./src"))) | |||||
import oneflow as flow | |||||
from util import Snapshot, Summary, InitNodes, Metric, CreateOptimizer, GetFunctionConfig, getdirsize, remove_optimizer_params, remove_teacher_params | |||||
import config as configs | |||||
from sklearn.metrics import accuracy_score, matthews_corrcoef, precision_score, recall_score, f1_score | |||||
import argparse | |||||
import shutil | |||||
import tempfile | |||||
from knowledge_distill_util import LSTMStudentForSequenceClassification, BertForSequenceClassification, BertStudentForSequenceClassification, soft_cross_entropy, mseloss, layer_distill, att_distill, pred_distill | |||||
import time | |||||
import json | |||||
def str2bool(v):
    """Convert a command-line token to a bool (usable as an argparse ``type=``).

    Args:
        v: Either an actual ``bool`` (returned unchanged) or a string. The
            strings 'yes', 'true', 't', 'y', '1' map to True and 'no',
            'false', 'f', 'n', '0' map to False, case-insensitively.

    Returns:
        The parsed boolean value.

    Raises:
        argparse.ArgumentTypeError: If the string is not one of the
            recognised spellings.
    """
    # A real bool (e.g. a programmatic default) has no .lower(); pass it through.
    if isinstance(v, bool):
        return v
    token = v.lower()
    if token in ('yes', 'true', 't', 'y', '1'):
        return True
    if token in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Unsupported value encountered.')
# Command-line interface. The base parser comes from config.get_parser(); options
# read later in this file but not declared here (num_nodes, gpu_num_per_node,
# seq_length, vocab_size, model_save_dir, do_train, do_eval, ...) are presumably
# declared there -- confirm against config.py.
parser = configs.get_parser()
parser.add_argument("--task_name", type=str, default='CoLA')
parser.add_argument("--teacher_model", default=None, type=str, help="The teacher model dir.")
parser.add_argument("--student_model", default=None, type=str, help="The student model dir.")
parser.add_argument("--total_model", default=None, type=str, help="The student model dir.")
parser.add_argument('--num_epochs', type=int, default=3, help='number of epochs')

# Training data: one directory for the teacher inputs, one (``*_lstm``) for the student inputs.
parser.add_argument("--train_data_dir", type=str, default=None)
parser.add_argument("--train_data_dir_lstm", type=str, default=None)
parser.add_argument("--train_data_prefix", type=str, default='train.of_record-')
parser.add_argument("--train_example_num", type=int, default=88614,
                    help="example number in dataset")
parser.add_argument("--batch_size_per_device", type=int, default=32)
parser.add_argument("--train_data_part_num", type=int, default=1,
                    help="data part number in dataset")

# Evaluation data, with the same teacher/student directory split.
parser.add_argument("--eval_data_dir", type=str, default=None)
parser.add_argument("--eval_data_dir_lstm", type=str, default=None)
parser.add_argument("--eval_data_prefix", type=str, default='eval.of_record-')
parser.add_argument("--eval_example_num", type=int, default=10833,
                    help="example number in dataset")
parser.add_argument("--eval_batch_size_per_device", type=int, default=64)
parser.add_argument("--eval_data_part_num", type=int, default=1,
                    help="data part number in dataset")
parser.add_argument("--result_dir", type=str, default="", help="the save directory of results")

# Student hyper-parameters. The defaults mirror the teacher defaults below.
parser.add_argument("--student_num_hidden_layers", type=int, default=24)
parser.add_argument("--student_num_attention_heads", type=int, default=16)
parser.add_argument("--student_max_position_embeddings", type=int, default=512)
parser.add_argument("--student_type_vocab_size", type=int, default=2)
parser.add_argument("--student_vocab_size", type=int, default=30522)
parser.add_argument("--student_attention_probs_dropout_prob", type=float, default=0.1)
parser.add_argument("--student_hidden_dropout_prob", type=float, default=0.1)
parser.add_argument("--student_hidden_size_per_head", type=int, default=64)
parser.add_argument("--student_hidden_size", type=int, default=768)
parser.add_argument("--student_seq_length", type=int, default=32, help="the max seq length for studet")

# Teacher hyper-parameters.
parser.add_argument("--teacher_num_hidden_layers", type=int, default=24)
parser.add_argument("--teacher_num_attention_heads", type=int, default=16)
parser.add_argument("--teacher_max_position_embeddings", type=int, default=512)
parser.add_argument("--teacher_type_vocab_size", type=int, default=2)
parser.add_argument("--teacher_vocab_size", type=int, default=30522)
parser.add_argument("--teacher_attention_probs_dropout_prob", type=float, default=0.1)
parser.add_argument("--teacher_hidden_dropout_prob", type=float, default=0.1)
parser.add_argument("--teacher_hidden_size_per_head", type=int, default=64)
parser.add_argument("--teacher_hidden_size", type=int, default=768)

# Distillation settings. In this script kd_alpha is the weight of the distillation
# term and (1 - kd_alpha) the weight of the hard-label cross entropy (see DistilJob).
parser.add_argument("--kd_alpha", type=float, default=0.1)
parser.add_argument('--temperature', type=float, default=1.)

# NOTE: with nargs='?' and const=False, passing the bare flag (no value) yields
# False; an explicit value such as "--aug_train true" is needed to enable it.
# When the flag is omitted entirely the value is None (no default is given).
parser.add_argument('--aug_train', type=str2bool, nargs='?', const=False, help='using augmented training set?')
parser.add_argument('--serve_for_online', type=str2bool, nargs='?', const=False, help='if serve for online, then after training, will delete the teacher params and optimizer parmas from model_save_dir')

# Parsing happens at import time; every function below reads this module-level ``args``.
args = parser.parse_args()
# Lower-cased task name; it keys the GLUE lookup tables defined below.
task_name = args.task_name.lower()

# With --aug_train, point the training directory at its "train_aug" counterpart.
if args.aug_train:
    args.train_data_dir = args.train_data_dir.replace('train','train_aug')

# Global batch sizes: nodes x GPUs per node x per-device batch size.
batch_size = args.num_nodes * args.gpu_num_per_node * args.batch_size_per_device
eval_batch_size = args.num_nodes * args.gpu_num_per_node * args.eval_batch_size_per_device

# Steps needed to cover each dataset once (the last batch may be partial).
epoch_size = math.ceil(args.train_example_num / batch_size)
num_eval_steps = math.ceil(args.eval_example_num / eval_batch_size)
args.iter_num = epoch_size * args.num_epochs

configs.print_args(args)

# Every GLUE task is a classification task except STS-B, which is a regression task.
glue_output_modes = {
    name: ("regression" if name == "sts-b" else "classification")
    for name in (
        "cola", "mnli", "mnli-mm", "mrpc", "sst-2",
        "sts-b", "qqp", "qnli", "rte", "wnli",
    )
}

# Which dev metric main() uses to pick the best checkpoint for each task.
acc_tasks = ["mnli", "mrpc", "sst-2", "qqp", "qnli", "rte"]
corr_tasks = ["sts-b"]
mcc_tasks = ["cola"]

output_mode = glue_output_modes[task_name]
def BertDecoder(
    data_dir, batch_size, data_part_num, seq_length, part_name_prefix, shuffle=True
):
    """Read OFRecord files and decode the BERT input fields.

    Args:
        data_dir: Directory holding the OFRecord part files.
        batch_size: Number of records per batch.
        data_part_num: Number of part files in ``data_dir``.
        seq_length: Length of the ``input_ids``/``input_mask``/``segment_ids`` fields.
        part_name_prefix: File-name prefix of the part files.
        shuffle: Passed to the reader as both ``random_shuffle`` and
            ``shuffle_after_epoch``.

    Returns:
        Dict mapping each field name to its decoded int32 blob.
    """
    # (field name, per-example shape), in decode order.
    field_shapes = (
        ("input_ids", [seq_length]),
        ("input_mask", [seq_length]),
        ("segment_ids", [seq_length]),
        ("label_ids", [1]),
        ("is_real_example", [1]),
    )
    # The reader and the decoders are placed on CPU device "0:0".
    with flow.scope.placement("cpu", "0:0"):
        records = flow.data.ofrecord_reader(
            data_dir,
            batch_size=batch_size,
            data_part_num=data_part_num,
            part_name_prefix=part_name_prefix,
            random_shuffle=shuffle,
            shuffle_after_epoch=shuffle,
        )
        return {
            field: flow.data.OFRecordRawDecoder(records, field, shape=shape, dtype=flow.int32)
            for field, shape in field_shapes
        }
def get_tensor_data(
    batch_size,
    data_part_num,
    data_dir,
    part_name_prefix,
    seq_length,
    shuffle=True
):
    """Return the decoded input blobs for one dataset.

    Thin wrapper that forwards its arguments to ``BertDecoder`` in that
    function's own parameter order and returns its result unchanged.
    """
    return BertDecoder(
        data_dir,
        batch_size,
        data_part_num,
        seq_length,
        part_name_prefix,
        shuffle=shuffle,
    )
def student_model(input_ids, input_mask, segment_ids,is_train=True):
    """Build the LSTM student classifier and return its output.

    Args:
        input_ids: Token-id blob.
        input_mask: Attention-mask blob.
        segment_ids: Token-type-id blob.
        is_train: Forwarded to the model builder.

    Returns:
        Whatever ``LSTMStudentForSequenceClassification`` returns (used as logits by callers).
    """
    # Sizes come from the --student_* options; the intermediate size is fixed at 400.
    return LSTMStudentForSequenceClassification(
        input_ids_blob=input_ids,
        input_mask_blob=input_mask,
        token_type_ids_blob=segment_ids,
        label_blob=None,
        vocab_size=args.student_vocab_size,
        seq_length=args.student_seq_length,
        hidden_size=args.student_hidden_size,
        intermediate_size=400,
        num_hidden_layers=args.student_num_hidden_layers,
        is_student=True,
        is_train=is_train,
    )
def teacher_model(input_ids,input_mask,segment_ids,is_train):
    """Build the BERT teacher classifier.

    Side effect: overwrites ``args.teacher_hidden_size_per_head`` with
    ``teacher_hidden_size / teacher_num_attention_heads`` (true division).

    Returns:
        Tuple ``(logits, reps, atts)`` as produced by ``BertForSequenceClassification``.
    """
    width = args.teacher_hidden_size
    args.teacher_hidden_size_per_head = width / args.teacher_num_attention_heads

    # vocab_size and seq_length are read from the generic --vocab_size/--seq_length
    # options; the remaining sizes come from the --teacher_* options.
    bert_kwargs = dict(
        input_ids_blob=input_ids,
        input_mask_blob=input_mask,
        token_type_ids_blob=segment_ids,
        label_blob=None,
        vocab_size=args.vocab_size,
        seq_length=args.seq_length,
        hidden_size=width,
        num_hidden_layers=args.teacher_num_hidden_layers,
        num_attention_heads=args.teacher_num_attention_heads,
        intermediate_size=width * 4,
        hidden_act="gelu",
        hidden_dropout_prob=args.teacher_hidden_dropout_prob,
        attention_probs_dropout_prob=args.teacher_attention_probs_dropout_prob,
        max_position_embeddings=args.teacher_max_position_embeddings,
        type_vocab_size=args.teacher_type_vocab_size,
        initializer_range=0.02,
        is_student=False,
        is_train=is_train,
    )
    logits, reps, atts = BertForSequenceClassification(**bert_kwargs)
    return logits, reps, atts
@flow.global_function(type='train', function_config=GetFunctionConfig(args))
def DistilJob():
    """Training job: distil the BERT teacher's predictions into the LSTM student.

    The loss is ``(1 - kd_alpha) * hard-label cross entropy + kd_alpha * pred_distill``.

    Returns:
        Dict with the single key ``'loss'``.

    Raises:
        NotImplementedError: If the task's output mode is not "classification"
            (the regression loss has not been implemented).
    """
    # Teacher and student read from separate readers (different directories and
    # sequence lengths). Shuffling is disabled on both, presumably so that the
    # two streams stay aligned example-by-example -- confirm.
    train_dataset = get_tensor_data(
        batch_size,
        args.train_data_part_num,
        args.train_data_dir,
        args.train_data_prefix,
        args.seq_length,
        False
    )
    train_dataset_lstm = get_tensor_data(
        batch_size,
        args.train_data_part_num,
        args.train_data_dir_lstm,
        args.train_data_prefix,
        args.student_seq_length,
        False
    )
    student_logits = student_model(train_dataset_lstm['input_ids'], train_dataset_lstm['input_mask'], train_dataset_lstm['segment_ids'],is_train=True)
    # Only the teacher's logits are used; its hidden states and attentions are ignored.
    teacher_logits, _teacher_reps, _teacher_atts = teacher_model(train_dataset['input_ids'], train_dataset['input_mask'], train_dataset['segment_ids'],is_train=False)

    if output_mode == "classification":
        cls_loss = pred_distill(args, student_logits, teacher_logits)
    else:
        # TODO: regression (STS-B), e.g. an MSE loss between student logits and labels.
        # Fail explicitly: previously this branch left ``cls_loss`` unbound and the
        # job died later with an UnboundLocalError.
        raise NotImplementedError(
            "Distillation for output mode '{}' is not implemented.".format(output_mode)
        )

    loss_ce = flow.nn.sparse_softmax_cross_entropy_with_logits(
        logits=student_logits, labels=train_dataset_lstm['label_ids']
    )
    loss = loss_ce * (1-args.kd_alpha) + cls_loss * args.kd_alpha
    flow.losses.add_loss(loss)
    opt = CreateOptimizer(args)
    opt.minimize(loss)
    return {'loss': loss}
# | |||||
@flow.global_function(type='predict', function_config=GetFunctionConfig(args))
def StudentBertGlueEvalTrainJob():
    """Predict job: student output and labels for one unshuffled batch of the student training data."""
    batch = get_tensor_data(
        batch_size=batch_size,
        data_part_num=args.train_data_part_num,
        data_dir=args.train_data_dir_lstm,
        part_name_prefix=args.train_data_prefix,
        seq_length=args.student_seq_length,
        shuffle=False,
    )
    logits = student_model(
        batch['input_ids'], batch['input_mask'], batch['segment_ids'], is_train=False
    )
    return logits, batch['label_ids']
@flow.global_function(type='predict', function_config=GetFunctionConfig(args))
def StudentBertGlueEvalValJob():
    """Predict job: student output and labels for one unshuffled batch of the student eval data."""
    batch = get_tensor_data(
        batch_size=eval_batch_size,
        data_part_num=args.eval_data_part_num,
        data_dir=args.eval_data_dir_lstm,
        part_name_prefix=args.eval_data_prefix,
        seq_length=args.student_seq_length,
        shuffle=False,
    )
    logits = student_model(
        batch['input_ids'], batch['input_mask'], batch['segment_ids'], is_train=False
    )
    return logits, batch['label_ids']
# | |||||
def run_eval_job(eval_job_func, num_steps, desc='train'):
    """Run a predict job for ``num_steps`` batches, then score and report the results.

    Args:
        eval_job_func: Job function whose ``.get()`` result unpacks to ``(logits, label)``.
        num_steps: Number of batches to evaluate.
        desc: Tag used in the printed summary and in the result file name
            (``results_<desc>.json``).

    Returns:
        Dict with the float metrics ``accuracy``, ``matthews_corrcoef``,
        ``precision``, ``recall`` and ``f1``.

    Side effects:
        Writes a JSON summary into ``args.result_dir``; when that option is
        empty it is set to ``args.model_save_dir`` first.
    """
    labels = []
    predictions = []
    start_time = time.time()
    for _ in range(num_steps):
        logits, label = eval_job_func().get()
        predictions.extend(list(logits.numpy().argmax(axis=1)))
        labels.extend(list(label))
    cost_time = time.time() - start_time
    print('cost time: {} s'.format(cost_time))

    model_size = getdirsize(args.model_save_dir)
    print('model_size: %d Mbytes' % (model_size / 1024 / 1024))  # Mbytes

    # NOTE: precision/recall/F1 use scikit-learn's default average='binary',
    # which only supports two-class targets; a multi-class task needs an
    # explicit ``average=`` choice here.
    accuracy = accuracy_score(labels, predictions)
    mcc = matthews_corrcoef(labels, predictions)
    precision = precision_score(labels, predictions)
    recall = recall_score(labels, predictions)
    f_1 = f1_score(labels, predictions)

    # Throughput in samples/second, based on the samples actually scored in this
    # call (the old code always used the eval-set size, even for the train set)
    # and guarded against a zero elapsed time.
    samples_per_second = len(predictions) / cost_time if cost_time > 0 else 0.0

    save_dict = {"accuracy":"%.2f" % accuracy,
                 "MCC":"%.2f" % mcc,
                 "precision": "%.2f" % precision,
                 "recall": "%.2f" % recall,
                 "f_1": "%.2f" % f_1,
                 "modelSize":"%d" % (model_size/1024/1024),
                 "reasoningTime":"%.2f" % samples_per_second}  # sample/second

    if args.result_dir == "":
        args.result_dir = args.model_save_dir
    os.makedirs(args.result_dir, exist_ok=True)
    with open(os.path.join(args.result_dir, 'results_{}.json'.format(desc)), "w") as f:
        json.dump(save_dict, f)

    metric_dict = {
        "accuracy": accuracy,
        "matthews_corrcoef": mcc,
        "precision": precision,
        "recall": recall,
        "f1": f_1,
    }
    print(desc, ', '.join('{}: {:.3f}'.format(k, v) for k, v in metric_dict.items()))
    return metric_dict
def main():
    """Run training and/or evaluation as selected by the command-line flags.

    With ``args.do_train``: load the checkpoint at ``args.teacher_model``, run
    ``DistilJob`` for ``args.num_epochs`` epochs, evaluate the student on the
    train and eval data after each epoch, and save a checkpoint to
    ``args.model_save_dir`` whenever the tracked eval metric improves.
    With ``args.do_eval``: evaluate the student on the eval data, loading the
    checkpoint from ``args.model_save_dir`` first if no training ran.
    """
    # NOTE(review): the nesting of the blocks below was reconstructed from a
    # listing whose indentation was lost -- confirm against the original file.
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)
    InitNodes(args)
    check_point = flow.train.CheckPoint()
    check_point.init()
    summary = Summary(args.log_dir, args)
    if not os.path.exists(args.model_save_dir):
        os.makedirs(args.model_save_dir)
    import shutil
    if args.do_train:
        print('Loading model...')
        check_point.load(args.teacher_model)
        print('Start training...')
        global_step = 0
        # Best eval score so far: accuracy for acc_tasks, Matthews correlation for mcc_tasks.
        best_dev_acc = 0.0
        print('epoch_size:',epoch_size)
        print('args.iter_num:',args.iter_num)
        for epoch in range(args.num_epochs):
            # Created every epoch, but the callbacks that used it are disabled below.
            metric = Metric(desc='finetune', print_steps=args.loss_print_every_n_iter, summary=summary,
                            batch_size=batch_size, keys=['loss'])
            for step in range(epoch_size):
                loss = DistilJob().get()
                if step % 10 == 0:
                    print('step/epoch_size:{}/{} epoch:{}'.format(step,epoch_size,epoch))
                    print('loss:',loss['loss'].mean())
                # Disabled alternatives kept from the original:
                # DistilJob().async_get(metric.metric_cb(step, epoch=epoch))
                # DistilJob().get(metric.metric_cb(step))
                # (periodic "snapshot_<global_step>" saving was also disabled here)

            print('EvalTrainJob...')
            run_eval_job(StudentBertGlueEvalTrainJob, epoch_size, desc='train')
            print('EvalValJob...')
            result = run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')

            # Tasks in neither acc_tasks nor mcc_tasks never set save_model here.
            save_model = False
            if task_name in acc_tasks and result['accuracy'] > best_dev_acc:
                best_dev_acc = result['accuracy']
                save_model = True
            # if task_name in corr_tasks and result['corr'] > best_dev_acc:
            #     best_dev_acc = result['corr']
            #     save_model = True
            if task_name in mcc_tasks and result['matthews_corrcoef'] > best_dev_acc:
                best_dev_acc = result['matthews_corrcoef']
                save_model = True
            # Prints the metrics of the current epoch, which are not necessarily the best ones.
            print('Best result:', result)
            if save_model:
                # Replace the whole save directory with the new best checkpoint.
                if os.path.exists(args.model_save_dir):
                    import shutil
                    shutil.rmtree(args.model_save_dir)
                if not os.path.exists(args.model_save_dir):
                    os.makedirs(args.model_save_dir)
                snapshot_save_path = os.path.join(args.model_save_dir)
                print("Saving best model to {}".format(snapshot_save_path))
                check_point.save(snapshot_save_path)
                flow.sync_default_session()

        if args.save_last_snapshot:
            # Removes the save directory (including any best checkpoint saved
            # above) and stores the final weights in its place.
            snapshot_save_path = args.model_save_dir
            if os.path.exists(args.model_save_dir):
                import shutil
                shutil.rmtree(args.model_save_dir)
            print("Saving model to {}".format(snapshot_save_path))
            check_point.save(snapshot_save_path)
            flow.sync_default_session()

    if args.serve_for_online:
        print('Deleting the teacher params and the optimizer parmas from model_save_dir...')
        remove_teacher_params(args.model_save_dir)

    if args.do_eval:
        print('Loading model...')
        print(args.model_save_dir)
        # After training the weights are already in the session; load only for eval-only runs.
        if not args.do_train:
            check_point.load(args.model_save_dir)
        print('Evaluation...')
        run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')
        # if args.save_last_snapshot:
        #     snapshot.save("last_snapshot")


# Script entry point: call main() only when this file is executed directly.
if __name__ == "__main__":
    main()
@@ -0,0 +1,62 @@ | |||||
# Knowledge Distillation | |||||
["Distilling the knowledge in a neural network"](https://arxiv.org/abs/1503.02531)论文的实现 | |||||
KD的思路是使用教师模型的softmax层输出logits作为“soft target”,使得student模型可以学习teacher模型的输出,达到student模型模仿teacher模型在预测层的表现的目的。 | |||||
L_KD = αL_CE+(1-α)L_DS | |||||
- L_CE 为学生模型的输出logits和label的交叉熵。 | |||||
- L_DS 为学生模型输出logits和教师模型输出logits的距离,比如可以用软softmax或者KL散度等计算。 | |||||
- α用来调节两个loss的权重。 | |||||
## 1. 依赖 | |||||
- Python 3.6 | |||||
- oneflow-cu101 0.1.10 | |||||
完整的环境可以通过以下命令安装: | |||||
```bash | |||||
conda create -n tinybert python=3.6 | |||||
``` | |||||
```bash | |||||
python3 -m pip install --find-links https://oneflow-inc.github.io/nightly oneflow_cu101 --user | |||||
``` | |||||
> 注:以下操作时,根目录为`model_compress/distil` | |||||
## 2. 数据获取 | |||||
如何获取数据请查阅[这里](../../README.md#22-数据获取) | |||||
## 3. 微调教师模型 | |||||
如何微调教师模型请查阅[这里](../../README.md#23-微调教师模型) | |||||
## 4. 蒸馏到学生模型 | |||||
### 4.1 训练 | |||||
执行以下脚本将教师模型蒸馏到学生模型: | |||||
- DATA_ROOT: GLUE数据集总路径 | |||||
- dataset: 任务名 | |||||
- FT_BERT_BASE_DIR: 在特定任务上微调过的教师模型路径 | |||||
- STUDENT_DIR: 学生模型保存路径 | |||||
- RESULT_DIR: 测试结果json文件保存路径 (如果RESULT_DIR="",则默认保存到模型保存路径下的results_eval.json)
- SERVE_FOR_ONLINE: 模型是否用于上线 (默认SERVE_FOR_ONLINE='False',如果SERVE_FOR_ONLINE='True',则删除清理模型保存路径中的无关变量,如教师模型参数和优化器参数等等) | |||||
```bash | |||||
bash run_train_student_kd.sh | |||||
``` | |||||
### 4.2 测试 | |||||
执行以下脚本进行测试: | |||||
- DATA_ROOT: GLUE数据集总路径 | |||||
- dataset: 任务名 | |||||
- STUDENT_DIR: 学生模型保存路径 | |||||
- RESULT_DIR: 测试结果json文件保存路径 (如果RESULT_DIR="",则默认保存到模型保存路径下的results_eval.json)
蒸馏过的学生模型下载链接如下(SST-2数据集): | |||||
下载链接: https://pan.baidu.com/s/1EgQyQgxAcFAG8Ch3-4VPaw 提取码: 5k9p | |||||
```bash | |||||
bash run_eval_student_kd.sh | |||||
``` | |||||
### 4.3 结果 | |||||
在SST-2 DEV数据集上: | |||||
- 模型精度:教师模型acc 92.2% ->学生模型acc 80.5% | |||||
- 模型尺寸:教师模型110M -> 学生模型 14.5M (↓7.5x) | |||||
- 推理耗时:教师模型4.04s -> 学生模型0.81s (↓5.0x)
@@ -0,0 +1,498 @@ | |||||
""" | |||||
Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
import os | |||||
import math | |||||
import numpy as np | |||||
import sys | |||||
curPath = os.path.abspath(os.path.dirname(__file__)) | |||||
rootPath = os.path.split(curPath)[0] | |||||
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "./src"))) | |||||
import oneflow as flow | |||||
import oneflow.typing as tp | |||||
from typing import Tuple, Any | |||||
from classifier import GlueBERT | |||||
from util import Snapshot, Summary, InitNodes, Metric, CreateOptimizer, GetFunctionConfig, getdirsize, \ | |||||
remove_optimizer_params, remove_teacher_params | |||||
import config as configs | |||||
from sklearn.metrics import accuracy_score, matthews_corrcoef, precision_score, recall_score, f1_score | |||||
import argparse | |||||
import shutil | |||||
import tempfile | |||||
from knowledge_distill_util import BertForSequenceClassification, BertStudentForSequenceClassification, \ | |||||
soft_cross_entropy, mseloss, layer_distill, att_distill, pred_distill | |||||
import time | |||||
import json | |||||
def str2bool(v):
    """Convert a command-line token to a bool (usable as an argparse ``type=``).

    Args:
        v: Either an actual ``bool`` (returned unchanged) or a string. The
            strings 'yes', 'true', 't', 'y', '1' map to True and 'no',
            'false', 'f', 'n', '0' map to False, case-insensitively.

    Returns:
        The parsed boolean value.

    Raises:
        argparse.ArgumentTypeError: If the string is not one of the
            recognised spellings.
    """
    # A real bool (e.g. a programmatic default) has no .lower(); pass it through.
    if isinstance(v, bool):
        return v
    token = v.lower()
    if token in ('yes', 'true', 't', 'y', '1'):
        return True
    if token in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Unsupported value encountered.')
# Command-line interface. The base parser comes from config.get_parser(); options
# read later in this file but not declared here (num_nodes, gpu_num_per_node,
# seq_length, vocab_size, hidden_size, ...) are presumably declared there --
# confirm against config.py.
parser = configs.get_parser()
parser.add_argument("--task_name", type=str, default='CoLA')
parser.add_argument("--teacher_model", default=None, type=str, help="The teacher model dir.")
parser.add_argument("--student_model", default=None, type=str, help="The student model dir.")
parser.add_argument("--total_model", default=None, type=str, help="The student model dir.")
parser.add_argument('--num_epochs', type=int, default=3, help='number of epochs')

# Training data.
parser.add_argument("--train_data_dir", type=str, default=None)
parser.add_argument("--train_data_prefix", type=str, default='train.of_record-')
parser.add_argument("--train_example_num", type=int, default=88614,
                    help="example number in dataset")
parser.add_argument("--batch_size_per_device", type=int, default=32)
parser.add_argument("--train_data_part_num", type=int, default=1,
                    help="data part number in dataset")

# Evaluation data.
parser.add_argument("--eval_data_dir", type=str, default=None)
parser.add_argument("--eval_data_prefix", type=str, default='eval.of_record-')
parser.add_argument("--eval_example_num", type=int, default=10833,
                    help="example number in dataset")
parser.add_argument("--eval_batch_size_per_device", type=int, default=64)
parser.add_argument("--eval_data_part_num", type=int, default=1,
                    help="data part number in dataset")
parser.add_argument("--result_dir", type=str, default="", help="the save directory of results")

# Student hyper-parameters. The defaults mirror the teacher defaults below.
parser.add_argument("--student_num_hidden_layers", type=int, default=24)
parser.add_argument("--student_num_attention_heads", type=int, default=16)
parser.add_argument("--student_max_position_embeddings", type=int, default=512)
parser.add_argument("--student_type_vocab_size", type=int, default=2)
parser.add_argument("--student_vocab_size", type=int, default=30522)
parser.add_argument("--student_attention_probs_dropout_prob", type=float, default=0.1)
parser.add_argument("--student_hidden_dropout_prob", type=float, default=0.1)
parser.add_argument("--student_hidden_size_per_head", type=int, default=64)
parser.add_argument("--student_hidden_size", type=int, default=768)

# Teacher hyper-parameters.
parser.add_argument("--teacher_num_hidden_layers", type=int, default=24)
parser.add_argument("--teacher_num_attention_heads", type=int, default=16)
parser.add_argument("--teacher_max_position_embeddings", type=int, default=512)
parser.add_argument("--teacher_type_vocab_size", type=int, default=2)
parser.add_argument("--teacher_vocab_size", type=int, default=30522)
parser.add_argument("--teacher_attention_probs_dropout_prob", type=float, default=0.1)
parser.add_argument("--teacher_hidden_dropout_prob", type=float, default=0.1)
parser.add_argument("--teacher_hidden_size_per_head", type=int, default=64)
parser.add_argument("--teacher_hidden_size", type=int, default=768)

# Distillation settings. In this script kd_alpha is the weight of the hard-label
# cross entropy and (1 - kd_alpha) the weight of the distillation term (see DistilJob).
parser.add_argument("--kd_alpha", type=float, default=0.9)
parser.add_argument('--temperature', type=float, default=1.)

# NOTE: with nargs='?' and const=False, passing the bare flag (no value) yields
# False; an explicit value such as "--aug_train true" is needed to enable it.
# When the flag is omitted entirely the value is None (no default is given).
parser.add_argument('--aug_train', type=str2bool, nargs='?', const=False, help='using augmented training set?')
parser.add_argument('--serve_for_online', type=str2bool, nargs='?', const=False,
                    help='if serve for online, then after training, will delete the teacher params and optimizer parmas from model_save_dir')

# Parsing happens at import time; every function below reads this module-level ``args``.
args = parser.parse_args()
# Lower-cased task name; it keys the GLUE lookup tables defined below.
task_name = args.task_name.lower()

# With --aug_train, point the training directory at its "train_aug" counterpart.
if args.aug_train:
    args.train_data_dir = args.train_data_dir.replace('train', 'train_aug')

# Global batch sizes: nodes x GPUs per node x per-device batch size.
batch_size = args.num_nodes * args.gpu_num_per_node * args.batch_size_per_device
eval_batch_size = args.num_nodes * args.gpu_num_per_node * args.eval_batch_size_per_device

# Steps needed to cover each dataset once (the last batch may be partial).
epoch_size = math.ceil(args.train_example_num / batch_size)
num_eval_steps = math.ceil(args.eval_example_num / eval_batch_size)
args.iter_num = epoch_size * args.num_epochs

configs.print_args(args)

# Every GLUE task is a classification task except STS-B, which is a regression task.
glue_output_modes = {
    name: ("regression" if name == "sts-b" else "classification")
    for name in (
        "cola", "mnli", "mnli-mm", "mrpc", "sst-2",
        "sts-b", "qqp", "qnli", "rte", "wnli",
    )
}

# Task groups by evaluation metric (accuracy, correlation, Matthews correlation).
acc_tasks = ["mnli", "mrpc", "sst-2", "qqp", "qnli", "rte"]
corr_tasks = ["sts-b"]
mcc_tasks = ["cola"]

output_mode = glue_output_modes[task_name]
def BertDecoder(
    data_dir, batch_size, data_part_num, seq_length, part_name_prefix, shuffle=True
):
    """Read OFRecord files and decode the BERT input fields.

    Args:
        data_dir: Directory holding the OFRecord part files.
        batch_size: Number of records per batch.
        data_part_num: Number of part files in ``data_dir``.
        seq_length: Length of the ``input_ids``/``input_mask``/``segment_ids`` fields.
        part_name_prefix: File-name prefix of the part files.
        shuffle: Passed to the reader as both ``random_shuffle`` and
            ``shuffle_after_epoch``.

    Returns:
        Dict mapping each field name to its decoded int32 blob.
    """
    # (field name, per-example shape), in decode order.
    field_shapes = (
        ("input_ids", [seq_length]),
        ("input_mask", [seq_length]),
        ("segment_ids", [seq_length]),
        ("label_ids", [1]),
        ("is_real_example", [1]),
    )
    # The reader and the decoders are placed on CPU device "0:0".
    with flow.scope.placement("cpu", "0:0"):
        records = flow.data.ofrecord_reader(
            data_dir,
            batch_size=batch_size,
            data_part_num=data_part_num,
            part_name_prefix=part_name_prefix,
            random_shuffle=shuffle,
            shuffle_after_epoch=shuffle,
        )
        return {
            field: flow.data.OFRecordRawDecoder(records, field, shape=shape, dtype=flow.int32)
            for field, shape in field_shapes
        }
def get_tensor_data(
        batch_size,
        data_part_num,
        data_dir,
        part_name_prefix,
        shuffle=True
):
    """Return the decoded input blobs for one dataset.

    Thin wrapper around ``BertDecoder`` that always uses ``args.seq_length``
    as the sequence length and returns the decoder dict unchanged.
    """
    return BertDecoder(
        data_dir,
        batch_size,
        data_part_num,
        args.seq_length,
        part_name_prefix,
        shuffle=shuffle,
    )
def BuildBert(
        batch_size,
        data_part_num,
        data_dir,
        part_name_prefix,
        shuffle=True
):
    """Build a ``GlueBERT`` classifier on top of its own data reader.

    Side effect: overwrites ``args.hidden_size_per_head`` with
    ``hidden_size / num_attention_heads`` (true division).

    Returns:
        Tuple ``(loss, logits, label_ids)``.
    """
    width = args.hidden_size
    args.hidden_size_per_head = width / args.num_attention_heads

    batch = BertDecoder(
        data_dir, batch_size, data_part_num, args.seq_length, part_name_prefix, shuffle=shuffle
    )
    label_ids = batch['label_ids']

    # The intermediate size is fixed at 1200 instead of the usual 4 * hidden_size.
    loss, logits = GlueBERT(
        batch['input_ids'],
        batch['input_mask'],
        batch['segment_ids'],
        label_ids,
        args.vocab_size,
        seq_length=args.seq_length,
        hidden_size=width,
        num_hidden_layers=args.num_hidden_layers,
        num_attention_heads=args.num_attention_heads,
        intermediate_size=1200,
        hidden_act="gelu",
        hidden_dropout_prob=args.hidden_dropout_prob,
        attention_probs_dropout_prob=args.attention_probs_dropout_prob,
        max_position_embeddings=args.max_position_embeddings,
        type_vocab_size=args.type_vocab_size,
        initializer_range=0.02,
    )
    return loss, logits, label_ids
def student_model(input_ids, input_mask, segment_ids, is_train=True):
    """Build the BERT student classifier.

    Side effect: overwrites ``args.student_hidden_size_per_head`` with
    ``student_hidden_size / student_num_attention_heads`` (true division).

    Returns:
        Tuple ``(logits, reps, atts)`` as produced by
        ``BertStudentForSequenceClassification``.
    """
    width = args.student_hidden_size
    args.student_hidden_size_per_head = width / args.student_num_attention_heads

    # The intermediate size is fixed at 1200 instead of 4 * hidden_size, and
    # fit_size is set to the teacher's hidden size.
    student_kwargs = dict(
        input_ids_blob=input_ids,
        input_mask_blob=input_mask,
        token_type_ids_blob=segment_ids,
        label_blob=None,
        vocab_size=args.student_vocab_size,
        seq_length=args.seq_length,
        hidden_size=width,
        num_hidden_layers=args.student_num_hidden_layers,
        num_attention_heads=args.student_num_attention_heads,
        intermediate_size=1200,
        hidden_act="gelu",
        hidden_dropout_prob=args.student_hidden_dropout_prob,
        attention_probs_dropout_prob=args.student_attention_probs_dropout_prob,
        max_position_embeddings=args.student_max_position_embeddings,
        type_vocab_size=args.student_type_vocab_size,
        initializer_range=0.02,
        is_student=True,
        fit_size=args.teacher_hidden_size,
        is_train=is_train,
    )
    logits, reps, atts = BertStudentForSequenceClassification(**student_kwargs)
    return logits, reps, atts
def teacher_model(input_ids, input_mask, segment_ids, is_train):
    """Build the BERT teacher classifier.

    Side effect: overwrites ``args.teacher_hidden_size_per_head`` with
    ``teacher_hidden_size / teacher_num_attention_heads`` (true division).

    Returns:
        Tuple ``(logits, reps, atts)`` as produced by ``BertForSequenceClassification``.
    """
    width = args.teacher_hidden_size
    args.teacher_hidden_size_per_head = width / args.teacher_num_attention_heads

    # vocab_size and seq_length are read from the generic --vocab_size/--seq_length
    # options; the remaining sizes come from the --teacher_* options.
    bert_kwargs = dict(
        input_ids_blob=input_ids,
        input_mask_blob=input_mask,
        token_type_ids_blob=segment_ids,
        label_blob=None,
        vocab_size=args.vocab_size,
        seq_length=args.seq_length,
        hidden_size=width,
        num_hidden_layers=args.teacher_num_hidden_layers,
        num_attention_heads=args.teacher_num_attention_heads,
        intermediate_size=width * 4,
        hidden_act="gelu",
        hidden_dropout_prob=args.teacher_hidden_dropout_prob,
        attention_probs_dropout_prob=args.teacher_attention_probs_dropout_prob,
        max_position_embeddings=args.teacher_max_position_embeddings,
        type_vocab_size=args.teacher_type_vocab_size,
        initializer_range=0.02,
        is_student=False,
        is_train=is_train,
    )
    logits, reps, atts = BertForSequenceClassification(**bert_kwargs)
    return logits, reps, atts
def watch_handler(y: tp.Numpy):
    """Debug callback: print the received value with an "out:" prefix."""
    label = "out:"
    print(label, y)
@flow.global_function(type='train', function_config=GetFunctionConfig(args))
def DistilJob():
    """Training job: distil the BERT teacher's predictions into the BERT student.

    Teacher and student consume the same batch. The loss is
    ``kd_alpha * hard-label cross entropy + (1 - kd_alpha) * pred_distill``.

    Returns:
        Dict with the single key ``'loss'``.

    Raises:
        NotImplementedError: If the task's output mode is not "classification"
            (the regression loss has not been implemented).
    """
    train_dataset = get_tensor_data(
        batch_size,
        args.train_data_part_num,
        args.train_data_dir,
        args.train_data_prefix,
    )
    # Only the logits are used for this loss; hidden states and attentions are ignored.
    student_logits, _student_reps, _student_atts = student_model(
        train_dataset['input_ids'], train_dataset['input_mask'],
        train_dataset['segment_ids'], is_train=True)
    teacher_logits, _teacher_reps, _teacher_atts = teacher_model(
        train_dataset['input_ids'], train_dataset['input_mask'],
        train_dataset['segment_ids'], is_train=False)

    if output_mode == "classification":
        cls_loss = pred_distill(args, student_logits, teacher_logits)
    else:
        # TODO: regression (STS-B), e.g. an MSE loss between student logits and labels.
        # Fail explicitly: previously this branch left ``cls_loss`` unbound and the
        # job died later with an UnboundLocalError.
        raise NotImplementedError(
            "Distillation for output mode '{}' is not implemented.".format(output_mode)
        )

    loss_ce = flow.nn.sparse_softmax_cross_entropy_with_logits(
        logits=student_logits, labels=train_dataset['label_ids']
    )
    # Debug hooks:
    # flow.watch(student_logits, watch_handler)
    # flow.watch(train_dataset['label_ids'], watch_handler)
    loss = loss_ce * args.kd_alpha + cls_loss * (1 - args.kd_alpha)
    flow.losses.add_loss(loss)
    opt = CreateOptimizer(args)
    opt.minimize(loss)
    return {'loss': loss}
# | |||||
@flow.global_function(type='predict', function_config=GetFunctionConfig(args))
def StudentBertGlueEvalTrainJob():
    """Prediction job: run the student on the (unshuffled) training data.

    Returns:
        tuple: ``(student_logits, label_ids)`` for one training batch.
    """
    batch = get_tensor_data(
        batch_size,
        args.train_data_part_num,
        args.train_data_dir,
        args.train_data_prefix,
        shuffle=False
    )
    # student_model yields (logits, reps, atts); only the logits are needed here.
    logits, _reps, _atts = student_model(
        batch['input_ids'],
        batch['input_mask'],
        batch['segment_ids'],
        is_train=False,
    )
    return logits, batch['label_ids']
@flow.global_function(type='predict', function_config=GetFunctionConfig(args))
def StudentBertGlueEvalValJob():
    """Prediction job: run the student on the (unshuffled) evaluation data.

    Returns:
        tuple: ``(student_logits, label_ids)`` for one evaluation batch.
    """
    # 8551 or 1042
    batch = get_tensor_data(
        eval_batch_size,
        args.eval_data_part_num,
        args.eval_data_dir,
        args.eval_data_prefix,
        shuffle=False
    )
    # student_model yields (logits, reps, atts); only the logits are needed here.
    logits, _reps, _atts = student_model(
        batch['input_ids'],
        batch['input_mask'],
        batch['segment_ids'],
        is_train=False,
    )
    return logits, batch['label_ids']
def run_eval_job(eval_job_func, num_steps, desc='train'):
    """Run a prediction job for ``num_steps`` batches and report metrics.

    Args:
        eval_job_func: OneFlow job; ``eval_job_func().get()`` must yield a
            ``(logits, label)`` pair for one batch.
        num_steps: number of batches to evaluate.
        desc: tag used in the printed summary and in the result file name
            (``results_<desc>.json``).

    Returns:
        dict: float scores under the keys ``accuracy``, ``matthews_corrcoef``,
        ``precision``, ``recall`` and ``f1``.

    Side effects:
        Writes ``results_<desc>.json`` into ``args.result_dir``. When
        ``args.result_dir`` is empty it is set to ``args.model_save_dir``.
    """
    labels = []
    predictions = []
    start_time = time.time()
    for _ in range(num_steps):
        logits, label = eval_job_func().get()
        predictions.extend(list(logits.numpy().argmax(axis=1)))
        labels.extend(list(label))
    cost_time = time.time() - start_time
    print('cost time: {} s'.format(cost_time))

    model_size = getdirsize(args.model_save_dir)
    print('model_size: %d Mbytes' % (model_size / 1024 / 1024))  # Mbytes

    # Computed once and shared by the JSON report and the returned dict.
    # NOTE(review): the precision/recall/f1 calls use sklearn's default
    # averaging, which is binary-only; multi-class tasks such as MNLI would
    # need an explicit ``average=`` -- confirm before enabling those tasks.
    metric_dict = {
        "accuracy": accuracy_score(labels, predictions),
        "matthews_corrcoef": matthews_corrcoef(labels, predictions),
        "precision": precision_score(labels, predictions),
        "recall": recall_score(labels, predictions),
        "f1": f1_score(labels, predictions),
    }

    # NOTE(review): throughput is always derived from args.eval_example_num,
    # even when desc='train' -- confirm whether the train report should use
    # the number of training examples instead.
    save_dict = {"accuracy": "%.2f" % metric_dict["accuracy"],
                 "MCC": "%.2f" % metric_dict["matthews_corrcoef"],
                 "precision": "%.2f" % metric_dict["precision"],
                 "recall": "%.2f" % metric_dict["recall"],
                 "f_1": "%.2f" % metric_dict["f1"],
                 "modelSize": "%d" % (model_size / 1024 / 1024),
                 "reasoningTime": "%.2f" % (args.eval_example_num / cost_time)}  # sample/second

    if args.result_dir == "":
        args.result_dir = args.model_save_dir
    # exist_ok avoids the check-then-create race of the previous version.
    os.makedirs(args.result_dir, exist_ok=True)
    with open(os.path.join(args.result_dir, 'results_{}.json'.format(desc)), "w") as f:
        json.dump(save_dict, f)

    print(desc, ', '.join('{}: {:.3f}'.format(k, v) for k, v in metric_dict.items()))
    return metric_dict
def CopyFile(filepath, newPath):
    """Recursively copy the contents of directory ``filepath`` into ``newPath``.

    Regular files are copied with ``shutil.copyfile`` (content only, existing
    files are overwritten); every other entry is treated as a directory and
    copied recursively.

    Args:
        filepath: existing source directory.
        newPath: destination directory; created (including parents) when it
            does not exist yet.

    Raises:
        OSError: if ``filepath`` cannot be listed or a file cannot be copied.
    """
    # Imported locally so the function does not rely on a module-level import.
    import shutil

    # Previously only sub-directories were created, so copying into a
    # non-existent destination root failed on the first file.
    os.makedirs(newPath, exist_ok=True)
    for entry in os.listdir(filepath):
        src = os.path.join(filepath, entry)
        dst = os.path.join(newPath, entry)
        if os.path.isfile(src):
            shutil.copyfile(src, dst)
        else:
            CopyFile(src, dst)
def main():
    """Entry point: optionally distil the student, then optionally evaluate it.

    With ``args.do_train`` the checkpoint at ``args.teacher_model`` is loaded,
    the student is trained for ``args.num_epochs`` epochs, and after every
    epoch it is evaluated on the train and dev data; the checkpoint is written
    to ``args.model_save_dir`` whenever the task's dev metric improves.
    With ``args.do_eval`` the student is evaluated on the dev data (loading
    ``args.model_save_dir`` first when no training happened in this run).
    """
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)

    check_point = flow.train.CheckPoint()

    summary = Summary(args.log_dir, args)
    if not os.path.exists(args.model_save_dir):
        os.makedirs(args.model_save_dir)

    if args.do_train:
        print('Loading model...')
        check_point.load(args.teacher_model)

        print('Start training...')
        global_step = 0
        best_dev_acc = 0.0
        for epoch in range(args.num_epochs):
            metric = Metric(desc='finetune', print_steps=args.loss_print_every_n_iter, summary=summary,
                            batch_size=batch_size, keys=['loss'])

            for step in range(epoch_size):
                DistilJob().async_get(metric.metric_cb(step, epoch=epoch))
                global_step += 1
                # if (global_step + 1) % args.model_save_every_n_iter == 0:
                #     if not os.path.exists(args.model_save_dir):
                #         os.makedirs(args.model_save_dir)
                #     snapshot_save_path = os.path.join(
                #         args.model_save_dir, "snapshot_%d" % (global_step + 1)
                #     )
                #     print("Saving model to {}.".format(snapshot_save_path))
                #     check_point.save(snapshot_save_path)

            print('EvalTrainJob...')
            run_eval_job(StudentBertGlueEvalTrainJob, epoch_size, desc='train')
            print('EvalValJob...')
            result = run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')

            # Save only when the task-specific dev metric beats the best so far.
            save_model = False
            if task_name in acc_tasks and result['accuracy'] > best_dev_acc:
                best_dev_acc = result['accuracy']
                save_model = True

            # if task_name in corr_tasks and result['corr'] > best_dev_acc:
            #     best_dev_acc = result['corr']
            #     save_model = True

            if task_name in mcc_tasks and result['matthews_corrcoef'] > best_dev_acc:
                best_dev_acc = result['matthews_corrcoef']
                save_model = True
            # NOTE(review): this prints the current epoch's result, not the
            # best one so far; nesting level reconstructed -- confirm.
            print('Best result:', result)

            if save_model:
                # The save directory is wiped first, so it only ever holds one checkpoint.
                if os.path.exists(args.model_save_dir):
                    import shutil
                    shutil.rmtree(args.model_save_dir)
                if not os.path.exists(args.model_save_dir):
                    os.makedirs(args.model_save_dir)
                snapshot_save_path = os.path.join(args.model_save_dir)
                print("Saving best model to {}".format(snapshot_save_path))
                check_point.save(snapshot_save_path)
                flow.sync_default_session()

        if args.save_last_snapshot:
            # NOTE(review): this removes model_save_dir, i.e. the best
            # checkpoint saved above is replaced by the last one -- confirm intent.
            snapshot_save_path = args.model_save_dir
            if os.path.exists(args.model_save_dir):
                import shutil
                shutil.rmtree(args.model_save_dir)
            print("Saving model to {}".format(snapshot_save_path))
            check_point.save(snapshot_save_path)
            flow.sync_default_session()

        if args.serve_for_online:
            print('Deleting the teacher params and the optimizer parmas from model_save_dir...')
            remove_teacher_params(args.model_save_dir)

    if args.do_eval:
        print('Loading model...')
        print(args.model_save_dir)

        # After training in the same run the weights are already in memory.
        if not args.do_train:
            check_point.load(args.model_save_dir)
        print('Evaluation...')
        run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')
    # if args.save_last_snapshot:
    #     snapshot.save("last_snapshot")


if __name__ == "__main__":
    main()
@@ -0,0 +1,54 @@ | |||||
# BERT教师模型 | |||||
使用BERT在GLUE文本分类任务数据集上进行微调,作为知识蒸馏的教师模型。 | |||||
## 1. 依赖 | |||||
- Python 3.6 | |||||
- oneflow-cu101 0.1.10 | |||||
完整的环境可以通过以下命令安装: | |||||
```bash | |||||
conda create -n tinybert python=3.6 | |||||
``` | |||||
```bash | |||||
python3 -m pip install --find-links https://oneflow-inc.github.io/nightly oneflow_cu101 --user | |||||
``` | |||||
> 注:以下操作时,根目录为`model_compress/distil` | |||||
## 2. 数据获取 | |||||
如何获取数据请查阅[这里](../../README.md#22-数据获取) | |||||
## 3. 微调教师模型 | |||||
预训练BERT模型下载地址: | |||||
链接: https://pan.baidu.com/s/1jfTUY7ygcZZOJzjfrgUL8Q 提取码: 6b87 | |||||
下载后放置在`model_compress/distil/models/uncased_L-12_H-768_A-12_oneflow`
#### 3.1 训练 | |||||
- 执行以下脚本进行微调教师模型: | |||||
- DATA_ROOT: GLUE数据集总路径 | |||||
- dataset: 任务名 | |||||
- MODEL_SAVE_DIR: 模型保存路径 | |||||
- RESULT_DIR: 测试结果json文件保存路径 (如果为RESULT_DIR="",则默认保存到模型保存路径下,results_eval.json) | |||||
- SERVE_FOR_ONLINE: 模型是否用于上线 (默认SERVE_FOR_ONLINE='False',如果SERVE_FOR_ONLINE='True',则删除清理模型保存路径中的无关变量,如优化器参数等) | |||||
```bash | |||||
bash run_train_teacher.sh | |||||
``` | |||||
- 我们微调过的教师模型可以在这里下载: 链接: https://pan.baidu.com/s/1jiOTSPBmmBoij0UwPO6UKw 提取码: 9xkp | |||||
- 已在SST-2,QQP,MRPC,RTE,CoLA数据集上微调 | |||||
- 并放置到`"model_compress/distil/models/finetuned_teacher/"`。 | |||||
- 在上述数据集的dev集上性能为SST-2: 92.2%, QQP: 91.1%, MRPC: 89.2%, RTE: 69.8%, CoLA: 58.5% | |||||
- 评价指标: | |||||
- Accuracy: SST-2, MRPC, QQP, RTE | |||||
- MCC (Matthews correlation coefficient): CoLA | |||||
#### 3.2 测试 | |||||
- 微调后,可以执行以下脚本对教师模型进行测试: | |||||
- DATA_ROOT: GLUE数据集总路径 | |||||
- dataset: 任务名 | |||||
- TEACHER_MODEL_DIR: 教师模型路径 | |||||
- RESULT_DIR: 测试结果json文件保存路径 (如果为RESULT_DIR="",则默认保存到模型保存路径下,results_eval.json) | |||||
```bash | |||||
bash run_eval_teacher.sh | |||||
``` |
@@ -0,0 +1,311 @@ | |||||
""" | |||||
Copyright 2020 The OneFlow Authors. All rights reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
import os | |||||
import math | |||||
import numpy as np | |||||
import oneflow as flow | |||||
import sys | |||||
curPath = os.path.abspath(os.path.dirname(__file__)) | |||||
rootPath = os.path.split(curPath)[0] | |||||
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "./src"))) | |||||
from classifier import GlueBERT | |||||
from util import Snapshot, Summary, InitNodes, Metric, CreateOptimizer, GetFunctionConfig, getdirsize, remove_optimizer_params, remove_teacher_params | |||||
import config as configs | |||||
from sklearn.metrics import accuracy_score, matthews_corrcoef, precision_score, recall_score, f1_score | |||||
import argparse | |||||
import time | |||||
import json | |||||
import shutil | |||||
def str2bool(v):
    """Parse a command-line string into a boolean (case-insensitive).

    Accepts ``yes/true/t/y/1`` as True and ``no/false/f/n/0`` as False.

    Raises:
        argparse.ArgumentTypeError: for any other value.
    """
    token = v.lower()
    if token in ('yes', 'true', 't', 'y', '1'):
        return True
    if token in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Unsupported value encountered.')
# Extend the shared parser from ``config`` with the options of this script.
parser = configs.get_parser()
parser.add_argument("--task_name", type=str, default='CoLA')
parser.add_argument('--num_epochs', type=int, default=3, help='number of epochs')
parser.add_argument("--train_data_dir", type=str, default=None)
parser.add_argument("--train_data_prefix", type=str, default='train.of_record-')
parser.add_argument("--train_example_num", type=int, default=88614,
                    help="example number in dataset")
parser.add_argument("--batch_size_per_device", type=int, default=32)
parser.add_argument("--train_data_part_num", type=int, default=1,
                    help="data part number in dataset")
parser.add_argument("--eval_data_dir", type=str, default=None)
parser.add_argument("--eval_data_prefix", type=str, default='eval.of_record-')
parser.add_argument("--eval_example_num", type=int, default=10833,
                    help="example number in dataset")
parser.add_argument("--eval_batch_size_per_device", type=int, default=64)
parser.add_argument("--eval_data_part_num", type=int, default=1,
                    help="data part number in dataset")
parser.add_argument("--result_dir", type=str, default="", help="the save directory of results")
# NOTE(review): with nargs='?' and const=False a bare ``--serve_for_online``
# yields False, and omitting the flag yields None -- confirm this is intended.
parser.add_argument('--serve_for_online', type=str2bool, nargs='?', const=False, help='if True, then after training, will delete the teacher params and optimizer parmas from model_save_dir')

args = parser.parse_args()

task_name = args.task_name.lower()

# Global batch sizes: per-device size times the total number of devices.
batch_size = args.num_nodes * args.gpu_num_per_node * args.batch_size_per_device
eval_batch_size = args.num_nodes * args.gpu_num_per_node * args.eval_batch_size_per_device

# Steps per pass; ceil() so a final partial batch is counted as a step.
epoch_size = math.ceil(args.train_example_num / batch_size)
num_eval_steps = math.ceil(args.eval_example_num / eval_batch_size)
args.iter_num = epoch_size * args.num_epochs
configs.print_args(args)

# Output type of each GLUE task, keyed by lower-cased task name.
glue_output_modes = {
    "cola": "classification",
    "mnli": "classification",
    "mnli-mm": "classification",
    "mrpc": "classification",
    "sst-2": "classification",
    "sts-b": "regression",
    "qqp": "classification",
    "qnli": "classification",
    "rte": "classification",
    "wnli": "classification",
}

# Task groups by the dev metric that main() uses for model selection.
acc_tasks = ["mnli", "mrpc", "sst-2", "qqp", "qnli", "rte"]
corr_tasks = ["sts-b"]
mcc_tasks = ["cola"]
def BertDecoder(
    data_dir, batch_size, data_part_num, seq_length, part_name_prefix, shuffle=True
):
    """Read OFRecord files and decode the BERT input fields on CPU.

    Args:
        data_dir: directory holding the OFRecord parts.
        batch_size: number of records per batch.
        data_part_num: number of data parts in ``data_dir``.
        seq_length: length of the per-token fields.
        part_name_prefix: file-name prefix of the data parts.
        shuffle: passed as both ``random_shuffle`` and ``shuffle_after_epoch``.

    Returns:
        dict: decoded int32 blobs keyed by ``input_ids``, ``input_mask``,
        ``segment_ids``, ``label_ids`` and ``is_real_example``.
    """
    # (field name, per-record shape), decoded in this order.
    field_shapes = (
        ("input_ids", [seq_length]),
        ("input_mask", [seq_length]),
        ("segment_ids", [seq_length]),
        ("label_ids", [1]),
        ("is_real_example", [1]),
    )
    with flow.scope.placement("cpu", "0:0"):
        records = flow.data.ofrecord_reader(data_dir,
                                            batch_size=batch_size,
                                            data_part_num=data_part_num,
                                            part_name_prefix=part_name_prefix,
                                            random_shuffle=shuffle,
                                            shuffle_after_epoch=shuffle)
        decoded = {}
        for field, shape in field_shapes:
            decoded[field] = flow.data.OFRecordRawDecoder(records, field, shape=shape, dtype=flow.int32)
        return decoded
def BuildBert(
    batch_size,
    data_part_num,
    data_dir,
    part_name_prefix,
    shuffle=True
):
    """Build the data pipeline plus the GLUE BERT classifier.

    Args:
        batch_size: number of records per batch.
        data_part_num: number of data parts in ``data_dir``.
        data_dir: directory holding the OFRecord parts.
        part_name_prefix: file-name prefix of the data parts.
        shuffle: whether the reader shuffles the records.

    Returns:
        tuple: ``(loss, logits, label_ids)``.
    """
    # 64 hidden units per attention head; the feed-forward layer is 4x as wide.
    size_per_head = 64
    hidden_size = size_per_head * args.num_attention_heads
    # intermediate_size=1200

    blobs = BertDecoder(
        data_dir, batch_size, data_part_num, args.seq_length, part_name_prefix, shuffle=shuffle
    )
    # is_real_example = blobs['is_real_example']
    label_ids = blobs['label_ids']

    model_kwargs = dict(
        seq_length=args.seq_length,
        hidden_size=hidden_size,
        num_hidden_layers=args.num_hidden_layers,
        num_attention_heads=args.num_attention_heads,
        intermediate_size=hidden_size * 4,
        hidden_act="gelu",
        hidden_dropout_prob=args.hidden_dropout_prob,
        attention_probs_dropout_prob=args.attention_probs_dropout_prob,
        max_position_embeddings=args.max_position_embeddings,
        type_vocab_size=args.type_vocab_size,
        initializer_range=0.02,
    )
    loss, logits = GlueBERT(
        blobs['input_ids'],
        blobs['input_mask'],
        blobs['segment_ids'],
        label_ids,
        args.vocab_size,
        **model_kwargs
    )
    return loss, logits, label_ids
@flow.global_function(type='train', function_config=GetFunctionConfig(args))
def BertGlueFinetuneJob():
    """Training job: fine-tune BERT on one batch of the training data.

    Returns:
        dict: ``{'loss': loss}`` so the metric callback can log it.
    """
    # The reader shuffles here (BuildBert's default), unlike the eval jobs.
    loss, logits, _ = BuildBert(
        batch_size,
        args.train_data_part_num,
        args.train_data_dir,
        args.train_data_prefix,
    )
    flow.losses.add_loss(loss)
    opt = CreateOptimizer(args)
    opt.minimize(loss)
    return {'loss': loss}
@flow.global_function(type='predict', function_config=GetFunctionConfig(args))
def BertGlueEvalTrainJob():
    """Prediction job: run BERT on the (unshuffled) training data.

    Returns:
        tuple: ``(logits, label_ids)`` for one training batch.
    """
    outputs = BuildBert(
        batch_size=batch_size,
        data_part_num=args.train_data_part_num,
        data_dir=args.train_data_dir,
        part_name_prefix=args.train_data_prefix,
        shuffle=False,
    )
    # outputs is (loss, logits, label_ids); the loss is not needed for prediction.
    return outputs[1], outputs[2]
@flow.global_function(type='predict', function_config=GetFunctionConfig(args))
def BertGlueEvalValJob():
    """Prediction job: run BERT on the (unshuffled) evaluation data.

    Returns:
        tuple: ``(logits, label_ids)`` for one evaluation batch.
    """
    # 8551 or 1042
    outputs = BuildBert(
        batch_size=eval_batch_size,
        data_part_num=args.eval_data_part_num,
        data_dir=args.eval_data_dir,
        part_name_prefix=args.eval_data_prefix,
        shuffle=False,
    )
    # outputs is (loss, logits, label_ids); the loss is not needed for prediction.
    return outputs[1], outputs[2]
def run_eval_job(eval_job_func, num_steps, desc='train'):
    """Run a prediction job for ``num_steps`` batches and report metrics.

    Args:
        eval_job_func: OneFlow job; ``eval_job_func().get()`` must yield a
            ``(logits, label)`` pair for one batch.
        num_steps: number of batches to evaluate.
        desc: tag used in the printed summary and in the result file name
            (``results_<desc>.json``).

    Returns:
        dict: float scores under the keys ``accuracy``, ``matthews_corrcoef``,
        ``precision``, ``recall`` and ``f1``.

    Side effects:
        Writes ``results_<desc>.json`` into ``args.result_dir``. When
        ``args.result_dir`` is empty it is set to ``args.model_save_dir``.
    """
    labels = []
    predictions = []
    start_time = time.time()
    for _ in range(num_steps):
        logits, label = eval_job_func().get()
        predictions.extend(list(logits.numpy().argmax(axis=1)))
        labels.extend(list(label))
    cost_time = time.time() - start_time
    print('cost time: {} s'.format(cost_time))

    model_size = getdirsize(args.model_save_dir)
    print('model_size: %d Mbytes' % (model_size / 1024 / 1024))  # Mbytes

    # Computed once and shared by the JSON report and the returned dict.
    # NOTE(review): the precision/recall/f1 calls use sklearn's default
    # averaging, which is binary-only; multi-class tasks such as MNLI would
    # need an explicit ``average=`` -- confirm before enabling those tasks.
    metric_dict = {
        "accuracy": accuracy_score(labels, predictions),
        "matthews_corrcoef": matthews_corrcoef(labels, predictions),
        "precision": precision_score(labels, predictions),
        "recall": recall_score(labels, predictions),
        "f1": f1_score(labels, predictions),
    }

    # NOTE(review): throughput is always derived from args.eval_example_num,
    # even when desc='train' -- confirm whether the train report should use
    # the number of training examples instead.
    save_dict = {"accuracy": "%.2f" % metric_dict["accuracy"],
                 "MCC": "%.2f" % metric_dict["matthews_corrcoef"],
                 "precision": "%.2f" % metric_dict["precision"],
                 "recall": "%.2f" % metric_dict["recall"],
                 "f_1": "%.2f" % metric_dict["f1"],
                 "modelSize": "%d" % (model_size / 1024 / 1024),
                 "reasoningTime": "%.2f" % (args.eval_example_num / cost_time)}  # sample/second

    if args.result_dir == "":
        args.result_dir = args.model_save_dir
    # exist_ok avoids the check-then-create race of the previous version.
    os.makedirs(args.result_dir, exist_ok=True)
    with open(os.path.join(args.result_dir, 'results_{}.json'.format(desc)), "w") as f:
        json.dump(save_dict, f)

    print(desc, ', '.join('{}: {:.3f}'.format(k, v) for k, v in metric_dict.items()))
    return metric_dict
def main():
    """Entry point: optionally fine-tune the teacher, then optionally evaluate it.

    With ``args.do_train`` BERT is fine-tuned for ``args.num_epochs`` epochs;
    after every epoch it is evaluated on the train and dev data and a ``best``
    snapshot is saved whenever the task's dev metric improves.
    With ``args.do_eval`` the model is evaluated on the dev data (loading
    ``args.model_save_dir`` first when no training happened in this run).
    """
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    InitNodes(args)
    if args.do_train:
        snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

        summary = Summary(args.log_dir, args)
        best_dev_acc = 0.0
        best_result = {}
        for epoch in range(args.num_epochs):
            metric = Metric(desc='finetune', print_steps=args.loss_print_every_n_iter, summary=summary,
                            batch_size=batch_size, keys=['loss'])
            for step in range(epoch_size):
                BertGlueFinetuneJob().async_get(metric.metric_cb(step, epoch=epoch))
                #if 1: #step % args.loss_print_every_n_iter == 0:

            run_eval_job(BertGlueEvalTrainJob, epoch_size, desc='train')
            result = run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')

            # Save only when the task-specific dev metric beats the best so far.
            save_model = False
            if task_name in acc_tasks and result['accuracy'] > best_dev_acc:
                best_dev_acc = result['accuracy']
                best_result = result
                save_model = True
                print('Best result:', result)

            # if task_name in corr_tasks and result['corr'] > best_dev_acc:
            #     best_dev_acc = result['corr']
            #     best_result = result
            #     save_model = True
            #     print('Best result:', result)

            if task_name in mcc_tasks and result['matthews_corrcoef'] > best_dev_acc:
                best_dev_acc = result['matthews_corrcoef']
                best_result = result
                save_model = True
                print('Best result:', result)

            if save_model:
                if not os.path.exists(args.model_save_dir):
                    os.makedirs(args.model_save_dir)
                # snapshot_save_path = os.path.join(args.model_save_dir)
                # print("Saving best model to {}".format(snapshot_save_path))
                snapshot.save('best')
                flow.sync_default_session()
        print('Best result:',best_result )
        # NOTE(review): this message is printed even if no epoch improved the
        # metric, i.e. when nothing was saved -- confirm.
        print("Saving best model to "+os.path.join(args.model_save_dir,'snapshot_best'))

        if args.serve_for_online:
            print('Deleting the optimizer parmas from model_save_dir...')
            remove_optimizer_params(os.path.join(args.model_save_dir,'snapshot_best'))

        # if args.save_last_snapshot:
        #     snapshot.save("last_snapshot")
    if args.do_eval:
        print('Loading model...')
        print(args.model_save_dir)

        # After training in the same run the weights are already in memory.
        if not args.do_train:
            check_point = flow.train.CheckPoint()
            check_point.load(args.model_save_dir)
        print('Evaluation...')
        run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')


if __name__ == "__main__":
    main()
@@ -0,0 +1,203 @@ | |||||
# TinyBERT | |||||
["TinyBERT: Distilling BERT for Natural Language Understanding"](https://arxiv.org/abs/1909.10351)论文的实现 | |||||
## 1. 依赖 | |||||
- Python 3.6 | |||||
- oneflow-cu101 0.1.10 | |||||
完整的环境可以通过以下命令安装: | |||||
```bash | |||||
conda create -n tinybert python=3.6 | |||||
``` | |||||
```bash | |||||
python3 -m pip install --find-links https://oneflow-inc.github.io/nightly oneflow_cu101 --user | |||||
``` | |||||
> 注:以下操作时,根目录为`model_compress/distil` | |||||
## 2. 通用蒸馏 (General Distillation,可选) | |||||
通用蒸馏阶段使用预训练得到的 BERT-base 为教师模型,在大规模文本语料上进行知识蒸馏得到通用的TinyBERT。 | |||||
这个操作可以让TinyBERT学习到通用的语义表示,提高了模型的泛化能力,也为随后针对特定任务的蒸馏提供了一个很好的初始化。 | |||||
通用蒸馏包含两步: | |||||
(1)语料预处理 (2)通用蒸馏 | |||||
### 2.1 语料预处理 | |||||
准备大规模语料,比如[WikiText-103 dataset](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/)。可以通过如下命令下载:
``` | |||||
cd data | |||||
wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip | |||||
unzip wikitext-103-raw-v1.zip | |||||
rm wikitext-103-raw-v1.zip | |||||
``` | |||||
执行以下命令,进行训练数据预处理 | |||||
- CORPUS_RAW:大规模语料,比如说Wikipedia | |||||
- BERT_BASE_DIR:教师模型类型 | |||||
- OUTPUT_DIR: 处理过的语料保存路径 | |||||
直接执行 | |||||
```bash | |||||
bash run_pregenerate_training_data.sh | |||||
``` | |||||
或者 执行 | |||||
```bash | |||||
CORPUS_RAW='./data/wikitext-103-raw/wiki.train.raw' | |||||
BERT_BASE_DIR='bert-base-uncased' | |||||
OUTPUT_DIR='./data/pretrain_data_json' | |||||
python pregenerate_training_data.py \ | |||||
--train_corpus $CORPUS_RAW \ | |||||
--bert_model $BERT_BASE_DIR \ | |||||
--do_lower_case \ | |||||
--epochs_to_generate 3 \ | |||||
--output_dir $OUTPUT_DIR | |||||
``` | |||||
### 2.2 通用蒸馏 | |||||
将Pytorch的通用TinyBERT模型转为OneFlow的模型格式: | |||||
Pytorch版通用tinybert -> tensorflow版通用tinybert -> OneFlow版通用tinybert | |||||
#### Step1: | |||||
- 从[TinyBERT页面](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/TinyBERT)下载已经训练好的通用TinyBERT模型: | |||||
- 利用我们提供的convert_bert_pytorch_checkpoint_to_original_tf.py脚本,将其转为tensorflow模型格式。转换过程如下: | |||||
``` | |||||
python convert_bert_pytorch_checkpoint_to_original_tf.py --model_name='./models/2nd_General_TinyBERT_4L_312D' --pytorch_model_path='./models/2nd_General_TinyBERT_4L_312D/pytorch_model.bin' --tf_cache_dir='./models/2nd_General_TinyBERT_4L_312D_tf' | |||||
``` | |||||
- 再利用我们提供的convert_tf_ckpt_to_of.py脚本,将其转为oneflow模型格式。转换过程如下: | |||||
``` | |||||
cd ./models/2nd_General_TinyBERT_4L_312D_tf/ | |||||
cat > checkpoint <<ONEFLOW | |||||
model_checkpoint_path: "bert_model.ckpt" | |||||
all_model_checkpoint_paths: "bert_model.ckpt" | |||||
ONEFLOW | |||||
``` | |||||
该命令将在解压目录下创建一个checkpoint文件,并写入以下内容: | |||||
model_checkpoint_path: "bert_model.ckpt" | |||||
all_model_checkpoint_paths: "bert_model.ckpt" | |||||
此时,已经准备好待转化的tensorflow模型目录,整个模型目录的结构如下: | |||||
```
2nd_General_TinyBERT_4L_312D_tf
├── bert_config.json | |||||
├── bert_model.ckpt.data-00000-of-00001 | |||||
├── bert_model.ckpt.index | |||||
├── checkpoint | |||||
└── vocab.txt | |||||
``` | |||||
#### Step2: | |||||
我们接着使用convert_tf_ckpt_to_of.py将tensorflow模型转为OneFlow模型: | |||||
``` | |||||
python convert_tf_ckpt_to_of.py \ | |||||
--tf_checkpoint_path ./models/2nd_General_TinyBERT_4L_312D_tf \ | |||||
--of_dump_path ./models/2nd_General_TinyBERT_4L_312D_oneflow | |||||
``` | |||||
以上命令,将转化好的OneFlow格式的模型保存在`./models/2nd_General_TinyBERT_4L_312D_oneflow`目录下,供后续微调训练使用。
**你也可以直接下载我们提供的两种规模的通用TinyBERT: General_TinyBERT(4layer-312dim)和General_TinyBERT(6layer-768dim)** | |||||
下载地址如下: | |||||
链接: https://pan.baidu.com/s/1vZDILxXi-uxo2v3zFlWL3A 提取码: kpia | |||||
将他们下载下来,放置在`'./models'`路径下,如`'./models/2nd_General_TinyBERT_4L_312D_oneflow'`和`'./models/2nd_General_TinyBERT_6L_768D_oneflow'` | |||||
## 3. 数据增强 (可选) | |||||
数据增强是TinyBERT中重要的一个步骤。通过数据增强,TinyBERT可以学习更多任务相关的例子,进一步提高学生模型的泛化能力,帮助TinyBERT获得和BERT-base相匹配的性能,甚至在部分任务上超过BERT-base的表现。
### 3.1 GLUE数据集下载 | |||||
可以通过执行以下脚本下载GLUE任务的所有数据集,将会自动下载并解压到`--data_dir`指定的目录(此处为`data/glue_data`)下。
```bash | |||||
python ../../src/download_glue_data.py --data_dir data/glue_data --tasks all | |||||
``` | |||||
TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "SNLI", "QNLI", "RTE", "WNLI", "diagnostic"] | |||||
以上脚本将会默认下载所有GLUE任务数据集,也可以通过'--tasks=TASKS',指定下载某些数据集
参考[加载与准备OneFlow数据集](https://github.com/Oneflow-Inc/oneflow-documentation/blob/master/cn/docs/extended_topics/how_to_make_ofdataset.md),制作OFRecords数据集。或者执行以下脚本,生成OFRecords数据集:
``` | |||||
bash glue_process.sh | |||||
``` | |||||
**或者直接下载转换后的OFRecords GLUE数据集:** | |||||
链接: https://pan.baidu.com/s/1TuDJpJ8z9zJvvhqjjXiGDg 提取码: phyf | |||||
### 3.2 下载GloVe嵌入 | |||||
TinyBERT所采用的数据增强方法,结合了预训练BERT和GloVe嵌入来做词级别的替换。 | |||||
可以通过以下脚本下载GloVe嵌入,放置到'model_compress/distil/glove'目录下
``` | |||||
cd glove | |||||
wget http://nlp.stanford.edu/data/glove.840B.300d.zip | |||||
unzip glove.840B.300d.zip | |||||
rm glove.840B.300d.zip | |||||
``` | |||||
### 3.3 进行数据增强 | |||||
通过执行以下脚本进行数据增强 | |||||
``` bash | |||||
bash run_data_augmentation.sh | |||||
``` | |||||
增强后的数据集 train_aug.tsv 会自动保存到相应的GLUE任务数据集下。 | |||||
## 4. 任务特定蒸馏 (Task-specific Distillation) | |||||
在任务特定蒸馏中,将重新对得到的通用TinyBERT进行微调。通过在特定任务上进行微调,来进一步改进TinyBERT。任务特定蒸馏包括两个步骤:
(1)微调教师BERT;(2)微调学生TinyBERT,包含层与层蒸馏、注意力蒸馏和软标签蒸馏。
### 4.1 微调教师模型BERT | |||||
预训练BERT模型下载地址: | |||||
- 链接: https://pan.baidu.com/s/1jfTUY7ygcZZOJzjfrgUL8Q 提取码: 6b87 | |||||
- 下载后放置在`./models/uncased_L-12_H-768_A-12_oneflow` | |||||
如何微调教师模型请查阅[这里](../../README.md#23-微调教师模型) | |||||
- 我们微调过的教师模型可以在这里下载: 链接: https://pan.baidu.com/s/1jiOTSPBmmBoij0UwPO6UKw 提取码: 9xkp | |||||
- 已在SST-2,QQP,MRPC,RTE,CoLA数据集上微调 | |||||
- 并放置到`"model_compress/distil/models/finetuned_teacher/"`。 | |||||
- 在上述数据集的dev集上性能为SST-2: 92.2%, QQP: 91.1%, MRPC: 89.2%, RTE: 69.8%, CoLA: 58.5% | |||||
- 评价指标: | |||||
- Accuracy: SST-2, MRPC, QQP, RTE | |||||
- MCC (Matthews correlation coefficient): CoLA | |||||
### 4.2 微调学生模型TinyBERT | |||||
执行以下脚本将教师模型蒸馏到学生模型: | |||||
- DATA_ROOT: GLUE数据集总路径 | |||||
- dataset: 任务名 | |||||
- FT_BERT_BASE_DIR: 在特定任务上微调过的教师模型路径 | |||||
- TMP_STUDENT_DIR: 临时学生模型路径 | |||||
- STUDENT_DIR: 学生模型保存路径 | |||||
- RESULT_DIR: 测试结果json文件保存路径 (如果为RESULT_DIR="",则默认保存到模型保存路径下,results_eval.json) | |||||
- SERVE_FOR_ONLINE: 模型是否用于上线 (默认SERVE_FOR_ONLINE='False',如果SERVE_FOR_ONLINE='True',则删除清理模型保存路径中的无关变量,如教师模型参数和优化器参数等等) | |||||
直接执行 | |||||
```bash | |||||
bash run_train_student_tinybert.sh | |||||
``` | |||||
最终得到学生TinyBERT,可以从这里下载: | |||||
- 下载链接: https://pan.baidu.com/s/1nOAZHd3wLmyVw2vTJB7KfQ 提取码: ma65 | |||||
- 并放置到`./models/student_model/SST-2/tinybert_epoch-4_lr-2e-5_wd-0.0001` | |||||
### 4.3 性能测试 | |||||
通过执行以下脚本,在GLUE任务上进行性能测试: | |||||
- DATA_ROOT: GLUE数据集总路径 | |||||
- dataset: 任务名 | |||||
- STUDENT_DIR: 学生模型保存路径,蒸馏过的学生模型(SST-2数据集)下载链接见上文4.2节
- RESULT_DIR: 测试结果json文件保存路径 (如果为RESULT_DIR="",则默认保存到模型保存路径下,results_eval.json) | |||||
```bash | |||||
bash run_eval_student_tinybert.sh | |||||
``` | |||||
### 4.4 结果: | |||||
在SST-2 DEV数据集上: | |||||
- 模型精度:教师模型acc 92.2% ->学生模型acc 91.3% | |||||
- 模型尺寸:教师模型110M -> 学生模型 14.5M (↓7.5x) | |||||
- 推理耗时:教师模型4.04s -> 学生模型0.65s (↓6.2×)
@@ -0,0 +1,393 @@ | |||||
# coding: utf-8 | |||||
""" | |||||
Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
import random | |||||
import sys | |||||
import os | |||||
import unicodedata | |||||
import re | |||||
import logging | |||||
import csv | |||||
import argparse | |||||
import numpy as np | |||||
import sys | |||||
curPath = os.path.abspath(os.path.dirname(__file__)) | |||||
rootPath = os.path.split(curPath)[0] | |||||
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "./src"))) | |||||
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../src"))) | |||||
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../../src"))) | |||||
sys.path.append(rootPath) | |||||
import tokenization | |||||
from maskedBert import maskedBert | |||||
import oneflow as flow | |||||
import config as configs | |||||
from util import Snapshot, Summary, InitNodes, Metric, CreateOptimizer, GetFunctionConfig | |||||
import math | |||||
import oneflow.typing as tp | |||||
# Root logging configuration: timestamped INFO-level messages.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)

logger = logging.getLogger(__name__)

# Lower-case English stop words, including contraction fragments ("'s", "'re").
# Presumably excluded from word replacement during augmentation -- the code
# that consumes this list is outside this chunk, confirm there.
StopWordsList = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours',
                 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself',
                 'they', 'them', 'their', 'theirs', 'themselves', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be',
                 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because',
                 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',
                 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here',
                 'there', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so',
                 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've',
                 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven',
                 "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
                 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't", "'s", "'re"]
# Extend the shared parser from ``config`` with the data-augmentation options.
parser = configs.get_parser()
parser.add_argument("--pretrained_bert_model", default="bert-base-uncased", type=str,
                    help="Downloaded pretrained model (bert-base-uncased) is under this folder")
parser.add_argument("--glove_embs", default="./glove/glove.840B.300d.txt", type=str,
                    help="Glove word embeddings file")
parser.add_argument("--glue_dir", default="./data", type=str,
                    help="GLUE data dir")
parser.add_argument("--task_name", default="MRPC", type=str,
                    help="Task(eg. CoLA, SST-2) that we want to do data augmentation for its train set")
parser.add_argument("--N", default=30, type=int,
                    help="How many times is the corpus expanded?")
parser.add_argument("--M", default=15, type=int,
                    help="Choose from M most-likely words in the corresponding position")
parser.add_argument("--p", default=0.4, type=float,
                    help="Threshold probability p to replace current word")

# Required option; the ``default=None`` below is therefore never used.
parser.add_argument(
    '--vocab_file',
    help='The vocabulary file that the BERT model was trained on.',
    default=None,
    type=str,
    required=True
)

parser.add_argument("--eval_data_dir", type=str, default=None)
parser.add_argument("--eval_data_prefix", type=str, default='eval.of_record-')
parser.add_argument("--eval_batch_size_per_device", type=int, default=64)
parser.add_argument("--eval_data_part_num", type=int, default=1,
                    help="data part number in dataset")

args = parser.parse_args()

# batch_size = args.num_nodes * args.gpu_num_per_node * args.batch_size_per_device
# eval_batch_size = args.num_nodes * args.gpu_num_per_node * args.eval_batch_size_per_device
# The batch size is fixed to 1 here; --eval_batch_size_per_device is not used
# in this computation.
eval_batch_size = 1
# epoch_size = math.ceil(args.train_example_num / batch_size)
# num_eval_steps = math.ceil(args.eval_example_num / eval_batch_size)
# args.iter_num = epoch_size * args.num_epochs
configs.print_args(args)
def strip_accents(text):
    """
    Strip accents from input String.

    The text is decomposed with Unicode NFD normalization, so an accented
    letter becomes a base letter followed by combining marks. Encoding to
    ASCII with ``'ignore'`` then drops the marks, along with every other
    non-ASCII character.

    :param text: The input string.
    :type text: String.
    :returns: The processed String.
    :rtype: String.
    """
    try:
        # Python 2 only: decode byte strings. On Python 3 the name
        # ``unicode`` does not exist, the NameError is swallowed and the
        # input is used unchanged.
        text = unicode(text, 'utf-8')
    except (TypeError, NameError):
        pass
    decomposed = unicodedata.normalize('NFD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return str(ascii_bytes.decode("utf-8"))
# valid string only includes al | |||||
def _is_valid(string): | |||||
return True if not re.search('[^a-z]', string) else False | |||||
def _read_tsv(input_file, quotechar=None): | |||||
"""Reads a tab separated value file.""" | |||||
with open(input_file, "r", encoding="utf-8") as f: | |||||
reader = csv.reader(f, delimiter="\t", quotechar=quotechar) | |||||
lines = [] | |||||
for line in reader: | |||||
if sys.version_info[0] == 2: | |||||
line = list(unicode(cell, 'utf-8') for cell in line) | |||||
lines.append(line) | |||||
return lines | |||||
def prepare_embedding_retrieval(glove_file, vocab_size=100000):
    """Load the leading GloVe vectors and return them row-normalised.

    Args:
        glove_file: Path of a UTF-8 text file with one ``word v1 v2 ...``
            entry per line, fields separated by single spaces.
        vocab_size: Number of entries to read from the top of the file; also
            the number of rows of the returned matrix.

    Returns:
        A tuple ``(emb_norm, vocab, ids_to_tokens)``: ``emb_norm`` is a
        ``(vocab_size, dim)`` array whose rows are divided by their L2 norm,
        ``vocab`` maps word -> row index and ``ids_to_tokens`` maps
        row index -> word.
    """
    words = []
    embeddings = {}

    # Only the first `vocab_size` entries are read, for fast retrieval.
    with open(glove_file, 'r', encoding='utf-8') as fin:
        for line in fin:
            line = line.strip()
            if not line:
                # A blank line would register the empty word with an empty
                # vector and break the matrix fill below; skip it.
                continue
            items = line.split(' ')
            words.append(items[0])
            embeddings[items[0]] = [float(x) for x in items[1:]]
            if len(words) == vocab_size:
                break

    vocab = {w: idx for idx, w in enumerate(words)}
    ids_to_tokens = {idx: w for idx, w in enumerate(words)}

    vector_dim = len(embeddings[ids_to_tokens[0]])
    emb_matrix = np.zeros((vocab_size, vector_dim))
    for word, v in embeddings.items():
        if word == '<unk>':
            # The '<unk>' row is deliberately left as zeros.
            continue
        emb_matrix[vocab[word], :] = v

    # Normalize each word vector. Rows that are still all zero ('<unk>' and
    # rows beyond the words actually read) divide 0 by 0 and become NaN, as
    # before; only NumPy's warning for that expected case is silenced.
    d = np.sum(emb_matrix ** 2, 1) ** 0.5
    with np.errstate(divide='ignore', invalid='ignore'):
        emb_norm = (emb_matrix.T / d).T
    return emb_norm, vocab, ids_to_tokens
# Leading dimension of every placeholder fed to eval_job.
BATCH_SIZE = 1


@flow.global_function(type="predict", function_config=GetFunctionConfig(args))
def eval_job(
    input_ids: tp.Numpy.Placeholder((BATCH_SIZE, args.seq_length), dtype=flow.int32),
    input_mask: tp.Numpy.Placeholder((BATCH_SIZE, args.seq_length), dtype=flow.int32),
    segment_ids: tp.Numpy.Placeholder((BATCH_SIZE, args.seq_length), dtype=flow.int32),
    mask_id: tp.Numpy.Placeholder((BATCH_SIZE, 1), dtype=flow.int32),
) -> tp.Numpy:
    """Run ``maskedBert`` on one padded input and return its output.

    ``input_ids``, ``input_mask`` and ``segment_ids`` are int32 arrays of
    shape ``(BATCH_SIZE, args.seq_length)``; ``mask_id`` is an int32 array of
    shape ``(BATCH_SIZE, 1)`` that is passed to ``maskedBert`` as the masked
    position.
    """
    # The hidden width is derived from the head count: 64 units per head.
    hidden_dim = 64 * args.num_attention_heads
    model_options = dict(
        seq_length=args.seq_length,
        hidden_size=hidden_dim,
        num_hidden_layers=args.num_hidden_layers,
        num_attention_heads=args.num_attention_heads,
        intermediate_size=hidden_dim * 4,
        hidden_act="gelu",
        hidden_dropout_prob=args.hidden_dropout_prob,
        attention_probs_dropout_prob=args.attention_probs_dropout_prob,
        max_position_embeddings=args.max_position_embeddings,
        type_vocab_size=args.type_vocab_size,
        initializer_range=0.02,
    )
    return maskedBert(
        input_ids,
        input_mask,
        segment_ids,
        mask_id,
        args.vocab_size,
        **model_options
    )
class DataAugmentor(object):
    """Create augmented variants of a sentence by replacing single words.

    A word that tokenizes to exactly one word piece gets its replacement
    candidates from the masked language model (``eval_job``); a word split
    into several word pieces gets its nearest neighbours in the embedding
    matrix instead.
    """

    def __init__(self, tokenizer, emb_norm, vocab, ids_to_tokens, M, N, p):
        """Store the tokenizer, the embedding lookup tables and the settings.

        Args:
            tokenizer: Object providing ``tokenize``, ``convert_tokens_to_ids``,
                ``convert_ids_to_tokens`` and ``basic_tokenizer.tokenize``.
            emb_norm: Row-normalised embedding matrix.
            vocab: Mapping word -> row index of ``emb_norm``.
            ids_to_tokens: Mapping row index -> word.
            M: Number of replacement candidates considered per word.
            N: Number of sampling rounds per sentence.
            p: Probability of replacing a word in one sampling round.
        """
        self.tokenizer = tokenizer
        self.emb_norm = emb_norm
        self.vocab = vocab
        self.ids_to_tokens = ids_to_tokens
        self.M = M
        self.N = N
        self.p = p

    def _word_distance(self, word):
        """Return up to ``M`` words whose embeddings are most similar to *word*.

        Returns an empty list when *word* is not in the embedding vocabulary.
        """
        if word not in self.vocab:
            return []
        word_idx = self.vocab[word]
        word_emb = self.emb_norm[word_idx]

        dist = np.dot(self.emb_norm, word_emb.T)
        # Exclude the word itself. `np.inf` replaces the `np.Inf` alias that
        # NumPy 2.0 removed.
        dist[word_idx] = -np.inf

        candidate_ids = np.argsort(-dist)[:self.M]
        return [self.ids_to_tokens[idx] for idx in candidate_ids]

    def _masked_language_model(self, sent, word_pieces, mask_id):
        """Return the model's top candidates for the masked word piece.

        Args:
            sent: The original sentence.
            word_pieces: ``['[CLS]'] + word pieces`` of the sentence with one
                piece replaced by ``'[MASK]'``.
            mask_id: Index of the ``'[MASK]'`` piece inside ``word_pieces``.

        Returns:
            Up to ``M`` predicted tokens, excluding those that start with
            ``"##"``.
        """
        tokenized_text = ['[CLS]'] + self.tokenizer.tokenize(sent)
        tokenized_len = len(tokenized_text)

        # Model input: masked sentence [SEP] original sentence [SEP].
        tokenized_text = word_pieces + ['[SEP]'] + tokenized_text[1:] + ['[SEP]']

        # The arrays below are reshaped to (1, args.seq_length), so the input
        # must not be longer than that; the previous hard limit of 512 is kept
        # as an upper bound.
        max_len = min(512, args.seq_length)
        if len(tokenized_text) > max_len:
            tokenized_text = tokenized_text[:max_len]

        token_ids = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        segments_ids = [0] * (tokenized_len + 1) + [1] * (len(tokenized_text) - tokenized_len - 1)
        # Keep the segment ids aligned with the tokens if truncation cut into
        # the first segment.
        segments_ids = segments_ids[:len(token_ids)]
        input_mask = [1] * len(token_ids)

        # Zero-pad up to the sequence length.
        while len(token_ids) < args.seq_length:
            token_ids.append(0)
            input_mask.append(0)
            segments_ids.append(0)

        token_ids = np.array(token_ids).reshape(1, args.seq_length).astype(np.int32)
        input_mask = np.array(input_mask).reshape(1, args.seq_length).astype(np.int32)
        segments_ids = np.array(segments_ids).reshape(1, args.seq_length).astype(np.int32)
        mask_id = np.array(mask_id).reshape(1, 1).astype(np.int32)

        predictions = np.array(eval_job(token_ids, input_mask, segments_ids, mask_id))

        word_candidates = np.argsort(-predictions)[0][:self.M].tolist()
        word_candidates = self.tokenizer.convert_ids_to_tokens(word_candidates)
        # Drop continuation pieces: they cannot stand in for a whole word.
        return [w for w in word_candidates if not w.startswith("##")]

    def _word_augment(self, sentence, mask_token_idx, mask_token):
        """Return replacement candidates for one word of *sentence*.

        Args:
            sentence: The sentence containing the word.
            mask_token_idx: Index of the word among the sentence's words.
            mask_token: The word itself; used as the only candidate when no
                other candidate is found.

        Returns:
            A non-empty list of candidate words.
        """
        word_pieces = ['[CLS]'] + self.tokenizer.tokenize(sentence)

        # Collect the word-piece positions that make up the target word.
        # Starting from an empty list keeps the code below well-defined when
        # the sentence has no word pieces or starts with a "##" piece.
        word_piece_ids = []
        token_idx = -1
        for i in range(1, len(word_pieces)):
            if "##" not in word_pieces[i]:
                token_idx = token_idx + 1
                if token_idx < mask_token_idx:
                    word_piece_ids = []
                elif token_idx == mask_token_idx:
                    word_piece_ids = [i]
                else:
                    break
            else:
                word_piece_ids.append(i)

        # Defaulting to an empty list makes the "invalid input" branch fall
        # back to the original word instead of raising UnboundLocalError.
        candidate_words = []
        if len(word_piece_ids) == 1:
            word_pieces[word_piece_ids[0]] = '[MASK]'
            candidate_words = self._masked_language_model(
                sentence, word_pieces, word_piece_ids[0])
        elif len(word_piece_ids) > 1:
            candidate_words = self._word_distance(mask_token)
        else:
            logger.info("invalid input sentence!")

        if len(candidate_words) == 0:
            candidate_words.append(mask_token)
        return candidate_words

    def augment(self, sent):
        """Return *sent* followed by its distinct augmented variants.

        Every valid, non-stop word gets a candidate list. In each of the ``N``
        rounds one candidate per word is drawn and substituted with
        probability ``p``; a resulting sentence is kept only if it has not
        been produced before.
        """
        candidate_sents = [sent]

        tokens = self.tokenizer.basic_tokenizer.tokenize(sent)
        candidate_words = {}
        for (idx, word) in enumerate(tokens):
            if _is_valid(word) and word not in StopWordsList:
                candidate_words[idx] = self._word_augment(sent, idx, word)
        logger.info(candidate_words)

        for _ in range(self.N):
            new_sent = list(tokens)
            for idx, choices in candidate_words.items():
                # Draw the candidate before the coin flip so the sequence of
                # random numbers consumed stays the same as before.
                candidate_word = random.choice(choices)
                if random.random() < self.p:
                    new_sent[idx] = candidate_word

            joined = " ".join(new_sent)
            if joined not in candidate_sents:
                candidate_sents.append(joined)

        return candidate_sents
class AugmentProcessor(object):
    """Read a GLUE task's ``train.tsv`` and write an augmented ``train_aug.tsv``."""

    def __init__(self, augmentor, glue_dir, task_name):
        """Remember the augmentor and the task location.

        Args:
            augmentor: Object whose ``augment(sentence)`` returns a list of
                sentences.
            glue_dir: Directory that contains one sub-directory per task.
            task_name: Name of the task; must be a key of ``augment_ids``.
        """
        self.augmentor = augmentor
        self.glue_dir = glue_dir
        self.task_name = task_name
        # Per task: indices of the columns whose sentences are augmented.
        self.augment_ids = {
            'MRPC': [3, 4], 'MNLI': [8, 9], 'CoLA': [3], 'SST-2': [0],
            'STS-B': [7, 8], 'QQP': [3, 4], 'QNLI': [1, 2], 'RTE': [1, 2],
        }
        # Per task: whether the first row is copied through unchanged
        # (presumably because it is a header row).
        self.filter_flags = {
            'MRPC': True, 'MNLI': True, 'CoLA': False, 'SST-2': True,
            'STS-B': True, 'QQP': True, 'QNLI': True, 'RTE': True,
        }
        assert self.task_name in self.augment_ids

    def read_augment_write(self):
        """Augment every training row and write the result next to the input.

        For each configured column, the cell is replaced in turn by every
        sentence returned by the augmentor and the row is written once per
        replacement.
        """
        task_dir = os.path.join(self.glue_dir, self.task_name)
        train_samples = _read_tsv(os.path.join(task_dir, "train.tsv"))
        output_filename = os.path.join(task_dir, "train_aug.tsv")

        columns = self.augment_ids[self.task_name]
        copy_first_row = self.filter_flags[self.task_name]

        with open(output_filename, 'w', newline='', encoding="utf-8") as out:
            writer = csv.writer(out, delimiter="\t")
            for row_no, row in enumerate(train_samples, start=1):
                if row_no == 1 and copy_first_row:
                    writer.writerow(row)
                    continue

                for column in columns:
                    for variant in self.augmentor.augment(row[column]):
                        # The row is modified in place, so after this loop the
                        # cell holds the last variant that was written.
                        row[column] = variant
                        writer.writerow(row)

                if row_no % 1000 == 0:
                    logger.info("Having been processing {} examples".format(str(row_no)))
def main():
    """Configure OneFlow, build the augmentor and augment the chosen task.

    The number of sampling rounds ``args.N`` is replaced by the per-task value
    from ``default_params`` whenever the task has an entry there.
    """
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.env.log_dir(args.log_dir)

    # Keys must match the task names accepted by AugmentProcessor. The STS-B
    # entry used to be spelled "STS-b" and could therefore never match.
    default_params = {
        "CoLA": {"N": 30},
        "MNLI": {"N": 10},
        "MRPC": {"N": 30},
        "SST-2": {"N": 20},
        "STS-B": {"N": 30},
        "QQP": {"N": 10},
        "QNLI": {"N": 20},
        "RTE": {"N": 30},
    }
    if args.task_name in default_params:
        args.N = default_params[args.task_name]["N"]

    # Prepare data augmentor
    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

    InitNodes(args)
    # The snapshot object is not used afterwards; it is still created and kept
    # bound because constructing it may have side effects
    # (NOTE(review): presumably it loads args.model_load_dir -- confirm).
    snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

    emb_norm, vocab, ids_to_tokens = prepare_embedding_retrieval(args.glove_embs)
    data_augmentor = DataAugmentor(tokenizer, emb_norm, vocab, ids_to_tokens, args.M, args.N, args.p)

    # Do data augmentation
    processor = AugmentProcessor(data_augmentor, args.glue_dir, args.task_name)
    processor.read_augment_write()


if __name__ == "__main__":
    main()
@@ -0,0 +1,224 @@ | |||||
""" | |||||
Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
import oneflow as flow | |||||
import sys | |||||
import os | |||||
curPath = os.path.abspath(os.path.dirname(__file__)) | |||||
rootPath = os.path.split(curPath)[0] | |||||
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "./src"))) | |||||
import bert as bert_util | |||||
import oneflow.core.operator.op_conf_pb2 as op_conf_util | |||||
def maskedBert(
    input_ids_blob,
    input_mask_blob,
    token_type_ids_blob,
    masked_lm_positions_blob,
    vocab_size,
    seq_length=512,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=16,
    initializer_range=0.02,
):
    """Build a BERT backbone with a masked-LM head and return its logits.

    The backbone is created with ``bert_util.BertBackbone``; its sequence
    output is gathered at ``masked_lm_positions_blob`` and projected onto the
    vocabulary with the backbone's embedding table by
    ``_AddMaskedLanguageModel``.

    Returns:
        The logit blob produced by ``_AddMaskedLanguageModel``.
    """
    encoder = bert_util.BertBackbone(
        input_ids_blob=input_ids_blob,
        input_mask_blob=input_mask_blob,
        token_type_ids_blob=token_type_ids_blob,
        vocab_size=vocab_size,
        seq_length=seq_length,
        hidden_size=hidden_size,
        num_hidden_layers=num_hidden_layers,
        num_attention_heads=num_attention_heads,
        intermediate_size=intermediate_size,
        hidden_act=hidden_act,
        hidden_dropout_prob=hidden_dropout_prob,
        attention_probs_dropout_prob=attention_probs_dropout_prob,
        max_position_embeddings=max_position_embeddings,
        type_vocab_size=type_vocab_size,
        initializer_range=initializer_range,
    )

    mlm_logits = _AddMaskedLanguageModel(
        input_blob=encoder.sequence_output(),
        output_weights_blob=encoder.embedding_table(),
        positions_blob=masked_lm_positions_blob,
        seq_length=seq_length,
        hidden_size=hidden_size,
        vocab_size=vocab_size,
        hidden_act=bert_util.GetActivation(hidden_act),
        initializer_range=initializer_range,
    )

    # The pooler result is not returned, but the pooler is still built.
    # NOTE(review): presumably kept so the "bert-pooler" variables exist in
    # the graph -- confirm before removing.
    pooled_output = PooledOutput(
        encoder.sequence_output(), hidden_size, initializer_range
    )

    return mlm_logits
def PooledOutput(sequence_output, hidden_size, initializer_range):
    """Pool the sequence output through a dense layer followed by tanh.

    The slice ``[None, 0, 0] .. [None, 1, -1]`` of ``sequence_output`` is
    reshaped to ``[-1, hidden_size]``, passed through a fully connected layer
    named "dense" of width ``hidden_size`` and then through ``tanh``. All ops
    are created under the "bert-pooler" namespace.
    """
    with flow.scope.namespace("bert-pooler"):
        first_token = flow.slice(sequence_output, [None, 0, 0], [None, 1, -1])
        first_token = flow.reshape(first_token, [-1, hidden_size])
        dense_out = bert_util._FullyConnected(
            first_token,
            input_size=hidden_size,
            units=hidden_size,
            weight_initializer=bert_util.CreateInitializer(initializer_range),
            name="dense",
        )
        pooled = flow.math.tanh(dense_out)
    return pooled
def _AddMaskedLanguageModelLoss(
    input_blob,
    output_weights_blob,
    positions_blob,
    label_id_blob,
    label_weight_blob,
    seq_length,
    hidden_size,
    vocab_size,
    max_predictions_per_seq,
    hidden_act,
    initializer_range,
):
    """Build the masked-LM head together with its weighted loss.

    Returns:
        A tuple ``(loss, pre_example_loss, logit_blob)``: the weighted loss,
        the per-prediction cross entropy reshaped to
        ``[-1, max_predictions_per_seq]`` and the vocabulary logits.
    """
    with flow.scope.namespace("other"):
        # Mean, over the batch, of the summed label weights per example. The
        # batch size is obtained by summing a blob of ones.
        weight_per_example = flow.math.reduce_sum(label_weight_blob, axis=[-1])
        ones = weight_per_example * 0.0 + 1.0
        weight_total = flow.math.reduce_sum(weight_per_example)
        batch_size = flow.math.reduce_sum(ones)
        mean_label_weight = weight_total / batch_size

    with flow.scope.namespace("cls-predictions"):
        hidden = _GatherIndexes(input_blob, positions_blob, seq_length, hidden_size)

        with flow.scope.namespace("transform"):
            # A callable activation is applied after the dense layer instead
            # of being handed to it.
            act_fn = op_conf_util.kNone if callable(hidden_act) else hidden_act
            hidden = bert_util._FullyConnected(
                hidden,
                input_size=hidden_size,
                units=hidden_size,
                activation=act_fn,
                weight_initializer=bert_util.CreateInitializer(initializer_range),
                name="dense",
            )
            if callable(hidden_act):
                hidden = hidden_act(hidden)
            hidden = bert_util._LayerNorm(hidden, hidden_size)

        output_bias = flow.get_variable(
            name="output_bias",
            shape=[vocab_size],
            dtype=hidden.dtype,
            initializer=flow.constant_initializer(1.0),
        )
        logit_blob = flow.matmul(hidden, output_weights_blob, transpose_b=True)
        logit_blob = flow.nn.bias_add(logit_blob, output_bias)

        flat_labels = flow.reshape(label_id_blob, [-1])
        pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logit_blob, labels=flat_labels
        )
        pre_example_loss = flow.reshape(pre_example_loss, [-1, max_predictions_per_seq])
        numerator = pre_example_loss * label_weight_blob

        with flow.scope.namespace("loss"):
            numerator = flow.math.reduce_sum(numerator, axis=[-1])
            # The small constant keeps the division defined when all label
            # weights are zero.
            denominator = mean_label_weight + 1e-5
            loss = numerator / denominator

        return loss, pre_example_loss, logit_blob
def _AddMaskedLanguageModel(
    input_blob,
    output_weights_blob,
    positions_blob,
    seq_length,
    hidden_size,
    vocab_size,
    hidden_act,
    initializer_range,
):
    """Build the masked-LM prediction head and return the vocabulary logits.

    The encoder output at ``positions_blob`` is passed through a dense layer,
    the activation and a layer norm, then multiplied with the transposed
    ``output_weights_blob`` and shifted by a learned bias.
    """
    with flow.scope.namespace("cls-predictions"):
        # Fetch the encoding of the masked word.
        hidden = _GatherIndexes(input_blob, positions_blob, seq_length, hidden_size)

        # Apply a non-linear transform before the output layer; it only takes
        # effect during pre-training.
        with flow.scope.namespace("transform"):
            # A callable activation is applied after the dense layer instead
            # of being handed to it.
            act_fn = op_conf_util.kNone if callable(hidden_act) else hidden_act
            hidden = bert_util._FullyConnected(
                hidden,
                input_size=hidden_size,
                units=hidden_size,
                activation=act_fn,
                weight_initializer=bert_util.CreateInitializer(initializer_range),
                name="dense",
            )
            if callable(hidden_act):
                hidden = hidden_act(hidden)
            hidden = bert_util._LayerNorm(hidden, hidden_size)

        # The output weights are the same as the word embedding passed in;
        # only a bias is added here.
        output_bias = flow.get_variable(
            name="output_bias",
            shape=[vocab_size],
            dtype=hidden.dtype,
            initializer=flow.constant_initializer(1.0),
        )
        logit_blob = flow.matmul(hidden, output_weights_blob, transpose_b=True)
        logit_blob = flow.nn.bias_add(logit_blob, output_bias)
        return logit_blob
def _GatherIndexes(sequence_blob, positions_blob, seq_length, hidden_size):
    """Gather entries of ``sequence_blob`` at ``positions_blob``.

    The gather runs with ``axis=2`` and ``batch_dims=2`` and the result is
    reshaped to ``[-1, hidden_size]``. ``seq_length`` is accepted but not
    used.
    """
    gathered = flow.gather(
        params=sequence_blob, indices=positions_blob, axis=2, batch_dims=2
    )
    return flow.reshape(gathered, [-1, hidden_size])
def _AddNextSentenceOutput(input_blob, label_blob, hidden_size, initializer_range):
    """Build the two-way next-sentence classifier and its loss.

    Returns:
        A tuple ``(loss, pre_example_loss, logit_blob)``; ``loss`` and
        ``pre_example_loss`` are the same sparse softmax cross-entropy blob.
    """
    with flow.scope.namespace("cls-seq_relationship"):
        weights = flow.get_variable(
            name="output_weights",
            shape=[2, hidden_size],
            dtype=input_blob.dtype,
            initializer=bert_util.CreateInitializer(initializer_range),
        )
        bias = flow.get_variable(
            name="output_bias",
            shape=[2],
            dtype=input_blob.dtype,
            initializer=flow.constant_initializer(0.0),
        )
        logit_blob = flow.nn.bias_add(
            flow.matmul(input_blob, weights, transpose_b=True), bias
        )
        pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logit_blob, labels=label_blob
        )
        loss = pre_example_loss
        return loss, pre_example_loss, logit_blob
@@ -0,0 +1,407 @@ | |||||
# coding=utf-8 | |||||
""" | |||||
Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
import json | |||||
import collections | |||||
import logging | |||||
import os | |||||
import shelve | |||||
from argparse import ArgumentParser | |||||
from pathlib import Path | |||||
from tqdm import tqdm, trange | |||||
from tempfile import TemporaryDirectory | |||||
from multiprocessing import Pool | |||||
import numpy as np | |||||
from random import random, randrange, randint, shuffle, choice | |||||
import sys | |||||
curPath = os.path.abspath(os.path.dirname(__file__)) | |||||
rootPath = os.path.split(curPath)[0] | |||||
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "./src"))) | |||||
from glue_ofrecord import tokenization | |||||
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', | |||||
datefmt='%m/%d/%Y %H:%M:%S', | |||||
level=logging.INFO) | |||||
logger = logging.getLogger(__name__) | |||||
class DocumentDatabase:
    """Hold documents in memory or, optionally, in an on-disk shelf.

    The length of every added document is recorded and used as its sampling
    weight in :meth:`sample_doc`. The object can be used as a context manager;
    leaving the context closes the shelf and removes its temporary directory.
    """

    def __init__(self, reduce_memory=False):
        """Create an empty database.

        Args:
            reduce_memory: When true, documents are stored in a ``shelve``
                file inside a fresh temporary directory instead of a list.
        """
        if reduce_memory:
            self.temp_dir = TemporaryDirectory()
            self.working_dir = Path(self.temp_dir.name)
            self.document_shelf_filepath = self.working_dir / 'shelf.db'
            # Open the shelf at the temporary path computed above rather than
            # at a hard-coded '/cache/shelf.db', which may not exist and would
            # survive the clean-up done in __exit__.
            self.document_shelf = shelve.open(str(self.document_shelf_filepath),
                                              flag='n', protocol=-1)
            self.documents = None
        else:
            self.documents = []
            self.document_shelf = None
            self.document_shelf_filepath = None
            self.temp_dir = None
        self.doc_lengths = []
        self.doc_cumsum = None
        self.cumsum_max = None
        self.reduce_memory = reduce_memory

    def add_document(self, document):
        """Append *document*; empty documents are ignored."""
        if not document:
            return
        if self.reduce_memory:
            # The next free index is the number of documents stored so far.
            current_idx = len(self.doc_lengths)
            self.document_shelf[str(current_idx)] = document
        else:
            self.documents.append(document)
        self.doc_lengths.append(len(document))

    def _precalculate_doc_weights(self):
        """Cache the cumulative document lengths used for weighted sampling."""
        self.doc_cumsum = np.cumsum(self.doc_lengths)
        self.cumsum_max = self.doc_cumsum[-1]

    def sample_doc(self, current_idx, sentence_weighted=True):
        """Return a random document other than the one at *current_idx*.

        Args:
            current_idx: Index of the document that must not be returned.
            sentence_weighted: When true, documents are drawn proportionally
                to their length; otherwise uniformly.
        """
        # Uses the current iteration counter to ensure we don't sample the same doc twice
        if sentence_weighted:
            # With sentence weighting, we sample docs proportionally to their sentence length
            if self.doc_cumsum is None or len(self.doc_cumsum) != len(self.doc_lengths):
                self._precalculate_doc_weights()
            # Draw from the span that starts right after the current document
            # and wraps around to just before it.
            rand_start = self.doc_cumsum[current_idx]
            rand_end = rand_start + self.cumsum_max - self.doc_lengths[current_idx]
            sentence_index = randrange(rand_start, rand_end) % self.cumsum_max
            sampled_doc_index = np.searchsorted(self.doc_cumsum, sentence_index, side='right')
        else:
            # If we don't use sentence weighting, then every doc has an equal chance to be chosen
            sampled_doc_index = (current_idx + randrange(1, len(self.doc_lengths))) % len(self.doc_lengths)
        assert sampled_doc_index != current_idx
        if self.reduce_memory:
            return self.document_shelf[str(sampled_doc_index)]
        return self.documents[sampled_doc_index]

    def __len__(self):
        """Return the number of stored documents."""
        return len(self.doc_lengths)

    def __getitem__(self, item):
        """Return the document stored at index *item*."""
        if self.reduce_memory:
            return self.document_shelf[str(item)]
        return self.documents[item]

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, traceback):
        """Close the shelf and delete the temporary directory, if any."""
        if self.document_shelf is not None:
            self.document_shelf.close()
        if self.temp_dir is not None:
            self.temp_dir.cleanup()
def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens):
    """Truncates a pair of sequences to a maximum sequence length. Lifted from Google's BERT repo.

    Both lists are shortened in place, one token at a time and always from the
    currently longer list (``tokens_b`` on a tie), until their combined length
    is at most ``max_num_tokens``.
    """
    while len(tokens_a) + len(tokens_b) > max_num_tokens:
        longer = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
        assert len(longer) >= 1

        # Drop from the front half of the time and from the back otherwise,
        # to add randomness and avoid a positional bias.
        if random() < 0.5:
            del longer[0]
        else:
            longer.pop()
# One masked position: the token index and the original token at that index.
MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
                                          ["index", "label"])


def create_masked_lm_predictions(tokens, masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list):
    """Creates the predictions for the masked LM objective. This is mostly copied from the Google BERT repo, but
    with several refactors to clean it up and remove a lot of unnecessary variables.

    ``tokens`` is modified in place. At least one and at most
    ``max_predictions_per_seq`` positions are selected.

    Returns:
        A tuple ``(tokens, mask_indices, masked_token_labels)`` with the
        modified token list, the selected positions in ascending order and
        the original tokens at those positions.
    """
    cand_indices = []
    for position, token in enumerate(tokens):
        if token in ("[CLS]", "[SEP]"):
            continue
        # Whole Word Masking means that we mask all of the wordpieces
        # corresponding to an original word. When a word has been split into
        # WordPieces, the first token does not have any marker and any
        # subsequent tokens are prefixed with ##. So whenever we see the ##
        # token, we append it to the previous set of word indexes.
        #
        # Note that Whole Word Masking does *not* change the training code
        # at all -- we still predict each WordPiece independently, softmaxed
        # over the entire vocabulary.
        continues_word = (whole_word_mask and len(cand_indices) >= 1
                          and token.startswith("##"))
        if continues_word:
            cand_indices[-1].append(position)
        else:
            cand_indices.append([position])

    num_to_mask = min(max_predictions_per_seq,
                      max(1, int(round(len(tokens) * masked_lm_prob))))
    shuffle(cand_indices)

    masked_lms = []
    covered_indexes = set()
    for index_set in cand_indices:
        if len(masked_lms) >= num_to_mask:
            break
        # If adding a whole-word mask would exceed the maximum number of
        # predictions, then just skip this candidate.
        if len(masked_lms) + len(index_set) > num_to_mask:
            continue
        if any(index in covered_indexes for index in index_set):
            continue
        for index in index_set:
            covered_indexes.add(index)

            if random() < 0.8:
                # 80% of the time, replace with [MASK]
                masked_token = "[MASK]"
            elif random() < 0.5:
                # 10% of the time, keep original
                masked_token = tokens[index]
            else:
                # 10% of the time, replace with random word
                masked_token = choice(vocab_list)
            masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
            tokens[index] = masked_token

    assert len(masked_lms) <= num_to_mask
    masked_lms.sort(key=lambda item: item.index)
    mask_indices = [item.index for item in masked_lms]
    masked_token_labels = [item.label for item in masked_lms]

    return tokens, mask_indices, masked_token_labels
def create_instances_from_document(
        doc_database, doc_idx, max_seq_length, short_seq_prob,
        masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list, bi_text=True):
    """This code is mostly a duplicate of the equivalent function from Google BERT's repo.
    However, we make some changes and improvements. Sampling is improved and no longer requires a loop in this function.
    Also, documents are sampled proportionally to the number of sentences they contain, which means each sentence
    (rather than each document) has an equal chance of being sampled as a false example for the NextSentence task.

    Returns:
        A list of dicts with the keys ``tokens``, ``segment_ids``,
        ``is_random_next``, ``masked_lm_positions`` and ``masked_lm_labels``.
    """
    document = doc_database[doc_idx]
    # Account for [CLS], [SEP], [SEP]
    max_num_tokens = max_seq_length - 3

    # We *usually* want to fill up the entire sequence since we are padding
    # to `max_seq_length` anyways, so short sequences are generally wasted
    # computation. However, we *sometimes*
    # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
    # sequences to minimize the mismatch between pre-training and fine-tuning.
    # The `target_seq_length` is just a rough target however, whereas
    # `max_seq_length` is a hard limit.
    target_seq_length = max_num_tokens
    if random() < short_seq_prob:
        target_seq_length = randint(2, max_num_tokens)

    # We DON'T just concatenate all of the tokens from a document into a long
    # sequence and choose an arbitrary split point because this would make the
    # next sentence prediction task too easy. Instead, we split the input into
    # segments "A" and "B" based on the actual "sentences" provided by the user
    # input.
    instances = []
    current_chunk = []
    current_length = 0
    i = 0
    while i < len(document):
        segment = document[i]
        current_chunk.append(segment)
        current_length += len(segment)
        chunk_is_complete = i == len(document) - 1 or current_length >= target_seq_length
        if chunk_is_complete:
            if current_chunk:
                # `a_end` is how many segments from `current_chunk` go into the `A`
                # (first) sentence.
                a_end = 1
                if len(current_chunk) >= 2:
                    a_end = randrange(1, len(current_chunk))

                tokens_a = [token for part in current_chunk[:a_end] for token in part]
                tokens_b = []

                if bi_text and (len(current_chunk) == 1 or random() < 0.5):
                    # Random next
                    is_random_next = True
                    target_b_length = target_seq_length - len(tokens_a)

                    # Sample a random document, with longer docs being sampled more frequently
                    random_document = doc_database.sample_doc(current_idx=doc_idx, sentence_weighted=True)

                    random_start = randrange(0, len(random_document))
                    for j in range(random_start, len(random_document)):
                        tokens_b.extend(random_document[j])
                        if len(tokens_b) >= target_b_length:
                            break
                    # We didn't actually use these segments so we "put them back" so
                    # they don't go to waste.
                    num_unused_segments = len(current_chunk) - a_end
                    i -= num_unused_segments
                else:
                    # Actual next
                    is_random_next = False
                    tokens_b = [token for part in current_chunk[a_end:] for token in part]

                # Neither side may be empty; fall back to a single period.
                if not tokens_a:
                    tokens_a = ["."]
                if not tokens_b:
                    tokens_b = ["."]

                assert len(tokens_a) >= 1
                assert len(tokens_b) >= 1

                truncate_seq_pair(tokens_a, tokens_b, max_num_tokens)

                tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
                # The segment IDs are 0 for the [CLS] token, the A tokens and the first [SEP]
                # They are 1 for the B tokens and the final [SEP]
                segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)

                tokens, masked_lm_positions, masked_lm_labels = create_masked_lm_predictions(
                    tokens, masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list)

                instances.append({
                    "tokens": tokens,
                    "segment_ids": segment_ids,
                    "is_random_next": is_random_next,
                    "masked_lm_positions": masked_lm_positions,
                    "masked_lm_labels": masked_lm_labels})
            current_chunk = []
            current_length = 0
        i += 1

    return instances
def create_training_file(docs, vocab_list, args, epoch_num, bi_text=True):
    """Write one epoch of training instances plus a small metrics file.

    Every instance is written as one JSON object per line to
    ``epoch_<n>.json`` in ``args.output_dir``; the instance count and
    ``args.max_seq_len`` are written to ``epoch_<n>_metrics.json``.

    Returns:
        The pair ``(epoch_filename, metrics_filename)``.
    """
    epoch_filename = args.output_dir / "epoch_{}.json".format(epoch_num)
    num_instances = 0
    with epoch_filename.open('w') as epoch_file:
        for doc_idx in trange(len(docs), desc="Document"):
            doc_instances = create_instances_from_document(
                docs, doc_idx, max_seq_length=args.max_seq_len, short_seq_prob=args.short_seq_prob,
                masked_lm_prob=args.masked_lm_prob, max_predictions_per_seq=args.max_predictions_per_seq,
                whole_word_mask=args.do_whole_word_mask, vocab_list=vocab_list, bi_text=bi_text)
            # Serialize the whole document first, then write line by line.
            serialized = [json.dumps(instance) for instance in doc_instances]
            for record in serialized:
                epoch_file.write(record + '\n')
                num_instances += 1

    metrics_filename = args.output_dir / "epoch_{}_metrics.json".format(epoch_num)
    metrics = {
        "num_training_examples": num_instances,
        "max_seq_len": args.max_seq_len
    }
    with metrics_filename.open('w') as metrics_file:
        metrics_file.write(json.dumps(metrics))

    return epoch_filename, metrics_filename
def main():
    """Tokenize a raw corpus and pregenerate per-epoch training files.

    The corpus is read line by line; a blank line marks a document boundary.
    Every non-blank line is tokenized and appended to the current document.
    After loading, one training file (plus metrics file) is written per epoch
    by ``create_training_file``, either serially or through a process pool.

    Raises:
        ValueError: if ``--num_workers > 1`` is combined with
            ``--reduce_memory``.
        SystemExit: if the corpus contains at most one document.
    """
    parser = ArgumentParser()
    parser.add_argument('--train_corpus', type=Path, required=True)
    parser.add_argument("--output_dir", type=Path, required=True)
    # parser.add_argument("--bert_model", type=str, required=True)
    parser.add_argument(
        '--vocab_file',
        help='The vocabulary file that the BERT model was trained on.',
        default=None,
        type=str,
        required=True
    )
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument("--do_whole_word_mask", action="store_true",
                        help="Whether to use whole word masking rather than per-WordPiece masking.")
    parser.add_argument("--reduce_memory", action="store_true",
                        help="Reduce memory usage for large datasets by keeping data on disc rather than in memory")
    parser.add_argument("--num_workers", type=int, default=1,
                        help="The number of workers to use to write the files")
    parser.add_argument("--epochs_to_generate", type=int, default=3,
                        help="Number of epochs of data to pregenerate")
    parser.add_argument("--max_seq_len", type=int, default=128)
    parser.add_argument("--short_seq_prob", type=float, default=0.1,
                        help="Probability of making a short sentence as a training example")
    parser.add_argument("--masked_lm_prob", type=float, default=0.0,
                        help="Probability of masking each token for the LM task")  # no [mask] symbol in corpus
    parser.add_argument("--max_predictions_per_seq", type=int, default=20,
                        help="Maximum number of tokens to mask in each sequence")
    parser.add_argument('--oneseq', action='store_true')
    args = parser.parse_args()

    if args.num_workers > 1 and args.reduce_memory:
        raise ValueError("Cannot use multiple workers while reducing memory")

    # tokenizer = BertTokenizer.from_pretrained(args.bert_model) #, do_lower_case=args.do_lower_case
    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)
    vocab_list = list(tokenizer.vocab.keys())
    # --oneseq switches off sentence-pair instances for every epoch,
    # regardless of whether the files are written serially or in parallel.
    bi_text = not args.oneseq
    doc_num = 0
    with DocumentDatabase(reduce_memory=args.reduce_memory) as docs:
        with args.train_corpus.open(encoding="utf-8") as f:
            doc = []
            for line in tqdm(f, desc="Loading Dataset", unit=" lines"):
                line = line.strip()
                if line == "":
                    # Blank line: close the current document.
                    docs.add_document(doc)
                    doc = []
                    doc_num += 1
                    if doc_num % 100 == 0:
                        logger.info('loaded {} docs!'.format(doc_num))
                else:
                    tokens = tokenizer.tokenize(line)
                    doc.append(tokens)
            if doc:
                docs.add_document(doc)  # If the last doc didn't end on a newline, make sure it still gets added
        if len(docs) <= 1:
            raise SystemExit(
                "ERROR: No document breaks were found in the input file! These are necessary to allow the script to "
                "ensure that random NextSentences are not sampled from the same document. Please add blank lines to "
                "indicate breaks between documents in your input file. If your dataset does not contain multiple "
                "documents, blank lines can be inserted at any natural boundary, such as the ends of chapters, "
                "sections or paragraphs.")

        # parents=True so a nested output path does not fail on a fresh checkout.
        args.output_dir.mkdir(parents=True, exist_ok=True)

        if args.num_workers > 1:
            # Pass bi_text explicitly: previously the parallel path dropped it
            # and always generated sentence pairs, ignoring --oneseq.
            arguments = [(docs, vocab_list, args, idx, bi_text)
                         for idx in range(args.epochs_to_generate)]
            with Pool(min(args.num_workers, args.epochs_to_generate)) as writer_workers:
                writer_workers.starmap(create_training_file, arguments)
        else:
            for epoch in trange(args.epochs_to_generate, desc="Epoch"):
                create_training_file(docs, vocab_list, args, epoch, bi_text=bi_text)


if __name__ == '__main__':
    main()
@@ -0,0 +1,32 @@ | |||||
# Data augmentation: expands the task-specific training set.
nvidia-smi

# BERT-base teacher model used by data_augmentation.py
PRETRAINED_MODEL='../../models/uncased_L-12_H-768_A-12_oneflow'
VOCAB_FILE='../../src/glue_ofrecord/vocab.txt'
GLOVE_EMB='../../glove/glove.840B.300d.txt'
GLUE_DIR='../../data/glue_data'
TASK_NAME=SST-2
GPU=0

CUDA_VISIBLE_DEVICES="${GPU}" python3 data_augmentation.py \
  --model_load_dir="${PRETRAINED_MODEL}" \
  --model_save_dir=./snapshots \
  --vocab_file "${VOCAB_FILE}" \
  --do_lower_case \
  --glove_embs "${GLOVE_EMB}" \
  --glue_dir "${GLUE_DIR}" \
  --task_name "${TASK_NAME}" \
  --log_dir=./log \
  --save_last_snapshot=True \
  --gpu_num_per_node=1 \
  --seq_length=512 \
  --num_hidden_layers=12 \
  --num_attention_heads=12 \
  --max_position_embeddings=512 \
  --type_vocab_size=2 \
  --vocab_size=30522 \
  --attention_probs_dropout_prob=0.1 \
  --hidden_dropout_prob=0.1 \
  --hidden_size_per_head=64
@@ -0,0 +1,16 @@ | |||||
# Pregenerates JSON training data from a raw text corpus.
# pregenerate_training_data.py relies on the `tqdm` package.
nvidia-smi

# Corpus choices: wikitext-103-raw, wikitext-2-raw
CORPUS_RAW='./data/wikitext-103-raw/wiki.train.raw'
VOCAB_FILE='./glue_ofrecord/vocab.txt'
OUTPUT_DIR='./data/pretrain_data_json'
GPU=0

CUDA_VISIBLE_DEVICES="${GPU}" python pregenerate_training_data.py \
  --train_corpus "${CORPUS_RAW}" \
  --vocab_file "${VOCAB_FILE}" \
  --do_lower_case \
  --epochs_to_generate 3 \
  --output_dir "${OUTPUT_DIR}"
@@ -0,0 +1,516 @@ | |||||
""" | |||||
Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
import os | |||||
import math | |||||
import numpy as np | |||||
import sys | |||||
curPath = os.path.abspath(os.path.dirname(__file__)) | |||||
rootPath = os.path.split(curPath)[0] | |||||
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "./src"))) | |||||
import oneflow as flow | |||||
import oneflow.typing as tp | |||||
from classifier import GlueBERT | |||||
from util import Snapshot, Summary, InitNodes, Metric, CreateOptimizer, GetFunctionConfig, getdirsize, \ | |||||
remove_optimizer_params, remove_teacher_params | |||||
import config as configs | |||||
from sklearn.metrics import accuracy_score, matthews_corrcoef, precision_score, recall_score, f1_score | |||||
import argparse | |||||
import shutil | |||||
import tempfile | |||||
import time | |||||
import json | |||||
from knowledge_distill_util import BertForSequenceClassification, BertStudentForSequenceClassification, \ | |||||
soft_cross_entropy, mseloss, layer_distill, att_distill, pred_distill | |||||
def str2bool(v):
    """Parse a command-line string into a boolean.

    Accepts (case-insensitively) ``yes/true/t/y/1`` as True and
    ``no/false/f/n/0`` as False.

    Raises:
        argparse.ArgumentTypeError: for any other value.
    """
    verdicts = dict.fromkeys(('yes', 'true', 't', 'y', '1'), True)
    verdicts.update(dict.fromkeys(('no', 'false', 'f', 'n', '0'), False))

    verdict = verdicts.get(v.lower())
    if verdict is None:
        raise argparse.ArgumentTypeError('Unsupported value encountered.')
    return verdict
# ---------------------------------------------------------------------------
# Command-line interface: extends the shared parser from ``config`` with the
# task, data and teacher/student architecture options used by this script.
# ---------------------------------------------------------------------------
parser = configs.get_parser()
parser.add_argument("--task_name", type=str, default='CoLA')
parser.add_argument("--teacher_model", default=None, type=str, help="The teacher model dir.")
parser.add_argument("--student_model", default=None, type=str, help="The student model dir.")
# main() overwrites this with a temp dir holding both teacher and student
# checkpoints; the help text previously duplicated the student one.
parser.add_argument("--total_model", default=None, type=str,
                    help="The dir holding the combined teacher and student models.")
parser.add_argument('--num_epochs', type=int, default=3, help='number of epochs')

# Training data
parser.add_argument("--train_data_dir", type=str, default=None)
parser.add_argument("--train_data_prefix", type=str, default='train.of_record-')
parser.add_argument("--train_example_num", type=int, default=88614,
                    help="example number in dataset")
parser.add_argument("--batch_size_per_device", type=int, default=32)
parser.add_argument("--train_data_part_num", type=int, default=1,
                    help="data part number in dataset")

# Evaluation data
parser.add_argument("--eval_data_dir", type=str, default=None)
parser.add_argument("--eval_data_prefix", type=str, default='eval.of_record-')
parser.add_argument("--eval_example_num", type=int, default=10833,
                    help="example number in dataset")
parser.add_argument("--eval_batch_size_per_device", type=int, default=64)
parser.add_argument("--eval_data_part_num", type=int, default=1,
                    help="data part number in dataset")
parser.add_argument("--result_dir", type=str, default="", help="the save directory of results")

# Student architecture
parser.add_argument("--student_num_hidden_layers", type=int, default=24)
parser.add_argument("--student_num_attention_heads", type=int, default=16)
parser.add_argument("--student_max_position_embeddings", type=int, default=512)
parser.add_argument("--student_type_vocab_size", type=int, default=2)
parser.add_argument("--student_vocab_size", type=int, default=30522)
parser.add_argument("--student_attention_probs_dropout_prob", type=float, default=0.1)
parser.add_argument("--student_hidden_dropout_prob", type=float, default=0.1)
parser.add_argument("--student_hidden_size_per_head", type=int, default=64)
parser.add_argument("--student_hidden_size", type=int, default=768)

# Teacher architecture
parser.add_argument("--teacher_num_hidden_layers", type=int, default=24)
parser.add_argument("--teacher_num_attention_heads", type=int, default=16)
parser.add_argument("--teacher_max_position_embeddings", type=int, default=512)
parser.add_argument("--teacher_type_vocab_size", type=int, default=2)
parser.add_argument("--teacher_vocab_size", type=int, default=30522)
parser.add_argument("--teacher_attention_probs_dropout_prob", type=float, default=0.1)
parser.add_argument("--teacher_hidden_dropout_prob", type=float, default=0.1)
parser.add_argument("--teacher_hidden_size_per_head", type=int, default=64)
parser.add_argument("--teacher_hidden_size", type=int, default=768)

# Distillation switches
parser.add_argument('--intermediate_distill', type=str2bool, nargs='?', const=True,
                    help='distill attention, intermediate and embedding information')
parser.add_argument('--pred_distill', type=str2bool, nargs='?', const=True, help='distill prediction layer')
parser.add_argument('--temperature', type=float, default=1.)
# NOTE(review): const=False means a bare `--aug_train` (no value) yields False,
# the opposite of the two flags above -- confirm this is intended.
parser.add_argument('--aug_train', type=str2bool, nargs='?', const=False, help='using augmented training set?')
parser.add_argument('--serve_for_online', type=str2bool, nargs='?', const=False,
                    help='if serve for online, then after training, will delete the teacher params and optimizer parmas from model_save_dir')
args = parser.parse_args()

task_name = args.task_name.lower()
if args.aug_train:
    # The augmented set lives next to the regular one, under 'train_aug'.
    args.train_data_dir = args.train_data_dir.replace('train', 'train_aug')

# Global batch sizes across all nodes and devices, and the step counts
# derived from them (the last partial batch counts as a full step).
batch_size = args.num_nodes * args.gpu_num_per_node * args.batch_size_per_device
eval_batch_size = args.num_nodes * args.gpu_num_per_node * args.eval_batch_size_per_device
epoch_size = math.ceil(args.train_example_num / batch_size)
num_eval_steps = math.ceil(args.eval_example_num / eval_batch_size)
args.iter_num = epoch_size * args.num_epochs
configs.print_args(args)

# GLUE task -> kind of prediction head.
glue_output_modes = {
    "cola": "classification",
    "mnli": "classification",
    "mnli-mm": "classification",
    "mrpc": "classification",
    "sst-2": "classification",
    "sts-b": "regression",
    "qqp": "classification",
    "qnli": "classification",
    "rte": "classification",
    "wnli": "classification",
}
# Which dev-set metric selects the best checkpoint for each task.
acc_tasks = ["mnli", "mrpc", "sst-2", "qqp", "qnli", "rte"]
corr_tasks = ["sts-b"]
mcc_tasks = ["cola"]
output_mode = glue_output_modes[task_name]
def BertDecoder(
        data_dir, batch_size, data_part_num, seq_length, part_name_prefix, shuffle=True
):
    """Build the OFRecord reader and raw decoders for the GLUE input fields.

    Returns a dict with one int32 decoder per field: ``input_ids``,
    ``input_mask`` and ``segment_ids`` (shape ``[seq_length]``) plus
    ``label_ids`` and ``is_real_example`` (shape ``[1]``).  The reader and
    the decoders are placed on CPU device ``0:0``.
    """
    field_shapes = (
        ("input_ids", [seq_length]),
        ("input_mask", [seq_length]),
        ("segment_ids", [seq_length]),
        ("label_ids", [1]),
        ("is_real_example", [1]),
    )
    with flow.scope.placement("cpu", "0:0"):
        records = flow.data.ofrecord_reader(data_dir,
                                            batch_size=batch_size,
                                            data_part_num=data_part_num,
                                            part_name_prefix=part_name_prefix,
                                            random_shuffle=shuffle,
                                            shuffle_after_epoch=shuffle)
        return {
            field: flow.data.OFRecordRawDecoder(records, field, shape=shape, dtype=flow.int32)
            for field, shape in field_shapes
        }
def get_tensor_data(
        batch_size,
        data_part_num,
        data_dir,
        part_name_prefix,
        shuffle=True
):
    """Return the decoded input blobs for ``data_dir``.

    Thin wrapper over ``BertDecoder`` that supplies the globally configured
    ``args.seq_length``.
    """
    return BertDecoder(
        data_dir, batch_size, data_part_num, args.seq_length, part_name_prefix, shuffle=shuffle
    )
def BuildBert(
        batch_size,
        data_part_num,
        data_dir,
        part_name_prefix,
        shuffle=True
):
    """Build a GlueBERT classifier graph on top of the decoded dataset.

    Side effect: stores ``args.hidden_size / args.num_attention_heads`` in
    ``args.hidden_size_per_head``.

    Returns:
        ``(loss, logits, label_ids)``.
    """
    width = args.hidden_size
    args.hidden_size_per_head = width / args.num_attention_heads

    batch = BertDecoder(
        data_dir, batch_size, data_part_num, args.seq_length, part_name_prefix, shuffle=shuffle
    )
    label_ids = batch['label_ids']

    loss, logits = GlueBERT(
        batch['input_ids'],
        batch['input_mask'],
        batch['segment_ids'],
        label_ids,
        args.vocab_size,
        seq_length=args.seq_length,
        hidden_size=width,
        num_hidden_layers=args.num_hidden_layers,
        num_attention_heads=args.num_attention_heads,
        # Fixed feed-forward width (not the usual hidden_size * 4).
        intermediate_size=1200,
        hidden_act="gelu",
        hidden_dropout_prob=args.hidden_dropout_prob,
        attention_probs_dropout_prob=args.attention_probs_dropout_prob,
        max_position_embeddings=args.max_position_embeddings,
        type_vocab_size=args.type_vocab_size,
        initializer_range=0.02,
    )
    return loss, logits, label_ids
def student_model(input_ids, input_mask, segment_ids, is_train=True):
    """Build the student network and return ``(logits, reps, atts)``.

    All architecture settings come from the ``--student_*`` options;
    ``fit_size`` is set to the teacher's hidden size.  Side effect: stores
    ``student_hidden_size / student_num_attention_heads`` in
    ``args.student_hidden_size_per_head``.
    """
    width = args.student_hidden_size
    args.student_hidden_size_per_head = width / args.student_num_attention_heads

    student_kwargs = dict(
        input_ids_blob=input_ids,
        input_mask_blob=input_mask,
        token_type_ids_blob=segment_ids,
        label_blob=None,
        vocab_size=args.student_vocab_size,
        seq_length=args.seq_length,
        hidden_size=width,
        num_hidden_layers=args.student_num_hidden_layers,
        num_attention_heads=args.student_num_attention_heads,
        # Fixed feed-forward width (not hidden_size * 4).
        intermediate_size=1200,
        hidden_act="gelu",
        hidden_dropout_prob=args.student_hidden_dropout_prob,
        attention_probs_dropout_prob=args.student_attention_probs_dropout_prob,
        max_position_embeddings=args.student_max_position_embeddings,
        type_vocab_size=args.student_type_vocab_size,
        initializer_range=0.02,
        is_student=True,
        fit_size=args.teacher_hidden_size,
        is_train=is_train,
    )
    student_logits, student_reps, student_atts = BertStudentForSequenceClassification(**student_kwargs)
    return student_logits, student_reps, student_atts
def teacher_model(input_ids, input_mask, segment_ids, is_train):
    """Build the teacher network and return ``(logits, reps, atts)``.

    All architecture settings come from the ``--teacher_*`` options; the
    feed-forward width is four times the teacher hidden size.  Side effect:
    stores ``teacher_hidden_size / teacher_num_attention_heads`` in
    ``args.teacher_hidden_size_per_head``.
    """
    teacher_hidden_size = args.teacher_hidden_size
    args.teacher_hidden_size_per_head = teacher_hidden_size / args.teacher_num_attention_heads
    intermediate_size = teacher_hidden_size * 4
    logits, reps, atts = BertForSequenceClassification(
        input_ids_blob=input_ids,
        input_mask_blob=input_mask,
        token_type_ids_blob=segment_ids,
        label_blob=None,
        # Use the dedicated --teacher_vocab_size option (it was parsed but
        # ignored in favour of the generic --vocab_size), mirroring how the
        # student model reads --student_vocab_size.
        vocab_size=args.teacher_vocab_size,
        seq_length=args.seq_length,
        hidden_size=teacher_hidden_size,
        num_hidden_layers=args.teacher_num_hidden_layers,
        num_attention_heads=args.teacher_num_attention_heads,
        intermediate_size=intermediate_size,
        hidden_act="gelu",
        hidden_dropout_prob=args.teacher_hidden_dropout_prob,
        attention_probs_dropout_prob=args.teacher_attention_probs_dropout_prob,
        max_position_embeddings=args.teacher_max_position_embeddings,
        type_vocab_size=args.teacher_type_vocab_size,
        initializer_range=0.02,
        is_student=False,
        is_train=is_train
    )
    return logits, reps, atts
@flow.global_function(type='train', function_config=GetFunctionConfig(args))
def DistilJob():
    """Training job: distil the teacher into the student on one batch.

    The loss is the sum of the enabled terms:
      * ``--intermediate_distill``: attention loss + hidden-representation loss;
      * ``--pred_distill``: prediction-layer loss (classification only).

    Returns:
        ``{'loss': loss}``.

    Raises:
        NotImplementedError: if ``--pred_distill`` is requested for a task
            whose output mode is not classification (the regression loss is
            still a TODO).
    """
    train_dataset = get_tensor_data(
        batch_size,
        args.train_data_part_num,
        args.train_data_dir,
        args.train_data_prefix,
    )
    student_logits, student_reps, student_atts = student_model(
        train_dataset['input_ids'], train_dataset['input_mask'],
        train_dataset['segment_ids'], is_train=True)
    # The teacher is only run forward (is_train=False).
    teacher_logits, teacher_reps, teacher_atts = teacher_model(
        train_dataset['input_ids'], train_dataset['input_mask'],
        train_dataset['segment_ids'], is_train=False)

    loss = 0.
    if args.intermediate_distill:
        rep_loss = layer_distill(args, student_reps, teacher_reps)
        att_loss = att_distill(args, student_atts, teacher_atts)
        loss += att_loss + rep_loss
    if args.pred_distill:
        if output_mode == "classification":
            cls_loss = pred_distill(args, student_logits, teacher_logits)
        else:
            # Previously this branch fell through with `cls_loss` unbound and
            # crashed on `loss += cls_loss`; fail with a clear message instead.
            # TODO: regression would use an MSE loss between the student
            # logits and the labels.
            raise NotImplementedError(
                "pred_distill is not implemented for output mode '{}'".format(output_mode))
        loss += cls_loss

    flow.losses.add_loss(loss)
    opt = CreateOptimizer(args)
    opt.minimize(loss)
    return {'loss': loss}
# | |||||
@flow.global_function(type='predict', function_config=GetFunctionConfig(args))
def StudentBertGlueEvalTrainJob():
    """Predict job: student logits and labels for one unshuffled training batch."""
    batch = get_tensor_data(
        batch_size,
        args.train_data_part_num,
        args.train_data_dir,
        args.train_data_prefix,
        shuffle=False
    )
    # Only the logits are needed for evaluation.
    student_logits, _reps, _atts = student_model(
        batch['input_ids'], batch['input_mask'], batch['segment_ids'], is_train=False)
    return student_logits, batch['label_ids']
@flow.global_function(type='predict', function_config=GetFunctionConfig(args))
def StudentBertGlueEvalValJob():
    """Predict job: student logits and labels for one unshuffled eval batch."""
    batch = get_tensor_data(
        eval_batch_size,
        args.eval_data_part_num,
        args.eval_data_dir,
        args.eval_data_prefix,
        shuffle=False
    )
    # Only the logits are needed for evaluation.
    student_logits, _reps, _atts = student_model(
        batch['input_ids'], batch['input_mask'], batch['segment_ids'], is_train=False)
    return student_logits, batch['label_ids']
# | |||||
def run_eval_job(eval_job_func, num_steps, desc='train'):
    """Run an evaluation job for ``num_steps`` batches and report metrics.

    Predictions are the arg-max over the logits' second axis.  Accuracy, MCC,
    precision, recall and F1 are computed with scikit-learn, printed, and
    written (with the model size and throughput) to
    ``<result_dir>/results_<desc>.json``.

    Args:
        eval_job_func: job whose ``.get()`` result unpacks to
            ``(logits, label)``.
        num_steps: number of batches to evaluate.
        desc: tag used in the result file name and the printed summary.

    Returns:
        Dict with keys ``accuracy``, ``matthews_corrcoef``, ``precision``,
        ``recall`` and ``f1``.
    """
    labels = []
    predictions = []
    num_classes = 2
    start_time = time.time()
    for _ in range(num_steps):
        logits, label = eval_job_func().get()
        scores = logits.numpy()
        num_classes = scores.shape[1]
        predictions.extend(list(scores.argmax(axis=1)))
        labels.extend(list(label))
    cost_time = time.time() - start_time
    print('cost time: {} s'.format(cost_time))

    model_size = getdirsize(args.model_save_dir)
    print('model_size: %d Mbytes' % (model_size / 1024 / 1024))  # Mbytes

    # scikit-learn's default average='binary' raises on multi-class targets
    # (e.g. MNLI, which is listed in acc_tasks); use macro averaging there and
    # keep the binary behaviour for two-class tasks.
    average = 'binary' if num_classes <= 2 else 'macro'
    accuracy = accuracy_score(labels, predictions)
    mcc = matthews_corrcoef(labels, predictions)
    precision = precision_score(labels, predictions, average=average)
    recall = recall_score(labels, predictions, average=average)
    f_1 = f1_score(labels, predictions, average=average)

    # NOTE(review): throughput always divides args.eval_example_num by the
    # elapsed time, even when desc == 'train' -- confirm that is intended.
    save_dict = {"accuracy": "%.2f" % accuracy,
                 "MCC": "%.2f" % mcc,
                 "precision": "%.2f" % precision,
                 "recall": "%.2f" % recall,
                 "f_1": "%.2f" % f_1,
                 "modelSize": "%d" % (model_size / 1024 / 1024),
                 "reasoningTime": "%.2f" % (args.eval_example_num / cost_time)}  # sample/second

    # Results default to the model directory when no result dir is given.
    if args.result_dir == "":
        args.result_dir = args.model_save_dir
    if not os.path.exists(args.result_dir):
        os.makedirs(args.result_dir)
    with open(os.path.join(args.result_dir, 'results_{}.json'.format(desc)), "w") as f:
        json.dump(save_dict, f)

    metric_dict = {
        "accuracy": accuracy,
        "matthews_corrcoef": mcc,
        "precision": precision,
        "recall": recall,
        "f1": f_1,
    }
    print(desc, ', '.join('{}: {:.3f}'.format(k, v) for k, v in metric_dict.items()))
    return metric_dict
def CopyFile(filepath, newPath):
    """Recursively copy the contents of ``filepath`` into ``newPath``.

    Regular files are copied with ``shutil.copyfile`` (overwriting files of
    the same name); every other entry is treated as a directory, created
    under ``newPath`` when missing, and descended into.  ``newPath`` itself
    must already exist.
    """
    for entry in os.listdir(filepath):
        source = os.path.join(filepath, entry)
        target = os.path.join(newPath, entry)
        if os.path.isfile(source):
            shutil.copyfile(source, target)
            continue
        if not os.path.exists(target):
            os.makedirs(target)
        CopyFile(source, target)
# Entry point of the distillation script: optionally trains the student
# against the teacher (--do_train) and/or evaluates the student (--do_eval).
# NOTE(review): indentation is not visible in this view, so the exact nesting
# of the save / cleanup branches below should be confirmed against the file.
def main(): | |||||
# Device count, log directory and node setup come from the parsed args.
flow.config.gpu_device_num(args.gpu_num_per_node) | |||||
flow.env.log_dir(args.log_dir) | |||||
InitNodes(args) | |||||
check_point = flow.train.CheckPoint() | |||||
summary = Summary(args.log_dir, args) | |||||
if not os.path.exists(args.model_save_dir): | |||||
os.makedirs(args.model_save_dir) | |||||
if args.do_train: | |||||
# Student and teacher checkpoints are copied into one fresh temp dir under
# ./tmp so that a single check_point.load() call restores both.
print('Combining two models into one dir') | |||||
if not os.path.exists('./tmp'): | |||||
os.makedirs('./tmp') | |||||
args.total_model = tempfile.mkdtemp(dir='./tmp') | |||||
CopyFile(args.student_model, args.total_model) | |||||
CopyFile(args.teacher_model, args.total_model) | |||||
print('Loading model...') | |||||
check_point.load(args.total_model) | |||||
# # check_point.load(args.teacher_model) | |||||
# # check_point.load(args.student_model) | |||||
# | |||||
print('Start training...') | |||||
global_step = 0 | |||||
best_dev_acc = 0.0 | |||||
for epoch in range(args.num_epochs): | |||||
metric = Metric(desc='finetune', print_steps=args.loss_print_every_n_iter, summary=summary, | |||||
batch_size=batch_size, keys=['loss']) | |||||
# One epoch = epoch_size asynchronous DistilJob steps; the metric callback
# receives each step's result.
for step in range(epoch_size): | |||||
DistilJob().async_get(metric.metric_cb(step, epoch=epoch)) | |||||
global_step += 1 | |||||
# if (global_step + 1) % args.model_save_every_n_iter == 0: | |||||
# if not os.path.exists(args.model_save_dir): | |||||
# os.makedirs(args.model_save_dir) | |||||
# snapshot_save_path = os.path.join( | |||||
# args.model_save_dir, "snapshot_%d" % (global_step + 1) | |||||
# ) | |||||
# print("Saving model to {}.".format(snapshot_save_path)) | |||||
# check_point.save(snapshot_save_path) | |||||
# if args.pred_distill: | |||||
# The student is evaluated on both the training and the eval set.
print('EvalTrainJob...') | |||||
run_eval_job(StudentBertGlueEvalTrainJob, epoch_size, desc='train') | |||||
print('EvalValJob...') | |||||
result = run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval') | |||||
# Without prediction distillation the model is always saved; otherwise it is
# saved only when the task's dev metric beats the best value so far.
if not args.pred_distill: | |||||
save_model = True | |||||
else: | |||||
save_model = False | |||||
if task_name in acc_tasks and result['accuracy'] > best_dev_acc: | |||||
best_dev_acc = result['accuracy'] | |||||
save_model = True | |||||
# if task_name in corr_tasks and result['corr'] > best_dev_acc: | |||||
# best_dev_acc = result['corr'] | |||||
# save_model = True | |||||
if task_name in mcc_tasks and result['matthews_corrcoef'] > best_dev_acc: | |||||
best_dev_acc = result['matthews_corrcoef'] | |||||
save_model = True | |||||
# NOTE(review): this prints the current `result`, which is not necessarily
# the best one seen so far.
print('Best result:', result) | |||||
if save_model: | |||||
# The previous contents of model_save_dir are deleted before saving.
if os.path.exists(args.model_save_dir): | |||||
import shutil | |||||
shutil.rmtree(args.model_save_dir) | |||||
if not os.path.exists(args.model_save_dir): | |||||
os.makedirs(args.model_save_dir) | |||||
snapshot_save_path = os.path.join(args.model_save_dir) | |||||
print("Saving best model to {}".format(snapshot_save_path)) | |||||
check_point.save(snapshot_save_path) | |||||
flow.sync_default_session() | |||||
# NOTE(review): this also deletes model_save_dir first, so the last snapshot
# replaces any "best" model saved above -- confirm that is intended.
if args.save_last_snapshot: | |||||
snapshot_save_path = args.model_save_dir | |||||
if os.path.exists(args.model_save_dir): | |||||
import shutil | |||||
shutil.rmtree(args.model_save_dir) | |||||
print("Saving model to {}".format(snapshot_save_path)) | |||||
check_point.save(snapshot_save_path) | |||||
flow.sync_default_session() | |||||
# NOTE(review): the temp dir is removed only after at least 100 training
# steps; shorter runs leave it under ./tmp.
if global_step >= 100: | |||||
# remove tmp total models | |||||
print('Removing the tmp models...') | |||||
import shutil | |||||
shutil.rmtree(args.total_model) | |||||
if args.serve_for_online: | |||||
print('Deleting the teacher params and the optimizer parmas from model_save_dir...') | |||||
remove_teacher_params(args.model_save_dir) | |||||
if args.do_eval: | |||||
print('Loading model...') | |||||
print(args.model_save_dir) | |||||
# After training the weights are already in memory; load only for eval-only runs.
if not args.do_train: | |||||
check_point.load(args.model_save_dir) | |||||
print('Evaluation...') | |||||
run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval') | |||||
# if args.save_last_snapshot: | |||||
# snapshot.save("last_snapshot") | |||||
if __name__ == "__main__": | |||||
main() |
@@ -0,0 +1,8 @@ | |||||
# Copyright (c) The Tianshu Platform Authors. | |||||
# Licensed under the Apache License | |||||
# Converts a GLUE task into the ofrecord format via glue_process.py.
# Optional extra flag: --aug_train True
# NOTE(review): the vocab path has no ./src prefix while the script path does -- confirm.
TASK=SST-2

python ./src/glue_ofrecord/glue_process.py \
  --data_dir "./data/glue_data/${TASK}" \
  --output_dir "./data/glue_ofrecord_test/${TASK}" \
  --vocab_file ./glue_ofrecord/vocab.txt \
  --do_lower_case True \
  --max_seq_length 128 \
  --do_train True \
  --do_eval True \
  --do_predict True \
  --task="${TASK}"
@@ -0,0 +1,4 @@ | |||||
# Copyright (c) The Tianshu Platform Authors. | |||||
# Licensed under the Apache License | |||||
# Fetches all GLUE tasks into ../data/glue_data.
python ../src/download_glue_data.py \
  --data_dir ../data/glue_data \
  --tasks all
@@ -0,0 +1,95 @@ | |||||
# Copyright (c) The Tianshu Platform Authors. | |||||
# Licensed under the Apache License | |||||
# Evaluates a saved BERT-PKD student model on one GLUE task.
dataset=SST-2

# ofrecord dataset dir
DATA_ROOT=./data/glue_ofrecord

# which GPU to use
GPU=0

# Example counts per supported GLUE task.
case "$dataset" in
  CoLA)  train_example_num=8551;   eval_example_num=1043;  test_example_num=1063 ;;
  MRPC)  train_example_num=3668;   eval_example_num=408;   test_example_num=1725 ;;
  SST-2) train_example_num=67349;  eval_example_num=872;   test_example_num=1821 ;;
  QQP)   train_example_num=363849; eval_example_num=40430; test_example_num=0 ;;
  MNLI)  train_example_num=392702; eval_example_num=9815;  test_example_num=0 ;;
  WNLI)  train_example_num=635;    eval_example_num=71;    test_example_num=0 ;;
  RTE)   train_example_num=2490;   eval_example_num=277;   test_example_num=0 ;;
  *)
    # Report the tasks handled above and fail with a non-zero status; a bare
    # `exit` would return the status of `echo`, i.e. success.
    echo "dataset must be one of 'CoLA','MRPC','SST-2','QQP','MNLI','WNLI','RTE'" >&2
    exit 1
    ;;
esac

LAYER_NUM=3
KD_ALPHA=0.2
KD_BETA=10

# `epoch` was never defined, so --num_epochs received an empty value.
# Keep any value exported by the caller; otherwise use 4, matching the
# "epoch-4" in STUDENT_DIR below.
epoch=${epoch:-4}

train_data_dir=$DATA_ROOT/${dataset}/train
eval_data_dir=$DATA_ROOT/${dataset}/eval

# saved student model dir
STUDENT_DIR="./models/student_model/SST-2/bert-pkd_3_epoch-4_lr-2e-5_wd-0.0001_kd_alpha-0.2_kd_beta-10"
RESULT_DIR=""

CUDA_VISIBLE_DEVICES=$GPU python3 ./examples/bert-pkd/task_student_bert-pkd.py \
  --do_train='False' \
  --do_eval='True' \
  --model=Glue_$dataset \
  --task_name=$dataset \
  --gpu_num_per_node=1 \
  --num_epochs=${epoch} \
  --train_data_dir=$train_data_dir \
  --train_example_num=$train_example_num \
  --eval_data_dir=$eval_data_dir \
  --eval_example_num=$eval_example_num \
  --batch_size_per_device=32 \
  --eval_batch_size_per_device=32 \
  --loss_print_every_n_iter 10 \
  --log_dir=./log \
  --model_save_dir=${STUDENT_DIR} \
  --result_dir=${RESULT_DIR} \
  --seq_length=128 \
  --student_num_hidden_layers=${LAYER_NUM} \
  --student_num_attention_heads=12 \
  --student_max_position_embeddings=512 \
  --student_type_vocab_size=2 \
  --student_vocab_size=30522 \
  --student_attention_probs_dropout_prob=0.1 \
  --student_hidden_dropout_prob=0.1 \
  --student_hidden_size_per_head=64 \
  --student_hidden_size=768 \
  --teacher_num_hidden_layers=12 \
  --teacher_num_attention_heads=12 \
  --teacher_max_position_embeddings=512 \
  --teacher_type_vocab_size=2 \
  --teacher_vocab_size=30522 \
  --teacher_attention_probs_dropout_prob=0.1 \
  --teacher_hidden_dropout_prob=0.1 \
  --teacher_hidden_size_per_head=64 \
  --teacher_hidden_size=768 \
  --model_save_every_n_iter=50000 \
  --kd_alpha=${KD_ALPHA} \
  --kd_beta=${KD_BETA} \
  --from_scratch='False'
@@ -0,0 +1,94 @@ | |||||
# Copyright (c) The Tianshu Platform Authors. | |||||
# Licensed under the Apache License | |||||
# Evaluates a saved distilled BiLSTM student model on one GLUE task.
dataset=SST-2

# ofrecord dataset dir
DATA_ROOT=./data/glue_ofrecord

# which GPU to use
GPU=0

# Example counts per supported GLUE task.
case "$dataset" in
  CoLA)  train_example_num=8551;   eval_example_num=1043;  test_example_num=1063 ;;
  MRPC)  train_example_num=3668;   eval_example_num=408;   test_example_num=1725 ;;
  SST-2) train_example_num=67349;  eval_example_num=872;   test_example_num=1821 ;;
  QQP)   train_example_num=363849; eval_example_num=40430; test_example_num=0 ;;
  MNLI)  train_example_num=392702; eval_example_num=9815;  test_example_num=0 ;;
  WNLI)  train_example_num=635;    eval_example_num=71;    test_example_num=0 ;;
  RTE)   train_example_num=2490;   eval_example_num=277;   test_example_num=0 ;;
  *)
    # Report the tasks handled above and fail with a non-zero status; a bare
    # `exit` would return the status of `echo`, i.e. success.
    echo "dataset must be one of 'CoLA','MRPC','SST-2','QQP','MNLI','WNLI','RTE'" >&2
    exit 1
    ;;
esac

KD_ALPHA=0.7

# saved student model dir
STUDENT_DIR="./models/student_model/SST-2/bert-lstm_32-distl_epoch-5_lr-1e-4_wd-0.0001_kd_alpha-0.7"
RESULT_DIR=""

# The LSTM student reads its own ofrecord copy (suffix _lstm_32).
train_data_dir=$DATA_ROOT/${dataset}/train
train_data_dir_lstm=$DATA_ROOT/${dataset}_lstm_32/train
eval_data_dir=$DATA_ROOT/${dataset}/eval
eval_data_dir_lstm=$DATA_ROOT/${dataset}_lstm_32/eval

CUDA_VISIBLE_DEVICES=$GPU python3 ./examples/distilled-bilstm/task_student_kd_lstm.py \
  --do_train='False' \
  --do_eval='True' \
  --model=Glue_$dataset \
  --task_name=$dataset \
  --gpu_num_per_node=1 \
  --train_data_dir=$train_data_dir \
  --train_data_dir_lstm=${train_data_dir_lstm} \
  --train_example_num=$train_example_num \
  --eval_data_dir=$eval_data_dir \
  --eval_data_dir_lstm=$eval_data_dir_lstm \
  --eval_example_num=$eval_example_num \
  --batch_size_per_device=32 \
  --eval_batch_size_per_device=32 \
  --loss_print_every_n_iter 1 \
  --log_dir=./log \
  --model_save_dir=${STUDENT_DIR} \
  --result_dir=${RESULT_DIR} \
  --seq_length=128 \
  --student_seq_length=32 \
  --student_num_hidden_layers=4 \
  --student_num_attention_heads=12 \
  --student_max_position_embeddings=512 \
  --student_type_vocab_size=2 \
  --student_vocab_size=10002 \
  --student_attention_probs_dropout_prob=0.1 \
  --student_hidden_dropout_prob=0.1 \
  --student_hidden_size_per_head=26 \
  --student_hidden_size=300 \
  --teacher_num_hidden_layers=12 \
  --teacher_num_attention_heads=12 \
  --teacher_max_position_embeddings=512 \
  --teacher_type_vocab_size=2 \
  --teacher_vocab_size=30522 \
  --teacher_attention_probs_dropout_prob=0.1 \
  --teacher_hidden_dropout_prob=0.1 \
  --teacher_hidden_size_per_head=64 \
  --teacher_hidden_size=768 \
  --model_save_every_n_iter=50000 \
  --kd_alpha=${KD_ALPHA}
@@ -0,0 +1,88 @@ | |||||
# Copyright (c) The Tianshu Platform Authors. | |||||
# Licensed under the Apache License | |||||
# GLUE task evaluated by this script.
dataset=SST-2
# ofrecord dataset dir
DATA_ROOT=./data/glue_ofrecord
# which GPU to use
GPU=0

# Per-task example counts. `case` on the quoted name keeps an empty or
# space-containing value from breaking the comparison; it simply falls through
# to the error branch.
case "$dataset" in
    CoLA)
        train_example_num=8551
        eval_example_num=1043
        test_example_num=1063
        ;;
    MRPC)
        train_example_num=3668
        eval_example_num=408
        test_example_num=1725
        ;;
    SST-2)
        train_example_num=67349
        eval_example_num=872
        test_example_num=1821
        ;;
    QQP)
        train_example_num=363849
        eval_example_num=40430
        test_example_num=0
        ;;
    MNLI)
        train_example_num=392702
        eval_example_num=9815
        test_example_num=0
        ;;
    WNLI)
        train_example_num=635
        eval_example_num=71
        test_example_num=0
        ;;
    RTE)
        train_example_num=2490
        eval_example_num=277
        test_example_num=0
        ;;
    *)
        # Report on stderr and exit non-zero so callers can detect the failure
        # (a bare `exit` after `echo` would return status 0).
        echo "dataset must be one of the GLUE tasks 'CoLA','MRPC','SST-2','QQP','MNLI','WNLI','RTE'" >&2
        exit 1
        ;;
esac
# Value forwarded as --kd_alpha.
KD_ALPHA=0.8
# Directory forwarded as --model_save_dir.
STUDENT_DIR="./models/student_model/SST-2/bert-kd-distl_epoch-4_lr-2e-5_wd-0.0001_kd_alpha-0.8"
# Forwarded as --result_dir; empty in this script.
RESULT_DIR=""
train_data_dir="$DATA_ROOT/${dataset}/train"
eval_data_dir="$DATA_ROOT/${dataset}/eval"

# Launch the KD student task with training off and evaluation on.
# All expansions are quoted so that a path containing spaces or glob
# characters reaches the Python script as a single argument.
CUDA_VISIBLE_DEVICES="$GPU" python3 ./examples/knowledge_distillation/task_student_kd.py \
    --do_train='False' \
    --do_eval='True' \
    --model="Glue_$dataset" \
    --task_name="$dataset" \
    --gpu_num_per_node=1 \
    --train_data_dir="$train_data_dir" \
    --train_example_num="$train_example_num" \
    --eval_data_dir="$eval_data_dir" \
    --eval_example_num="$eval_example_num" \
    --batch_size_per_device=32 \
    --eval_batch_size_per_device=32 \
    --loss_print_every_n_iter 10 \
    --log_dir=./log \
    --model_save_dir="${STUDENT_DIR}" \
    --result_dir="${RESULT_DIR}" \
    --seq_length=128 \
    --student_num_hidden_layers=4 \
    --student_num_attention_heads=12 \
    --student_max_position_embeddings=512 \
    --student_type_vocab_size=2 \
    --student_vocab_size=30522 \
    --student_attention_probs_dropout_prob=0.1 \
    --student_hidden_dropout_prob=0.1 \
    --student_hidden_size_per_head=26 \
    --student_hidden_size=312 \
    --teacher_num_hidden_layers=12 \
    --teacher_num_attention_heads=12 \
    --teacher_max_position_embeddings=512 \
    --teacher_type_vocab_size=2 \
    --teacher_vocab_size=30522 \
    --teacher_attention_probs_dropout_prob=0.1 \
    --teacher_hidden_dropout_prob=0.1 \
    --teacher_hidden_size_per_head=64 \
    --teacher_hidden_size=768 \
    --model_save_every_n_iter=50000 \
    --kd_alpha="${KD_ALPHA}"
@@ -0,0 +1,89 @@ | |||||
# Copyright (c) The Tianshu Platform Authors. | |||||
# Licensed under the Apache License | |||||
# GLUE task evaluated by this script.
dataset=SST-2
# ofrecord dataset dir
DATA_ROOT=./data/glue_ofrecord
# which GPU to use
GPU=0
# saved student model dir
STUDENT_DIR="./models/student_model/SST-2/tinybert_epoch-4_lr-2e-5_wd-0.0001"
# Forwarded as --result_dir; empty in this script.
RESULT_DIR=""

# Per-task example counts. Matching the quoted name with `case` means an empty
# or space-containing value cannot break the comparison.
case "$dataset" in
    CoLA)
        train_example_num=8551
        eval_example_num=1043
        test_example_num=1063
        ;;
    MRPC)
        train_example_num=3668
        eval_example_num=408
        test_example_num=1725
        ;;
    SST-2)
        train_example_num=67349
        eval_example_num=872
        test_example_num=1821
        ;;
    QQP)
        train_example_num=363849
        eval_example_num=40430
        test_example_num=0
        ;;
    MNLI)
        train_example_num=392702
        eval_example_num=9815
        test_example_num=0
        ;;
    WNLI)
        train_example_num=635
        eval_example_num=71
        test_example_num=0
        ;;
    RTE)
        train_example_num=2490
        eval_example_num=277
        test_example_num=0
        ;;
    *)
        # Non-zero status lets a calling script notice the unsupported task.
        echo "dataset must be one of the GLUE tasks 'CoLA','MRPC','SST-2','QQP','MNLI','WNLI','RTE'" >&2
        exit 1
        ;;
esac
train_data_dir="$DATA_ROOT/${dataset}/train"
eval_data_dir="$DATA_ROOT/${dataset}/eval"

# Launch the TinyBERT student task with training off and evaluation on.
# Expansions are quoted to protect against word splitting and globbing.
CUDA_VISIBLE_DEVICES="$GPU" python3 ./examples/tinybert/task_student_tinybert.py \
    --do_train='False' \
    --do_eval='True' \
    --model="Glue_$dataset" \
    --task_name="$dataset" \
    --gpu_num_per_node=1 \
    --train_data_dir="$train_data_dir" \
    --train_example_num="$train_example_num" \
    --eval_data_dir="$eval_data_dir" \
    --eval_example_num="$eval_example_num" \
    --batch_size_per_device=32 \
    --eval_batch_size_per_device=32 \
    --loss_print_every_n_iter 10 \
    --log_dir=./log \
    --model_save_dir="${STUDENT_DIR}" \
    --result_dir="${RESULT_DIR}" \
    --seq_length=128 \
    --student_num_hidden_layers=4 \
    --student_num_attention_heads=12 \
    --student_max_position_embeddings=512 \
    --student_type_vocab_size=2 \
    --student_vocab_size=30522 \
    --student_attention_probs_dropout_prob=0.1 \
    --student_hidden_dropout_prob=0.1 \
    --student_hidden_size_per_head=26 \
    --student_hidden_size=312 \
    --teacher_num_hidden_layers=12 \
    --teacher_num_attention_heads=12 \
    --teacher_max_position_embeddings=512 \
    --teacher_type_vocab_size=2 \
    --teacher_vocab_size=30522 \
    --teacher_attention_probs_dropout_prob=0.1 \
    --teacher_hidden_dropout_prob=0.1 \
    --teacher_hidden_size_per_head=64 \
    --teacher_hidden_size=768 \
    --model_save_every_n_iter=50000 \
    --intermediate_distill='True' \
    --pred_distill='True'
@@ -0,0 +1,86 @@ | |||||
# Copyright (c) The Tianshu Platform Authors. | |||||
# Licensed under the Apache License | |||||
# ofrecord dataset dir
DATA_ROOT=./data/glue_ofrecord
# choose dataset, e.g. `CoLA`, `MRPC`, `SST-2`
dataset=SST-2
# which GPU to use
GPU=0

# Per-task example counts. The quoted `case` subject tolerates empty or
# space-containing names, which an unquoted `[ $dataset = ... ]` does not.
case "$dataset" in
    CoLA)
        train_example_num=8551
        eval_example_num=1043
        test_example_num=1063
        ;;
    MRPC)
        train_example_num=3668
        eval_example_num=408
        test_example_num=1725
        ;;
    SST-2)
        train_example_num=67349
        eval_example_num=872
        test_example_num=1821
        ;;
    QQP)
        train_example_num=363849
        eval_example_num=40430
        test_example_num=0
        ;;
    MNLI)
        train_example_num=392702
        eval_example_num=9815
        test_example_num=0
        ;;
    WNLI)
        train_example_num=635
        eval_example_num=71
        test_example_num=0
        ;;
    RTE)
        train_example_num=2490
        eval_example_num=277
        test_example_num=0
        ;;
    QNLI)
        train_example_num=104743
        eval_example_num=5463
        test_example_num=0
        ;;
    *)
        # The message now names every task handled above; exit status is
        # non-zero so the failure is visible to callers.
        echo "dataset must be one of the GLUE tasks 'CoLA','MRPC','SST-2','QQP','MNLI','WNLI','RTE','QNLI'" >&2
        exit 1
        ;;
esac
# Teacher snapshot forwarded as --model_save_dir. The commented lines are the
# alternative snapshots for other tasks.
TEACHER_MODEL_DIR="./models/finetuned_teacher/SST-2_epoch-3_lr-2e-5_wd-0.0001/snapshot_best"
#TEACHER_MODEL_DIR="./models/finetuned_teacher/RTE_epoch-5_lr-3e-5_wd-0.0001/snapshot_best"
#TEACHER_MODEL_DIR="./models/finetuned_teacher/MRPC_epoch-5_lr-1e-5_wd-0.001/snapshot_best"
#TEACHER_MODEL_DIR="./models/finetuned_teacher/CoLA_epoch-5_lr-1e-5_wd-0.01/snapshot_best"
#TEACHER_MODEL_DIR="./models/finetuned_teacher/QQP_epoch-5_lr-2e-5_wd-0.0001/snapshot_best"
# Forwarded as --result_dir.
RESULT_DIR="./models/finetuned_teacher/SST-2_epoch-3_lr-2e-5_wd-0.0001/snapshot_best"
train_data_dir="$DATA_ROOT/${dataset}/train"
eval_data_dir="$DATA_ROOT/${dataset}/eval"

# Launch the teacher task with training off and evaluation on.
# Expansions are quoted to protect against word splitting and globbing.
CUDA_VISIBLE_DEVICES="$GPU" python3 examples/teacher_bert/task_teacher.py \
    --do_train='False' \
    --do_eval='True' \
    --model="Glue_$dataset" \
    --task_name="$dataset" \
    --gpu_num_per_node=1 \
    --train_data_dir="$train_data_dir" \
    --train_example_num="$train_example_num" \
    --eval_data_dir="$eval_data_dir" \
    --eval_example_num="$eval_example_num" \
    --batch_size_per_device=32 \
    --eval_batch_size_per_device=32 \
    --loss_print_every_n_iter 20 \
    --log_dir=./log \
    --model_save_dir="${TEACHER_MODEL_DIR}" \
    --result_dir="${RESULT_DIR}" \
    --save_last_snapshot=False \
    --seq_length=128 \
    --num_hidden_layers=12 \
    --num_attention_heads=12 \
    --max_position_embeddings=512 \
    --type_vocab_size=2 \
    --vocab_size=30522 \
    --attention_probs_dropout_prob=0.1 \
    --hidden_dropout_prob=0.1 \
    --hidden_size_per_head=64
@@ -0,0 +1,99 @@ | |||||
# Copyright (c) The Tianshu Platform Authors. | |||||
# Licensed under the Apache License | |||||
# pretrained model dir
# PRETRAINED_MODEL=/remote-home/my/Projects/bert_theseus/BERT/uncased_L-12_H-768_A-12_oneflow
# PRETRAINED_MODEL=/remote-home/my/Projects/bert_theseus/BERT-theseus/log/MRPC_uncased_L-12_H-768_A-12_oneflow_v1/snapshot_last_snapshot
# ofrecord dataset dir
DATA_ROOT=/usr/local/glue_ofrecord
GPU_ID=0
# choose dataset, e.g. `CoLA` or `MRPC`
dataset=SST-2
#dataset=MRPC

# Per-task example counts and hyper-parameters.
case "$dataset" in
    CoLA)
        train_example_num=8551
        eval_example_num=1043
        test_example_num=1063
        epoch=1
        wd=0.0001
        ;;
    MRPC)
        train_example_num=3668
        eval_example_num=408
        test_example_num=1725
        epoch=1
        wd=0.0001
        ;;
    SST-2)
        train_example_num=67349
        eval_example_num=872
        test_example_num=1821
        # This branch previously left `epoch` unset, so the default task
        # passed an empty --num_epochs= value; mirror the CoLA/MRPC branches.
        epoch=1
        wd=0.0001
        ;;
    QQP)
        train_example_num=363849
        eval_example_num=40430
        test_example_num=0
        learning_rate=2e-5
        epoch=1
        wd=0.0001
        ;;
    MNLI)
        train_example_num=392702
        eval_example_num=9815
        test_example_num=0
        learning_rate=2e-5
        epoch=1
        wd=0.0001
        ;;
    WNLI)
        train_example_num=635
        eval_example_num=71
        test_example_num=0
        learning_rate=2e-5
        epoch=1
        wd=0.0001
        ;;
    RTE)
        train_example_num=2490
        eval_example_num=277
        test_example_num=0
        learning_rate=2e-5
        epoch=1
        wd=0.0001
        ;;
    *)
        # Non-zero status so callers can detect the unsupported task.
        echo "dataset must be one of the GLUE tasks 'CoLA','MRPC','SST-2','QQP','MNLI','WNLI','RTE'" >&2
        exit 1
        ;;
esac
train_data_dir="$DATA_ROOT/${dataset}/train"
eval_data_dir="$DATA_ROOT/${dataset}/eval"
model_load_dir="./log/${dataset}_bert_theseus_uncased_L-12_H-768_A-12_oneflow_v1/snapshot_last_snapshot"
# mkdir -p ${model_save_dir}
replace_prob=1.0

# compress_ratio is never assigned in this script, so it can only come from the
# caller's environment. Fail with a clear message rather than emitting a
# value-less `--compress_ratio` flag.
# NOTE(review): pick and hard-code the intended ratio once confirmed.
: "${compress_ratio:?compress_ratio must be set in the environment}"

# Launch the Theseus classifier with training off and evaluation on.
# The GPU comes from the first positional argument as before; when it is
# omitted, fall back to GPU_ID instead of exporting an empty device list.
CUDA_VISIBLE_DEVICES="${1:-$GPU_ID}" python3 ./theseus/run_classifier.py \
    --do_train=false \
    --do_eval=True \
    --model="Glue_$dataset" \
    --task_name="$dataset" \
    --gpu_num_per_node=1 \
    --num_epochs="$epoch" \
    --train_data_dir="$train_data_dir" \
    --train_example_num="$train_example_num" \
    --eval_data_dir="$eval_data_dir" \
    --eval_example_num="$eval_example_num" \
    --model_load_dir="${model_load_dir}" \
    --batch_size_per_device=32 \
    --eval_batch_size_per_device=4 \
    --loss_print_every_n_iter 20 \
    --log_dir=./log \
    --save_last_snapshot=True \
    --seq_length=128 \
    --num_hidden_layers=4 \
    --num_attention_heads=12 \
    --max_position_embeddings=512 \
    --type_vocab_size=2 \
    --vocab_size=30522 \
    --attention_probs_dropout_prob=0.1 \
    --hidden_dropout_prob=0.1 \
    --hidden_size_per_head=64 \
    --compress_ratio "$compress_ratio" \
    --replace_prob "$replace_prob"
@@ -0,0 +1,130 @@ | |||||
# Copyright (c) The OneFlow Authors. | |||||
# Licensed under the Apache License | |||||
# Script for knowledge distillation with BERT-PKD algorithm.
# Positional arguments: $1 data root, $2 output root, $3 train log dir,
# $4 result dir, $5 GLUE task name.
# ofrecord dataset dir
DATA_ROOT=$1
# saved student model dir
STUDENT_DIR="$2/student_model"
# train log out
TRAIN_LOG_DIR=$3
# inference json result out
RESULT_DIR=$4
dataset=$5
# fine-tuned teacher model dir
FT_BERT_BASE_DIR="/usr/local/output/model/before/snapshot_best"
# temp student model dir
TMP_STUDENT_DIR="./models/bert_pkd_3/${dataset}"
train_data_dir="$DATA_ROOT/${dataset}/train"
eval_data_dir="$DATA_ROOT/${dataset}/eval"
# which GPU to use
GPU=0

# Per-task example counts and hyper-parameters. The task name comes from $5 and
# may be empty; a quoted `case` subject sends that to the error branch instead
# of producing a malformed `[` test.
case "$dataset" in
    CoLA)
        train_example_num=8551
        eval_example_num=1043
        test_example_num=1063
        learning_rate=5e-5
        wd=0.0001
        epoch=100
        ;;
    MRPC)
        train_example_num=3668
        eval_example_num=408
        test_example_num=1725
        learning_rate=2e-5
        epoch=5
        wd=0.001
        ;;
    SST-2)
        train_example_num=67349
        eval_example_num=872
        test_example_num=1821
        learning_rate=2e-5
        epoch=4
        wd=0.0001
        ;;
    QQP)
        train_example_num=363849
        eval_example_num=40430
        test_example_num=0
        learning_rate=2e-5
        epoch=5
        wd=0.0001
        ;;
    MNLI)
        train_example_num=392702
        eval_example_num=9815
        test_example_num=0
        learning_rate=2e-5
        epoch=5
        wd=0.0001
        ;;
    WNLI)
        train_example_num=635
        eval_example_num=71
        test_example_num=0
        learning_rate=2e-5
        epoch=5
        wd=0.0001
        ;;
    RTE)
        train_example_num=2490
        eval_example_num=277
        test_example_num=0
        learning_rate=3e-5
        epoch=5
        wd=0.0001
        ;;
    *)
        # Non-zero status so the caller sees that nothing was trained.
        echo "dataset must be one of the GLUE tasks 'CoLA','MRPC','SST-2','QQP','MNLI','WNLI','RTE'" >&2
        exit 1
        ;;
esac
# Launch BERT-PKD student training followed by evaluation.
# Directories come from the caller's positional arguments, so every expansion
# is quoted to survive paths with spaces or glob characters.
CUDA_VISIBLE_DEVICES="$GPU" python3 ./examples/bert-pkd/task_student_bert-pkd.py \
    --do_train='True' \
    --do_eval='True' \
    --serve_for_online='True' \
    --model="Glue_$dataset" \
    --task_name="$dataset" \
    --gpu_num_per_node=1 \
    --num_epochs="${epoch}" \
    --train_data_dir="$train_data_dir" \
    --train_example_num="$train_example_num" \
    --eval_data_dir="$eval_data_dir" \
    --eval_example_num="$eval_example_num" \
    --teacher_model="${FT_BERT_BASE_DIR}" \
    --student_model="${TMP_STUDENT_DIR}" \
    --batch_size_per_device=32 \
    --eval_batch_size_per_device=32 \
    --loss_print_every_n_iter 10 \
    --log_dir="${TRAIN_LOG_DIR}" \
    --model_save_dir="${STUDENT_DIR}" \
    --result_dir="${RESULT_DIR}" \
    --seq_length=128 \
    --student_num_hidden_layers=3 \
    --student_num_attention_heads=12 \
    --student_max_position_embeddings=512 \
    --student_type_vocab_size=2 \
    --student_vocab_size=30522 \
    --student_attention_probs_dropout_prob=0.1 \
    --student_hidden_dropout_prob=0.1 \
    --student_hidden_size_per_head=64 \
    --student_hidden_size=768 \
    --teacher_num_hidden_layers=12 \
    --teacher_num_attention_heads=12 \
    --teacher_max_position_embeddings=512 \
    --teacher_type_vocab_size=2 \
    --teacher_vocab_size=30522 \
    --teacher_attention_probs_dropout_prob=0.1 \
    --teacher_hidden_dropout_prob=0.1 \
    --teacher_hidden_size_per_head=64 \
    --teacher_hidden_size=768 \
    --learning_rate="$learning_rate" \
    --model_save_every_n_iter=50000 \
    --weight_decay_rate "$wd" \
    --kd_alpha=0.2 \
    --kd_beta=10 \
    --from_scratch='False'
@@ -0,0 +1,130 @@ | |||||
# Copyright (c) The Tianshu Platform Authors. | |||||
# Licensed under the Apache License | |||||
# Script for knowledge distillation with distilled_bilstm algorithm.
# Positional arguments: $1 data root, $2 output root, $3 train log dir,
# $4 result dir, $5 GLUE task name.
# ofrecord dataset dir
DATA_ROOT=$1
# saved student model dir
STUDENT_DIR="$2/student_model"
# train log out
TRAIN_LOG_DIR=$3
# inference json result out
RESULT_DIR=$4
dataset=$5
# fine-tuned teacher model dir
FT_BERT_BASE_DIR="/usr/local/output/model/before/snapshot_best"
train_data_dir="$DATA_ROOT/${dataset}/train"
train_data_dir_lstm="$DATA_ROOT/${dataset}_lstm_32/train"
eval_data_dir="$DATA_ROOT/${dataset}/eval"
eval_data_dir_lstm="$DATA_ROOT/${dataset}_lstm_32/eval"
# which GPU to use
GPU=0

# Per-task example counts and hyper-parameters. Quoting the `case` subject
# keeps an empty $5 from producing a malformed test.
case "$dataset" in
    CoLA)
        train_example_num=8551
        eval_example_num=1043
        test_example_num=1063
        learning_rate=5e-5
        wd=0.0001
        epoch=100
        ;;
    MRPC)
        train_example_num=3668
        eval_example_num=408
        test_example_num=1725
        learning_rate=5e-6
        epoch=30
        wd=0.001
        ;;
    SST-2)
        train_example_num=67349
        eval_example_num=872
        test_example_num=1821
        learning_rate=1e-4
        epoch=5
        wd=0.0001
        ;;
    QQP)
        train_example_num=363849
        eval_example_num=40430
        test_example_num=0
        learning_rate=7e-5
        epoch=10
        wd=0.0001
        ;;
    MNLI)
        train_example_num=392702
        eval_example_num=9815
        test_example_num=0
        learning_rate=2e-5
        epoch=5
        wd=0.0001
        ;;
    WNLI)
        train_example_num=635
        eval_example_num=71
        test_example_num=0
        learning_rate=2e-5
        epoch=5
        wd=0.0001
        ;;
    RTE)
        train_example_num=2490
        eval_example_num=277
        test_example_num=0
        learning_rate=2e-5
        epoch=30
        wd=0.0001
        ;;
    *)
        # Non-zero status so the caller sees that nothing was trained.
        echo "dataset must be one of the GLUE tasks 'CoLA','MRPC','SST-2','QQP','MNLI','WNLI','RTE'" >&2
        exit 1
        ;;
esac
# Launch distilled-BiLSTM student training followed by evaluation.
# Expansions are quoted because the directories come from caller-supplied
# positional arguments.
# --log_dir honours TRAIN_LOG_DIR ($3), which the script accepted but ignored;
# ./log remains the fallback when no log directory is given.
CUDA_VISIBLE_DEVICES="$GPU" python3 ./examples/distilled-bilstm/task_student_kd_lstm.py \
    --do_train='True' \
    --do_eval='True' \
    --serve_for_online='True' \
    --model="Glue_$dataset" \
    --task_name="$dataset" \
    --gpu_num_per_node=1 \
    --num_epochs="${epoch}" \
    --train_data_dir="$train_data_dir" \
    --train_data_dir_lstm="${train_data_dir_lstm}" \
    --train_example_num="$train_example_num" \
    --eval_data_dir="$eval_data_dir" \
    --eval_data_dir_lstm="$eval_data_dir_lstm" \
    --eval_example_num="$eval_example_num" \
    --teacher_model="${FT_BERT_BASE_DIR}" \
    --batch_size_per_device=32 \
    --eval_batch_size_per_device=32 \
    --loss_print_every_n_iter 1 \
    --log_dir="${TRAIN_LOG_DIR:-./log}" \
    --model_save_dir="${STUDENT_DIR}" \
    --result_dir="${RESULT_DIR}" \
    --seq_length=128 \
    --student_seq_length=32 \
    --student_num_hidden_layers=4 \
    --student_num_attention_heads=12 \
    --student_max_position_embeddings=512 \
    --student_type_vocab_size=2 \
    --student_vocab_size=10002 \
    --student_attention_probs_dropout_prob=0.1 \
    --student_hidden_dropout_prob=0.1 \
    --student_hidden_size_per_head=26 \
    --student_hidden_size=300 \
    --teacher_num_hidden_layers=12 \
    --teacher_num_attention_heads=12 \
    --teacher_max_position_embeddings=512 \
    --teacher_type_vocab_size=2 \
    --teacher_vocab_size=30522 \
    --teacher_attention_probs_dropout_prob=0.1 \
    --teacher_hidden_dropout_prob=0.1 \
    --teacher_hidden_size_per_head=64 \
    --teacher_hidden_size=768 \
    --learning_rate="$learning_rate" \
    --model_save_every_n_iter=50000 \
    --weight_decay_rate="$wd" \
    --kd_alpha=0.7
@@ -0,0 +1,128 @@ | |||||
# Copyright (c) The OneFlow Authors. | |||||
# Licensed under the Apache License | |||||
# Script for knowledge distillation with KD algorithm.
# Positional arguments: $1 data root, $2 output root, $3 train log dir,
# $4 result dir, $5 GLUE task name.
# ofrecord dataset dir
DATA_ROOT=$1
# saved student model dir
STUDENT_DIR="$2/student_model"
# train log out
TRAIN_LOG_DIR=$3
# inference json result out
RESULT_DIR=$4
dataset=$5
# fine-tuned teacher model dir
FT_BERT_BASE_DIR="/usr/local/output/model/before/snapshot_best"
# temp student model dir
# NOTE(review): this path is named after bert_pkd_3 although the script runs
# the KD task - confirm it is the intended student checkpoint.
TMP_STUDENT_DIR="./models/bert_pkd_3/${dataset}"
train_data_dir="$DATA_ROOT/${dataset}/train"
eval_data_dir="$DATA_ROOT/${dataset}/eval"
# which GPU to use
GPU=0

# Per-task example counts and hyper-parameters. Quoting the `case` subject
# keeps an empty $5 from producing a malformed test.
case "$dataset" in
    CoLA)
        train_example_num=8551
        eval_example_num=1043
        test_example_num=1063
        learning_rate=5e-5
        wd=0.0001
        epoch=70
        ;;
    MRPC)
        train_example_num=3668
        eval_example_num=408
        test_example_num=1725
        learning_rate=2e-5
        epoch=5
        wd=0.001
        ;;
    SST-2)
        train_example_num=67349
        eval_example_num=872
        test_example_num=1821
        learning_rate=2e-5
        epoch=4
        wd=0.0001
        ;;
    QQP)
        train_example_num=363849
        eval_example_num=40430
        test_example_num=0
        learning_rate=5e-5
        epoch=5
        wd=0.0001
        ;;
    MNLI)
        train_example_num=392702
        eval_example_num=9815
        test_example_num=0
        learning_rate=2e-5
        epoch=5
        wd=0.0001
        ;;
    WNLI)
        train_example_num=635
        eval_example_num=71
        test_example_num=0
        learning_rate=2e-5
        epoch=5
        wd=0.0001
        ;;
    RTE)
        train_example_num=2490
        eval_example_num=277
        test_example_num=0
        learning_rate=2e-5
        epoch=5
        wd=0.0001
        ;;
    *)
        # Non-zero status so the caller sees that nothing was trained.
        echo "dataset must be one of the GLUE tasks 'CoLA','MRPC','SST-2','QQP','MNLI','WNLI','RTE'" >&2
        exit 1
        ;;
esac
# Launch KD student training followed by evaluation.
# Expansions are quoted because the directories come from caller-supplied
# positional arguments.
CUDA_VISIBLE_DEVICES="$GPU" python3 ./examples/knowledge_distillation/task_student_kd.py \
    --do_train='True' \
    --do_eval='True' \
    --serve_for_online='True' \
    --model="Glue_${dataset}" \
    --task_name="${dataset}" \
    --gpu_num_per_node=1 \
    --num_epochs="${epoch}" \
    --train_data_dir="$train_data_dir" \
    --train_example_num="$train_example_num" \
    --eval_data_dir="$eval_data_dir" \
    --eval_example_num="$eval_example_num" \
    --teacher_model="${FT_BERT_BASE_DIR}" \
    --student_model="${TMP_STUDENT_DIR}" \
    --batch_size_per_device=32 \
    --eval_batch_size_per_device=32 \
    --loss_print_every_n_iter 10 \
    --log_dir="${TRAIN_LOG_DIR}" \
    --result_dir="${RESULT_DIR}" \
    --model_save_dir="${STUDENT_DIR}" \
    --seq_length=128 \
    --student_num_hidden_layers=4 \
    --student_num_attention_heads=12 \
    --student_max_position_embeddings=512 \
    --student_type_vocab_size=2 \
    --student_vocab_size=30522 \
    --student_attention_probs_dropout_prob=0.1 \
    --student_hidden_dropout_prob=0.1 \
    --student_hidden_size_per_head=26 \
    --student_hidden_size=312 \
    --teacher_num_hidden_layers=12 \
    --teacher_num_attention_heads=12 \
    --teacher_max_position_embeddings=512 \
    --teacher_type_vocab_size=2 \
    --teacher_vocab_size=30522 \
    --teacher_attention_probs_dropout_prob=0.1 \
    --teacher_hidden_dropout_prob=0.1 \
    --teacher_hidden_size_per_head=64 \
    --teacher_hidden_size=768 \
    --learning_rate="$learning_rate" \
    --model_save_every_n_iter=50000 \
    --weight_decay_rate="$wd" \
    --kd_alpha=0.8
@@ -0,0 +1,128 @@ | |||||
# Copyright (c) The Tianshu Platform Authors. | |||||
# Licensed under the Apache License | |||||
# Script for knowledge distillation with TinyBERT algorithm.
# Positional arguments: $1 data root, $2 output root, $3 train log dir,
# $4 result dir, $5 GLUE task name.
# ofrecord dataset dir
DATA_ROOT=$1
# saved student model dir
STUDENT_DIR="$2/student_model"
# train log out
TRAIN_LOG_DIR=$3
# inference json result out
RESULT_DIR=$4
dataset=$5
# fine-tuned teacher model dir
FT_BERT_BASE_DIR="/usr/local/output/model/before/snapshot_best"
# Student checkpoint forwarded as --student_model.
TMP_STUDENT_DIR="./models/2nd_General_TinyBERT_4L_312D_oneflow"
train_data_dir="$DATA_ROOT/${dataset}/train"
eval_data_dir="$DATA_ROOT/${dataset}/eval"
# which GPU to use
GPU=0

# Per-task example counts and hyper-parameters. Quoting the `case` subject
# keeps an empty $5 from producing a malformed test.
case "$dataset" in
    CoLA)
        train_example_num=8551
        eval_example_num=1043
        test_example_num=1063
        learning_rate=7e-5
        wd=0.0001
        epoch=100
        ;;
    MRPC)
        train_example_num=3668
        eval_example_num=408
        test_example_num=1725
        learning_rate=2e-5
        epoch=30
        wd=0.001
        ;;
    SST-2)
        train_example_num=67349
        eval_example_num=872
        test_example_num=1821
        learning_rate=2e-5
        epoch=4
        wd=0.0001
        ;;
    QQP)
        train_example_num=363849
        eval_example_num=40430
        test_example_num=0
        learning_rate=1e-4
        epoch=5
        wd=0.0001
        ;;
    MNLI)
        train_example_num=392702
        eval_example_num=9815
        test_example_num=0
        learning_rate=2e-5
        epoch=5
        wd=0.0001
        ;;
    WNLI)
        train_example_num=635
        eval_example_num=71
        test_example_num=0
        learning_rate=2e-5
        epoch=5
        wd=0.0001
        ;;
    RTE)
        train_example_num=2490
        eval_example_num=277
        test_example_num=0
        learning_rate=2e-5
        epoch=5
        wd=0.0001
        ;;
    *)
        # Non-zero status so the caller sees that nothing was trained.
        echo "dataset must be one of the GLUE tasks 'CoLA','MRPC','SST-2','QQP','MNLI','WNLI','RTE'" >&2
        exit 1
        ;;
esac
# Launch TinyBERT student training followed by evaluation.
# Expansions are quoted because the directories come from caller-supplied
# positional arguments.
# --log_dir honours TRAIN_LOG_DIR ($3), which the script accepted but ignored;
# ./log remains the fallback when no log directory is given.
CUDA_VISIBLE_DEVICES="$GPU" python3 ./examples/tinybert/task_student_tinybert.py \
    --do_train='True' \
    --do_eval='True' \
    --serve_for_online='True' \
    --model="Glue_$dataset" \
    --task_name="$dataset" \
    --gpu_num_per_node=1 \
    --num_epochs="${epoch}" \
    --train_data_dir="$train_data_dir" \
    --train_example_num="$train_example_num" \
    --eval_data_dir="$eval_data_dir" \
    --eval_example_num="$eval_example_num" \
    --teacher_model="${FT_BERT_BASE_DIR}" \
    --student_model="${TMP_STUDENT_DIR}" \
    --batch_size_per_device=32 \
    --eval_batch_size_per_device=32 \
    --loss_print_every_n_iter 10 \
    --log_dir="${TRAIN_LOG_DIR:-./log}" \
    --model_save_dir="${STUDENT_DIR}" \
    --result_dir="${RESULT_DIR}" \
    --seq_length=128 \
    --student_num_hidden_layers=4 \
    --student_num_attention_heads=12 \
    --student_max_position_embeddings=512 \
    --student_type_vocab_size=2 \
    --student_vocab_size=30522 \
    --student_attention_probs_dropout_prob=0.1 \
    --student_hidden_dropout_prob=0.1 \
    --student_hidden_size_per_head=26 \
    --student_hidden_size=312 \
    --teacher_num_hidden_layers=12 \
    --teacher_num_attention_heads=12 \
    --teacher_max_position_embeddings=512 \
    --teacher_type_vocab_size=2 \
    --teacher_vocab_size=30522 \
    --teacher_attention_probs_dropout_prob=0.1 \
    --teacher_hidden_dropout_prob=0.1 \
    --teacher_hidden_size_per_head=64 \
    --teacher_hidden_size=768 \
    --learning_rate="$learning_rate" \
    --model_save_every_n_iter=50000 \
    --weight_decay_rate "$wd" \
    --pred_distill='True' \
    --intermediate_distill='True'
@@ -0,0 +1,116 @@ | |||||
# Copyright (c) The OneFlow Authors. | |||||
# Licensed under the Apache License | |||||
# Fine-tune Teacher model.
# Positional arguments: $1 data root, $2 pretrained model dir,
# $3 train log dir, $4 result dir, $5 GLUE task name.
# ofrecord dataset dir
DATA_ROOT=$1
# pretrained model dir
PRETRAINED_MODEL=$2
TRAIN_LOG_DIR=$3
RESULT_DIR=$4
# choose dataset
dataset=$5
train_data_dir="$DATA_ROOT/${dataset}/train"
eval_data_dir="$DATA_ROOT/${dataset}/eval"
# Directory forwarded as --model_save_dir.
MODEL_SAVE_DIR="/usr/local/output/model/before"
# which GPU to use
GPU=0

# Per-task example counts and hyper-parameters. A single `case` replaces the
# `[ ... ]` chain, which mixed `=` with the non-POSIX `==` on the QNLI branch
# and broke when $5 was empty.
case "$dataset" in
    CoLA)
        train_example_num=8551
        eval_example_num=1043
        test_example_num=1063
        learning_rate=1e-5
        EPOCH=5
        wd=0.01
        ;;
    MRPC)
        train_example_num=3668
        eval_example_num=408
        test_example_num=1725
        learning_rate=3e-5
        EPOCH=5
        wd=0.001
        ;;
    SST-2)
        train_example_num=67349
        eval_example_num=872
        test_example_num=1821
        learning_rate=2e-5
        EPOCH=3
        wd=0.0001
        ;;
    QQP)
        train_example_num=363849
        eval_example_num=40430
        test_example_num=0
        learning_rate=2e-5
        EPOCH=5
        wd=0.0001
        ;;
    MNLI)
        train_example_num=392702
        eval_example_num=9815
        test_example_num=0
        learning_rate=2e-5
        EPOCH=5
        wd=0.0001
        ;;
    WNLI)
        train_example_num=635
        eval_example_num=71
        test_example_num=0
        learning_rate=2e-5
        EPOCH=5
        wd=0.0001
        ;;
    RTE)
        train_example_num=2490
        eval_example_num=277
        test_example_num=0
        learning_rate=3e-5
        EPOCH=5
        wd=0.0001
        ;;
    QNLI)
        train_example_num=104743
        eval_example_num=5463
        test_example_num=0
        learning_rate=2e-5
        EPOCH=5
        wd=0.0001
        ;;
    *)
        # The message names every task handled above; non-zero status lets
        # the caller see that no fine-tuning ran.
        echo "dataset must be one of the GLUE tasks 'CoLA','MRPC','SST-2','QQP','MNLI','WNLI','RTE','QNLI'" >&2
        exit 1
        ;;
esac
# Launch teacher fine-tuning followed by evaluation.
# Expansions are quoted because the directories come from caller-supplied
# positional arguments.
CUDA_VISIBLE_DEVICES="$GPU" python3 examples/teacher_bert/task_teacher.py \
    --do_train='True' \
    --do_eval='True' \
    --serve_for_online='True' \
    --model="Glue_${dataset}" \
    --task_name="${dataset}" \
    --gpu_num_per_node=1 \
    --num_epochs="${EPOCH}" \
    --train_data_dir="$train_data_dir" \
    --train_example_num="$train_example_num" \
    --eval_data_dir="$eval_data_dir" \
    --eval_example_num="$eval_example_num" \
    --model_load_dir="${PRETRAINED_MODEL}" \
    --batch_size_per_device=32 \
    --eval_batch_size_per_device=32 \
    --loss_print_every_n_iter 20 \
    --log_dir="${TRAIN_LOG_DIR}" \
    --result_dir="${RESULT_DIR}" \
    --model_save_dir="${MODEL_SAVE_DIR}" \
    --save_last_snapshot=True \
    --seq_length=128 \
    --num_hidden_layers=12 \
    --num_attention_heads=12 \
    --max_position_embeddings=512 \
    --type_vocab_size=2 \
    --vocab_size=30522 \
    --attention_probs_dropout_prob=0.1 \
    --hidden_dropout_prob=0.1 \
    --hidden_size_per_head=64 \
    --learning_rate "$learning_rate" \
    --weight_decay_rate "$wd"
@@ -0,0 +1,158 @@ | |||||
# Compression pipeline for one GLUE task, in three steps:
#   1. ./theseus/init_stu.py builds an initial student from the teacher
#      snapshot using layers "0,1,2".
#   2. ./theseus/run_classifier.py trains from that student with
#      --replace_prob=0.5.
#   3. ./theseus/run_classifier.py trains again from the last snapshot of
#      step 2 with --replace_prob=1.0.
#
# Usage: $0 DATA_ROOT MODEL_ROOT TRAIN_LOG_DIR RESULT_DIR DATASET
if [ "$#" -lt 5 ]; then
    echo "usage: $0 DATA_ROOT MODEL_ROOT TRAIN_LOG_DIR RESULT_DIR DATASET" >&2
    exit 1
fi

# ofrecord dataset dir
DATA_ROOT=$1
# saved student model dir
STUDENT_DIR="$2/student_v2"
# train log out
TRAIN_LOG_DIR=$3
# inference json result out
RESULT_DIR=$4
# pretrained (teacher) model dir
BERT_BASE_DIR="/usr/local/output/model/before/snapshot_best"
#BERT_BASE_DIR="/usr/local/Oneflow-Model-Compression/model_compress/distil/models/SST-2_epoch-3_lr-2e-5_wd-0.0001/snapshot_best"
INIT_STUDENT_DIR="$2/student_init"
ONE_TRAIN_MODEL="$2/student_v1"

dataset=$5

train_data_dir="$DATA_ROOT/${dataset}/train"
eval_data_dir="$DATA_ROOT/${dataset}/eval"

# which GPU to use
GPU=0

# Per-dataset example counts and hyper-parameters.
# NOTE: test_example_num is set for every dataset but not used below.
case "$dataset" in
    CoLA)
        train_example_num=8551
        eval_example_num=1043
        test_example_num=1063
        learning_rate=1e-5
        EPOCH=5
        wd=0.01
        ;;
    MRPC)
        train_example_num=3668
        eval_example_num=408
        test_example_num=1725
        learning_rate=1e-5
        EPOCH=5
        wd=0.001
        ;;
    SST-2)
        train_example_num=67349
        eval_example_num=872
        test_example_num=1821
        learning_rate=1e-5
        EPOCH=5
        wd=0.0001
        ;;
    QQP)
        train_example_num=363849
        eval_example_num=40430
        test_example_num=0
        learning_rate=1e-5
        EPOCH=5
        wd=0.0001
        ;;
    MNLI)
        train_example_num=392702
        eval_example_num=9815
        test_example_num=0
        learning_rate=1e-5
        EPOCH=5
        wd=0.0001
        ;;
    WNLI)
        train_example_num=635
        eval_example_num=71
        test_example_num=0
        learning_rate=1e-5
        EPOCH=5
        wd=0.0001
        ;;
    RTE)
        train_example_num=2490
        eval_example_num=277
        test_example_num=0
        learning_rate=1e-5
        EPOCH=5
        wd=0.0001
        ;;
    *)
        # Report on stderr and fail with a non-zero status so callers can
        # detect the unsupported dataset.
        echo "dataset must be GLUE such as 'CoLA','MRPC','SST-2','QQP','MNLI','WNLI','RTE'" >&2
        exit 1
        ;;
esac

# Step 1: build the initial student from the teacher snapshot.
mkdir -p "${INIT_STUDENT_DIR}"

# LAYER_LIST="0,1,2,3,4,5"
python3 ./theseus/init_stu.py \
  --teacher_model="${BERT_BASE_DIR}" \
  --student_model="${INIT_STUDENT_DIR}" \
  --layer_list="0,1,2" || exit 1

# Step 2: train starting from the initial student (replace_prob=0.5).
mkdir -p "${ONE_TRAIN_MODEL}"

CUDA_VISIBLE_DEVICES="$GPU" python3 ./theseus/run_classifier.py \
  --do_train=True \
  --model="Glue_$dataset" \
  --task_name="$dataset" \
  --gpu_num_per_node=1 \
  --num_epochs="${EPOCH}" \
  --train_data_dir="$train_data_dir" \
  --train_example_num="$train_example_num" \
  --eval_data_dir="$eval_data_dir" \
  --eval_example_num="$eval_example_num" \
  --model_load_dir="${INIT_STUDENT_DIR}" \
  --batch_size_per_device=32 \
  --eval_batch_size_per_device=4 \
  --loss_print_every_n_iter 20 \
  --log_dir="${TRAIN_LOG_DIR}" \
  --result_dir="${RESULT_DIR}" \
  --model_save_dir="${ONE_TRAIN_MODEL}" \
  --save_last_snapshot=True \
  --seq_length=128 \
  --num_hidden_layers=12 \
  --num_attention_heads=12 \
  --max_position_embeddings=512 \
  --type_vocab_size=2 \
  --vocab_size=30522 \
  --attention_probs_dropout_prob=0.1 \
  --hidden_dropout_prob=0.1 \
  --hidden_size_per_head=64 \
  --learning_rate "$learning_rate" \
  --weight_decay_rate "$wd" \
  --compress_ratio=4 \
  --replace_prob=0.5 \
  | tee -a "${ONE_TRAIN_MODEL}/train_log.txt"

# Step 3: continue from the last snapshot of step 2 (replace_prob=1.0).
mkdir -p "${STUDENT_DIR}"

# NOTE: the learning rate of this step is fixed at 1e-5 and does not follow
# the per-dataset $learning_rate chosen above.
CUDA_VISIBLE_DEVICES="$GPU" python3 ./theseus/run_classifier.py \
  --do_train=True \
  --model="Glue_$dataset" \
  --task_name="$dataset" \
  --gpu_num_per_node=1 \
  --num_epochs="${EPOCH}" \
  --train_data_dir="$train_data_dir" \
  --train_example_num="$train_example_num" \
  --eval_data_dir="$eval_data_dir" \
  --eval_example_num="$eval_example_num" \
  --model_load_dir="${ONE_TRAIN_MODEL}/snapshot_last_snapshot" \
  --batch_size_per_device=32 \
  --eval_batch_size_per_device=4 \
  --loss_print_every_n_iter 200 \
  --log_dir="${TRAIN_LOG_DIR}" \
  --result_dir="${RESULT_DIR}" \
  --model_save_dir="${STUDENT_DIR}" \
  --save_last_snapshot=True \
  --seq_length=128 \
  --num_hidden_layers=12 \
  --num_attention_heads=12 \
  --max_position_embeddings=512 \
  --type_vocab_size=2 \
  --vocab_size=30522 \
  --attention_probs_dropout_prob=0.1 \
  --hidden_dropout_prob=0.1 \
  --hidden_size_per_head=64 \
  --learning_rate=1e-5 \
  --weight_decay_rate "$wd" \
  --compress_ratio=4 \
  --replace_prob=1.0 \
  | tee -a "${STUDENT_DIR}/train_log.txt"
@@ -0,0 +1,376 @@ | |||||
""" | |||||
Copyright 2020 The OneFlow Authors. All rights reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
import oneflow as flow | |||||
import oneflow.core.common.data_type_pb2 as data_type_util | |||||
import oneflow.core.operator.op_conf_pb2 as op_conf_util | |||||
import math | |||||
class BertBackbone(object):
    """BERT encoder graph: embeddings followed by a transformer stack.

    Constructing the object builds the graph under the ``bert`` namespace and
    keeps the resulting blobs; the accessor methods hand them back.

    Args:
        input_ids_blob: token-id blob passed to ``_EmbeddingLookup``.
        input_mask_blob: mask blob converted into the attention mask.
        token_type_ids_blob: ids used for the token-type embedding lookup.
        vocab_size: number of rows of the word-embedding table.
        seq_length: sequence length used for reshapes and the attention mask.
        hidden_size: embedding and hidden width.
        num_hidden_layers: number of transformer layers to build.
        num_attention_heads: attention heads per layer.
        intermediate_size: width of the feed-forward dense layer.
        hidden_act: activation name resolved through ``GetActivation``.
        hidden_dropout_prob: dropout rate for embeddings and dense outputs.
        attention_probs_dropout_prob: dropout rate for attention probabilities.
        max_position_embeddings: rows of the position-embedding table.
        type_vocab_size: rows of the token-type embedding table.
        initializer_range: std handed to ``CreateInitializer``.
        do_return_all_layers: currently ignored; the encoder is always built
            with ``do_return_all_layers=True``.
        do_return_attentions: currently ignored; the encoder is always built
            with ``do_return_attentions=True``.
        is_train: forwarded as ``is_train`` to every sub-builder.
    """

    def __init__(self,
                 input_ids_blob,
                 input_mask_blob,
                 token_type_ids_blob,
                 vocab_size,
                 seq_length=512,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 intermediate_size=3072,
                 hidden_act="gelu",
                 hidden_dropout_prob=0.1,
                 attention_probs_dropout_prob=0.1,
                 max_position_embeddings=512,
                 type_vocab_size=16,
                 initializer_range=0.02,
                 do_return_all_layers=True,
                 do_return_attentions=False,
                 is_train=True):
        with flow.scope.namespace("bert"):
            with flow.scope.namespace("embeddings"):
                word_embeddings, self.embedding_table_ = _EmbeddingLookup(
                    input_ids_blob=input_ids_blob,
                    vocab_size=vocab_size,
                    embedding_size=hidden_size,
                    initializer_range=initializer_range,
                    word_embedding_name="word_embeddings",
                    is_train=is_train)
                # Add token-type and position embeddings, then normalise
                # and apply dropout.
                self.embedding_output_ = _EmbeddingPostprocessor(
                    input_blob=word_embeddings,
                    seq_length=seq_length,
                    embedding_size=hidden_size,
                    use_token_type=True,
                    token_type_ids_blob=token_type_ids_blob,
                    token_type_vocab_size=type_vocab_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=True,
                    position_embedding_name="position_embeddings",
                    initializer_range=initializer_range,
                    max_position_embeddings=max_position_embeddings,
                    dropout_prob=hidden_dropout_prob,
                    is_train=is_train)
            with flow.scope.namespace("encoder"):
                mask_blob = _CreateAttentionMaskFromInputMask(
                    input_mask_blob,
                    from_seq_length=seq_length,
                    to_seq_length=seq_length)
                # Both flags are hard-wired to True, so the encoder returns
                # a (layer outputs, attention probabilities) pair.
                encoder_layers, attention_probs = _TransformerModel(
                    input_blob=self.embedding_output_,
                    attention_mask_blob=mask_blob,
                    seq_length=seq_length,
                    hidden_size=hidden_size,
                    num_hidden_layers=num_hidden_layers,
                    num_attention_heads=num_attention_heads,
                    intermediate_size=intermediate_size,
                    intermediate_act_fn=GetActivation(hidden_act),
                    hidden_dropout_prob=hidden_dropout_prob,
                    attention_probs_dropout_prob=attention_probs_dropout_prob,
                    initializer_range=initializer_range,
                    do_return_all_layers=True,
                    do_return_attentions=True,
                    is_train=is_train)
                self.all_encoder_layers_ = encoder_layers
                self.all_attention_probs_ = attention_probs
            # The sequence output is the output of the last encoder layer.
            self.sequence_output_ = self.all_encoder_layers_[-1]

    def embedding_output(self):
        """Return the post-processed embedding blob."""
        return self.embedding_output_

    def all_encoder_layers(self):
        """Return the list of per-layer encoder outputs."""
        return self.all_encoder_layers_

    def all_attention_probs(self):
        """Return the list of per-layer attention probability blobs."""
        return self.all_attention_probs_

    def sequence_output(self):
        """Return the output of the last encoder layer."""
        return self.sequence_output_

    def embedding_table(self):
        """Return the word-embedding table variable."""
        return self.embedding_table_
def CreateInitializer(std):
    """Return ``flow.truncated_normal(std)`` for use as a weight initializer."""
    initializer = flow.truncated_normal(std)
    return initializer
def _Gelu(in_blob):
    """Apply ``flow.math.gelu`` to ``in_blob`` and return the result."""
    activated = flow.math.gelu(in_blob)
    return activated
def _TransformerModel(input_blob,
                      attention_mask_blob,
                      seq_length,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      intermediate_act_fn=_Gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False,
                      do_return_attentions=False,
                      is_train=True):
    """Build ``num_hidden_layers`` transformer layers on top of ``input_blob``.

    Each layer lives in namespace ``layer_<idx>`` and consists of
    self-attention (``attention/self``), an attention output projection with
    residual and layer norm (``attention/output``), a feed-forward dense
    layer (``intermediate``) and an output projection with residual and
    layer norm (``output``).

    Args:
        input_blob: embedding output; reshaped to ``(-1, hidden_size)``.
        attention_mask_blob: mask forwarded to ``_AttentionLayer``.
        seq_length: sequence length used for both sides of the attention and
            for the final reshape.
        hidden_size: hidden width; must be divisible by the head count.
        num_hidden_layers: number of layers to build.
        num_attention_heads: attention heads per layer.
        intermediate_size: width of the feed-forward dense layer.
        intermediate_act_fn: if callable it is applied to the feed-forward
            output; otherwise it is passed to ``_FullyConnected`` as
            ``activation``.
        hidden_dropout_prob: dropout rate after the dense projections.
        attention_probs_dropout_prob: dropout rate for attention probs.
        initializer_range: std handed to ``CreateInitializer``.
        do_return_all_layers: return every layer output instead of the last.
        do_return_attentions: only honoured when ``do_return_all_layers`` is
            true; then the attention probabilities are returned as well.
        is_train: forwarded as ``is_train`` to the sub-builders.

    Returns:
        * ``do_return_all_layers`` and ``do_return_attentions``: a pair
          ``(layer_outputs, attention_probs)`` of lists.
        * ``do_return_all_layers`` only: the list of layer outputs.
        * otherwise: a one-element list holding the last layer output.
        Layer outputs are reshaped to ``(-1, seq_length, hidden_size)``.
    """
    assert hidden_size % num_attention_heads == 0
    head_width = int(hidden_size / num_attention_heads)
    attention_width = num_attention_heads * head_width

    # A callable activation is applied explicitly after the dense layer, so
    # the dense layer itself is then asked for no activation.
    act_is_callable = callable(intermediate_act_fn)

    hidden_blob = flow.reshape(input_blob, (-1, hidden_size))
    layer_outputs = []
    attention_probs = []
    for layer_idx in range(num_hidden_layers):
        with flow.scope.namespace("layer_{:d}".format(layer_idx)):
            residual_blob = hidden_blob

            with flow.scope.namespace("attention"):
                with flow.scope.namespace("self"):
                    attn_blob, probs_blob = _AttentionLayer(
                        from_blob=residual_blob,
                        to_blob=residual_blob,
                        attention_mask_blob=attention_mask_blob,
                        num_attention_heads=num_attention_heads,
                        size_per_head=head_width,
                        attention_probs_dropout_prob=attention_probs_dropout_prob,
                        initializer_range=initializer_range,
                        do_return_2d_tensor=True,
                        from_seq_length=seq_length,
                        to_seq_length=seq_length,
                        is_train=is_train)
                    attention_probs.append(probs_blob)
                with flow.scope.namespace("output"):
                    attn_blob = _FullyConnected(
                        attn_blob,
                        input_size=attention_width,
                        units=hidden_size,
                        weight_initializer=CreateInitializer(initializer_range),
                        name='dense',
                        is_train=is_train)
                    attn_blob = _Dropout(attn_blob, hidden_dropout_prob)
                    attn_blob = _LayerNorm(attn_blob + residual_blob,
                                           hidden_size, is_train=is_train)

            with flow.scope.namespace("intermediate"):
                dense_activation = (op_conf_util.kNone if act_is_callable
                                    else intermediate_act_fn)
                ffn_blob = _FullyConnected(
                    attn_blob,
                    input_size=attention_width,
                    units=intermediate_size,
                    activation=dense_activation,
                    weight_initializer=CreateInitializer(initializer_range),
                    name='dense',
                    is_train=is_train)
                if act_is_callable:
                    ffn_blob = intermediate_act_fn(ffn_blob)

            with flow.scope.namespace("output"):
                out_blob = _FullyConnected(
                    ffn_blob,
                    input_size=intermediate_size,
                    units=hidden_size,
                    weight_initializer=CreateInitializer(initializer_range),
                    name='dense',
                    is_train=is_train)
                out_blob = _Dropout(out_blob, hidden_dropout_prob)
                out_blob = _LayerNorm(out_blob + attn_blob,
                                      hidden_size, is_train=is_train)

            hidden_blob = out_blob
            layer_outputs.append(out_blob)

    output_shape = (-1, seq_length, hidden_size)
    if not do_return_all_layers:
        return [flow.reshape(hidden_blob, output_shape)]

    reshaped_outputs = [flow.reshape(blob, output_shape)
                        for blob in layer_outputs]
    if do_return_attentions:
        return reshaped_outputs, attention_probs
    return reshaped_outputs
def _AttentionLayer(from_blob,
                    to_blob,
                    attention_mask_blob,
                    num_attention_heads=1,
                    size_per_head=512,
                    query_act=op_conf_util.kNone,
                    key_act=op_conf_util.kNone,
                    value_act=op_conf_util.kNone,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    do_return_2d_tensor=False,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None,
                    is_train=True):
    """Build multi-head scaled dot-product attention.

    Queries are projected from ``from_blob``; keys and values from
    ``to_blob``. Scores are scaled by ``1 / sqrt(size_per_head)``, offset by
    ``(mask - 1) * 10000`` and passed through softmax and dropout.

    Args:
        from_blob: source of the queries.
        to_blob: source of the keys and values.
        attention_mask_blob: mask reshaped to
            ``[-1, 1, from_seq_length, to_seq_length]`` and cast to float.
        num_attention_heads: number of heads.
        size_per_head: width of each head.
        query_act, key_act, value_act: forwarded to ``_FullyConnected`` as
            ``activation``.
        attention_probs_dropout_prob: dropout rate for the probabilities.
        initializer_range: std handed to ``CreateInitializer``.
        do_return_2d_tensor: if true the context is reshaped to
            ``[-1, heads * size_per_head]``, otherwise to
            ``[-1, from_seq_length, heads * size_per_head]``.
        batch_size: not used by this function.
        from_seq_length: sequence length on the query side.
        to_seq_length: sequence length on the key/value side.
        is_train: forwarded as ``is_train`` to ``_FullyConnected``.

    Returns:
        ``(context_blob, attention_probs_blob)``; the probabilities are the
        blob after dropout.
    """
    width = num_attention_heads * size_per_head

    def TransposeForScores(input_blob, num_attention_heads, seq_length, width):
        # [-1, seq, heads, width] -> [-1, heads, seq, width]
        split_heads = flow.reshape(
            input_blob, [-1, seq_length, num_attention_heads, width])
        return flow.transpose(split_heads, perm=[0, 2, 1, 3])

    def _project(blob_2d, act, name):
        # One dense projection of size width -> width named `name`.
        return _FullyConnected(
            blob_2d,
            input_size=width,
            units=width,
            activation=act,
            name=name,
            weight_initializer=CreateInitializer(initializer_range),
            is_train=is_train)

    from_2d = flow.reshape(from_blob, [-1, width])
    to_2d = flow.reshape(to_blob, [-1, width])

    queries = _project(from_2d, query_act, "query")
    keys = _project(to_2d, key_act, "key")
    values = _project(to_2d, value_act, "value")

    queries = TransposeForScores(
        queries, num_attention_heads, from_seq_length, size_per_head)
    keys = TransposeForScores(
        keys, num_attention_heads, to_seq_length, size_per_head)

    scale = 1.0 / math.sqrt(float(size_per_head))
    scores = flow.matmul(queries, keys, transpose_b=True) * scale

    mask = flow.reshape(
        attention_mask_blob, [-1, 1, from_seq_length, to_seq_length])
    mask = flow.cast(mask, dtype=flow.float)
    # Mask value 1 adds 0 to the score, mask value 0 adds -10000.
    penalty = (mask - 1.0) * 10000.0
    scores = scores + penalty

    probs = flow.nn.softmax(scores)
    probs = _Dropout(probs, attention_probs_dropout_prob)

    values = TransposeForScores(
        values, num_attention_heads, to_seq_length, size_per_head)
    context = flow.matmul(probs, values)
    context = flow.transpose(context, perm=[0, 2, 1, 3])

    if do_return_2d_tensor:
        context_shape = [-1, width]
    else:
        context_shape = [-1, from_seq_length, width]
    context = flow.reshape(context, context_shape)
    return context, probs
def _FullyConnected(input_blob, input_size, units, activation=None, name=None,
                    weight_initializer=None, is_train=True):
    """Build ``input_blob @ weight + bias`` with freshly declared variables.

    Args:
        input_blob: input to the matmul.
        input_size: first dimension of the weight variable.
        units: second dimension of the weight and size of the bias.
        activation: accepted but not applied by this function.
        name: prefix of the variables ``<name>-weight`` and ``<name>-bias``;
            must be a string because it is concatenated with the suffix.
        weight_initializer: initializer of the weight variable.
        is_train: used as the ``trainable`` flag of both variables.

    Returns:
        The blob after the matmul and bias add.
    """
    blob_dtype = input_blob.dtype
    kernel = flow.get_variable(
        name=name + '-weight',
        shape=[input_size, units],
        dtype=blob_dtype,
        trainable=is_train,
        initializer=weight_initializer)
    # The bias always starts at zero.
    bias = flow.get_variable(
        name=name + '-bias',
        shape=[units],
        dtype=blob_dtype,
        trainable=is_train,
        initializer=flow.constant_initializer(0.0))
    projected = flow.matmul(input_blob, kernel)
    return flow.nn.bias_add(projected, bias)
def _Dropout(input_blob, dropout_prob): | |||||
if dropout_prob == 0.0: | |||||
return input_blob | |||||
return flow.nn.dropout(input_blob, rate=dropout_prob) | |||||
def _LayerNorm(input_blob, hidden_size, is_train=True):
    """Layer-normalise ``input_blob`` over its last axis.

    Args:
        input_blob: blob to normalise.
        hidden_size: not used by this function.
        is_train: used as the ``trainable`` flag of the layer.

    Returns:
        The output of ``flow.layers.layer_norm`` named ``LayerNorm``.
    """
    normalized = flow.layers.layer_norm(
        input_blob,
        name='LayerNorm',
        begin_norm_axis=-1,
        begin_params_axis=-1,
        trainable=is_train)
    return normalized
def _CreateAttentionMaskFromInputMask(to_mask_blob, from_seq_length, to_seq_length):
    """Expand an input mask into an attention mask.

    The mask is cast to float, reshaped to ``[-1, 1, to_seq_length]`` and
    added to a zero constant of shape ``[from_seq_length, to_seq_length]``.
    NOTE(review): the addition presumably broadcasts the mask over the
    ``from`` axis -- confirm against the OneFlow broadcasting rules.
    """
    float_mask = flow.cast(to_mask_blob, dtype=flow.float)
    float_mask = flow.reshape(float_mask, [-1, 1, to_seq_length])
    zero_grid = flow.constant(
        0.0, dtype=flow.float, shape=[from_seq_length, to_seq_length])
    return zero_grid + float_mask
def _EmbeddingPostprocessor(input_blob,
                            seq_length,
                            embedding_size,
                            use_token_type=False,
                            token_type_ids_blob=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1,
                            is_train=True):
    """Add token-type and position embeddings, then layer norm and dropout.

    Args:
        input_blob: word-embedding output.
        seq_length: sequence length; must not exceed
            ``max_position_embeddings``.
        embedding_size: width of the embedding tables.
        use_token_type: add token-type embeddings when true.
        token_type_ids_blob: indices for the token-type lookup; required
            when ``use_token_type`` is true.
        token_type_vocab_size: rows of the token-type table.
        token_type_embedding_name: variable name of the token-type table.
        use_position_embeddings: add position embeddings when true.
        position_embedding_name: variable name of the position table.
        initializer_range: std handed to ``CreateInitializer``.
        max_position_embeddings: rows of the position table.
        dropout_prob: dropout rate applied at the end.
        is_train: used as ``trainable`` for the tables and forwarded to
            ``_LayerNorm``.

    Returns:
        The summed, normalised and dropped-out embedding blob.
    """
    embeddings = input_blob

    if use_token_type:
        assert token_type_ids_blob is not None
        type_table = flow.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, embedding_size],
            dtype=input_blob.dtype,
            trainable=is_train,
            initializer=CreateInitializer(initializer_range))
        type_embeddings = flow.gather(
            params=type_table, indices=token_type_ids_blob, axis=0)
        embeddings = embeddings + type_embeddings

    if use_position_embeddings:
        pos_table = flow.get_variable(
            name=position_embedding_name,
            shape=[1, max_position_embeddings, embedding_size],
            dtype=input_blob.dtype,
            trainable=is_train,
            initializer=CreateInitializer(initializer_range))
        assert seq_length <= max_position_embeddings
        if seq_length != max_position_embeddings:
            # Keep only the first seq_length positions of the table.
            pos_table = flow.slice(
                pos_table, begin=[None, 0, 0], size=[None, seq_length, -1])
        embeddings = embeddings + pos_table

    embeddings = _LayerNorm(embeddings, embedding_size, is_train=is_train)
    return _Dropout(embeddings, dropout_prob)
def _EmbeddingLookup(input_ids_blob,
                     vocab_size,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name="word_embeddings",
                     is_train=True):
    """Declare the word-embedding table and gather rows for the input ids.

    Args:
        input_ids_blob: indices gathered from the table along axis 0.
        vocab_size: rows of the table.
        embedding_size: columns of the table.
        initializer_range: std handed to ``CreateInitializer``.
        word_embedding_name: variable name of the table.
        is_train: used as the ``trainable`` flag of the table.

    Returns:
        ``(gathered_embeddings, embedding_table)``.
    """
    table = flow.get_variable(
        name=word_embedding_name,
        shape=[vocab_size, embedding_size],
        dtype=flow.float,
        trainable=is_train,
        initializer=CreateInitializer(initializer_range))
    gathered = flow.gather(params=table, indices=input_ids_blob, axis=0)
    return gathered, table
def GetActivation(name):
    """Map an activation name to the matching ``flow.math`` function.

    Args:
        name: one of ``'linear'``, ``'relu'``, ``'tanh'`` or ``'gelu'``.

    Returns:
        ``None`` for ``'linear'``, otherwise the ``flow.math`` function.

    Raises:
        Exception: if ``name`` is not one of the supported names.
    """
    if name == 'linear':
        return None
    if name == 'relu':
        return flow.math.relu
    if name == 'tanh':
        return flow.math.tanh
    if name == 'gelu':
        return flow.math.gelu
    raise Exception("unsupported activation")
@@ -0,0 +1,116 @@ | |||||
""" | |||||
Copyright 2020 The OneFlow Authors. All rights reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
import oneflow as flow | |||||
import bert as bert_util | |||||
import oneflow.core.operator.op_conf_pb2 as op_conf_util | |||||
def GlueBERT(
    input_ids_blob,
    input_mask_blob,
    token_type_ids_blob,
    label_blob,
    vocab_size,
    seq_length=512,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=16,
    initializer_range=0.02,
    label_num=2,
    replace_prob=None,
):
    """Build a BERT classifier: backbone, pooler and classification head.

    Args:
        input_ids_blob, input_mask_blob, token_type_ids_blob: inputs passed
            to ``bert_util.BertBackbone``.
        label_blob: labels used for the cross-entropy loss.
        vocab_size ... initializer_range: forwarded unchanged to the
            backbone; ``hidden_size`` and ``initializer_range`` are also
            used by the pooler and the classification head.
        label_num: number of output classes.
        replace_prob: not used by this function.

    Returns:
        ``(loss, logit_blob)`` where ``loss`` is the per-example loss.
    """
    encoder = bert_util.BertBackbone(
        input_ids_blob=input_ids_blob,
        input_mask_blob=input_mask_blob,
        token_type_ids_blob=token_type_ids_blob,
        vocab_size=vocab_size,
        seq_length=seq_length,
        hidden_size=hidden_size,
        num_hidden_layers=num_hidden_layers,
        num_attention_heads=num_attention_heads,
        intermediate_size=intermediate_size,
        hidden_act=hidden_act,
        hidden_dropout_prob=hidden_dropout_prob,
        attention_probs_dropout_prob=attention_probs_dropout_prob,
        max_position_embeddings=max_position_embeddings,
        type_vocab_size=type_vocab_size,
        initializer_range=initializer_range,
    )
    pooled = PooledOutput(
        sequence_output=encoder.sequence_output(),
        hidden_size=hidden_size,
        initializer_range=initializer_range,
    )
    # The middle element of the head's result is not needed here.
    head_outputs = _AddClassficationLoss(
        input_blob=pooled,
        label_blob=label_blob,
        hidden_size=hidden_size,
        label_num=label_num,
        initializer_range=initializer_range,
        scope_name='classification',
    )
    loss, _, logit_blob = head_outputs
    return loss, logit_blob
def PooledOutput(sequence_output, hidden_size, initializer_range):
    """Pool a sequence output through a dense layer and tanh.

    Under namespace ``bert-pooler`` the slice starting at index 0 with
    length 1 along the second axis is taken, reshaped to
    ``[-1, hidden_size]``, projected by a dense layer named ``dense`` and
    passed through ``tanh``.

    Args:
        sequence_output: encoder output blob.
        hidden_size: input and output width of the dense layer.
        initializer_range: std handed to ``bert_util.CreateInitializer``.

    Returns:
        The pooled blob.
    """
    with flow.scope.namespace("bert-pooler"):
        first_token = flow.slice(sequence_output, [None, 0, 0], [None, 1, -1])
        first_token = flow.reshape(first_token, [-1, hidden_size])
        dense_out = bert_util._FullyConnected(
            first_token,
            input_size=hidden_size,
            units=hidden_size,
            weight_initializer=bert_util.CreateInitializer(initializer_range),
            name="dense",
        )
        pooled = flow.math.tanh(dense_out)
    return pooled
def _AddClassficationLoss(input_blob, label_blob, hidden_size, label_num, initializer_range,
                          scope_name='classification'):
    """Add a linear classification head and its softmax cross-entropy loss.

    Args:
        input_blob: pooled features fed to the head.
        label_blob: labels for the sparse softmax cross-entropy.
        hidden_size: second dimension of the output weight.
        label_num: number of classes.
        initializer_range: stddev of the normal weight initializer.
        scope_name: namespace the variables are declared under.

    Returns:
        ``(loss, per_example_loss, logit_blob)``; ``loss`` and
        ``per_example_loss`` are the same blob.
    """
    with flow.scope.namespace(scope_name):
        weights = flow.get_variable(
            name="output_weights",
            shape=[label_num, hidden_size],
            dtype=input_blob.dtype,
            initializer=flow.random_normal_initializer(
                mean=0.0, stddev=initializer_range, seed=None, dtype=None),
        )
        bias = flow.get_variable(
            name="output_bias",
            shape=[label_num],
            dtype=input_blob.dtype,
            initializer=flow.constant_initializer(0.0),
        )
        # Weights are stored as [label_num, hidden_size], hence transpose_b.
        logits = flow.nn.bias_add(
            flow.matmul(input_blob, weights, transpose_b=True), bias)
        per_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=label_blob)
        return per_example_loss, per_example_loss, logits
@@ -0,0 +1,107 @@ | |||||
""" | |||||
Copyright 2020 The OneFlow Authors. All rights reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
import argparse | |||||
from datetime import datetime | |||||
def str_list(x):
    """Split a comma-separated string into a list of strings."""
    return x.split(',')


def int_list(x):
    """Split a comma-separated string into a list of ints."""
    return [int(item) for item in x.split(',')]


def float_list(x):
    """Split a comma-separated string into a list of floats."""
    return [float(item) for item in x.split(',')]


def str2bool(v):
    """Convert a command-line string to a bool.

    Accepts (case-insensitively) ``yes/true/t/y/1`` and ``no/false/f/n/0``.
    A value that already is a ``bool`` is returned unchanged.

    Raises:
        argparse.ArgumentTypeError: for any other value.
    """
    if isinstance(v, bool):
        return v
    lowered = v.lower()
    if lowered in ('yes', 'true', 't', 'y', '1'):
        return True
    if lowered in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Unsupported value encountered.')


def get_parser(parser=None):
    """Return an argument parser carrying the BERT flags.

    Args:
        parser: optional existing ``argparse.ArgumentParser`` to extend. When
            omitted a new parser is created.

    Returns:
        The parser with all flags registered.
    """
    if parser is None:
        parser = argparse.ArgumentParser(description="flags for bert")

    parser.add_argument('--do_train', type=str2bool, nargs='?', const=True, help='train or not')
    parser.add_argument('--do_eval', type=str2bool, nargs='?', const=True, help='eval or not')

    # resource
    parser.add_argument("--model", type=str, default='BERT Pretrain')
    parser.add_argument("--gpu_num_per_node", type=int, default=1)
    parser.add_argument('--num_nodes', type=int, default=1,
                        help='node/machine number for training')
    parser.add_argument('--node_ips', type=str_list, default=['192.168.1.13', '192.168.1.14'],
                        help='nodes ip list for training, devided by ",", length >= num_nodes')
    parser.add_argument("--ctrl_port", type=int, default=50051, help='ctrl_port for multinode job')

    # train
    parser.add_argument("--learning_rate", type=float, default=1e-4, help="Learning rate")
    parser.add_argument("--weight_decay_rate", type=float, default=0.01, help="weight decay rate")
    parser.add_argument("--warmup_proportion", type=float, default=0.1)
    parser.add_argument('--use_fp16', type=str2bool, nargs='?', default=False, const=True,
                        help='use use fp16 or not')
    parser.add_argument('--use_xla', type=str2bool, nargs='?', const=True,
                        help='Whether to use use xla')

    # log and restore/save
    parser.add_argument("--loss_print_every_n_iter", type=int, default=10, required=False,
                        help="print loss every n iteration")
    parser.add_argument("--model_save_every_n_iter", type=int, default=10000, required=False,
                        help="save model every n iteration",)
    parser.add_argument("--model_save_dir", type=str,
                        default="./output/model_save-{}".format(
                            datetime.now().strftime("%Y-%m-%d-%H:%M:%S")),
                        required=False, help="model save directory")
    # type=bool would turn every non-empty string (including "False") into
    # True, so the flag is parsed with str2bool like the other booleans.
    parser.add_argument("--save_last_snapshot", type=str2bool, nargs='?', const=True,
                        default=False, required=False,
                        help="save model snapshot for last iteration")
    parser.add_argument("--model_load_dir", type=str, default=None, help="model load directory")
    parser.add_argument("--log_dir", type=str, default="./output", help="log info save directory")

    # bert backbone
    parser.add_argument('--do_lower_case', type=str2bool, nargs='?', const=True, default=True)
    parser.add_argument("--seq_length", type=int, default=512)
    parser.add_argument("--max_predictions_per_seq", type=int, default=80)
    parser.add_argument("--num_hidden_layers", type=int, default=24)
    parser.add_argument("--num_attention_heads", type=int, default=16)
    parser.add_argument("--max_position_embeddings", type=int, default=512)
    parser.add_argument("--type_vocab_size", type=int, default=2)
    parser.add_argument("--vocab_size", type=int, default=30522)
    parser.add_argument("--attention_probs_dropout_prob", type=float, default=0.1)
    parser.add_argument("--hidden_dropout_prob", type=float, default=0.1)
    parser.add_argument("--hidden_size_per_head", type=int, default=64)
    parser.add_argument("--hidden_size", type=int, default=768)

    return parser
def print_args(args):
    """Print a banner, every attribute of ``args`` and a time stamp.

    Args:
        args: namespace with at least ``model``, ``gpu_num_per_node`` and
            ``num_nodes`` attributes.
    """
    banner = "=" * 66
    print(banner)
    headline = "Running {}: num_gpu_per_node = {}, num_nodes = {}.".format(
        args.model, args.gpu_num_per_node, args.num_nodes)
    print(headline)
    print(banner)
    for key in vars(args):
        print("{} = {}".format(key, getattr(args, key)))
    print("-" * 66)
    stamp = datetime.now().strftime("%Y-%m-%d-%H:%M:%S")
    print("Time stamp: {}".format(stamp))
if __name__ == '__main__':
    # When run as a script: parse the command line and print the settings.
    print_args(get_parser().parse_args())
@@ -0,0 +1,121 @@ | |||||
# coding=utf-8 | |||||
# Copyright 2018 The HuggingFace Inc. team. | |||||
# | |||||
# Licensed under the Apache License, Version 2.0 (the "License"); | |||||
# you may not use this file except in compliance with the License. | |||||
# You may obtain a copy of the License at | |||||
# | |||||
# http://www.apache.org/licenses/LICENSE-2.0 | |||||
# | |||||
# Unless required by applicable law or agreed to in writing, software | |||||
# distributed under the License is distributed on an "AS IS" BASIS, | |||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
# See the License for the specific language governing permissions and | |||||
# limitations under the License. | |||||
"""Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint.""" | |||||
import argparse | |||||
import os | |||||
import numpy as np | |||||
import tensorflow as tf | |||||
import torch | |||||
from transformers import BertModel,BertConfig | |||||
from modeling import TinyBertForSequenceClassification | |||||
def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name: str):
    """Save the weights of a PyTorch model as a TensorFlow checkpoint.

    Every entry of ``model.state_dict()`` is renamed with the substitution
    table below, transposed when its name contains one of the patterns in
    ``tensors_to_transpose``, copied into a TF variable and finally saved to
    ``<ckpt_dir>/bert_model.ckpt``.

    :param model: BertModel Pytorch model instance to be converted
    :param ckpt_dir: Tensorflow model directory (created if missing)
    :param model_name: model name; not used by this function
    :return: None

    Currently supported HF models:
        Y BertModel
        N BertForMaskedLM
        N BertForPreTraining
        N BertForMultipleChoice
        N BertForNextSentencePrediction
        N BertForSequenceClassification
        N BertForQuestionAnswering
    """
    tensors_to_transpose = ("dense.weight", "attention.self.query", "attention.self.key", "attention.self.value")

    # Substitutions are applied in this order, each on the result of the
    # previous one.
    var_map = (
        ("layer.", "layer_"),
        ("word_embeddings.weight", "word_embeddings"),
        ("position_embeddings.weight", "position_embeddings"),
        ("token_type_embeddings.weight", "token_type_embeddings"),
        (".", "/"),
        ("LayerNorm/weight", "LayerNorm/gamma"),
        ("LayerNorm/bias", "LayerNorm/beta"),
        ("weight", "kernel"),
        ("classifier", "classification-output"),
    )

    os.makedirs(ckpt_dir, exist_ok=True)

    state_dict = model.state_dict()
    print('torch state_dict.keys(): ',state_dict.keys())

    def to_tf_var_name(name: str):
        # Translate a PyTorch parameter name into its TF variable name.
        for pattern, replacement in var_map:
            name = name.replace(pattern, replacement)
        return "{}".format(name)

    def create_tf_var(tensor: np.ndarray, name: str, session: tf.Session):
        # Declare a zero-initialised TF variable matching `tensor`.
        variable = tf.get_variable(
            dtype=tf.dtypes.as_dtype(tensor.dtype),
            shape=tensor.shape,
            name=name,
            initializer=tf.zeros_initializer())
        session.run(tf.variables_initializer([variable]))
        session.run(variable)
        return variable

    tf.reset_default_graph()
    with tf.Session() as session:
        for torch_name, torch_param in state_dict.items():
            tf_name = to_tf_var_name(torch_name)
            weights = torch_param.numpy()
            if any(pattern in torch_name for pattern in tensors_to_transpose):
                weights = weights.T
            tf_var = create_tf_var(tensor=weights, name=tf_name, session=session)
            tf.keras.backend.set_value(tf_var, weights)
            # Read the value back to report whether the copy matches.
            copied = session.run(tf_var)
            print("Successfully created {}: {}".format(tf_name, np.allclose(copied, weights)))

        saver = tf.train.Saver(tf.trainable_variables())
        saver.save(session, os.path.join(ckpt_dir, 'bert_model' + ".ckpt"))
def main(raw_args=None):
    """Parse the command line, load the TinyBERT student model and export it.

    ``raw_args`` is forwarded to ``argparse``; ``None`` makes it read
    ``sys.argv``. The model is built from ``--model_config_path`` with a
    two-label classification config and handed to
    ``convert_pytorch_checkpoint_to_tf``.
    """
    # (flag, extra keyword arguments) in the order they appear in --help.
    # NOTE: --cache_dir and --pytorch_model_path are parsed but not read below.
    option_table = (
        ("--model_name", dict(required=True, help="model name e.g. bert-base-uncased")),
        ("--cache_dir", dict(default=None, required=False, help="Directory containing pytorch model")),
        ("--pytorch_model_path", dict(required=True, help="/path/to/<pytorch-model-name>.bin")),
        ("--model_config_path", dict(required=True, help="/path/to/<pytorch-model-name>")),
        ("--tf_cache_dir", dict(required=True, help="Directory in which to save tensorflow model")),
    )
    cli = argparse.ArgumentParser()
    for flag, extra in option_table:
        cli.add_argument(flag, type=str, **extra)
    args = cli.parse_args(raw_args)

    num_labels = 2
    student_config = BertConfig.from_pretrained(args.model_config_path, num_labels=num_labels)
    model = TinyBertForSequenceClassification.from_pretrained(
        args.model_config_path,
        config=student_config,
    )

    convert_pytorch_checkpoint_to_tf(
        model=model,
        ckpt_dir=args.tf_cache_dir,
        model_name=args.model_name,
    )


if __name__ == "__main__":
    main()
@@ -0,0 +1,90 @@ | |||||
""" | |||||
Copyright 2020 The OneFlow Authors. All rights reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
"""Convert tensorflow checkpoint to oneflow snapshot""" | |||||
import re | |||||
import argparse | |||||
import tensorflow as tf | |||||
import numpy as np | |||||
import os | |||||
# Command-line handling runs at import time: both options below are required.
parser = argparse.ArgumentParser()

## Required parameters
parser.add_argument("--tf_checkpoint_path",
                    default = None,
                    type = str,
                    required = True,
                    help = "Path the TensorFlow checkpoint path.")
parser.add_argument("--of_dump_path",
                    default = None,
                    type = str,
                    required = True,
                    help = "Path to the output OneFlow model.")

# parse_known_args (instead of parse_args) keeps unrecognised tokens in
# `unknown` so they can be interpreted as extra weights below.
args, unknown = parser.parse_known_args()
print(args)

# parse unknown arguments for extra weights
# Each leftover token must look like "<name>=<number>"; it maps the name to
# the float fill value used by _SaveWeightBlob2File.
extra_weights = {}
for u in unknown:
    w = u.split("=")
    assert len(w) == 2
    # Still guarded explicitly because the assert above is stripped under -O.
    if len(w) == 2:
        extra_weights[w[0]] = float(w[1])
def _write_blob(folder, blob):
    """Write the raw bytes of ``blob`` to ``<folder>/out``.

    Args:
        folder: Directory receiving the file; it is created (with parents)
            when missing.
        blob: Array-like object exposing ``tobytes()`` and ``shape``.
    """
    os.makedirs(folder, exist_ok=True)
    filename = os.path.join(folder, "out")
    # The context manager closes the handle even when the write raises.
    with open(filename, 'wb') as f:
        f.write(blob.tobytes())
    print(filename, blob.shape)
def _SaveWeightBlob2File(blob, folder):
    """Dump ``blob`` into ``folder`` plus one constant-filled twin per extra weight.

    For every ``name -> value`` pair in the module-level ``extra_weights`` an
    array shaped like ``blob`` and filled with ``value`` is written to the
    directory ``folder + name``.
    """
    _write_blob(folder, blob)
    for suffix in extra_weights:
        filler = np.full_like(blob, extra_weights[suffix])
        _write_blob(folder + suffix, filler)
def convert():
    """Export every variable of the TensorFlow checkpoint as a OneFlow blob folder.

    A variable ``a/b/kernel`` ends up in ``<of_dump_path>/a-b-weight``: the
    scope separators become dashes and a trailing ``kernel`` is renamed to
    ``weight``.
    """
    ckpt = args.tf_checkpoint_path
    for name, _shape in tf.train.list_variables(ckpt):
        array = tf.train.load_variable(ckpt, name)

        # NOTE(review): for a name without '/', rfind returns -1, so op_name
        # becomes the name minus its last character - confirm this is intended.
        sep = name.rfind('/')
        op_name = name[:sep].replace('/', '-')
        blob_name = name[sep + 1:]

        if blob_name in ('adam_m', 'adam_v'):
            print("find m, v weights")
        elif blob_name == "kernel":
            blob_name = "weight"

        folder = os.path.join(args.of_dump_path, "{}-{}".format(op_name, blob_name))
        _SaveWeightBlob2File(array, folder)


if __name__ == "__main__":
    convert()
@@ -0,0 +1,90 @@ | |||||
""" | |||||
Copyright 2020 The OneFlow Authors. All rights reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
"""Convert tensorflow checkpoint to oneflow snapshot""" | |||||
import re | |||||
import argparse | |||||
import tensorflow as tf | |||||
import numpy as np | |||||
import os | |||||
# Command-line handling runs at import time: both options below are required.
parser = argparse.ArgumentParser()

## Required parameters
parser.add_argument("--tf_checkpoint_path",
                    default = None,
                    type = str,
                    required = True,
                    help = "Path the TensorFlow checkpoint path.")
parser.add_argument("--of_dump_path",
                    default = None,
                    type = str,
                    required = True,
                    help = "Path to the output OneFlow model.")

# parse_known_args (instead of parse_args) keeps unrecognised tokens in
# `unknown` so they can be interpreted as extra weights below.
args, unknown = parser.parse_known_args()
print(args)

# parse unknown arguments for extra weights
# Each leftover token must look like "<name>=<number>"; it maps the name to
# the float fill value used by _SaveWeightBlob2File.
extra_weights = {}
for u in unknown:
    w = u.split("=")
    assert len(w) == 2
    # Still guarded explicitly because the assert above is stripped under -O.
    if len(w) == 2:
        extra_weights[w[0]] = float(w[1])
def _write_blob(folder, blob):
    """Write the raw bytes of ``blob`` to ``<folder>/out``.

    Args:
        folder: Directory receiving the file; it is created (with parents)
            when missing.
        blob: Array-like object exposing ``tobytes()`` and ``shape``.
    """
    os.makedirs(folder, exist_ok=True)
    filename = os.path.join(folder, "out")
    # The context manager closes the handle even when the write raises.
    with open(filename, 'wb') as f:
        f.write(blob.tobytes())
    print(filename, blob.shape)
def _SaveWeightBlob2File(blob, folder):
    """Dump ``blob`` into ``folder`` plus one constant-filled twin per extra weight.

    For every ``name -> value`` pair in the module-level ``extra_weights`` an
    array shaped like ``blob`` and filled with ``value`` is written to the
    directory ``folder + name``.
    """
    _write_blob(folder, blob)
    for suffix in extra_weights:
        filler = np.full_like(blob, extra_weights[suffix])
        _write_blob(folder + suffix, filler)
def convert():
    """Export every variable of the TensorFlow checkpoint as a ``student-`` blob folder.

    A variable ``a/b/kernel`` ends up in ``<of_dump_path>/student-a-b-weight``:
    the scope separators become dashes, a trailing ``kernel`` is renamed to
    ``weight`` and the folder name is prefixed with ``student-``.
    """
    ckpt = args.tf_checkpoint_path
    for name, _shape in tf.train.list_variables(ckpt):
        array = tf.train.load_variable(ckpt, name)

        # NOTE(review): for a name without '/', rfind returns -1, so op_name
        # becomes the name minus its last character - confirm this is intended.
        sep = name.rfind('/')
        op_name = name[:sep].replace('/', '-')
        blob_name = name[sep + 1:]

        if blob_name in ('adam_m', 'adam_v'):
            print("find m, v weights")
        elif blob_name == "kernel":
            blob_name = "weight"

        folder = os.path.join(args.of_dump_path, "student-{}-{}".format(op_name, blob_name))
        _SaveWeightBlob2File(array, folder)


if __name__ == "__main__":
    convert()
@@ -0,0 +1,141 @@ | |||||
''' Script for downloading all GLUE data. | |||||
Note: for legal reasons, we are unable to host MRPC. | |||||
You can either use the version hosted by the SentEval team, which is already tokenized, | |||||
or you can download the original data from (https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B-3604ED519838/MSRParaphraseCorpus.msi) and extract the data from it manually. | |||||
For Windows users, you can run the .msi file. For Mac and Linux users, consider an external library such as 'cabextract' (see below for an example). | |||||
You should then rename and place specific files in a folder (see below for an example). | |||||
mkdir MRPC | |||||
cabextract MSRParaphraseCorpus.msi -d MRPC | |||||
cat MRPC/_2DEC3DBE877E4DB192D17C0256E90F1D | tr -d $'\r' > MRPC/msr_paraphrase_train.txt | |||||
cat MRPC/_D7B391F9EAFF4B1B8BCE8F21B20B1B61 | tr -d $'\r' > MRPC/msr_paraphrase_test.txt | |||||
rm MRPC/_* | |||||
rm MSRParaphraseCorpus.msi | |||||
1/30/19: It looks like SentEval is no longer hosting their extracted and tokenized MRPC data, so you'll need to download the data from the original source for now. | |||||
2/11/19: It looks like SentEval actually *is* hosting the extracted data. Hooray! | |||||
''' | |||||
import os | |||||
import sys | |||||
import shutil | |||||
import argparse | |||||
import tempfile | |||||
import urllib.request | |||||
import zipfile | |||||
# Task names accepted by get_tasks(); "all" on the command line selects this whole list.
TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "SNLI", "QNLI", "RTE", "WNLI", "diagnostic"]
# Download URL per task. Most entries point to a zip archive; the "MRPC" entry
# points to the dev-ids TSV and the "diagnostic" entry to a single TSV file.
TASK2PATH = {"CoLA":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4',
             "SST":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8',
             "MRPC":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc',
             "QQP":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQQP.zip?alt=media&token=700c6acf-160d-4d89-81d1-de4191d02cb5',
             "STS":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSTS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5',
             "MNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce',
             "SNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSNLI.zip?alt=media&token=4afcfbb2-ff0c-4b2d-a09a-dbf07926f4df',
             "QNLI": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLIv2.zip?alt=media&token=6fdcf570-0fc5-4631-8456-9505272d1601',
             "RTE":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FRTE.zip?alt=media&token=5efa7e85-a0bb-4f19-8ea2-9e1840f077fb',
             "WNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf',
             "diagnostic":'https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D'}

# Raw MRPC train/test files, fetched by format_mrpc() when no local copy is given.
MRPC_TRAIN = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt'
MRPC_TEST = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt'
def download_and_extract(task, data_dir):
    """Download the archive registered for ``task`` and unpack it into ``data_dir``.

    The archive is stored temporarily as ``<task>.zip`` in the current working
    directory and deleted again once extraction has finished or failed.

    Args:
        task: Key into ``TASK2PATH`` selecting the download URL.
        data_dir: Directory the archive content is extracted into.
    """
    print("Downloading and extracting %s..." % task)
    data_file = "%s.zip" % task
    urllib.request.urlretrieve(TASK2PATH[task], data_file)
    try:
        with zipfile.ZipFile(data_file) as zip_ref:
            zip_ref.extractall(data_dir)
    finally:
        # Do not leave the temporary archive behind when it is corrupt or
        # extraction fails half-way.
        os.remove(data_file)
    print("\tCompleted!")
def format_mrpc(data_dir, path_to_data):
    """Build ``train.tsv``, ``dev.tsv`` and ``test.tsv`` for MRPC under ``<data_dir>/MRPC``.

    The raw ``msr_paraphrase_{train,test}.txt`` files are read from
    ``path_to_data`` when it is non-empty, otherwise they are downloaded.
    The list of dev-set id pairs is always downloaded; raw training rows whose
    id pair appears in it go to ``dev.tsv``, all others to ``train.tsv``.
    ``test.tsv`` gets a running index instead of the label column.

    Args:
        data_dir: Root output directory (must already exist).
        path_to_data: Directory holding the extracted raw MRPC files, or a
            falsy value to download them.

    Raises:
        AssertionError: If one of the raw files is missing after the lookup/download.
        ValueError: If a raw data row does not have exactly five tab-separated fields.
    """
    print("Processing MRPC...")
    mrpc_dir = os.path.join(data_dir, "MRPC")
    if not os.path.isdir(mrpc_dir):
        os.mkdir(mrpc_dir)
    if path_to_data:
        mrpc_train_file = os.path.join(path_to_data, "msr_paraphrase_train.txt")
        mrpc_test_file = os.path.join(path_to_data, "msr_paraphrase_test.txt")
    else:
        print("Local MRPC data not specified, downloading data from %s" % MRPC_TRAIN)
        mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
        mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
        urllib.request.urlretrieve(MRPC_TRAIN, mrpc_train_file)
        urllib.request.urlretrieve(MRPC_TEST, mrpc_test_file)
    assert os.path.isfile(mrpc_train_file), "Train data not found at %s" % mrpc_train_file
    assert os.path.isfile(mrpc_test_file), "Test data not found at %s" % mrpc_test_file

    dev_ids_file = os.path.join(mrpc_dir, "dev_ids.tsv")
    urllib.request.urlretrieve(TASK2PATH["MRPC"], dev_ids_file)

    # A set of tuples gives constant-time membership tests; the former list of
    # lists was scanned linearly once per training row.
    with open(dev_ids_file, encoding="utf8") as ids_fh:
        dev_ids = {tuple(row.strip().split('\t')) for row in ids_fh}

    with open(mrpc_train_file, encoding="utf8") as data_fh, \
         open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding="utf8") as train_fh, \
         open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding="utf8") as dev_fh:
        header = data_fh.readline()
        train_fh.write(header)
        dev_fh.write(header)
        for row in data_fh:
            label, id1, id2, s1, s2 = row.strip().split('\t')
            out_fh = dev_fh if (id1, id2) in dev_ids else train_fh
            out_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))

    with open(mrpc_test_file, encoding="utf8") as data_fh, \
         open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding="utf8") as test_fh:
        # Skip the raw header; the test file gets its own header with an index column.
        data_fh.readline()
        test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
        for idx, row in enumerate(data_fh):
            # The label is dropped on purpose: the written test split is unlabeled.
            _label, id1, id2, s1, s2 = row.strip().split('\t')
            test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))
    print("\tCompleted!")
def download_diagnostic(data_dir):
    """Fetch the diagnostic TSV into ``<data_dir>/diagnostic/diagnostic.tsv``.

    The ``diagnostic`` sub-directory is created when missing; ``data_dir``
    itself must already exist.
    """
    print("Downloading and extracting diagnostic...")
    target_dir = os.path.join(data_dir, "diagnostic")
    if not os.path.isdir(target_dir):
        os.mkdir(target_dir)
    urllib.request.urlretrieve(
        TASK2PATH["diagnostic"],
        os.path.join(target_dir, "diagnostic.tsv"),
    )
    print("\tCompleted!")
def get_tasks(task_names):
    """Turn a comma separated task string into a list of task names.

    Returns the module-level ``TASKS`` list itself when ``"all"`` is among the
    requested names; otherwise every name is checked against ``TASKS`` and an
    ``AssertionError`` is raised for the first unknown one.
    """
    requested = task_names.split(',')
    if "all" in requested:
        return TASKS

    selected = []
    for candidate in requested:
        assert candidate in TASKS, "Task %s not found!" % candidate
        selected.append(candidate)
    return selected
def main(arguments):
    """Download the requested GLUE tasks into ``--data_dir``.

    Args:
        arguments: Command-line tokens (without the program name) handed to argparse.

    Returns:
        None, so ``sys.exit(main(...))`` exits with status 0 on success.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', help='directory to save data to', type=str, default='glue_data')
    parser.add_argument('--tasks', help='tasks to download data for as a comma separated string',
                        type=str, default='all')
    # The help text used to name "msr_paraphrase_text.txt"; the file that
    # format_mrpc() actually looks for is msr_paraphrase_test.txt.
    parser.add_argument('--path_to_mrpc', help='path to directory containing extracted MRPC data, msr_paraphrase_train.txt and msr_paraphrase_test.txt',
                        type=str, default='')
    args = parser.parse_args(arguments)

    # makedirs also handles a nested --data_dir whose parents do not exist yet.
    os.makedirs(args.data_dir, exist_ok=True)
    tasks = get_tasks(args.tasks)

    for task in tasks:
        if task == 'MRPC':
            # MRPC needs the raw files re-split into train/dev/test.
            format_mrpc(args.data_dir, args.path_to_mrpc)
        elif task == 'diagnostic':
            # The diagnostic set is a single TSV, not a zip archive.
            download_diagnostic(args.data_dir)
        else:
            download_and_extract(task, args.data_dir)


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
@@ -0,0 +1,793 @@ | |||||
# coding=utf-8 | |||||
# Copyright 2018 The Google AI Language Team Authors. | |||||
# | |||||
# Licensed under the Apache License, Version 2.0 (the "License"); | |||||
# you may not use this file except in compliance with the License. | |||||
# You may obtain a copy of the License at | |||||
# | |||||
# http://www.apache.org/licenses/LICENSE-2.0 | |||||
# | |||||
# Unless required by applicable law or agreed to in writing, software | |||||
# distributed under the License is distributed on an "AS IS" BASIS, | |||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
# See the License for the specific language governing permissions and | |||||
# limitations under the License. | |||||
"""BERT finetuning runner.""" | |||||
from __future__ import absolute_import | |||||
from __future__ import division | |||||
from __future__ import print_function | |||||
import oneflow.core.record.record_pb2 as of_record | |||||
import collections | |||||
import csv | |||||
import os | |||||
import tokenization | |||||
import logging_setup | |||||
import struct | |||||
from parse_args import parse_args | |||||
# import pandas as pd | |||||
# import tensorflow as tf | |||||
# Pick the logger flavour: a regular one when run as a script, the
# multiprocessing one when this module is imported.
logger = (
    logging_setup.setup_logger(__name__)
    if __name__ == '__main__'
    else logging_setup.setup_multiprocessing_logger()
)
class InputExample(object):
    """One raw (untokenized) example for sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Store the fields of a single example.

        Args:
          guid: Unique id for the example.
          text_a: string. Untokenized text of the first sequence; the only
            text needed for single-sequence tasks.
          text_b: (Optional) string. Untokenized text of the second sequence,
            needed for sequence-pair tasks only.
          label: (Optional) string. Label of the example; expected for train
            and dev examples, not for test examples.
        """
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label
class PaddingInputExample(object):
    """Fake example so the num input examples is a multiple of the batch size.

    When running eval/predict on the TPU, we need to pad the number of examples
    to be a multiple of the batch size, because the TPU requires a fixed batch
    size. The alternative is to drop the last batch, which is bad because it means
    the entire output data won't be generated.

    We use this class instead of `None` because treating `None` as padding
    batches could cause silent errors.
    """
class InputFeatures(object):
    """Container for the model-ready features of one example."""

    def __init__(self,
                 input_ids,
                 input_mask,
                 segment_ids,
                 label_id,
                 is_real_example=True):
        """Store the given feature values unchanged as attributes of the same name."""
        self.input_ids, self.input_mask = input_ids, input_mask
        self.segment_ids, self.label_id = segment_ids, label_id
        self.is_real_example = is_real_example
class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_test_examples(self, data_dir):
        """Gets a collection of `InputExample`s for prediction."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file.

        Args:
          input_file: Path of the UTF-8 encoded TSV file.
          quotechar: Quote character handed to `csv.reader`; the default
            `None` is passed through unchanged.

        Returns:
          A list with one list of string fields per row, header included.
        """
        # newline='' lets the csv module do its own line-ending handling, as
        # its documentation requires; otherwise line breaks embedded in quoted
        # fields are not interpreted correctly.
        with open(input_file, "r", encoding='utf-8', newline='') as f:
            return list(csv.reader(f, delimiter="\t", quotechar=quotechar))
class XnliProcessor(DataProcessor):
    """Reader for the XNLI data set, fixed to the language code "zh"."""

    def __init__(self):
        self.language = "zh"

    def get_train_examples(self, data_dir):
        """See base class."""
        train_path = os.path.join(
            data_dir, "multinli", "multinli.train.%s.tsv" % self.language)
        to_unicode = tokenization.convert_to_unicode
        examples = []
        for row_no, row in enumerate(self._read_tsv(train_path)):
            if row_no == 0:
                # header row
                continue
            guid = "train-%d" % (row_no)
            text_a = to_unicode(row[0])
            text_b = to_unicode(row[1])
            label = to_unicode(row[2])
            # The training file spells this label differently from get_labels().
            if label == to_unicode("contradictory"):
                label = to_unicode("contradiction")
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples

    def get_dev_examples(self, data_dir):
        """See base class."""
        to_unicode = tokenization.convert_to_unicode
        examples = []
        for row_no, row in enumerate(self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))):
            if row_no == 0:
                # header row
                continue
            guid = "dev-%d" % (row_no)
            # The dev file mixes languages; keep only rows of the configured one.
            if to_unicode(row[0]) != to_unicode(self.language):
                continue
            text_a = to_unicode(row[6])
            text_b = to_unicode(row[7])
            label = to_unicode(row[1])
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples

    def get_labels(self):
        """See base class."""
        return ["contradiction", "entailment", "neutral"]
class MrpcProcessor(DataProcessor):
    """Reader for the MRPC data set (GLUE version)."""

    def get_example_from_tensor_dict(self, tensor_dict):
        """Build one example from a dict whose values expose ``numpy()``."""
        guid = tensor_dict["idx"].numpy()
        first = tensor_dict["sentence1"].numpy().decode("utf-8")
        second = tensor_dict["sentence2"].numpy().decode("utf-8")
        label = str(tensor_dict["label"].numpy())
        return InputExample(guid, first, second, label)

    def get_train_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv")))
        rows = self._read_tsv(os.path.join(data_dir, "train.tsv"))
        return self._create_examples(rows, "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        rows = self._read_tsv(os.path.join(data_dir, "dev.tsv"))
        return self._create_examples(rows, "dev")

    def get_aug_examples(self, data_dir):
        """Read the examples stored in ``train_aug.tsv``."""
        rows = self._read_tsv(os.path.join(data_dir, "train_aug.tsv"))
        return self._create_examples(rows, "aug")

    def get_test_examples(self, data_dir):
        """See base class."""
        rows = self._read_tsv(os.path.join(data_dir, "test.tsv"))
        return self._create_examples(rows, "test")

    def get_labels(self):
        """See base class."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Turn TSV rows (header skipped) into examples: columns 3/4 are the texts, column 0 the label."""
        return [
            InputExample(guid="{}-{}".format(set_type, row_no),
                         text_a=row[3], text_b=row[4], label=row[0])
            for row_no, row in enumerate(lines)
            if row_no != 0
        ]
class MnliProcessor(DataProcessor):
    """Reader for the MultiNLI data set (GLUE version)."""

    def get_example_from_tensor_dict(self, tensor_dict):
        """Build one example from a dict whose values expose ``numpy()``."""
        guid = tensor_dict["idx"].numpy()
        premise = tensor_dict["premise"].numpy().decode("utf-8")
        hypothesis = tensor_dict["hypothesis"].numpy().decode("utf-8")
        label = str(tensor_dict["label"].numpy())
        return InputExample(guid, premise, hypothesis, label)

    def get_train_examples(self, data_dir):
        """See base class."""
        rows = self._read_tsv(os.path.join(data_dir, "train.tsv"))
        return self._create_examples(rows, "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        rows = self._read_tsv(os.path.join(data_dir, "dev_matched.tsv"))
        return self._create_examples(rows, "dev_matched")

    def get_aug_examples(self, data_dir):
        """Read the examples stored in ``train_aug.tsv``."""
        rows = self._read_tsv(os.path.join(data_dir, "train_aug.tsv"))
        return self._create_examples(rows, "aug")

    def get_labels(self):
        """See base class."""
        return ["contradiction", "entailment", "neutral"]

    def _create_examples(self, lines, set_type):
        """Turn TSV rows (header skipped) into examples: column 0 feeds the guid, 8/9 the texts, the last one the label."""
        return [
            InputExample(guid="{}-{}".format(set_type, row[0]),
                         text_a=row[8], text_b=row[9], label=row[-1])
            for row_no, row in enumerate(lines)
            if row_no != 0
        ]
class MnliMismatchedProcessor(MnliProcessor):
    """Processor for the MultiNLI Mismatched data set (GLUE version)."""

    def get_dev_examples(self, data_dir):
        """See base class."""
        # The set type only feeds the example guids; it now names the split
        # that is actually read instead of the copy-pasted "dev_matched".
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_mismatched")
class ColaProcessor(DataProcessor):
    """Reader for the CoLA data set (GLUE version)."""

    def get_example_from_tensor_dict(self, tensor_dict):
        """Build one single-sentence example from a dict whose values expose ``numpy()``."""
        guid = tensor_dict["idx"].numpy()
        sentence = tensor_dict["sentence"].numpy().decode("utf-8")
        label = str(tensor_dict["label"].numpy())
        return InputExample(guid, sentence, None, label)

    def get_train_examples(self, data_dir):
        """See base class."""
        rows = self._read_tsv(os.path.join(data_dir, "train.tsv"))
        return self._create_examples(rows, "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        rows = self._read_tsv(os.path.join(data_dir, "dev.tsv"))
        return self._create_examples(rows, "dev")

    def get_aug_examples(self, data_dir):
        """Read the examples stored in ``train_aug.tsv``."""
        rows = self._read_tsv(os.path.join(data_dir, "train_aug.tsv"))
        return self._create_examples(rows, "aug")

    def get_labels(self):
        """See base class."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Turn every TSV row (no header is skipped) into an example: column 3 is the text, column 1 the label."""
        return [
            InputExample(guid="{}-{}".format(set_type, row_no),
                         text_a=row[3], text_b=None, label=row[1])
            for row_no, row in enumerate(lines)
        ]
class Sst2Processor(DataProcessor):
    """Reader for the SST-2 data set (GLUE version)."""

    def get_example_from_tensor_dict(self, tensor_dict):
        """Build one single-sentence example from a dict whose values expose ``numpy()``."""
        guid = tensor_dict["idx"].numpy()
        sentence = tensor_dict["sentence"].numpy().decode("utf-8")
        label = str(tensor_dict["label"].numpy())
        return InputExample(guid, sentence, None, label)

    def get_train_examples(self, data_dir):
        """See base class."""
        rows = self._read_tsv(os.path.join(data_dir, "train.tsv"))
        return self._create_examples(rows, "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        rows = self._read_tsv(os.path.join(data_dir, "dev.tsv"))
        return self._create_examples(rows, "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        rows = self._read_tsv(os.path.join(data_dir, "test.tsv"))
        return self._create_examples(rows, "test")

    def get_aug_examples(self, data_dir):
        """Read the examples stored in ``train_aug.tsv``."""
        rows = self._read_tsv(os.path.join(data_dir, "train_aug.tsv"))
        return self._create_examples(rows, "aug")

    def get_labels(self):
        """See base class."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Turn TSV rows (header skipped) into examples: column 0 is the text, column 1 the label."""
        return [
            InputExample(guid="{}-{}".format(set_type, row_no),
                         text_a=row[0], text_b=None, label=row[1])
            for row_no, row in enumerate(lines)
            if row_no != 0
        ]
class StsbProcessor(DataProcessor):
    """Reader for the STS-B data set (GLUE version)."""

    def get_example_from_tensor_dict(self, tensor_dict):
        """Build one sentence-pair example from a dict whose values expose ``numpy()``."""
        guid = tensor_dict["idx"].numpy()
        first = tensor_dict["sentence1"].numpy().decode("utf-8")
        second = tensor_dict["sentence2"].numpy().decode("utf-8")
        label = str(tensor_dict["label"].numpy())
        return InputExample(guid, first, second, label)

    def get_train_examples(self, data_dir):
        """See base class."""
        rows = self._read_tsv(os.path.join(data_dir, "train.tsv"))
        return self._create_examples(rows, "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        rows = self._read_tsv(os.path.join(data_dir, "dev.tsv"))
        return self._create_examples(rows, "dev")

    def get_aug_examples(self, data_dir):
        """Read the examples stored in ``train_aug.tsv``."""
        rows = self._read_tsv(os.path.join(data_dir, "train_aug.tsv"))
        return self._create_examples(rows, "aug")

    def get_labels(self):
        """See base class."""
        return [None]

    def _create_examples(self, lines, set_type):
        """Turn TSV rows (header skipped) into examples: column 0 feeds the guid, 7/8 the texts, the last one the label."""
        return [
            InputExample(guid="{}-{}".format(set_type, row[0]),
                         text_a=row[7], text_b=row[8], label=row[-1])
            for row_no, row in enumerate(lines)
            if row_no != 0
        ]
class QqpProcessor(DataProcessor):
    """Reader for the QQP data set (GLUE version)."""

    def get_example_from_tensor_dict(self, tensor_dict):
        """Build one question-pair example from a dict whose values expose ``numpy()``."""
        guid = tensor_dict["idx"].numpy()
        first = tensor_dict["question1"].numpy().decode("utf-8")
        second = tensor_dict["question2"].numpy().decode("utf-8")
        label = str(tensor_dict["label"].numpy())
        return InputExample(guid, first, second, label)

    def get_train_examples(self, data_dir):
        """See base class."""
        rows = self._read_tsv(os.path.join(data_dir, "train.tsv"))
        return self._create_examples(rows, "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        rows = self._read_tsv(os.path.join(data_dir, "dev.tsv"))
        return self._create_examples(rows, "dev")

    def get_aug_examples(self, data_dir):
        """Read the examples stored in ``train_aug.tsv``."""
        rows = self._read_tsv(os.path.join(data_dir, "train_aug.tsv"))
        return self._create_examples(rows, "aug")

    def get_labels(self):
        """See base class."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Turn TSV rows (header skipped) into examples, dropping rows with fewer than six columns."""
        examples = []
        for row_no, row in enumerate(lines):
            if row_no == 0:
                continue
            guid = "{}-{}".format(set_type, row[0])
            try:
                text_a, text_b, label = row[3], row[4], row[5]
            except IndexError:
                # Row is too short to hold both questions and the label.
                continue
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples
class QnliProcessor(DataProcessor):
    """Processor for the QNLI data set (GLUE version)."""

    def get_example_from_tensor_dict(self, tensor_dict):
        """Build an `InputExample` from a dict of tensors (idx, question, sentence, label)."""
        return InputExample(
            tensor_dict["idx"].numpy(),
            tensor_dict["question"].numpy().decode("utf-8"),
            tensor_dict["sentence"].numpy().decode("utf-8"),
            str(tensor_dict["label"].numpy()),
        )

    def get_train_examples(self, data_dir):
        """Build the training examples from ``train.tsv`` inside ``data_dir``."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """Build the dev examples from ``dev.tsv`` inside ``data_dir``."""
        # The guid prefix is "dev" like the other single-dev-set processors;
        # the previous "dev_matched" tag was an MNLI leftover.
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_aug_examples(self, data_dir):
        """Build the augmented-training examples from ``train_aug.tsv`` inside ``data_dir``."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug")

    def get_labels(self):
        """Return the two class labels."""
        return ["entailment", "not_entailment"]

    def _create_examples(self, lines, set_type):
        """Turn parsed TSV rows into `InputExample` objects.

        The first row is skipped. Texts come from columns 1 and 2 and the
        label is the last column; the guid is ``"<set_type>-<column 0>"``.
        """
        examples = []
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            guid = "%s-%s" % (set_type, line[0])
            text_a = line[1]
            text_b = line[2]
            label = line[-1]
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples
class RteProcessor(DataProcessor):
    """Processor for the RTE data set (GLUE version)."""

    def get_example_from_tensor_dict(self, tensor_dict):
        """Build an `InputExample` from a dict of tensors (idx, sentence1, sentence2, label)."""
        idx = tensor_dict["idx"].numpy()
        sentence1 = tensor_dict["sentence1"].numpy().decode("utf-8")
        sentence2 = tensor_dict["sentence2"].numpy().decode("utf-8")
        label = str(tensor_dict["label"].numpy())
        return InputExample(idx, sentence1, sentence2, label)

    def get_train_examples(self, data_dir):
        """Build the training examples from ``train.tsv`` inside ``data_dir``."""
        rows = self._read_tsv(os.path.join(data_dir, "train.tsv"))
        return self._create_examples(rows, "train")

    def get_dev_examples(self, data_dir):
        """Build the dev examples from ``dev.tsv`` inside ``data_dir``."""
        rows = self._read_tsv(os.path.join(data_dir, "dev.tsv"))
        return self._create_examples(rows, "dev")

    def get_aug_examples(self, data_dir):
        """Build the augmented-training examples from ``train_aug.tsv`` inside ``data_dir``."""
        rows = self._read_tsv(os.path.join(data_dir, "train_aug.tsv"))
        return self._create_examples(rows, "aug")

    def get_labels(self):
        """Return the two class labels."""
        return ["entailment", "not_entailment"]

    def _create_examples(self, lines, set_type):
        """Turn parsed TSV rows into `InputExample` objects.

        The first row is skipped. Texts come from columns 1 and 2 and the
        label is the last column; the guid is ``"<set_type>-<column 0>"``.
        """
        rows = iter(lines)
        next(rows, None)  # drop the first row
        return [
            InputExample(
                guid="%s-%s" % (set_type, row[0]),
                text_a=row[1],
                text_b=row[2],
                label=row[-1],
            )
            for row in rows
        ]
class WnliProcessor(DataProcessor):
    """Processor for the WNLI data set (GLUE version)."""

    def get_example_from_tensor_dict(self, tensor_dict):
        """Build an `InputExample` from a dict of tensors (idx, sentence1, sentence2, label)."""
        return InputExample(
            tensor_dict["idx"].numpy(),
            tensor_dict["sentence1"].numpy().decode("utf-8"),
            tensor_dict["sentence2"].numpy().decode("utf-8"),
            str(tensor_dict["label"].numpy()),
        )

    def get_train_examples(self, data_dir):
        """Build the training examples from ``train.tsv`` inside ``data_dir``."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """Build the dev examples from ``dev.tsv`` inside ``data_dir``."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_aug_examples(self, data_dir):
        """Build the augmented-training examples from ``train_aug.tsv`` inside ``data_dir``.

        Provided so this processor offers the same methods as the other GLUE
        processors in this file, which all expose ``get_aug_examples``.
        """
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug")

    def get_labels(self):
        """Return the two class labels as strings."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Turn parsed TSV rows into `InputExample` objects.

        The first row is skipped. Texts come from columns 1 and 2 and the
        label is the last column; the guid is ``"<set_type>-<column 0>"``.
        """
        examples = []
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            guid = "%s-%s" % (set_type, line[0])
            text_a = line[1]
            text_b = line[2]
            label = line[-1]
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples
def _truncate_seq_pair(tokens_a, tokens_b, max_length): | |||||
"""Truncates a sequence pair in place to the maximum length.""" | |||||
# This is a simple heuristic which will always truncate the longer sequence | |||||
# one token at a time. This makes more sense than truncating an equal percent | |||||
# of tokens from each, since if one sequence is very short then each token | |||||
# that's truncated likely contains more information than a longer sequence. | |||||
while True: | |||||
total_length = len(tokens_a) + len(tokens_b) | |||||
if total_length <= max_length: | |||||
break | |||||
if len(tokens_a) > len(tokens_b): | |||||
tokens_a.pop() | |||||
else: | |||||
tokens_b.pop() | |||||
def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer):
    """Turn one `InputExample` into a fixed-length `InputFeatures`.

    A `PaddingInputExample` yields an all-zero feature flagged as not real.
    Otherwise the texts are tokenized, truncated so the special tokens fit,
    laid out as ``[CLS] a [SEP]`` (followed by ``b [SEP]`` for a pair),
    converted to ids and zero-padded up to ``max_seq_length``. The label is
    mapped to its position in ``label_list``. The first five examples
    (``ex_index < 5``) are logged.
    """
    if isinstance(example, PaddingInputExample):
        return InputFeatures(
            input_ids=[0] * max_seq_length,
            input_mask=[0] * max_seq_length,
            segment_ids=[0] * max_seq_length,
            label_id=0,
            is_real_example=False)

    label_map = {label: position for position, label in enumerate(label_list)}

    tokens_a = tokenizer.tokenize(example.text_a)
    tokens_b = tokenizer.tokenize(example.text_b) if example.text_b else None

    if tokens_b:
        # Pair: reserve three slots for [CLS], [SEP], [SEP]; trims in place.
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    elif len(tokens_a) > max_seq_length - 2:
        # Single sequence: reserve two slots for [CLS] and [SEP].
        tokens_a = tokens_a[0:(max_seq_length - 2)]

    # Segment id 0 covers [CLS], the first sequence and its [SEP];
    # segment id 1 covers the second sequence and its closing [SEP].
    tokens = ["[CLS]"]
    tokens.extend(tokens_a)
    tokens.append("[SEP]")
    segment_ids = [0] * len(tokens)
    if tokens_b:
        tokens.extend(tokens_b)
        tokens.append("[SEP]")
        segment_ids.extend([1] * (len(tokens_b) + 1))

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Mask is 1 for real tokens and 0 for the padding added below.
    input_mask = [1] * len(input_ids)

    # Zero-pad all three lists up to the sequence length.
    padding = [0] * (max_seq_length - len(input_ids))
    input_ids.extend(padding)
    input_mask.extend(padding)
    segment_ids.extend(padding)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    label_id = label_map[example.label]

    if ex_index < 5:
        logger.info("*** Example ***")
        logger.info("guid: %s" % (example.guid))
        logger.info("tokens: %s" % " ".join([tokenization.printable_text(x) for x in tokens]))
        logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
        logger.info("label: %s (id = %d)" % (example.label, label_id))

    return InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_id=label_id,
        is_real_example=True)
def file_based_convert_examples_to_features(
        examples, label_list, max_seq_length, tokenizer, output_file):
    """Convert a set of `InputExample`s to OFRecord entries appended to `output_file`.

    Each example is converted with `convert_single_example` and written as a
    length prefix (``struct`` format ``"q"``) followed by the serialized
    `OFRecord` message.

    Args:
        examples: sized sequence of examples to convert.
        label_list: labels passed through to `convert_single_example`.
        max_seq_length: fixed feature length.
        tokenizer: tokenizer passed through to `convert_single_example`.
        output_file: path opened in binary append mode, so an existing file
            is extended rather than overwritten.
    """

    def create_int32_feature(values):
        # Return the Feature message itself. The earlier trailing comma
        # wrapped it in a 1-tuple, which is not a valid map value.
        return of_record.Feature(int32_list=of_record.Int32List(value=values))

    total_written = 0
    # The context manager closes the file even if a conversion fails midway.
    with open(output_file, 'ab') as writer:
        for (ex_index, example) in enumerate(examples):
            if ex_index % 10000 == 0:
                logger.info("Writing example %d of %d" % (ex_index, len(examples)))

            feature = convert_single_example(ex_index, example, label_list,
                                             max_seq_length, tokenizer)

            sample = of_record.OFRecord(
                feature={
                    "input_ids": create_int32_feature(feature.input_ids),
                    "input_mask": create_int32_feature(feature.input_mask),
                    "segment_ids": create_int32_feature(feature.segment_ids),
                    "label_ids": create_int32_feature([feature.label_id]),
                    "is_real_example": create_int32_feature([int(feature.is_real_example)])
                }
            )

            writer.write(struct.pack("q", sample.ByteSize()))
            writer.write(sample.SerializeToString())

            # Progress line every 10000 examples, phased so the last example
            # always triggers one.
            if ex_index % 10000 == (len(examples) - 1) % 10000:
                logger.info('Wrote intances %d/%d to "%s"', ex_index, len(examples), output_file)

            total_written += 1

    logger.info('Wrote total %d instances to output files "%s"', total_written, output_file)
def glue_process(args):
    """Convert the requested splits of one GLUE task into OFRecord files.

    Depending on the flags in ``args`` this writes
    ``<output_dir>/train/train.of_record-0``,
    ``<output_dir>/eval/eval.of_record-0``,
    ``<output_dir>/test/predict.of_record-0`` and
    ``<output_dir>/train/train_aug.of_record-0``, then prints the task name
    and the number of examples per processed split.

    Args:
        args: namespace with ``task_name``, ``data_dir``, ``output_dir``,
            ``vocab_file``, ``do_lower_case``, ``max_seq_length`` and the
            flags ``do_train``, ``do_eval``, ``do_predict``, ``aug_train``.

    Raises:
        ValueError: if no split flag is set or the task name is unknown.
    """
    processors = {
        "xnli": XnliProcessor,
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mnli-mm": MnliMismatchedProcessor,
        "mrpc": MrpcProcessor,
        "sst-2": Sst2Processor,
        "sts-b": StsbProcessor,
        "qqp": QqpProcessor,
        "qnli": QnliProcessor,
        "rte": RteProcessor,
        "wnli": WnliProcessor,
    }

    # `aug_train` counts as a valid mode on its own, as the message states.
    if not (args.do_train or args.do_eval or args.do_predict or args.aug_train):
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' or `aug_train' must be True.")

    os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

    total_examples = {}

    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        total_examples['train'] = len(train_examples)
        os.makedirs(os.path.join(args.output_dir, 'train'), exist_ok=True)
        train_file = os.path.join(args.output_dir, 'train', "train.of_record-0")
        file_based_convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer, train_file)

    if args.do_eval:
        eval_examples = processor.get_dev_examples(args.data_dir)
        total_examples['eval'] = len(eval_examples)
        os.makedirs(os.path.join(args.output_dir, 'eval'), exist_ok=True)
        eval_file = os.path.join(args.output_dir, 'eval', "eval.of_record-0")
        file_based_convert_examples_to_features(
            eval_examples, label_list, args.max_seq_length, tokenizer, eval_file)

    if args.do_predict:
        # Best effort: a failure here is printed and the remaining steps
        # still run.
        try:
            predict_examples = processor.get_test_examples(args.data_dir)
            total_examples['test'] = len(predict_examples)
            os.makedirs(os.path.join(args.output_dir, 'test'), exist_ok=True)
            predict_file = os.path.join(args.output_dir, 'test', "predict.of_record-0")
            file_based_convert_examples_to_features(
                predict_examples, label_list, args.max_seq_length, tokenizer,
                predict_file)
        except Exception as e:
            print(e)

    if args.aug_train:
        train_aug_examples = processor.get_aug_examples(args.data_dir)
        os.makedirs(os.path.join(args.output_dir, 'train'), exist_ok=True)
        train_aug_file = os.path.join(args.output_dir, 'train', "train_aug.of_record-0")
        file_based_convert_examples_to_features(
            train_aug_examples, label_list, args.max_seq_length, tokenizer,
            train_aug_file)

    print('task_name:', task_name)
    print(total_examples)
def main():
    """Parse the command-line arguments and run the GLUE conversion."""
    glue_process(parse_args())


# Run the conversion only when executed as a script.
if __name__ == '__main__':
    main()
@@ -0,0 +1,339 @@ | |||||
# coding: utf-8 | |||||
import os | |||||
import numpy as np | |||||
import pickle as pkl | |||||
from tqdm import tqdm | |||||
import time | |||||
from datetime import timedelta | |||||
import csv | |||||
import sys | |||||
import codecs | |||||
import logging_setup | |||||
import struct | |||||
from parse_args import parse_args | |||||
import oneflow.core.record.record_pb2 as of_record | |||||
# Upper bound on the number of vocabulary entries.
MAX_VOCAB_SIZE = 10000
# Tokens standing in for unknown words and for padding.
UNK = '<UNK>'
PAD = '<PAD>'

# Use the plain logger when run as a script, the multiprocessing logger when
# imported.
logger = (
    logging_setup.setup_logger(__name__)
    if __name__ == '__main__'
    else logging_setup.setup_multiprocessing_logger()
)
def _truncate_seq_pair(tokens_a, tokens_b, max_length): | |||||
"""Truncates a sequence pair in place to the maximum length.""" | |||||
# This is a simple heuristic which will always truncate the longer sequence | |||||
# one token at a time. This makes more sense than truncating an equal percent | |||||
# of tokens from each, since if one sequence is very short then each token | |||||
# that's truncated likely contains more information than a longer sequence. | |||||
while True: | |||||
total_length = len(tokens_a) + len(tokens_b) | |||||
if total_length <= max_length: | |||||
break | |||||
if len(tokens_a) > len(tokens_b): | |||||
tokens_a.pop() | |||||
else: | |||||
tokens_b.pop() | |||||
def SST2_Processor(path):
    """Read an SST-2 TSV file into ``[sentence, None, label]`` triples.

    The first line is skipped and blank lines are ignored. Each remaining
    line must split on tabs into exactly a sentence and a label; lines that
    fail to parse are reported with ``print`` and skipped.
    """
    examples = []
    with open(path, 'r', encoding='UTF-8') as f:
        for line_no, raw in enumerate(tqdm(f)):
            if line_no == 0:
                # First line is skipped.
                continue
            try:
                stripped = raw.strip()
                if not stripped:
                    continue
                sentence, label = stripped.split('\t')
                examples.append([sentence, None, label])
            except Exception as e:
                print(e)
    return examples
def CoLA_Processor(path):
    """Read a CoLA TSV file into ``[sentence, None, label]`` triples.

    Every line is parsed (no first-line skip). The sentence is taken from
    column 3 and the label from column 1. Blank lines are skipped; rows with
    too few columns are reported with ``print`` and skipped.
    """
    examples = []
    with open(path, 'r', encoding='UTF-8') as f:
        for line in tqdm(f):
            stripped = line.strip()
            # Test the string before splitting: ''.split('\t') gives [''],
            # which is truthy, so a check on the split list never fires.
            if not stripped:
                continue
            fields = stripped.split('\t')
            try:
                examples.append([fields[3], None, fields[1]])
            except IndexError as e:
                print(e)
    return examples
def QQP_Processor(path):
    """Read a QQP TSV file into ``[question1, question2, label]`` triples.

    The first line is skipped. The two texts come from columns 3 and 4 and
    the label from column 5. Blank lines are skipped; rows with too few
    columns are reported with ``print`` and skipped.
    """
    examples = []
    with open(path, 'r', encoding='UTF-8') as f:
        for line_no, line in enumerate(tqdm(f)):
            if line_no == 0:
                # First line is skipped.
                continue
            stripped = line.strip()
            # Test the string before splitting: ''.split('\t') gives [''],
            # which is truthy, so a check on the split list never fires.
            if not stripped:
                continue
            fields = stripped.split('\t')
            try:
                examples.append([fields[3], fields[4], fields[5]])
            except IndexError as e:
                print(e)
    return examples
def RTE_Processor(path):
    """Read an RTE TSV file into ``[sentence1, sentence2, label]`` triples.

    The first line is skipped. The two texts come from columns 1 and 2 and
    the label is the last column. Blank lines are skipped; rows with too few
    columns are reported with ``print`` and skipped.
    """
    examples = []
    with open(path, 'r', encoding='UTF-8') as f:
        for line_no, line in enumerate(tqdm(f)):
            if line_no == 0:
                # First line is skipped.
                continue
            stripped = line.strip()
            # Test the string before splitting: ''.split('\t') gives [''],
            # which is truthy, so a check on the split list never fires.
            if not stripped:
                continue
            fields = stripped.split('\t')
            try:
                examples.append([fields[1], fields[2], fields[-1]])
            except IndexError as e:
                print(e)
    return examples
def MRPC_Processor(path):
    """Read an MRPC TSV file into ``[sentence1, sentence2, label]`` triples.

    The first line is skipped. The two texts come from columns 3 and 4 and
    the label from column 0. Blank lines are skipped; rows with too few
    columns are reported with ``print`` and skipped.
    """
    examples = []
    with open(path, 'r', encoding='UTF-8') as f:
        for line_no, line in enumerate(tqdm(f)):
            if line_no == 0:
                # First line is skipped.
                continue
            stripped = line.strip()
            # Test the string before splitting: ''.split('\t') gives [''],
            # which is truthy, so a check on the split list never fires.
            if not stripped:
                continue
            fields = stripped.split('\t')
            try:
                examples.append([fields[3], fields[4], fields[0]])
            except IndexError as e:
                print(e)
    return examples
def convert_single_example(examples, tokenizer, pad_size, vocab):
    """Map ``[text_a, text_b, label]`` triples to ``(ids, label, seq_len)`` tuples.

    Each text is tokenized with ``tokenizer``. For a pair, the two token
    lists are trimmed to ``pad_size - 1`` tokens in total and joined with one
    ``PAD`` token between them. When ``pad_size`` is truthy the tokens are
    padded with ``PAD`` or cut to exactly ``pad_size``; ``seq_len`` is the
    token count before padding, capped at ``pad_size``. Tokens missing from
    ``vocab`` get the id of ``UNK``.
    """
    contents = []
    for text_a, text_b, label in (
            (item[0], item[1], item[2]) for item in examples):
        tokens_a = tokenizer(text_a)
        if text_b:
            tokens_b = tokenizer(text_b)
            _truncate_seq_pair(tokens_a, tokens_b, pad_size - 1)
            token = tokens_a + [PAD] + tokens_b
        else:
            token = tokens_a

        seq_len = len(token)
        if pad_size:
            missing = pad_size - len(token)
            if missing > 0:
                token.extend([PAD] * missing)
            else:
                token = token[:pad_size]
                seq_len = pad_size

        # word to id
        words_line = [vocab.get(word, vocab.get(UNK)) for word in token]
        contents.append((words_line, label, seq_len))
    return contents
def build_vocab(dataset, file_path, tokenizer, max_size, min_freq):
    """Build a token -> id vocabulary from one GLUE training file.

    Tokens are counted over both texts of every example. Tokens seen fewer
    than ``min_freq`` times are dropped, the rest are ranked by descending
    count, and the top ``max_size`` get ids ``0..k-1``. ``UNK`` and ``PAD``
    are appended with the next two ids.

    Args:
        dataset: one of 'SST-2', 'CoLA', 'MRPC', 'QQP', 'RTE'.
        file_path: path of the TSV file to read.
        tokenizer: callable turning a string into a list of tokens.
        max_size: maximum number of regular tokens kept.
        min_freq: minimum count for a token to be kept.

    Returns:
        dict mapping each token to its integer id.

    Raises:
        ValueError: if ``dataset`` is not supported.
    """
    readers = {
        'SST-2': SST2_Processor,
        'CoLA': CoLA_Processor,
        'MRPC': MRPC_Processor,
        'QQP': QQP_Processor,
        'RTE': RTE_Processor,
    }
    if dataset not in readers:
        # Fail explicitly rather than continuing with no examples loaded.
        raise ValueError('Error: the dataset does not support: %s' % dataset)
    examples = readers[dataset](file_path)

    print('Building vocab ...')
    vocab_dic = {}
    for example in tqdm(examples):
        text_a = example[0]
        text_b = example[1]
        # Tokenize the two texts separately: joining them first would fuse
        # the last word of text_a with the first word of text_b into a token
        # that never occurs when the texts are later tokenized one by one.
        texts = [text_a, text_b] if text_b else [text_a]
        for text in texts:
            for word in tokenizer(text):
                vocab_dic[word] = vocab_dic.get(word, 0) + 1

    frequent = [item for item in vocab_dic.items() if item[1] >= min_freq]
    # sorted() is stable, so tokens with equal counts keep first-seen order.
    vocab_list = sorted(frequent, key=lambda x: x[1], reverse=True)[:max_size]
    vocab_dic = {word_count[0]: idx for idx, word_count in enumerate(vocab_list)}
    vocab_dic.update({UNK: len(vocab_dic), PAD: len(vocab_dic) + 1})
    return vocab_dic
def build_dataset(dataset, config, ues_word):
    """Load or build the vocabulary, then convert the train and dev files.

    Args:
        dataset: one of 'SST-2', 'CoLA', 'MRPC', 'QQP', 'RTE'.
        config: object providing ``vocab_path``, ``train_path``, ``dev_path``
            and ``pad_size``.
        ues_word: truthy for word-level tokens (split on a single space),
            falsy for character-level tokens.

    Returns:
        ``(vocab, train, dev)`` where ``train`` and ``dev`` are the lists
        produced by `convert_single_example`.

    Raises:
        ValueError: if ``dataset`` is not supported.
    """
    if ues_word:
        tokenizer = lambda x: x.split(' ')  # split on spaces, word-level
    else:
        tokenizer = lambda x: [y for y in x]  # char-level

    if os.path.exists(config.vocab_path):
        # NOTE: pickle.load can execute arbitrary code; vocab_path must point
        # to a trusted file.
        with open(config.vocab_path, 'rb') as vocab_file:
            vocab = pkl.load(vocab_file)
    else:
        vocab = build_vocab(dataset, config.train_path, tokenizer=tokenizer,
                            max_size=MAX_VOCAB_SIZE, min_freq=1)
        with open(config.vocab_path, 'wb') as vocab_file:
            pkl.dump(vocab, vocab_file)
    print(f"Vocab size: {len(vocab)}")

    readers = {
        'SST-2': SST2_Processor,
        'CoLA': CoLA_Processor,
        'MRPC': MRPC_Processor,
        'QQP': QQP_Processor,
        'RTE': RTE_Processor,
    }

    def load_dataset(dataset, tokenizer, path, pad_size=32):
        """Read one split and convert it to ``(ids, label, seq_len)`` tuples."""
        if dataset not in readers:
            # Fail explicitly rather than continuing with no examples loaded.
            raise ValueError('error dataset not support: %s' % dataset)
        examples = readers[dataset](path)
        return convert_single_example(examples, tokenizer, pad_size, vocab)

    train = load_dataset(dataset, tokenizer, config.train_path, config.pad_size)
    dev = load_dataset(dataset, tokenizer, config.dev_path, config.pad_size)
    # test = load_dataset(config.test_path, config.pad_size)
    return vocab, train, dev
def get_time_dif(start_time):
    """Return the time elapsed since ``start_time`` as a whole-second timedelta."""
    elapsed_seconds = time.time() - start_time
    whole_seconds = int(round(elapsed_seconds))
    return timedelta(seconds=whole_seconds)
def file_based_convert_examples_to_features(
        examples, label_list, max_seq_length, output_file):
    """Write already-converted examples as OFRecord entries appended to `output_file`.

    Each example is indexed as ``(input_ids, label, seq_len)``. The input
    mask is 1 for the first ``seq_len`` positions and 0 afterwards, every
    segment id is 1, and the label is mapped to its position in
    ``label_list``. A record is written as a length prefix (``struct`` format
    ``"q"``) followed by the serialized `OFRecord` message.

    Args:
        examples: sized sequence of ``(input_ids, label, seq_len)`` items.
        label_list: labels in id order.
        max_seq_length: fixed length of the mask and segment lists.
        output_file: path opened in binary append mode, so an existing file
            is extended rather than overwritten.
    """

    def create_int32_feature(values):
        # Return the Feature message itself. The earlier trailing comma
        # wrapped it in a 1-tuple, which is not a valid map value.
        return of_record.Feature(int32_list=of_record.Int32List(value=values))

    # The label -> id mapping does not depend on the example; build it once.
    label_map = {label: i for (i, label) in enumerate(label_list)}
    # Independent of the example and never mutated, so one list is enough.
    segment_ids = [1] * max_seq_length

    total_written = 0
    # The context manager closes the file even if writing fails midway.
    with open(output_file, 'ab') as writer:
        for (ex_index, example) in enumerate(examples):
            if ex_index % 10000 == 0:
                logger.info("Writing example %d of %d" % (ex_index, len(examples)))

            input_mask = [1] * example[2] + [0] * (max_seq_length - example[2])
            assert len(input_mask) == max_seq_length

            label_id = label_map[example[1]]

            sample = of_record.OFRecord(
                feature={
                    "input_ids": create_int32_feature(example[0]),
                    "input_mask": create_int32_feature(input_mask),
                    "segment_ids": create_int32_feature(segment_ids),
                    "label_ids": create_int32_feature([label_id]),
                    "is_real_example": create_int32_feature([int(True)])
                }
            )

            writer.write(struct.pack("q", sample.ByteSize()))
            writer.write(sample.SerializeToString())

            # Progress line every 10000 examples, phased so the last example
            # always triggers one.
            if ex_index % 10000 == (len(examples) - 1) % 10000:
                logger.info('Wrote intances %d/%d to "%s"', ex_index, len(examples), output_file)

            total_written += 1

    logger.info('Wrote total %d instances to output files "%s"', total_written, output_file)
class Config(object):
    """Plain holder for the paths and padding size used by `build_dataset`."""

    # Location of the pickled vocabulary.
    vocab_path = ""
    # Training and dev TSV files.
    train_path = ""
    dev_path = ""
    # Fixed number of tokens per example.
    pad_size = 32
if __name__ == "__main__":
    # Build the vocabulary if needed and convert one GLUE task to OFRecord
    # files. Change the directories and file names below as required.
    config = Config
    dataset = "MRPC"
    train_dir = "../../data/glue_data/{}/train.tsv".format(dataset)
    dev_dir = "../../data/glue_data/{}/dev.tsv".format(dataset)
    vocab_dir = "../../data/glue_ofrecord/{}_lstm_32".format(dataset)
    pretrain_dir = ""
    emb_dim = 300
    vocab_file = os.path.join(vocab_dir, 'vocab.pkl')
    if os.path.exists(vocab_file):
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(vocab_file, 'rb') as fh:
            word_to_id = pkl.load(fh)
    else:
        # Word-level vocabulary (words in the data set are space separated).
        tokenizer = lambda x: x.split(' ')
        # tokenizer = lambda x: [y for y in x]  # character-level vocabulary
        word_to_id = build_vocab(dataset, train_dir, tokenizer=tokenizer,
                                 max_size=MAX_VOCAB_SIZE, min_freq=1)
        os.makedirs(vocab_dir, exist_ok=True)
        with open(vocab_file, 'wb') as fh:
            pkl.dump(word_to_id, fh)

    # print(word_to_id)
    # print(len(word_to_id))

    output_dir = '../../data/glue_ofrecord/{}_lstm_32'.format(dataset)
    total_examples = {}
    max_seq_length = 32
    config.vocab_path = vocab_file
    config.train_path = train_dir
    config.dev_path = dev_dir
    config.pad_size = max_seq_length
    if dataset == 'RTE':
        label_list = ["entailment", "not_entailment"]
    elif dataset in ['SST-2', 'MRPC', 'QQP', 'CoLA']:
        label_list = ["0", "1"]
    elif dataset == 'MNLI':
        label_list = ["contradiction", "entailment", "neutral"]
    else:
        # Stop here: without a label list nothing below can run.
        raise ValueError('Error: the dataset not supports')

    # print(config.vocab_path)
    _, train_dataset, dev_dataset = build_dataset(dataset=dataset, config=config, ues_word=True)
    # print(dev_dataset[0])

    os.makedirs(os.path.join(output_dir, 'eval'), exist_ok=True)
    dev_file = os.path.join(output_dir, 'eval', "eval.of_record-0")
    file_based_convert_examples_to_features(dev_dataset, label_list, config.pad_size, dev_file)

    os.makedirs(os.path.join(output_dir, 'train'), exist_ok=True)
    train_file = os.path.join(output_dir, 'train', "train.of_record-0")
    file_based_convert_examples_to_features(train_dataset, label_list, config.pad_size, train_file)
@@ -0,0 +1,15 @@ | |||||
import os | |||||
def list_files(path, filename=None):
    """Return the files found at or below ``path`` (joined with ``filename`` if given).

    A regular file yields a one-element list, a directory is walked
    recursively in ``os.listdir`` order, and anything else yields ``[]``.
    """
    target = os.path.join(path, filename) if filename else path
    if os.path.isfile(target):
        return [target]
    if not os.path.isdir(target):
        return []
    collected = []
    for entry in os.listdir(target):
        collected += list_files(target, entry)
    return collected
@@ -0,0 +1,31 @@ | |||||
import sys | |||||
import logging | |||||
import multiprocessing | |||||
def setup_logger(name):
    """Configure root logging to stdout at INFO level and return the logger ``name``."""
    # `logging.basicConfig` has no effect if the root logger already has
    # handlers; clearing them first is intentionally left disabled:
    # logging.root.handlers = []
    log_format = '%(asctime)s [%(levelname)s] %(filename)s:%(lineno)d: %(message)s'
    # DATEFMT = '%Y-%m-%d,%H:%M:%S.%f'
    logging.basicConfig(level=logging.INFO, format=log_format, stream=sys.stdout)
    return logging.getLogger(name)
def setup_multiprocessing_logger():
    """Return the `multiprocessing` logger at INFO level with one handler attached.

    A handler writing to ``sys.__stdout__`` is added only when the logger has
    none yet, so calling this repeatedly does not duplicate output.
    """
    logger = multiprocessing.get_logger()
    logger.setLevel(logging.INFO)
    if logger.handlers:
        return logger

    logger._rudimentary_setup = True
    target = sys.__stdout__
    if hasattr(target, "write"):
        handler = logging.StreamHandler(target)
    else:
        handler = logging.FileHandler(target)
    # Alternative format that also shows the source location:
    # '%(asctime)s [%(levelname)s] [%(processName)s] %(filename)s:%(lineno)d: %(message)s'
    handler.setFormatter(
        logging.Formatter('%(asctime)s [%(levelname)s] [%(processName)s] %(message)s'))
    logger.addHandler(handler)
    return logger
@@ -0,0 +1,71 @@ | |||||
import argparse | |||||
def _str2bool(value):
    """Parse a command-line boolean such as ``True``, ``false``, ``1`` or ``0``.

    Raises:
        argparse.ArgumentTypeError: if the text is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ('true', 't', 'yes', 'y', '1', 'on'):
        return True
    # The empty string stays False, matching what bool('') used to give.
    if text in ('false', 'f', 'no', 'n', '0', 'off', ''):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected, got %r' % value)


def parse_args():
    """Parse the command-line options of the GLUE-to-OFRecord converter.

    ``--data_dir``, ``--output_dir`` and ``--vocab_file`` are required. The
    boolean options default to ``None`` and are parsed with `_str2bool`,
    because ``type=bool`` would turn any non-empty string, including
    ``"False"``, into ``True``.

    Returns:
        argparse.Namespace holding the parsed options.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--data_dir',
        help='Input glue task directories.',
        default=None,
        type=str,
        required=True
    )
    parser.add_argument(
        '--output_dir',
        help='Output the directory of oneflow record files.',
        default=None,
        type=str,
        required=True
    )
    parser.add_argument(
        '--vocab_file',
        help='The vocabulary file that the BERT model was trained on.',
        default=None,
        type=str,
        required=True
    )
    parser.add_argument(
        '--do_lower_case',
        help='Whether to lower case the input text. Should be True for uncased '
             'models and False for cased models.',
        default=None,
        type=_str2bool
    )
    parser.add_argument(
        '--max_seq_length',
        help='Maximum sequence length.',
        default=128,
        type=int
    )
    parser.add_argument(
        '--do_train',
        help='Whether to process the training data',
        default=None,
        type=_str2bool
    )
    parser.add_argument(
        '--do_eval',
        help='Whether to process the validation data',
        default=None,
        type=_str2bool
    )
    parser.add_argument(
        '--do_predict',
        help='Whether to process the prediction data',
        default=None,
        type=_str2bool
    )
    parser.add_argument(
        '--aug_train',
        help='Whether to process the augmented training data',
        default=None,
        type=_str2bool
    )
    parser.add_argument(
        '--task_name',
        help='The task of glue to be processed',
        default='cola',
        type=str
    )

    return parser.parse_args()
@@ -0,0 +1,347 @@ | |||||
# coding=utf-8 | |||||
# Copyright 2018 Oneflow Inc. | |||||
# | |||||
# Licensed under the Apache License, Version 2.0 (the "License"); | |||||
# you may not use this file except in compliance with the License. | |||||
# You may obtain a copy of the License at | |||||
# | |||||
# http://www.apache.org/licenses/LICENSE-2.0 | |||||
# | |||||
# Unless required by applicable law or agreed to in writing, software | |||||
# distributed under the License is distributed on an "AS IS" BASIS, | |||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
# See the License for the specific language governing permissions and | |||||
# limitations under the License. | |||||
"""Tokenization classes.""" | |||||
from __future__ import absolute_import | |||||
from __future__ import division | |||||
from __future__ import print_function | |||||
import collections | |||||
import unicodedata | |||||
import six | |||||
def convert_to_unicode(text):
    """Return ``text`` as a Unicode string, decoding byte strings as UTF-8.

    Undecodable bytes are ignored. Raises ValueError for any other input type
    or when neither Python 2 nor Python 3 is detected.
    """
    if six.PY3:
        if isinstance(text, str):
            return text
        if isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
        raise ValueError("Unsupported string type: %s" % (type(text)))

    if six.PY2:
        # On Python 2 `str` is the byte string and `unicode` the text type.
        if isinstance(text, str):
            return text.decode("utf-8", "ignore")
        if isinstance(text, unicode):
            return text
        raise ValueError("Unsupported string type: %s" % (type(text)))

    raise ValueError("Not running on Python2 or Python 3?")
def printable_text(text):
    """Return ``text`` as the native ``str`` type, suitable for printing.

    On Python 3 bytes are decoded as UTF-8 (undecodable bytes ignored); on
    Python 2 ``unicode`` is encoded to UTF-8. Raises ValueError for any other
    input type or when neither Python 2 nor Python 3 is detected.
    """
    # `str` is wanted on both major versions, but it is a Unicode string on
    # Python 3 and a byte string on Python 2.
    if six.PY3:
        if isinstance(text, str):
            return text
        if isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
        raise ValueError("Unsupported string type: %s" % (type(text)))

    if six.PY2:
        if isinstance(text, str):
            return text
        if isinstance(text, unicode):
            return text.encode("utf-8")
        raise ValueError("Unsupported string type: %s" % (type(text)))

    raise ValueError("Not running on Python2 or Python 3?")
def load_vocab(vocab_file):
    """Read a one-token-per-line vocabulary file into an ordered token -> index dict.

    The index is the zero-based line number; each token is stripped of
    surrounding whitespace before being stored.
    """
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding='utf-8') as reader:
        for index, raw_line in enumerate(reader):
            token = convert_to_unicode(raw_line)
            if not token:
                break
            vocab[token.strip()] = index
    return vocab
def convert_by_vocab(vocab, items):
    """Look up every element of ``items`` in ``vocab`` and return the results as a list.

    Works in both directions (tokens to ids or ids to tokens) depending on the
    mapping passed in; a missing key raises KeyError.
    """
    return [vocab[item] for item in items]
def convert_tokens_to_ids(vocab, tokens):
    """Map each token to its id through ``vocab`` (delegates to `convert_by_vocab`)."""
    ids = convert_by_vocab(vocab, tokens)
    return ids
def convert_ids_to_tokens(inv_vocab, ids):
    """Translate a sequence of ids back into tokens using `inv_vocab`."""
    tokens = convert_by_vocab(inv_vocab, ids)
    return tokens
def whitespace_tokenize(text):
    """Strip `text` and split it on runs of whitespace.

    Returns an empty list when nothing but whitespace (or nothing at all)
    is left after stripping.
    """
    stripped = text.strip()
    return stripped.split() if stripped else []
class FullTokenizer(object):
    """End-to-end tokenizer: basic tokenization followed by WordPiece."""

    def __init__(self, vocab_file, do_lower_case=True):
        """Load the vocabulary and build the two tokenizer stages.

        Args:
            vocab_file: path handed to `load_vocab`.
            do_lower_case: forwarded to `BasicTokenizer`.
        """
        self.vocab = load_vocab(vocab_file)
        # Reverse lookup (id -> token) used by `convert_ids_to_tokens`.
        self.inv_vocab = {index: token for token, index in self.vocab.items()}
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

    def tokenize(self, text):
        """Split `text` into basic tokens, then each of those into word pieces."""
        return [
            piece
            for word in self.basic_tokenizer.tokenize(text)
            for piece in self.wordpiece_tokenizer.tokenize(word)
        ]

    def convert_tokens_to_ids(self, tokens):
        """Look up the vocabulary id of every token."""
        return convert_by_vocab(self.vocab, tokens)

    def convert_ids_to_tokens(self, ids):
        """Look up the token of every vocabulary id."""
        return convert_by_vocab(self.inv_vocab, ids)
class BasicTokenizer(object):
    """Basic tokenizer: text cleanup, CJK spacing, lower casing, punctuation splitting."""

    def __init__(self, do_lower_case=True):
        """Create a BasicTokenizer.

        Args:
            do_lower_case: whether tokens are lower-cased (and stripped of
                accents) before punctuation splitting.
        """
        self.do_lower_case = do_lower_case

    def tokenize(self, text):
        """Turn a piece of text into a list of basic tokens."""
        cleaned = self._clean_text(convert_to_unicode(text))

        # This was added on November 1st, 2018 for the multilingual and Chinese
        # models. It is also applied to the English models now, but that does
        # not matter since the English models were not trained on any Chinese
        # data and generally have none in them (there are Chinese characters in
        # the vocabulary because the English Wikipedia contains some Chinese
        # words).
        spaced = self._tokenize_chinese_chars(cleaned)

        pieces = []
        for word in whitespace_tokenize(spaced):
            if self.do_lower_case:
                word = self._run_strip_accents(word.lower())
            pieces.extend(self._run_split_on_punc(word))

        # Re-join and re-split so that no empty or padded piece survives.
        return whitespace_tokenize(" ".join(pieces))

    def _run_strip_accents(self, text):
        """Remove combining marks (category "Mn") from the NFD form of `text`."""
        decomposed = unicodedata.normalize("NFD", text)
        return "".join(
            ch for ch in decomposed if unicodedata.category(ch) != "Mn"
        )

    def _run_split_on_punc(self, text):
        """Split `text` so that every punctuation character is its own piece."""
        groups = []
        open_new_group = True
        for ch in text:
            if _is_punctuation(ch):
                groups.append([ch])
                open_new_group = True
                continue
            if open_new_group:
                groups.append([])
                open_new_group = False
            groups[-1].append(ch)
        return ["".join(group) for group in groups]

    def _tokenize_chinese_chars(self, text):
        """Surround every CJK character with single spaces."""
        return "".join(
            " " + ch + " " if self._is_chinese_char(ord(ch)) else ch
            for ch in text
        )

    def _is_chinese_char(self, cp):
        """Tell whether code point `cp` belongs to a CJK ideograph block."""
        # A "chinese character" is defined as anything in the CJK Unicode
        # blocks: https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Despite the name, those blocks do NOT contain all Japanese and Korean
        # characters. Modern Korean Hangul, Japanese Hiragana and Katakana live
        # in other blocks; they are used to write space-separated words, so
        # they are handled like every other language.
        cjk_ranges = (
            (0x4E00, 0x9FFF),
            (0x3400, 0x4DBF),
            (0x20000, 0x2A6DF),
            (0x2A700, 0x2B73F),
            (0x2B740, 0x2B81F),
            (0x2B820, 0x2CEAF),
            (0xF900, 0xFAFF),
            (0x2F800, 0x2FA1F),
        )
        for low, high in cjk_ranges:
            if low <= cp <= high:
                return True
        return False

    def _clean_text(self, text):
        """Drop NUL, U+FFFD and control characters; map whitespace to a plain space."""
        kept = []
        for ch in text:
            if ord(ch) in (0, 0xfffd) or _is_control(ch):
                continue
            kept.append(" " if _is_whitespace(ch) else ch)
        return "".join(kept)
class WordpieceTokenizer(object):
    """Greedy longest-match-first WordPiece tokenizer."""

    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
        """Store the vocabulary and the tokenizer limits.

        Args:
            vocab: container supporting `in` lookups of word pieces.
            unk_token: token emitted for words that cannot be split.
            max_input_chars_per_word: words longer than this become `unk_token`.
        """
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """Split a piece of text into word pieces.

        Each whitespace-separated word is consumed from left to right, always
        taking the longest prefix found in the vocabulary; every piece after
        the first is looked up with a "##" prefix.

        For example:
            input = "unaffable"
            output = ["un", "##aff", "##able"]

        Args:
            text: a single token or whitespace separated tokens. This should
                have already been passed through `BasicTokenizer`.

        Returns:
            A list of wordpiece tokens. A word that is too long, or that has a
            remainder with no matching piece, contributes a single `unk_token`.
        """
        text = convert_to_unicode(text)

        result = []
        for word in whitespace_tokenize(text):
            word_len = len(word)
            if word_len > self.max_input_chars_per_word:
                result.append(self.unk_token)
                continue

            pieces = []
            begin = 0
            while begin < word_len:
                marker = "##" if begin > 0 else ""
                # Try the longest candidate first and shrink from the right.
                for stop in range(word_len, begin, -1):
                    candidate = marker + word[begin:stop]
                    if candidate in self.vocab:
                        pieces.append(candidate)
                        begin = stop
                        break
                else:
                    # No prefix of the remainder is known: the word is unsplittable.
                    pieces = None
                    break

            if pieces is None:
                result.append(self.unk_token)
            else:
                result.extend(pieces)
        return result
def _is_whitespace(char): | |||||
"""Checks whether `chars` is a whitespace character.""" | |||||
# \t, \n, and \r are technically contorl characters but we treat them | |||||
# as whitespace since they are generally considered as such. | |||||
if char == " " or char == "\t" or char == "\n" or char == "\r": | |||||
return True | |||||
cat = unicodedata.category(char) | |||||
if cat == "Zs": | |||||
return True | |||||
return False | |||||
def _is_control(char): | |||||
"""Checks whether `chars` is a control character.""" | |||||
# These are technically control characters but we count them as whitespace | |||||
# characters. | |||||
if char == "\t" or char == "\n" or char == "\r": | |||||
return False | |||||
cat = unicodedata.category(char) | |||||
if cat.startswith("C"): | |||||
return True | |||||
return False | |||||
def _is_punctuation(char): | |||||
"""Checks whether `chars` is a punctuation character.""" | |||||
cp = ord(char) | |||||
# We treat all non-letter/number ASCII as punctuation. | |||||
# Characters such as "^", "$", and "`" are not in the Unicode | |||||
# Punctuation class but we treat them as punctuation anyways, for | |||||
# consistency. | |||||
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or | |||||
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): | |||||
return True | |||||
cat = unicodedata.category(char) | |||||
if cat.startswith("P"): | |||||
return True | |||||
return False |
@@ -0,0 +1,382 @@ | |||||
""" | |||||
Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
import os | |||||
import math | |||||
import oneflow as flow | |||||
import oneflow.typing as tp | |||||
from typing import Tuple,Any | |||||
import bert as bert_util | |||||
def BertForSequenceClassification(
    input_ids_blob,
    input_mask_blob,
    token_type_ids_blob,
    label_blob,
    vocab_size,
    seq_length=512,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=16,
    initializer_range=0.02,
    label_num=2,
    is_student=False,
    fit_size=768,
    is_train=False
):
    """Build a BERT sequence classifier and expose its intermediate outputs.

    The graph is a `bert_util.BertBackbone`, a pooler over the first token
    (`PooledOutput`) and a linear classification head (`_AddClassfication`).
    No extra variable namespace is opened here (a 'teacher' namespace was
    left disabled in the original code).

    Args:
        input_ids_blob, input_mask_blob, token_type_ids_blob: model inputs,
            forwarded to the backbone.
        label_blob: forwarded to the classification head.
        is_student: when true, every returned hidden state is additionally
            projected to `fit_size` features with `fit_dense`.
        fit_size: output width of that projection.
        is_train: forwarded as the trainable flag of every created variable.
        Remaining arguments are forwarded unchanged to the backbone / head.

    Returns:
        Tuple `(logit_blob, sequence_output, att_output)` where
        `sequence_output` is the list [embedding output, encoder layer
        outputs...] (projected when `is_student`) and `att_output` is the
        backbone's list of attention probabilities.
    """
    backbone = bert_util.BertBackbone(
        input_ids_blob=input_ids_blob,
        input_mask_blob=input_mask_blob,
        token_type_ids_blob=token_type_ids_blob,
        vocab_size=vocab_size,
        seq_length=seq_length,
        hidden_size=hidden_size,
        num_hidden_layers=num_hidden_layers,
        num_attention_heads=num_attention_heads,
        intermediate_size=intermediate_size,
        hidden_act=hidden_act,
        hidden_dropout_prob=hidden_dropout_prob,
        attention_probs_dropout_prob=attention_probs_dropout_prob,
        max_position_embeddings=max_position_embeddings,
        type_vocab_size=type_vocab_size,
        initializer_range=initializer_range,
        is_train=is_train
    )

    pooled_output = PooledOutput(
        sequence_output=backbone.sequence_output(),
        hidden_size=hidden_size,
        initializer_range=initializer_range,
        is_train=is_train
    )

    logit_blob = _AddClassfication(
        input_blob=pooled_output,
        label_blob=label_blob,
        hidden_size=hidden_size,
        label_num=label_num,
        initializer_range=initializer_range,
        scope_name='classification',
        is_train=is_train
    )

    sequence_output = backbone.all_encoder_layers()
    att_output = backbone.all_attention_probs()
    embed_output = backbone.embedding_output()
    # The embedding output becomes "layer 0" of the hidden-state list.
    sequence_output.insert(0, embed_output)

    if is_student:
        # Every projection uses the same 'fit_dense' scope name.
        sequence_output = [
            fit_dense(
                input_blob=hidden_state,
                hidden_size=hidden_size,
                label_num=fit_size,
                initializer_range=initializer_range,
                scope_name='fit_dense',
                is_train=is_train
            )
            for hidden_state in sequence_output
        ]

    return logit_blob, sequence_output, att_output
def BertStudentForSequenceClassification(
    input_ids_blob,
    input_mask_blob,
    token_type_ids_blob,
    label_blob,
    vocab_size,
    seq_length=512,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=16,
    initializer_range=0.02,
    label_num=2,
    is_student=False,
    fit_size=768,
    is_train=True
):
    """Build the student BERT classifier inside the 'student' namespace.

    Same structure as `BertForSequenceClassification` (backbone, pooler,
    linear head), but the graph is created under
    `flow.scope.namespace('student')` and `is_train` defaults to True.

    Args:
        input_ids_blob, input_mask_blob, token_type_ids_blob: model inputs,
            forwarded to the backbone.
        label_blob: forwarded to the classification head.
        is_student: when true, every returned hidden state is additionally
            projected to `fit_size` features with `fit_dense`.
        fit_size: output width of that projection.
        is_train: forwarded as the trainable flag of every created variable.
        Remaining arguments are forwarded unchanged to the backbone / head.

    Returns:
        Tuple `(logit_blob, sequence_output, att_output)` where
        `sequence_output` is the list [embedding output, encoder layer
        outputs...] (projected when `is_student`) and `att_output` is the
        backbone's list of attention probabilities.
    """
    # NOTE(review): the whole graph, including the fit_dense projections, is
    # assumed to be built inside the 'student' namespace - confirm against the
    # original indentation, since it decides the variable name prefix.
    with flow.scope.namespace('student'):
        backbone = bert_util.BertBackbone(
            input_ids_blob=input_ids_blob,
            input_mask_blob=input_mask_blob,
            token_type_ids_blob=token_type_ids_blob,
            vocab_size=vocab_size,
            seq_length=seq_length,
            hidden_size=hidden_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            intermediate_size=intermediate_size,
            hidden_act=hidden_act,
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_position_embeddings=max_position_embeddings,
            type_vocab_size=type_vocab_size,
            initializer_range=initializer_range,
            is_train=is_train
        )

        pooled_output = PooledOutput(
            sequence_output=backbone.sequence_output(),
            hidden_size=hidden_size,
            initializer_range=initializer_range,
            is_train=is_train
        )

        logit_blob = _AddClassfication(
            input_blob=pooled_output,
            label_blob=label_blob,
            hidden_size=hidden_size,
            label_num=label_num,
            initializer_range=initializer_range,
            scope_name='classification',
            is_train=is_train
        )

        sequence_output = backbone.all_encoder_layers()
        att_output = backbone.all_attention_probs()
        embed_output = backbone.embedding_output()
        # The embedding output becomes "layer 0" of the hidden-state list.
        sequence_output.insert(0, embed_output)

        if is_student:
            # Every projection uses the same 'fit_dense' scope name.
            sequence_output = [
                fit_dense(
                    input_blob=hidden_state,
                    hidden_size=hidden_size,
                    label_num=fit_size,
                    initializer_range=initializer_range,
                    scope_name='fit_dense',
                    is_train=is_train
                )
                for hidden_state in sequence_output
            ]

    return logit_blob, sequence_output, att_output
def CreateInitializer(std):
    """Return the initializer produced by `flow.truncated_normal(std)`."""
    initializer = flow.truncated_normal(std)
    return initializer
def _EmbeddingLookup(input_ids_blob,
                     vocab_size,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name="word_embeddings",
                     is_train=True):
    """Create a word-embedding table and gather rows for `input_ids_blob`.

    Args:
        input_ids_blob: indices used to gather rows (axis 0) of the table.
        vocab_size: number of rows of the table.
        embedding_size: number of columns of the table.
        initializer_range: argument of `CreateInitializer` for the table.
        word_embedding_name: variable name of the table.
        is_train: trainable flag of the table variable.

    Returns:
        Tuple `(gathered embeddings, embedding table variable)`.
    """
    table = flow.get_variable(
        name=word_embedding_name,
        shape=[vocab_size, embedding_size],
        dtype=flow.float,
        trainable=is_train,
        initializer=CreateInitializer(initializer_range),
    )
    gathered = flow.gather(params=table, indices=input_ids_blob, axis=0)
    return gathered, table
def watch_diff_handler(blob: tp.Numpy):
    """Debug callback: print the received array with its shape and dtype."""
    shape, dtype = blob.shape, blob.dtype
    print("watch_diff_handler:", blob, shape, dtype)
def watch_handler(y: tp.Numpy):
    """Debug callback: print the received array after an "out:" label."""
    label = "out:"
    print(label, y)
from lstm import lstm,Blstm | |||||
def LSTMStudentForSequenceClassification(
    input_ids_blob,
    input_mask_blob,
    token_type_ids_blob,
    label_blob,
    vocab_size,
    seq_length=512,
    hidden_size=300,
    intermediate_size=400,
    num_hidden_layers=1,
    hidden_dropout_prob=0.5,
    initializer_range=0.25,
    label_num=2,
    is_student=True,
    is_train=True
):
    """Build an LSTM student classifier and return its logits.

    Pipeline: embedding lookup -> `lstm` (last step only) -> dense 'FC1' with
    ReLU -> `_Dropout` -> dense 'FC2' producing `label_num` logits.

    Only `input_ids_blob`, `vocab_size`, `hidden_size`, `intermediate_size`,
    `hidden_dropout_prob`, `initializer_range`, `label_num` and `is_train`
    are read by this function; the remaining parameters are accepted but not
    used in the body.

    Returns:
        The logit blob produced by the 'FC2' dense layer.
    """
    # NOTE(review): the nesting of the namespaces below was reconstructed; in
    # particular whether FC1 / dropout / FC2 sit inside the 'lstm' namespace
    # decides their variable name prefix - confirm against the original file.
    with flow.scope.namespace('student'):
        with flow.scope.namespace("embeddings"):
            # The table gets one extra row on top of vocab_size. The embedding
            # initializer range is left at _EmbeddingLookup's default here;
            # `initializer_range` is only used for the dense layers below.
            (embedding_output_, embedding_table_) = _EmbeddingLookup(
                input_ids_blob=input_ids_blob,
                vocab_size=vocab_size+1,
                embedding_size=hidden_size,
                word_embedding_name="word_embeddings",
                is_train=is_train)

        with flow.scope.namespace('lstm'):
            # return_sequence=False: only the last time step is kept.
            output = lstm(embedding_output_, hidden_size, return_sequence=False, is_train=is_train)
            output = flow.layers.dense(inputs=output,units=intermediate_size,activation=flow.nn.relu,kernel_initializer=CreateInitializer(initializer_range),trainable=is_train,name='FC1')
            # NOTE(review): dropout is applied regardless of `is_train`;
            # presumably callers pass hidden_dropout_prob=0 for evaluation - verify.
            output = _Dropout(output, hidden_dropout_prob)
            logit_blob = flow.layers.dense(inputs=output,units=label_num,kernel_initializer=CreateInitializer(initializer_range),trainable=is_train,name='FC2')
    return logit_blob
def PooledOutput(sequence_output, hidden_size, initializer_range, is_train):
    """Pool a sequence output: first position -> dense layer -> tanh.

    Args:
        sequence_output: blob sliced at index 0 of its second axis.
        hidden_size: feature width; used for the reshape and as both the
            input and output size of the dense layer.
        initializer_range: argument of `bert_util.CreateInitializer`.
        is_train: forwarded to `bert_util._FullyConnected`.

    Returns:
        The tanh-activated dense projection of the first position.
    """
    with flow.scope.namespace("bert-pooler"):
        first_position = flow.slice(sequence_output, [None, 0, 0], [None, 1, -1])
        first_position = flow.reshape(first_position, [-1, hidden_size])
        projected = bert_util._FullyConnected(
            first_position,
            input_size=hidden_size,
            units=hidden_size,
            weight_initializer=bert_util.CreateInitializer(initializer_range),
            name="dense",
            is_train=is_train
        )
        pooled = flow.math.tanh(projected)
    return pooled
def _AddClassfication(input_blob, label_blob, hidden_size, label_num, initializer_range,
                      scope_name='classification',is_train=True):
    """Append a linear classification head and return its logits.

    Creates "output_weights" of shape [label_num, hidden_size] (random normal,
    stddev `initializer_range`) and a zero-initialised "output_bias" under
    `scope_name`, then computes `input_blob @ weights^T + bias`.

    `label_blob` is accepted but not used: no loss is computed here, only
    the logits are returned.
    """
    with flow.scope.namespace(scope_name):
        weights = flow.get_variable(
            name="output_weights",
            shape=[label_num, hidden_size],
            dtype=input_blob.dtype,
            initializer=flow.random_normal_initializer(
                mean=0.0, stddev=initializer_range, seed=None, dtype=None),
            trainable=is_train
        )
        bias = flow.get_variable(
            name="output_bias",
            shape=[label_num],
            dtype=input_blob.dtype,
            initializer=flow.constant_initializer(0.0),
            trainable=is_train
        )
        logits = flow.nn.bias_add(
            flow.matmul(input_blob, weights, transpose_b=True), bias)
    return logits
def _Dropout(input_blob, dropout_prob): | |||||
if dropout_prob == 0.0: | |||||
return input_blob | |||||
return flow.nn.dropout(input_blob, rate=dropout_prob) | |||||
def fit_dense(input_blob, hidden_size, label_num, initializer_range,
              scope_name='fit_dense',is_train=True):
    """Project the last axis of `input_blob` from `hidden_size` to `label_num`.

    Inputs with more than two axes are flattened to 2-D before the matmul and
    restored to their leading shape afterwards. The "weight"
    ([label_num, hidden_size], random normal with stddev `initializer_range`)
    and zero-initialised "bias" variables are created under `scope_name`.

    Returns:
        The projected blob, with `label_num` as its last dimension.
    """
    with flow.scope.namespace(scope_name):
        in_shape = input_blob.shape
        in_num_axes = len(in_shape)
        assert in_num_axes >= 2

        needs_flatten = in_num_axes > 2
        if needs_flatten:
            input_blob = flow.reshape(input_blob, (-1, in_shape[-1]))

        weight = flow.get_variable(
            name="weight",
            shape=[label_num, hidden_size],
            dtype=input_blob.dtype,
            initializer=flow.random_normal_initializer(
                mean=0.0, stddev=initializer_range, seed=None, dtype=None),
            trainable=is_train
        )
        bias = flow.get_variable(
            name="bias",
            shape=[label_num],
            dtype=input_blob.dtype,
            initializer=flow.constant_initializer(0.0),
            trainable=is_train
        )

        projected = flow.nn.bias_add(
            flow.matmul(input_blob, weight, transpose_b=True), bias)

        if needs_flatten:
            # Restore the leading axes of the original input.
            projected = flow.reshape(projected, in_shape[:-1] + (label_num,))
    return projected
def soft_cross_entropy(predicts, targets):
    """Soft-label cross entropy between two sets of logits.

    Both inputs go through a softmax over the last axis; the result is the
    mean over all elements of `-softmax(targets) * log(softmax(predicts))`.
    """
    log_probs = flow.math.log(flow.nn.softmax(predicts, axis=-1))
    target_probs = flow.nn.softmax(targets, axis=-1)
    weighted = flow.math.multiply(flow.math.negative(target_probs), log_probs)
    return flow.math.reduce_mean(weighted)
def mseloss(rep1, rep2):
    """Mean over all elements of the squared difference `rep1 - rep2`."""
    difference = rep1 - rep2
    return flow.math.reduce_mean(flow.math.square(difference))
def layer_distill(args, student_reps, teacher_reps):
    """Sum of MSE losses between student and matching teacher hidden states.

    Both lists hold one more entry than the number of layers. The teacher
    layer count must be a multiple of the student layer count; student entry
    `i` is compared with teacher entry `i * (teacher_layers // student_layers)`.
    `args` is not used.
    """
    teacher_layer_num = len(teacher_reps) - 1
    student_layer_num = len(student_reps) - 1
    assert teacher_layer_num % student_layer_num == 0
    layers_per_block = int(teacher_layer_num / student_layer_num)

    matched_teacher_reps = [
        teacher_reps[idx * layers_per_block]
        for idx in range(student_layer_num + 1)
    ]
    return sum(
        (mseloss(student_rep, teacher_rep)
         for student_rep, teacher_rep in zip(student_reps, matched_teacher_reps)),
        0.,
    )
def att_distill(args, student_atts, teacher_atts):
    """Sum of MSE losses between student and matching teacher attention maps.

    The teacher layer count must be a multiple of the student layer count;
    student layer `i` is compared with the last teacher layer of block `i`.
    Entries that are <= -1e2 are replaced by zero on both sides before the
    comparison. `args` is not used.
    """
    teacher_layer_num = len(teacher_atts)
    student_layer_num = len(student_atts)
    assert teacher_layer_num % student_layer_num == 0
    layers_per_block = int(teacher_layer_num / student_layer_num)

    matched_teacher_atts = [
        teacher_atts[idx * layers_per_block + layers_per_block - 1]
        for idx in range(student_layer_num)
    ]

    def zero_out_large_negatives(att):
        # Replace entries <= -1e2 with 0 so they do not dominate the MSE.
        threshold = flow.constant(-1e2,dtype=flow.float)
        return flow.where(att <= threshold, flow.zeros_like(att), att)

    att_loss = 0.
    for student_att, teacher_att in zip(student_atts, matched_teacher_atts):
        cleaned_student = zero_out_large_negatives(student_att)
        cleaned_teacher = zero_out_large_negatives(teacher_att)
        att_loss += mseloss(cleaned_student, cleaned_teacher)
    return att_loss
def pred_distill(args, student_logits, teacher_logits):
    """Soft cross entropy between temperature-scaled student and teacher logits.

    Both logits are divided by `args.temperature` before being passed to
    `soft_cross_entropy`.
    """
    temperature = args.temperature
    scaled_student = student_logits / temperature
    scaled_teacher = teacher_logits / temperature
    return soft_cross_entropy(scaled_student, scaled_teacher)
@@ -0,0 +1,311 @@ | |||||
""" | |||||
Copyright 2020 Tianshu AI Platform. All Rights Reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
from __future__ import absolute_import | |||||
from __future__ import division | |||||
from __future__ import print_function | |||||
import os | |||||
import time | |||||
import argparse | |||||
from datetime import datetime | |||||
import test_global_storage | |||||
import oneflow as flow | |||||
import numpy as np | |||||
np.set_printoptions(suppress=True) | |||||
def _FullyConnected(input_blob,weight_blob,bias_blob):
    """Compute `input_blob @ weight_blob`, plus `bias_blob` when one is given.

    Args:
        input_blob: left operand of the matmul.
        weight_blob: right operand of the matmul.
        bias_blob: bias added with `flow.nn.bias_add`, or None for no bias.

    Returns:
        The resulting blob.
    """
    output_blob = flow.matmul(input_blob, weight_blob)
    # Check for None explicitly: the truth value of a blob object is not a
    # reliable "bias was supplied" signal.
    if bias_blob is not None:
        output_blob = flow.nn.bias_add(output_blob, bias_blob)
    return output_blob
def lstm(input,units,return_sequence=False,initial_state=None,direction='forward',layer_index=0, is_train=True):
    '''
    Unrolled single-layer LSTM built from basic ops.

    input: sequence input tensor with shape [batch_size,sequence_length,embedding size]
    units: hidden units numbers
    return_sequence: if true return the outputs of all time steps concatenated
        on axis 1, otherwise only the last step reshaped to [-1, units]
    initial_state: optional (h, c) pair; zero constants are used when falsy
    direction, layer_index: only used to build the variable namespace
        'layer<layer_index>' / '<direction>'
    is_train: trainable flag of every created variable
    '''
    batch_size, seq_len, input_size = input.shape[0], input.shape[1], input.shape[2]
    dtype = flow.float32

    # (short tag, variable-name prefix) for the four gates, in creation order.
    gates = (('i', 'input'), ('f', 'forget'), ('c', 'cell'), ('o', 'output'))
    tags = [tag for tag, _ in gates]

    x_weights, h_weights, biases = {}, {}, {}
    with flow.scope.namespace('layer'+str(layer_index)):
        with flow.scope.namespace(direction):
            for tag, prefix in gates:
                x_weights[tag] = flow.get_variable(
                    name=prefix + '-weight',
                    shape=[input_size, units],
                    dtype=dtype,
                    trainable=is_train,
                    initializer=flow.glorot_normal_initializer())
                h_weights[tag] = flow.get_variable(
                    name=prefix + '-h-weight',
                    shape=[units, units],
                    dtype=dtype,
                    trainable=is_train,
                    initializer=flow.glorot_normal_initializer())
                biases[tag] = flow.get_variable(
                    name=prefix + '-bias',
                    shape=[units],
                    dtype=dtype,
                    trainable=is_train,
                    initializer=flow.constant_initializer(0.0))

    # Publish every parameter through test_global_storage (debug/test hook).
    # NOTE(review): the watches are registered here after the namespace
    # blocks; the original nesting could not be recovered - confirm.
    for tag in tags:
        flow.watch(x_weights[tag], test_global_storage.Setter("weight_blob_" + tag))
    for tag in tags:
        flow.watch(h_weights[tag], test_global_storage.Setter("weight_blob_" + tag + "h"))
    for tag in tags:
        flow.watch(biases[tag], test_global_storage.Setter("bias_blob_" + tag))

    def step_function(input,states):
        # One LSTM time step; returns the new hidden output and (h, c) state.
        prev_h = states[0]
        prev_c = states[1]

        from_input = {
            tag: _FullyConnected(input, x_weights[tag], biases[tag]) for tag in tags
        }
        from_hidden = {
            tag: _FullyConnected(prev_h, h_weights[tag], None) for tag in tags
        }
        pre_activation = {tag: from_input[tag] + from_hidden[tag] for tag in tags}

        input_gate = flow.math.sigmoid(pre_activation['i'])
        forget_gate = flow.math.sigmoid(pre_activation['f'])
        candidate = flow.math.tanh(pre_activation['c'])
        output_gate = flow.math.sigmoid(pre_activation['o'])

        next_c = forget_gate * prev_c + input_gate * candidate
        next_h = output_gate * flow.math.tanh(next_c)
        return next_h, (next_h, next_c)

    if initial_state:
        states = initial_state
    else:
        states = [
            flow.constant(0, dtype=flow.float32, shape=[batch_size,units]),
            flow.constant(0, dtype=flow.float32, shape=[batch_size,units]),
        ]

    step_outputs = []
    for index in range(seq_len):
        # Take time step `index` and drop its singleton time axis.
        step_input = flow.slice(input, [None, index, 0], [None, 1, input_size])
        step_input = flow.reshape(step_input, [-1, input_size])
        step_output, states = step_function(step_input, states)
        step_outputs.append(flow.reshape(step_output, [-1, 1, units]))

    last_output = step_outputs[-1]
    # NOTE(review): the concat is built even when only the last step is returned.
    outputs = flow.concat(step_outputs,axis=1)

    if return_sequence:
        return outputs
    return flow.reshape(last_output,[-1,units])
def Blstm(input,units,return_sequence=True,initial_state=None,layer_index=0,is_train=True):
    """Bidirectional LSTM: forward and backward passes merged by addition.

    The backward pass runs `lstm` on the input reversed along axis 1 and its
    result is reversed along axis 1 again before being added to the forward
    result.

    return_sequence should be True for BLSTM currently.
    """
    shared_kwargs = dict(
        return_sequence=return_sequence,
        initial_state=initial_state,
        layer_index=layer_index,
        is_train=is_train,
    )
    forward = lstm(input, units, direction='forward', **shared_kwargs)

    reversed_input = flow.reverse(input,axis=1)
    backward = lstm(reversed_input, units, direction='backward', **shared_kwargs)
    backward = flow.reverse(backward,axis=1)

    # default merge method : add
    return forward + backward
def TestLstm():
    """Smoke-test `lstm` on random input and print the output shape.

    Configures one GPU device, defines a global function that runs `lstm`
    with 512 units on a (32, 128, 312) float32 input, initialises a
    checkpoint, feeds uniform random data in [-10, 10) and prints the shape
    of the result.
    """
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float32)
    flow.config.gpu_device_num(1)

    @flow.global_function(func_config)
    def InferenceNet(sentence=flow.FixedTensorDef((32, 128, 312), dtype=flow.float32)):
        # Only the last time step is returned (return_sequence=False).
        output = lstm(sentence,512,return_sequence=False)
        return output

    flow.config.enable_debug_mode(True)
    check_point = flow.train.CheckPoint()
    check_point.init()
    sentence_in = np.random.uniform(-10, 10, (32, 128, 312)).astype(np.float32)
    output_of = InferenceNet(sentence_in).get()
    print('output shape',output_of.numpy().shape)
    print('lstm hello world')

    # Disabled reference check kept from the original: it copies the watched
    # weights into a Keras LSTM layer and compares the two outputs.
    # print('x_o',output_of[0].numpy())
    # print('o',output_of[3].numpy())
    #print('output shape:',output.numpy().shape)
    # print('weight:',test_global_storage.Get("weight_blob_i") )
    # print('weight:',test_global_storage.Get("weight_blob_ih").shape )
    # print('lstm hello world')
    # from tensorflow.keras import layers
    # from tensorflow import keras
    #
    # inputs = keras.Input(shape=(14, 64))
    # x = layers.LSTM(15,return_sequences=True,recurrent_activation ='sigmoid',name="lstm_one")(inputs)
    #
    # weight_blob_i = test_global_storage.Get("weight_blob_i")
    # weight_blob_f = test_global_storage.Get("weight_blob_f")
    # weight_blob_c = test_global_storage.Get("weight_blob_c")
    # weight_blob_o = test_global_storage.Get("weight_blob_o")
    # kernel_1 = np.concatenate( ( weight_blob_i,weight_blob_f,weight_blob_c,weight_blob_o) ,axis=1)
    #
    # weight_blob_ih = test_global_storage.Get("weight_blob_ih")
    # weight_blob_fh = test_global_storage.Get("weight_blob_fh")
    # weight_blob_ch = test_global_storage.Get("weight_blob_ch")
    # weight_blob_oh = test_global_storage.Get("weight_blob_oh")
    # kernel_2 = np.concatenate( ( weight_blob_ih,weight_blob_fh,weight_blob_ch,weight_blob_oh) ,axis=1)
    #
    # bias_blob_i = test_global_storage.Get("bias_blob_i")
    # bias_blob_f = test_global_storage.Get("bias_blob_f")
    # bias_blob_c = test_global_storage.Get("bias_blob_c")
    # bias_blob_o = test_global_storage.Get("bias_blob_o")
    # bias_1 = np.concatenate( ( bias_blob_i,bias_blob_f,bias_blob_c,bias_blob_o) )
    #
    # model = keras.Model(inputs,x)
    # model.get_layer("lstm_one").set_weights([kernel_1,kernel_2,bias_1])
    # output_tf = model.predict(sentence_in)
    #
    # print(output_of.numpy()[:,-1,:])
    # print('-'*100)
    # print(output_tf[:,-1,:])
    # assert(np.allclose(output_of.numpy(),output_tf, rtol=1e-04,atol=1e-04))
def TestBlstm():
    """Smoke-test `Blstm` inside a OneFlow global function.

    Builds an inference function over a fixed (8, 15, 64) float32 input,
    initialises a fresh checkpoint, feeds uniformly random data in [-10, 10)
    and prints the shape of the result.
    """
    input_shape = (8, 15, 64)

    config = flow.FunctionConfig()
    config.default_data_type(flow.float32)
    flow.config.gpu_device_num(1)

    @flow.global_function(config)
    def InferenceNet(sentence=flow.FixedTensorDef(input_shape, dtype=flow.float32)):
        return Blstm(sentence, 15, return_sequence=True)

    flow.config.enable_debug_mode(True)
    checkpoint = flow.train.CheckPoint()
    checkpoint.init()

    batch = np.random.uniform(-10, 10, input_shape).astype(np.float32)
    result = InferenceNet(batch).get()
    print('output shape', result.numpy().shape)
    print('blstm hello world')
if __name__ == "__main__":
    # Script entry point: only the unidirectional LSTM smoke test is run;
    # the bidirectional one is left disabled.
    TestLstm()
    # TestBlstm()
@@ -0,0 +1,29 @@ | |||||
""" | |||||
Copyright 2020 The OneFlow Authors. All rights reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
# Module-level registry: name -> the most recent object stored under that name.
global_storage = {}


def Get(name):
    """Return the object stored under `name`, converted via its `.numpy()` method.

    Args:
        name: key previously used with `Setter`.

    Returns:
        Whatever `.numpy()` of the stored object returns.

    Raises:
        KeyError: if nothing has been stored under `name`. (Previously this
            surfaced as an opaque AttributeError on None.)
    """
    try:
        stored = global_storage[name]
    except KeyError:
        raise KeyError(
            "nothing stored under name %r; call Setter(%r)(value) first" % (name, name)
        ) from None
    return stored.numpy()


def Setter(name):
    """Return a one-argument callback that stores its argument under `name`.

    The callback overwrites any value stored earlier under the same name.
    """
    def _set(x):
        # The dict is mutated in place, so no `global` declaration is needed.
        global_storage[name] = x

    return _set
@@ -0,0 +1,400 @@ | |||||
""" | |||||
Copyright 2020 The OneFlow Authors. All rights reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
"""Tokenization classes.""" | |||||
import collections | |||||
import re | |||||
import unicodedata | |||||
import six | |||||
# import tensorflow as tf | |||||
def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
    """Raise if `do_lower_case` contradicts the casing implied by the checkpoint path.

    The casing is not stored with the checkpoint, so it is guessed from the
    directory name that directly precedes `bert_model.ckpt` in the path.
    Unknown model names, unparsable paths and empty paths are accepted silently.

    Args:
        do_lower_case: the lower-casing flag supplied by the user.
        init_checkpoint: checkpoint path, or a falsy value to skip the check.

    Raises:
        ValueError: if the directory name is a known lower-cased model and
            `do_lower_case` is falsy, or a known cased model and it is truthy.
    """
    if not init_checkpoint:
        return

    match = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
    if match is None:
        return
    model_name = match.group(1)

    lower_models = [
        "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
        "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12"
    ]
    cased_models = [
        "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
        "multi_cased_L-12_H-768_A-12"
    ]

    # The two mismatch cases are mutually exclusive (they need opposite flags).
    if model_name in lower_models and not do_lower_case:
        actual_flag, case_name, opposite_flag = "False", "lowercased", "True"
    elif model_name in cased_models and do_lower_case:
        actual_flag, case_name, opposite_flag = "True", "cased", "False"
    else:
        return

    raise ValueError(
        "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
        "However, `%s` seems to be a %s model, so you "
        "should pass in `--do_lower_case=%s` so that the fine-tuning matches "
        "how the model was pre-training. If this error is wrong, please "
        "just comment out this check." % (actual_flag, init_checkpoint,
                                          model_name, case_name, opposite_flag))
def convert_to_unicode(text):
    """Return `text` as `str`, decoding `bytes` as UTF-8.

    Undecodable byte sequences are dropped (``errors="ignore"``).

    The former Python 2 branch was removed: it referenced the undefined name
    `unicode`, and this module already requires Python 3 (`load_vocab` calls
    `open(..., encoding=...)`). Behaviour on Python 3 is unchanged.

    Args:
        text: a `str` or `bytes` object.

    Returns:
        The text as `str`.

    Raises:
        ValueError: if `text` is neither `str` nor `bytes`.
    """
    if isinstance(text, str):
        return text
    if isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    raise ValueError("Unsupported string type: %s" % (type(text)))
def printable_text(text):
    """Return `text` as a `str` suitable for printing or logging.

    `bytes` input is decoded as UTF-8 with undecodable sequences dropped.

    The former Python 2 branch was removed: it referenced the undefined name
    `unicode`, and this module already requires Python 3 (`load_vocab` calls
    `open(..., encoding=...)`). Behaviour on Python 3 is unchanged.

    Args:
        text: a `str` or `bytes` object.

    Returns:
        The text as `str`.

    Raises:
        ValueError: if `text` is neither `str` nor `bytes`.
    """
    if isinstance(text, str):
        return text
    if isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    raise ValueError("Unsupported string type: %s" % (type(text)))
def load_vocab(vocab_file):
    """Load a vocabulary file into an ordered token -> index mapping.

    The file is read as UTF-8 text, one token per line. Each line is stripped
    of surrounding whitespace and mapped to its zero-based line number. If a
    token occurs on several lines, the last line number wins while the token
    keeps its first insertion position. Blank lines map the empty string.

    Args:
        vocab_file: path of the vocabulary file.

    Returns:
        A `collections.OrderedDict` from token to line index.
    """
    vocab = collections.OrderedDict()
    # Text mode already yields `str`, so no extra decoding step is needed.
    with open(vocab_file, "r", encoding='utf-8') as reader:
        for index, line in enumerate(reader):
            vocab[line.strip()] = index
    return vocab
def convert_by_vocab(vocab, items):
    """Map every element of `items` through `vocab` and return the results as a list.

    Works in either direction (tokens -> ids or ids -> tokens) depending on
    the mapping that is passed in. A missing element raises `KeyError`.
    """
    return [vocab[item] for item in items]
def convert_tokens_to_ids(vocab, tokens):
    """Look up each token of `tokens` in the token -> id mapping `vocab`."""
    return convert_by_vocab(vocab=vocab, items=tokens)
def convert_ids_to_tokens(inv_vocab, ids):
    """Look up each id of `ids` in the id -> token mapping `inv_vocab`."""
    return convert_by_vocab(vocab=inv_vocab, items=ids)
def whitespace_tokenize(text):
    """Strip `text` and split it on runs of whitespace.

    Returns an empty list when nothing but whitespace is left.
    """
    stripped = text.strip()
    return stripped.split() if stripped else []
class FullTokenizer(object):
    """End-to-end tokenizer: basic tokenization followed by WordPiece splitting."""

    def __init__(self, vocab_file, do_lower_case=True):
        """Load the vocabulary and build the two underlying tokenizers.

        Args:
            vocab_file: path handed to `load_vocab`.
            do_lower_case: forwarded to `BasicTokenizer`.
        """
        self.vocab = load_vocab(vocab_file)
        # Reverse mapping used to turn ids back into tokens.
        self.inv_vocab = dict((index, token) for token, index in self.vocab.items())
        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

    def tokenize(self, text):
        """Return the word pieces of `text` as a flat list."""
        pieces = []
        for word in self.basic_tokenizer.tokenize(text):
            pieces.extend(self.wordpiece_tokenizer.tokenize(word))
        return pieces

    def convert_tokens_to_ids(self, tokens):
        """Map tokens to ids using the loaded vocabulary."""
        return convert_by_vocab(vocab=self.vocab, items=tokens)

    def convert_ids_to_tokens(self, ids):
        """Map ids back to tokens using the inverted vocabulary."""
        return convert_by_vocab(vocab=self.inv_vocab, items=ids)
class BasicTokenizer(object):
    """Basic tokenizer: text clean-up, optional lower casing and punctuation splitting."""

    def __init__(self, do_lower_case=True):
        """Create the tokenizer.

        Args:
            do_lower_case: when truthy, tokens are lower-cased and stripped of
                accents before punctuation splitting.
        """
        self.do_lower_case = do_lower_case

    def tokenize(self, text):
        """Split `text` into a list of basic tokens."""
        cleaned = self._clean_text(convert_to_unicode(text))

        # CJK characters are isolated with spaces so that each becomes its own
        # token. This is applied to every model, including English-only ones,
        # whose vocabularies may still contain a few CJK characters.
        spaced = self._tokenize_chinese_chars(cleaned)

        split_tokens = []
        for word in whitespace_tokenize(spaced):
            if self.do_lower_case:
                word = self._run_strip_accents(word.lower())
            split_tokens.extend(self._run_split_on_punc(word))

        return whitespace_tokenize(" ".join(split_tokens))

    def _run_strip_accents(self, text):
        """Remove combining marks (category "Mn") after NFD normalisation."""
        decomposed = unicodedata.normalize("NFD", text)
        return "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")

    def _run_split_on_punc(self, text):
        """Split `text` so that every punctuation character is its own piece."""
        groups = []
        start_new_word = True
        for ch in text:
            if _is_punctuation(ch):
                groups.append([ch])
                start_new_word = True
                continue
            if start_new_word:
                groups.append([])
                start_new_word = False
            groups[-1].append(ch)
        return ["".join(group) for group in groups]

    def _tokenize_chinese_chars(self, text):
        """Surround every CJK character in `text` with single spaces."""
        pieces = []
        for ch in text:
            if self._is_chinese_char(ord(ch)):
                pieces.append(" " + ch + " ")
            else:
                pieces.append(ch)
        return "".join(pieces)

    def _is_chinese_char(self, cp):
        """Return True if code point `cp` lies in one of the CJK ideograph ranges below.

        The ranges cover CJK Unified Ideographs, their extensions and the
        compatibility ideographs:
        https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)

        Hangul, Hiragana and Katakana are in other blocks and are deliberately
        not included: those scripts write space-separated words and are
        handled like any other language.
        """
        cjk_ranges = (
            (0x4E00, 0x9FFF),
            (0x3400, 0x4DBF),
            (0x20000, 0x2A6DF),
            (0x2A700, 0x2B73F),
            (0x2B740, 0x2B81F),
            (0x2B820, 0x2CEAF),
            (0xF900, 0xFAFF),
            (0x2F800, 0x2FA1F),
        )
        return any(low <= cp <= high for low, high in cjk_ranges)

    def _clean_text(self, text):
        """Drop NUL, U+FFFD and control characters; turn whitespace into plain spaces."""
        kept = []
        for ch in text:
            code = ord(ch)
            if code == 0 or code == 0xfffd or _is_control(ch):
                continue
            kept.append(" " if _is_whitespace(ch) else ch)
        return "".join(kept)
class WordpieceTokenizer(object):
    """Greedy longest-match-first WordPiece tokenizer."""

    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
        """Store the vocabulary and limits.

        Args:
            vocab: container of known word pieces (tested with `in`).
            unk_token: token emitted for words that cannot be split.
            max_input_chars_per_word: words longer than this become `unk_token`.
        """
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """Split `text` into word pieces.

        Each whitespace-separated word is consumed left to right, always
        taking the longest vocabulary entry that matches at the current
        position; pieces that do not start the word carry a "##" prefix.

        For example:
            input = "unaffable"
            output = ["un", "##aff", "##able"]

        Args:
            text: a single token or whitespace separated tokens, normally the
                output of `BasicTokenizer`.

        Returns:
            A list of word pieces. A word that is too long, or that cannot be
            covered completely by vocabulary entries, contributes a single
            `unk_token`.
        """
        output_tokens = []
        for word in whitespace_tokenize(convert_to_unicode(text)):
            chars = list(word)
            if len(chars) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            pieces = self._split_word(chars)
            if pieces is None:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(pieces)
        return output_tokens

    def _split_word(self, chars):
        """Return the greedy word pieces of `chars`, or None if some position has no match."""
        pieces = []
        start = 0
        while start < len(chars):
            # Try the longest candidate first and shrink from the right.
            for end in range(len(chars), start, -1):
                candidate = "".join(chars[start:end])
                if start > 0:
                    candidate = "##" + candidate
                if candidate in self.vocab:
                    pieces.append(candidate)
                    start = end
                    break
            else:
                return None
        return pieces
def _is_whitespace(char): | |||||
"""Checks whether `chars` is a whitespace character.""" | |||||
# \t, \n, and \r are technically contorl characters but we treat them | |||||
# as whitespace since they are generally considered as such. | |||||
if char == " " or char == "\t" or char == "\n" or char == "\r": | |||||
return True | |||||
cat = unicodedata.category(char) | |||||
if cat == "Zs": | |||||
return True | |||||
return False | |||||
def _is_control(char): | |||||
"""Checks whether `chars` is a control character.""" | |||||
# These are technically control characters but we count them as whitespace | |||||
# characters. | |||||
if char == "\t" or char == "\n" or char == "\r": | |||||
return False | |||||
cat = unicodedata.category(char) | |||||
if cat.startswith("C"): | |||||
return True | |||||
return False | |||||
def _is_punctuation(char): | |||||
"""Checks whether `chars` is a punctuation character.""" | |||||
cp = ord(char) | |||||
# We treat all non-letter/number ASCII as punctuation. | |||||
# Characters such as "^", "$", and "`" are not in the Unicode | |||||
# Punctuation class but we treat them as punctuation anyways, for | |||||
# consistency. | |||||
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or | |||||
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): | |||||
return True | |||||
cat = unicodedata.category(char) | |||||
if cat.startswith("P"): | |||||
return True | |||||
return False |
@@ -0,0 +1,220 @@ | |||||
""" | |||||
Copyright 2020 The OneFlow Authors. All rights reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
import os | |||||
import time | |||||
import numpy as np | |||||
from collections import OrderedDict | |||||
import pandas as pd | |||||
from datetime import datetime | |||||
import oneflow as flow | |||||
import shutil | |||||
def InitNodes(args):
    """Register the participating machines with OneFlow for multi-node runs.

    Nothing happens unless `args.num_nodes` is greater than 1. Otherwise the
    control port is set from `args.ctrl_port` and the first `args.num_nodes`
    entries of `args.node_ips` are registered as machine addresses.

    Args:
        args: object exposing `num_nodes`, and for multi-node runs also
            `node_ips` and `ctrl_port`.
    """
    if not args.num_nodes > 1:
        return

    assert args.num_nodes <= len(args.node_ips)
    flow.env.ctrl_port(args.ctrl_port)
    machines = [{"addr": ip} for ip in args.node_ips[:args.num_nodes]]
    flow.env.machine(machines)
class Snapshot(object):
    """Thin wrapper around a OneFlow checkpoint that loads or initialises a model."""

    def __init__(self, model_save_dir, model_load_dir):
        """Load the model from `model_load_dir`, or initialise and save a fresh one.

        Args:
            model_save_dir: directory under which `save` creates its
                `snapshot_<name>` sub-directories.
            model_load_dir: directory to restore from. When falsy, the model
                is initialised and immediately saved as `snapshot_initial_model`.

        Raises:
            AssertionError: if `model_load_dir` is given but is not a directory.
        """
        self._model_save_dir = model_save_dir
        self._check_point = flow.train.CheckPoint()
        if model_load_dir:
            assert os.path.isdir(model_load_dir), \
                "model_load_dir is not a directory: {}".format(model_load_dir)
            print("Restoring model from {}.".format(model_load_dir))
            self._check_point.load(model_load_dir)
        else:
            self._check_point.init()
            self.save('initial_model')
            print("Init model on demand.")

    def save(self, name):
        """Save the checkpoint to `<model_save_dir>/snapshot_<name>`.

        The target directory is created if necessary.
        """
        snapshot_save_path = os.path.join(self._model_save_dir, "snapshot_{}".format(name))
        # exist_ok avoids the race between a separate existence check and the
        # directory creation.
        os.makedirs(snapshot_save_path, exist_ok=True)
        print("Saving model to {}.".format(snapshot_save_path))
        self._check_point.save(snapshot_save_path)
class Summary(object):
    """Collects scalar metrics in a pandas DataFrame and mirrors them to a CSV file."""

    def __init__(self, log_dir, config, filename='summary.csv'):
        """Create the log directory and seed the table with the configuration.

        Args:
            log_dir: directory that receives the CSV file; created if missing.
            config: any object; its `str()` is stored in the first row under
                the legend "cfg".
            filename: name of the CSV file inside `log_dir`.
        """
        self._filename = filename
        self._log_dir = log_dir
        # exist_ok avoids the race between a separate existence check and the
        # directory creation.
        os.makedirs(log_dir, exist_ok=True)
        self._metrics = pd.DataFrame({"legend": "cfg", "value": str(config)}, index=[0])

    def scalar(self, legend, iter, value, **kwargs):
        """Append one row and rewrite the CSV file.

        Args:
            legend: name of the metric.
            iter: iteration number; stored as `int(iter)`.
            value: metric value.
            **kwargs: extra columns stored in the same row.
        """
        kwargs['legend'] = legend
        kwargs['iter'] = int(iter)
        kwargs['value'] = value
        row = pd.DataFrame(kwargs, index=[0])
        self._metrics = pd.concat([self._metrics, row], axis=0, sort=False)
        self.save()

    def save(self):
        """Write the whole table to `<log_dir>/<filename>` without the index column."""
        save_path = os.path.join(self._log_dir, self._filename)
        self._metrics.to_csv(save_path, index=False)
class StopWatch(object):
    """Wall-clock timer based on `time.time()` with lap support.

    `start()` must be called before `split()`, and both `start()` and `stop()`
    before `duration()`; the attributes they read do not exist earlier.
    """

    def __init__(self):
        pass

    def start(self):
        """Record the start time and reset the lap marker to it."""
        now = time.time()
        self.start_time = now
        self.last_split = now

    def split(self):
        """Return the seconds elapsed since the previous lap and start a new lap."""
        now = time.time()
        elapsed, self.last_split = now - self.last_split, now
        return elapsed

    def stop(self):
        """Record the stop time."""
        self.stop_time = time.time()

    def duration(self):
        """Return the seconds between `start()` and `stop()`."""
        return self.stop_time - self.start_time
class Metric(object):
    """Accumulates per-step outputs and periodically prints and records their averages."""

    def __init__(self, summary=None, desc='train', print_steps=-1, batch_size=256, keys=()):
        r"""accumulate and calculate metric

        Args:
            summary: A `Summary` object to write in. Any other value disables
                writing.
            desc: `str` general description of the metric to show; used as the
                prefix of the legend written to the summary.
            print_steps: `Int` print metrics every nth steps. The check is
                `(step + 1) % print_steps == 0`, so the default -1 reports on
                every step and 0 raises ZeroDivisionError.
            batch_size: `Int` batch size per step; must be positive.
            keys: list or tuple of keys in callback outputs. The default is an
                immutable empty tuple so that instances never share a mutable
                default object.
        Returns:
        """
        self.summary = summary
        self.save_summary = isinstance(self.summary, Summary)
        self.desc = desc
        self.print_steps = print_steps
        assert batch_size > 0
        self.batch_size = batch_size
        assert isinstance(keys, (list, tuple))
        self.keys = keys

        self.metric_dict = OrderedDict()
        self.metric_dict['step'] = 0

        self.timer = StopWatch()
        self.timer.start()
        self._clear()

    def _clear(self):
        """Reset the accumulators of every key, the throughput and the sample count."""
        for key in self.keys:
            self.metric_dict[key] = 0.0
        self.metric_dict['throughput'] = 0.0
        self.num_samples = 0.0

    def update_and_save(self, key, value, step, **kwargs):
        """Store `value` under `key` and, if a `Summary` is attached, record it there."""
        self.metric_dict[key] = value
        if self.save_summary:
            self.summary.scalar(self.desc + "_" + key, step, value, **kwargs)

    def metric_cb(self, step=0, **kwargs):
        """Return a callback that folds one step's outputs into the accumulators.

        Args:
            step: index of the step the callback belongs to; step 0 resets the
                accumulators first.
            **kwargs: extra values shown in the printed line and forwarded to
                the summary for every key.

        Returns:
            A function taking `outputs`, a mapping whose entries for
            `self.keys` provide a `.sum()` method.
        """
        def callback(outputs):
            if step == 0:
                self._clear()

            for key in self.keys:
                self.metric_dict[key] += outputs[key].sum()

            self.num_samples += self.batch_size

            if (step + 1) % self.print_steps == 0:
                self.metric_dict['step'] = step
                for k, v in kwargs.items():
                    self.metric_dict[k] = v
                # Samples per second since the previous report.
                throughput = self.num_samples / self.timer.split()
                self.update_and_save('throughput', throughput, step)
                for key in self.keys:
                    # Accumulated sum divided by the number of samples seen.
                    value = self.metric_dict[key] / self.num_samples
                    self.update_and_save(key, value, step, **kwargs)
                print(', '.join(('{}: {}' if type(v) is int else '{}: {:.3f}').format(k, v) \
                                for k, v in self.metric_dict.items()), time.time())
                self._clear()

        return callback
def CreateOptimizer(args):
    """Build the AdamW optimizer with linear warm-up and polynomial decay.

    The number of warm-up batches is `int(args.iter_num * args.warmup_proportion)`.
    Weight decay skips variables whose names match "bias", "LayerNorm" or
    "layer_norm", and gradients are clipped by a global norm of 1.0.

    Args:
        args: object exposing `iter_num`, `warmup_proportion`, `learning_rate`
            and `weight_decay_rate`.

    Returns:
        The configured `flow.optimizer.AdamW` instance.
    """
    num_warmup_batches = int(args.iter_num * args.warmup_proportion)
    warmup = flow.optimizer.warmup.linear(num_warmup_batches, 0)
    # NOTE: "PolynomialSchduler" is spelled as in the API called by the original code.
    scheduler = flow.optimizer.PolynomialSchduler(
        args.learning_rate, args.iter_num, 0.0, warmup=warmup)
    clipping = flow.optimizer.grad_clipping.by_global_norm(1.0)
    return flow.optimizer.AdamW(
        scheduler,
        epsilon=1e-6,
        weight_decay=args.weight_decay_rate,
        weight_decay_excludes=["bias", "LayerNorm", "layer_norm"],
        grad_clipping=clipping)
def GetFunctionConfig(args):
    """Create a OneFlow function config from the command-line options.

    Automatic mixed precision follows `args.use_fp16`; XLA JIT is switched on
    only when `args.use_xla` is truthy.

    Args:
        args: object exposing `use_fp16` and `use_xla`.

    Returns:
        The function config object.
    """
    func_config = flow.function_config()
    func_config.enable_auto_mixed_precision(args.use_fp16)
    if args.use_xla:
        func_config.use_xla_jit(True)
    return func_config
def getdirsize(dir):
    """Return the total size in bytes of the files below `dir`.

    Files located directly inside a directory whose path ends with "-v" or
    "-m" are not counted. These are the suffixes that
    `remove_optimizer_params` treats as optimizer parameters.

    Args:
        dir: root directory to walk.

    Returns:
        The summed file size as an integer; 0 if nothing is found.
    """
    total = 0
    for root, _subdirs, files in os.walk(dir):
        # The suffix depends only on the directory, so test it once per directory.
        if str(root[-2:]) in ('-v', '-m'):
            continue
        total += sum(os.path.getsize(os.path.join(root, name)) for name in files)
    return total
def remove_optimizer_params(model_dir):
    """Delete every directory below `model_dir` whose name ends with "-v" or "-m".

    The tree is walked recursively. Each match is removed relative to the
    directory in which it was found; the previous code joined the name with
    `model_dir` itself, which failed for nested matches. Removed directories
    are pruned from the walk so they are not visited afterwards.

    NOTE(review): "-v"/"-m" are presumably the optimizer state directories
    written next to each variable -- confirm against the checkpoint layout.

    Args:
        model_dir: root directory of the saved model.
    """
    optimizer_suffixes = ('-v', '-m')
    for root, subdirs, _files in os.walk(model_dir):
        kept = []
        for subdir in subdirs:
            if subdir.endswith(optimizer_suffixes):
                shutil.rmtree(os.path.join(root, subdir))
            else:
                kept.append(subdir)
        # In-place assignment tells os.walk not to descend into removed directories.
        subdirs[:] = kept
def remove_teacher_params(model_dir):
    """Keep only the student's model parameters directly inside `model_dir`.

    Every immediate sub-directory of `model_dir` is deleted when its name does
    not start with "student", or when it ends with "-v" or "-m".

    Only the top level is processed. The previous code walked the whole tree
    but always deleted `<model_dir>/<name>`, so a nested directory either
    raised FileNotFoundError or removed an unrelated top-level directory of
    the same name.

    NOTE(review): "-v"/"-m" are presumably optimizer state directories --
    confirm against the checkpoint layout.

    Args:
        model_dir: directory that directly contains the parameter directories.
    """
    top_level = next(os.walk(model_dir), None)
    if top_level is None:
        # Missing or unreadable directory: nothing to do, as before.
        return

    for subdir in top_level[1]:
        is_teacher = not subdir.startswith('student')
        is_optimizer_state = subdir.endswith(('-v', '-m'))
        if is_teacher or is_optimizer_state:
            shutil.rmtree(os.path.join(model_dir, subdir))
# Maps a lower-case GLUE task name to an integer label count.
# NOTE(review): presumably the size of the classification output; "sts-b" is 1,
# which suggests it is handled as regression -- confirm against the callers.
glue_tasks_num_labels = {
    "cola": 2,
    "mnli": 3,
    "mrpc": 2,
    "sst-2": 2,
    "sts-b": 1,
    "qqp": 2,
    "qnli": 2,
    "rte": 2,
    "wnli": 2,
}
@@ -0,0 +1,204 @@ | |||||
# 知识蒸馏快速上手 | |||||
## 1. 简介 | |||||
知识蒸馏:通过一些优化目标从大型、知识丰富的teacher模型学习一个小型的student模型 | |||||
炼知技术平台提供了4个知识蒸馏相关算子,以及众多基于Oneflow算子复现的知识蒸馏模型和使用示例。 | |||||
<table> | |||||
<thead> | |||||
<tr> | |||||
<th>类型</th> | |||||
<th>知识蒸馏模型</th> | |||||
<th><a href="../../../docs/API_knowledge_distill.md" target="_blank">主要算子</a></th> | |||||
<th>使用文档</th> | |||||
</tr> | |||||
</thead> | |||||
<tbody> | |||||
<tr> | |||||
<td rowspan="2">软标签蒸馏</td> | |||||
<td>KD</td> | |||||
<td>软标签蒸馏</td> | |||||
<td><a href="./examples/knowledge_distillation/README.md" target="_blank">链接</a></td> | |||||
</tr> | |||||
<tr> | |||||
<td>Distilled-BiLSTM</td> | |||||
<td>软标签蒸馏,将BERT蒸馏到BiLSTM</td> | |||||
<td><a href="./examples/distilled-bilstm/README.md" target="_blank">链接</a></td> | |||||
</tr> | |||||
<tr> | |||||
<td rowspan="2">从其他知识蒸馏</td> | |||||
<td>BERT-PKD</td> | |||||
<td>软标签蒸馏+层与层蒸馏</td> | |||||
<td><a href="./examples/bert-pkd/README.md" target="_blank">链接</a></td> | |||||
</tr> | |||||
<tr> | |||||
<td>TinyBERT</td> | |||||
<td>软标签蒸馏+层与层蒸馏+注意力蒸馏</td> | |||||
<td><a href="./examples/tinybert/README.md" target="_blank">链接</a></td> | |||||
</tr> | |||||
<tr> | |||||
<td>模块替换</td> | |||||
<td>BERT-Theseus</td> | |||||
<td>依照概率替换原有的BERT模块和Theseus的模块组成新的模型来训练</td> | |||||
<td><a href="./examples/xxx/README.md" target="_blank">链接</a></td> | |||||
</tr> | |||||
</tbody> | |||||
</table> | |||||
## 2. 使用 | |||||
### 2.1 依赖 | |||||
- Python 3.6 | |||||
- oneflow-cu101 0.1.10 | |||||
- numpy 1.19.2 | |||||
完整的环境可以通过以下命令安装: | |||||
```bash | |||||
conda create -n distil python=3.6 | |||||
``` | |||||
``` | |||||
python3 -m pip install --find-links https://oneflow-inc.github.io/nightly oneflow_cu101 --user | |||||
``` | |||||
### 2.2 数据获取 | |||||
知识蒸馏主要针对NLP相关的任务,炼知平台在GLUE任务的数据集上对不同算法进行了测试。 | |||||
可以通过执行以下脚本下载GLUE任务的所有数据集,将会自动下载并解压到'--data_dir=data'目录下。 | |||||
``` | |||||
bash run_download_glue_data.sh | |||||
``` | |||||
或者 | |||||
```bash | |||||
python ../src/download_glue_data.py --data_dir data/glue_data --tasks all | |||||
``` | |||||
TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "SNLI", "QNLI", "RTE", "WNLI", "diagnostic"] | |||||
以上脚本将会默认下载所有GLUE任务数据集,也可以通过'--tasks=TASKS',指定下载某些数据集
参考[加载与准备OneFlow数据集](https://github.com/Oneflow-Inc/oneflow-documentation/blob/master/cn/docs/extended_topics/how_to_make_ofdataset.md),制作OFRecords数据集。或者执行以下命令,生成OFRecords数据集: | |||||
``` | |||||
bash glue_process.sh | |||||
``` | |||||
**或者直接下载转换后的OFRecords GLUE数据集,并放置到相关目录(data/glue_ofrecord)下:** | |||||
链接: https://pan.baidu.com/s/1TuDJpJ8z9zJvvhqjjXiGDg 提取码: phyf | |||||
### 2.3 微调教师模型 | |||||
预训练BERT模型下载地址: | |||||
链接: https://pan.baidu.com/s/1jfTUY7ygcZZOJzjfrgUL8Q 提取码: 6b87 | |||||
下载后放置在`./models/uncased_L-12_H-768_A-12_oneflow` | |||||
#### 2.3.1 训练 | |||||
- 执行以下脚本进行微调教师模型: | |||||
- DATA_ROOT: GLUE数据集总路径 | |||||
- dataset: 任务名 | |||||
- MODEL_SAVE_DIR: 模型保存路径 | |||||
```bash | |||||
bash run_train_teacher.sh | |||||
``` | |||||
#### 2.3.2 测试 | |||||
- 微调后,可以执行以下脚本对教师模型进行测试: | |||||
- DATA_ROOT: GLUE数据集总路径 | |||||
- dataset: 任务名 | |||||
- TEACHER_MODEL_DIR: 教师模型路径 | |||||
```bash | |||||
bash run_eval_teacher.sh | |||||
``` | |||||
### 2.4 蒸馏到学生模型 | |||||
#### 2.4.1 训练 | |||||
执行以下脚本将教师模型蒸馏到学生模型: | |||||
- DATA_ROOT: GLUE数据集总路径 | |||||
- dataset: 任务名 | |||||
- FT_BERT_BASE_DIR: 在特定任务上微调过的教师模型路径 | |||||
- TMP_STUDENT_DIR: 临时学生模型路径(如果需要的话) | |||||
- STUDENT_DIR: 学生模型保存路径 | |||||
- 不同知识蒸馏算法: | |||||
- KD | |||||
```bash | |||||
bash run_train_student_kd.sh | |||||
``` | |||||
- Distilled-BiLSTM | |||||
```bash | |||||
bash run_train_student_distilled_lstm.sh | |||||
``` | |||||
- BERT-PKD | |||||
```bash | |||||
bash run_train_student_bert_pkd.sh | |||||
``` | |||||
>注:BERT-PKD可以随机初始化,也可以选择根据教师BERT中间层进行初始化,详细步骤请查阅[这里](./examples/bert-pkd/README.md#41-教师模型中间层保存与转换) | |||||
- TinyBERT | |||||
```bash | |||||
bash run_train_student_tinybert.sh | |||||
``` | |||||
- BERT-of-Theseus | |||||
```bash | |||||
bash run_bert_theseus.sh ${GPU_ID} ${SAVE_TAG} ${PHRASE1_LR} ${PHRASE2_LR} ${PHRASE1_REPLACE_RATE} ${COMPRESS_RATIO}
example: bash run_bert_theseus.sh 0 1 1e-5 1e-5 0.5 4 | |||||
``` | |||||
- GPU_ID: 指定进行训练的 GPU 的 id | |||||
- SAVE_TAG: 指定模型保存文件的特定标识符 | |||||
- PHRASE1_LR: BERT-of-Theseus第一阶段的学习率 | |||||
- PHRASE2_LR: BERT-of-Theseus第二阶段的学习率
- PHRASE1_REPLACE_RATE: 第一阶段当中,BERT的模块替换为Theseus模块的概率 | |||||
- COMPRESS_RATIO: 压缩的比率,例如 COMPRESS_RATIO=4,会将12层的BERT-Base压缩为3层 | |||||
- BERT-of-Theseus 需要在特定数据集上微调好的模型作为输入
- 修改 run_bert_theseus.sh 里 line 25 的 dataset=你需要的数据集,现在默认是 SST-2 | |||||
- 将特定数据集现象下面的 PRETRAINED_MODEL 和 BERT_BASE_DIR 都改成上面你微调好的模型所在的文件夹。 | |||||
- 默认的保存路径为: | |||||
- 第一阶段./log/${dataset}_bert_theseus_uncased_L-12_H-768_A-12_oneflow_v${SAVE_TAG}s1 | |||||
- 第二阶段./log/${dataset}_bert_theseus_uncased_L-12_H-768_A-12_oneflow_v${SAVE_TAG}s2
> BERT类模型最大序列长度设为128; LSTM类模型最大序列长度设为32,词表大小为10000 | |||||
#### 2.4.2 测试 | |||||
执行以下脚本进行测试: | |||||
- DATA_ROOT: GLUE数据集总路径 | |||||
- dataset: 任务名 | |||||
- STUDENT_DIR: 学生模型保存路径,蒸馏过的学生模型下载链接如下(SST-2数据集) | |||||
- 不同知识蒸馏算法: | |||||
- KD | |||||
下载链接: https://pan.baidu.com/s/1EgQyQgxAcFAG8Ch3-4VPaw 提取码: 5k9p | |||||
```bash | |||||
bash run_eval_student_kd.sh | |||||
``` | |||||
- Distilled-BiLSTM | |||||
下载链接: https://pan.baidu.com/s/1M4XzB2DnLikglxVFvhnYpw 提取码: hqhj | |||||
```bash | |||||
bash run_eval_student_distilled_lstm.sh | |||||
``` | |||||
- BERT-PKD | |||||
- 从教师模型中间层初始化,下载链接: https://pan.baidu.com/s/1l7vXn-3U05Hzl0RXCJPiLg 提取码: 33dk | |||||
- 随机初始化,下载链接: https://pan.baidu.com/s/1m46j57Tova_yaGLabAqUIw 提取码: pdx4 | |||||
```bash | |||||
bash run_eval_student_bert_pkd.sh | |||||
``` | |||||
- TinyBERT | |||||
下载链接: https://pan.baidu.com/s/1nOAZHd3wLmyVw2vTJB7KfQ 提取码: ma65 | |||||
```bash | |||||
bash run_eval_student_tinybert.sh | |||||
``` | |||||
- BERT-of-Theseus | |||||
```bash | |||||
bash eval_bert_theseus.sh ${GPU_ID} ${VERSION} | |||||
example: bash eval_bert_theseus.sh 0 1s1 | |||||
``` | |||||
@@ -0,0 +1,341 @@ | |||||
""" | |||||
Copyright 2020 The OneFlow Authors. All rights reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
import oneflow as flow | |||||
import oneflow.core.common.data_type_pb2 as data_type_util | |||||
import oneflow.core.operator.op_conf_pb2 as op_conf_util | |||||
import math | |||||
class BertBackbone(object):
    """Build the BERT embedding and encoder graph from the input blobs.

    All ops are created under the ``bert`` namespace, split into an
    ``embeddings`` and an ``encoder`` sub-namespace. The resulting blobs are
    kept on the instance and exposed through the accessor methods.
    """

    def __init__(self,
                 input_ids_blob,
                 input_mask_blob,
                 token_type_ids_blob,
                 vocab_size,
                 seq_length=512,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 intermediate_size=3072,
                 hidden_act="gelu",
                 hidden_dropout_prob=0.1,
                 attention_probs_dropout_prob=0.1,
                 max_position_embeddings=512,
                 type_vocab_size=16,
                 initializer_range=0.02):
        # NOTE(review): the scope nesting below was reconstructed from a
        # source whose indentation was lost -- confirm against the checkpoint
        # variable names.
        with flow.scope.namespace("bert"):
            with flow.scope.namespace("embeddings"):
                # Word-embedding lookup; the table itself is kept as well.
                word_embeddings, self.embedding_table_ = _EmbeddingLookup(
                    input_ids_blob=input_ids_blob,
                    vocab_size=vocab_size,
                    embedding_size=hidden_size,
                    initializer_range=initializer_range,
                    word_embedding_name="word_embeddings")
                # Add token-type and position embeddings, then LayerNorm and
                # dropout.
                self.embedding_output_ = _EmbeddingPostprocessor(
                    input_blob=word_embeddings,
                    seq_length=seq_length,
                    embedding_size=hidden_size,
                    use_token_type=True,
                    token_type_ids_blob=token_type_ids_blob,
                    token_type_vocab_size=type_vocab_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=True,
                    position_embedding_name="position_embeddings",
                    initializer_range=initializer_range,
                    max_position_embeddings=max_position_embeddings,
                    dropout_prob=hidden_dropout_prob)
            with flow.scope.namespace("encoder"):
                mask_blob = _CreateAttentionMaskFromInputMask(
                    input_mask_blob,
                    from_seq_length=seq_length,
                    to_seq_length=seq_length)
                # Only the final layer is requested, so the returned list
                # holds a single blob.
                self.all_encoder_layers_ = _TransformerModel(
                    input_blob=self.embedding_output_,
                    attention_mask_blob=mask_blob,
                    seq_length=seq_length,
                    hidden_size=hidden_size,
                    num_hidden_layers=num_hidden_layers,
                    num_attention_heads=num_attention_heads,
                    intermediate_size=intermediate_size,
                    intermediate_act_fn=GetActivation(hidden_act),
                    hidden_dropout_prob=hidden_dropout_prob,
                    attention_probs_dropout_prob=attention_probs_dropout_prob,
                    initializer_range=initializer_range,
                    do_return_all_layers=False)
        self.sequence_output_ = self.all_encoder_layers_[-1]

    def embedding_output(self):
        """Return the post-processed embedding blob."""
        return self.embedding_output_

    def all_encoder_layers(self):
        """Return the list of encoder output blobs produced by the encoder."""
        return self.all_encoder_layers_

    def sequence_output(self):
        """Return the last entry of the encoder output list."""
        return self.sequence_output_

    def embedding_table(self):
        """Return the word-embedding table variable."""
        return self.embedding_table_
def CreateInitializer(std):
    """Return the ``flow.truncated_normal`` initializer built from ``std``."""
    initializer = flow.truncated_normal(std)
    return initializer
def _Gelu(in_blob):
    """Apply ``flow.math.gelu`` to ``in_blob`` and return the result."""
    activated = flow.math.gelu(in_blob)
    return activated
def _TransformerModel(input_blob,
                      attention_mask_blob,
                      seq_length,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      intermediate_act_fn=_Gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False):
    """Stack ``num_hidden_layers`` Transformer encoder layers.

    Each layer consists of self-attention, an attention output projection,
    an intermediate dense layer with activation and an output dense layer;
    both projections are followed by dropout, a residual add and LayerNorm.

    Args:
        input_blob: embedding output; flattened to ``(-1, hidden_size)``.
        attention_mask_blob: mask forwarded to every attention layer.
        seq_length: sequence length used for reshaping.
        intermediate_act_fn: if callable it is applied to the intermediate
            dense output; otherwise it is forwarded as the ``activation``
            argument of ``_FullyConnected``.
        do_return_all_layers: return every layer's output instead of only
            the last one.

    Returns:
        A list of blobs reshaped to ``(-1, seq_length, hidden_size)``: one
        per layer when ``do_return_all_layers`` is true, else just the last.
    """
    assert hidden_size % num_attention_heads == 0
    head_size = hidden_size // num_attention_heads
    attention_width = num_attention_heads * head_size
    act_is_callable = callable(intermediate_act_fn)

    hidden_blob = flow.reshape(input_blob, (-1, hidden_size))
    per_layer_blobs = []
    # NOTE(review): namespace nesting reconstructed from a source whose
    # indentation was lost -- confirm against checkpoint variable names.
    for layer_idx in range(num_hidden_layers):
        with flow.scope.namespace("layer_%d" % layer_idx):
            residual_blob = hidden_blob
            with flow.scope.namespace("attention"):
                with flow.scope.namespace("self"):
                    attn_blob = _AttentionLayer(
                        from_blob=residual_blob,
                        to_blob=residual_blob,
                        attention_mask_blob=attention_mask_blob,
                        num_attention_heads=num_attention_heads,
                        size_per_head=head_size,
                        attention_probs_dropout_prob=attention_probs_dropout_prob,
                        initializer_range=initializer_range,
                        do_return_2d_tensor=True,
                        from_seq_length=seq_length,
                        to_seq_length=seq_length)
                with flow.scope.namespace("output"):
                    attn_blob = _FullyConnected(
                        attn_blob,
                        input_size=attention_width,
                        units=hidden_size,
                        weight_initializer=CreateInitializer(initializer_range),
                        name='dense')
                    attn_blob = _Dropout(attn_blob, hidden_dropout_prob)
                    # Residual connection around the attention sub-layer.
                    attn_blob = _LayerNorm(attn_blob + residual_blob, hidden_size)
            with flow.scope.namespace("intermediate"):
                # A callable activation is applied after the dense layer, so
                # the dense layer itself is told to use no activation.
                dense_activation = (op_conf_util.kNone if act_is_callable
                                    else intermediate_act_fn)
                ffn_blob = _FullyConnected(
                    attn_blob,
                    input_size=attention_width,
                    units=intermediate_size,
                    activation=dense_activation,
                    weight_initializer=CreateInitializer(initializer_range),
                    name='dense')
                if act_is_callable:
                    ffn_blob = intermediate_act_fn(ffn_blob)
            with flow.scope.namespace("output"):
                out_blob = _FullyConnected(
                    ffn_blob,
                    input_size=intermediate_size,
                    units=hidden_size,
                    weight_initializer=CreateInitializer(initializer_range),
                    name='dense')
                out_blob = _Dropout(out_blob, hidden_dropout_prob)
                # Residual connection around the feed-forward sub-layer.
                out_blob = _LayerNorm(out_blob + attn_blob, hidden_size)
            hidden_blob = out_blob
            per_layer_blobs.append(out_blob)

    output_shape = (-1, seq_length, hidden_size)
    if not do_return_all_layers:
        return [flow.reshape(hidden_blob, output_shape)]
    return [flow.reshape(blob, output_shape) for blob in per_layer_blobs]
def _AttentionLayer(from_blob,
                    to_blob,
                    attention_mask_blob,
                    num_attention_heads=1,
                    size_per_head=512,
                    query_act=op_conf_util.kNone,
                    key_act=op_conf_util.kNone,
                    value_act=op_conf_util.kNone,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    do_return_2d_tensor=False,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None):
    """Multi-head scaled dot-product attention from ``from_blob`` to ``to_blob``.

    Queries are projected from ``from_blob`` and keys/values from
    ``to_blob`` through dense layers named ``query``, ``key`` and ``value``.
    Scores are scaled by ``1 / sqrt(size_per_head)``; positions whose mask
    value is 0 receive an additive bias of -10000 before the softmax.

    Args:
        batch_size: unused by this function.
        do_return_2d_tensor: when true the context is reshaped to
            ``[-1, num_attention_heads * size_per_head]``; otherwise to
            ``[-1, from_seq_length, num_attention_heads * size_per_head]``.

    Returns:
        The attention context blob.
    """
    model_width = num_attention_heads * size_per_head

    def _split_heads(blob, length):
        # [-1, length, heads, size_per_head] -> heads axis moved before length.
        blob = flow.reshape(blob, [-1, length, num_attention_heads, size_per_head])
        return flow.transpose(blob, perm=[0, 2, 1, 3])

    from_2d = flow.reshape(from_blob, [-1, model_width])
    to_2d = flow.reshape(to_blob, [-1, model_width])

    query = _FullyConnected(
        from_2d,
        input_size=model_width,
        units=model_width,
        activation=query_act,
        name="query",
        weight_initializer=CreateInitializer(initializer_range))
    key = _FullyConnected(
        to_2d,
        input_size=model_width,
        units=model_width,
        activation=key_act,
        name="key",
        weight_initializer=CreateInitializer(initializer_range))
    value = _FullyConnected(
        to_2d,
        input_size=model_width,
        units=model_width,
        activation=value_act,
        name="value",
        weight_initializer=CreateInitializer(initializer_range))

    query = _split_heads(query, from_seq_length)
    key = _split_heads(key, to_seq_length)

    scores = flow.matmul(query, key, transpose_b=True)
    scores = scores * (1.0 / math.sqrt(float(size_per_head)))

    mask = flow.reshape(attention_mask_blob, [-1, 1, from_seq_length, to_seq_length])
    mask = flow.cast(mask, dtype=flow.float)
    # mask == 1 -> bias 0; mask == 0 -> bias -10000.
    mask_bias = (mask - 1.0) * 10000.0
    scores = scores + mask_bias

    probs = flow.nn.softmax(scores)
    probs = _Dropout(probs, attention_probs_dropout_prob)

    value = _split_heads(value, to_seq_length)
    context = flow.matmul(probs, value)
    context = flow.transpose(context, perm=[0, 2, 1, 3])

    if do_return_2d_tensor:
        out_shape = [-1, model_width]
    else:
        out_shape = [-1, from_seq_length, model_width]
    return flow.reshape(context, out_shape)
def _FullyConnected(input_blob, input_size, units, activation=None, name=None,
                    weight_initializer=None, is_train=True):
    """Dense layer: ``activation(input_blob @ weight + bias)``.

    Creates two variables, ``<name>-weight`` of shape ``[input_size, units]``
    and ``<name>-bias`` of shape ``[units]`` (initialized to 0.0), both with
    the dtype of ``input_blob``.

    Args:
        input_blob: 2-D input blob.
        input_size: size of the input feature axis.
        units: size of the output feature axis.
        activation: optional callable applied to the biased output.
            Non-callable values (``None`` or enum constants such as
            ``op_conf_util.kNone``) mean "no activation", which is what the
            in-file callers pass.
        name: prefix for the variable names; must be a string.
        weight_initializer: initializer for the weight variable.
        is_train: whether the variables are trainable.

    Returns:
        The output blob.
    """
    weight_blob = flow.get_variable(
        name=name + '-weight',
        shape=[input_size, units],
        dtype=input_blob.dtype,
        trainable=is_train,
        initializer=weight_initializer)
    bias_blob = flow.get_variable(
        name=name + '-bias',
        shape=[units],
        dtype=input_blob.dtype,
        trainable=is_train,
        initializer=flow.constant_initializer(0.0))
    output_blob = flow.matmul(input_blob, weight_blob)
    output_blob = flow.nn.bias_add(output_blob, bias_blob)
    # Previously the ``activation`` argument was silently dropped; honour it
    # when the caller supplies an actual function.
    if callable(activation):
        output_blob = activation(output_blob)
    return output_blob
def _Dropout(input_blob, dropout_prob): | |||||
if dropout_prob == 0.0: | |||||
return input_blob | |||||
return flow.nn.dropout(input_blob, rate=dropout_prob) | |||||
def _LayerNorm(input_blob, hidden_size):
    """Layer-normalize ``input_blob`` over its last axis.

    The op is named ``LayerNorm``. ``hidden_size`` is accepted for call-site
    symmetry but is not used.
    """
    normalized = flow.layers.layer_norm(
        input_blob,
        name='LayerNorm',
        begin_norm_axis=-1,
        begin_params_axis=-1)
    return normalized
def _CreateAttentionMaskFromInputMask(to_mask_blob, from_seq_length, to_seq_length):
    """Expand an input mask into an attention mask.

    The mask is cast to float, reshaped to ``[-1, 1, to_seq_length]`` and
    added to a zero constant of shape ``[from_seq_length, to_seq_length]``.
    Presumably the add broadcasts to ``[batch, from_seq_length,
    to_seq_length]`` -- TODO confirm.
    """
    float_mask = flow.cast(to_mask_blob, dtype=flow.float)
    float_mask = flow.reshape(float_mask, [-1, 1, to_seq_length])
    zero_plane = flow.constant(
        0.0, dtype=flow.float, shape=[from_seq_length, to_seq_length])
    return zero_plane + float_mask
def _EmbeddingPostprocessor(input_blob,
                            seq_length,
                            embedding_size,
                            use_token_type=False,
                            token_type_ids_blob=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1):
    """Add token-type and position embeddings, then LayerNorm and dropout.

    Args:
        input_blob: word-embedding blob to post-process.
        use_token_type: add embeddings gathered from a
            ``[token_type_vocab_size, embedding_size]`` table using
            ``token_type_ids_blob`` (which must then not be ``None``).
        use_position_embeddings: add a
            ``[1, max_position_embeddings, embedding_size]`` table, sliced to
            ``seq_length`` along axis 1 when the two lengths differ.

    Returns:
        The post-processed embedding blob.
    """
    embeddings = input_blob

    if use_token_type:
        assert token_type_ids_blob is not None
        type_table = flow.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, embedding_size],
            dtype=input_blob.dtype,
            initializer=CreateInitializer(initializer_range))
        type_embeddings = flow.gather(
            params=type_table, indices=token_type_ids_blob, axis=0)
        embeddings = embeddings + type_embeddings

    if use_position_embeddings:
        pos_table = flow.get_variable(
            name=position_embedding_name,
            shape=[1, max_position_embeddings, embedding_size],
            dtype=input_blob.dtype,
            initializer=CreateInitializer(initializer_range))
        assert seq_length <= max_position_embeddings
        if seq_length != max_position_embeddings:
            # Keep only the first seq_length positions of the table.
            pos_table = flow.slice(
                pos_table, begin=[None, 0, 0], size=[None, seq_length, -1])
        embeddings = embeddings + pos_table

    embeddings = _LayerNorm(embeddings, embedding_size)
    return _Dropout(embeddings, dropout_prob)
def _EmbeddingLookup(input_ids_blob,
                     vocab_size,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name="word_embeddings"):
    """Gather word embeddings for ``input_ids_blob``.

    Creates a float variable of shape ``[vocab_size, embedding_size]`` named
    ``word_embedding_name`` and gathers its rows by id.

    Returns:
        A ``(embeddings, embedding_table)`` tuple.
    """
    table = flow.get_variable(
        name=word_embedding_name,
        shape=[vocab_size, embedding_size],
        dtype=flow.float,
        initializer=CreateInitializer(initializer_range))
    gathered = flow.gather(params=table, indices=input_ids_blob, axis=0)
    return gathered, table
def GetActivation(name):
    """Map an activation name to the matching ``flow.math`` function.

    ``'linear'`` maps to ``None``; ``'relu'``, ``'tanh'`` and ``'gelu'`` map
    to the ``flow.math`` function of the same name.

    Raises:
        Exception: for any other name.
    """
    if name == 'linear':
        return None
    if name in ('relu', 'tanh', 'gelu'):
        # The supported names coincide with the flow.math attribute names.
        return getattr(flow.math, name)
    raise Exception("unsupported activation")
@@ -0,0 +1,408 @@ | |||||
""" | |||||
Copyright 2020 The OneFlow Authors. All rights reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
import oneflow as flow | |||||
import oneflow.core.common.data_type_pb2 as data_type_util | |||||
import oneflow.core.operator.op_conf_pb2 as op_conf_util | |||||
import math | |||||
class BertTheseusBackbone(object):
    """Build a BERT graph whose encoder mixes teacher and student layers.

    The embedding variables are created as non-trainable. The encoder is
    built by this module's ``_TransformerModel``, which receives
    ``replace_prob`` and ``compress_ratio`` to control the teacher/student
    layer replacement.
    """

    def __init__(self,
                 input_ids_blob,
                 input_mask_blob,
                 token_type_ids_blob,
                 vocab_size,
                 seq_length=512,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 intermediate_size=3072,
                 hidden_act="gelu",
                 hidden_dropout_prob=0.1,
                 attention_probs_dropout_prob=0.1,
                 max_position_embeddings=512,
                 type_vocab_size=16,
                 initializer_range=0.02,
                 replace_prob=None,
                 compress_ratio=1):
        # NOTE(review): the scope nesting below was reconstructed from a
        # source whose indentation was lost -- confirm against the checkpoint
        # variable names.
        with flow.scope.namespace("bert"):
            with flow.scope.namespace("embeddings"):
                # Embedding variables are frozen (is_train=False).
                word_embeddings, self.embedding_table_ = _EmbeddingLookup(
                    input_ids_blob=input_ids_blob,
                    vocab_size=vocab_size,
                    embedding_size=hidden_size,
                    initializer_range=initializer_range,
                    word_embedding_name="word_embeddings",
                    is_train=False)
                self.embedding_output_ = _EmbeddingPostprocessor(
                    input_blob=word_embeddings,
                    seq_length=seq_length,
                    embedding_size=hidden_size,
                    use_token_type=True,
                    token_type_ids_blob=token_type_ids_blob,
                    token_type_vocab_size=type_vocab_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=True,
                    position_embedding_name="position_embeddings",
                    initializer_range=initializer_range,
                    max_position_embeddings=max_position_embeddings,
                    dropout_prob=hidden_dropout_prob,
                    is_train=False)
            with flow.scope.namespace("encoder"):
                mask_blob = _CreateAttentionMaskFromInputMask(
                    input_mask_blob,
                    from_seq_length=seq_length,
                    to_seq_length=seq_length)
                self.all_encoder_layers_ = _TransformerModel(
                    input_blob=self.embedding_output_,
                    attention_mask_blob=mask_blob,
                    seq_length=seq_length,
                    hidden_size=hidden_size,
                    num_hidden_layers=num_hidden_layers,
                    num_attention_heads=num_attention_heads,
                    intermediate_size=intermediate_size,
                    intermediate_act_fn=GetActivation(hidden_act),
                    hidden_dropout_prob=hidden_dropout_prob,
                    attention_probs_dropout_prob=attention_probs_dropout_prob,
                    initializer_range=initializer_range,
                    do_return_all_layers=False,
                    replace_prob=replace_prob,
                    compress_ratio=compress_ratio)
        self.sequence_output_ = self.all_encoder_layers_[-1]

    def embedding_output(self):
        """Return the post-processed embedding blob."""
        return self.embedding_output_

    def all_encoder_layers(self):
        """Return the list of encoder output blobs produced by the encoder."""
        return self.all_encoder_layers_

    def sequence_output(self):
        """Return the last entry of the encoder output list."""
        return self.sequence_output_

    def embedding_table(self):
        """Return the word-embedding table variable."""
        return self.embedding_table_
def CreateInitializer(std):
    """Return the ``flow.truncated_normal`` initializer built from ``std``."""
    initializer = flow.truncated_normal(std)
    return initializer
def _Gelu(in_blob):
    """Apply ``flow.math.gelu`` to ``in_blob`` and return the result."""
    activated = flow.math.gelu(in_blob)
    return activated
def _TransformerModel(input_blob,
                      attention_mask_blob,
                      seq_length,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      intermediate_act_fn=_Gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False,
                      replace_prob=0.0,
                      compress_ratio=1):
    """Build an encoder that randomly swaps teacher layer groups for student layers.

    The teacher layers are consumed in groups of ``compress_ratio``. For each
    group one student layer (namespace prefix ``student-``, trainable) and
    the group of teacher layers (no prefix, non-trainable) are both built on
    the same input, and a coin flip with probability ``replace_prob``
    selects the student output (coin == true) or the teacher output through
    ``flow.where``.

    Args:
        input_blob: embedding output; flattened to ``(-1, hidden_size)``.
        attention_mask_blob: mask forwarded to every attention layer.
        seq_length: sequence length used for reshaping.
        do_return_all_layers: accepted for signature compatibility; this
            implementation always returns only the final output.
        replace_prob: probability passed to ``flow.random.coin_flip``.
            ``None`` is treated as this function's default of 0.0.
        compress_ratio: number of teacher layers replaced by one student
            layer; must be >= 1.

    Returns:
        A single-element list holding the final blob reshaped to
        ``(-1, seq_length, hidden_size)``.

    Raises:
        ValueError: if ``compress_ratio`` is smaller than 1.
    """
    assert hidden_size % num_attention_heads == 0
    # A non-positive ratio would never advance the teacher index and the
    # loop below would not terminate.
    if compress_ratio < 1:
        raise ValueError(
            "compress_ratio must be >= 1, got {}".format(compress_ratio))
    # Callers such as BertTheseusBackbone default replace_prob to None.
    if replace_prob is None:
        replace_prob = 0.0

    attention_head_size = int(hidden_size / num_attention_heads)
    prev_output_blob = flow.reshape(input_blob, (-1, hidden_size))

    per_add_teacher_layers = compress_ratio
    per_add_student_layers = 1
    teacher_layer_idx = student_layer_idx = 0

    def add_teacher_layer(base_teacher_layer_idx, sub_teacher_output_blob):
        # Chain `compress_ratio` frozen teacher layers starting at the base index.
        for add_teacher_layer_idx in range(per_add_teacher_layers):
            sub_teacher_output_blob = addOnelayer(
                layer_idx=base_teacher_layer_idx + add_teacher_layer_idx,
                prev_output_blob=sub_teacher_output_blob,
                attention_mask_blob=attention_mask_blob,
                num_attention_heads=num_attention_heads,
                attention_head_size=attention_head_size,
                attention_probs_dropout_prob=attention_probs_dropout_prob,
                initializer_range=initializer_range,
                seq_length=seq_length,
                hidden_size=hidden_size,
                hidden_dropout_prob=hidden_dropout_prob,
                intermediate_act_fn=intermediate_act_fn,
                intermediate_size=intermediate_size,
                namescope_prefix='',
                is_train=False)
        return sub_teacher_output_blob

    def add_student_layer(base_student_layer_idx, sub_student_output_blob):
        # One trainable student layer under the "student-" name prefix.
        return addOnelayer(
            base_student_layer_idx, sub_student_output_blob, attention_mask_blob,
            num_attention_heads, attention_head_size,
            attention_probs_dropout_prob, initializer_range, seq_length,
            hidden_size, hidden_dropout_prob,
            intermediate_act_fn, intermediate_size,
            namescope_prefix='student-', is_train=True)

    while teacher_layer_idx < num_hidden_layers:
        # NOTE(review): the extent of this CPU placement scope was
        # reconstructed (original indentation lost); it is assumed to cover
        # only the coin flip -- confirm.
        with flow.scope.placement("cpu", "0:0"):
            sample = flow.random.coin_flip(
                name='layer{}_replacing_prob'.format(teacher_layer_idx),
                probability=replace_prob)
            sample = sample.with_distribute(flow.distribute.broadcast())
        prev_output_blob = flow.where(
            sample,
            x=add_student_layer(student_layer_idx, prev_output_blob),
            y=add_teacher_layer(teacher_layer_idx, prev_output_blob),
            name='where_layer{}'.format(teacher_layer_idx)
        )
        teacher_layer_idx += per_add_teacher_layers
        student_layer_idx += per_add_student_layers

    input_shape = (-1, seq_length, hidden_size)
    final_output_blob = flow.reshape(prev_output_blob, input_shape)
    return [final_output_blob]
def addOnelayer(layer_idx, prev_output_blob, attention_mask_blob, num_attention_heads, attention_head_size,
                attention_probs_dropout_prob, initializer_range, seq_length, hidden_size, hidden_dropout_prob,
                intermediate_act_fn, intermediate_size, namescope_prefix='', is_train=True):
    """Build one Transformer encoder layer and return its output blob.

    The layer lives in the namespace ``"<namescope_prefix>layer_<layer_idx>"``
    and consists of self-attention, an attention output projection, an
    intermediate dense layer with activation and an output dense layer; both
    projections are followed by dropout, a residual add and LayerNorm.

    Args:
        intermediate_act_fn: if callable it is applied to the intermediate
            dense output; otherwise it is forwarded as the ``activation``
            argument of ``_FullyConnected``.
        is_train: forwarded to the attention and dense layers as the
            trainable flag of their variables.
    """
    attention_width = num_attention_heads * attention_head_size
    act_is_callable = callable(intermediate_act_fn)

    # NOTE(review): namespace nesting reconstructed from a source whose
    # indentation was lost -- confirm against checkpoint variable names.
    with flow.scope.namespace("{}layer_{}".format(namescope_prefix, layer_idx)):
        residual_blob = prev_output_blob
        with flow.scope.namespace("attention"):
            with flow.scope.namespace("self"):
                attn_blob = _AttentionLayer(
                    from_blob=residual_blob,
                    to_blob=residual_blob,
                    attention_mask_blob=attention_mask_blob,
                    num_attention_heads=num_attention_heads,
                    size_per_head=attention_head_size,
                    attention_probs_dropout_prob=attention_probs_dropout_prob,
                    initializer_range=initializer_range,
                    do_return_2d_tensor=True,
                    from_seq_length=seq_length,
                    to_seq_length=seq_length,
                    is_train=is_train)
            with flow.scope.namespace("output"):
                attn_blob = _FullyConnected(
                    attn_blob,
                    input_size=attention_width,
                    units=hidden_size,
                    weight_initializer=CreateInitializer(initializer_range),
                    name='dense',
                    is_train=is_train)
                attn_blob = _Dropout(attn_blob, hidden_dropout_prob)
                # Residual connection around the attention sub-layer.
                attn_blob = _LayerNorm(attn_blob + residual_blob, hidden_size)
        with flow.scope.namespace("intermediate"):
            # A callable activation is applied after the dense layer, so the
            # dense layer itself is told to use no activation.
            dense_activation = (op_conf_util.kNone if act_is_callable
                                else intermediate_act_fn)
            ffn_blob = _FullyConnected(
                attn_blob,
                input_size=attention_width,
                units=intermediate_size,
                activation=dense_activation,
                weight_initializer=CreateInitializer(initializer_range),
                name='dense',
                is_train=is_train)
            if act_is_callable:
                ffn_blob = intermediate_act_fn(ffn_blob)
        with flow.scope.namespace("output"):
            out_blob = _FullyConnected(
                ffn_blob,
                input_size=intermediate_size,
                units=hidden_size,
                weight_initializer=CreateInitializer(initializer_range),
                name='dense',
                is_train=is_train)
            out_blob = _Dropout(out_blob, hidden_dropout_prob)
            # Residual connection around the feed-forward sub-layer.
            out_blob = _LayerNorm(out_blob + attn_blob, hidden_size)
    return out_blob
def _AttentionLayer(from_blob,
                    to_blob,
                    attention_mask_blob,
                    num_attention_heads=1,
                    size_per_head=512,
                    query_act=op_conf_util.kNone,
                    key_act=op_conf_util.kNone,
                    value_act=op_conf_util.kNone,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    do_return_2d_tensor=False,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None,
                    is_train=True):
    """Multi-head scaled dot-product attention from ``from_blob`` to ``to_blob``.

    Queries are projected from ``from_blob`` and keys/values from
    ``to_blob`` through dense layers named ``query``, ``key`` and ``value``
    whose variables are trainable only when ``is_train`` is true. Scores are
    scaled by ``1 / sqrt(size_per_head)``; positions whose mask value is 0
    receive an additive bias of -10000 before the softmax.

    Args:
        batch_size: unused by this function.
        do_return_2d_tensor: when true the context is reshaped to
            ``[-1, num_attention_heads * size_per_head]``; otherwise to
            ``[-1, from_seq_length, num_attention_heads * size_per_head]``.

    Returns:
        The attention context blob.
    """
    model_width = num_attention_heads * size_per_head

    def _split_heads(blob, length):
        # [-1, length, heads, size_per_head] -> heads axis moved before length.
        blob = flow.reshape(blob, [-1, length, num_attention_heads, size_per_head])
        return flow.transpose(blob, perm=[0, 2, 1, 3])

    from_2d = flow.reshape(from_blob, [-1, model_width])
    to_2d = flow.reshape(to_blob, [-1, model_width])

    query = _FullyConnected(
        from_2d,
        input_size=model_width,
        units=model_width,
        activation=query_act,
        name="query",
        is_train=is_train,
        weight_initializer=CreateInitializer(initializer_range))
    key = _FullyConnected(
        to_2d,
        input_size=model_width,
        units=model_width,
        activation=key_act,
        name="key",
        is_train=is_train,
        weight_initializer=CreateInitializer(initializer_range))
    value = _FullyConnected(
        to_2d,
        input_size=model_width,
        units=model_width,
        activation=value_act,
        name="value",
        is_train=is_train,
        weight_initializer=CreateInitializer(initializer_range))

    query = _split_heads(query, from_seq_length)
    key = _split_heads(key, to_seq_length)

    scores = flow.matmul(query, key, transpose_b=True)
    scores = scores * (1.0 / math.sqrt(float(size_per_head)))

    mask = flow.reshape(attention_mask_blob, [-1, 1, from_seq_length, to_seq_length])
    mask = flow.cast(mask, dtype=flow.float)
    # mask == 1 -> bias 0; mask == 0 -> bias -10000.
    mask_bias = (mask - 1.0) * 10000.0
    scores = scores + mask_bias

    probs = flow.nn.softmax(scores)
    probs = _Dropout(probs, attention_probs_dropout_prob)

    value = _split_heads(value, to_seq_length)
    context = flow.matmul(probs, value)
    context = flow.transpose(context, perm=[0, 2, 1, 3])

    if do_return_2d_tensor:
        out_shape = [-1, model_width]
    else:
        out_shape = [-1, from_seq_length, model_width]
    return flow.reshape(context, out_shape)
def _FullyConnected(input_blob, input_size, units, activation=None, name=None,
                    weight_initializer=None, is_train=True):
    """Dense layer: ``activation(input_blob @ weight + bias)``.

    Creates two variables, ``<name>-weight`` of shape ``[input_size, units]``
    and ``<name>-bias`` of shape ``[units]`` (initialized to 0.0), both with
    the dtype of ``input_blob``.

    Args:
        input_blob: 2-D input blob.
        input_size: size of the input feature axis.
        units: size of the output feature axis.
        activation: optional callable applied to the biased output.
            Non-callable values (``None`` or enum constants such as
            ``op_conf_util.kNone``) mean "no activation", which is what the
            in-file callers pass.
        name: prefix for the variable names; must be a string.
        weight_initializer: initializer for the weight variable.
        is_train: whether the variables are trainable.

    Returns:
        The output blob.
    """
    weight_blob = flow.get_variable(
        name=name + '-weight',
        shape=[input_size, units],
        dtype=input_blob.dtype,
        trainable=is_train,
        initializer=weight_initializer)
    bias_blob = flow.get_variable(
        name=name + '-bias',
        shape=[units],
        dtype=input_blob.dtype,
        trainable=is_train,
        initializer=flow.constant_initializer(0.0))
    output_blob = flow.matmul(input_blob, weight_blob)
    output_blob = flow.nn.bias_add(output_blob, bias_blob)
    # Previously the ``activation`` argument was silently dropped; honour it
    # when the caller supplies an actual function.
    if callable(activation):
        output_blob = activation(output_blob)
    return output_blob
def _Dropout(input_blob, dropout_prob): | |||||
if dropout_prob == 0.0: | |||||
return input_blob | |||||
return flow.nn.dropout(input_blob, rate=dropout_prob) | |||||
def _LayerNorm(input_blob, hidden_size):
    """Layer-normalize ``input_blob`` over its last axis.

    The op is named ``LayerNorm``. ``hidden_size`` is accepted for call-site
    symmetry but is not used.
    """
    normalized = flow.layers.layer_norm(
        input_blob,
        name='LayerNorm',
        begin_norm_axis=-1,
        begin_params_axis=-1)
    return normalized
def _CreateAttentionMaskFromInputMask(to_mask_blob, from_seq_length, to_seq_length):
    """Expand an input mask into an attention mask.

    The mask is cast to float, reshaped to ``[-1, 1, to_seq_length]`` and
    added to a zero constant of shape ``[from_seq_length, to_seq_length]``.
    Presumably the add broadcasts to ``[batch, from_seq_length,
    to_seq_length]`` -- TODO confirm.
    """
    float_mask = flow.cast(to_mask_blob, dtype=flow.float)
    float_mask = flow.reshape(float_mask, [-1, 1, to_seq_length])
    zero_plane = flow.constant(
        0.0, dtype=flow.float, shape=[from_seq_length, to_seq_length])
    return zero_plane + float_mask
def _EmbeddingPostprocessor(input_blob,
                            seq_length,
                            embedding_size,
                            use_token_type=False,
                            token_type_ids_blob=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1,
                            is_train=True):
    """Add token-type and position embeddings, then LayerNorm and dropout.

    Args:
        input_blob: word-embedding blob to post-process.
        use_token_type: add embeddings gathered from a
            ``[token_type_vocab_size, embedding_size]`` table using
            ``token_type_ids_blob`` (which must then not be ``None``).
        use_position_embeddings: add a
            ``[1, max_position_embeddings, embedding_size]`` table, sliced to
            ``seq_length`` along axis 1 when the two lengths differ.
        is_train: whether the two embedding tables are trainable.

    Returns:
        The post-processed embedding blob.
    """
    embeddings = input_blob

    if use_token_type:
        assert token_type_ids_blob is not None
        type_table = flow.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, embedding_size],
            dtype=input_blob.dtype,
            trainable=is_train,
            initializer=CreateInitializer(initializer_range))
        type_embeddings = flow.gather(
            params=type_table, indices=token_type_ids_blob, axis=0)
        embeddings = embeddings + type_embeddings

    if use_position_embeddings:
        pos_table = flow.get_variable(
            name=position_embedding_name,
            shape=[1, max_position_embeddings, embedding_size],
            dtype=input_blob.dtype,
            trainable=is_train,
            initializer=CreateInitializer(initializer_range))
        assert seq_length <= max_position_embeddings
        if seq_length != max_position_embeddings:
            # Keep only the first seq_length positions of the table.
            pos_table = flow.slice(
                pos_table, begin=[None, 0, 0], size=[None, seq_length, -1])
        embeddings = embeddings + pos_table

    embeddings = _LayerNorm(embeddings, embedding_size)
    return _Dropout(embeddings, dropout_prob)
def _EmbeddingLookup(input_ids_blob,
                     vocab_size,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name="word_embeddings",
                     is_train=True):
    """Gather word embeddings for ``input_ids_blob``.

    Creates a float variable of shape ``[vocab_size, embedding_size]`` named
    ``word_embedding_name`` (trainable only when ``is_train`` is true) and
    gathers its rows by id.

    Returns:
        A ``(embeddings, embedding_table)`` tuple.
    """
    table = flow.get_variable(
        name=word_embedding_name,
        shape=[vocab_size, embedding_size],
        dtype=flow.float,
        trainable=is_train,
        initializer=CreateInitializer(initializer_range))
    gathered = flow.gather(params=table, indices=input_ids_blob, axis=0)
    return gathered, table
def GetActivation(name):
    """Map an activation name to the matching ``flow.math`` function.

    ``'linear'`` maps to ``None``; ``'relu'``, ``'tanh'`` and ``'gelu'`` map
    to the ``flow.math`` function of the same name.

    Raises:
        Exception: for any other name.
    """
    if name == 'linear':
        return None
    if name in ('relu', 'tanh', 'gelu'):
        # The supported names coincide with the flow.math attribute names.
        return getattr(flow.math, name)
    raise Exception("unsupported activation")
@@ -0,0 +1,126 @@ | |||||
""" | |||||
Copyright 2020 The OneFlow Authors. All rights reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
import oneflow as flow | |||||
import bert as bert_util | |||||
import bert_theseus as bert_theseus_util | |||||
import oneflow.core.operator.op_conf_pb2 as op_conf_util | |||||
def GlueBERT(
        input_ids_blob,
        input_mask_blob,
        token_type_ids_blob,
        label_blob,
        vocab_size,
        seq_length=512,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=16,
        initializer_range=0.02,
        label_num=2,
        replace_prob=0.0,
        compress_ratio=1
):
    """Build a classification graph on top of a ``BertTheseusBackbone``.

    The backbone's sequence output is pooled with ``PooledOutput`` and fed to
    ``_AddClassficationLoss``. Both the pooler and the classifier are created
    with ``is_train=False``.

    Args:
        input_ids_blob, input_mask_blob, token_type_ids_blob: Input blobs
            forwarded to the backbone.
        label_blob: Labels used for the classification loss.
        vocab_size ... initializer_range: Model hyper-parameters forwarded
            unchanged to the backbone.
        label_num: Number of output classes.
        replace_prob, compress_ratio: Forwarded unchanged to the backbone.

    Returns:
        Tuple ``(loss, logit_blob)`` where ``loss`` is the first value and
        ``logit_blob`` the third value returned by ``_AddClassficationLoss``.
    """
    backbone_kwargs = dict(
        input_ids_blob=input_ids_blob,
        input_mask_blob=input_mask_blob,
        token_type_ids_blob=token_type_ids_blob,
        vocab_size=vocab_size,
        seq_length=seq_length,
        hidden_size=hidden_size,
        num_hidden_layers=num_hidden_layers,
        num_attention_heads=num_attention_heads,
        intermediate_size=intermediate_size,
        hidden_act=hidden_act,
        hidden_dropout_prob=hidden_dropout_prob,
        attention_probs_dropout_prob=attention_probs_dropout_prob,
        max_position_embeddings=max_position_embeddings,
        type_vocab_size=type_vocab_size,
        initializer_range=initializer_range,
        replace_prob=replace_prob,
        compress_ratio=compress_ratio,
    )
    encoder = bert_theseus_util.BertTheseusBackbone(**backbone_kwargs)

    pooled = PooledOutput(
        sequence_output=encoder.sequence_output(),
        hidden_size=hidden_size,
        initializer_range=initializer_range,
        is_train=False,
    )

    head_outputs = _AddClassficationLoss(
        input_blob=pooled,
        label_blob=label_blob,
        hidden_size=hidden_size,
        label_num=label_num,
        initializer_range=initializer_range,
        scope_name='classification',
        is_train=False,
    )
    # The middle element (per-example loss) is not needed by callers.
    loss, _, logit_blob = head_outputs
    return loss, logit_blob
def PooledOutput(sequence_output, hidden_size, initializer_range, is_train=True):
    """Pool a sequence output into one vector per example.

    Inside the ``"bert-pooler"`` namespace, the slice at index 0 (size 1) of
    axis 1 is taken, reshaped to ``[-1, hidden_size]``, passed through a
    ``hidden_size`` -> ``hidden_size`` fully connected layer named ``"dense"``
    and finally through ``tanh``.

    Args:
        sequence_output: Blob to pool.
        hidden_size: Width used for the reshape and the dense layer.
        initializer_range: Forwarded to ``bert_util.CreateInitializer``.
        is_train: Forwarded to ``bert_util._FullyConnected``.

    Returns:
        The pooled blob.
    """
    with flow.scope.namespace("bert-pooler"):
        first_slice = flow.slice(sequence_output, [None, 0, 0], [None, 1, -1])
        first_vector = flow.reshape(first_slice, [-1, hidden_size])
        dense_out = bert_util._FullyConnected(
            first_vector,
            input_size=hidden_size,
            units=hidden_size,
            weight_initializer=bert_util.CreateInitializer(initializer_range),
            name="dense",
            is_train=is_train,
        )
        return flow.math.tanh(dense_out)
def _AddClassficationLoss(input_blob, label_blob, hidden_size, label_num, initializer_range,
                          scope_name='classification', is_train=True):
    """Add a linear classification head and its softmax cross-entropy loss.

    Under the ``scope_name`` namespace this creates ``output_weights``
    (``[label_num, hidden_size]``, random-normal with stddev
    ``initializer_range``) and ``output_bias`` (``[label_num]``, zeros),
    computes ``input_blob @ output_weights^T + output_bias`` and the sparse
    softmax cross-entropy against ``label_blob``.

    Args:
        input_blob: Features fed to the classifier.
        label_blob: Sparse labels.
        hidden_size: Second dimension of the weight variable.
        label_num: Number of classes.
        initializer_range: Stddev of the weight initializer.
        scope_name: Namespace the variables are created under.
        is_train: Used as the ``trainable`` flag of both variables.

    Returns:
        Tuple ``(loss, pre_example_loss, logit_blob)``; the first two entries
        are the same (unreduced) cross-entropy blob.
    """
    with flow.scope.namespace(scope_name):
        weight_initializer = flow.random_normal_initializer(
            mean=0.0, stddev=initializer_range, seed=None, dtype=None)
        weights = flow.get_variable(
            name="output_weights",
            shape=[label_num, hidden_size],
            dtype=input_blob.dtype,
            initializer=weight_initializer,
            trainable=is_train,
        )
        bias = flow.get_variable(
            name="output_bias",
            shape=[label_num],
            dtype=input_blob.dtype,
            initializer=flow.constant_initializer(0.0),
            trainable=is_train,
        )
        projected = flow.matmul(input_blob, weights, transpose_b=True)
        logits = flow.nn.bias_add(projected, bias)
        per_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=label_blob
        )
        return per_example_loss, per_example_loss, logits
@@ -0,0 +1,110 @@ | |||||
""" | |||||
Copyright 2020 The OneFlow Authors. All rights reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
import argparse | |||||
from datetime import datetime | |||||
def str_list(x):
    """Split a comma-separated string into a list of its pieces."""
    pieces = x.split(',')
    return pieces
def int_list(x):
    """Parse a comma-separated string into a list of ints."""
    return [int(token) for token in x.split(',')]
def float_list(x):
    """Parse a comma-separated string into a list of floats."""
    return [float(token) for token in x.split(',')]
def str2bool(v):
    """Convert a command-line style boolean value to ``bool``.

    Args:
        v: Either a ``bool`` (returned unchanged) or a string. Strings are
            compared case-insensitively against
            ``yes/true/t/y/1`` and ``no/false/f/n/0``.

    Returns:
        The corresponding ``bool``.

    Raises:
        argparse.ArgumentTypeError: If the string is not a recognised value.
    """
    # A real bool has no .lower(); pass it through instead of crashing.
    if isinstance(v, bool):
        return v
    lowered = v.lower()
    if lowered in ('yes', 'true', 't', 'y', '1'):
        return True
    if lowered in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Unsupported value encountered.')
def get_parser(parser=None):
    """Create the ``argparse`` parser holding the BERT command-line flags.

    Args:
        parser: Accepted for call-site compatibility but not used; a fresh
            ``argparse.ArgumentParser`` is always created.

    Returns:
        The newly created parser with all flags registered.
    """
    parser = argparse.ArgumentParser(description="flags for bert")

    parser.add_argument('--do_train', type=str2bool, nargs='?', const=True, help='train or not')
    parser.add_argument('--do_eval', type=str2bool, nargs='?', const=True, help='eval or not')

    # resource
    parser.add_argument("--model", type=str, default='BERT Pretrain')
    parser.add_argument("--gpu_num_per_node", type=int, default=1)
    parser.add_argument('--num_nodes', type=int, default=1,
                        help='node/machine number for training')
    parser.add_argument('--node_ips', type=str_list, default=['192.168.1.13', '192.168.1.14'],
                        help='nodes ip list for training, devided by ",", length >= num_nodes')

    # train
    parser.add_argument("--learning_rate", type=float, default=1e-4, help="Learning rate")
    parser.add_argument("--weight_decay_rate", type=float, default=0.01, help="weight decay rate")
    parser.add_argument("--warmup_proportion", type=float, default=0.1)
    # String defaults are run through `type` by argparse, so 'False' becomes False.
    parser.add_argument('--use_fp16', type=str2bool, nargs='?', default='False', const=True,
                        help='use use fp16 or not')

    # log and restore/save
    parser.add_argument("--loss_print_every_n_iter", type=int, default=10, required=False,
                        help="print loss every n iteration")
    parser.add_argument("--model_save_every_n_iter", type=int, default=10000, required=False,
                        help="save model every n iteration", )
    # The default directory name embeds the time at which the parser is built.
    parser.add_argument("--model_save_dir", type=str,
                        default="./output/model_save-{}".format(str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))),
                        required=False, help="model save directory")
    parser.add_argument("--result_dir", type=str, default="./result", required=False, help="result save directory")
    # type=bool would turn every non-empty string (including "False") into
    # True, so use str2bool like the other boolean flags.
    parser.add_argument("--save_last_snapshot", type=str2bool, nargs='?', const=True, default=False,
                        required=False, help="save model snapshot for last iteration")
    parser.add_argument("--model_load_dir", type=str, default=None, help="model load directory")
    parser.add_argument("--log_dir", type=str, default="./output", help="log info save directory")

    # bert backbone
    parser.add_argument('--do_lower_case', type=str2bool, nargs='?', const=True, default='True')
    parser.add_argument("--seq_length", type=int, default=512)
    parser.add_argument("--max_predictions_per_seq", type=int, default=80)
    parser.add_argument("--num_hidden_layers", type=int, default=24)
    parser.add_argument("--num_attention_heads", type=int, default=16)
    parser.add_argument("--max_position_embeddings", type=int, default=512)
    parser.add_argument("--type_vocab_size", type=int, default=2)
    parser.add_argument("--vocab_size", type=int, default=30522)
    parser.add_argument("--attention_probs_dropout_prob", type=float, default=0.1)
    parser.add_argument("--hidden_dropout_prob", type=float, default=0.1)
    parser.add_argument("--hidden_size_per_head", type=int, default=64)
    parser.add_argument("--replace_prob", type=float, default=0.0)
    parser.add_argument("--compress_ratio", type=int, default=1)

    return parser
def print_args(args):
    """Print a banner, every attribute of ``args`` and a time stamp.

    Args:
        args: Object (typically an ``argparse.Namespace``) exposing ``model``,
            ``gpu_num_per_node`` and ``num_nodes``; all entries of
            ``vars(args)`` are printed as ``name = value`` lines.
    """
    heavy_rule = "=" * 66
    light_rule = "-" * 66

    print(heavy_rule)
    banner = "Running {}: num_gpu_per_node = {}, num_nodes = {}.".format(
        args.model, args.gpu_num_per_node, args.num_nodes)
    print(banner)
    print(heavy_rule)

    for key, value in vars(args).items():
        print("{} = {}".format(key, value))

    print(light_rule)
    stamp = datetime.now().strftime("%Y-%m-%d-%H:%M:%S")
    print("Time stamp: {}".format(stamp))
# Script entry point: build the parser, parse the command line and print the
# resulting flags.
if __name__ == '__main__':
    parser = get_parser()
    args = parser.parse_args()
    print_args(args)
@@ -0,0 +1,90 @@ | |||||
""" | |||||
Copyright 2020 The OneFlow Authors. All rights reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
"""Convert tensorflow checkpoint to oneflow snapshot""" | |||||
import re | |||||
import argparse | |||||
import tensorflow as tf | |||||
import numpy as np | |||||
import os | |||||
parser = argparse.ArgumentParser()

# Required parameters
parser.add_argument("--tf_checkpoint_path",
                    default=None,
                    type=str,
                    required=True,
                    help="Path the TensorFlow checkpoint path.")
parser.add_argument("--of_dump_path",
                    default=None,
                    type=str,
                    required=True,
                    help="Path to the output OneFlow model.")

# parse_known_args is used instead of parse_args so that the tokens argparse
# does not recognise can be interpreted below as extra weights.
args, unknown = parser.parse_known_args()
print(args)

# Each unknown token must look like "<suffix>=<value>". For every converted
# blob, an additional blob filled with <value> is written to "<folder><suffix>"
# (see _SaveWeightBlob2File).
extra_weights = {}
for u in unknown:
    w = u.split("=")
    # Explicit check instead of `assert`, which is stripped under `python -O`
    # and would let malformed tokens through silently.
    if len(w) != 2:
        parser.error("expected extra weight of the form NAME=VALUE, got {!r}".format(u))
    extra_weights[w[0]] = float(w[1])
def _write_blob(folder, blob): | |||||
os.makedirs(folder, exist_ok=True) | |||||
filename = os.path.join(folder, "out") | |||||
f = open(filename, 'wb') | |||||
f.write(blob.tobytes()) | |||||
f.close() | |||||
print(filename, blob.shape) | |||||
def _SaveWeightBlob2File(blob, folder):
    """Write ``blob`` plus one constant-filled companion per extra weight.

    ``blob`` itself is written to ``folder``. Then, for every
    ``suffix -> value`` entry of the module-level ``extra_weights`` mapping, an
    array with the shape and dtype of ``blob`` filled with ``value`` is
    written to ``folder + suffix``.

    Args:
        blob: Array to save.
        folder: Target directory for ``blob``.
    """
    _write_blob(folder, blob)
    for suffix in extra_weights:
        companion = np.full_like(blob, extra_weights[suffix])
        _write_blob(folder + suffix, companion)
def convert():
    """Dump every variable of the TensorFlow checkpoint as a OneFlow blob.

    For each variable listed in ``args.tf_checkpoint_path`` the name is split
    at its last ``'/'``: the part before it (with ``'/'`` replaced by ``'-'``)
    is the op name, the part after it the blob name. A blob named ``kernel``
    is renamed to ``weight``. The values are saved under
    ``<args.of_dump_path>/<op_name>-<blob_name>``.
    """
    ckpt_path = args.tf_checkpoint_path
    for var_name, _shape in tf.train.list_variables(ckpt_path):
        values = tf.train.load_variable(ckpt_path, var_name)

        # NOTE(review): for a name without '/', rfind returns -1, so the op
        # name becomes the variable name minus its last character -- confirm
        # this is intended.
        cut = var_name.rfind('/')
        op_name = var_name[:cut].replace('/', '-')
        blob_name = var_name[cut + 1:]

        if blob_name in ['adam_m', 'adam_v']:
            print("find m, v weights")
        elif blob_name == "kernel":
            blob_name = "weight"

        target_folder = os.path.join(args.of_dump_path, op_name + "-" + blob_name)
        _SaveWeightBlob2File(values, target_folder)
# Script entry point: run the checkpoint conversion.
if __name__ == "__main__":
    convert()
@@ -0,0 +1,103 @@ | |||||
""" | |||||
Copyright 2020 The OneFlow Authors. All rights reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
import os | |||||
import argparse | |||||
import shutil | |||||
import re | |||||
def str2bool(v):
    """Convert a command-line style boolean string to ``bool``.

    Args:
        v: String compared case-insensitively against ``yes/true/t/y/1`` and
            ``no/false/f/n/0``.

    Returns:
        ``True`` or ``False`` according to the matched group.

    Raises:
        argparse.ArgumentTypeError: If ``v`` matches neither group.
    """
    truthy = ('yes', 'true', 't', 'y', '1')
    falsy = ('no', 'false', 'f', 'n', '0')
    lowered = v.lower()
    if lowered in truthy:
        return True
    if lowered in falsy:
        return False
    raise argparse.ArgumentTypeError('Unsupported value encountered.')
parser = argparse.ArgumentParser()
parser.add_argument("--teacher_model", type=str, default=None, help="The teacher model dir.")
parser.add_argument("--student_model", type=str, default=None, help="The student model dir.")
parser.add_argument("--layer_list", type=str, default="2,6,10",
                    help="the set of intermediate layers to distill knowledge from")
args = parser.parse_args()

# Replace the comma-separated --layer_list string by a list of ints and record
# how many layers were requested.
args.layer_list = list(map(int, args.layer_list.split(',')))
args.layer_num = len(args.layer_list)

student_filelist = []
def subString(template):
    """Extract the text between ``bert-encoder-layer_`` and the next ``-``.

    Args:
        template: String to search (e.g. a snapshot sub-directory name).

    Returns:
        List with one captured string per non-overlapping match; empty when
        the pattern does not occur.
    """
    pattern = r'bert-encoder-layer_(.*?)-'
    return [match.group(1) for match in re.finditer(pattern, template)]
def CopyFile(filepath, newPath):
    """Recursively copy the contents of ``filepath`` into ``newPath``.

    Args:
        filepath: Existing source directory.
        newPath: Destination directory; created if it does not exist.

    Regular files are copied with ``shutil.copyfile``; every other entry is
    treated as a directory and copied recursively.
    """
    if not os.path.exists(newPath):
        os.makedirs(newPath)
    for entry in os.listdir(filepath):
        source = os.path.join(filepath, entry)
        target = os.path.join(newPath, entry)
        if os.path.isfile(source):
            shutil.copyfile(source, target)
            continue
        # The recursive call creates `target` itself when it is missing.
        CopyFile(source, target)
# Populate the student model directory from the teacher snapshot:
#   * sub-directories ending in "-v" or "-m" are skipped;
#   * sub-directories whose name carries no "bert-encoder-layer_<n>-" part are
#     copied unchanged;
#   * encoder-layer sub-directories are copied unchanged and, when their layer
#     number is listed in --layer_list, copied a second time under a
#     "student-layer" name.
if not os.path.exists(args.student_model):
    os.makedirs(args.student_model)

for a, b, c in os.walk(args.teacher_model):
    # `b` holds the directory names found at the current walk level; they are
    # always joined onto args.teacher_model (not onto `a`) below.
    for subdir in b:
        if str(subdir[-2:]) == '-v' or str(subdir[-2:]) == '-m':
            continue
        teacher_layer_num = subString(subdir)
        if len(teacher_layer_num) == 0:
            # Not an encoder layer: copy as-is.
            teacher_model_subdir = os.path.join(args.teacher_model, subdir)
            student_model_subdir = os.path.join(args.student_model, subdir)
            CopyFile(teacher_model_subdir, student_model_subdir)
        else:
            teacher_layer_num = int(teacher_layer_num[0])
            # Keep the teacher layer itself in the student directory.
            teacher_model_source_subdir = os.path.join(args.teacher_model, subdir)
            teacher_model_target_subdir = os.path.join(args.student_model, subdir)
            CopyFile(teacher_model_source_subdir, teacher_model_target_subdir)

            # Insert "student-" in front of the first occurrence of "layer".
            prefix, suffix = subdir.split('layer', 1)
            student_subdir = prefix + 'student-layer' + suffix
            if teacher_layer_num in args.layer_list:
                # Position of the teacher layer inside --layer_list.
                student_layer_num = args.layer_list.index(teacher_layer_num)
                rule = r'bert-encoder-layer_(.*?)-'
                # NOTE(review): when the first "layer" in the name is the one in
                # "bert-encoder-layer_", student_subdir reads
                # "bert-encoder-student-layer_..." and this pattern no longer
                # matches, so x equals student_subdir and still carries the
                # teacher's layer number -- confirm whether student_layer_num
                # was meant to be substituted.
                x = re.sub(rule, 'bert-encoder-layer_{}-'.format(str(student_layer_num)), student_subdir)
                teacher_model_subdir = os.path.join(args.teacher_model, subdir)
                student_model_subdir = os.path.join(args.student_model, x)
                CopyFile(teacher_model_subdir, student_model_subdir)
@@ -0,0 +1,189 @@ | |||||
""" | |||||
Copyright 2020 The OneFlow Authors. All rights reserved. | |||||
Licensed under the Apache License, Version 2.0 (the "License"); | |||||
you may not use this file except in compliance with the License. | |||||
You may obtain a copy of the License at | |||||
http://www.apache.org/licenses/LICENSE-2.0 | |||||
Unless required by applicable law or agreed to in writing, software | |||||
distributed under the License is distributed on an "AS IS" BASIS, | |||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
See the License for the specific language governing permissions and | |||||
limitations under the License. | |||||
""" | |||||
import oneflow as flow | |||||
import bert as bert_util | |||||
import oneflow.core.operator.op_conf_pb2 as op_conf_util | |||||
def PreTrain(
        input_ids_blob,
        input_mask_blob,
        token_type_ids_blob,
        masked_lm_positions_blob,
        masked_lm_ids_blob,
        masked_lm_weights_blob,
        next_sentence_label_blob,
        vocab_size,
        seq_length=512,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=16,
        max_predictions_per_seq=20,
        initializer_range=0.02,
):
    """Build the BERT pre-training graph and return its losses.

    A ``bert_util.BertBackbone`` is created from the inputs. Its sequence
    output and embedding table feed the masked-language-model loss, and its
    pooled output feeds the next-sentence loss. The two losses are added
    inside the ``"cls-loss"`` namespace.

    Args:
        input_ids_blob, input_mask_blob, token_type_ids_blob: Backbone inputs.
        masked_lm_positions_blob, masked_lm_ids_blob, masked_lm_weights_blob:
            Positions, label ids and label weights of the masked predictions.
        next_sentence_label_blob: Labels for the next-sentence head.
        vocab_size ... initializer_range: Model hyper-parameters forwarded to
            the backbone and the loss heads.

    Returns:
        Tuple ``(total_loss, lm_loss, ns_loss)``.
    """
    backbone_kwargs = dict(
        input_ids_blob=input_ids_blob,
        input_mask_blob=input_mask_blob,
        token_type_ids_blob=token_type_ids_blob,
        vocab_size=vocab_size,
        seq_length=seq_length,
        hidden_size=hidden_size,
        num_hidden_layers=num_hidden_layers,
        num_attention_heads=num_attention_heads,
        intermediate_size=intermediate_size,
        hidden_act=hidden_act,
        hidden_dropout_prob=hidden_dropout_prob,
        attention_probs_dropout_prob=attention_probs_dropout_prob,
        max_position_embeddings=max_position_embeddings,
        type_vocab_size=type_vocab_size,
        initializer_range=initializer_range,
    )
    encoder = bert_util.BertBackbone(**backbone_kwargs)

    # Masked-LM head: only the loss (first element) is used here.
    lm_outputs = _AddMaskedLanguageModelLoss(
        input_blob=encoder.sequence_output(),
        output_weights_blob=encoder.embedding_table(),
        positions_blob=masked_lm_positions_blob,
        label_id_blob=masked_lm_ids_blob,
        label_weight_blob=masked_lm_weights_blob,
        seq_length=seq_length,
        hidden_size=hidden_size,
        vocab_size=vocab_size,
        max_predictions_per_seq=max_predictions_per_seq,
        hidden_act=bert_util.GetActivation(hidden_act),
        initializer_range=initializer_range,
    )
    lm_loss = lm_outputs[0]

    pooled = PooledOutput(encoder.sequence_output(), hidden_size, initializer_range)

    # Next-sentence head: again only the loss is needed.
    ns_outputs = _AddNextSentenceOutput(
        input_blob=pooled,
        label_blob=next_sentence_label_blob,
        hidden_size=hidden_size,
        initializer_range=initializer_range,
    )
    ns_loss = ns_outputs[0]

    with flow.scope.namespace("cls-loss"):
        total_loss = lm_loss + ns_loss
    return total_loss, lm_loss, ns_loss
def PooledOutput(sequence_output, hidden_size, initializer_range):
    """Pool a sequence output into one vector per example.

    Inside the ``"bert-pooler"`` namespace, the slice at index 0 (size 1) of
    axis 1 is taken, reshaped to ``[-1, hidden_size]``, passed through a
    ``hidden_size`` -> ``hidden_size`` fully connected layer named ``"dense"``
    and finally through ``tanh``.

    Args:
        sequence_output: Blob to pool.
        hidden_size: Width used for the reshape and the dense layer.
        initializer_range: Forwarded to ``bert_util.CreateInitializer``.

    Returns:
        The pooled blob.
    """
    with flow.scope.namespace("bert-pooler"):
        first_slice = flow.slice(sequence_output, [None, 0, 0], [None, 1, -1])
        first_vector = flow.reshape(first_slice, [-1, hidden_size])
        dense_out = bert_util._FullyConnected(
            first_vector,
            input_size=hidden_size,
            units=hidden_size,
            weight_initializer=bert_util.CreateInitializer(initializer_range),
            name="dense",
        )
        return flow.math.tanh(dense_out)
def _AddMaskedLanguageModelLoss(
    input_blob,
    output_weights_blob,
    positions_blob,
    label_id_blob,
    label_weight_blob,
    seq_length,
    hidden_size,
    vocab_size,
    max_predictions_per_seq,
    hidden_act,
    initializer_range,
):
    """Add the masked-language-model head and compute its weighted loss.

    Args:
        input_blob: Sequence output the predictions are gathered from.
        output_weights_blob: Matrix multiplied (transposed) with the
            transformed features to obtain the logits; ``PreTrain`` passes the
            backbone's embedding table.
        positions_blob: Positions handed to ``_GatherIndexes``.
        label_id_blob: Label ids; flattened to 1-D before the cross-entropy.
        label_weight_blob: Per-prediction weights multiplied into the loss.
        seq_length: Forwarded to ``_GatherIndexes``.
        hidden_size: Width of the transform layer.
        vocab_size: Length of the ``output_bias`` variable.
        max_predictions_per_seq: Second dimension the per-prediction loss is
            reshaped to.
        hidden_act: Either a callable applied after the dense layer, or a
            value passed to the dense layer as its ``activation``.
        initializer_range: Forwarded to ``bert_util.CreateInitializer``.

    Returns:
        Tuple ``(loss, pre_example_loss, logit_blob)``.
    """
    with flow.scope.namespace("other"):
        # Total label weight per example.
        sum_label_weight_blob = flow.math.reduce_sum(label_weight_blob, axis=[-1])
        # Same-shaped blob of ones; summing it counts the examples.
        ones = sum_label_weight_blob * 0.0 + 1.0
        sum_label_weight_blob = flow.math.reduce_sum(sum_label_weight_blob)
        batch_size = flow.math.reduce_sum(ones)
        # Average total label weight per example.
        sum_label_weight_blob = sum_label_weight_blob / batch_size
    with flow.scope.namespace("cls-predictions"):
        input_blob = _GatherIndexes(input_blob, positions_blob, seq_length, hidden_size)
        with flow.scope.namespace("transform"):
            # A callable activation is applied manually after the dense layer,
            # so the layer itself is built with kNone in that case.
            if callable(hidden_act):
                act_fn = op_conf_util.kNone
            else:
                act_fn = hidden_act
            input_blob = bert_util._FullyConnected(
                input_blob,
                input_size=hidden_size,
                units=hidden_size,
                activation=act_fn,
                weight_initializer=bert_util.CreateInitializer(initializer_range),
                name="dense",
            )
            if callable(hidden_act):
                input_blob = hidden_act(input_blob)
            input_blob = bert_util._LayerNorm(input_blob, hidden_size)
        # NOTE(review): the bias is initialised to the constant 1.0 (the
        # next-sentence head uses 0.0) -- confirm this is intended.
        output_bias = flow.get_variable(
            name="output_bias",
            shape=[vocab_size],
            dtype=input_blob.dtype,
            initializer=flow.constant_initializer(1.0),
        )
        logit_blob = flow.matmul(input_blob, output_weights_blob, transpose_b=True)
        logit_blob = flow.nn.bias_add(logit_blob, output_bias)
        label_id_blob = flow.reshape(label_id_blob, [-1])
        pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logit_blob, labels=label_id_blob
        )
        pre_example_loss = flow.reshape(pre_example_loss, [-1, max_predictions_per_seq])
        numerator = pre_example_loss * label_weight_blob
        with flow.scope.namespace("loss"):
            # Weighted loss summed per example, divided by the average label
            # weight; 1e-5 guards against division by zero.
            numerator = flow.math.reduce_sum(numerator, axis=[-1])
            denominator = sum_label_weight_blob + 1e-5
            loss = numerator / denominator
        return loss, pre_example_loss, logit_blob
def _GatherIndexes(sequence_blob, positions_blob, seq_length, hidden_size):
    """Gather entries of ``sequence_blob`` at ``positions_blob`` and flatten.

    Args:
        sequence_blob: Blob to gather from.
        positions_blob: Indices passed to ``flow.gather``.
        seq_length: Accepted but not used by this function.
        hidden_size: Size of the last dimension after the reshape.

    Returns:
        The gathered blob reshaped to ``[-1, hidden_size]``.
    """
    gathered = flow.gather(
        params=sequence_blob, indices=positions_blob, axis=2, batch_dims=2
    )
    return flow.reshape(gathered, [-1, hidden_size])
def _AddNextSentenceOutput(input_blob, label_blob, hidden_size, initializer_range):
    """Add the two-class next-sentence head and its cross-entropy loss.

    Under the ``"cls-seq_relationship"`` namespace this creates
    ``output_weights`` (``[2, hidden_size]``) and ``output_bias`` (``[2]``,
    zeros), computes ``input_blob @ output_weights^T + output_bias`` and the
    sparse softmax cross-entropy against ``label_blob``.

    Args:
        input_blob: Pooled features.
        label_blob: Sparse labels.
        hidden_size: Second dimension of the weight variable.
        initializer_range: Forwarded to ``bert_util.CreateInitializer``.

    Returns:
        Tuple ``(loss, pre_example_loss, logit_blob)``; the first two entries
        are the same (unreduced) cross-entropy blob.
    """
    with flow.scope.namespace("cls-seq_relationship"):
        weights = flow.get_variable(
            name="output_weights",
            shape=[2, hidden_size],
            dtype=input_blob.dtype,
            initializer=bert_util.CreateInitializer(initializer_range),
        )
        bias = flow.get_variable(
            name="output_bias",
            shape=[2],
            dtype=input_blob.dtype,
            initializer=flow.constant_initializer(0.0),
        )
        projected = flow.matmul(input_blob, weights, transpose_b=True)
        logits = flow.nn.bias_add(projected, bias)
        per_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=label_blob
        )
        return per_example_loss, per_example_loss, logits