
update tianshu serving and model compress

tags/v0.3.0
之江实验室 3 years ago
parent commit 080cccd85a
100 changed files with 59338 additions and 0 deletions
  1. +5 -0  model_compress/.gitignore
  2. +155 -0  model_compress/README.md
  3. +63 -0  model_compress/docs/API_knowledge_distill.md
  4. +86 -0  model_compress/docs/API_prune.md
  5. +126 -0  model_compress/docs/API_quant.md
  6. BIN  model_compress/docs/imgs/overview.png
  7. +44 -0  model_compress/docs/model_zoo.md
  8. +4 -0  model_compress/model_compress/ChannelSlimming/log/readme.md
  9. +263 -0  model_compress/model_compress/ChannelSlimming/model/cnn/alexnet_model.py
  10. +152 -0  model_compress/model_compress/ChannelSlimming/model/cnn/lenet_model.py
  11. +218 -0  model_compress/model_compress/ChannelSlimming/model/cnn/resnet_model.py
  12. +200 -0  model_compress/model_compress/ChannelSlimming/model/cnn/vgg_model.py
  13. +118 -0  model_compress/model_compress/ChannelSlimming/model/dnn/dnn_model.py
  14. +6161 -0  model_compress/model_compress/ChannelSlimming/myData/randomData255/test.json
  15. +6161 -0  model_compress/model_compress/ChannelSlimming/myData/randomData255/train.json
  16. +1 -0  model_compress/model_compress/ChannelSlimming/ofData/readme.md
  17. +202 -0  model_compress/model_compress/ChannelSlimming/ofrecordMake.py
  18. +4 -0  model_compress/model_compress/ChannelSlimming/output/snapshots/readme.md
  19. +225 -0  model_compress/model_compress/ChannelSlimming/prune/pruneAlexnet.py
  20. +158 -0  model_compress/model_compress/ChannelSlimming/prune/pruneDnn.py
  21. +226 -0  model_compress/model_compress/ChannelSlimming/prune/pruneLenet.py
  22. +271 -0  model_compress/model_compress/ChannelSlimming/prune/pruneResnet.py
  23. +228 -0  model_compress/model_compress/ChannelSlimming/prune/pruneVggnet.py
  24. +97 -0  model_compress/model_compress/ChannelSlimming/prune/util/model_weights.py
  25. +315 -0  model_compress/model_compress/ChannelSlimming/prune/util/prune_algorithm.py
  26. +251 -0  model_compress/model_compress/ChannelSlimming/readme.md
  27. +240 -0  model_compress/model_compress/ChannelSlimming/run.py
  28. +80 -0  model_compress/model_compress/ChannelSlimming/run_alexnet_cifar10.sh
  29. +78 -0  model_compress/model_compress/ChannelSlimming/run_dnn2_cifar10.sh
  30. +153 -0  model_compress/model_compress/ChannelSlimming/train_val.py
  31. +144 -0  model_compress/model_compress/ChannelSlimming/util/config.py
  32. +53 -0  model_compress/model_compress/ChannelSlimming/util/job_function_util.py
  33. +97 -0  model_compress/model_compress/ChannelSlimming/util/model_weights.py
  34. +335 -0  model_compress/model_compress/ChannelSlimming/util/ofrecord_util.py
  35. +93 -0  model_compress/model_compress/ChannelSlimming/util/optimizer_util.py
  36. +374 -0  model_compress/model_compress/ChannelSlimming/util/util.py
  37. +0 -0  model_compress/model_compress/__init__.py
  38. +196 -0  model_compress/model_compress/distil/README.md
  39. +0 -0  model_compress/model_compress/distil/__init__.py
  40. +89 -0  model_compress/model_compress/distil/examples/bert-pkd/README.md
  41. +76 -0  model_compress/model_compress/distil/examples/bert-pkd/bert-pkd_generate_student_model.py
  42. +8 -0  model_compress/model_compress/distil/examples/bert-pkd/run_bert-pkd_generate_student_mdoel.sh
  43. +491 -0  model_compress/model_compress/distil/examples/bert-pkd/task_student_bert-pkd.py
  44. +61 -0  model_compress/model_compress/distil/examples/distilled-bilstm/README.md
  45. +118 -0  model_compress/model_compress/distil/examples/distilled-bilstm/run_train_lstm.sh
  46. +338 -0  model_compress/model_compress/distil/examples/distilled-bilstm/task_lstm.py
  47. +439 -0  model_compress/model_compress/distil/examples/distilled-bilstm/task_student_kd_lstm.py
  48. +62 -0  model_compress/model_compress/distil/examples/knowledge_distillation/README.md
  49. +498 -0  model_compress/model_compress/distil/examples/knowledge_distillation/task_student_kd.py
  50. +54 -0  model_compress/model_compress/distil/examples/teacher_bert/README.md
  51. +311 -0  model_compress/model_compress/distil/examples/teacher_bert/task_teacher.py
  52. +203 -0  model_compress/model_compress/distil/examples/tinybert/README.md
  53. +393 -0  model_compress/model_compress/distil/examples/tinybert/data_augmentation.py
  54. +224 -0  model_compress/model_compress/distil/examples/tinybert/maskedBert.py
  55. +407 -0  model_compress/model_compress/distil/examples/tinybert/pregenerate_training_data.py
  56. +32 -0  model_compress/model_compress/distil/examples/tinybert/run_data_augmentation.sh
  57. +16 -0  model_compress/model_compress/distil/examples/tinybert/run_pregenerate_training_data.sh
  58. +516 -0  model_compress/model_compress/distil/examples/tinybert/task_student_tinybert.py
  59. +8 -0  model_compress/model_compress/distil/glue_process.sh
  60. +4 -0  model_compress/model_compress/distil/run_download_glue_data.sh
  61. +95 -0  model_compress/model_compress/distil/run_eval_student_bert_pkd.sh
  62. +94 -0  model_compress/model_compress/distil/run_eval_student_distilled_lstm.sh
  63. +88 -0  model_compress/model_compress/distil/run_eval_student_kd.sh
  64. +89 -0  model_compress/model_compress/distil/run_eval_student_tinybert.sh
  65. +86 -0  model_compress/model_compress/distil/run_eval_teacher.sh
  66. +99 -0  model_compress/model_compress/distil/run_eval_theseus.sh
  67. +130 -0  model_compress/model_compress/distil/run_train_student_bert_pkd.sh
  68. +130 -0  model_compress/model_compress/distil/run_train_student_distilled_lstm.sh
  69. +128 -0  model_compress/model_compress/distil/run_train_student_kd.sh
  70. +128 -0  model_compress/model_compress/distil/run_train_student_tinybert.sh
  71. +116 -0  model_compress/model_compress/distil/run_train_teacher.sh
  72. +158 -0  model_compress/model_compress/distil/run_train_theseus.sh
  73. +376 -0  model_compress/model_compress/distil/src/bert.py
  74. +116 -0  model_compress/model_compress/distil/src/classifier.py
  75. +107 -0  model_compress/model_compress/distil/src/config.py
  76. +121 -0  model_compress/model_compress/distil/src/convert_bert_pytorch_checkpoint_to_original_tf.py
  77. +90 -0  model_compress/model_compress/distil/src/convert_tf_ckpt_to_of.py
  78. +90 -0  model_compress/model_compress/distil/src/convert_tf_ckpt_to_of_student.py
  79. +141 -0  model_compress/model_compress/distil/src/download_glue_data.py
  80. +793 -0  model_compress/model_compress/distil/src/glue_ofrecord/glue_process.py
  81. +339 -0  model_compress/model_compress/distil/src/glue_ofrecord/glue_process_lstm.py
  82. +15 -0  model_compress/model_compress/distil/src/glue_ofrecord/list_files.py
  83. +31 -0  model_compress/model_compress/distil/src/glue_ofrecord/logging_setup.py
  84. +71 -0  model_compress/model_compress/distil/src/glue_ofrecord/parse_args.py
  85. +347 -0  model_compress/model_compress/distil/src/glue_ofrecord/tokenization.py
  86. +30522 -0  model_compress/model_compress/distil/src/glue_ofrecord/vocab.txt
  87. +382 -0  model_compress/model_compress/distil/src/knowledge_distill_util.py
  88. +311 -0  model_compress/model_compress/distil/src/lstm.py
  89. +29 -0  model_compress/model_compress/distil/src/test_global_storage.py
  90. +400 -0  model_compress/model_compress/distil/src/tokenization.py
  91. +220 -0  model_compress/model_compress/distil/src/util.py
  92. +204 -0  model_compress/model_compress/distil/theseus/README.md
  93. +0 -0  model_compress/model_compress/distil/theseus/__init__.py
  94. +341 -0  model_compress/model_compress/distil/theseus/bert.py
  95. +408 -0  model_compress/model_compress/distil/theseus/bert_theseus.py
  96. +126 -0  model_compress/model_compress/distil/theseus/classifier.py
  97. +110 -0  model_compress/model_compress/distil/theseus/config.py
  98. +90 -0  model_compress/model_compress/distil/theseus/convert_tf_ckpt_to_of.py
  99. +103 -0  model_compress/model_compress/distil/theseus/init_stu.py
  100. +189 -0  model_compress/model_compress/distil/theseus/pretrain.py

+ 5
- 0
model_compress/.gitignore

@@ -0,0 +1,5 @@
.idea
/model_compress/distil/data/
/model_compress/distil/glove/
/model_compress/distil/models/
/model_compress/distil/outputs/

+ 155
- 0
model_compress/README.md

@@ -0,0 +1,155 @@
# Oneflow-Model-Compression

## Overview
Lianzhi (炼知) is a model compression platform that bundles a series of compression strategies such as pruning, quantization, and knowledge distillation.
It provides a complete model compression solution that can be applied to many kinds of natural language and computer vision scenarios, such as text classification, inference, and image classification.
In addition, the platform keeps extending benchmarks of the compression strategies on classic open-source tasks for users' reference.
It also exposes the functional operators behind each compression strategy, so that users can apply them directly, reproduce the latest methods from papers, and build further development on top of the compression operators.

<p align="center">
<br>
<img src="./docs/imgs/overview.png" width="600"/>
<br>
<p>

## Features

<table style="width:100%;" cellpadding="2" cellspacing="0" border="1" bordercolor="#000000">
<tbody>
<tr>
<td style="text-align:center;">
<span style="font-size:18px;">功能模块</span>
</td>
<td style="text-align:center;">
<span style="font-size:18px;">算法</span>
</td>
<td style="text-align:center;">
<span style="font-size:18px;">相关文档</span>
</td>
</tr>
<tr>
<td style="text-align:center;">
Quantization
</td>
<td>
<ul>
<li>
<span>deep compression</span>:&nbsp;<a href="https://arxiv.org/pdf/1510.00149.pdf" target="_blank"><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;">Han S, Mao H, Dally W J. "Deep compression: Compressing deep neural networks with pruning, trained quantization and huffman coding"&nbsp;</span><i>arXiv preprint arXiv:1510.00149</i><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;">&nbsp;(2017).</span></a>
</li>
<li>
<span>NVIDIA TensorRT</span>:&nbsp;<a href="https://github.com/NVIDIA/TensorRT" target="_blank"><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;">a C++ library for high performance inference on NVIDIA GPUs and deep learning accelerators.&nbsp;</span><i></i><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;">&nbsp;</span></a>
</li>
</ul>
</td>
<td>
<ul>
<li>
<a href="./docs/API_quant.md" target="_blank">量化API文档</a>
</li>
</ul>
</td>
</tr>
<tr>
<td style="text-align:center;">
<span style="font-size:12px;">剪枝</span><span style="font-size:12px;"></span><br />
</td>
<td>
<ul>
<li>
<span>bn channel slimming</span>:&nbsp;<a href="https://arxiv.org/abs/1708.06519" target="_blank"><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;">Zhuang Liu, Jianguo Li, Zhiqiang Shen. "Learning Efficient Convolutional Networks through Network Slimming"&nbsp;</span><i>arXiv preprint arXiv:1708.06519</i><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;">&nbsp;(2017).</span></a>
</li>
<li>
<span>conv channel slimming</span>:&nbsp;<a href="https://arxiv.org/abs/1608.08710" target="_blank"><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;">Hao Li, Asim Kadav, Igor Durdanovic. "Pruning Filters for Efficient ConvNets"&nbsp;</span><i>arXiv preprint arXiv:1608.08710</i><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;">&nbsp;(2016).</span></a>
</li>
<li>
<span>conv channel slimming</span>:&nbsp;<a href="http://cn.arxiv.org/abs/1607.03250" target="_blank"><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;">Hengyuan Hu, Rui Peng, Yu-Wing Tai. "Network Trimming: A Data-Driven Neuron Pruning Approach towards Efficient Deep Architectures"&nbsp;</span><i>arXiv preprint arXiv:1607.03250</i><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;">&nbsp;(2016).</span></a>
</li>
</ul>
</td>
<td>
<ul>
<li>
<a href="./docs/API_prune.md" target="_blank">剪枝API文档</a>
</li>
</ul>
<ul>
<li>
<a href="./model_compress/ChannelSlimming" target="_blank">剪枝快速上手</a>
</li>
</ul>
</td>
</tr>
<tr>
<td style="text-align:center;">
Knowledge distillation
</td>
<td>
<ul>
<li>
<span>Knowledge Distillation</span>:&nbsp;<a href="https://arxiv.org/abs/1503.02531" target="_blank"><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;">Hinton, Geoffrey, Oriol Vinyals, and Jeff Dean. "Distilling the knowledge in a neural network."&nbsp;</span><i>arXiv preprint arXiv:1503.02531</i><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;">&nbsp;(2015).</span></a>
</li>
<li>
Distilled-BiLSTM:&nbsp;&nbsp;<a href="https://arxiv.org/abs/1903.12136" target="_blank"><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;">Tang, Raphael, et al. "Distilling task-specific knowledge from bert into simple neural networks." arXiv preprint arXiv:1903.12136 (2019).</span></a>
</li>
<li>
BERT-PKD:&nbsp;&nbsp;<a href="https://arxiv.org/abs/1908.09355" target="_blank"><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;">Sun, Siqi, et al. "Patient knowledge distillation for bert model compression." arXiv preprint arXiv:1908.09355 (2019).</span></a>
</li>
<li>
TinyBERT: <a href="https://arxiv.org/abs/1909.10351" target="_blank"><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;">Jiao, Xiaoqi, et al. "Tinybert: Distilling bert for natural language understanding." arXiv preprint arXiv:1909.10351 (2019).</span></a>
</li>
<li>
MobileBERT: <a href="https://arxiv.org/abs/2004.02984" target="_blank"><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;">Sun, Zhiqing, et al. "Mobilebert: a compact task-agnostic bert for resource-limited devices." arXiv preprint arXiv:2004.02984 (2020).</span></a>
</li>
<li>
BERT-Theseus: <a href="https://arxiv.org/abs/2002.02925" target="_blank"><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;">Xu, Canwen, et al. "Bert-of-theseus: Compressing bert by progressive module replacing." arXiv preprint arXiv:2002.02925 (2020).</span></a>
</li>
<li>
Improved BERT-Theseus: <a href="https://arxiv.org/abs/2002.02925" target="_blank"><span style="color:#222222;font-family:Arial, sans-serif;font-size:13px;background-color:#FFFFFF;">Xu, Canwen, et al. "Bert-of-theseus: Compressing bert by progressive module replacing." arXiv preprint arXiv:2002.02925 (2020).</span></a>
</li>
</ul>
</td>
<td>
<ul>
<li>
<a href="./docs/API_knowledge_distill.md" target="_blank">知识蒸馏API文档</a>
</li>
<li>
<a href="./model_compress/distil" target="_blank">知识蒸馏快速上手</a>
</li>
<li>
<a href="./model_compress/distil/examples/knowledge_distillation/README.md" target="_blank">Knowledge Distillation算法文档</a>
</li>
<li>
<a href="./model_compress/distil/examples/distilled-bilstm/README.md" target="_blank">Distilled-BiLSTM算法文档</a>
</li>
<li>
<a href="./model_compress/distil/examples/bert-pkd/README.md" target="_blank">BERT-PKD算法文档</a>
</li>
<li>
<a href="./model_compress/distil/examples/tinybert/README.md" target="_blank">TinyBERT算法文档</a>
</li>
<li>
<a href="model_compress/distil/theseus/README.md" target="_blank">BERT-Theseus算法文档</a>
</li>
</ul>
</td>
</tr>
</tbody>
</table>


<br />

## Usage
- About OneFlow: an [introduction to the OneFlow deep learning framework and its installation instructions](https://github.com/Oneflow-Inc/oneflow).
- OneFlow quick start: a [short example](http://docs.oneflow.org/quick_start/quickstart_in_3_min.html) that shows how to get started with OneFlow in 3 minutes.
- Model compression API documentation: user-facing interface documentation, covering
  - [Quantization](./docs/API_quant.md)
  - [Pruning](./docs/API_prune.md)
  - [Knowledge distillation](./docs/API_knowledge_distill.md)
- Advanced tutorials: usage examples for CV and NLP tasks, step-by-step guides for each algorithm, and tutorials for advanced features.
  - Quantization documentation: usage examples for the quantization feature, mainly int8 quantization.
  - Pruning documentation: the channel pruning implementation and a [usage example](./model_compress/ChannelSlimming/readme.md), mainly covering the different pruning operators for CNN and DNN models.
  - [Knowledge distillation](./model_compress/distil) documentation: paper implementations and usage examples for knowledge distillation, mainly covering the [KD](./model_compress/distil/examples/knowledge_distillation/README.md), [Distilled-BiLSTM](./model_compress/distil/examples/distilled-bilstm/README.md), [BERT-PKD](./model_compress/distil/examples/bert-pkd/README.md), [TinyBERT](./model_compress/distil/examples/tinybert/README.md), and [BERT-Theseus](model_compress/distil/theseus/README.md) algorithms.
  - [TensorRT quantized deployment](./docs/API_quant.md): how to deploy a quantized OneFlow model with TensorRT.
- [Model zoo](./docs/model_zoo.md): experimental results of each compression algorithm on text classification, inference, and image classification datasets, including model accuracy, model size, and inference speed.

+ 63
- 0
model_compress/docs/API_knowledge_distill.md

@@ -0,0 +1,63 @@
Knowledge Distillation
=========

"Soft-label distillation" operator: pred_distill
---------

`knowledge_distill_util.pred_distill(args, student_logits, teacher_logits):`

[Source code](../model_compress/distil/src/knowledge_distill_util.py#L381)

`pred_distill` adds a soft-label loss between the teacher and the student, so that the student learns from the teacher's outputs and mimics the teacher's behavior at the prediction layer.
The loss is computed with [soft_cross_entropy](../model_compress/distil/src/knowledge_distill_util.py#L336).

**Parameters:**

- **args**: hyperparameters such as teacher_temperature and student_temperature, the temperatures used to soften the teacher and student outputs.
- **student_logits**: the logits predicted by the student model.
- **teacher_logits**: the logits predicted by the teacher model.


**Returns:** the soft-label loss that combines the teacher and student models.
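
As a rough, framework-agnostic sketch of the math behind this operator (not the library implementation itself), the soft-label loss is the cross entropy between the teacher's temperature-softened distribution and the student's softened log-probabilities; `temperature` and the helper names below are placeholders rather than the actual fields of `args`:

```
import numpy as np

def softmax(x, axis=-1):
    # numerically stable softmax
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def soft_cross_entropy_sketch(student_logits, teacher_logits, temperature=1.0):
    # soft targets from the teacher, log-probabilities from the student
    t_prob = softmax(teacher_logits / temperature)
    s_logprob = np.log(softmax(student_logits / temperature))
    return float(-(t_prob * s_logprob).sum(axis=-1).mean())

# toy usage: a batch of 2 examples with 3 classes
student = np.array([[1.0, 0.2, -0.5], [0.1, 0.3, 0.8]])
teacher = np.array([[2.0, 0.1, -1.0], [0.0, 0.5, 1.5]])
print(soft_cross_entropy_sketch(student, teacher, temperature=2.0))
```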

---
"Layer-to-layer distillation" operator: layer_distill
---------

`knowledge_distill_util.layer_distill(args, student_reps, teacher_reps):`

[Source code](../model_compress/distil/src/knowledge_distill_util.py#L346)

`layer_distill` adds a layer-to-layer loss between the teacher and the student, so that the student learns the teacher's hidden-layer features: the teacher's dark knowledge guides the student's training and distills the teacher's knowledge into the student more effectively. The distance between the intermediate layers of the student and the teacher is computed with [MSE](../model_compress/distil/src/knowledge_distill_util.py#L343).

**Parameters:**

- **args**: hyperparameters; currently unused, kept only as part of the interface.
- **student_reps**: all intermediate-layer representations of the student model.
- **teacher_reps**: all intermediate-layer representations of the teacher model.


**Returns:** the layer-to-layer distillation loss that combines the teacher and student models.

>Note: this operator only applies to BERT-style student and teacher models.
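
A minimal sketch of the idea, assuming each student layer is matched to an evenly spaced teacher layer and the per-layer MSEs are averaged (the mapping and normalization used by the real operator may differ):

```
import numpy as np

def layer_distill_sketch(student_reps, teacher_reps):
    # match every student layer to an evenly spaced teacher layer (assumed mapping)
    step = len(teacher_reps) // len(student_reps)
    loss = 0.0
    for i, s_rep in enumerate(student_reps):
        t_rep = teacher_reps[(i + 1) * step - 1]
        loss += np.mean((s_rep - t_rep) ** 2)
    return loss / len(student_reps)

# toy usage: 12 teacher layers, 4 student layers, [batch, seq, hidden] tensors
teacher_reps = [np.random.rand(2, 8, 16) for _ in range(12)]
student_reps = [np.random.rand(2, 8, 16) for _ in range(4)]
print(layer_distill_sketch(student_reps, teacher_reps))
```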

---
"Attention distillation" operator: att_distill
---------

`knowledge_distill_util.att_distill(args, student_atts, teacher_atts):`

[Source code](../model_compress/distil/src/knowledge_distill_util.py#L363)

`att_distill` adds an attention loss between the teacher and the student, so that the student learns the teacher's attention score matrices and the semantic knowledge they encode, such as syntax and cross-token relations. The loss is computed with [MSE](../model_compress/distil/src/knowledge_distill_util.py#L343).

**Parameters:**

- **args**: hyperparameters; currently unused, kept only as part of the interface.
- **student_atts**: all attention score matrices of the student model.
- **teacher_atts**: all attention score matrices of the teacher model.


**Returns:** the attention distillation loss that combines the teacher and student models.

>Note: this operator only applies to BERT-style student and teacher models.
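
The attention term follows the same pattern as the layer-to-layer loss, only over attention score matrices; a minimal sketch under the same evenly spaced layer-mapping assumption:

```
import numpy as np

def att_distill_sketch(student_atts, teacher_atts):
    # MSE between matched student and teacher attention scores (assumed mapping)
    step = len(teacher_atts) // len(student_atts)
    return sum(np.mean((s - teacher_atts[(i + 1) * step - 1]) ** 2)
               for i, s in enumerate(student_atts))

# toy usage: [batch, heads, seq, seq] attention scores
teacher_atts = [np.random.rand(2, 4, 8, 8) for _ in range(12)]
student_atts = [np.random.rand(2, 4, 8, 8) for _ in range(3)]
print(att_distill_sketch(student_atts, teacher_atts))
```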

+ 86
- 0
model_compress/docs/API_prune.md

@@ -0,0 +1,86 @@
1. Channel pruning operators
=========

## 1.1 "bn" pruning operator

- `get_pruneThre_bn():` uses the gamma parameters of the BN layer attached to each convolution layer as scaling factors and computes the pruning threshold (a NumPy sketch of this thresholding pattern follows this section)
  - [Source code](../model_compress/ChannelSlimming/prune/util/prune_algorithm.py#L120)
  - **Returns**: the pruning threshold

- `get_removeIndex_bn(a, thre):` uses the threshold to obtain the indices of the channels to prune in the current convolution layer
  - [Source code](../model_compress/ChannelSlimming/prune/util/prune_algorithm.py#L182)
  - **Parameters**:
    - **a**: the parameters of the current convolution layer
    - **thre**: the threshold returned by `get_pruneThre_bn()`
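
To make the threshold/index convention concrete, here is a minimal NumPy sketch of the pattern shared by these operators (an illustration only; the exact percentile handling lives in `prune_algorithm.py`): sort the absolute scaling factors, take the value at the desired pruning rate as the threshold, and mark every channel whose factor falls below it.

```
import numpy as np

def prune_threshold_from_gammas(gammas, prune_rate=0.5):
    # threshold such that roughly `prune_rate` of the BN gammas fall below it
    sorted_g = np.sort(np.abs(gammas))
    return sorted_g[int(len(sorted_g) * prune_rate)]

def remove_index_from_gammas(gammas, thre):
    # indices of the channels whose scaling factor is below the threshold
    return [i for i, g in enumerate(np.abs(gammas)) if g < thre]

# toy usage: 8 output channels of one convolution layer
gammas = np.array([0.9, 0.05, 0.4, 0.01, 0.7, 0.02, 0.3, 0.08])
thre = prune_threshold_from_gammas(gammas, prune_rate=0.5)
print(thre, remove_index_from_gammas(gammas, thre))   # prunes channels 1, 3, 5, 7
```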

1.2 "conv_avg"剪枝算子
---------

- `get_pruneThre_conv_avg():`卷积层参数的平均值作为缩放因子,获得剪枝对应阈值
- [源代码](../model_compress/ChannelSlimming/prune/util/prune_algorithm.py#L54)
- **返回**:剪枝对应的阈值

- `get_removeIndex_conv_avg(a, shape, thre):`根据阈值获得当前卷积层需要剪枝的通道index
- [源代码](../model_compress/ChannelSlimming/prune/util/prune_algorithm.py#L187)
- **参数**:
- **a**:当前卷积层的参数
- **shape**:当前卷积层的shape信息
- **thre**:`get_pruneThre_conv_avg()`返回的阈值

## 1.3 "conv_max"剪枝算子

- 同"conv_avg"剪枝算子

## 1.4 "conv_all"剪枝算子

- 同"conv_avg"剪枝算子

1.5 "random"剪枝算子
---------

- `get_removeIndex_conv_avg(shape):`随机选择需要剪枝的通道index
- [源代码](../model_compress/ChannelSlimming/prune/util/prune_algorithm.py#L220)
- **参数**:
- **shape**:当前卷积层的shape信息

1.6 "dnn"剪枝算子
---------

- `get_pruneThre_fc():`全连接层的神经元的参数的平均值作为缩放因子,获得剪枝对应阈值
- [源代码](../model_compress/ChannelSlimming/prune/util/prune_algorithm.py#137)
- **返回**:剪枝对应的阈值

- `get_removeIndex_fc(a, shape, thre):`根据阈值获得当前全连接层需要剪枝的神经元index
- [源代码](../model_compress/ChannelSlimming/prune/util/prune_algorithm.py#L171)
- **参数**:
- **a**:当前全连接层的参数
- **shape**:当前全连接层的shape信息
- **thre**:`get_pruneThre_fc()`返回的阈值
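
The fully connected case follows the same thresholding pattern, scoring each neuron by the mean absolute value of its weights; a rough sketch under that assumption (the actual `get_pruneThre_fc` / `get_removeIndex_fc` may differ in details):

```
import numpy as np

def remove_index_fc_sketch(weight, prune_rate=0.5):
    # weight: [units, in_features] dense kernel; returns the neuron indices to prune
    scores = np.abs(weight).mean(axis=1)                  # one score per neuron
    thre = np.sort(scores)[int(len(scores) * prune_rate)]
    return [i for i, s in enumerate(scores) if s < thre]

# toy usage: a dense layer with 6 neurons and 4 inputs
weight = np.random.rand(6, 4)
print(remove_index_fc_sketch(weight, prune_rate=0.5))
```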

2. Model-level pruning scripts
=========

## 2.1 pruneDnn.py

- Prunes DNN models; can invoke the pruning operator in 1.6
  - [File](../model_compress/ChannelSlimming/prune/pruneDnn.py)

## 2.2 pruneLenet.py

- Prunes the LeNet CNN model; can invoke the pruning operators in 1.1-1.5
  - [File](../model_compress/ChannelSlimming/prune/pruneLenet.py)

## 2.3 pruneAlexnet.py

- Prunes the AlexNet CNN model; can invoke the pruning operators in 1.1-1.5
  - [File](../model_compress/ChannelSlimming/prune/pruneAlexnet.py)

## 2.4 pruneVggnet.py

- Prunes the VGGNet CNN model; can invoke the pruning operators in 1.1-1.5
  - [File](../model_compress/ChannelSlimming/prune/pruneVggnet.py)

## 2.5 pruneResnet.py

- Prunes the ResNet CNN model; can invoke the pruning operators in 1.1-1.5
  - [File](../model_compress/ChannelSlimming/prune/pruneResnet.py)

+ 126
- 0
model_compress/docs/API_quant.md

@@ -0,0 +1,126 @@
# OneFlow 量化推理

## XRT in OneFlow

XRT is a runtime acceleration library that supports multiple compute engines at the same time; it currently integrates two backend engines, TensorFlow XLA and NVIDIA TensorRT. XLA fully supports training and inference, while TensorRT supports inference, with training supported for some operators. For the same computation graph, XRT allows several engines to be used together for better acceleration, and among them TensorRT provides int8 quantization.

Because the set of ops officially supported by TensorRT is not exhaustive and other custom ops may be constrained by its interfaces, OneFlow will add more operators later in the form of plug-ins.

## Using TensorRT in OneFlow

* Preparation
  * Dataset: taking the ResNet50 test as an example, prepare the ImageNet dataset in OFRecord format in advance.
  * Download TensorRT: the build links against TensorRT's headers and shared libraries, so choose the TensorRT version that matches your system and installed CUDA version, and satisfy TensorRT's other dependencies.
  * Download OneFlow-Benchmark: OneFlow-Benchmark is OneFlow's model benchmark repository and provides a series of fully implemented network models; this test uses its ResNet50.
* Build: enable the -DWITH_TENSORRT option and point it to the directory where the TensorRT archive was extracted

```
cmake .. -DWITH_TENSORRT=ON -DTENSORRT_ROOT=/home/${user}/TensorRT-6.0.1.8 && make -j 24
```

Alternatively, the path can be set through an environment variable before running cmake

```
export TENSORRT_ROOT=/home/${user}/TensorRT-6.0.1.8
```
After the build succeeds, install the TensorRT-enabled OneFlow.

* Run
TensorRT in OneFlow currently supports single-GPU inference only. After the build succeeds, switch to the dev_trt_infer branch and, in config.py:
  * add --use\_tensorrt to run inference with TensorRT;
  * add --use\_tensorrt and --use\_int8 to enable TensorRT's int8 quantization. A hypothetical sketch of such flags follows this list.
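
As a purely hypothetical illustration of what these switches might look like as argparse flags in config.py (the real definitions live in OneFlow-Benchmark and may use a different style):

```
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--use_tensorrt", action="store_true",
                    help="run inference through the TensorRT backend via XRT")
parser.add_argument("--use_int8", action="store_true",
                    help="enable TensorRT int8 quantization (expects --use_tensorrt)")
args, _ = parser.parse_known_args()
print(args.use_tensorrt, args.use_int8)
```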

## Environment

Hardware

* CPU:Intel(R) Xeon(R) CPU E5-2650 v4 @ 2.20GHz x 6
* GPU:[GeForce GTX 1080] x 4

Software

* OS: Ubuntu 18.04.4 LTS
* NVIDIA Driver Version:440.44
* CUDA:10.2
* GCC:7.5
* Cmake:3.14.4
* Make:4.1

Test results

The test model is ResNet50 (rn50 below). Online quantization is used, and inference is run on a single machine with one GPU and with multiple GPUs, with batch_size set to 64 and to the largest batch size that still runs.
When everything runs correctly, the log looks like this:

```
==================================================================
Running resnet50: num_gpu_per_node = 1, num_nodes = 1.
==================================================================
dtype = float32
gpu_num_per_node = 1
num_nodes = 1
node_ips = ['127.0.0.1']
ctrl_port = 50051
model = resnet50
use_fp16 = None
use_xla = None
channel_last = None
pad_output = None
num_epochs = 1
model_load_dir = resnet_v15_of_best_model_val_top1_77318
batch_size_per_device = 64
val_batch_size_per_device = 256
nccl_fusion_threshold_mb = 0
nccl_fusion_max_ops = 0
fuse_bn_relu = False
fuse_bn_add_relu = False
gpu_image_decoder = False
image_path = test_img/tiger.jpg
num_classes = 1000
num_examples = 1281167
num_val_examples = 50000
rgb_mean = [123.68, 116.779, 103.939]
rgb_std = [58.393, 57.12, 57.375]
image_shape = [3, 224, 224]
label_smoothing = 0.1
model_save_dir = ./output/snapshots/model_save-20201123172206
log_dir = ./output
loss_print_every_n_iter = 1
image_size = 224
resize_shorter = 256
train_data_dir = None
train_data_part_num = 256
val_data_dir = /dataset/ImageNet/ofrecord/validation
val_data_part_num = 256
optimizer = sgd
learning_rate = 0.256
wd = 3.0517578125e-05
momentum = 0.875
lr_decay = cosine
lr_decay_rate = 0.94
lr_decay_epochs = 2
warmup_epochs = 5
decay_rate = 0.9
epsilon = 1.0
gradient_clipping = 0.0
------------------------------------------------------------------
Time stamp: 2020-11-23-17:22:06
Restoring model from resnet_v15_of_best_model_val_top1_77318.
Loading data from /dataset/ImageNet/ofrecord/validation


W1123 17:23:41.120939 31217 trt_executable.cpp:146] Rebuild engine since the maximum batch size 1 is less than the input batch size 256
W1123 17:24:25.756124 33076 trt_logger.cpp:35] TensorRT Logging: Explicit batch network detected and batch size specified, use execute without batch size instead.
W1123 17:24:31.005220 33076 trt_logger.cpp:35] TensorRT Logging: Explicit batch network detected and batch size specified, use execute without batch size instead.
W1123 17:24:36.085610 33076 trt_logger.cpp:35] TensorRT Logging: Explicit batch network detected and batch size specified, use execute without batch size instead.
W1123 17:24:41.073289 33076 trt_logger.cpp:35] TensorRT Logging: Explicit batch network detected and batch size specified, use execute without batch size instead.
W1123 17:24:45.920917 33076 trt_logger.cpp:35] TensorRT Logging: Explicit batch network detected and batch size specified, use execute without batch size instead.
W1123 17:24:50.633805 33076 trt_logger.cpp:35] TensorRT Logging: Explicit batch network detected and batch size specified, use execute without batch size instead.
W1123 17:24:55.354147 33076 trt_logger.cpp:35] TensorRT Logging: Explicit batch network detected and batch size specified, use execute without batch size instead.
W1123 17:24:59.904863 33076 trt_logger.cpp:35] TensorRT Logging: Explicit batch network detected and batch size specified, use execute without batch size instead.
validation: epoch 0, iter 195, top_1: 0.772155, top_k: 0.934856, samples/s: 181.038 1606123666.3968866
```

### Single machine, single GPU



BIN
model_compress/docs/imgs/overview.png

Width: 2120  |  Height: 1004  |  Size: 243 kB

+ 44
- 0
model_compress/docs/model_zoo.md

@@ -0,0 +1,44 @@
# Model Zoo
# 1. Image classification
## 1.1 Quantization

## 1.2 Pruning

Dataset: Cifar10

Models: Alexnet, Lenet

Settings: pruning rate of 0.5 or 0.7

| Model - pruning operator | Runs | Acc | Pruning rate | Compression ratio | Inference speed (samples/s) |
| :---------------------: | :------: | :----: | :----: | :------: | :---------------: |
| Alexnet - no pruning | 5 | 94.89% | - | 1x | 5409 |
| Alexnet - bn | 5 | 98.81% | 50% | 1.4x | 5968 |
| Alexnet - conv_all | 5 | 93.95% | 50% | 1.3x | 5969 |
| Alexnet - conv_avg | 5 | 98.56% | 50% | 1.3x | 5865 |
| Alexnet - conv_max | 5 | 97.44% | 50% | 1.3x | 5555 |
| Alexnet - random | 5 | 97.32% | 50% | 1.3x | 5580 |
| Alexnet - conv_threshold | 5 | 98.03% | 50% | 1.3x | 5567 |
| Lenet - no pruning | 5 | 75.72% | - | 1x | 5821 |
| Lenet - bn | 5 | 64.89% | 70% | 3x | 1923 |

# 2. Text classification
## 2.1 Knowledge distillation
Dataset: SST-2

Environment: a single 2080Ti GPU

Settings: maximum sequence length 128 for BERT-style models and 32 for LSTM-style models; vocabulary size 10000

| Model | Runs | Acc | Layers | Hidden size / FFN size | Model size | Compression ratio | Inference time | Inference speedup |
|:--:|:---:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
| BERT_base(Teacher) | 5 | 92.2% | 12 | 768/3072 | 110M | 1x | 4.04s | 1x |
| KD | 5 | 80.5% | 3 | 312/1200 | 14.5M | 7.5x | 0.81s | 5.0x |
| BiLSTM | 5 | 80.4% | 1 | 300/400 | 15.3M | 7.2x | 0.83s | 4.8x |
| Distilled-BiLSTM | 5 | 82.9% | 1 | 300/400 | 15.3M | 7.2x | 0.83s | 4.8x |
| BERT-PKD(from scratch) | 5 | 81.5% | 3 | 768/3072 | 45.7M | 2.4x | 1.69s | 2.4x |
| BERT-PKD | 5 | 88.4% | 3 | 768/3072 | 45.7M | 2.4x | 1.69s | 2.4x |
| TinyBERT | 5 | 91.3% | 4 | 312/1200 | 14.5M | 7.5x | 0.65s | 6.2x |
| BERT-of-Theseus | 5 | 87.2% | 4 | 768/3072 | 53.7M | 2.05x | 2.05s | 2.0x |

Note: the layer counts do not include the embedding and prediction layers.

+ 4
- 0
model_compress/model_compress/ChannelSlimming/log/readme.md

@@ -0,0 +1,4 @@
# Training log files

- Log directory that stores the log files for the different models and datasets; each file records the top-1 accuracy, top-k accuracy, and throughput on the test set for every epoch.
- For example, "log_vgg_cifar10_base_model.txt" is the training log of the baseline vgg model on the cifar10 dataset.

+ 263
- 0
model_compress/model_compress/ChannelSlimming/model/cnn/alexnet_model.py

@@ -0,0 +1,263 @@
"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import oneflow as flow
from util.model_weights import modelWeight

def _batch_norm(inputs, name=None, trainable=True):
return flow.layers.batch_normalization(
inputs=inputs,
axis=1,
momentum=0.997,
epsilon=1.001e-5,
center=True,
scale=True,
# gamma_initializer=0,
gamma_regularizer=flow.regularizers.l1(1e-4),
trainable=trainable,
name=name,
)


def conv2d_layer(
name,
input,
filters,
kernel_size=1,
strides=1,
padding="VALID",
data_format="NCHW",
dilation_rate=1,
activation="Relu",
use_bias=True,
weight_initializer=flow.variance_scaling_initializer(2, 'fan_out', 'random_normal', data_format="NCHW"),
bias_initializer=flow.zeros_initializer(),
bn=True,
):
weight_shape = (filters, input.shape[1], kernel_size, kernel_size)
weight = flow.get_variable(
name + "_weight",
shape=weight_shape,
dtype=input.dtype,
initializer=weight_initializer,
)
output = flow.nn.conv2d(
input, weight, strides, padding, data_format, dilation_rate, name=name
)
if use_bias:
bias = flow.get_variable(
name + "_bias",
shape=(filters,),
dtype=input.dtype,
initializer=bias_initializer,
)
output = flow.nn.bias_add(output, bias, data_format)

if activation is not None:
if activation == "Relu":
if bn:
output = _batch_norm(output, name + "_bn")
# flow.watch(output)
output = flow.nn.relu(output)
else:
output = flow.nn.relu(output)
else:
raise NotImplementedError

return output


def alexnet(images, cfg, optimizer, trainable=True, need_transpose=False,
training=True, wd=1.0/32768, model_weight=True, bn=True):
if need_transpose:
images = flow.transpose(images, name="transpose", perm=[0, 3, 1, 2])
conv0 = conv2d_layer(name="conv0", input=images, filters=cfg[0], kernel_size=11,
padding="VALID", strides=1, bn=bn)
pool0 = flow.nn.max_pool2d(conv0, 3, 2, "VALID", "NCHW", name="pool0")

conv1 = conv2d_layer(name="conv1", input=pool0, filters=cfg[1], kernel_size=5,
padding="SAME", strides=1, bn=bn)
pool1 = flow.nn.max_pool2d(conv1, 3, 2, "VALID", "NCHW", name="pool1")
conv2 = conv2d_layer(name="conv2", input=pool1, filters=cfg[2], kernel_size=3,
padding="SAME", strides=1, bn=bn)
conv3 = conv2d_layer(name="conv3", input=conv2, filters=cfg[3], kernel_size=3,
padding="SAME", strides=1, bn=bn)
conv4 = conv2d_layer(name="conv4", input=conv3, filters=cfg[4], kernel_size=3,
padding="SAME", strides=1, bn=bn)
pool2 = flow.nn.max_pool2d(conv4, 3, 2, "VALID", "NCHW", name="pool2")

pool2 = flow.reshape(pool2, [pool2.shape[0], -1])
dense0 = flow.layers.dense(
inputs=pool2,
units=cfg[5],
activation=flow.nn.relu,
use_bias=True,
kernel_initializer=flow.random_normal_initializer(mean=0, stddev=0.1),
trainable=trainable,
name="dense0",
)

dense1 = flow.layers.dense(
inputs=dense0,
units=cfg[6],
activation=flow.nn.relu,
use_bias=True,
kernel_initializer=flow.random_normal_initializer(mean=0, stddev=0.1),
trainable=trainable,
name="dense1",
)
dense2 = flow.layers.dense(
inputs=dense1,
units=cfg[7],
use_bias=True,
kernel_initializer=flow.random_normal_initializer(mean=0, stddev=0.1),
trainable=trainable,
name="dense2",
)
# flow.watch(fc8)
def getTypeAndShape(inputs,units):
in_shape = inputs.shape
in_num_axes = len(in_shape)
inputs = (flow.reshape(inputs, (-1, in_shape[-1])) if in_num_axes > 2 else inputs)
shape=(units, inputs.shape[1])
dtype=inputs.dtype
return shape,dtype
if model_weight == True:
modelWeight.addConv(index=0, dtype=conv0.dtype,
shape1=(cfg[0], images.shape[1], 11, 11), shape2=(cfg[0],),
optimizer=optimizer)
modelWeight.addConv(index=1, dtype=conv1.dtype,
shape1=(cfg[1], conv0.shape[1], 5, 5), shape2=(cfg[1],),
optimizer=optimizer)
modelWeight.addConv(index=2, dtype=conv2.dtype,
shape1=(cfg[2], conv1.shape[1], 3, 3), shape2=(cfg[2],),
optimizer=optimizer)
modelWeight.addConv(index=3, dtype=conv3.dtype,
shape1=(cfg[3], conv2.shape[1], 3, 3), shape2=(cfg[3],),
optimizer=optimizer)
modelWeight.addConv(index=4, dtype=conv4.dtype,
shape1=(cfg[4], conv3.shape[1], 3, 3), shape2=(cfg[4],),
optimizer=optimizer)
shape_list = []
dtype_list = []
shape_weight, dtype = getTypeAndShape(pool2, cfg[5])
shape_list.append(shape_weight)
dtype_list.append(dtype)
shape_weight, dtype = getTypeAndShape(dense0, cfg[6])
shape_list.append(shape_weight)
dtype_list.append(dtype)
shape_weight, dtype = getTypeAndShape(dense1, cfg[7])
shape_list.append(shape_weight)
dtype_list.append(dtype)
modelWeight.addDense(dtype_old=dtype_list, shape=shape_list,
optimizer=optimizer, dense_num=3)

return dense2

def alexnet_simple(images, cfg, optimizer, trainable=True, need_transpose=False,
training=True, wd=1.0/32768, model_weight=True, bn=True):
if need_transpose:
images = flow.transpose(images, name="transpose", perm=[0, 3, 1, 2])
conv0 = conv2d_layer(name="conv0", input=images, filters=cfg[0], kernel_size=3,
padding="VALID", strides=1, bn=bn)
pool0 = flow.nn.max_pool2d(conv0, 3, 2, "VALID", "NCHW", name="pool0")

conv1 = conv2d_layer(name="conv1", input=pool0, filters=cfg[1], kernel_size=3,
padding="SAME", strides=1, bn=bn)
pool1 = flow.nn.max_pool2d(conv1, 3, 2, "VALID", "NCHW", name="pool1")
conv2 = conv2d_layer(name="conv2", input=pool1, filters=cfg[2], kernel_size=3,
padding="SAME", strides=1, bn=bn)
conv3 = conv2d_layer(name="conv3", input=conv2, filters=cfg[3], kernel_size=3,
padding="SAME", strides=1, bn=bn)
conv4 = conv2d_layer(name="conv4", input=conv3, filters=cfg[4], kernel_size=3,
padding="SAME", strides=1, bn=bn)
pool2 = flow.nn.max_pool2d(conv4, 3, 2, "VALID", "NCHW", name="pool2")

pool2 = flow.reshape(pool2, [pool2.shape[0], -1])
dense0 = flow.layers.dense(
inputs=pool2,
units=cfg[5],
activation=flow.nn.relu,
use_bias=True,
kernel_initializer=flow.random_normal_initializer(mean=0, stddev=0.1),
trainable=trainable,
name="dense0",
)

dense1 = flow.layers.dense(
inputs=dense0,
units=cfg[6],
activation=flow.nn.relu,
use_bias=True,
kernel_initializer=flow.random_normal_initializer(mean=0, stddev=0.1),
trainable=trainable,
name="dense1",
)
dense2 = flow.layers.dense(
inputs=dense1,
units=cfg[7],
use_bias=True,
kernel_initializer=flow.random_normal_initializer(mean=0, stddev=0.1),
trainable=trainable,
name="dense2",
)
# flow.watch(fc8)
def getTypeAndShape(inputs,units):
in_shape = inputs.shape
in_num_axes = len(in_shape)
inputs = (flow.reshape(inputs, (-1, in_shape[-1])) if in_num_axes > 2 else inputs)
shape=(units, inputs.shape[1])
dtype=inputs.dtype
return shape,dtype
if model_weight == True:
modelWeight.addConv(index=0, dtype=conv0.dtype,
shape1=(cfg[0], images.shape[1], 3, 3), shape2=(cfg[0],),
optimizer=optimizer)
modelWeight.addConv(index=1, dtype=conv1.dtype,
shape1=(cfg[1], conv0.shape[1], 3, 3), shape2=(cfg[1],),
optimizer=optimizer)
modelWeight.addConv(index=2, dtype=conv2.dtype,
shape1=(cfg[2], conv1.shape[1], 3, 3), shape2=(cfg[2],),
optimizer=optimizer)
modelWeight.addConv(index=3, dtype=conv3.dtype,
shape1=(cfg[3], conv2.shape[1], 3, 3), shape2=(cfg[3],),
optimizer=optimizer)
modelWeight.addConv(index=4, dtype=conv4.dtype,
shape1=(cfg[4], conv3.shape[1], 3, 3), shape2=(cfg[4],),
optimizer=optimizer)
shape_list = []
dtype_list = []
shape_weight, dtype = getTypeAndShape(pool2, cfg[5])
shape_list.append(shape_weight)
dtype_list.append(dtype)
shape_weight, dtype = getTypeAndShape(dense0, cfg[6])
shape_list.append(shape_weight)
dtype_list.append(dtype)
shape_weight, dtype = getTypeAndShape(dense1, cfg[7])
shape_list.append(shape_weight)
dtype_list.append(dtype)
modelWeight.addDense(dtype_old=dtype_list, shape=shape_list,
optimizer=optimizer, dense_num=3)

return dense2

+ 152
- 0
model_compress/model_compress/ChannelSlimming/model/cnn/lenet_model.py

@@ -0,0 +1,152 @@
"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import oneflow as flow
from util.model_weights import modelWeight

def _batch_norm(inputs, name=None, trainable=True):
return flow.layers.batch_normalization(
inputs=inputs,
axis=1,
momentum=0.997,
epsilon=1.001e-5,
center=True,
scale=True,
# gamma_initializer=0,
gamma_regularizer=flow.regularizers.l1(1e-4),
trainable=trainable,
name=name,
)


def conv2d_layer(
name,
input,
filters,
kernel_size=1,
strides=1,
padding="VALID",
data_format="NCHW",
dilation_rate=1,
activation="Relu",
use_bias=True,
weight_initializer=flow.variance_scaling_initializer(2, 'fan_out', 'random_normal', data_format="NCHW"),
bias_initializer=flow.zeros_initializer(),
bn=True,
):
weight_shape = (filters, input.shape[1], kernel_size, kernel_size)
weight = flow.get_variable(
name + "_weight",
shape=weight_shape,
dtype=input.dtype,
initializer=weight_initializer,
)
output = flow.nn.conv2d(
input, weight, strides, padding, data_format, dilation_rate, name=name
)
if use_bias:
bias = flow.get_variable(
name + "_bias",
shape=(filters,),
dtype=input.dtype,
initializer=bias_initializer,
)
output = flow.nn.bias_add(output, bias, data_format)

if activation is not None:
if activation == "Relu":
if bn:
output = _batch_norm(output, name + "_bn")
output = flow.nn.relu(output)
else:
output = flow.nn.relu(output)
else:
raise NotImplementedError

return output


def lenet(images, cfg, optimizer, trainable=True, need_transpose=False,
training=True, wd=1.0/32768, model_weight=True, bn=True):
if need_transpose:
images = flow.transpose(images, name="transpose", perm=[0, 3, 1, 2])
conv0 = conv2d_layer(name="conv0", input=images, filters=cfg[0], kernel_size=5,
padding="VALID", strides=1, bn=bn)
pool0 = flow.nn.max_pool2d(conv0, 2, 2, "VALID", "NCHW", name="pool0")

conv1 = conv2d_layer(name="conv1", input=pool0, filters=cfg[1], kernel_size=5,
padding="VALID", strides=1, bn=bn)
pool1 = flow.nn.max_pool2d(conv1, 2, 2, "VALID", "NCHW", name="pool1")
pool1 = flow.reshape(pool1, [pool1.shape[0], -1])
# pool1 = flow.reshape(images, [images.shape[0], -1])
dense0 = flow.layers.dense(
inputs=pool1,
units=cfg[2],
activation=flow.nn.relu,
use_bias=True,
kernel_initializer=flow.random_normal_initializer(mean=0, stddev=0.1),
trainable=trainable,
name="dense0")

dense1 = flow.layers.dense(
inputs=dense0,
units=cfg[3],
activation=flow.nn.relu,
use_bias=True,
kernel_initializer=flow.random_normal_initializer(mean=0, stddev=0.1),
trainable=trainable,
name="dense1")
dense2 = flow.layers.dense(
inputs=dense1,
units=cfg[4],
use_bias=True,
kernel_initializer=flow.random_normal_initializer(mean=0, stddev=0.1),
trainable=trainable,
name="dense2")
# flow.watch(fc8)
def getTypeAndShape(inputs,units):
in_shape = inputs.shape
in_num_axes = len(in_shape)
inputs = (flow.reshape(inputs, (-1, in_shape[-1])) if in_num_axes > 2 else inputs)
shape=(units, inputs.shape[1])
dtype=inputs.dtype
return shape,dtype
if model_weight == True:
modelWeight.addConv(index=0, dtype=conv0.dtype,
shape1=(cfg[0], images.shape[1], 5, 5), shape2=(cfg[0],),
optimizer=optimizer)
modelWeight.addConv(index=1, dtype=conv1.dtype,
shape1=(cfg[1], conv0.shape[1], 5, 5), shape2=(cfg[1],),
optimizer=optimizer)
shape_list = []
dtype_list = []
shape_weight, dtype = getTypeAndShape(pool1, cfg[2])
shape_list.append(shape_weight)
dtype_list.append(dtype)
shape_weight, dtype = getTypeAndShape(dense0, cfg[3])
shape_list.append(shape_weight)
dtype_list.append(dtype)
shape_weight, dtype = getTypeAndShape(dense1, cfg[4])
shape_list.append(shape_weight)
dtype_list.append(dtype)
modelWeight.addDense(dtype_old=dtype_list, shape=shape_list,
optimizer=optimizer, dense_num=3)

return dense2

+ 218
- 0
model_compress/model_compress/ChannelSlimming/model/cnn/resnet_model.py

@@ -0,0 +1,218 @@
"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import oneflow as flow
from util.model_weights import modelWeight

BLOCK_COUNTS = [3, 4, 6, 3]
NAME_NUMBER = 0

# a single conv layer
def _conv2d(name,
input,
filters,
kernel_size,
strides=1,
padding="SAME",
data_format="NCHW",
dilations=1,
use_bias=True,
trainable=True,
weight_initializer=flow.variance_scaling_initializer(data_format="NCHW"),
bias_initializer=flow.zeros_initializer()):
weight = flow.get_variable(name + "_weight",
shape=(filters, input.shape[1], kernel_size, kernel_size),
dtype=input.dtype,
initializer=weight_initializer,
trainable=trainable)
output = flow.nn.conv2d(input, weight, strides, padding, data_format, dilations, name=name)
if use_bias:
bias = flow.get_variable(name + "_bias",
shape=(filters,),
dtype=input.dtype,
initializer=bias_initializer,)
output = flow.nn.bias_add(output, bias, data_format)
return output

# a single BN layer
def _batch_norm(inputs, name=None, trainable=True):
return flow.layers.batch_normalization(
inputs=inputs,
axis=1,
momentum=0.997,
epsilon=1.001e-5,
center=True,
scale=True,
trainable=trainable,
name=name,
)

# conv, bn, relu block
def conv2d_affine(input, name, filters, kernel_size, strides, bn, activation=None):
# input data_format must be NCHW, cannot check now
padding = "SAME" if strides > 1 or kernel_size > 1 else "VALID"
output = _conv2d(name, input, filters, kernel_size, strides, padding)
# print(name)
if bn:
output = _batch_norm(output, name + "_bn")
if activation == "Relu":
output = flow.nn.relu(output)

return output

# three conv2d_affine blocks (conv, bn, relu)
def bottleneck_transformation(input, filter1, filter2, filter3,
strides, bn, model_weight, optimizer):
global NAME_NUMBER
a = conv2d_affine(input, "conv"+str(NAME_NUMBER), filter1, 1, 1, bn, activation="Relu",)
# record the model weights of this conv layer
if model_weight == True:
modelWeight.addConv(index=NAME_NUMBER,
dtype=input.dtype,
shape1=(filter1, input.shape[1], 1, 1),
shape2=(filter1,),
optimizer=optimizer)
NAME_NUMBER += 1
b = conv2d_affine(a, "conv"+str(NAME_NUMBER), filter2, 3, strides, bn, activation="Relu",)
# record the model weights of this conv layer
if model_weight == True:
modelWeight.addConv(index=NAME_NUMBER,
dtype=a.dtype,
shape1=(filter2, a.shape[1], 3, 3),
shape2=(filter2,),
optimizer=optimizer)
NAME_NUMBER += 1
c = conv2d_affine(b, "conv"+str(NAME_NUMBER), filter3, 1, 1, bn)
# record the model weights of this conv layer
if model_weight == True:
modelWeight.addConv(index=NAME_NUMBER,
dtype=b.dtype,
shape1=(filter3, b.shape[1], 1, 1),
shape2=(filter3,),
optimizer=optimizer)
NAME_NUMBER += 1
# print(a.shape, b.shape, c.shape, strides)
return c


def residual_block(input, index, i, filter1, filter2, filter3,
strides_init, bn, model_weight, optimizer):
# if strides_init != 1 or block_name == "res2_0":
# # one conv2d_affine block (conv, bn, relu)
# shortcut = conv2d_affine(input, block_name + "_branch1", 1, 1, filter3, 1, strides_init)
# else:
# shortcut = input
# transform the input so that its shape matches the output of the three conv layers and the two can be added
shortcut = conv2d_affine(input, "conv_shortcut"+str(index)+"_"+str(i), filter3, 3,
strides_init, bn)
# record the model weights of the shortcut layer
if model_weight == True:
modelWeight.addConv(index="_shortcut"+str(index)+"_"+str(i),
dtype=input.dtype,
shape1=(filter3, input.shape[1], 3, 3),
shape2=(filter3,),
optimizer=optimizer)
# three conv2d_affine blocks (conv, bn, relu)
bottleneck = bottleneck_transformation(input, filter1, filter2, filter3,
strides_init, bn, model_weight, optimizer)
# print(bottleneck.shape, shortcut.shape, strides_init, i)
return flow.nn.relu(bottleneck + shortcut)


def residual_stage(input, index, counts, cfg, bn, model_weight, optimizer, stride_init=2):
output = input
for i in range(counts):
# block_name = "%s_%d" % (stage_name, i)
output = residual_block(output, index, i, cfg[i*3+0], cfg[i*3+1], cfg[i*3+2],
stride_init if i == 0 else 1, bn, model_weight, optimizer)
return output

# main body of ResNet-50
def resnet_conv_x_body(input, cfg, bn, model_weight, optimizer, on_stage_end=lambda x: x):
output = input
for index, (counts, cfg_i) in enumerate(
zip(BLOCK_COUNTS, cfg)
):
# stage_name is res2/res3/res4/res5
# stage_name = "res%d" % (i + 2)
output = residual_stage(output, index, counts, cfg_i, bn, model_weight,
optimizer, 1 if index == 0 else 2)
on_stage_end(output)
return output

# the initial (stem) convolution layer
def resnet_stem(input, bn, model_weight, optimizer):
conv_stem = _conv2d("conv_stem", input, 64, 7, 2)
if bn:
conv_stem = _batch_norm(conv_stem, "conv_stem_bn")
conv_stem = flow.nn.relu(conv_stem)
pool1 = flow.nn.max_pool2d(
conv_stem, ksize=3, strides=2, padding="VALID", data_format="NCHW", name="pool1",
)
# record the model weights of the stem convolution layer
if model_weight == True:
modelWeight.addConv(index="_stem", dtype=input.dtype,
shape1=(64, input.shape[1], 7, 7),
shape2=(64,),
optimizer=optimizer)
return pool1

def resnet50(images, cfg, optimizer, trainable=True, need_transpose=False,
model_weight=True, bn=True):
if need_transpose:
images = flow.transpose(images, name="transpose", perm=[0, 3, 1, 2])

global NAME_NUMBER
NAME_NUMBER = 0
stem = resnet_stem(images, bn, model_weight, optimizer)
body = resnet_conv_x_body(stem, cfg, bn, model_weight, optimizer, lambda x: x)
pool5 = flow.nn.avg_pool2d(
body, ksize=7, strides=1, padding="VALID", data_format="NCHW", name="pool5",
)
pool5 = flow.reshape(pool5, [pool5.shape[0], -1])
dense0 = flow.layers.dense(
inputs=pool5,
units=cfg[4],
use_bias=True,
kernel_initializer=flow.xavier_uniform_initializer(),
bias_initializer=flow.zeros_initializer(),
trainable=trainable,
name="dense0",)
def getTypeAndShape(inputs,units):
in_shape = inputs.shape
in_num_axes = len(in_shape)
inputs = (flow.reshape(inputs, (-1, in_shape[-1])) if in_num_axes > 2 else inputs)
shape=(units, inputs.shape[1])
dtype=inputs.dtype
return shape,dtype
# record the model weights of the dense layer
if model_weight == True:
shape_list = []
dtype_list = []
shape_weight, dtype = getTypeAndShape(pool5, cfg[4])
shape_list.append(shape_weight)
dtype_list.append(dtype)
modelWeight.addDense(dtype_old=dtype_list, shape=shape_list,
optimizer=optimizer, dense_num=1)

return dense0

+ 200
- 0
model_compress/model_compress/ChannelSlimming/model/cnn/vgg_model.py

@@ -0,0 +1,200 @@
"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import oneflow as flow
from util.model_weights import modelWeight

def _batch_norm(inputs, name=None, trainable=True):
return flow.layers.batch_normalization(
inputs=inputs,
axis=1,
momentum=0.997,
epsilon=1.001e-5,
center=True,
scale=True,
# gamma_initializer=0,
gamma_regularizer=flow.regularizers.l1(1e-4),
trainable=trainable,
name=name,
)


def conv2d_layer(
name,
input,
filters,
kernel_size=3,
strides=1,
padding="SAME",
data_format="NCHW",
dilation_rate=1,
activation="Relu",
use_bias=True,
weight_initializer=flow.variance_scaling_initializer(2, 'fan_out', 'random_normal', data_format="NCHW"),
bias_initializer=flow.zeros_initializer(),
bn=True,
):
weight_shape = (filters, input.shape[1], kernel_size, kernel_size)
weight = flow.get_variable(
name + "_weight",
shape=weight_shape,
dtype=input.dtype,
initializer=weight_initializer,
)
output = flow.nn.conv2d(
input, weight, strides, padding, data_format, dilation_rate, name=name
)
if use_bias:
bias = flow.get_variable(
name + "_bias",
shape=(filters,),
dtype=input.dtype,
initializer=bias_initializer,
)
output = flow.nn.bias_add(output, bias, data_format)

if activation is not None:
if activation == "Relu":
if bn:
output = _batch_norm(output, name + "_bn")
# flow.watch(output)
output = flow.nn.relu(output)
else:
output = flow.nn.relu(output)
else:
raise NotImplementedError

return output


def _conv_block(in_blob, index, filters, conv_times, optimizer, model_weight, bn=True):
conv_block = []
conv_block.insert(0, in_blob)
for i in range(conv_times):
conv_i = conv2d_layer(
name="conv{}".format(index),
input=conv_block[i],
filters=filters[index],
kernel_size=3,
strides=1,
bn=bn,
)
if model_weight == True:
modelWeight.addConv(index=index,
dtype=conv_block[i].dtype,
shape1=(filters[index], conv_block[i].shape[1], 3, 3),
shape2=(filters[index],),
optimizer=optimizer)
# shape_weight=(filters[index], conv_block[i].shape[1], 3, 3)
# modelWeight.add("conv{}".format(index)+'-weight',conv_block[i].dtype,shape_weight)
# modelWeight.add("conv{}".format(index)+'-bias',conv_block[i].dtype,(filters,))
# modelWeight.add("conv{}".format(index)+'_bn-gamma',conv_block[i].dtype,(filters,))
# modelWeight.add("conv{}".format(index)+'_bn-beta',conv_block[i].dtype,(filters,))
# modelWeight.add("conv{}".format(index)+'_bn-moving_variance',conv_block[i].dtype,(filters,))
# modelWeight.add("conv{}".format(index)+'_bn-moving_mean',conv_block[i].dtype,(filters,))
conv_block.append(conv_i)
index += 1

return conv_block


def vgg(images, cfg, optimizer, trainable=True, need_transpose=False,
training=True, wd=1.0/32768, model_weight=True, bn=True):
if need_transpose:
images = flow.transpose(images, name="transpose", perm=[0, 3, 1, 2])
conv1 = _conv_block(images, 0, cfg, 2, optimizer, model_weight, bn=bn)
pool1 = flow.nn.max_pool2d(conv1[-1], 2, 2, "VALID", "NCHW", name="pool1")
conv2 = _conv_block(pool1, 2, cfg, 2, optimizer, model_weight, bn=bn)
pool2 = flow.nn.max_pool2d(conv2[-1], 2, 2, "VALID", "NCHW", name="pool2")

conv3 = _conv_block(pool2, 4, cfg, 3, optimizer, model_weight, bn=bn)
pool3 = flow.nn.max_pool2d(conv3[-1], 2, 2, "VALID", "NCHW", name="pool3")

conv4 = _conv_block(pool3, 7, cfg, 3, optimizer, model_weight, bn=bn)
pool4 = flow.nn.max_pool2d(conv4[-1], 2, 2, "VALID", "NCHW", name="pool4")

conv5 = _conv_block(pool4, 10, cfg, 3, optimizer, model_weight, bn=bn)
pool5 = flow.nn.max_pool2d(conv5[-1], 2, 2, "VALID", "NCHW", name="pool5")

pool5 = flow.reshape(pool5, [pool5.shape[0], -1])
dense0 = flow.layers.dense(
inputs=pool5,
units=cfg[13],
activation=flow.nn.relu,
use_bias=True,
kernel_initializer=flow.random_normal_initializer(mean=0, stddev=0.1),
trainable=trainable,
name="dense0",
)

dense1 = flow.layers.dense(
inputs=dense0,
units=cfg[14],
activation=flow.nn.relu,
use_bias=True,
kernel_initializer=flow.random_normal_initializer(mean=0, stddev=0.1),
trainable=trainable,
name="dense1",
)
dense2 = flow.layers.dense(
inputs=dense1,
units=cfg[15],
use_bias=True,
kernel_initializer=flow.random_normal_initializer(mean=0, stddev=0.1),
trainable=trainable,
name="dense2",
)
# flow.watch(fc8)
def getTypeAndShape(inputs,units):
in_shape = inputs.shape
in_num_axes = len(in_shape)
inputs = (flow.reshape(inputs, (-1, in_shape[-1])) if in_num_axes > 2 else inputs)
shape=(units, inputs.shape[1])
dtype=inputs.dtype
return shape,dtype
if model_weight == True:
shape_list = []
dtype_list = []
shape_weight, dtype = getTypeAndShape(pool5, cfg[13])
shape_list.append(shape_weight)
dtype_list.append(dtype)
shape_weight, dtype = getTypeAndShape(dense0, cfg[14])
shape_list.append(shape_weight)
dtype_list.append(dtype)
shape_weight, dtype = getTypeAndShape(dense1, cfg[15])
shape_list.append(shape_weight)
dtype_list.append(dtype)
modelWeight.addDense(dtype_old=dtype_list, shape=shape_list,
optimizer=optimizer, dense_num=3)


# shape_weight,dtype=getTypeAndShape(pool5,4096)
# modelWeight.add('fc1'+'-weight',dtype,shape_weight)
# modelWeight.add('fc1'+'-bias',dtype,(4096,))

# shape_weight,dtype=getTypeAndShape(fc6,4096)
# modelWeight.add('fc2'+'-weight',dtype,shape_weight)
# modelWeight.add('fc2'+'-bias',dtype,(4096,))

# shape_weight,dtype=getTypeAndShape(fc7,1000)
# modelWeight.add('fc_final'+'-weight',dtype,shape_weight)
# modelWeight.add('fc_final'+'-bias',dtype,(1000,))

return dense2

+ 118
- 0
model_compress/model_compress/ChannelSlimming/model/dnn/dnn_model.py

@@ -0,0 +1,118 @@
"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import oneflow as flow
from util.model_weights import modelWeight

# This is a DNN with 2 hidden layers: the first layer uses the relu activation function and the second layer uses no activation

def dnn_2(input_tensor, cfg, optimizer, model_weight=True, trainable=True):
input_tensor = flow.reshape(input_tensor, [input_tensor.shape[0], -1])
dense0 = flow.layers.dense(
inputs=input_tensor,
units=cfg[0],
activation=flow.nn.relu, use_bias=True,
kernel_initializer=flow.random_normal_initializer(mean=0, stddev=0.1),
trainable=trainable,
name="dense0")

dense1 = flow.layers.dense(
inputs=dense0,
units=cfg[1],
activation=None,
use_bias=True,
kernel_initializer=flow.random_normal_initializer(mean=0, stddev=0.1),
trainable=trainable,
name="dense1")
def getTypeAndShape(inputs,units):
in_shape = inputs.shape
in_num_axes = len(in_shape)
inputs = (flow.reshape(inputs, (-1, in_shape[-1])) if in_num_axes > 2 else inputs)
shape=(units, inputs.shape[1])
dtype=inputs.dtype
return shape,dtype
if model_weight == True:
shape_list = []
dtype_list = []
shape_weight, dtype = getTypeAndShape(input_tensor, cfg[0])
shape_list.append(shape_weight)
dtype_list.append(dtype)
shape_weight, dtype = getTypeAndShape(dense0, cfg[1])
shape_list.append(shape_weight)
dtype_list.append(dtype)
modelWeight.addDense(dtype_old=dtype_list, shape=shape_list,
optimizer=optimizer, dense_num=2)
return dense1
def dnn_4(input_tensor, cfg, optimizer, model_weight=True, trainable=True):
input_tensor = flow.reshape(input_tensor, [input_tensor.shape[0], -1])
dense0 = flow.layers.dense(
inputs=input_tensor,
units=cfg[0],
activation=flow.nn.relu, use_bias=True,
kernel_initializer=flow.random_normal_initializer(mean=0, stddev=0.1),
trainable=trainable,
name="dense0")
dense1 = flow.layers.dense(
inputs=dense0,
units=cfg[1],
activation=flow.nn.relu, use_bias=True,
kernel_initializer=flow.random_normal_initializer(mean=0, stddev=0.1),
trainable=trainable,
name="dense1")
dense2 = flow.layers.dense(
inputs=dense1,
units=cfg[2],
activation=flow.nn.relu, use_bias=True,
kernel_initializer=flow.random_normal_initializer(mean=0, stddev=0.1),
trainable=trainable,
name="dense2")

dense3 = flow.layers.dense(
inputs=dense2,
units=cfg[3],
activation=None,
use_bias=True,
kernel_initializer=flow.random_normal_initializer(mean=0, stddev=0.1),
trainable=trainable,
name="dense3")
def getTypeAndShape(inputs,units):
in_shape = inputs.shape
in_num_axes = len(in_shape)
inputs = (flow.reshape(inputs, (-1, in_shape[-1])) if in_num_axes > 2 else inputs)
shape=(units, inputs.shape[1])
dtype=inputs.dtype
return shape,dtype
if model_weight == True:
shape_list = []
dtype_list = []
shape_weight, dtype = getTypeAndShape(input_tensor, cfg[0])
shape_list.append(shape_weight)
dtype_list.append(dtype)
shape_weight, dtype = getTypeAndShape(dense0, cfg[1])
shape_list.append(shape_weight)
dtype_list.append(dtype)
shape_weight, dtype = getTypeAndShape(dense1, cfg[2])
shape_list.append(shape_weight)
dtype_list.append(dtype)
shape_weight, dtype = getTypeAndShape(dense2, cfg[3])
shape_list.append(shape_weight)
dtype_list.append(dtype)
modelWeight.addDense(dtype_old=dtype_list, shape=shape_list,
optimizer=optimizer, dense_num=4)
return dense3
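# Usage sketch (illustrative assumption, mirroring how the training scripts pass a
# per-layer width list `cfg`; `images` stands for the input blob):
#   logits = dnn_2(images, cfg=[128, 10], optimizer="momentum")
#   logits = dnn_4(images, cfg=[512, 256, 128, 10], optimizer="momentum")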

+ 6161
- 0
model_compress/model_compress/ChannelSlimming/myData/randomData255/test.json
File diff suppressed because it is too large


+ 6161
- 0
model_compress/model_compress/ChannelSlimming/myData/randomData255/train.json
File diff suppressed because it is too large


+ 1
- 0
model_compress/model_compress/ChannelSlimming/ofData/readme.md

@@ -0,0 +1 @@
# OneFlow-format datasets

+ 202
- 0
model_compress/model_compress/ChannelSlimming/ofrecordMake.py

@@ -0,0 +1,202 @@
"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import oneflow.core.record.record_pb2 as ofrecord
import six
import struct
import numpy as np
import json
import os
import argparse

parser = argparse.ArgumentParser()

parser.add_argument("--dataName", default="randomData1",
type=str, help="my data name")

args = parser.parse_args()

#%%
def int32_feature(value):
if not isinstance(value, (list, tuple)):
value = [value]
return ofrecord.Feature(int32_list=ofrecord.Int32List(value=value))

def int64_feature(value):
if not isinstance(value, (list, tuple)):
value = [value]
return ofrecord.Feature(int64_list=ofrecord.Int64List(value=value))


def float_feature(value):
if not isinstance(value, (list, tuple)):
value = [value]
return ofrecord.Feature(float_list=ofrecord.FloatList(value=value))


def double_feature(value):
if not isinstance(value, (list, tuple)):
value = [value]
return ofrecord.Feature(double_list=ofrecord.DoubleList(value=value))


def bytes_feature(value):
if not isinstance(value, (list, tuple)):
value = [value]
if not six.PY2:
if isinstance(value[0], str):
value = [x.encode() for x in value]
return ofrecord.Feature(bytes_list=ofrecord.BytesList(value=value))

#%% randomly generate a training set of 1000 and a test set of 200 samples of size 3*32*32, with values in the range 0-1
def createRandomData_1():
data_train = np.random.random((1000, 3*32*32))
label_train = np.random.randint(0, 10, (1000))
data_train = np.around(data_train, 4)
dict_train = {}
dict_train["data"] = data_train.tolist()
dict_train["label"] = label_train.tolist()
dict_train["shape"] = [3, 32, 32]
with open("./myData/randomData1/train.json", "w") as f_train:
json.dump(dict_train, f_train, indent=4)
data_test = np.random.random((200, 3*32*32))
label_test = np.random.randint(0, 10, (200))
data_test = np.around(data_test, 4)
dict_test = {}
dict_test["data"] = data_test.tolist()
dict_test["label"] = label_test.tolist()
dict_test["shape"] = [3, 32, 32]
with open("./myData/randomData1/test.json", "w") as f_test:
json.dump(dict_test, f_test, indent=4)
#%% randomly generate a training set of 1000 and a test set of 200 samples of size 3*32*32, with values in the range 1-255
def createRandomData_255():
data_train = np.random.randint(1, 255, (1000, 3*32*32))
label_train = np.random.randint(0, 10, (1000))
data_train = np.around(data_train, 4)
dict_train = {}
dict_train["data"] = data_train.tolist()
dict_train["label"] = label_train.tolist()
dict_train["shape"] = [3, 32, 32]
with open("./myData/randomData255_small/train.json", "w") as f_train:
json.dump(dict_train, f_train, indent=4)
data_test = np.random.randint(1, 255, (200, 3*32*32))
label_test = np.random.randint(0, 10, (200))
data_test = np.around(data_test, 4)
dict_test = {}
dict_test["data"] = data_test.tolist()
dict_test["label"] = label_test.tolist()
dict_test["shape"] = [3, 32, 32]
with open("./myData/randomData255_small/test.json", "w") as f_test:
json.dump(dict_test, f_test, indent=4)

#%% cal mean, std
def mean_std(data, shape):
data_reshape = data.reshape(-1, shape[0], shape[1], shape[2])
mean_list,std_list = [],[]
for i in range(shape[0]):
mean = np.mean(data_reshape[:,i,:,:])
std = np.std(data_reshape[:,i,:,:])
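# if the data was stored in the [0, 1] range, rescale so the recorded mean/std are in 0-255 pixel units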
if mean <= 1:
mean_list.append(np.around(mean*255, 2))
std_list.append(np.around(std*255, 2))
else:
mean_list.append(np.around(mean, 2))
std_list.append(np.around(std, 2))
return mean_list, std_list

#%% convert data to ofData
def data2of_part(datas, labels, save_path):
f = open(save_path, "wb")
for loop in range(0, len(labels)):
image = datas[loop].tolist()
label = [labels[loop]]

topack = {
'images': float_feature(image),
'labels': int32_feature(label),
}

ofrecord_features = ofrecord.OFRecord(feature=topack)
serilizedBytes = ofrecord_features.SerializeToString()

length = ofrecord_features.ByteSize()
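# each record is framed as an 8-byte length header (struct format 'q') followed by
# the serialized OFRecord message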

f.write(struct.pack("q", length))
f.write(serilizedBytes)

print("Write ofData to", save_path)
f.close()

#%% load mydata and write ofData
def data2of(dataName):
# load/save path
load_path_train = "./myData/" + dataName + "/train.json"
load_path_test = "./myData/" + dataName + "/test.json"
save_path_train = "./ofData/" + dataName + "/train/"
save_path_test = "./ofData/" + dataName + "/test/"
if not os.path.exists(save_path_train):
os.makedirs(save_path_train)
print("create folder", save_path_train)
if not os.path.exists(save_path_test):
os.makedirs(save_path_test)
print("create folder", save_path_test)
# load data
with open(load_path_train) as f_train:
train_dict = json.load(f_train)
with open(load_path_test) as f_test:
test_dict = json.load(f_test)
data_train = np.array(train_dict["data"])
label_train = np.array(train_dict["label"])
data_test = np.array(test_dict["data"])
label_test = np.array(test_dict["label"])
data = np.append(data_train, data_test, axis=0)
label = np.append(label_train, label_test)
# data 2 ofData
data2of_part(data_train, label_train, save_path_train+"part-00000")
data2of_part(data_test, label_test, save_path_test+"part-00000")
# write meta information
shape = train_dict["shape"]
mean_list, std_list = mean_std(data, shape)
dict_meta = {}
dict_meta["num_classes"] = len(set(label))
dict_meta["image_shape"] = shape
dict_meta["rgb_mean"] = mean_list
dict_meta["rgb_std"] = std_list
dict_meta["num_examples"] = data_train.shape[0]
dict_meta["num_val_examples"] = data_test.shape[0]
with open("./ofData/" + dataName + "/meta.json", "w") as f_meta:
json.dump(dict_meta, f_meta, indent=4)
print("Write meta infomation to", "./ofData/" + dataName + "/meta.json")


def main():
# load_path = "./myData/data_batch_1"
# d = unpickle_cifar(load_path)
# data = d[b'data']
# print(type(data))
# labels = d[b'labels']
# print(data.shape)
# createRandomData_1()
# createRandomData_255()
dataName = args.dataName
data2of(dataName)

if __name__ == "__main__":
main()

+ 4
- 0
model_compress/model_compress/ChannelSlimming/output/snapshots/readme.md

@@ -0,0 +1,4 @@
# Model output folder

- Model output folder; model files are stored under the snapshots folder
- Organized first by model, then by dataset under each model; each dataset folder holds the base model model_base, the pruned model model_prune and the fine-tuned model model_refine.

+ 225
- 0
model_compress/model_compress/ChannelSlimming/prune/pruneAlexnet.py

@@ -0,0 +1,225 @@
"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import numpy as np
import os
from util.model_weights import modelWeight
import util.prune_algorithm as pa

parser = argparse.ArgumentParser()
dtype_dict={2:np.float32,
3:np.float64,
4:np.int8,
5:np.int32,
6:np.int64,
9:np.float16}

parser.add_argument("--bn", default=False,
type=str, help="Whether to use use bn layer")
parser.add_argument("--prune_method", default='bn',
type=str, help="method of prune(channel_prune_bn, channel_prune_conv)")
parser.add_argument("--model_load_dir", default = './output/snapshots/model_base/snapshot_last',
type = str, required = False, help = "Path of base oneflow model")
parser.add_argument("--model_save_dir", default = './output/snapshots/model_prune', type = str,
required = False, help = "Path to the output OneFlow model.")
parser.add_argument("--percent", default = 0.7, type = float, required = False,
help = "scale sparse rate (default: 0.7)")
parser.add_argument("--optimizer", type=str, default="momentum", required=False,
help="sgd, adam, momentum")
args = parser.parse_args()

def _SaveWeightBlob2File(blob, folder, var):
if not os.path.exists(folder):
os.makedirs(folder)
filename = os.path.join(folder, var)

f = open(filename, 'wb')
f.write(blob.tobytes())
f.close()

def _LoadWeightBlob2Numpy(shape, folder, dtype):
if not os.path.exists(folder):
print('fail to find', folder)
filename = os.path.join(folder, 'out')
f = open(filename, 'rb')
n = np.fromfile(f, dtype=dtype)
n = n.reshape(shape)
f.close()
return n

def name2array(name, weights_dict):
folder=os.path.join(args.model_load_dir, name)
profile_dict = weights_dict[name]
shape=profile_dict["shape"]
dtype=profile_dict["dtype"]
dtype=dtype_dict[dtype]
array = _LoadWeightBlob2Numpy(shape,folder,dtype)
return array, dtype, shape

# build the list of parameter names to prune
def makeNameList(pruneName, nameList, name):
if pruneName == '_bn-gamma':
nameList.append(name+"_weight")
elif pruneName == "_weight":
if args.bn.lower() in ('yes', 'true', 't', 'y', '1'):
nameList.append(name+"_bn-gamma")
nameList.append(name+pruneName)
nameList.append(name+"_bias")
# optionally add the matching bn-layer parameters
if args.bn.lower() in ('yes', 'true', 't', 'y', '1'):
nameList.append(name+"_bn-beta")
nameList.append(name+"_bn-moving_variance")
nameList.append(name+"_bn-moving_mean")
# extra parameters when the optimizer is adam
if args.optimizer == 'adam':
nameList.append(name+"_weight-v")
nameList.append(name+"_weight-m")
nameList.append(name+"_bias-v")
nameList.append(name+"_bias-m")
# optionally add the matching bn-layer parameters
if args.bn.lower() in ('yes', 'true', 't', 'y', '1'):
nameList.append(name+"_bn-beta-v")
nameList.append(name+"_bn-beta-m")
nameList.append(name+"_bn-gamma-v")
nameList.append(name+"_bn-gamma-m")
# extra parameters when the optimizer is momentum
elif args.optimizer == 'momentum':
nameList.append(name+"_weight-momentum")
nameList.append(name+"_bias-momentum")
# optionally add the matching bn-layer parameters
if args.bn.lower() in ('yes', 'true', 't', 'y', '1'):
nameList.append(name+"_bn-beta-momentum")
nameList.append(name+"_bn-gamma-momentum")
else:
if args.optimizer != 'sgd':
print('Error: optimizer!')
return nameList
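# Example (illustrative): makeNameList("_bn-gamma", [], "conv0") with --optimizer=momentum
# and --bn=True returns ["conv0_weight", "conv0_bn-gamma", "conv0_bias", "conv0_bn-beta",
# "conv0_bn-moving_variance", "conv0_bn-moving_mean", "conv0_weight-momentum",
# "conv0_bias-momentum", "conv0_bn-beta-momentum", "conv0_bn-gamma-momentum"]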
def prune():
# get the threshold for the chosen pruning method
if args.prune_method == 'bn':
thre = pa.get_pruneThre_bn()
elif args.prune_method == 'conv_avg':
thre = pa.get_pruneThre_conv_avg()
elif args.prune_method == 'conv_all':
thre = pa.get_pruneThre_conv_all()
elif args.prune_method == 'conv_max':
thre = pa.get_pruneThre_conv_max()
of_weight_path = args.model_load_dir.rsplit("/",1)[0] + "/weights_profile_path"
weights_dict = modelWeight.load(of_weight_path)
modelWeight.weights_dict = {}
fcRemoveIndexs = []
fcDivideNum = 0
removeIndexs = []
lastRemoveIndexs = []
beforePrune = 0
afterPrune = 0
pruneName = ''
if "bn" in args.prune_method:
pruneName = "_bn-gamma"
elif "conv" in args.prune_method or args.prune_method=="random":
pruneName = "_weight"
for name, profile_dict in weights_dict.items():
if name.startswith("conv") and name.endswith(pruneName):
a, dtype, shape = name2array(name, weights_dict)
lastRemoveIndexs = removeIndexs
# get removeIndexs for the chosen pruning method
if args.prune_method == 'bn':
removeIndexs = pa.get_removeIndex_bn(a, thre)
elif args.prune_method == "conv_avg":
removeIndexs = pa.get_removeIndex_conv_avg(a, shape, thre)
elif args.prune_method == "conv_all":
removeIndexs = pa.get_removeIndex_conv_all(a, shape, thre)
elif args.prune_method == "conv_max":
removeIndexs = pa.get_removeIndex_conv_max(a, shape, thre)
elif args.prune_method == "random":
removeIndexs = pa.get_removeIndex_random(shape)
elif args.prune_method == "conv_similarity":
removeIndexs = pa.get_removeIndex_conv_similarity(a, shape)
elif args.prune_method == "bn_similarity":
removeIndexs = pa.get_removeIndex_bn_similarity(a, shape)
elif args.prune_method == "conv_threshold":
removeIndexs = pa.get_removeIndex_conv_threshold(a, shape, threSet=0.06)
if len(removeIndexs) == len(a):
removeIndexs = np.delete(removeIndexs, 0)
if name == "conv4"+pruneName:
fcRemoveIndexs = removeIndexs
fcDivideNum = 256

# name list of layers to prune
name = name.split("_")[0].split("-")[0]
nameList = []
nameList = makeNameList(pruneName, nameList, name)

# do the actual pruning
for name in nameList:
a, dtype, shape = name2array(name, weights_dict)
if name.endswith("weight") or name.endswith("weight-v") or \
name.endswith("weight-m") or name.endswith("weight-momentum"):
b = np.delete(a, removeIndexs, 0)
b = np.delete(b, lastRemoveIndexs, 1)
if name.endswith("weight"):
beforePrune += a.shape[0]
afterPrune += b.shape[0]
else:
b = np.delete(a, removeIndexs)
print(name+" pruned: shape from", a.shape, "-->", b.shape)
if args.model_save_dir:
folder = os.path.join(args.model_save_dir, "model", name)
_SaveWeightBlob2File(b, folder, 'out')
modelWeight.add(name, list(dtype_dict.keys())[list(dtype_dict.values()).index(dtype)], b.shape)
# prune the first dense0 layer
elif name.startswith("dense"):
if name in ['dense0-weight', 'dense0-weight-v',
'dense0-weight-m', 'dense0-weight-momentum']:
fcRemoveIndexsNew = []
a, dtype, shape = name2array(name, weights_dict)
num = int(a.shape[1]/fcDivideNum)
for index in fcRemoveIndexs:
fcRemoveIndexsNew += [index+fcDivideNum*i for i in range(num)]
b = np.delete(a, fcRemoveIndexsNew, 1)
else:
a, dtype, shape = name2array(name, weights_dict)
b = a
print(name+" pruned: shape from", a.shape, "-->", b.shape)
if args.model_save_dir:
folder = os.path.join(args.model_save_dir, "model", name)
_SaveWeightBlob2File(b, folder, 'out')
modelWeight.add(name, list(dtype_dict.keys())[list(dtype_dict.values()).index(dtype)], b.shape)
print("Pruning done! Number of channel from", beforePrune, "-->", afterPrune)
print("Real Pruning rate:", 100*(beforePrune-afterPrune)/beforePrune, "%")
weights_profile_path = os.path.join(args.model_save_dir, "weights_profile_path")
modelWeight.save(weights_profile_path)
os.system('cp -r {0}/System-Train-TrainStep-TrainNet {1}/System-Train-TrainStep-TrainNet '.format(args.model_load_dir, os.path.join(args.model_save_dir, "model")))
def main():
prune()

if __name__ == "__main__":
main()

+ 158
- 0
model_compress/model_compress/ChannelSlimming/prune/pruneDnn.py

@@ -0,0 +1,158 @@
"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import numpy as np
import os
from util.model_weights import modelWeight
import util.prune_algorithm as pa


parser = argparse.ArgumentParser()
dtype_dict={2:np.float32,
3:np.float64,
4:np.int8,
5:np.int32,
6:np.int64,
9:np.float16}

parser.add_argument("--model_load_dir", default = './output/snapshots/model_base/snapshot_last',
type = str, required = False, help = "Path of base oneflow model")
parser.add_argument("--model_save_dir", default = './output/snapshots/model_prune', type = str,
required = False, help = "Path to the output OneFlow model.")
parser.add_argument("--percent", default = 0.7, type = float, required = False,
help = "scale sparse rate (default: 0.7)")
parser.add_argument("--optimizer", type=str, default="momentum", required=False,
help="sgd, adam, momentum")
args = parser.parse_args()

def _SaveWeightBlob2File(blob, folder, var):
if not os.path.exists(folder):
os.makedirs(folder)
filename = os.path.join(folder, var)

f = open(filename, 'wb')
f.write(blob.tobytes())
f.close()

def _LoadWeightBlob2Numpy(shape, folder, dtype):
if not os.path.exists(folder):
print('fail to find', folder)
filename = os.path.join(folder, 'out')
f = open(filename, 'rb')
n = np.fromfile(f, dtype=dtype)
n = n.reshape(shape)
f.close()
return n

def name2array(name, weights_dict):
folder=os.path.join(args.model_load_dir, name)
profile_dict = weights_dict[name]
shape=profile_dict["shape"]
dtype=profile_dict["dtype"]
dtype=dtype_dict[dtype]
array = _LoadWeightBlob2Numpy(shape,folder,dtype)
return array, dtype, shape

# build the list of parameter names to prune
def makeNameList(nameList, name):
nameList.append(name+"-weight")
nameList.append(name+"-bias")
# extra parameters when the optimizer is adam
if args.optimizer == 'adam':
nameList.append(name+"-weight-v")
nameList.append(name+"-weight-m")
nameList.append(name+"-bias-v")
nameList.append(name+"-bias-m")
# extra parameters when the optimizer is momentum
elif args.optimizer == 'momentum':
nameList.append(name+"-weight-momentum")
nameList.append(name+"-bias-momentum")
else:
if args.optimizer != 'sgd':
print('Error: optimizer!')
return nameList
def prune():
# get the pruning threshold
thre = pa.get_pruneThre_fc()
of_weight_path = args.model_load_dir.rsplit("/",1)[0] + "/weights_profile_path"
weights_dict = modelWeight.load(of_weight_path)
modelWeight.weights_dict = {}

removeIndexs = []
lastRemoveIndexs = []
beforePrune = 0
afterPrune = 0
dictLen = len(weights_dict)
numDiv = 0
if args.optimizer == 'adam':
numDiv = 6
elif args.optimizer == 'momentum':
numDiv = 4
else:
numDiv = 2
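# each dense layer contributes 2 blobs (weight, bias) with sgd, 4 with momentum and 6 with adam,
# so int(dictLen/numDiv) is the number of dense layers; the last (output) layer is detected below
# so that its output units are never pruned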
for name, profile_dict in weights_dict.items():
if name.startswith("dense") and name.endswith("-weight"):
if name.startswith("dense"+str(int(dictLen/numDiv)-1)) and name.endswith("-weight"):
lastRemoveIndexs = removeIndexs
removeIndexs = []
else:
a, dtype, shape = name2array(name, weights_dict)
lastRemoveIndexs = removeIndexs
# get removeIndexs for the pruning method
removeIndexs = pa.get_removeIndex_fc(a, shape, thre)
if len(removeIndexs) == len(a):
removeIndexs = np.delete(removeIndexs, 0)

# name list of layers to prune
name = name.split("_")[0].split("-")[0]
nameList = []
nameList = makeNameList(nameList, name)

# do the actual pruning
i = 0
for name in nameList:
a, dtype, shape = name2array(name, weights_dict)
if "weight" in name:
b = np.delete(a, removeIndexs, 0)
b = np.delete(b, lastRemoveIndexs, 1)
else:
b = np.delete(a, removeIndexs)
if i == 0:
beforePrune += a.shape[0]
afterPrune += b.shape[0]
print(name+" pruned: shape from", a.shape, "-->", b.shape)
if args.model_save_dir:
folder = os.path.join(args.model_save_dir, "model", name)
_SaveWeightBlob2File(b, folder, 'out')
modelWeight.add(name, list(dtype_dict.keys())[list(dtype_dict.values()).index(dtype)], b.shape)
i += 1
print("Pruning done! Number of channel from", beforePrune, "-->", afterPrune)
print("Real Pruning rate:", 100*(beforePrune-afterPrune)/beforePrune, "%")
weights_profile_path = os.path.join(args.model_save_dir, "weights_profile_path")
modelWeight.save(weights_profile_path)
os.system('cp -r {0}/System-Train-TrainStep-TrainNet {1}/System-Train-TrainStep-TrainNet '.format(args.model_load_dir, os.path.join(args.model_save_dir, "model")))
def main():
prune()

if __name__ == "__main__":
main()

+ 226
- 0
model_compress/model_compress/ChannelSlimming/prune/pruneLenet.py

@@ -0,0 +1,226 @@
"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import numpy as np
import os
from util.model_weights import modelWeight
import util.prune_algorithm as pa


parser = argparse.ArgumentParser()
dtype_dict={2:np.float32,
3:np.float64,
4:np.int8,
5:np.int32,
6:np.int64,
9:np.float16}

parser.add_argument("--bn", default=False,
type=str, help="Whether to use use bn layer")
parser.add_argument("--prune_method", default='bn',
type=str, help="method of prune(channel_prune_bn, channel_prune_conv)")
parser.add_argument("--model_load_dir", default = './output/snapshots/model_base/snapshot_last',
type = str, required = False, help = "Path of base oneflow model")
parser.add_argument("--model_save_dir", default = './output/snapshots/model_prune', type = str,
required = False, help = "Path to the output OneFlow model.")
parser.add_argument("--percent", default = 0.7, type = float, required = False,
help = "scale sparse rate (default: 0.7)")
parser.add_argument("--optimizer", type=str, default="momentum", required=False,
help="sgd, adam, momentum")
args = parser.parse_args()

def _SaveWeightBlob2File(blob, folder, var):
if not os.path.exists(folder):
os.makedirs(folder)
filename = os.path.join(folder, var)

f = open(filename, 'wb')
f.write(blob.tobytes())
f.close()

def _LoadWeightBlob2Numpy(shape, folder, dtype):
if not os.path.exists(folder):
print('fail to find', folder)
filename = os.path.join(folder, 'out')
f = open(filename, 'rb')
n = np.fromfile(f, dtype=dtype)
n = n.reshape(shape)
f.close()
return n

def name2array(name, weights_dict):
folder=os.path.join(args.model_load_dir, name)
profile_dict = weights_dict[name]
shape=profile_dict["shape"]
dtype=profile_dict["dtype"]
dtype=dtype_dict[dtype]
array = _LoadWeightBlob2Numpy(shape,folder,dtype)
return array, dtype, shape

# build the list of parameter names to prune
def makeNameList(pruneName, nameList, name):
if pruneName == '_bn-gamma':
nameList.append(name+"_weight")
elif pruneName == "_weight":
if args.bn.lower() in ('yes', 'true', 't', 'y', '1'):
nameList.append(name+"_bn-gamma")
nameList.append(name+pruneName)
nameList.append(name+"_bias")
# optionally add the matching bn-layer parameters
if args.bn.lower() in ('yes', 'true', 't', 'y', '1'):
nameList.append(name+"_bn-beta")
nameList.append(name+"_bn-moving_variance")
nameList.append(name+"_bn-moving_mean")
# extra parameters when the optimizer is adam
if args.optimizer == 'adam':
nameList.append(name+"_weight-v")
nameList.append(name+"_weight-m")
nameList.append(name+"_bias-v")
nameList.append(name+"_bias-m")
# optionally add the matching bn-layer parameters
if args.bn.lower() in ('yes', 'true', 't', 'y', '1'):
nameList.append(name+"_bn-beta-v")
nameList.append(name+"_bn-beta-m")
nameList.append(name+"_bn-gamma-v")
nameList.append(name+"_bn-gamma-m")
# extra parameters when the optimizer is momentum
elif args.optimizer == 'momentum':
nameList.append(name+"_weight-momentum")
nameList.append(name+"_bias-momentum")
# optionally add the matching bn-layer parameters
if args.bn.lower() in ('yes', 'true', 't', 'y', '1'):
nameList.append(name+"_bn-beta-momentum")
nameList.append(name+"_bn-gamma-momentum")
else:
if args.optimizer != 'sgd':
print('Error: optimizer!')
return nameList
def prune():
# get the threshold for the chosen pruning method
if args.prune_method == 'bn':
thre = pa.get_pruneThre_bn()
elif args.prune_method == 'conv_avg':
thre = pa.get_pruneThre_conv_avg()
elif args.prune_method == 'conv_all':
thre = pa.get_pruneThre_conv_all()
elif args.prune_method == 'conv_max':
thre = pa.get_pruneThre_conv_max()
of_weight_path = args.model_load_dir.rsplit("/",1)[0] + "/weights_profile_path"
weights_dict = modelWeight.load(of_weight_path)
modelWeight.weights_dict = {}
fcRemoveIndexs = []
fcDivideNum = 0
removeIndexs = []
lastRemoveIndexs = []
beforePrune = 0
afterPrune = 0
pruneName = ''
if "bn" in args.prune_method:
pruneName = "_bn-gamma"
elif "conv" in args.prune_method or args.prune_method=="random":
pruneName = "_weight"
for name, profile_dict in weights_dict.items():
if name.startswith("conv") and name.endswith(pruneName):
a, dtype, shape = name2array(name, weights_dict)
lastRemoveIndexs = removeIndexs
# get removeIndexs for the chosen pruning method
if args.prune_method == 'bn':
removeIndexs = pa.get_removeIndex_bn(a, thre)
elif args.prune_method == "conv_avg":
removeIndexs = pa.get_removeIndex_conv_avg(a, shape, thre)
elif args.prune_method == "conv_all":
removeIndexs = pa.get_removeIndex_conv_all(a, shape, thre)
elif args.prune_method == "conv_max":
removeIndexs = pa.get_removeIndex_conv_max(a, shape, thre)
elif args.prune_method == "random":
removeIndexs = pa.get_removeIndex_random(shape)
elif args.prune_method == "conv_similarity":
removeIndexs = pa.get_removeIndex_conv_similarity(a, shape)
elif args.prune_method == "bn_similarity":
removeIndexs = pa.get_removeIndex_bn_similarity(a, shape)
elif args.prune_method == "conv_threshold":
removeIndexs = pa.get_removeIndex_conv_threshold(a, shape, threSet=0.06)
if len(removeIndexs) == len(a):
removeIndexs = np.delete(removeIndexs, 0)
if name == "conv1"+pruneName:
fcRemoveIndexs = removeIndexs
fcDivideNum = 16

# name list of layers to prune
name = name.split("_")[0].split("-")[0]
nameList = []
nameList = makeNameList(pruneName, nameList, name)

# do the actual pruning
for name in nameList:
a, dtype, shape = name2array(name, weights_dict)
if name.endswith("weight") or name.endswith("weight-v") or \
name.endswith("weight-m") or name.endswith("weight-momentum"):
b = np.delete(a, removeIndexs, 0)
b = np.delete(b, lastRemoveIndexs, 1)
if name.endswith("weight"):
beforePrune += a.shape[0]
afterPrune += b.shape[0]
else:
b = np.delete(a, removeIndexs)
print(name+" pruned: shape from", a.shape, "-->", b.shape)
if args.model_save_dir:
folder = os.path.join(args.model_save_dir, "model", name)
_SaveWeightBlob2File(b, folder, 'out')
modelWeight.add(name, list(dtype_dict.keys())[list(dtype_dict.values()).index(dtype)], b.shape)
# prune the first dense0 layer
elif name.startswith("dense"):
if name in ['dense0-weight', 'dense0-weight-v',
'dense0-weight-m', 'dense0-weight-momentum']:
fcRemoveIndexsNew = []
a, dtype, shape = name2array(name, weights_dict)
num = int(a.shape[1]/fcDivideNum)
for index in fcRemoveIndexs:
fcRemoveIndexsNew += [index+fcDivideNum*i for i in range(num)]
b = np.delete(a, fcRemoveIndexsNew, 1)
else:
a, dtype, shape = name2array(name, weights_dict)
b = a
print(name+" pruned: shape from", a.shape, "-->", b.shape)
if args.model_save_dir:
folder = os.path.join(args.model_save_dir, "model", name)
_SaveWeightBlob2File(b, folder, 'out')
modelWeight.add(name, list(dtype_dict.keys())[list(dtype_dict.values()).index(dtype)], b.shape)
print("Pruning done! Number of channel from", beforePrune, "-->", afterPrune)
print("Real Pruning rate:", 100*(beforePrune-afterPrune)/beforePrune, "%")
weights_profile_path = os.path.join(args.model_save_dir, "weights_profile_path")
modelWeight.save(weights_profile_path)
os.system('cp -r {0}/System-Train-TrainStep-TrainNet {1}/System-Train-TrainStep-TrainNet '.format(args.model_load_dir, os.path.join(args.model_save_dir, "model")))
def main():
prune()

if __name__ == "__main__":
main()

+ 271
- 0
model_compress/model_compress/ChannelSlimming/prune/pruneResnet.py

@@ -0,0 +1,271 @@
"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import numpy as np
import os
from util.model_weights import modelWeight
import util.prune_algorithm as pa


parser = argparse.ArgumentParser()
dtype_dict={2:np.float32,
3:np.float64,
4:np.int8,
5:np.int32,
6:np.int64,
9:np.float16}

parser.add_argument("--bn", default=False,
type=str, help="Whether to use use bn layer")
parser.add_argument("--prune_method", default='bn',
type=str, help="method of prune(channel_prune_bn, channel_prune_conv)")
parser.add_argument("--model_load_dir", default = './output/snapshots/model_base/snapshot_last',
type = str, required = False, help = "Path of base oneflow model")
parser.add_argument("--model_save_dir", default = './output/snapshots/model_prune', type = str,
required = False, help = "Path to the output OneFlow model.")
parser.add_argument("--percent", default = 0.7, type = float, required = False,
help = "scale sparse rate (default: 0.7)")
parser.add_argument("--optimizer", type=str, default="momentum", required=False,
help="sgd, adam, momentum")
args = parser.parse_args()

def _SaveWeightBlob2File(blob, folder, var):
if not os.path.exists(folder):
os.makedirs(folder)
filename = os.path.join(folder, var)

f = open(filename, 'wb')
f.write(blob.tobytes())
f.close()

def _LoadWeightBlob2Numpy(shape, folder, dtype):
if not os.path.exists(folder):
print('fail to find', folder)
filename = os.path.join(folder, 'out')
f = open(filename, 'rb')
n = np.fromfile(f, dtype=dtype)
n = n.reshape(shape)
f.close()
return n

def name2array(name, weights_dict):
folder=os.path.join(args.model_load_dir, name)
profile_dict = weights_dict[name]
shape=profile_dict["shape"]
dtype=profile_dict["dtype"]
dtype=dtype_dict[dtype]
array = _LoadWeightBlob2Numpy(shape,folder,dtype)
return array, dtype, shape

# build the list of parameter names to prune
def makeNameList(pruneName, nameList, name):
if pruneName == '_bn-gamma':
nameList.append(name+"_weight")
elif pruneName == "_weight":
if args.bn.lower() in ('yes', 'true', 't', 'y', '1'):
nameList.append(name+"_bn-gamma")
nameList.append(name+pruneName)
nameList.append(name+"_bias")
if args.bn.lower() in ('yes', 'true', 't', 'y', '1'):
nameList.append(name+"_bn-beta")
nameList.append(name+"_bn-moving_variance")
nameList.append(name+"_bn-moving_mean")
# extra parameters when the optimizer is adam
if args.optimizer == 'adam':
nameList.append(name+"_weight-v")
nameList.append(name+"_weight-m")
nameList.append(name+"_bias-v")
nameList.append(name+"_bias-m")
if args.bn.lower() in ('yes', 'true', 't', 'y', '1'):
nameList.append(name+"_bn-beta-v")
nameList.append(name+"_bn-beta-m")
nameList.append(name+"_bn-gamma-v")
nameList.append(name+"_bn-gamma-m")
# extra parameters when the optimizer is momentum
elif args.optimizer == 'momentum':
nameList.append(name+"_weight-momentum")
nameList.append(name+"_bias-momentum")
if args.bn.lower() in ('yes', 'true', 't', 'y', '1'):
nameList.append(name+"_bn-beta-momentum")
nameList.append(name+"_bn-gamma-momentum")
else:
if args.optimizer != 'sgd':
print('Error: optimizer!')
return nameList
def prune():
# get the threshold for the chosen pruning method
if args.prune_method == 'bn':
thre = pa.get_pruneThre_bn()
elif args.prune_method == 'conv_avg':
thre = pa.get_pruneThre_conv_avg()
elif args.prune_method == 'conv_all':
thre = pa.get_pruneThre_conv_all()
elif args.prune_method == 'conv_max':
thre = pa.get_pruneThre_conv_max()
of_weight_path = args.model_load_dir.rsplit("/",1)[0] + "/weights_profile_path"
weights_dict = modelWeight.load(of_weight_path)
modelWeight.weights_dict = {}
fcRemoveIndexs = []
fcDivideNum = 0
removeIndexs = []
lastRemoveIndexs = []
lastRemoveIndexs_shortcut = []
beforePrune = 0
afterPrune = 0
pruneName = ''
if "bn" in args.prune_method:
pruneName = "_bn-gamma"
elif "conv" in args.prune_method or args.prune_method=="random":
pruneName = "_weight"
for name, profile_dict in weights_dict.items():
if name.startswith("conv") and name.endswith(pruneName) and \
"stem" not in name and "shortcut" not in name:
a, dtype, shape = name2array(name, weights_dict)
lastRemoveIndexs = removeIndexs
# get removeIndexs for the chosen pruning method
if args.prune_method == 'bn':
removeIndexs = pa.get_removeIndex_bn(a, thre)
elif args.prune_method == "conv_avg":
removeIndexs = pa.get_removeIndex_conv_avg(a, shape, thre)
elif args.prune_method == "conv_all":
removeIndexs = pa.get_removeIndex_conv_all(a, shape, thre)
elif args.prune_method == "conv_max":
removeIndexs = pa.get_removeIndex_conv_max(a, shape, thre)
elif args.prune_method == "random":
removeIndexs = pa.get_removeIndex_random(shape)
elif args.prune_method == "conv_similarity":
removeIndexs = pa.get_removeIndex_conv_similarity(a, shape)
elif args.prune_method == "bn_similarity":
removeIndexs = pa.get_removeIndex_bn_similarity(a, shape)
elif args.prune_method == "conv_threshold":
removeIndexs = pa.get_removeIndex_conv_threshold(a, shape, threSet=0.06)
if len(removeIndexs) == len(a):
removeIndexs = np.delete(removeIndexs, 0)
if name == "conv47"+pruneName:
fcRemoveIndexs = removeIndexs
fcDivideNum = 2048

# name list of layers to prune
name = name.split("_")[0].split("-")[0]
nameList = []
nameList = makeNameList(pruneName, nameList, name)

# do the actual pruning for all layers except the shortcut layers
for name in nameList:
a, dtype, shape = name2array(name, weights_dict)
if name.endswith("weight") or name.endswith("weight-v") or \
name.endswith("weight-m") or name.endswith("weight-momentum"):
b = np.delete(a, removeIndexs, 0)
b = np.delete(b, lastRemoveIndexs, 1)
if name.endswith("weight"):
beforePrune += a.shape[0]
afterPrune += b.shape[0]
else:
b = np.delete(a, removeIndexs)
print(name+" pruned: shape from", a.shape, "-->", b.shape)
if args.model_save_dir:
folder = os.path.join(args.model_save_dir, "model", name)
_SaveWeightBlob2File(b, folder, 'out')
modelWeight.add(name, list(dtype_dict.keys())[list(dtype_dict.values()).index(dtype)], b.shape)
# prune the shortcut layers of the resnet model
# addName is the numeric suffix of the shortcut layer name
addName = ""
# get the layer index from the conv layer name
n = int(name.split("_")[0].split("-")[0].replace("conv", ""))
if (n+1)%3 == 0:
n = int((n+1)/3)
if n <= 3:
addName = "0_" + str(n-1)
elif n <= 7:
addName = "1_" + str(n-4)
elif n <= 13:
addName = "2_" + str(n-8)
elif n <= 16:
addName = "3_" + str(n-14)
name = "conv_shortcut" + addName
# name list of the shortcut conv layers to prune
# nameList_shortcut lists every parameter name to prune for the shortcut
nameList_shortcut = []
nameList_shortcut = makeNameList(pruneName, nameList_shortcut, name)
# do the actual pruning of the resnet shortcut layers
for name in nameList_shortcut:
a, dtype, shape = name2array(name, weights_dict)
if name.endswith("weight") or name.endswith("weight-v") or \
name.endswith("weight-m") or name.endswith("weight-momentum"):
b = np.delete(a, removeIndexs, 0)
b = np.delete(b, lastRemoveIndexs_shortcut, 1)
else:
b = np.delete(a, removeIndexs)
print(name+" pruned: shape from", a.shape, "-->", b.shape)
if args.model_save_dir:
folder = os.path.join(args.model_save_dir, "model", name)
_SaveWeightBlob2File(b, folder, 'out')
modelWeight.add(name, list(dtype_dict.keys())[list(dtype_dict.values()).index(dtype)], b.shape)
lastRemoveIndexs_shortcut = removeIndexs
# copy the stem layers unchanged
elif "stem" in name:
a, dtype, shape = name2array(name, weights_dict)
b = a
print(name+" copy")
if args.model_save_dir:
folder = os.path.join(args.model_save_dir, "model", name)
_SaveWeightBlob2File(b, folder, 'out')
modelWeight.add(name, list(dtype_dict.keys())[list(dtype_dict.values()).index(dtype)], b.shape)
# prune the first dense0 layer
elif name.startswith("dense"):
if name in ['dense0-weight', 'dense0-weight-v',
'dense0-weight-m', 'dense0-weight-momentum']:
fcRemoveIndexsNew = []
a, dtype, shape = name2array(name, weights_dict)
num = int(a.shape[1]/fcDivideNum)
for index in fcRemoveIndexs:
fcRemoveIndexsNew += [index+fcDivideNum*i for i in range(num)]
b = np.delete(a, fcRemoveIndexsNew, 1)
else:
a, dtype, shape = name2array(name, weights_dict)
b = a
print(name+" pruned: shape from", a.shape, "-->", b.shape)
if args.model_save_dir:
folder = os.path.join(args.model_save_dir, "model", name)
_SaveWeightBlob2File(b, folder, 'out')
modelWeight.add(name, list(dtype_dict.keys())[list(dtype_dict.values()).index(dtype)], b.shape)
print("Pruning done! Number of channel from", beforePrune, "-->", afterPrune)
print("Real Pruning rate:", 100*(beforePrune-afterPrune)/beforePrune, "%")
weights_profile_path = os.path.join(args.model_save_dir, "weights_profile_path")
modelWeight.save(weights_profile_path)
os.system('cp -r {0}/System-Train-TrainStep-TrainNet {1}/System-Train-TrainStep-TrainNet '.format(args.model_load_dir, os.path.join(args.model_save_dir, "model")))
def main():
prune()

if __name__ == "__main__":
main()

+ 228
- 0
model_compress/model_compress/ChannelSlimming/prune/pruneVggnet.py

@@ -0,0 +1,228 @@
"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import numpy as np
import os
from util.model_weights import modelWeight
import util.prune_algorithm as pa


parser = argparse.ArgumentParser()
dtype_dict={2:np.float32,
3:np.float64,
4:np.int8,
5:np.int32,
6:np.int64,
9:np.float16}

parser.add_argument("--bn", default=False,
type=str, help="Whether to use use bn layer")
parser.add_argument("--prune_method", default='bn', type=str,
help="method of prune(bn, conv_avg, random...)")
parser.add_argument("--model_load_dir", default = './output/snapshots/model_base/snapshot_last',
type = str, required = False, help = "Path of base oneflow model")
parser.add_argument("--model_save_dir", default = './output/snapshots/model_prune', type = str,
required = False, help = "Path to the output OneFlow model.")
parser.add_argument("--percent", default = 0.7, type = float, required = False,
help = "scale sparse rate (default: 0.7)")
parser.add_argument("--optimizer", type=str, default="momentum", required=False,
help="sgd, adam, momentum")
args = parser.parse_args()

def _SaveWeightBlob2File(blob, folder, var):
if not os.path.exists(folder):
os.makedirs(folder)
filename = os.path.join(folder, var)

f = open(filename, 'wb')
f.write(blob.tobytes())
f.close()

def _LoadWeightBlob2Numpy(shape, folder, dtype):
if not os.path.exists(folder):
print('fail to find', folder)
filename = os.path.join(folder, 'out')
f = open(filename, 'rb')
n = np.fromfile(f, dtype=dtype)
n = n.reshape(shape)
f.close()
return n

def name2array(name, weights_dict):
folder=os.path.join(args.model_load_dir, name)
profile_dict = weights_dict[name]
shape=profile_dict["shape"]
dtype=profile_dict["dtype"]
dtype=dtype_dict[dtype]
array = _LoadWeightBlob2Numpy(shape,folder,dtype)
return array, dtype, shape

# build the list of parameter names to prune
def makeNameList(pruneName, nameList, name):
if pruneName == '_bn-gamma':
nameList.append(name+"_weight")
elif pruneName == "_weight":
if args.bn.lower() in ('yes', 'true', 't', 'y', '1'):
nameList.append(name+"_bn-gamma")
nameList.append(name+pruneName)
nameList.append(name+"_bias")
# optionally add the matching bn-layer parameters
if args.bn.lower() in ('yes', 'true', 't', 'y', '1'):
nameList.append(name+"_bn-beta")
nameList.append(name+"_bn-moving_variance")
nameList.append(name+"_bn-moving_mean")
# extra parameters when the optimizer is adam
if args.optimizer == 'adam':
nameList.append(name+"_weight-v")
nameList.append(name+"_weight-m")
nameList.append(name+"_bias-v")
nameList.append(name+"_bias-m")
# optionally add the matching bn-layer parameters for adam
if args.bn.lower() in ('yes', 'true', 't', 'y', '1'):
nameList.append(name+"_bn-beta-v")
nameList.append(name+"_bn-beta-m")
nameList.append(name+"_bn-gamma-v")
nameList.append(name+"_bn-gamma-m")
# extra parameters when the optimizer is momentum
elif args.optimizer == 'momentum':
nameList.append(name+"_weight-momentum")
nameList.append(name+"_bias-momentum")
# optionally add the matching bn-layer parameters for momentum
if args.bn.lower() in ('yes', 'true', 't', 'y', '1'):
nameList.append(name+"_bn-beta-momentum")
nameList.append(name+"_bn-gamma-momentum")
else:
if args.optimizer != 'sgd':
print('Error: optimizer!')
return nameList
def prune():
# get the threshold for the chosen pruning method
if args.prune_method == 'bn':
thre = pa.get_pruneThre_bn()
elif args.prune_method == 'conv_avg':
thre = pa.get_pruneThre_conv_avg()
elif args.prune_method == 'conv_all':
thre = pa.get_pruneThre_conv_all()
elif args.prune_method == 'conv_max':
thre = pa.get_pruneThre_conv_max()
of_weight_path = args.model_load_dir.rsplit("/",1)[0] + "/weights_profile_path"
weights_dict = modelWeight.load(of_weight_path)
modelWeight.weights_dict = {}
fcRemoveIndexs = []
fcDivideNum = 0
removeIndexs = []
lastRemoveIndexs = []
beforePrune = 0
afterPrune = 0
pruneName = ''
if "bn" in args.prune_method:
pruneName = "_bn-gamma"
elif "conv" in args.prune_method or args.prune_method=="random":
pruneName = "_weight"
for name, profile_dict in weights_dict.items():
if name.startswith("conv") and name.endswith(pruneName):
a, dtype, shape = name2array(name, weights_dict)
lastRemoveIndexs = removeIndexs
# get removeIndexs for the chosen pruning method
if args.prune_method == 'bn':
removeIndexs = pa.get_removeIndex_bn(a, thre)
elif args.prune_method == "conv_avg":
removeIndexs = pa.get_removeIndex_conv_avg(a, shape, thre)
elif args.prune_method == "conv_all":
removeIndexs = pa.get_removeIndex_conv_all(a, shape, thre)
elif args.prune_method == "conv_max":
removeIndexs = pa.get_removeIndex_conv_max(a, shape, thre)
elif args.prune_method == "random":
removeIndexs = pa.get_removeIndex_random(shape)
elif args.prune_method == "conv_similarity":
removeIndexs = pa.get_removeIndex_conv_similarity(a, shape)
elif args.prune_method == "bn_similarity":
removeIndexs = pa.get_removeIndex_bn_similarity(a, shape)
elif args.prune_method == "conv_threshold":
removeIndexs = pa.get_removeIndex_conv_threshold(a, shape, threSet=0.06)
# print(removeIndexs)
if len(removeIndexs) == len(a):
removeIndexs = np.delete(removeIndexs, 0)
if name == "conv12"+pruneName:
fcRemoveIndexs = removeIndexs
fcDivideNum = 512

# name list of layers to prune
name = name.split("_")[0].split("-")[0]
nameList = []
nameList = makeNameList(pruneName, nameList, name)

# do the actual pruning
for name in nameList:
a, dtype, shape = name2array(name, weights_dict)
if name.endswith("weight") or name.endswith("weight-v") or \
name.endswith("weight-m") or name.endswith("weight-momentum"):
b = np.delete(a, removeIndexs, 0)
b = np.delete(b, lastRemoveIndexs, 1)
if name.endswith("weight"):
beforePrune += a.shape[0]
afterPrune += b.shape[0]
else:
b = np.delete(a, removeIndexs)
print(name+" pruned: shape from", a.shape, "-->", b.shape)
if args.model_save_dir:
folder = os.path.join(args.model_save_dir, "model", name)
_SaveWeightBlob2File(b, folder, 'out')
modelWeight.add(name, list(dtype_dict.keys())[list(dtype_dict.values()).index(dtype)], b.shape)
# prune the first dense0 layer
elif name.startswith("dense"):
if name in ['dense0-weight', 'dense0-weight-v',
'dense0-weight-m', 'dense0-weight-momentum']:
fcRemoveIndexsNew = []
a, dtype, shape = name2array(name, weights_dict)
num = int(a.shape[1]/fcDivideNum)
for index in fcRemoveIndexs:
fcRemoveIndexsNew += [index+fcDivideNum*i for i in range(num)]
b = np.delete(a, fcRemoveIndexsNew, 1)
else:
a, dtype, shape = name2array(name, weights_dict)
b = a
print(name+" pruned: shape from", a.shape, "-->", b.shape)
if args.model_save_dir:
folder = os.path.join(args.model_save_dir, "model", name)
_SaveWeightBlob2File(b, folder, 'out')
modelWeight.add(name, list(dtype_dict.keys())[list(dtype_dict.values()).index(dtype)], b.shape)
print("Pruning done! Number of channel from", beforePrune, "-->", afterPrune)
print("Real Pruning rate:", 100*(beforePrune-afterPrune)/beforePrune, "%")
weights_profile_path = os.path.join(args.model_save_dir, "weights_profile_path")
modelWeight.save(weights_profile_path)
os.system('cp -r {0}/System-Train-TrainStep-TrainNet {1}/System-Train-TrainStep-TrainNet '.format(args.model_load_dir, os.path.join(args.model_save_dir, "model")))
def main():
prune()

if __name__ == "__main__":
main()

+ 97
- 0
model_compress/model_compress/ChannelSlimming/prune/util/model_weights.py

@@ -0,0 +1,97 @@
"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import json
import oneflow as flow

# mysingle.py
class ModelWeights:
weights_dict={}
dtype_dict={flow.float32:2,
flow.float64:3,
flow.int8:4,
flow.int32:5,
flow.int64:6,
flow.float16:9,
2:2, 3:3, 4:4, 5:5, 6:6, 9:9}
def add(self, variable_name, dtype, shape):
assert variable_name not in self.weights_dict
profile_dict={}
profile_dict["dtype"]=dtype
profile_dict["shape"]=shape
self.weights_dict[variable_name]=profile_dict
return self.weights_dict
def addConv(self, index, dtype, shape1, shape2, optimizer):
dtype = self.dtype_dict[dtype]
# print(dtype)
self.add("conv{}".format(index)+'_weight', dtype, shape1)
self.add("conv{}".format(index)+'_bias', dtype, shape2)
self.add("conv{}".format(index)+'_bn-gamma', dtype, shape2)
self.add("conv{}".format(index)+'_bn-beta', dtype, shape2)
self.add("conv{}".format(index)+'_bn-moving_variance', dtype, shape2)
self.add("conv{}".format(index)+'_bn-moving_mean', dtype, shape2)
if optimizer == 'adam':
self.add("conv{}".format(index)+'_weight-v', dtype, shape1)
self.add("conv{}".format(index)+'_weight-m', dtype, shape1)
self.add("conv{}".format(index)+'_bias-v', dtype, shape2)
self.add("conv{}".format(index)+'_bias-m', dtype, shape2)
self.add("conv{}".format(index)+'_bn-gamma-v', dtype, shape2)
self.add("conv{}".format(index)+'_bn-gamma-m', dtype, shape2)
self.add("conv{}".format(index)+'_bn-beta-v', dtype, shape2)
self.add("conv{}".format(index)+'_bn-beta-m', dtype, shape2)
elif optimizer == 'momentum':
self.add("conv{}".format(index)+'_weight-momentum', dtype, shape1)
self.add("conv{}".format(index)+'_bias-momentum', dtype, shape2)
self.add("conv{}".format(index)+'_bn-gamma-momentum', dtype, shape2)
self.add("conv{}".format(index)+'_bn-beta-momentum', dtype, shape2)
def addDense(self, dtype_old, shape, optimizer, dense_num):
dtype = []
for old in dtype_old:
dtype.append(self.dtype_dict[old])
# print(dtype)
for i in range(0, dense_num):
self.add('dense'+str(i)+'-weight', dtype[i], shape[i])
self.add('dense'+str(i)+'-bias', dtype[i], (shape[i][0],))
if optimizer == 'adam':
self.add('dense'+str(i)+'-weight-v', dtype[i], shape[i])
self.add('dense'+str(i)+'-weight-m', dtype[i], shape[i])
self.add('dense'+str(i)+'-bias-v', dtype[i], (shape[i][0],))
self.add('dense'+str(i)+'-bias-m', dtype[i], (shape[i][0],))
elif optimizer == 'momentum':
self.add('dense'+str(i)+'-weight-momentum', dtype[i], shape[i])
self.add('dense'+str(i)+'-bias-momentum', dtype[i], (shape[i][0],))

def save(self,path):
print('Saving weights_profile_path to {}'.format(path))
# print('weights_dict',self.weights_dict)
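# each line of the profile file is "<variable name>__<json profile>", for example
# conv0_weight__{"dtype": 2, "shape": [64, 3, 3, 3]}   (shape values here are only illustrative)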
with open(path,"w") as f:
for k,v in self.weights_dict.items():
v_json=json.dumps(v)
f.write(k+'__'+ v_json +'\n')
return self.weights_dict

def load(self,path):
if len(self.weights_dict)!=0:
return self.weights_dict
else:
with open(path,'r') as f:
for line in f:
variable_name,profile_dict=line.split('__')
profile_dict=json.loads(profile_dict)
self.weights_dict[variable_name]=profile_dict
return self.weights_dict

modelWeight = ModelWeights()

+ 315
- 0
model_compress/model_compress/ChannelSlimming/prune/util/prune_algorithm.py

@@ -0,0 +1,315 @@
"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import numpy as np
import os
from .model_weights import modelWeight
import random

parser = argparse.ArgumentParser()
dtype_dict={2:np.float32,
3:np.float64,
4:np.int8,
5:np.int32,
6:np.int64,
9:np.float16}

parser.add_argument("--bn", default=False,
type=str, help="Whether to use use bn layer")
parser.add_argument("--prune_method", default='bn',
type=str, help="method of prune(bn, conv_avg, random...)")
parser.add_argument("--model_load_dir", default = './output/snapshots/model_base/snapshot_last',
type = str, required = False, help = "Path of base oneflow model")
parser.add_argument("--model_save_dir", default = './output/snapshots/model_prune', type = str,
required = False, help = "Path to the output OneFlow model.")
parser.add_argument("--percent", default = 0.7, type = float, required = False,
help = "scale sparse rate (default: 0.7)")
parser.add_argument("--optimizer", type=str, default="momentum", required=False,
help="sgd, adam, momentum")
args = parser.parse_args()


def _LoadWeightBlob2Numpy(shape, folder, dtype):
if not os.path.exists(folder):
print('fail to find', folder)
filename = os.path.join(folder, 'out')
f = open(filename, 'rb')
n = np.fromfile(f, dtype=dtype)
n = n.reshape(shape)
f.close()
return n

def name2array(name, weights_dict):
folder=os.path.join(args.model_load_dir, name)
profile_dict = weights_dict[name]
shape=profile_dict["shape"]
dtype=profile_dict["dtype"]
dtype=dtype_dict[dtype]
array = _LoadWeightBlob2Numpy(shape,folder,dtype)
return array, dtype, shape

# conv_avg pruning: use the mean of each conv channel's weights as its scaling factor and derive the threshold
def get_pruneThre_conv_avg():
of_weight_path = args.model_load_dir.rsplit("/",1)[0] + "/weights_profile_path"
weights_dict = modelWeight.load(of_weight_path)

totalArray = []
for name, profile_dict in weights_dict.items():
if name.endswith("_weight") and "stem" not in name and "shortcut" not in name:
array, dtype, shape = name2array(name, weights_dict)
array = array.tolist()
array_rank = []
for i in range(0, shape[0]):
array_i = array[i]
array_i_faltten = [abs(m3) for m1 in array_i for m2 in m1 for m3 in m2]
array_rank.append(sum(array_i_faltten)/(shape[1]*shape[2]*shape[3]))
totalArray = totalArray + array_rank
totalArray.sort()
threIndex = int(len(totalArray) * args.percent)
thre = totalArray[threIndex]
print("threshold:", thre)
return thre

# conv_all pruning: use the sum of each conv channel's weights as its scaling factor and derive the threshold
def get_pruneThre_conv_all():
of_weight_path = args.model_load_dir.rsplit("/",1)[0] + "/weights_profile_path"
weights_dict = modelWeight.load(of_weight_path)

totalArray = []
for name, profile_dict in weights_dict.items():
if name.endswith("_weight") and "stem" not in name and "shortcut" not in name:
array, dtype, shape = name2array(name, weights_dict)
array = array.tolist()
array_rank = []
for i in range(0, shape[0]):
array_i = array[i]
array_i_faltten = [abs(m3) for m1 in array_i for m2 in m1 for m3 in m2]
array_rank.append(sum(array_i_faltten))
totalArray = totalArray + array_rank
totalArray.sort()
threIndex = int(len(totalArray) * args.percent)
thre = totalArray[threIndex]
print("threshold:", thre)
return thre

# conv_max pruning: use the maximum of each conv channel's weights as its scaling factor and derive the threshold
def get_pruneThre_conv_max():
of_weight_path = args.model_load_dir.rsplit("/",1)[0] + "/weights_profile_path"
weights_dict = modelWeight.load(of_weight_path)

totalArray = []
for name, profile_dict in weights_dict.items():
if name.endswith("_weight") and "stem" not in name and "shortcut" not in name:
array, dtype, shape = name2array(name, weights_dict)
array = array.tolist()
array_rank = []
for i in range(0, shape[0]):
array_i = array[i]
array_i_faltten = [abs(m3) for m1 in array_i for m2 in m1 for m3 in m2]
array_rank.append(max(array_i_faltten))
totalArray = totalArray + array_rank
totalArray.sort()
threIndex = int(len(totalArray) * args.percent)
thre = totalArray[threIndex]
print("threshold:", thre)
return thre
# bn pruning: use the bn-layer weight (gamma) as the scaling factor and derive the threshold
def get_pruneThre_bn():
of_weight_path = args.model_load_dir.rsplit("/",1)[0] + "/weights_profile_path"
weights_dict = modelWeight.load(of_weight_path)

totalArray = []
for name, profile_dict in weights_dict.items():
if name.endswith("_bn-gamma") and "stem" not in name and "shortcut" not in name:
array, dtype, shape = name2array(name, weights_dict)
array = array.tolist()
totalArray = totalArray + array
totalArray.sort()
threIndex = int(len(totalArray) * args.percent)
thre = totalArray[threIndex]
print("threshold:", thre)
return thre

# get the threshold for pruning dnn (dense) layer weights
def get_pruneThre_fc():
of_weight_path = args.model_load_dir.rsplit("/",1)[0] + "/weights_profile_path"
weights_dict=modelWeight.load(of_weight_path)

dictLen = len(weights_dict)
numDiv = 0
if args.optimizer == 'adam':
numDiv = 6
elif args.optimizer == 'momentum':
numDiv = 4
else:
numDiv = 2

totalArray = []
for name, profile_dict in weights_dict.items():
if name.startswith("dense"+str(int(dictLen/numDiv)-1)):
continue
if name.endswith("-weight"):
array, dtype, shape = name2array(name, weights_dict)
array = array.tolist()
array_rank = []
for i in range(0, shape[0]):
array_i = array[i]
array_i_faltten = [abs(m1) for m1 in array_i]
array_rank.append(sum(array_i_faltten)/shape[1])
totalArray = totalArray + array_rank
# print(totalArray, len(totalArray))
totalArray.sort()
threIndex = int(len(totalArray) * args.percent)
thre = totalArray[threIndex]
print("threshold:", thre)
return thre

# get removeIndexs for the fc pruning method
def get_removeIndex_fc(a, shape, thre):
a_rank = []
for i in range(0, shape[0]):
a_i = a[i]
a_i_faltten = [abs(m1) for m1 in a_i]
a_rank.append(sum(a_i_faltten)/shape[1])
removeIndexs = np.where(np.array(a_rank)<thre)[0]
return removeIndexs


# get removeIndexs for the bn pruning method
def get_removeIndex_bn(a, thre):
removeIndexs = np.where(a<thre)[0]
return removeIndexs

# get removeIndexs for the conv_avg pruning method
def get_removeIndex_conv_avg(a, shape, thre):
a_rank = []
for i in range(0, shape[0]):
a_i = a[i]
a_i_faltten = [abs(m3) for m1 in a_i for m2 in m1 for m3 in m2]
# per-channel score of the conv weights
a_rank.append(sum(a_i_faltten)/(shape[1]*shape[2]*shape[3]))
removeIndexs = np.where(np.array(a_rank)<thre)[0]
return removeIndexs

# get removeIndexs for the conv_all pruning method
def get_removeIndex_conv_all(a, shape, thre):
a_rank = []
for i in range(0, shape[0]):
a_i = a[i]
a_i_faltten = [abs(m3) for m1 in a_i for m2 in m1 for m3 in m2]
# per-channel score of the conv weights
a_rank.append(sum(a_i_faltten))
removeIndexs = np.where(np.array(a_rank)<thre)[0]
return removeIndexs

# get removeIndexs for the conv_max pruning method
def get_removeIndex_conv_max(a, shape, thre):
a_rank = []
for i in range(0, shape[0]):
a_i = a[i]
a_i_faltten = [abs(m3) for m1 in a_i for m2 in m1 for m3 in m2]
# per-channel score of the conv weights
a_rank.append(max(a_i_faltten))
removeIndexs = np.where(np.array(a_rank)<thre)[0]
return removeIndexs

# pick removeIndexs at random
def get_removeIndex_random(shape):
removeIndexs = sorted(random.sample(range(shape[0]), int(shape[0]*args.percent)))
return removeIndexs

# get removeIndexs for the conv_similarity pruning method
def get_removeIndex_conv_similarity(a, shape):
removeIndexs = []
while len(removeIndexs) <= shape[0]*args.percent:
a_rank = []
# compute the similarity of each element to every other element
for i in range(0, shape[0]):
# skip elements that have already been removed
if i in removeIndexs:
continue
a_i = a[i]
a_i_faltten = [abs(m3) for m1 in a_i for m2 in m1 for m3 in m2]
min_similarity = float("inf")
for j in range(0, shape[0]):
# skip elements that have already been removed
if j in removeIndexs+[i]:
continue
a_j = a[j]
a_j_faltten = [abs(m3) for m1 in a_j for m2 in m1 for m3 in m2]
similarity = sum([(n1-n2)**2 for n1,n2 in zip(a_i_faltten,a_j_faltten)])
if similarity < min_similarity:
min_similarity = similarity
a_rank.append(min_similarity)
# add the element with the smallest similarity to removeIndexs
removeIndexs.append(a_rank.index(min(a_rank)))
# print(removeIndexs)
removeIndexs = sorted(removeIndexs)
return removeIndexs

# get removeIndexs for the bn_similarity pruning method
def get_removeIndex_bn_similarity(a, shape):
removeIndexs = []
while len(removeIndexs) <= shape[0]*args.percent:
a_rank = []
# compute the similarity of each element to every other element
for i in range(0, shape[0]):
# skip elements that have already been removed
if i in removeIndexs:
continue
a_i = a[i]
min_similarity = float("inf")
for j in range(0, shape[0]):
# skip elements that have already been removed
if j in removeIndexs+[i]:
continue
a_j = a[j]
similarity = (a_i-a_j)**2
if similarity < min_similarity:
min_similarity = similarity
a_rank.append(min_similarity)
# add the element with the smallest similarity to removeIndexs
removeIndexs.append(a_rank.index(min(a_rank)))
# print(removeIndexs)
removeIndexs = sorted(removeIndexs)
return removeIndexs

# get removeIndexs for the conv_threshold pruning method
# this thre is set manually (threSet), not derived from a threshold function
def get_removeIndex_conv_threshold(a, shape, threSet):
a_rank = []
for i in range(0, shape[0]):
a_i = a[i]
a_i_faltten = [abs(m3) for m1 in a_i for m2 in m1 for m3 in m2]
thre_sum = 0
for n in a_i_faltten:
if n < threSet:
thre_sum += 1
a_rank.append(thre_sum)
threIndex = int(len(a_rank) * args.percent)
thre = sorted(a_rank)[threIndex]
removeIndexs = np.where(np.array(a_rank)<thre)[0]
return removeIndexs

def main():
thre = get_pruneThre_bn()
print(thre)

if __name__ == "__main__":
main()

+ 251
- 0
model_compress/model_compress/ChannelSlimming/readme.md View File

@@ -0,0 +1,251 @@
# Channel Pruning Quick Start

## 1. Introduction

Channel pruning removes redundant parameter channels from DNN or CNN models to obtain smaller models and faster results.

The 炼知 technology platform provides 7 channel-pruning operators, together with channel-pruning models reimplemented on OneFlow operators and usage examples (a minimal sketch of the BN-layer criterion follows the table).

| Type | Channel-pruning operator | Description |
| ------- | -------------------- | ------------------------------------------------------------ |
| DNN pruning | Neuron-weight pruning | Uses the mean of each neuron's trained parameters as its pruning score and, according to the user-specified pruning ratio, removes the neurons with the smallest scores |
| CNN pruning | BN-layer pruning | Uses the gamma parameters of the BN layers as pruning scores and, according to the user-specified pruning ratio, removes the convolution channels (kernels) with the smallest scores |
| CNN pruning | Conv-weight mean pruning | Uses the mean of each convolution channel's parameters as its pruning score and, according to the user-specified pruning ratio, removes the channels (kernels) with the smallest scores |
| CNN pruning | Conv-weight sum pruning | Uses the sum of each convolution channel's parameters as its pruning score and, according to the user-specified pruning ratio, removes the channels (kernels) with the smallest scores |
| CNN pruning | Conv-weight max pruning | Uses the maximum of each convolution channel's parameters as its pruning score and, according to the user-specified pruning ratio, removes the channels (kernels) with the smallest scores |
| CNN pruning | Random pruning | Randomly selects convolution channels (kernels) to prune, according to the user-specified pruning ratio |
| CNN pruning | Conv-threshold pruning | Counts, for each convolution channel, how many of its parameters exceed a threshold, uses this count as the pruning score, and removes the channels (kernels) with the smallest scores according to the user-specified pruning ratio |

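To make the BN-layer operator concrete, below is a minimal, self-contained sketch of the criterion it uses (plain numpy; the names `gamma` and `percent` are illustrative, and this is not the exact code from prune/util/prune_algorithm.py): channels are ranked by the absolute value of their BN gamma parameters and the smallest fraction given by the pruning ratio is removed.

```
import numpy as np

def bn_channel_prune_indices(gamma, percent):
    """Return the indices of the channels to remove under BN-gamma pruning.

    gamma   : 1-D array of BN scale (gamma) parameters, one per channel
    percent : fraction of channels to prune, e.g. 0.5
    """
    scores = np.abs(np.asarray(gamma, dtype=float))
    # take the threshold at the `percent` quantile of the sorted scores
    thre_index = int(len(scores) * percent)
    thre = np.sort(scores)[thre_index]
    # channels whose gamma magnitude falls below the threshold are pruned
    return np.where(scores < thre)[0]

# toy example: 8 channels, prune half of them
gamma = [0.9, 0.05, 0.7, 0.01, 0.6, 0.02, 0.8, 0.03]
print(bn_channel_prune_indices(gamma, percent=0.5))  # -> [1 3 5 7]
```
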
## 2. Usage

### 2.1 Dependencies and installation

- CUDA Version 10.1.243

- CUDA Driver Version: 418.56

- oneflow_cu101
- numpy > 1.17.0
- They can be installed with the following commands:

```
python3 -m pip install --find-links https://oneflow-inc.github.io/nightly oneflow_cu101 --user
# if the matching version cannot be found, upgrade pip
python3 -m pip install --upgrade --user pip
# if numpy errors occur at runtime, e.g. module 'numpy.random' has no attribute 'default_rng',
# the installed numpy is older than 1.17.0, so upgrade numpy
python3 -m pip install --upgrade --user numpy
```

### 2.2 Getting the data

- Datasets must be converted to the OneFlow (OFRecord) format and placed in the **ofData folder**
- Channel pruning mainly targets CV tasks, so the datasets need to be processed into the OneFlow format. Default classification datasets are provided: Cifar10, Cifar100, and mnist. The OneFlow-format datasets can be downloaded directly from the following link and placed in the ofData folder: https://pan.baidu.com/s/1fj0DuQM6342CWx2DrMJGhQ (extraction code: r8qx)
- To use a custom dataset, see **Using your own dataset** under **2.3 Running**

### 2.3 Running

- **Default run (train the base model, prune, fine-tune)**

```
# cifar10 dataset, alexnet model
python run.py
```

- Results of the run are written to the **output folder**; the folder structure is described in **2.4 File descriptions**

- Logs of the run are written to the **log folder**; the folder structure is described in **2.4 File descriptions**

- **Change the default dataset (mnist, cifar10, or cifar100)**

```
python run.py --data_type=mnist
```

- **Change the default model (dnn_2, dnn_4, lenet, alexnet, vgg, or resnet)**

```
python run.py --model=lenet
```

- **Change the pruning ratio**
```
python run.py --percent=0.5
```
- **Change the pruning operator**
```
# dnn pruning does not need this argument; weight pruning is used by default
# cnn pruning operators: bn, conv_avg, conv_all, conv_max, random, conv_threshold
python run.py --prune_method=bn
```
- **Change more parameters**
  - See **train_val.py** under **2.4 File descriptions** below
- **Using your own dataset (randomData255 as an example)**
  - A sample dataset is provided in myData/randomData255: its train.json contains 2 images of size 3\*32\*32 and its test.json contains 2 images of size 3\*32\*32 (a minimal generation sketch is given at the end of this subsection)

  - Create your own dataset folder under the **myData** folder, named after your dataset, e.g. **randomData255**

  - The randomData255 folder contains two files, train.json and test.json, described below

  - **train.json**
    - Stored as a dict with the fields data, label, and shape
    - data is a 2-D array; for randomData255, which contains 2 images of size 3\*32\*32, data has shape 2\*3072, where 3072 is the flattened 3\*32\*32 image; pixel values lie in [0, 255]
    - label is a 1-D array giving the class of each image; for randomData255 it has length 2
    - shape is a 1-D array of length 3: the number of channels followed by the image height and width in pixels (which must be equal), e.g. [3, 32, 32]
  - **test.json**
    - Stored as a dict similar to train.json, with the fields data and label but no shape field

  - In the directory containing ofrecordMake.py, run:

```
# replace randomData255 with your own dataset name; the generated dataset is written to the ofData folder
python ofrecordMake.py --dataName=randomData255
```

  - To train the base model, prune, and fine-tune, run:

```
# replace randomData255 with your own dataset name
python run.py --data_type=randomData255
```
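
For reference, below is a minimal sketch of how a dataset in this format could be generated (random pixels in [0, 255] and two placeholder classes; only the folder layout and the data / label / shape fields follow the description above, the rest is illustrative):

```
import json
import os
import numpy as np

name, shape, num_train, num_test = "randomData255", [3, 32, 32], 2, 2
flat = shape[0] * shape[1] * shape[2]          # 3*32*32 = 3072
rng = np.random.default_rng(0)

os.makedirs(os.path.join("myData", name), exist_ok=True)
train = {"data": rng.integers(0, 256, (num_train, flat)).tolist(),
         "label": rng.integers(0, 2, num_train).tolist(),
         "shape": shape}
test = {"data": rng.integers(0, 256, (num_test, flat)).tolist(),
        "label": rng.integers(0, 2, num_test).tolist()}   # no "shape" field
with open(os.path.join("myData", name, "train.json"), "w") as f:
    json.dump(train, f)
with open(os.path.join("myData", name, "test.json"), "w") as f:
    json.dump(test, f)
```

After writing the two files, convert and train as shown above with ofrecordMake.py and run.py.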

- **Customizing the steps (step)**

  - 1 trains the base model, 2 prunes, 3 fine-tunes; the default is step=123

  - Only train the base model

```
python run.py --step=1
```

  - Only run pruning:

```
# the corresponding location under output/snapshots must already contain model_base; see the output folder description below
python run.py --step=2
```

  - Only run fine-tuning:

```
# the corresponding location under ./output/snapshots must already contain model_prune
python run.py --step=3
```

  - Train the base model and prune

```
python run.py --step=12
```


### 2.4 File descriptions

- **Python files**
  - **run.py**

    - Automatically invokes train_val.py for training and fine-tuning, and the scripts under prune/ for pruning

    - Most arguments are set to default values; a few can be adjusted

    - The adjustable arguments include: model, data_type, prune_method, percent

    - Example

```
python run.py --model alexnet --data_type=cifar10 --prune_method=bn --step=123
```

  - **train_val.py**
    - Main entry point for training (train) and fine-tuning (refine) a model
    - All arguments can be adjusted; the full list is given in **2.5 Config parameters**
    - See run_dnn2_cifar10.sh and run_alexnet_cifar10.sh (bn pruning) for examples
  - **ofrecordMake.py**
    - Builds a custom dataset (converts it into the OneFlow format)
- **Folders**

  - **log folder**
    - Stores the log files for the different models and datasets, recording each epoch's top-1 accuracy, top-k accuracy, and speed on the test set.
    - For example, "log_vgg_cifar10_base_model.txt" is the training log of the vgg baseline model on the cifar10 dataset.

  - **model folder**
    - **cnn folder**
      - lenet_model.py: the LeNet model
      - alexnet_model.py: the AlexNet model
      - vgg_model.py: the VggNet model
      - resnet_model.py: the ResNet model
    - **dnn folder**
      - dnn_model.py: the DNN models, including the two-layer dnn_2 and the four-layer dnn_4

  - **util folder**
    - config.py: command-line argument configuration
    - job_function_util.py: config related to the job functions
    - model_weights.py: functions for loading and saving the model weight profile
    - ofrecord_util.py: dataset loading functions
    - optimizer_util.py: optimizer-related config
    - util.py: helpers for loading the cfg and data, snapshots, summaries, etc.

  - **prune folder**
    - util folder
      - model_weights.py: functions for loading and saving the model weight profile
      - prune_algorithm.py: the different pruning algorithms
    - The pruning scripts for the different models

  - **ofData folder**
    - Stores the data in OneFlow format

  - **output folder**
    - Model output folder; the model files are stored under the snapshots folder
    - Organized by model, then by dataset; each dataset folder contains the base model model_base, the pruned model model_prune, and the fine-tuned model model_refine.


### 2.5 Config parameters

- --dtype=float32: data type of the parameters during training
- --gpu_num_per_node=1: number of GPUs per training node
- --num_nodes=1: number of training nodes
- --model=vgg: model to train (vgg, lenet, alexnet, dnn_2, dnn_4)
- --data_type='imageNet': dataset to load (imageNet / cifar10)
- --log_type=base_model: type of log to write (base_model / prune_model)
- --default_dir=train: use the default paths for loading and saving the model (recommended), 'train' or 'refine'
- --model_load_dir='xxxxxx': custom model load path (not needed when default_dir is used)
- --model_save_dir='xxxxxx': custom model save path (not needed when default_dir is used)
- --batch_size_per_device=32: per-device batch size for training (multiplied by gpu_num_per_node and num_nodes it gives train_batch_size)
- --val_batch_size_per_device=32: per-device batch size for testing (multiplied by gpu_num_per_node and num_nodes it gives test_batch_size)
- --num_classes=1000: number of classes in the classification dataset
- --num_epochs=1: number of epochs
- --num_examples=64000: determines the number of training iterations (divided by train_batch_size)
- --num_val_examples=50000: determines the number of test iterations (divided by test_batch_size)
- --rgb_mean=[123.68, 116.779, 103.939]: per-channel mean used to normalize the images
- --rgb_std=[58.393, 57.12, 57.375]: per-channel standard deviation used to normalize the images
- --image_shape=[3, 224, 224]: channel, height, and width of the images
- --log_dir='./output': where log info is saved
- --result_dir='./output': where the results json is saved. The results json file name is built as: args.result_dir, "results_"+args.model+'_'+args.data_type+'_'+args.log_type+"_{}.json".format(self.desc))
- --loss_print_every_n_iter=1: print loss, accuracy, and speed every n iterations
- --model_save_every_n_epoch=10: save the model every n epochs
- --image_size=224: image size
- --train_data_dir='./ofData/imageNet/train': training dataset directory
- --train_data_part_num=30: number of parts in the training dataset (part0000-part00xx)
- --val_data_dir='./ofData/imageNet/test': test dataset directory
- --val_data_part_num=2: number of parts in the test dataset (part0000-part00xx)
- --model_update='momentum': optimizer used for training ('momentum' / 'adam' / 'sgd')
- --learning_rate=0.01: learning rate
- --prune_method=bn: pruning algorithm (bn, conv_avg, conv_all, conv_max, random, conv_similarity, bn_similarity, conv_threshold; DNN pruning does not need this argument)
- --step=123: pruning steps; 1 trains the base model, 2 prunes, 3 fine-tunes; the default is step=123

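The same flags can also be inspected programmatically. A minimal sketch, assuming it is run from the ChannelSlimming directory with the dependencies above installed (the chosen flag values are just examples):

```
from util import config as configs

parser = configs.get_parser()
args = parser.parse_args([
    "--model=lenet",
    "--data_type=cifar10",
    "--num_classes=10",
    "--image_shape=3,32,32",
    "--learning_rate=0.001",
])
configs.print_args(args)   # prints every flag and its value
```

Training itself is still launched through run.py or train_val.py as shown above; this snippet is only a convenient way to check defaults.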







+ 240
- 0
model_compress/model_compress/ChannelSlimming/run.py View File

@@ -0,0 +1,240 @@
"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import argparse
import json
from datetime import datetime


def str2bool(v):
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Unsupported value encountered.')


parser = argparse.ArgumentParser()

parser.add_argument("--model", default="alexnet",
type=str, help="Model")
parser.add_argument("--data_type", default="cifar10",
type=str, help="Dataset name")
parser.add_argument("--bn", type=str2bool,
default=True, help="Whether to use bn layer")
parser.add_argument("--percent", default="0.5",
type=str, help="scale sparse rate (default: 0.5)")
parser.add_argument("--prune_method", default='bn',
type=str, help="pruning method (bn, conv_avg, random...)")
parser.add_argument("--step", default='123',
type=str, help="choose steps from train, prune, refine")
parser.add_argument("--dataset_dir", type=str, default="./ofData/cifar10", help="dataset info load directory")
# snapshot
parser.add_argument("--model_save_dir", type=str, default="./models", help="model save directory", )
# log, save and loss print
parser.add_argument("--model_dir", type=str, default="./model", help="model info save directory")
parser.add_argument("--log_dir", type=str, default="./log", help="log info save directory")
parser.add_argument("--before_result_dir", type=str, default="./result/before", help="the save directory of results")
parser.add_argument("--after_result_dir", type=str, default="./result/after", help="the save directory of results")

args = parser.parse_args()


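# getCommand() assembles the three shell commands that correspond to the steps
# selected by --step: command1 trains the base model with train_val.py, command2
# runs the model-specific pruning script under ./prune/, and command3 fine-tunes
# the pruned model with train_val.py again.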
def getCommand():
model = args.model
data_type = args.data_type
dataset_dir = args.dataset_dir
model_save_dir = args.model_save_dir
log_dir = args.log_dir
before_result_dir = args.before_result_dir
after_result_dir = args.after_result_dir
num_classes, train_data_part_num, val_data_part_num = "", "", ""
image_shape, image_size, resize_shorter = "", "", ""
rgb_mean, rgb_std, num_examples, num_val_examples = "", "", "", ""
bn = args.bn
prune = ""
percent = args.percent
prune_method = args.prune_method

if "dnn" in args.model:
bn = "False"
prune = "Dnn"
elif args.model == "lenet":
prune = "Lenet"
elif "alexnet" in args.model:
prune = "Alexnet"
elif args.model == "vgg":
prune = "Vggnet"
elif args.model == "resnet":
prune = "Resnet"

if data_type == "cifar10":
num_classes, train_data_part_num, val_data_part_num = "10", "5", "1"
image_shape, image_size, resize_shorter = "3,32,32", "32", "32"
rgb_mean, rgb_std = "124.95,122.65,114.75", "61.252,60.767,65.852"
num_examples, num_val_examples = "50000", "10000"
elif data_type == "cifar100":
num_classes, train_data_part_num, val_data_part_num = "100", "5", "1"
image_shape, image_size, resize_shorter = "3,32,32", "32", "32"
rgb_mean, rgb_std = "124.95,122.65,114.75", "61.252,60.767,65.852"
num_examples, num_val_examples = "50000", "10000"
elif data_type == "mnist":
num_classes, train_data_part_num, val_data_part_num = "10", "6", "1"
image_shape, image_size, resize_shorter = "1,28,28", "28", "32"
rgb_mean, rgb_std = "33.3285", "78.5655"
num_examples, num_val_examples = "60000", "10000"
elif data_type == "svhn":
num_classes, train_data_part_num, val_data_part_num = "10", "1", "1"
image_shape, image_size, resize_shorter = "32,32,3", "32", "32"
rgb_mean, rgb_std = "111.61,113.16,120.57", "50.50,51.26,50.24"
num_examples, num_val_examples = "73257", "26032"
elif data_type == "imageNet":
num_classes, train_data_part_num, val_data_part_num = "1000", "30", "2"
image_shape, image_size, resize_shorter = "3,224,224", "224", "256"
rgb_mean, rgb_std = "123.68,116.779,103.939", "58.393,57.12,57.375"
num_examples, num_val_examples = "64000", "6400"
else:
with open(dataset_dir + "/meta.json") as f_meta:
dict_meta = json.load(f_meta)
shape = dict_meta["image_shape"]
mean_list = dict_meta["rgb_mean"]
std_list = dict_meta["rgb_std"]

num_classes = str(dict_meta["num_classes"])
train_data_part_num, val_data_part_num = "1", "1"
image_shape = str(shape[0]) + "," + str(shape[1]) + "," + str(shape[2])
image_size, resize_shorter = str(shape[1]), str(shape[1])
rgb_mean, rgb_std = "", ""
for mean in mean_list:
rgb_mean += str(mean) + ","
rgb_mean = rgb_mean.strip(",")
for std in std_list:
rgb_std += str(std) + ","
rgb_std = rgb_std.strip(",")
num_examples = dict_meta["num_examples"]
num_val_examples = dict_meta["num_val_examples"]

command1 = "python3 ./train_val.py \
--model={0} \
--data_type={1} \
--log_type=base_model \
--model_update=adam \
--num_classes={2} \
--train_data_dir={13}/train \
--train_data_part_num={3} \
--val_data_dir={13}/test \
--val_data_part_num={4} \
--num_nodes=1 \
--gpu_num_per_node=1 \
--loss_print_every_n_iter=1 \
--label_smoothing=0 \
--warmup_epochs=0 \
--lr_decay=None \
--image_shape={5} \
--image_size={6} \
--resize_shorter={7} \
--rgb_mean={8} \
--rgb_std={9} \
--num_examples={10} \
--num_val_examples={11} \
--batch_size_per_device=32 \
--val_batch_size_per_device=32 \
--learning_rate=0.001 \
--bn={12} \
--num_epochs=2 \
--model_save_every_n_epoch=10 \
--model_save_dir={16}/model_base \
--log_dir={14} \
--before_result_dir={15}" \
.format(model, data_type, num_classes, train_data_part_num,
val_data_part_num, image_shape, image_size,
resize_shorter, rgb_mean, rgb_std,
num_examples, num_val_examples, bn, dataset_dir, log_dir, before_result_dir, model_save_dir)

command2 = "python3 ./prune/prune{0}.py \
--percent={1} \
--optimizer=adam \
--prune_method={2} \
--bn={3} \
--model_load_dir={4}/model_base/snapshot_last \
--model_save_dir={4}/model_prune" \
.format(prune, percent, prune_method, bn, model_save_dir)

if "dnn" in args.model:
command2 = "python3 ./prune/prune{0}.py \
--percent={1} \
--optimizer=adam \
--model_load_dir={2}/model_base/snapshot_last \
--model_save_dir={2}/model_prune" \
.format(prune, percent, model_save_dir)

command3 = "python3 ./train_val.py \
--model={0} \
--data_type={1} \
--log_type=prune_model \
--model_update=adam \
--num_classes={2} \
--train_data_dir={13}/train \
--train_data_part_num={3} \
--val_data_dir={13}/test \
--val_data_part_num={4} \
--num_nodes=1 \
--gpu_num_per_node=1 \
--loss_print_every_n_iter=1 \
--label_smoothing=0 \
--warmup_epochs=0 \
--lr_decay=None \
--image_shape={5} \
--image_size={6} \
--resize_shorter={7} \
--rgb_mean={8} \
--rgb_std={9} \
--num_examples={10} \
--num_val_examples={11} \
--batch_size_per_device=32 \
--val_batch_size_per_device=32 \
--learning_rate=0.001 \
--bn={12} \
--num_epochs=2 \
--model_save_every_n_epoch=10 \
--model_save_dir={15}/model_refine \
--model_load_dir={15}/model_prune/model \
--log_dir={14} \
--after_result_dir={16}" \
.format(model, data_type, num_classes, train_data_part_num,
val_data_part_num, image_shape, image_size,
resize_shorter, rgb_mean, rgb_std,
num_examples, num_val_examples, bn, dataset_dir, log_dir, model_save_dir, after_result_dir)

return command1, command2, command3


def main():
command1, command2, command3 = getCommand()
step = args.step
# print(command1)
if "1" in step:
os.system(command1)
if "2" in step:
os.system(command2)
if "3" in step:
os.system(command3)


if __name__ == "__main__":
main()

+ 80
- 0
model_compress/model_compress/ChannelSlimming/run_alexnet_cifar10.sh View File

@@ -0,0 +1,80 @@
export ENABLE_USER_OP=True
export VISIBLE_DEVICES=3
#train base model
python3 ./train_val.py \
--model=alexnet \
--data_type=cifar10 \
--log_type=base_model \
--model_update=adam \
--num_classes=10 \
--train_data_dir=./ofData/cifar10/train \
--train_data_part_num=5 \
--val_data_dir=./ofData/cifar10/test \
--val_data_part_num=1 \
--num_nodes=1 \
--gpu_num_per_node=1 \
--loss_print_every_n_iter=1 \
--label_smoothing=0 \
--warmup_epochs=0 \
--lr_decay=None \
--image_shape=3,32,32 \
--image_size=32 \
--resize_shorter=32 \
--rgb_mean=124.95,122.65,114.75 \
--rgb_std=61.252,60.767,65.852 \
--num_examples=50000 \
--num_val_examples=10000 \
--batch_size_per_device=32 \
--val_batch_size_per_device=32 \
--learning_rate=0.001 \
--bn=True \
--num_epochs=30 \
--model_save_every_n_epoch=10 \
--model_save_dir=./output/snapshots/alexnet/cifar10/model_base

#prune base model
python3 ./prune/pruneAlexnet.py \
--percent=0.7 \
--optimizer=adam \
--prune_method=bn \
--bn=True \
--model_load_dir=./output/snapshots/alexnet/cifar10/model_base/snapshot_last \
--model_save_dir=./output/snapshots/alexnet/cifar10/model_prune

#refine pruned model
python3 ./train_val.py \
--model=alexnet \
--data_type=cifar10 \
--model_update=adam \
--log_type=prune_model \
--num_classes=10 \
--train_data_dir=./ofData/cifar10/train \
--train_data_part_num=5 \
--val_data_dir=./ofData/cifar10/test \
--val_data_part_num=1 \
--num_nodes=1 \
--gpu_num_per_node=1 \
--loss_print_every_n_iter=1 \
--label_smoothing=0 \
--warmup_epochs=0 \
--lr_decay=None \
--image_shape=3,32,32 \
--image_size=32 \
--resize_shorter=32 \
--rgb_mean=124.95,122.65,114.75 \
--rgb_std=61.252,60.767,65.852 \
--num_examples=50000 \
--num_val_examples=10000 \
--batch_size_per_device=32 \
--val_batch_size_per_device=32 \
--learning_rate=0.001 \
--bn=True \
--num_epochs=100 \
--model_save_every_n_epoch=10 \
--model_save_dir=./output/snapshots/alexnet/cifar10/model_refine \
--model_load_dir=./output/snapshots/alexnet/cifar10/model_prune/model

+ 78
- 0
model_compress/model_compress/ChannelSlimming/run_dnn2_cifar10.sh View File

@@ -0,0 +1,78 @@
export ENABLE_USER_OP=True
export VISIBLE_DEVICES=3
#train base model
python3 ./train_val.py \
--model=dnn_2 \
--data_type=cifar10 \
--log_type=base_model \
--model_update=adam \
--num_classes=10 \
--train_data_dir=./ofData/cifar10/train \
--train_data_part_num=5 \
--val_data_dir=./ofData/cifar10/test \
--val_data_part_num=1 \
--num_nodes=1 \
--gpu_num_per_node=1 \
--loss_print_every_n_iter=1 \
--label_smoothing=0 \
--warmup_epochs=0 \
--lr_decay=None \
--image_shape=3,32,32 \
--image_size=32 \
--resize_shorter=32 \
--rgb_mean=124.95,122.65,114.75 \
--rgb_std=61.252,60.767,65.852 \
--num_examples=50000 \
--num_val_examples=10000 \
--batch_size_per_device=32 \
--val_batch_size_per_device=32 \
--learning_rate=0.001 \
--bn=True \
--num_epochs=30 \
--model_save_every_n_epoch=10 \
--model_save_dir=./output/snapshots/dnn_2/cifar10/model_base

#prune base model
python3 ./prune/pruneDnn.py \
--percent=0.5 \
--optimizer=adam \
--model_load_dir=./output/snapshots/dnn_2/cifar10/model_base/snapshot_last \
--model_save_dir=./output/snapshots/dnn_2/cifar10/model_prune

#refine pruned model
python3 ./train_val.py \
--model=dnn_2 \
--data_type=cifar10 \
--model_update=adam \
--log_type=prune_model \
--num_classes=10 \
--train_data_dir=./ofData/cifar10/train \
--train_data_part_num=5 \
--val_data_dir=./ofData/cifar10/test \
--val_data_part_num=1 \
--num_nodes=1 \
--gpu_num_per_node=1 \
--loss_print_every_n_iter=1 \
--label_smoothing=0 \
--warmup_epochs=0 \
--lr_decay=None \
--image_shape=3,32,32 \
--image_size=32 \
--resize_shorter=32 \
--rgb_mean=124.95,122.65,114.75 \
--rgb_std=61.252,60.767,65.852 \
--num_examples=50000 \
--num_val_examples=10000 \
--batch_size_per_device=32 \
--val_batch_size_per_device=32 \
--learning_rate=0.001 \
--bn=True \
--num_epochs=100 \
--model_save_every_n_epoch=10 \
--model_save_dir=./output/snapshots/dnn_2/cifar10/model_refine \
--model_load_dir=./output/snapshots/dnn_2/cifar10/model_prune/model

+ 153
- 0
model_compress/model_compress/ChannelSlimming/train_val.py View File

@@ -0,0 +1,153 @@
"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import math

import oneflow as flow

import util.config as configs
from util.util import Snapshot, Summary, InitNodes, Metric, LoadCfg, LoadData
from util.job_function_util import get_train_config, get_val_config
import model.cnn.resnet_model as resnet_model
import model.cnn.vgg_model as vgg_model
import model.cnn.alexnet_model as alexnet_model
import model.cnn.lenet_model as lenet_model
import model.dnn.dnn_model as dnn_model
from util.model_weights import modelWeight


parser = configs.get_parser()
args = parser.parse_args()
configs.print_args(args)


total_device_num = args.num_nodes * args.gpu_num_per_node
train_batch_size = total_device_num * args.batch_size_per_device
val_batch_size = total_device_num * args.val_batch_size_per_device
(C, H, W) = args.image_shape
epoch_size = math.ceil(args.num_examples / train_batch_size)
num_val_steps = int(args.num_val_examples / val_batch_size)


model_dict = {"resnet": resnet_model.resnet50,
"vgg": vgg_model.vgg,
"alexnet": alexnet_model.alexnet,
"alexnet_simple": alexnet_model.alexnet_simple,
"lenet": lenet_model.lenet,
"dnn_2": dnn_model.dnn_2,
"dnn_4": dnn_model.dnn_4,}

flow.config.gpu_device_num(args.gpu_num_per_node)
flow.config.enable_debug_mode(True)

if args.use_boxing_v2:
flow.config.collective_boxing.nccl_fusion_threshold_mb(8)
flow.config.collective_boxing.nccl_fusion_all_reduce_use_buffer(False)


def label_smoothing(labels, classes, eta, dtype):
assert classes > 0
assert eta >= 0.0 and eta < 1.0

return flow.one_hot(labels, depth=classes, dtype=dtype,
on_value=1 - eta + eta / classes, off_value=eta/classes)

@flow.global_function("train", get_train_config(args))
def TrainNet():
cfg = LoadCfg(args=args, model_load_dir=args.model_load_dir, load_type='train')
labels, images = LoadData(args, 'train')
if args.model in ("resnet", "vgg", "alexnet", "alexnet_simple", "lenet"):
logits = model_dict[args.model](images, cfg, optimizer=args.model_update,
need_transpose=False if args.train_data_dir else True,
bn=args.bn)
else:
logits = model_dict[args.model](images, cfg, optimizer=args.model_update)
if args.label_smoothing > 0:
one_hot_labels = label_smoothing(labels, args.num_classes, args.label_smoothing, logits.dtype)
loss = flow.nn.softmax_cross_entropy_with_logits(one_hot_labels, logits, name="softmax_loss")
else:
loss = flow.nn.sparse_softmax_cross_entropy_with_logits(labels, logits, name="softmax_loss")
# lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [args.learning_rate])
# flow.optimizer.SGD(lr_scheduler, momentum=args.mom).minimize(loss)
flow.losses.add_loss(loss)
predictions = flow.nn.softmax(logits)
outputs = {"loss": loss, "predictions": predictions, "labels": labels}
# outputs = {"loss": loss, "predictions": predictions, "labels": labels, 'logits':logits}
return outputs


@flow.global_function("predict", get_val_config(args))
def InferenceNet():
cfg = LoadCfg(args=args, model_load_dir=args.model_load_dir, load_type='test')
labels, images = LoadData(args, 'test')
if args.model in ("resnet", "vgg", "alexnet", "alexnet_simple", "lenet"):
logits = model_dict[args.model](images, cfg, optimizer=args.model_update,
need_transpose=False if args.train_data_dir else True,
model_weight=False, bn=args.bn)
else:
logits = model_dict[args.model](images, cfg, optimizer=args.model_update, model_weight=False)
predictions = flow.nn.softmax(logits)
outputs = {"predictions": predictions, "labels": labels}
return outputs


def main():
InitNodes(args)

flow.env.grpc_use_no_signal()
flow.env.log_dir(args.log_dir)

summary = Summary(args.log_dir, args)
snapshot = Snapshot(args.model_save_dir, args.model_load_dir)
#open log file
log_file = open("./log/log_"+args.model+"_"+args.data_type+"_"+args.log_type+".txt", "w")
if not args.before_result_dir:
args.before_result_dir = "./log/before"
if not args.after_result_dir:
args.after_result_dir = "./log/after"

for epoch in range(args.num_epochs):
#config callback func during training
metric = Metric(desc='train', calculate_batches=args.loss_print_every_n_iter,
summary=summary, save_summary_steps=epoch_size,
batch_size=train_batch_size, loss_key='loss')
#training... (epoch_size iterations per epoch)
for i in range(epoch_size):
TrainNet().async_get(metric.metric_cb(epoch, i))

if args.val_data_dir:
#config callback func during testing
metric = Metric(desc='validation', calculate_batches=num_val_steps, summary=summary,
save_summary_steps=num_val_steps, batch_size=val_batch_size)
#testing
for i in range(num_val_steps):
InferenceNet().async_get(metric.metric_cb(epoch, i, args=args, log_file=log_file))
if epoch % args.model_save_every_n_epoch == 0:
snapshot.save('epoch_{}'.format(epoch))
flow.sync_default_session()
#save last snapshot and model weights
snapshot.save('last')
flow.sync_default_session()
weights_profile_path = os.path.join(args.model_save_dir, "weights_profile_path")
modelWeight.save(weights_profile_path)


if __name__ == "__main__":
os.system("rm -rf {0}".format(args.model_save_dir))
main()

+ 144
- 0
model_compress/model_compress/ChannelSlimming/util/config.py View File

@@ -0,0 +1,144 @@
"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
from datetime import datetime


from util.optimizer_util import add_optimizer_args
from util.ofrecord_util import add_ofrecord_args


def get_parser(parser=None):
def str_list(x):
return x.split(',')

def int_list(x):
return list(map(int, x.split(',')))

def float_list(x):
return list(map(float, x.split(',')))

def str2bool(v):
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Unsupported value encountered.')

if parser is None:
parser = argparse.ArgumentParser("flags for cnn benchmark")

parser.add_argument("--dtype", type=str,
default='float32', help="float16 float32")

# resource
parser.add_argument("--gpu_num_per_node", type=int, default=1)
parser.add_argument('--num_nodes', type=int, default=1,
help='node/machine number for training')
parser.add_argument('--node_ips', type=str_list, default=['192.168.1.13', '192.168.1.14'],
help='nodes ip list for training, divided by ",", length >= num_nodes')

parser.add_argument("--model", type=str, default="vgg",
help="vgg, alexnet, lenet")
parser.add_argument(
'--use_fp16',
type=str2bool,
nargs='?',
const=True,
help='Whether to use fp16'
)
parser.add_argument(
'--use_boxing_v2',
type=str2bool,
nargs='?',
const=True,
help='Whether to use boxing v2'
)

# train and validaion
# parser.add_argument("--default_dir", type=str,
# default='', help="use default model dir to save and load (train / refine)")
parser.add_argument("--bn", type=str2bool,
default=False, help="Whether to use bn layer")
parser.add_argument("--data_type", type=str,
default='imageNet', help="type of dataset (imageNet / cifar10...)")
parser.add_argument("--log_type", type=str,
default='base_model', help="type of log (base_model/prune_model)")
parser.add_argument('--num_epochs', type=int,
default=90, help='number of epochs')
parser.add_argument("--model_load_dir", type=str,
default=None, help="model load directory if need")
parser.add_argument("--batch_size_per_device", type=int, default=64)
parser.add_argument("--val_batch_size_per_device", type=int, default=8)

# inference
parser.add_argument("--image_path", type=str, default='tiger.jpg', help="image path")

# for data process
parser.add_argument("--num_classes", type=int, default=1000, help="num of pic classes")
parser.add_argument("--num_examples", type=int,
default=300000, help="train pic number")
parser.add_argument("--num_val_examples", type=int,
default=50000, help="validation pic number")
parser.add_argument('--rgb_mean', type=float_list, default=[123.68, 116.779, 103.939],
help='a tuple of size 3 for the mean rgb')
parser.add_argument('--rgb_std', type=float_list, default=[58.393, 57.12, 57.375],
help='a tuple of size 3 for the std rgb')
parser.add_argument("--input_layout", type=str,
default='NHWC', help="NCHW or NHWC")
parser.add_argument('--image_shape', type=int_list, default=[3, 224, 224],
help='the image shape feed into the network')
parser.add_argument('--label_smoothing', type=float, default=0.1, help='label smoothing factor')

# snapshot
parser.add_argument("--model_save_dir", type=str,
default="./output/snapshots/model_save-{}".format(
str(datetime.now().strftime("%Y%m%d%H%M%S"))),
help="model save directory",
)

# log, save and loss print
parser.add_argument("--log_dir", type=str,default="./output", help="log info save directory")
parser.add_argument("--before_result_dir", type=str,default="", help="the save directory of results")
parser.add_argument("--after_result_dir", type=str, default="", help="the save directory of results")

parser.add_argument("--loss_print_every_n_iter", type=int, default=1,
help="print loss every n iteration")
parser.add_argument("--model_save_every_n_epoch", type=int, default=10,
help="save model every n epoch",)
add_ofrecord_args(parser)
add_optimizer_args(parser)
return parser


def print_args(args):
print("=".ljust(66, "="))
print("Running {}: num_gpu_per_node = {}, num_nodes = {}.".format(
args.model, args.gpu_num_per_node, args.num_nodes))
print("=".ljust(66, "="))
for arg in vars(args):
print("{} = {}".format(arg, getattr(args, arg)))
print("-".ljust(66, "-"))
print("Time stamp: {}".format(
str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))))


if __name__ == '__main__':
parser = get_parser()
args = parser.parse_args()
print_args(args)

+ 53
- 0
model_compress/model_compress/ChannelSlimming/util/job_function_util.py View File

@@ -0,0 +1,53 @@
"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import oneflow as flow
from util.optimizer_util import gen_model_update_conf


def _default_config(args):
config = flow.function_config()
config.default_logical_view(flow.scope.consistent_view())
config.default_data_type(flow.float)
if args.use_fp16:
config.enable_auto_mixed_precision(True)
return config


def get_train_config(args):
train_config = _default_config(args)
train_config.train.primary_lr(args.learning_rate)
# train_config.disable_all_reduce_sequence(False)
# train_config.cudnn_conv_enable_pseudo_half(True)
# train_config.all_reduce_group_min_mbyte(8)
# train_config.all_reduce_group_num(128)
# train_config.all_reduce_lazy_ratio(0)

# train_config.enable_nccl_hierarchical_all_reduce(True)
# train_config.cudnn_buf_limit_mbyte(2048)
# train_config.concurrency_width(2)

if args.use_boxing_v2:
train_config.use_boxing_v2(True)

train_config.prune_parallel_cast_ops(True)
train_config.train.model_update_conf(gen_model_update_conf(args))
train_config.enable_inplace(True)
return train_config


def get_val_config(args):
return _default_config(args)

+ 97
- 0
model_compress/model_compress/ChannelSlimming/util/model_weights.py View File

@@ -0,0 +1,97 @@
"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import json
import oneflow as flow

# mysingle.py
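# ModelWeights is used as a process-wide singleton (see modelWeight at the bottom
# of this file): every created variable registers its dtype and shape here, the
# profile is written to "weights_profile_path" next to the snapshots, and the
# pruning / fine-tuning code reads it back to reconstruct the model structure.
# dtype_dict maps OneFlow dtypes to the integer codes stored in that profile.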
class ModelWeights:
weights_dict={}
dtype_dict={flow.float32:2,
flow.float64:3,
flow.int8:4,
flow.int32:5,
flow.int64:6,
flow.float16:9,
2:2, 3:3, 4:4, 5:5, 6:6, 9:9}
def add(self, variable_name, dtype, shape):
assert variable_name not in self.weights_dict
profile_dict={}
profile_dict["dtype"]=dtype
profile_dict["shape"]=shape
self.weights_dict[variable_name]=profile_dict
return self.weights_dict
def addConv(self, index, dtype, shape1, shape2, optimizer):
dtype = self.dtype_dict[dtype]
# print(dtype)
self.add("conv{}".format(index)+'_weight', dtype, shape1)
self.add("conv{}".format(index)+'_bias', dtype, shape2)
self.add("conv{}".format(index)+'_bn-gamma', dtype, shape2)
self.add("conv{}".format(index)+'_bn-beta', dtype, shape2)
self.add("conv{}".format(index)+'_bn-moving_variance', dtype, shape2)
self.add("conv{}".format(index)+'_bn-moving_mean', dtype, shape2)
if optimizer == 'adam':
self.add("conv{}".format(index)+'_weight-v', dtype, shape1)
self.add("conv{}".format(index)+'_weight-m', dtype, shape1)
self.add("conv{}".format(index)+'_bias-v', dtype, shape2)
self.add("conv{}".format(index)+'_bias-m', dtype, shape2)
self.add("conv{}".format(index)+'_bn-gamma-v', dtype, shape2)
self.add("conv{}".format(index)+'_bn-gamma-m', dtype, shape2)
self.add("conv{}".format(index)+'_bn-beta-v', dtype, shape2)
self.add("conv{}".format(index)+'_bn-beta-m', dtype, shape2)
elif optimizer == 'momentum':
self.add("conv{}".format(index)+'_weight-momentum', dtype, shape1)
self.add("conv{}".format(index)+'_bias-momentum', dtype, shape2)
self.add("conv{}".format(index)+'_bn-gamma-momentum', dtype, shape2)
self.add("conv{}".format(index)+'_bn-beta-momentum', dtype, shape2)
def addDense(self, dtype_old, shape, optimizer, dense_num):
dtype = []
for old in dtype_old:
dtype.append(self.dtype_dict[old])
# print(dtype)
for i in range(0, dense_num):
self.add('dense'+str(i)+'-weight', dtype[i], shape[i])
self.add('dense'+str(i)+'-bias', dtype[i], (shape[i][0],))
if optimizer == 'adam':
self.add('dense'+str(i)+'-weight-v', dtype[i], shape[i])
self.add('dense'+str(i)+'-weight-m', dtype[i], shape[i])
self.add('dense'+str(i)+'-bias-v', dtype[i], (shape[i][0],))
self.add('dense'+str(i)+'-bias-m', dtype[i], (shape[i][0],))
elif optimizer == 'momentum':
self.add('dense'+str(i)+'-weight-momentum', dtype[i], shape[i])
self.add('dense'+str(i)+'-bias-momentum', dtype[i], (shape[i][0],))

def save(self,path):
print('Saving weights_profile_path to {}'.format(path))
# print('weights_dict',self.weights_dict)
with open(path,"w") as f:
for k,v in self.weights_dict.items():
v_json=json.dumps(v)
f.write(k+'__'+ v_json +'\n')
return self.weights_dict

def load(self,path):
if len(self.weights_dict)!=0:
return self.weights_dict
else:
with open(path,'r') as f:
for line in f:
variable_name,profile_dict=line.split('__')
profile_dict=json.loads(profile_dict)
self.weights_dict[variable_name]=profile_dict
return self.weights_dict

modelWeight = ModelWeights()

+ 335
- 0
model_compress/model_compress/ChannelSlimming/util/ofrecord_util.py View File

@@ -0,0 +1,335 @@
"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import oneflow as flow


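# The load_*_for_training / load_*_for_validation helpers below each build an
# OFRecord reader for one dataset layout and return a (label, image) pair of
# blobs normalized with args.rgb_mean / args.rgb_std; the training variants
# additionally shuffle the data and apply a random horizontal mirror.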
def add_ofrecord_args(parser):
parser.add_argument("--image_size", type=int, default=224,
required=False, help="image size")
parser.add_argument("--resize_shorter", type=int, default=256,
required=False, help="resize shorter for validation")
parser.add_argument("--train_data_dir", type=str,
default=None, help="train dataset directory")
parser.add_argument("--train_data_part_num", type=int,
default=256, help="train data part num")
parser.add_argument("--val_data_dir", type=str,
default=None, help="val dataset directory")
parser.add_argument("--val_data_part_num", type=int,
default=256, help="val data part num")
return parser

# old version, no longer used
def load_imagenet(args, batch_size, data_dir, data_part_num, codec):
image_blob_conf = flow.data.BlobConf(
"encoded",
shape=(args.image_size, args.image_size, 3),
dtype=flow.float,
codec=codec,
preprocessors=[flow.data.NormByChannelPreprocessor(args.rgb_mean[::-1],
args.rgb_std[::-1])],
# preprocessors=[flow.data.NormByChannelPreprocessor(args.rgb_mean, args.rgb_std)], #bgr2rgb
)

label_blob_conf = flow.data.BlobConf(
"class/label", shape=(), dtype=flow.int32, codec=flow.data.RawCodec()
)

return flow.data.decode_ofrecord(
data_dir,
(label_blob_conf, image_blob_conf),
batch_size=batch_size,
data_part_num=data_part_num,
part_name_suffix_length=5,
#shuffle = True,
# buffer_size=32768,
name="decode")
# old version, no longer used
def load_cifar10(data_dir, batch_size, data_part_num, image_size=32):
image_blob_conf = flow.data.BlobConf(
"images",
shape=(image_size, image_size, 3),
dtype=flow.float,
codec=flow.data.RawCodec(),
preprocessors=[flow.data.NormByChannelPreprocessor((125.31, 122.96, 113.86), (61.252, 60.767, 65.852))],
)
label_blob_conf = flow.data.BlobConf("labels", shape=(), dtype=flow.int32, codec=flow.data.RawCodec())

return flow.data.decode_ofrecord(
data_dir,
(label_blob_conf, image_blob_conf),
batch_size=batch_size,
data_part_num=data_part_num,
name="decode",
)


def load_synthetic(args):
total_device_num = args.num_nodes * args.gpu_num_per_node
batch_size = total_device_num * args.batch_size_per_device
label = flow.data.decode_random(
shape=(),
dtype=flow.int32,
batch_size=batch_size,
initializer=flow.zeros_initializer(flow.int32),
)

image = flow.data.decode_random(
shape=(args.image_size, args.image_size, 3), dtype=flow.float, batch_size=batch_size
)

return label, image


def load_imagenet_for_training(args):
total_device_num = args.num_nodes * args.gpu_num_per_node
train_batch_size = total_device_num * args.batch_size_per_device

color_space = 'RGB'
ofrecord = flow.data.ofrecord_reader(args.train_data_dir,
batch_size=train_batch_size,
data_part_num=args.train_data_part_num,
part_name_suffix_length=5,
random_shuffle=True,
shuffle_after_epoch=True)
image = flow.data.OFRecordImageDecoderRandomCrop(ofrecord, "encoded", # seed=seed,
color_space=color_space)
label = flow.data.OFRecordRawDecoder(ofrecord, "class/label", shape=(), dtype=flow.int32)
rsz = flow.image.Resize(image, resize_x=args.image_size, resize_y=args.image_size,
color_space=color_space)
rng = flow.random.CoinFlip(batch_size=train_batch_size) # , seed=seed)
normal = flow.image.CropMirrorNormalize(rsz, mirror_blob=rng, color_space=color_space,
mean=args.rgb_mean, std=args.rgb_std, output_dtype=flow.float)
return label, normal


def load_imagenet_for_validation(args):
total_device_num = args.num_nodes * args.gpu_num_per_node
val_batch_size = total_device_num * args.val_batch_size_per_device

color_space = 'RGB'
ofrecord = flow.data.ofrecord_reader(args.val_data_dir,
batch_size=val_batch_size,
data_part_num=args.val_data_part_num,
part_name_suffix_length=5,
shuffle_after_epoch=False)
image = flow.data.OFRecordImageDecoder(ofrecord, "encoded", color_space=color_space)
label = flow.data.OFRecordRawDecoder(ofrecord, "class/label", shape=(), dtype=flow.int32)
rsz = flow.image.Resize(image, resize_shorter=args.resize_shorter, color_space=color_space)
normal = flow.image.CropMirrorNormalize(rsz, color_space=color_space,
crop_h=args.image_size, crop_w=args.image_size, crop_pos_y=0.5, crop_pos_x=0.5,
mean=args.rgb_mean, std=args.rgb_std, output_dtype=flow.float)
return label, normal

def load_cifar_for_training(args):
total_device_num = args.num_nodes * args.gpu_num_per_node
train_batch_size = total_device_num * args.batch_size_per_device

# color_space = 'RGB'
ofrecord = flow.data.ofrecord_reader(args.train_data_dir,
batch_size=train_batch_size,
data_part_num=args.train_data_part_num,
part_name_suffix_length=5,
random_shuffle=True,
shuffle_after_epoch=True)
label = flow.data.OFRecordRawDecoder(ofrecord, "labels", shape=(), dtype=flow.int32)
image = flow.data.OFRecordRawDecoder(ofrecord, "images",
shape=(3, args.image_size, args.image_size),
dtype=flow.float)
image = flow.transpose(image, perm=[0, 2, 3, 1])
image_uint8 = flow.cast(image, flow.uint8)
rng = flow.random.CoinFlip(batch_size=train_batch_size)
normal = flow.image.CropMirrorNormalize(image_uint8, mirror_blob=rng,
mean=args.rgb_mean, std=args.rgb_std)
return label, normal

def load_cifar_for_validation(args):
total_device_num = args.num_nodes * args.gpu_num_per_node
val_batch_size = total_device_num * args.val_batch_size_per_device

# color_space = 'RGB'
ofrecord = flow.data.ofrecord_reader(args.val_data_dir,
batch_size=val_batch_size,
data_part_num=args.val_data_part_num,
part_name_suffix_length=5,
shuffle_after_epoch=False)
label = flow.data.OFRecordRawDecoder(ofrecord, "labels", shape=(), dtype=flow.int32)
image = flow.data.OFRecordRawDecoder(ofrecord, "images",
shape=(3, args.image_size, args.image_size),
dtype=flow.float)
image = flow.transpose(image, perm=[0, 2, 3, 1])
image_uint8 = flow.cast(image, flow.uint8)
normal = flow.image.CropMirrorNormalize(image_uint8, crop_h=args.image_size, crop_w=args.image_size,
crop_pos_y=0.5, crop_pos_x=0.5,
mean=args.rgb_mean, std=args.rgb_std, output_dtype=flow.float)
return label, normal

def load_mnist_for_training(args):
total_device_num = args.num_nodes * args.gpu_num_per_node
train_batch_size = total_device_num * args.batch_size_per_device
ofrecord = flow.data.ofrecord_reader(args.train_data_dir,
batch_size=train_batch_size,
data_part_num=args.train_data_part_num,
part_name_suffix_length=5,
random_shuffle=True,
shuffle_after_epoch=True)
label = flow.data.OFRecordRawDecoder(ofrecord, "labels", shape=(), dtype=flow.int32)
image = flow.data.OFRecordRawDecoder(ofrecord, "images",
shape=(1, args.image_size, args.image_size),
dtype=flow.float)
# print(image.shape)
image = flow.transpose(image, perm=[0, 2, 3, 1])
image_uint8 = flow.cast(image, flow.uint8)
rng = flow.random.CoinFlip(batch_size=train_batch_size)
normal = flow.image.CropMirrorNormalize(image_uint8, mirror_blob=rng, color_space="GRAY",
mean=args.rgb_mean, std=args.rgb_std)
return label, normal


def load_mnist_for_validation(args):
total_device_num = args.num_nodes * args.gpu_num_per_node
val_batch_size = total_device_num * args.val_batch_size_per_device
ofrecord = flow.data.ofrecord_reader(args.val_data_dir,
batch_size=val_batch_size,
data_part_num=args.val_data_part_num,
part_name_suffix_length=5,
shuffle_after_epoch=False)
label = flow.data.OFRecordRawDecoder(ofrecord, "labels", shape=(), dtype=flow.int32)
image = flow.data.OFRecordRawDecoder(ofrecord, "images",
shape=(1, args.image_size, args.image_size),
dtype=flow.float)
image = flow.transpose(image, perm=[0, 2, 3, 1])
image_uint8 = flow.cast(image, flow.uint8)
normal = flow.image.CropMirrorNormalize(image_uint8, crop_h=args.image_size, crop_w=args.image_size,
crop_pos_y=0.5, crop_pos_x=0.5, color_space="GRAY",
mean=args.rgb_mean, std=args.rgb_std, output_dtype=flow.float)
return label, normal

def load_svhn_for_training(args):
total_device_num = args.num_nodes * args.gpu_num_per_node
train_batch_size = total_device_num * args.batch_size_per_device

ofrecord = flow.data.ofrecord_reader(args.train_data_dir,
batch_size=train_batch_size,
data_part_num=args.train_data_part_num,
part_name_suffix_length=5,
random_shuffle=True,
shuffle_after_epoch=True)
label = flow.data.OFRecordRawDecoder(ofrecord, "labels", shape=(), dtype=flow.int32)
image = flow.data.OFRecordRawDecoder(ofrecord, "images",
shape=(args.image_size, args.image_size, 3),
dtype=flow.float)
image_uint8 = flow.cast(image, flow.uint8)
rng = flow.random.CoinFlip(batch_size=train_batch_size)
normal = flow.image.CropMirrorNormalize(image_uint8, mirror_blob=rng,
mean=args.rgb_mean, std=args.rgb_std)
return label, normal

def load_svhn_for_validation(args):
total_device_num = args.num_nodes * args.gpu_num_per_node
val_batch_size = total_device_num * args.val_batch_size_per_device

ofrecord = flow.data.ofrecord_reader(args.val_data_dir,
batch_size=val_batch_size,
data_part_num=args.val_data_part_num,
part_name_suffix_length=5,
shuffle_after_epoch=False)
label = flow.data.OFRecordRawDecoder(ofrecord, "labels", shape=(), dtype=flow.int32)
image = flow.data.OFRecordRawDecoder(ofrecord, "images",
shape=(args.image_size, args.image_size, 3),
dtype=flow.float)
image_uint8 = flow.cast(image, flow.uint8)
normal = flow.image.CropMirrorNormalize(image_uint8, crop_h=args.image_size, crop_w=args.image_size,
crop_pos_y=0.5, crop_pos_x=0.5,
mean=args.rgb_mean, std=args.rgb_std, output_dtype=flow.float)
return label, normal

def load_mydata_for_training(args):
total_device_num = args.num_nodes * args.gpu_num_per_node
train_batch_size = total_device_num * args.batch_size_per_device

# color_space = 'RGB'
ofrecord = flow.data.ofrecord_reader(args.train_data_dir,
batch_size=train_batch_size,
data_part_num=args.train_data_part_num,
part_name_suffix_length=5,
random_shuffle=True,
shuffle_after_epoch=True)
label = flow.data.OFRecordRawDecoder(ofrecord, "labels", shape=(), dtype=flow.int32)
image = flow.data.OFRecordRawDecoder(ofrecord, "images",
shape=(3, args.image_size, args.image_size),
dtype=flow.float)
image = flow.transpose(image, perm=[0, 2, 3, 1])
image_uint8 = flow.cast(image, flow.uint8)
rng = flow.random.CoinFlip(batch_size=train_batch_size)
normal = flow.image.CropMirrorNormalize(image_uint8, mirror_blob=rng,
mean=args.rgb_mean, std=args.rgb_std)
return label, normal

def load_mydata_for_validation(args):
total_device_num = args.num_nodes * args.gpu_num_per_node
val_batch_size = total_device_num * args.val_batch_size_per_device

# color_space = 'RGB'
ofrecord = flow.data.ofrecord_reader(args.val_data_dir,
batch_size=val_batch_size,
data_part_num=args.val_data_part_num,
part_name_suffix_length=5,
shuffle_after_epoch=False)
label = flow.data.OFRecordRawDecoder(ofrecord, "labels", shape=(), dtype=flow.int32)
image = flow.data.OFRecordRawDecoder(ofrecord, "images",
shape=(3, args.image_size, args.image_size),
dtype=flow.float)
image = flow.transpose(image, perm=[0, 2, 3, 1])
image_uint8 = flow.cast(image, flow.uint8)
normal = flow.image.CropMirrorNormalize(image_uint8, crop_h=args.image_size, crop_w=args.image_size,
crop_pos_y=0.5, crop_pos_x=0.5,
mean=args.rgb_mean, std=args.rgb_std, output_dtype=flow.float)
return label, normal


if __name__ == "__main__":
import os
import config as configs
from util import Summary, Metric
from job_function_util import get_val_config
parser = configs.get_parser()
args = parser.parse_args()
configs.print_args(args)

flow.config.gpu_device_num(args.gpu_num_per_node)
flow.config.enable_debug_mode(True)
@flow.global_function(get_val_config(args))
def IOTest():
if args.train_data_dir:
assert os.path.exists(args.train_data_dir)
print("Loading data from {}".format(args.train_data_dir))
(labels, images) = load_imagenet_for_training(args)
else:
print("Loading synthetic data.")
(labels, images) = load_synthetic(args)
outputs = {"images": images, "labels": labels}
return outputs

total_device_num = args.num_nodes * args.gpu_num_per_node
train_batch_size = total_device_num * args.batch_size_per_device
summary = Summary(args.log_dir, args, filename='io_test.csv')
metric = Metric(desc='io_test', calculate_batches=args.loss_print_every_n_iter,
summary=summary, save_summary_steps=args.loss_print_every_n_iter,
batch_size=train_batch_size, prediction_key=None)
for i in range(1000):
IOTest().async_get(metric.metric_cb(0, i))

+ 93
- 0
model_compress/model_compress/ChannelSlimming/util/optimizer_util.py View File

@@ -0,0 +1,93 @@
"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math

def add_optimizer_args(parser):
group = parser.add_argument_group('optimizer parameters',
'entire group applies only to optimizer parameters')
group.add_argument("--model_update", type=str, default="momentum", help="sgd, adam, momentum")
group.add_argument("--learning_rate", type=float, default=0.256)
group.add_argument("--wd", type=float, default=1.0/32768, help="weight decay")
group.add_argument("--mom", type=float, default=0.875, help="momentum")
group.add_argument('--lr_decay', type=str, default='cosine', help='cosine, step, polynomial, None')
group.add_argument('--warmup_epochs', type=int, default=5,
help='the epochs to ramp-up lr to scaled large-batch value')
return parser

def gen_model_update_conf(args):
total_device_num = args.num_nodes * args.gpu_num_per_node
train_batch_size = total_device_num * args.batch_size_per_device
epoch_size = math.ceil(args.num_examples / train_batch_size)
num_train_batches = epoch_size * args.num_epochs
num_warmup_batches = epoch_size * args.warmup_epochs
decay_batches = num_train_batches - num_warmup_batches

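    # Illustrative example: with --model_update=adam, --lr_decay=cosine and
    # --warmup_epochs=0, the dict returned below looks like
    #   {"adam_conf": {"beta1": 0.9},
    #    "learning_rate_decay": {"cosine_conf": {"decay_batches": decay_batches}}}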
model_update_conf = {}
# basic model update
if args.model_update == 'sgd':
model_update_conf["naive_conf"] = {}
elif args.model_update == 'adam':
model_update_conf["adam_conf"] = {"beta1": 0.9}
elif args.model_update == 'momentum':
assert args.mom < 1.0
assert args.mom > 0.0
model_update_conf["momentum_conf"] = {"beta": args.mom}
else:
assert False

# learning rate warmup
if args.warmup_epochs > 0: #linear warmup only
model_update_conf['warmup_conf'] = {"linear_conf": {
"warmup_batches": num_warmup_batches,
"start_multiplier": 0,
}}

# learning rate decay
if args.lr_decay == 'cosine':
model_update_conf['learning_rate_decay'] = {"cosine_conf": {"decay_batches": decay_batches}}
elif args.lr_decay == 'step':
boundaries = [x * epoch_size for x in [30, 60, 80]]
scales = [1, 0.1, 0.01, 0.001]
model_update_conf['learning_rate_decay'] = {"piecewise_scaling_conf": {
"boundaries": boundaries,
"scales":scales,
}}
elif args.lr_decay == 'polynomial':
model_update_conf['learning_rate_decay'] = {"polynomial_conf": {
"decay_batches": decay_batches,
"end_learning_rate": 0.00001,
}}
# weight decay
# if args.wd > 0:
# assert args.wd < 1.0
# model_update_conf['weight_decay_conf'] = {
# "weight_decay_rate": args.wd,
# "excludes": {"pattern": ['_bn-']}
# }

import pprint
pprint.pprint(model_update_conf)
return model_update_conf


if __name__ == '__main__':
import config as configs
parser = configs.get_parser()
args = parser.parse_args()
configs.print_args(args)
gen_model_update_conf(args)

+ 374
- 0
model_compress/model_compress/ChannelSlimming/util/util.py View File

@@ -0,0 +1,374 @@
"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import time
import numpy as np
import pandas as pd
import oneflow as flow
import util.ofrecord_util as ofrecord_util
from util.model_weights import modelWeight
import json

def InitNodes(args):
if args.num_nodes > 1:
assert args.num_nodes <= len(args.node_ips)
flow.env.ctrl_port(12138)
nodes = []
for ip in args.node_ips:
addr_dict = {}
addr_dict["addr"] = ip
nodes.append(addr_dict)

flow.env.machine(nodes)
# load cfg (model structure)
def LoadCfg(args, model_load_dir, load_type):
if model_load_dir:
if args.model == "resnet":
assert os.path.isdir(model_load_dir)
of_weight_path = model_load_dir.rsplit("/",1)[0] + "/weights_profile_path"
cfg_temp = []
cfg = []
weights_dict = modelWeight.load(of_weight_path)
for name, profile_dict in weights_dict.items():
if name.endswith("weight") and "stem" not in name and "shortcut" not in name:
shape=profile_dict["shape"]
cfg_temp.append(shape[0])
cfg.append(cfg_temp[0:9])
cfg.append(cfg_temp[9:21])
cfg.append(cfg_temp[21:39])
cfg.append(cfg_temp[39:48])
cfg.append(cfg_temp[48])
if load_type == 'train':
modelWeight.weights_dict = {}
else:
assert os.path.isdir(model_load_dir)
of_weight_path = model_load_dir.rsplit("/",1)[0] + "/weights_profile_path"
cfg = []
weights_dict = modelWeight.load(of_weight_path)
for name, profile_dict in weights_dict.items():
if name.endswith("weight"):
shape=profile_dict["shape"]
cfg.append(shape[0])
# print(load_type, modelWeight.weights_dict)
if load_type == 'train':
modelWeight.weights_dict = {}
else:
if args.model == 'vgg':
# cfg = [64, 64, 128, 128, 256, 256, 256, 512, 512, 512, 512, 512, 512, 4096, 4096, args.num_classes]
cfg = [64, 64, 128, 128, 256, 256, 256, 512, 512, 512, 512, 512, 512, 512, 128, args.num_classes]
elif args.model == 'alexnet':
cfg = [96, 256, 384, 384, 256, 4096, 4096, args.num_classes]
elif args.model == 'alexnet_simple':
cfg = [24, 96, 192, 192, 96, 1024, 1024, args.num_classes]
elif args.model == 'lenet':
cfg = [6, 16, 120, 84, args.num_classes]
elif args.model == "resnet":
cfg = [[64, 64, 256, 64, 64, 256, 64, 64, 256],
[128, 128, 512, 128, 128, 512, 128, 128, 512, 128, 128, 512],
[256, 256, 1024, 256, 256, 1024, 256, 256, 1024, 256, 256, 1024, 256, 256, 1024, 256, 256, 1024],
[512, 512, 2048, 512, 512, 2048, 512, 512, 2048], args.num_classes]
elif args.model == 'dnn_2':
cfg = [128, args.num_classes]
elif args.model == 'dnn_4':
cfg = [4096, 256, 128, args.num_classes]
else:
cfg = []
if load_type == 'train':
print('Model structure:', cfg)
return cfg

# load data (labels and images)
def LoadData(args, load_type):
# total_device_num = args.num_nodes * args.gpu_num_per_node
# train_batch_size = total_device_num * args.batch_size_per_device
# val_batch_size = total_device_num * args.val_batch_size_per_device
if load_type == 'train':
if args.train_data_dir:
assert os.path.exists(args.train_data_dir)
print("Loading data from {}".format(args.train_data_dir))
if args.data_type == 'imageNet':
(labels, images) = ofrecord_util.load_imagenet_for_training(args)
elif args.data_type == 'cifar10' or args.data_type == 'cifar100':
(labels, images) = ofrecord_util.load_cifar_for_training(args)
elif args.data_type == 'mnist' or args.data_type == 'mnist_32':
(labels, images) = ofrecord_util.load_mnist_for_training(args)
elif args.data_type == 'svhn':
(labels, images) = ofrecord_util.load_svhn_for_training(args)
elif args.data_type == 'random':
(labels, images) = ofrecord_util.load_synthetic(args)
else:
(labels, images) = ofrecord_util.load_mydata_for_training(args)
else:
print("Loading synthetic data.")
(labels, images) = ofrecord_util.load_synthetic(args)
elif load_type == 'test':
if args.val_data_dir:
assert os.path.exists(args.val_data_dir)
print("Loading data from {}".format(args.val_data_dir))
if args.data_type == 'imageNet':
(labels, images) = ofrecord_util.load_imagenet_for_validation(args)
elif args.data_type == 'cifar10' or args.data_type == 'cifar100':
(labels, images) = ofrecord_util.load_cifar_for_training(args)
elif args.data_type == 'mnist' or args.data_type == "mnist_32":
(labels, images) = ofrecord_util.load_mnist_for_validation(args)
elif args.data_type == 'svhn':
(labels, images) = ofrecord_util.load_svhn_for_validation(args)
elif args.data_type == 'random':
(labels, images) = ofrecord_util.load_synthetic(args)
else:
(labels, images) = ofrecord_util.load_mydata_for_training(args)
else:
print("Loading synthetic data.")
(labels, images) = ofrecord_util.load_synthetic(args)
else:
print("Loading synthetic data.")
(labels, images) = ofrecord_util.load_synthetic(args)
return labels, images

#get save path and load path of model
#def getSaveLoadDir(args):
# if args.default_dir == 'train':
# model_save_dir = './output/snapshots/model_base'
# if args.data_type == 'imageNet':
# if args.model == 'vgg':
# model_load_dir = './model_init/vgg/model_init_imageNet/of_init_model'
# elif args.model == 'alexnet':
# model_load_dir = './model_init/alexnet/model_init_imageNet/of_init_model'
# elif args.model == 'lenet':
# model_load_dir = './model_init/lenet/model_init_imageNet/of_init_model'
# elif args.data_type == 'cifar10':
# if args.model == 'vgg':
# model_load_dir = './model_init/vgg/model_init_cifar10/of_init_model'
# elif args.model == 'alexnet':
# model_load_dir = './model_init/alexnet/model_init_cifar10/of_init_model'
# elif args.model == 'lenet':
# model_load_dir = './model_init/lenet/model_init_cifar10/of_init_model'
# elif args.default_dir == 'refine':
# model_save_dir = './output/snapshots/model_refine'
# model_load_dir = './output/snapshots/model_prune/model'
# else:
# model_save_dir = args.model_save_dir
# model_load_dir = args.model_load_dir
# return model_save_dir, model_load_dir

class Snapshot(object):
def __init__(self, model_save_dir, model_load_dir):
self._model_save_dir = model_save_dir
self._check_point = flow.train.CheckPoint()
if model_load_dir:
assert os.path.isdir(model_load_dir)
print("Restoring model from {}.".format(model_load_dir))
self._check_point.load(model_load_dir)
else:
self._check_point.init()
self.save('initial_model')
print("Init model on demand.")

def save(self, name):
snapshot_save_path = os.path.join(self._model_save_dir, "snapshot_{}".format(name))
if not os.path.exists(snapshot_save_path):
os.makedirs(snapshot_save_path)
print("Saving model to {}.".format(snapshot_save_path))
self._check_point.save(snapshot_save_path)


class Summary(object):
def __init__(self, log_dir, config, filename='summary.csv'):
self._filename = filename
self._log_dir = log_dir
if not os.path.exists(log_dir): os.makedirs(log_dir)
self._metrics = pd.DataFrame({"epoch":0, "iter": 0, "legend": "cfg", "note": str(config)}, index=[0])

def scalar(self, legend, value, epoch, step=-1):
# TODO: support rank(which device/gpu)
df = pd.DataFrame(
{"epoch": epoch, "iter": step, "legend": legend, "value": value, "rank": 0},
index=[0])
self._metrics = pd.concat([self._metrics, df], axis=0, sort=False)

def save(self):
save_path = os.path.join(self._log_dir, self._filename)
self._metrics.to_csv(save_path, index=False)


class StopWatch(object):
def __init__(self):
pass

def start(self):
self.start_time = time.time()
self.last_split = self.start_time

def split(self):
now = time.time()
duration = now - self.last_split
self.last_split = now
return duration

def stop(self):
self.stop_time = time.time()

def duration(self):
return self.stop_time - self.start_time


def match_top_k(predictions, labels, top_k=1):
max_k_preds = np.argpartition(predictions.numpy(), -top_k)[:, -top_k:]
match_array = np.logical_or.reduce(max_k_preds==labels.reshape((-1, 1)), axis=1)
num_matched = match_array.sum()
return num_matched, match_array.shape[0]


class Metric(object):
def __init__(self, summary=None, save_summary_steps=-1, desc='train', calculate_batches=-1,
batch_size=256, top_k=6, prediction_key='predictions', label_key='labels',
loss_key=None):
self.summary = summary
self.save_summary = isinstance(self.summary, Summary)
self.save_summary_steps = save_summary_steps
self.desc = desc
self.calculate_batches = calculate_batches
self.top_k = top_k
self.prediction_key = prediction_key
self.label_key = label_key
self.loss_key = loss_key
self.teacher_model_size = 0
self.student_model_size = 0
if loss_key:
self.fmt = "{}: epoch {}, iter {}, loss: {:.6f}, accuracy(top1): {:.6f}, accuracy(topk): {:.6f}, samples/s: {:.3f}"
else:
self.fmt = "{}: epoch {}, iter {}, accuracy(top1): {:.6f}, accuracy(topk): {:.6f}, samples/s: {:.3f}"

self.timer = StopWatch()
self.timer.start()
self._clear()

def _clear(self):
self.top_1_num_matched = 0
self.top_k_num_matched = 0
self.num_samples = 0.0

def metric_cb(self, epoch, step, args=None, log_file=None):
def callback(outputs):
if step == 0: self._clear()
if self.prediction_key:
num_matched, num_samples = match_top_k(outputs[self.prediction_key],
outputs[self.label_key])
self.top_1_num_matched += num_matched
num_matched, _ = match_top_k(outputs[self.prediction_key],
outputs[self.label_key], self.top_k)
self.top_k_num_matched += num_matched
else:
num_samples = outputs[self.label_key].shape[0]

self.num_samples += num_samples

if (step + 1) % self.calculate_batches == 0:
throughput = self.num_samples / self.timer.split()
if self.prediction_key:
top_1_accuracy = self.top_1_num_matched / self.num_samples
top_k_accuracy = self.top_k_num_matched / self.num_samples
else:
top_1_accuracy = 0.0
top_k_accuracy = 0.0

if self.loss_key:
loss = outputs[self.loss_key].mean()
print(self.fmt.format(self.desc, epoch, step + 1, loss, top_1_accuracy,
top_k_accuracy, throughput))
# print(outputs[self.prediction_key].numpy(),
# outputs[self.label_key].numpy(),
# outputs['logits'].numpy())
if self.save_summary:
self.summary.scalar(self.desc+"_" + self.loss_key, loss, epoch, step)
else:
print('*'*106)
print(self.fmt.format(self.desc, epoch, step + 1, top_1_accuracy,
top_k_accuracy, throughput))


if self.desc=='validation':

def getdirsize(dir):
size = 0
for root, dirs, files in os.walk(dir):
for name in files:
                                # skip optimizer state folders (names ending in '-v' or '-m', e.g. Adam moments)
                                # so that only the model weight files are counted
                                if str(root[-2:]) == '-v' or str(root[-2:]) == '-m':
                                    pass
                                else:
tmp = os.path.getsize(os.path.join(root, name))
size += tmp
# size += sum([os.path.getsize(os.path.join(root, name)) for name in files])
return size
model_size = 0
if args.log_type == 'base_model':
if os.path.exists(os.path.join(args.model_save_dir,'snapshot_initial_model')):
self.teacher_model_size = getdirsize(os.path.join(args.model_save_dir,'snapshot_initial_model'))
elif os.path.exists(os.path.join(args.model_save_dir,'snapshot_last')):
self.teacher_model_size = getdirsize(os.path.join(args.model_save_dir,'snapshot_last'))
elif os.path.exists(os.path.join(args.model_save_dir,'snapshot_epoch_0')):
self.teacher_model_size = getdirsize(os.path.join(args.model_save_dir,'snapshot_epoch_0'))
else:
                            print('Error: {} not found'.format(args.model_save_dir))
                        model_size = self.teacher_model_size  # teacher model size, i.e. the size of the model_base/snapshot_initial_model folder
elif args.log_type == 'prune_model':
if os.path.exists(args.model_load_dir):
self.student_model_size = getdirsize(args.model_load_dir)
else:
                            print('Error: {} not found'.format(args.model_load_dir))
                        model_size = self.student_model_size  # student model size, i.e. the size of the model_prune/model folder

save_dict = {"accuracy": "%.2f" % top_1_accuracy,
"top_k_accuracy": "%.2f" % top_k_accuracy,
"top_k": "%d" % self.top_k,
"modelSize": "%d" % (model_size / 1024 / 1024),
"reasoningTime": "%.2f" % throughput
} # samples/second

if args.log_type == 'base_model':
if not os.path.exists(args.before_result_dir):
os.makedirs(args.before_result_dir)
with open(os.path.join(args.before_result_dir, "results_eval.json"), "w") as f:
json.dump(save_dict, f)
if args.log_type == 'prune_model':
if not os.path.exists(args.after_result_dir):
os.makedirs(args.after_result_dir)
with open(os.path.join(args.after_result_dir, "results_eval.json"), "w") as f:
json.dump(save_dict, f)
if log_file:
log_file.write("epoch"+str(epoch)+" top_1_accuracy: "+str(top_1_accuracy)+\
"; top_k_accuracy: "+str(top_k_accuracy)+"; "+str(throughput)+"samples/s\n")
print('*'*106)

self._clear()
if self.save_summary:
self.summary.scalar(self.desc + "_throughput", throughput, epoch, step)
if self.prediction_key:
self.summary.scalar(self.desc + "_top_1", top_1_accuracy, epoch, step)
self.summary.scalar(self.desc + "_top_{}".format(self.top_k),
top_k_accuracy, epoch, step)

if self.save_summary:
if (step + 1) % self.save_summary_steps == 0:
self.summary.save()

return callback



+ 0  - 0    model_compress/model_compress/__init__.py


+ 196  - 0    model_compress/model_compress/distil/README.md

@@ -0,0 +1,196 @@
# Knowledge Distillation Quick Start

## 1. Introduction
Knowledge distillation learns a small student model from a large, knowledge-rich teacher model through a set of optimization objectives.

The Lianzhi technology platform provides 4 knowledge-distillation operators, together with a number of knowledge-distillation models reproduced with OneFlow operators and usage examples (a minimal sketch of the common soft-label objective follows the table below).
<table>
  <thead>
    <tr>
      <th>Type</th>
      <th>Distillation model</th>
      <th><a href="../../docs/API_knowledge_distill.md" target="_blank">Main operators</a></th>
      <th>Usage docs</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td rowspan="2">Soft-label distillation</td>
      <td>KD</td>
      <td>Soft-label distillation</td>
      <td><a href="./examples/knowledge_distillation/README.md" target="_blank">Link</a></td>
    </tr>
    <tr>
      <td>Distilled-BiLSTM</td>
      <td>Soft-label distillation, distilling BERT into a BiLSTM</td>
      <td><a href="./examples/distilled-bilstm/README.md" target="_blank">Link</a></td>
    </tr>
    <tr>
      <td rowspan="2">Distillation from other knowledge</td>
      <td>BERT-PKD</td>
      <td>Soft-label distillation + layer-to-layer distillation</td>
      <td><a href="./examples/bert-pkd/README.md" target="_blank">Link</a></td>
    </tr>
    <tr>
      <td>TinyBERT</td>
      <td>Soft-label distillation + layer-to-layer distillation + attention distillation</td>
      <td><a href="./examples/tinybert/README.md" target="_blank">Link</a></td>
    </tr>
    <tr>
      <td>Module replacement</td>
      <td>BERT-Theseus</td>
      <td>Trains a new model in which the original BERT modules are replaced by Theseus modules according to a replacement probability</td>
      <td><a href="theseus/README.md" target="_blank">Link</a></td>
    </tr>
  </tbody>
</table>
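
The sketch below illustrates that common soft-label (KD) objective: a temperature-softened cross-entropy against the teacher's outputs mixed with the usual hard-label loss. It is plain NumPy and purely illustrative; `temperature` and `alpha` are names chosen for this sketch, not arguments of this repository's scripts, and the examples themselves use the OneFlow operators in `knowledge_distill_util.py` (e.g. `soft_cross_entropy`, `pred_distill`).

```python
import numpy as np

def softmax(x, t=1.0):
    # temperature-scaled softmax over the last axis
    e = np.exp(x / t - np.max(x / t, axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

def kd_loss(student_logits, teacher_logits, labels, temperature=2.0, alpha=0.5):
    """Soft-label distillation: soft cross-entropy against the teacher's
    temperature-softened outputs, mixed with cross-entropy on the hard labels."""
    p_teacher = softmax(teacher_logits, temperature)                    # soft targets
    log_p_student = np.log(softmax(student_logits, temperature) + 1e-12)
    soft_loss = -(p_teacher * log_p_student).sum(axis=-1).mean()
    hard_log_p = np.log(softmax(student_logits) + 1e-12)
    hard_loss = -hard_log_p[np.arange(len(labels)), labels].mean()
    return alpha * soft_loss + (1.0 - alpha) * hard_loss

# toy usage: a batch of 2 examples with 3 classes
student = np.array([[1.0, 0.2, -0.5], [0.1, 2.0, 0.3]])
teacher = np.array([[2.0, 0.1, -1.0], [0.0, 3.0, 0.2]])
labels = np.array([0, 1])
print(kd_loss(student, teacher, labels))
```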

## 2. Usage
### 2.1 Dependencies
- Python 3.6
- oneflow-cu101 0.1.10
- numpy 1.19.2

The full environment can be installed with:
```bash
conda create -n distil python=3.6
```

```
python3 -m pip install --find-links https://oneflow-inc.github.io/nightly oneflow_cu101 --user
```
### 2.2 Getting the data
Knowledge distillation mainly targets NLP tasks; the Lianzhi platform evaluates the different algorithms on the GLUE datasets.

All GLUE datasets can be downloaded by running the script below; they are downloaded and unpacked into the '--data_dir=data' directory automatically.

```
bash run_download_glue_data.sh
```
or
```bash
python ../src/download_glue_data.py --data_dir data/glue_data --tasks all
```

TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "SNLI", "QNLI", "RTE", "WNLI", "diagnostic"]

The script above downloads all GLUE datasets by default; specific datasets can be selected with '--tasks=TASKS'.

The GLUE datasets can also be downloaded here and placed under the corresponding directory (data/glue_data):
Link: https://pan.baidu.com/s/1Im0uQM_V_zXkmQuHsFae0A  extraction code: u64u

Follow [Loading and preparing OneFlow datasets](https://github.com/Oneflow-Inc/oneflow-documentation/blob/master/cn/docs/extended_topics/how_to_make_ofdataset.md) to build the OFRecord datasets, or run the following command to generate them:
```
bash glue_process.sh
```

**Alternatively, download the already converted OFRecord GLUE datasets directly and place them under the corresponding directory (data/glue_ofrecord):**
Link: https://pan.baidu.com/s/1CY2BfCGBZEeo1EgY5JQcuA  extraction code: v2h4

### 2.3 Fine-tuning the teacher model
Pre-trained BERT model download:
Link: https://pan.baidu.com/s/1jfTUY7ygcZZOJzjfrgUL8Q  extraction code: 6b87

After downloading, place it under `./models/uncased_L-12_H-768_A-12_oneflow`.
#### 2.3.1 Training
- Run the following script to fine-tune the teacher model:
    - DATA_ROOT: root path of the GLUE datasets
    - dataset: task name
    - MODEL_SAVE_DIR: directory to save the model
    - RESULT_DIR: directory for the evaluation-result JSON file (if RESULT_DIR="", results_eval.json is saved to the model save directory by default)
    - SERVE_FOR_ONLINE: whether the model will be deployed online (default SERVE_FOR_ONLINE='False'; if SERVE_FOR_ONLINE='True', variables irrelevant for serving, such as teacher-model parameters and optimizer parameters, are removed from the model save directory)

```bash
bash run_train_teacher.sh
```
- Our fine-tuned teacher models can be downloaded here: https://pan.baidu.com/s/1jiOTSPBmmBoij0UwPO6UKw  extraction code: 9xkp
    - Fine-tuned on the SST-2, QQP, MRPC, RTE and CoLA datasets
    - Place them under `"model_compress/distil/models/finetuned_teacher/"`.
    - Dev-set performance on these datasets: SST-2: 92.2%, QQP: 91.1%, MRPC: 89.2%, RTE: 69.8%, CoLA: 58.5%
    - Evaluation metrics:
        - Accuracy: SST-2, MRPC, QQP, RTE
        - MCC (Matthews correlation coefficient): CoLA

#### 2.3.2 Evaluation
- After fine-tuning, run the following script to evaluate the teacher model:
    - DATA_ROOT: root path of the GLUE datasets
    - dataset: task name
    - TEACHER_MODEL_DIR: path of the teacher model

```bash
bash run_eval_teacher.sh
```


### 2.4 Distilling into the student model
#### 2.4.1 Training
Run one of the following scripts to distill the teacher model into a student model:
- DATA_ROOT: root path of the GLUE datasets
- dataset: task name
- FT_BERT_BASE_DIR: path of the teacher model fine-tuned on the specific task
- TMP_STUDENT_DIR: path of the temporary student model (if needed; otherwise set TMP_STUDENT_DIR="")
- STUDENT_DIR: directory to save the student model
- RESULT_DIR: directory for the evaluation-result JSON file (if RESULT_DIR="", results_eval.json is saved to the model save directory by default)
- SERVE_FOR_ONLINE: whether the model will be deployed online (default SERVE_FOR_ONLINE='False'; if SERVE_FOR_ONLINE='True', variables irrelevant for serving, such as teacher-model parameters and optimizer parameters, are removed from the model save directory)

- The different knowledge-distillation algorithms:
    - KD
    ```bash
    bash run_train_student_kd.sh
    ```
    - Distilled-BiLSTM
    ```bash
    bash run_train_student_distilled_lstm.sh
    ```
    - BERT-PKD
    ```bash
    bash run_train_student_bert_pkd.sh
    ```
    > Note: BERT-PKD can be initialized randomly or from the intermediate layers of the teacher BERT; see the detailed steps [here](./examples/bert-pkd/README.md#41-教师模型中间层保存与转换)
    > Temporary student model download link (SST-2, RTE, MRPC, CoLA, QQP datasets): https://pan.baidu.com/s/17F8KVsLd_lMODLaVLc7yrQ  extraction code: 95ir
    > Download and unpack it, then place the corresponding model under `"./models/student_model/bert_pkd_3"`
    - TinyBERT
    ```bash
    bash run_train_student_tinybert.sh
    ```
    > Temporary student model (general TinyBERT) download link: https://pan.baidu.com/s/1vZDILxXi-uxo2v3zFlWL3A  extraction code: kpia

> BERT-style models use a maximum sequence length of 128; the LSTM model uses a maximum sequence length of 32 and a vocabulary size of 10000

#### 2.4.2 Evaluation
Run the following scripts to evaluate the student models:
- DATA_ROOT: root path of the GLUE datasets
- dataset: task name
- STUDENT_DIR: directory of the saved student model; download links for the distilled student models (SST-2 dataset) are listed below
- RESULT_DIR: directory for the evaluation-result JSON file (if RESULT_DIR="", results_eval.json is saved to the model save directory by default)

- The different knowledge-distillation algorithms:
    - KD
    Download link: https://pan.baidu.com/s/1EgQyQgxAcFAG8Ch3-4VPaw  extraction code: 5k9p
    ```bash
    bash run_eval_student_kd.sh
    ```
    - Distilled-BiLSTM
    Download link: https://pan.baidu.com/s/1M4XzB2DnLikglxVFvhnYpw  extraction code: hqhj
    ```bash
    bash run_eval_student_distilled_lstm.sh
    ```
    - BERT-PKD
        - Initialized from the teacher's intermediate layers, download link: https://pan.baidu.com/s/1l7vXn-3U05Hzl0RXCJPiLg  extraction code: 33dk
        - Randomly initialized, download link: https://pan.baidu.com/s/1m46j57Tova_yaGLabAqUIw  extraction code: pdx4
    ```bash
    bash run_eval_student_bert_pkd.sh
    ```

    - TinyBERT
    Download link: https://pan.baidu.com/s/1nOAZHd3wLmyVw2vTJB7KfQ  extraction code: ma65
    ```bash
    bash run_eval_student_tinybert.sh
    ```



+ 0  - 0    model_compress/model_compress/distil/__init__.py


+ 89  - 0    model_compress/model_compress/distil/examples/bert-pkd/README.md

@@ -0,0 +1,89 @@
# BERT-PKD
An implementation of the paper ["Patient knowledge distillation for bert model compression"](https://arxiv.org/abs/1908.09355).

With conventional KD the student only learns the teacher's final predicted probability distribution and completely ignores the intermediate hidden representations, so the student tends to overfit and generalizes poorly.
In addition to soft-label distillation, BERT-PKD therefore also distills the teacher's intermediate layers.
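
As a rough illustration of the "patient" part of this objective, the sketch below computes a normalized mean-squared error between the student's hidden states and selected teacher layers. It is plain NumPy under assumed shapes (one hidden vector per example, e.g. the [CLS] state) and is not the repository's OneFlow implementation, which lives in `task_student_bert-pkd.py`.

```python
import numpy as np

def l2_normalize(h, eps=1e-12):
    # normalize each hidden vector to unit length, as in the PKD loss
    return h / (np.linalg.norm(h, axis=-1, keepdims=True) + eps)

def patient_loss(student_hidden, teacher_hidden, layer_map=(2, 6, 10)):
    """MSE between the student's hidden states and the teacher's hidden states
    at the selected layers (e.g. teacher layers 2, 6, 10 -> student layers 1, 2, 3)."""
    loss = 0.0
    for s_idx, t_idx in enumerate(layer_map):
        s = l2_normalize(student_hidden[s_idx])   # (batch, hidden)
        t = l2_normalize(teacher_hidden[t_idx])
        loss += np.mean((s - t) ** 2)
    return loss / len(layer_map)

# toy usage: a 3-layer student distilled from a 12-layer teacher
rng = np.random.default_rng(0)
student = [rng.normal(size=(4, 8)) for _ in range(3)]    # batch 4, hidden 8
teacher = [rng.normal(size=(4, 8)) for _ in range(12)]
print(patient_loss(student, teacher))
```

In this repository the total training loss mixes the hard-label cross-entropy, the soft-label distillation loss, and this patient-loss term, weighted by `kd_alpha` and `kd_beta` (see `task_student_bert-pkd.py`).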

## 1. Dependencies
- Python 3.6
- oneflow-cu101 0.1.10

The full environment can be installed with:
```bash
conda create -n tinybert python=3.6
```

```bash
python3 -m pip install --find-links https://oneflow-inc.github.io/nightly oneflow_cu101 --user
```
> Note: the commands below assume `model_compress/distil` as the working directory.

## 2. Getting the data
See [here](../../README.md#22-数据获取) for how to get the data.

## 3. Fine-tuning the teacher model
See [here](../../README.md#23-微调教师模型) for how to fine-tune the teacher model.
## 4. Distilling into the student model
### 4.1 Saving and converting the teacher's intermediate layers
Instead of initializing the student model randomly, we can initialize it from the teacher's intermediate-layer parameters to start from a better student.

Run the following command to extract and save selected intermediate layers of the teacher model for initializing the student:
- FT_BERT_BASE_DIR: path of the teacher model fine-tuned on the specific task
- TMP_STUDENT_DIR: path of the temporary student model
- LAYER_LIST: the layers to save, e.g. "2,6,10" saves teacher layers 2, 6 and 10 to initialize student layers 1, 2 and 3
```bash
FT_BERT_BASE_DIR="./models/finetuned_teacher/SST-2_epoch-3_lr-2e-5_wd-0.0001/snapshot_best"
#FT_BERT_BASE_DIR="./models/finetuned_teacher/RTE_epoch-5_lr-3e-5_wd-0.0001/snapshot_best"
#FT_BERT_BASE_DIR="./models/finetuned_teacher/MRPC_epoch-5_lr-1e-5_wd-0.001/snapshot_best"
#FT_BERT_BASE_DIR="./models/finetuned_teacher/CoLA_epoch-5_lr-1e-5_wd-0.01/snapshot_best"
#FT_BERT_BASE_DIR="./models/finetuned_teacher/QQP_epoch-5_lr-2e-5_wd-0.0001/snapshot_best"

TMP_STUDENT_DIR='./models/student_model/bert_pkd_3/SST-2'
LAYER_LIST="2,6,10"
python3 examples/bert-pkd/bert-pkd_generate_student_model.py \
--teacher_model=${FT_BERT_BASE_DIR} \
--student_model=${TMP_STUDENT_DIR} \
--layer_list=${LAYER_LIST}
```

Temporary student model download link (SST-2, RTE, MRPC, CoLA, QQP datasets)

Link: https://pan.baidu.com/s/17F8KVsLd_lMODLaVLc7yrQ  extraction code: 95ir

Download and unpack it, then place the corresponding model under `"./models/student_model/bert_pkd_3"`

### 4.2 Training
Run the following script to distill the teacher model into the student model:
- DATA_ROOT: root path of the GLUE datasets
- dataset: task name
- FT_BERT_BASE_DIR: path of the teacher model fine-tuned on the specific task
- TMP_STUDENT_DIR: path of the temporary student model (required when initializing from the teacher's intermediate layers)
- STUDENT_DIR: directory to save the student model
- RESULT_DIR: directory for the evaluation-result JSON file (if RESULT_DIR="", results_eval.json is saved to the model save directory by default)
- SERVE_FOR_ONLINE: whether the model will be deployed online (default SERVE_FOR_ONLINE='False'; if SERVE_FOR_ONLINE='True', variables irrelevant for serving, such as teacher-model parameters and optimizer parameters, are removed from the model save directory)

```bash
bash run_train_student_bert_pkd.sh
```

### 4.3 Evaluation
Run the following script to evaluate:
- DATA_ROOT: root path of the GLUE datasets
- dataset: task name
- STUDENT_DIR: directory of the saved student model
- RESULT_DIR: directory for the evaluation-result JSON file (if RESULT_DIR="", results_eval.json is saved to the model save directory by default)

Download links for the distilled student model (SST-2 dataset):

- Initialized from the teacher's intermediate layers, download link: https://pan.baidu.com/s/1l7vXn-3U05Hzl0RXCJPiLg  extraction code: 33dk
- Randomly initialized, download link: https://pan.baidu.com/s/1m46j57Tova_yaGLabAqUIw  extraction code: pdx4
```bash
bash run_eval_student_bert_pkd.sh
```
### 4.4 Results
On the SST-2 dev set:
- Accuracy: teacher 92.2% -> student 88.4%
- Model size: teacher 110M -> student 45.7M (↓2.4x)
- Inference time: teacher 4.04s -> student 1.69s (↓2.4x)


+ 76  - 0    model_compress/model_compress/distil/examples/bert-pkd/bert-pkd_generate_student_model.py

@@ -0,0 +1,76 @@
"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import os
import argparse
import shutil
import re

def str2bool(v):
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Unsupported value encountered.')

parser = argparse.ArgumentParser()
parser.add_argument("--teacher_model", default=None, type=str, help="The teacher model dir.")
parser.add_argument("--student_model", default=None, type=str, help="The student model dir.")
parser.add_argument("--layer_list", default="2,6,10", type=str, help="the set of intermediate layers to distill knowledge from")
args = parser.parse_args()

args.layer_list = args.layer_list.split(',')
args.layer_list = [int(i) for i in args.layer_list]
args.layer_num = len(args.layer_list)

student_filelist = []

def subString(template):
rule = r'bert-encoder-layer_(.*?)-'
slotList = re.findall(rule, template)
return slotList

def CopyFile(filepath, newPath):
if not os.path.exists(newPath):
os.makedirs(newPath)
fileNames = os.listdir(filepath)
for file in fileNames:
newDir = os.path.join(filepath,file)
if os.path.isfile(newDir):
newFile = os.path.join(newPath, file)
shutil.copyfile(newDir, newFile)
else:

if not os.path.exists(os.path.join(newPath, file)):
os.makedirs(os.path.join(newPath, file))
CopyFile(newDir,os.path.join(newPath, file))

if not os.path.exists(args.student_model):
os.makedirs(args.student_model)

for a, b, c in os.walk(args.teacher_model):
for subdir in b:
if str(subdir[-2:])=='-v' or str(subdir[-2:])=='-m':
continue
teacher_layer_num = subString(subdir)
x = 'student-' + subdir
if len(teacher_layer_num)==0:
CopyFile(os.path.join(args.teacher_model,subdir),os.path.join(args.student_model,x))
else:
teacher_layer_num = int(teacher_layer_num[0])
if teacher_layer_num in args.layer_list:
student_layer_num = args.layer_list.index(teacher_layer_num)
rule = r'bert-encoder-layer_(.*?)-'
x = re.sub(rule,'bert-encoder-layer_{}-'.format(str(student_layer_num)),x)
CopyFile(os.path.join(args.teacher_model, subdir), os.path.join(args.student_model, x))

+ 8  - 0    model_compress/model_compress/distil/examples/bert-pkd/run_bert-pkd_generate_student_mdoel.sh

@@ -0,0 +1,8 @@
#
FT_BERT_BASE_DIR="./models/finetuned_teacher/SST-2_epoch-3_lr-2e-5_wd-0.0001/snapshot_last_snapshot"
TMP_STUDENT_DIR='./models/student_model/bert_pkd_3/SST-2'
LAYER_LIST="2,6,10"
python3 bert-pkd_generate_student_model.py \
--teacher_model=${FT_BERT_BASE_DIR} \
--student_model=${TMP_STUDENT_DIR} \
--layer_list=${LAYER_LIST}

+ 491  - 0    model_compress/model_compress/distil/examples/bert-pkd/task_student_bert-pkd.py

@@ -0,0 +1,491 @@
"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import os
import math
import numpy as np
import sys

curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "./src")))

import oneflow as flow
from classifier import GlueBERT

from util import Snapshot, Summary, InitNodes, Metric, CreateOptimizer, GetFunctionConfig, getdirsize, remove_optimizer_params, remove_teacher_params

import config as configs
from sklearn.metrics import accuracy_score, matthews_corrcoef, precision_score, recall_score, f1_score
import argparse
import shutil
import tempfile
from knowledge_distill_util import BertForSequenceClassification, BertStudentForSequenceClassification, soft_cross_entropy, mseloss, layer_distill, att_distill, pred_distill
import time
import json

def str2bool(v):
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Unsupported value encountered.')

parser = configs.get_parser()
parser.add_argument("--task_name", type=str, default='CoLA')
parser.add_argument("--teacher_model", default=None, type=str, help="The teacher model dir.")
parser.add_argument("--student_model", default=None, type=str, help="The student model dir.")
parser.add_argument("--total_model", default=None, type=str, help="Temporary dir holding the combined teacher and student params.")

parser.add_argument('--num_epochs', type=int, default=3, help='number of epochs')
parser.add_argument("--train_data_dir", type=str, default=None)
parser.add_argument("--train_data_prefix", type=str, default='train.of_record-')
parser.add_argument("--train_example_num", type=int, default=88614,
help="example number in dataset")
parser.add_argument("--batch_size_per_device", type=int, default=32)
parser.add_argument("--train_data_part_num", type=int, default=1,
help="data part number in dataset")
parser.add_argument("--eval_data_dir", type=str, default=None)
parser.add_argument("--eval_data_prefix", type=str, default='eval.of_record-')
parser.add_argument("--eval_example_num", type=int, default=10833,
help="example number in dataset")
parser.add_argument("--eval_batch_size_per_device", type=int, default=64)
parser.add_argument("--eval_data_part_num", type=int, default=1,
help="data part number in dataset")
parser.add_argument("--result_dir", type=str, default="", help="the save directory of results")

parser.add_argument("--student_num_hidden_layers", type=int, default=24)
parser.add_argument("--student_num_attention_heads", type=int, default=16)
parser.add_argument("--student_max_position_embeddings", type=int, default=512)
parser.add_argument("--student_type_vocab_size", type=int, default=2)
parser.add_argument("--student_vocab_size", type=int, default=30522)
parser.add_argument("--student_attention_probs_dropout_prob", type=float, default=0.1)
parser.add_argument("--student_hidden_dropout_prob", type=float, default=0.1)
parser.add_argument("--student_hidden_size_per_head", type=int, default=64)
parser.add_argument("--student_hidden_size", type=int, default=768)

parser.add_argument("--teacher_num_hidden_layers", type=int, default=24)
parser.add_argument("--teacher_num_attention_heads", type=int, default=16)
parser.add_argument("--teacher_max_position_embeddings", type=int, default=512)
parser.add_argument("--teacher_type_vocab_size", type=int, default=2)
parser.add_argument("--teacher_vocab_size", type=int, default=30522)
parser.add_argument("--teacher_attention_probs_dropout_prob", type=float, default=0.1)
parser.add_argument("--teacher_hidden_dropout_prob", type=float, default=0.1)
parser.add_argument("--teacher_hidden_size_per_head", type=int, default=64)
parser.add_argument("--teacher_hidden_size", type=int, default=768)

parser.add_argument("--kd_alpha", type=float, default=0.5, help='the usual Distillation loss {0.2,0.5,0.7}')
parser.add_argument("--kd_beta", type=float, default=10, help='the proposed loss {10,100,500,1000}')
parser.add_argument('--from_scratch', type=str2bool, nargs='?', const=False, help='train the student model from scratch or initialize from teacher layers')

parser.add_argument('--temperature', type=float, default=1.)
parser.add_argument('--aug_train', type=str2bool, nargs='?', const=False, help='using augmented training set?')

parser.add_argument('--serve_for_online', type=str2bool, nargs='?', const=False, help='if serving online, the teacher params and optimizer params are deleted from model_save_dir after training')

args = parser.parse_args()

task_name = args.task_name.lower()

if args.aug_train:
args.train_data_dir = args.train_data_dir.replace('train','train_aug')

batch_size = args.num_nodes * args.gpu_num_per_node * args.batch_size_per_device
eval_batch_size = args.num_nodes * args.gpu_num_per_node * args.eval_batch_size_per_device

epoch_size = math.ceil(args.train_example_num / batch_size)
num_eval_steps = math.ceil(args.eval_example_num / eval_batch_size)
args.iter_num = epoch_size * args.num_epochs
configs.print_args(args)

glue_output_modes = {
"cola": "classification",
"mnli": "classification",
"mnli-mm": "classification",
"mrpc": "classification",
"sst-2": "classification",
"sts-b": "regression",
"qqp": "classification",
"qnli": "classification",
"rte": "classification",
"wnli": "classification",
}

acc_tasks = ["mnli", "mrpc", "sst-2", "qqp", "qnli", "rte"]
corr_tasks = ["sts-b"]
mcc_tasks = ["cola"]

output_mode = glue_output_modes[args.task_name.lower()]

def BertDecoder(
data_dir, batch_size, data_part_num, seq_length, part_name_prefix, shuffle=True
):
with flow.scope.placement("cpu", "0:0"):
ofrecord = flow.data.ofrecord_reader(data_dir,
batch_size=batch_size,
data_part_num=data_part_num,
part_name_prefix=part_name_prefix,
random_shuffle=shuffle,
shuffle_after_epoch=shuffle)
blob_confs = {}
def _blob_conf(name, shape, dtype=flow.int32):
blob_confs[name] = flow.data.OFRecordRawDecoder(ofrecord, name, shape=shape, dtype=dtype)

_blob_conf("input_ids", [seq_length])
_blob_conf("input_mask", [seq_length])
_blob_conf("segment_ids", [seq_length])
_blob_conf("label_ids", [1])
_blob_conf("is_real_example", [1])

return blob_confs

def get_tensor_data(
batch_size,
data_part_num,
data_dir,
part_name_prefix,
shuffle=True
):
decoders = BertDecoder(
data_dir, batch_size, data_part_num, args.seq_length, part_name_prefix, shuffle=shuffle
)
return decoders

def BuildBert(
batch_size,
data_part_num,
data_dir,
part_name_prefix,
shuffle=True
):
hidden_size = args.hidden_size ##64 * args.num_attention_heads # , H = 64, size per head
args.hidden_size_per_head = hidden_size / args.num_attention_heads
intermediate_size = hidden_size * 4
# intermediate_size = 1200

decoders = BertDecoder(
data_dir, batch_size, data_part_num, args.seq_length, part_name_prefix, shuffle=shuffle
)
loss, logits = GlueBERT(
decoders['input_ids'],
decoders['input_mask'],
decoders['segment_ids'],
decoders['label_ids'],
args.vocab_size,
seq_length=args.seq_length,
hidden_size=hidden_size,
num_hidden_layers=args.num_hidden_layers,
num_attention_heads=args.num_attention_heads,
intermediate_size=intermediate_size,
hidden_act="gelu",
hidden_dropout_prob=args.hidden_dropout_prob,
attention_probs_dropout_prob=args.attention_probs_dropout_prob,
max_position_embeddings=args.max_position_embeddings,
type_vocab_size=args.type_vocab_size,
initializer_range=0.02,
)
return loss, logits, decoders['label_ids']

def student_model(input_ids, input_mask, segment_ids,is_train=True):
# hidden_size = 64 * args.student_num_attention_heads # , H = 64, size per head
hidden_size = args.student_hidden_size ##64 * args.num_attention_heads # , H = 64, size per head
args.student_hidden_size_per_head = hidden_size / args.student_num_attention_heads
intermediate_size = hidden_size * 4
# intermediate_size = 1200

logits, reps, atts = BertStudentForSequenceClassification(
input_ids_blob=input_ids,
input_mask_blob=input_mask,
token_type_ids_blob=segment_ids,
label_blob=None,
vocab_size=args.student_vocab_size,
seq_length=args.seq_length,
hidden_size=hidden_size,
num_hidden_layers=args.student_num_hidden_layers,
num_attention_heads=args.student_num_attention_heads,
intermediate_size=intermediate_size,
hidden_act="gelu",
hidden_dropout_prob=args.student_hidden_dropout_prob,
attention_probs_dropout_prob=args.student_attention_probs_dropout_prob,
max_position_embeddings=args.student_max_position_embeddings,
type_vocab_size=args.student_type_vocab_size,
initializer_range=0.02,
is_student=True,
fit_size=args.teacher_hidden_size,
is_train=is_train
)
return logits, reps, atts


def teacher_model(input_ids,input_mask,segment_ids,is_train):
# hidden_size = 64 * args.teacher_num_attention_heads # , H = 64, size per head
teacher_hidden_size = args.teacher_hidden_size ##64 * args.num_attention_heads # , H = 64, size per head
args.teacher_hidden_size_per_head = teacher_hidden_size / args.teacher_num_attention_heads
intermediate_size = teacher_hidden_size * 4
logits, reps, atts = BertForSequenceClassification(
input_ids_blob=input_ids,
input_mask_blob=input_mask,
token_type_ids_blob=segment_ids,
label_blob=None,
vocab_size=args.vocab_size,
seq_length=args.seq_length,
hidden_size=teacher_hidden_size,
num_hidden_layers=args.teacher_num_hidden_layers,
num_attention_heads=args.teacher_num_attention_heads,
intermediate_size=intermediate_size,
hidden_act="gelu",
hidden_dropout_prob=args.teacher_hidden_dropout_prob,
attention_probs_dropout_prob=args.teacher_attention_probs_dropout_prob,
max_position_embeddings=args.teacher_max_position_embeddings,
type_vocab_size=args.teacher_type_vocab_size,
initializer_range=0.02,
is_student=False,
is_train=is_train
)
return logits, reps, atts

@flow.global_function(type='train', function_config=GetFunctionConfig(args))
def DistilJob():
train_dataset = get_tensor_data(
batch_size,
args.train_data_part_num,
args.train_data_dir,
args.train_data_prefix,
)
student_logits, student_reps, _ = student_model(train_dataset['input_ids'], train_dataset['input_mask'], train_dataset['segment_ids'],is_train=True)

teacher_logits, teacher_reps, _ = teacher_model(train_dataset['input_ids'], train_dataset['input_mask'], train_dataset['segment_ids'],is_train=False)

pt_loss = layer_distill(args, student_reps,teacher_reps)
if output_mode == "classification":
ds_loss = pred_distill(args, student_logits, teacher_logits)
elif output_mode == "regression":
"""
todo
loss_mse = MSELoss()
cls_loss = loss_mse(student_logits.view(-1), label_ids.view(-1))
"""
pass

loss_ce = flow.nn.sparse_softmax_cross_entropy_with_logits(
logits=student_logits, labels=train_dataset['label_ids']
)
loss_pkd = loss_ce * (1-args.kd_alpha) + args.kd_alpha * ds_loss + args.kd_beta * pt_loss
flow.losses.add_loss(loss_pkd)

opt = CreateOptimizer(args)
opt.minimize(loss_pkd)

return {'loss': loss_pkd}

#
@flow.global_function(type='predict', function_config=GetFunctionConfig(args))
def StudentBertGlueEvalTrainJob():
train_dataset = get_tensor_data(
batch_size,
args.train_data_part_num,
args.train_data_dir,
args.train_data_prefix,
shuffle=False
)
student_logits, student_reps, student_atts = student_model(train_dataset['input_ids'], train_dataset['input_mask'], train_dataset['segment_ids'],is_train=False)

return student_logits, train_dataset['label_ids']

@flow.global_function(type='predict', function_config=GetFunctionConfig(args))
def StudentBertGlueEvalValJob():
dev_dataset = get_tensor_data(
eval_batch_size,
args.eval_data_part_num,
args.eval_data_dir,
args.eval_data_prefix,
shuffle=False
)
student_logits, student_reps, student_atts = student_model(dev_dataset['input_ids'], dev_dataset['input_mask'], dev_dataset['segment_ids'],is_train=False)

return student_logits, dev_dataset['label_ids']

def run_eval_job(eval_job_func, num_steps, desc='train'):
labels = []
predictions = []
start_time = time.time()
for index in range(num_steps):
logits, label = eval_job_func().get()
predictions.extend(list(logits.numpy().argmax(axis=1)))
labels.extend(list(label))
end_time = time.time()
cost_time = end_time - start_time
print('cost time: {} s'.format(cost_time))

model_size = getdirsize(args.model_save_dir)
print('model_size: %d Mbytes' % (model_size / 1024 / 1024)) # Mbytes

accuracy = accuracy_score(labels, predictions)
mcc = matthews_corrcoef(labels, predictions)
precision = precision_score(labels, predictions)
recall = recall_score(labels, predictions)
f_1 = f1_score(labels, predictions)
save_dict = {"accuracy":"%.2f" % accuracy,
"MCC":"%.2f" % mcc,
"precision": "%.2f" % precision,
"recall": "%.2f" % recall,
"f_1": "%.2f" % f_1,
"modelSize":"%d" % (model_size/1024/1024),
"reasoningTime":"%.2f" % (args.eval_example_num / cost_time)} # sample/second

if args.result_dir == "":
args.result_dir = args.model_save_dir
if not os.path.exists(args.result_dir):
os.makedirs(args.result_dir)
with open(os.path.join(args.result_dir, 'results_{}.json'.format(desc)), "w") as f:
json.dump(save_dict, f)

def metric_fn(predictions, labels):
return {
"accuracy": accuracy,
"matthews_corrcoef": mcc,
"precision": precision,
"recall": recall,
"f1": f_1,
}

metric_dict = metric_fn(predictions, labels)
print(desc, ', '.join('{}: {:.3f}'.format(k, v) for k, v in metric_dict.items()))
return metric_dict

def CopyFile(filepath, newPath):
fileNames = os.listdir(filepath)
for file in fileNames:
newDir = os.path.join(filepath,file)
if os.path.isfile(newDir):
newFile = os.path.join(newPath, file)
shutil.copyfile(newDir, newFile)
else:
if not os.path.exists(os.path.join(newPath, file)):
os.makedirs(os.path.join(newPath, file))
CopyFile(newDir,os.path.join(newPath, file))

def main():
flow.config.gpu_device_num(args.gpu_num_per_node)
flow.env.log_dir(args.log_dir)

InitNodes(args)

check_point = flow.train.CheckPoint()

summary = Summary(args.log_dir, args)
if not os.path.exists(args.model_save_dir):
os.makedirs(args.model_save_dir)

if args.do_train:
print('Loading model...')
if args.from_scratch:
print('Train the student model from scratch...')
check_point.load(args.teacher_model)
else:
print('Combining two models into one dir')
if not os.path.exists('./tmp'):
os.makedirs('./tmp')
args.total_model = tempfile.mkdtemp(dir='./tmp')
CopyFile(args.student_model, args.total_model)
CopyFile(args.teacher_model, args.total_model)
print('Initialize the student model from the teacher model...')
check_point.load(args.total_model)

print('Start training...')
global_step = 0
best_dev_acc = 0.0
for epoch in range(args.num_epochs):
metric = Metric(desc='finetune', print_steps=args.loss_print_every_n_iter, summary=summary,
batch_size=batch_size, keys=['loss'])

for step in range(epoch_size):
DistilJob().async_get(metric.metric_cb(step, epoch=epoch))
global_step += 1
# if (global_step + 1) % args.model_save_every_n_iter == 0:
# if not os.path.exists(args.model_save_dir):
# os.makedirs(args.model_save_dir)
# snapshot_save_path = os.path.join(
# args.model_save_dir, "snapshot_%d" % (global_step + 1)
# )
# print("Saving model to {}.".format(snapshot_save_path))
# check_point.save(snapshot_save_path)

print('EvalTrainJob...')
run_eval_job(StudentBertGlueEvalTrainJob, epoch_size, desc='train')
print('EvalValJob...')
result = run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')

save_model = False
if task_name in acc_tasks and result['accuracy'] > best_dev_acc:
best_dev_acc = result['accuracy']
save_model = True

# if task_name in corr_tasks and result['corr'] > best_dev_acc:
# best_dev_acc = result['corr']
# save_model = True

if task_name in mcc_tasks and result['matthews_corrcoef'] > best_dev_acc:
best_dev_acc = result['matthews_corrcoef']
save_model = True
print('Best result:', result)

if save_model:
if os.path.exists(args.model_save_dir):
import shutil
shutil.rmtree(args.model_save_dir)
if not os.path.exists(args.model_save_dir):
os.makedirs(args.model_save_dir)
snapshot_save_path = os.path.join(args.model_save_dir)
print("Saving best model to {}".format(snapshot_save_path))
check_point.save(snapshot_save_path)
flow.sync_default_session()

if args.save_last_snapshot:
snapshot_save_path = args.model_save_dir
if os.path.exists(args.model_save_dir):
import shutil
shutil.rmtree(args.model_save_dir)
print("Saving model to {}".format(snapshot_save_path))
check_point.save(snapshot_save_path)
flow.sync_default_session()

if not args.from_scratch:
if global_step >= 100:
# remove tmp total models
print('Removing the tmp models...')
import shutil
shutil.rmtree(args.total_model)

if args.serve_for_online:
        print('Deleting the teacher params and the optimizer params from model_save_dir...')
remove_teacher_params(args.model_save_dir)
print('Removing the tmp models...')
# shutil.rmtree(args.total_model)

if args.do_eval:
print('Loading model...')
print(args.model_save_dir)

if not args.do_train:
check_point.load(args.model_save_dir)
print('Evaluation...')

run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')
# if args.save_last_snapshot:
# snapshot.save("last_snapshot")


if __name__ == "__main__":
main()

+ 61  - 0    model_compress/model_compress/distil/examples/distilled-bilstm/README.md

@@ -0,0 +1,61 @@
# Distilled-BiLSTM
An implementation of the paper ["Distilling task-specific knowledge from bert into simple neural networks"](https://arxiv.org/abs/1903.12136).

Distilled-BiLSTM uses a fine-tuned BERT as the teacher model and a simple BiLSTM network as the student model.
The distillation objective is the KD loss, i.e. only soft labels are used, distilling the knowledge in BERT into the LSTM.
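
For reference, the sketch below illustrates this kind of objective with the logit-matching (MSE) formulation from the paper, optionally mixed with the hard-label cross-entropy (set `alpha=1.0` to use only the soft targets, as described above). It is a plain NumPy illustration only; `alpha` is a name chosen for the sketch, and the runnable OneFlow version is `task_student_kd_lstm.py`.

```python
import numpy as np

def softmax(x):
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

def distilled_bilstm_loss(student_logits, teacher_logits, labels, alpha=0.5):
    """Logit-matching distillation: MSE between the BiLSTM student's logits and
    the BERT teacher's logits, mixed with cross-entropy on the true labels."""
    mse = np.mean((student_logits - teacher_logits) ** 2)
    log_p = np.log(softmax(student_logits) + 1e-12)
    ce = -log_p[np.arange(len(labels)), labels].mean()
    return alpha * mse + (1.0 - alpha) * ce

# toy usage: teacher vs. student logits for 2 sentences, 2 classes
teacher = np.array([[3.1, -1.2], [-0.4, 2.2]])
student = np.array([[1.0, -0.3], [0.2, 0.9]])
labels = np.array([0, 1])
print(distilled_bilstm_loss(student, teacher, labels))
```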

## 1. Dependencies
- Python 3.6
- oneflow-cu101 0.1.10

The full environment can be installed with:
```bash
conda create -n tinybert python=3.6
```

```bash
python3 -m pip install --find-links https://oneflow-inc.github.io/nightly oneflow_cu101 --user
```
> Note: the commands below assume `model_compress/distil` as the working directory.

## 2. Getting the data
See [here](../../README.md#22-数据获取) for how to get the data.

## 3. Fine-tuning the teacher model
See [here](../../README.md#23-微调教师模型) for how to fine-tune the teacher model.
## 4. Distilling into the student model
### 4.1 Training
Run the following script to distill the teacher model into the student model:
- DATA_ROOT: root path of the GLUE datasets
- dataset: task name
- FT_BERT_BASE_DIR: path of the teacher model fine-tuned on the specific task
- STUDENT_DIR: directory to save the student model
- RESULT_DIR: directory for the evaluation-result JSON file (if RESULT_DIR="", results_eval.json is saved to the model save directory by default)
- SERVE_FOR_ONLINE: whether the model will be deployed online (default SERVE_FOR_ONLINE='False'; if SERVE_FOR_ONLINE='True', variables irrelevant for serving, such as teacher-model parameters and optimizer parameters, are removed from the model save directory)

> The maximum sequence length is 32 and the vocabulary size is 10000

```bash
bash run_train_student_distilled_lstm.sh
```

### 4.2 Evaluation
Download link for the distilled student model (SST-2 dataset):

Link: https://pan.baidu.com/s/1M4XzB2DnLikglxVFvhnYpw  extraction code: hqhj

Run the following script to evaluate:
- DATA_ROOT: root path of the GLUE datasets
- dataset: task name
- STUDENT_DIR: directory of the saved student model
- RESULT_DIR: directory for the evaluation-result JSON file (if RESULT_DIR="", results_eval.json is saved to the model save directory by default)

```bash
bash run_eval_student_distilled_lstm.sh
```
### 4.3 Results
On the SST-2 dev set:
- Accuracy: teacher 92.2% -> student 82.9%
- Model size: teacher 110M -> student 15.3M (↓7.5x)
- Inference time: teacher 4.04s -> student 0.83s (↓4.8x)

+ 118  - 0    model_compress/model_compress/distil/examples/distilled-bilstm/run_train_lstm.sh

@@ -0,0 +1,118 @@
# Copyright 2020 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

nvidia-smi

dataset=SST-2

# ofrecord dataset dir
DATA_ROOT=./data/glue_ofrecord

# choose dataset: `CoLA`, `MRPC`, `SST-2`, `QQP`, `MNLI`, `WNLI`, `RTE`

if [ $dataset = "CoLA" ]; then
  train_example_num=8551
  eval_example_num=1043
  test_example_num=1063
  learning_rate=2e-5
  wd=0.0001
  epoch=70
elif [ $dataset = "MRPC" ]; then
  train_example_num=3668
  eval_example_num=408
  test_example_num=1725
  learning_rate=5e-6
  epoch=20
  wd=0.001
elif [ $dataset = "SST-2" ]; then
  train_example_num=67349
  eval_example_num=872
  test_example_num=1821
  learning_rate=3e-5
  epoch=4
  wd=0.0001
elif [ $dataset = "QQP" ]; then
  train_example_num=363849
  eval_example_num=40430
  test_example_num=0
  learning_rate=2e-5
  epoch=5
  wd=0.0001
elif [ $dataset = "MNLI" ]; then
  train_example_num=392702
  eval_example_num=9815
  test_example_num=0
  learning_rate=2e-5
  wd=0.0001
elif [ $dataset = "WNLI" ]; then
  train_example_num=635
  eval_example_num=71
  test_example_num=0
  learning_rate=2e-5
  wd=0.0001
elif [ $dataset = "RTE" ]; then
  train_example_num=2490
  eval_example_num=277
  test_example_num=0
  learning_rate=2e-5
  wd=0.0001
else
  echo "dataset must be a GLUE task such as 'CoLA','MRPC','SST-2','QQP','MNLI','WNLI','RTE'"
  exit
fi

STUDENT_DIR="./models/student_model/${dataset}/lstm_32_epoch-${epoch}_lr-${learning_rate}_wd-${wd}"

train_data_dir=$DATA_ROOT/${dataset}/train
train_data_dir_lstm=$DATA_ROOT/${dataset}_lstm_32/train

eval_data_dir=$DATA_ROOT/${dataset}/eval
eval_data_dir_lstm=$DATA_ROOT/${dataset}_lstm_32/eval

#EPOCH=10
#learning_rate=2e-5 # 3e-5
GPU=0
CUDA_VISIBLE_DEVICES=$GPU python3 task_lstm.py \
--do_train='True' \
--do_eval='True' \
  --model=Glue_${dataset} \
  --task_name=${dataset} \
--gpu_num_per_node=1 \
--num_epochs=${epoch} \
--train_data_dir=$train_data_dir \
--train_data_dir_lstm=${train_data_dir_lstm} \
--train_example_num=$train_example_num \
--eval_data_dir=$eval_data_dir \
--eval_data_dir_lstm=$eval_data_dir_lstm \
--eval_example_num=$eval_example_num \
--batch_size_per_device=32 \
--eval_batch_size_per_device=32 \
--loss_print_every_n_iter 1 \
--log_dir=./log \
--model_save_dir=${STUDENT_DIR} \
--seq_length=32 \
--student_num_hidden_layers=4 \
--student_num_attention_heads=12 \
--student_max_position_embeddings=512 \
--student_type_vocab_size=2 \
--student_vocab_size=10002 \
--student_attention_probs_dropout_prob=0.1 \
--student_hidden_dropout_prob=0.1 \
--student_hidden_size_per_head=26 \
--student_hidden_size=300 \
--learning_rate=$learning_rate \
--model_save_every_n_iter=50000 \
--weight_decay_rate=$wd

+ 338  - 0    model_compress/model_compress/distil/examples/distilled-bilstm/task_lstm.py

@@ -0,0 +1,338 @@
"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import os
import math
import numpy as np
import sys

curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "./src")))

import oneflow as flow
import oneflow.typing as tp

from util import Snapshot, Summary, InitNodes, Metric, CreateOptimizer, GetFunctionConfig

import config as configs
from sklearn.metrics import accuracy_score, matthews_corrcoef, precision_score, recall_score, f1_score
import argparse
import shutil
import tempfile
from knowledge_distill_util import LSTMStudentForSequenceClassification, BertForSequenceClassification, BertStudentForSequenceClassification, soft_cross_entropy, mseloss, layer_distill, att_distill, pred_distill
import time
import json

def str2bool(v):
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Unsupported value encountered.')

parser = configs.get_parser()
parser.add_argument("--task_name", type=str, default='CoLA')
parser.add_argument("--student_model", default=None, type=str, help="The student model dir.")
parser.add_argument("--total_model", default=None, type=str, help="Temporary dir holding the combined teacher and student params.")

parser.add_argument('--num_epochs', type=int, default=3, help='number of epochs')
parser.add_argument("--train_data_dir", type=str, default=None)
parser.add_argument("--train_data_dir_lstm", type=str, default=None)
parser.add_argument("--train_data_prefix", type=str, default='train.of_record-')
parser.add_argument("--train_example_num", type=int, default=88614,
help="example number in dataset")
parser.add_argument("--batch_size_per_device", type=int, default=32)
parser.add_argument("--train_data_part_num", type=int, default=1,
help="data part number in dataset")
parser.add_argument("--eval_data_dir", type=str, default=None)
parser.add_argument("--eval_data_dir_lstm", type=str, default=None)
parser.add_argument("--eval_data_prefix", type=str, default='eval.of_record-')
parser.add_argument("--eval_example_num", type=int, default=10833,
help="example number in dataset")
parser.add_argument("--eval_batch_size_per_device", type=int, default=64)
parser.add_argument("--eval_data_part_num", type=int, default=1,
help="data part number in dataset")

#
parser.add_argument("--student_num_hidden_layers", type=int, default=24)
parser.add_argument("--student_num_attention_heads", type=int, default=16)
parser.add_argument("--student_max_position_embeddings", type=int, default=512)
parser.add_argument("--student_type_vocab_size", type=int, default=2)
parser.add_argument("--student_vocab_size", type=int, default=30522)
parser.add_argument("--student_attention_probs_dropout_prob", type=float, default=0.1)
parser.add_argument("--student_hidden_dropout_prob", type=float, default=0.1)
parser.add_argument("--student_hidden_size_per_head", type=int, default=64)
parser.add_argument("--student_hidden_size", type=int, default=768)

parser.add_argument("--kd_alpha", type=float, default=0.1)

parser.add_argument('--temperature', type=float, default=1.)
parser.add_argument('--aug_train', type=str2bool, nargs='?', const=False, help='using augmented training set?')

args = parser.parse_args()

task_name = args.task_name.lower()

if args.aug_train:
args.train_data_dir = args.train_data_dir.replace('train','train_aug')

batch_size = args.num_nodes * args.gpu_num_per_node * args.batch_size_per_device
eval_batch_size = args.num_nodes * args.gpu_num_per_node * args.eval_batch_size_per_device

epoch_size = math.ceil(args.train_example_num / batch_size)
num_eval_steps = math.ceil(args.eval_example_num / eval_batch_size)
args.iter_num = epoch_size * args.num_epochs
configs.print_args(args)

glue_output_modes = {
"cola": "classification",
"mnli": "classification",
"mnli-mm": "classification",
"mrpc": "classification",
"sst-2": "classification",
"sts-b": "regression",
"qqp": "classification",
"qnli": "classification",
"rte": "classification",
"wnli": "classification",
}

acc_tasks = ["mnli", "mrpc", "sst-2", "qqp", "qnli", "rte"]
corr_tasks = ["sts-b"]
mcc_tasks = ["cola"]

output_mode = glue_output_modes[args.task_name.lower()]

def BertDecoder(
data_dir, batch_size, data_part_num, seq_length, part_name_prefix, shuffle=True
):
with flow.scope.placement("cpu", "0:0"):
ofrecord = flow.data.ofrecord_reader(data_dir,
batch_size=batch_size,
data_part_num=data_part_num,
part_name_prefix=part_name_prefix,
random_shuffle=shuffle,
shuffle_after_epoch=shuffle)
blob_confs = {}
def _blob_conf(name, shape, dtype=flow.int32):
blob_confs[name] = flow.data.OFRecordRawDecoder(ofrecord, name, shape=shape, dtype=dtype)

_blob_conf("input_ids", [seq_length])
_blob_conf("input_mask", [seq_length])
_blob_conf("segment_ids", [seq_length])
_blob_conf("label_ids", [1])
_blob_conf("is_real_example", [1])

return blob_confs

def get_tensor_data(
batch_size,
data_part_num,
data_dir,
part_name_prefix,
shuffle=True
):
decoders = BertDecoder(
data_dir, batch_size, data_part_num, args.seq_length, part_name_prefix, shuffle=shuffle
)
return decoders

def student_model(input_ids, input_mask, segment_ids,is_train=True):
hidden_size = args.student_hidden_size ##64 * args.num_attention_heads # , H = 64, size per head
# args.student_hidden_size_per_head = hidden_size / args.student_num_attention_heads
# print('input_ids:',input_ids.shape)

logits = LSTMStudentForSequenceClassification(
input_ids_blob=input_ids,
input_mask_blob=input_mask,
token_type_ids_blob=segment_ids,
label_blob=None,
vocab_size=args.student_vocab_size,
seq_length=args.seq_length,
hidden_size=hidden_size,
intermediate_size=400,
num_hidden_layers=args.student_num_hidden_layers,
is_student=True,
is_train=is_train
)
return logits

def watch_handler(y: tp.Numpy):
print("out:",y)

def watch_diff_handler(blob: tp.Numpy):
print("watch_diff_handler:", blob, blob.shape, blob.dtype)

@flow.global_function(type='train', function_config=GetFunctionConfig(args))
def DistilJob():
train_dataset_lstm = get_tensor_data(
batch_size,
args.train_data_part_num,
args.train_data_dir_lstm,
args.train_data_prefix,
True
)
student_logits = student_model(train_dataset_lstm['input_ids'], train_dataset_lstm['input_mask'], train_dataset_lstm['segment_ids'],is_train=True)
# flow.watch(student_logits, watch_handler)

loss_ce = flow.nn.sparse_softmax_cross_entropy_with_logits(
logits=student_logits, labels=train_dataset_lstm['label_ids']
)

loss = loss_ce

flow.losses.add_loss(loss)
opt = CreateOptimizer(args)
opt.minimize(loss)

return {'loss': loss}

#
@flow.global_function(type='predict', function_config=GetFunctionConfig(args))
def StudentBertGlueEvalTrainJob():
train_dataset_lstm = get_tensor_data(
batch_size,
args.train_data_part_num,
args.train_data_dir_lstm,
args.train_data_prefix,
shuffle=False
)
student_logits = student_model(train_dataset_lstm['input_ids'], train_dataset_lstm['input_mask'], train_dataset_lstm['segment_ids'],is_train=False)

return student_logits, train_dataset_lstm['label_ids']

@flow.global_function(type='predict', function_config=GetFunctionConfig(args))
def StudentBertGlueEvalValJob():
dev_dataset = get_tensor_data(
eval_batch_size,
args.eval_data_part_num,
args.eval_data_dir_lstm,
args.eval_data_prefix,
shuffle=False
)
student_logits= student_model(dev_dataset['input_ids'], dev_dataset['input_mask'], dev_dataset['segment_ids'],is_train=False)

return student_logits, dev_dataset['label_ids']

#
def run_eval_job(eval_job_func, num_steps, desc='train'):
labels = []
predictions = []
start_time = time.time()
for index in range(num_steps):
logits, label = eval_job_func().get()
predictions.extend(list(logits.numpy().argmax(axis=1)))
labels.extend(list(label))
end_time = time.time()
print('cost time: {} s'.format(end_time-start_time))

def metric_fn(predictions, labels):
return {
"accuracy": accuracy_score(labels, predictions),
"matthews_corrcoef": matthews_corrcoef(labels, predictions),
"precision": precision_score(labels, predictions),
"recall": recall_score(labels, predictions),
"f1": f1_score(labels, predictions),
}

metric_dict = metric_fn(predictions, labels)
print(desc, ', '.join('{}: {:.3f}'.format(k, v) for k, v in metric_dict.items()))
return metric_dict

def getdirsize(dir):
size = 0
for root, dirs, files in os.walk(dir):
size += sum([os.path.getsize(os.path.join(root, name)) for name in files])
return size

def main():
flow.config.enable_debug_mode(True)
flow.config.gpu_device_num(args.gpu_num_per_node)
flow.env.log_dir(args.log_dir)

InitNodes(args)

check_point = flow.train.CheckPoint()
check_point.init()

summary = Summary(args.log_dir, args)
if not os.path.exists(args.model_save_dir):
os.makedirs(args.model_save_dir)

if args.do_train:
print('Start training...')
global_step = 0
best_dev_acc = 0.0
print('epoch_size:',epoch_size)
print('args.iter_num:',args.iter_num)
for epoch in range(args.num_epochs):
metric = Metric(desc='finetune', print_steps=args.loss_print_every_n_iter, summary=summary,
batch_size=batch_size, keys=['loss'])

for step in range(epoch_size):
loss = DistilJob().get()
if step % 10 == 0:
print('step/epoch_size:{}/{} epoch:{}'.format(step,epoch_size,epoch))
print('loss:',loss['loss'].mean())
# global_step+=1
# DistilJob().async_get(metric.metric_cb(step, epoch=epoch))


print('EvalTrainJob...')
run_eval_job(StudentBertGlueEvalTrainJob, epoch_size, desc='train')
print('EvalValJob...')
result = run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')

save_model = False
if task_name in acc_tasks and result['accuracy'] > best_dev_acc:
best_dev_acc = result['accuracy']
save_model = True

# if task_name in corr_tasks and result['corr'] > best_dev_acc:
# best_dev_acc = result['corr']
# save_model = True

if task_name in mcc_tasks and result['matthews_corrcoef'] > best_dev_acc:
best_dev_acc = result['matthews_corrcoef']
save_model = True
print('Best result:', result)

if save_model:
if not os.path.exists(args.model_save_dir):
os.makedirs(args.model_save_dir)
snapshot_save_path = os.path.join(args.model_save_dir)
print("Saving best model to {}".format(snapshot_save_path))
check_point.save(snapshot_save_path)

if args.save_last_snapshot:
snapshot_save_path = args.model_save_dir
print("Saving model to {}".format(snapshot_save_path))
check_point.save(snapshot_save_path)

if args.do_eval:
print('Loading model...')
print(args.model_save_dir)

if not args.do_train:
check_point.load(args.model_save_dir)
print('Evaluation...')
run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')
# if args.save_last_snapshot:
# snapshot.save("last_snapshot")



if __name__ == "__main__":
main()

+ 439  - 0    model_compress/model_compress/distil/examples/distilled-bilstm/task_student_kd_lstm.py

@@ -0,0 +1,439 @@
"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import os
import math
import numpy as np
import sys

curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "./src")))

import oneflow as flow

from util import Snapshot, Summary, InitNodes, Metric, CreateOptimizer, GetFunctionConfig, getdirsize, remove_optimizer_params, remove_teacher_params

import config as configs
from sklearn.metrics import accuracy_score, matthews_corrcoef, precision_score, recall_score, f1_score
import argparse
import shutil
import tempfile
from knowledge_distill_util import LSTMStudentForSequenceClassification, BertForSequenceClassification, BertStudentForSequenceClassification, soft_cross_entropy, mseloss, layer_distill, att_distill, pred_distill
import time
import json

def str2bool(v):
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Unsupported value encountered.')

parser = configs.get_parser()
parser.add_argument("--task_name", type=str, default='CoLA')
parser.add_argument("--teacher_model", default=None, type=str, help="The teacher model dir.")
parser.add_argument("--student_model", default=None, type=str, help="The student model dir.")
parser.add_argument("--total_model", default=None, type=str, help="Temporary dir holding the combined teacher and student params.")

parser.add_argument('--num_epochs', type=int, default=3, help='number of epochs')
parser.add_argument("--train_data_dir", type=str, default=None)
parser.add_argument("--train_data_dir_lstm", type=str, default=None)
parser.add_argument("--train_data_prefix", type=str, default='train.of_record-')
parser.add_argument("--train_example_num", type=int, default=88614,
help="example number in dataset")
parser.add_argument("--batch_size_per_device", type=int, default=32)
parser.add_argument("--train_data_part_num", type=int, default=1,
help="data part number in dataset")
parser.add_argument("--eval_data_dir", type=str, default=None)
parser.add_argument("--eval_data_dir_lstm", type=str, default=None)
parser.add_argument("--eval_data_prefix", type=str, default='eval.of_record-')
parser.add_argument("--eval_example_num", type=int, default=10833,
help="example number in dataset")
parser.add_argument("--eval_batch_size_per_device", type=int, default=64)
parser.add_argument("--eval_data_part_num", type=int, default=1,
help="data part number in dataset")
parser.add_argument("--result_dir", type=str, default="", help="the save directory of results")

#
parser.add_argument("--student_num_hidden_layers", type=int, default=24)
parser.add_argument("--student_num_attention_heads", type=int, default=16)
parser.add_argument("--student_max_position_embeddings", type=int, default=512)
parser.add_argument("--student_type_vocab_size", type=int, default=2)
parser.add_argument("--student_vocab_size", type=int, default=30522)
parser.add_argument("--student_attention_probs_dropout_prob", type=float, default=0.1)
parser.add_argument("--student_hidden_dropout_prob", type=float, default=0.1)
parser.add_argument("--student_hidden_size_per_head", type=int, default=64)
parser.add_argument("--student_hidden_size", type=int, default=768)
parser.add_argument("--student_seq_length", type=int, default=32, help="the max seq length for studet")

parser.add_argument("--teacher_num_hidden_layers", type=int, default=24)
parser.add_argument("--teacher_num_attention_heads", type=int, default=16)
parser.add_argument("--teacher_max_position_embeddings", type=int, default=512)
parser.add_argument("--teacher_type_vocab_size", type=int, default=2)
parser.add_argument("--teacher_vocab_size", type=int, default=30522)
parser.add_argument("--teacher_attention_probs_dropout_prob", type=float, default=0.1)
parser.add_argument("--teacher_hidden_dropout_prob", type=float, default=0.1)
parser.add_argument("--teacher_hidden_size_per_head", type=int, default=64)
parser.add_argument("--teacher_hidden_size", type=int, default=768)

parser.add_argument("--kd_alpha", type=float, default=0.1)

parser.add_argument('--temperature', type=float, default=1.)
parser.add_argument('--aug_train', type=str2bool, nargs='?', const=False, help='using augmented training set?')

parser.add_argument('--serve_for_online', type=str2bool, nargs='?', const=False, help='if serving online, the teacher params and optimizer params will be deleted from model_save_dir after training')

args = parser.parse_args()

task_name = args.task_name.lower()

if args.aug_train:
args.train_data_dir = args.train_data_dir.replace('train','train_aug')

batch_size = args.num_nodes * args.gpu_num_per_node * args.batch_size_per_device
eval_batch_size = args.num_nodes * args.gpu_num_per_node * args.eval_batch_size_per_device

epoch_size = math.ceil(args.train_example_num / batch_size)
num_eval_steps = math.ceil(args.eval_example_num / eval_batch_size)
args.iter_num = epoch_size * args.num_epochs
configs.print_args(args)

glue_output_modes = {
"cola": "classification",
"mnli": "classification",
"mnli-mm": "classification",
"mrpc": "classification",
"sst-2": "classification",
"sts-b": "regression",
"qqp": "classification",
"qnli": "classification",
"rte": "classification",
"wnli": "classification",
}

acc_tasks = ["mnli", "mrpc", "sst-2", "qqp", "qnli", "rte"]
corr_tasks = ["sts-b"]
mcc_tasks = ["cola"]

output_mode = glue_output_modes[args.task_name.lower()]

def BertDecoder(
data_dir, batch_size, data_part_num, seq_length, part_name_prefix, shuffle=True
):
with flow.scope.placement("cpu", "0:0"):
ofrecord = flow.data.ofrecord_reader(data_dir,
batch_size=batch_size,
data_part_num=data_part_num,
part_name_prefix=part_name_prefix,
random_shuffle=shuffle,
shuffle_after_epoch=shuffle)
blob_confs = {}
def _blob_conf(name, shape, dtype=flow.int32):
blob_confs[name] = flow.data.OFRecordRawDecoder(ofrecord, name, shape=shape, dtype=dtype)

_blob_conf("input_ids", [seq_length])
_blob_conf("input_mask", [seq_length])
_blob_conf("segment_ids", [seq_length])
_blob_conf("label_ids", [1])
_blob_conf("is_real_example", [1])

return blob_confs

def get_tensor_data(
batch_size,
data_part_num,
data_dir,
part_name_prefix,
seq_length,
shuffle=True
):
decoders = BertDecoder(
data_dir, batch_size, data_part_num, seq_length, part_name_prefix, shuffle=shuffle
)
return decoders

def student_model(input_ids, input_mask, segment_ids,is_train=True):
hidden_size = args.student_hidden_size ##64 * args.num_attention_heads # , H = 64, size per head
# args.student_hidden_size_per_head = hidden_size / args.student_num_attention_heads
# print('input_ids:',input_ids.shape)

logits = LSTMStudentForSequenceClassification(
input_ids_blob=input_ids,
input_mask_blob=input_mask,
token_type_ids_blob=segment_ids,
label_blob=None,
vocab_size=args.student_vocab_size,
seq_length=args.student_seq_length,
hidden_size=hidden_size,
intermediate_size=400,
num_hidden_layers=args.student_num_hidden_layers,
is_student=True,
is_train=is_train
)
return logits


def teacher_model(input_ids,input_mask,segment_ids,is_train):
teacher_hidden_size = args.teacher_hidden_size ##64 * args.num_attention_heads # , H = 64, size per head
args.teacher_hidden_size_per_head = teacher_hidden_size / args.teacher_num_attention_heads
intermediate_size = teacher_hidden_size * 4
logits, reps, atts = BertForSequenceClassification(
input_ids_blob=input_ids,
input_mask_blob=input_mask,
token_type_ids_blob=segment_ids,
label_blob=None,
vocab_size=args.vocab_size,
seq_length=args.seq_length,
hidden_size=teacher_hidden_size,
num_hidden_layers=args.teacher_num_hidden_layers,
num_attention_heads=args.teacher_num_attention_heads,
intermediate_size=intermediate_size,
hidden_act="gelu",
hidden_dropout_prob=args.teacher_hidden_dropout_prob,
attention_probs_dropout_prob=args.teacher_attention_probs_dropout_prob,
max_position_embeddings=args.teacher_max_position_embeddings,
type_vocab_size=args.teacher_type_vocab_size,
initializer_range=0.02,
is_student=False,
is_train=is_train
)
return logits, reps, atts

@flow.global_function(type='train', function_config=GetFunctionConfig(args))
def DistilJob():
train_dataset = get_tensor_data(
batch_size,
args.train_data_part_num,
args.train_data_dir,
args.train_data_prefix,
args.seq_length,
False
)
train_dataset_lstm = get_tensor_data(
batch_size,
args.train_data_part_num,
args.train_data_dir_lstm,
args.train_data_prefix,
args.student_seq_length,
False
)
student_logits = student_model(train_dataset_lstm['input_ids'], train_dataset_lstm['input_mask'], train_dataset_lstm['segment_ids'],is_train=True)
teacher_logits, teacher_reps, teacher_atts = teacher_model(train_dataset['input_ids'], train_dataset['input_mask'], train_dataset['segment_ids'],is_train=False)
if output_mode == "classification":
cls_loss = pred_distill(args, student_logits, teacher_logits)
elif output_mode == "regression":
"""
todo
loss_mse = MSELoss()
cls_loss = loss_mse(student_logits.view(-1), label_ids.view(-1))
"""
pass
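# NOTE (illustrative only): for a regression task such as STS-B, cls_loss would be
# a mean-squared-error term instead, e.g. between student_logits and the regression
# targets as outlined in the todo above. The imported mseloss helper is a plausible
# candidate for this, assuming it computes an MSE between two blobs.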

loss_ce = flow.nn.sparse_softmax_cross_entropy_with_logits(
logits=student_logits, labels=train_dataset_lstm['label_ids']
)

loss = loss_ce * (1-args.kd_alpha) + cls_loss * args.kd_alpha

flow.losses.add_loss(loss)

opt = CreateOptimizer(args)
opt.minimize(loss)

return {'loss': loss}

#
@flow.global_function(type='predict', function_config=GetFunctionConfig(args))
def StudentBertGlueEvalTrainJob():
train_dataset = get_tensor_data(
batch_size,
args.train_data_part_num,
args.train_data_dir_lstm,
args.train_data_prefix,
args.student_seq_length,
shuffle=False
)
student_logits = student_model(train_dataset['input_ids'], train_dataset['input_mask'], train_dataset['segment_ids'],is_train=False)

return student_logits, train_dataset['label_ids']

@flow.global_function(type='predict', function_config=GetFunctionConfig(args))
def StudentBertGlueEvalValJob():
dev_dataset = get_tensor_data(
eval_batch_size,
args.eval_data_part_num,
args.eval_data_dir_lstm,
args.eval_data_prefix,
args.student_seq_length,
shuffle=False
)
student_logits= student_model(dev_dataset['input_ids'], dev_dataset['input_mask'], dev_dataset['segment_ids'],is_train=False)

return student_logits, dev_dataset['label_ids']

#
def run_eval_job(eval_job_func, num_steps, desc='train'):
labels = []
predictions = []
start_time = time.time()
for index in range(num_steps):
logits, label = eval_job_func().get()
predictions.extend(list(logits.numpy().argmax(axis=1)))
labels.extend(list(label))
end_time = time.time()
cost_time = end_time - start_time
print('cost time: {} s'.format(cost_time))

model_size = getdirsize(args.model_save_dir)
print('model_size: %d Mbytes' % (model_size / 1024 / 1024)) # Mbytes

accuracy = accuracy_score(labels, predictions)
mcc = matthews_corrcoef(labels, predictions)
precision = precision_score(labels, predictions)
recall = recall_score(labels, predictions)
f_1 = f1_score(labels, predictions)
save_dict = {"accuracy":"%.2f" % accuracy,
"MCC":"%.2f" % mcc,
"precision": "%.2f" % precision,
"recall": "%.2f" % recall,
"f_1": "%.2f" % f_1,
"modelSize":"%d" % (model_size/1024/1024),
"reasoningTime":"%.2f" % (args.eval_example_num / cost_time)} # sample/second

if args.result_dir == "":
args.result_dir = args.model_save_dir
if not os.path.exists(args.result_dir):
os.makedirs(args.result_dir)
with open(os.path.join(args.result_dir, 'results_{}.json'.format(desc)), "w") as f:
json.dump(save_dict, f)

def metric_fn(predictions, labels):
return {
"accuracy": accuracy,
"matthews_corrcoef": mcc,
"precision": precision,
"recall": recall,
"f1": f_1,
}

metric_dict = metric_fn(predictions, labels)
print(desc, ', '.join('{}: {:.3f}'.format(k, v) for k, v in metric_dict.items()))
return metric_dict

def main():
flow.config.gpu_device_num(args.gpu_num_per_node)
flow.env.log_dir(args.log_dir)

InitNodes(args)

check_point = flow.train.CheckPoint()
check_point.init()

summary = Summary(args.log_dir, args)
if not os.path.exists(args.model_save_dir):
os.makedirs(args.model_save_dir)
import shutil
if args.do_train:
print('Loading model...')
check_point.load(args.teacher_model)

print('Start training...')
global_step = 0
best_dev_acc = 0.0
print('epoch_size:',epoch_size)
print('args.iter_num:',args.iter_num)
for epoch in range(args.num_epochs):
metric = Metric(desc='finetune', print_steps=args.loss_print_every_n_iter, summary=summary,
batch_size=batch_size, keys=['loss'])

for step in range(epoch_size):
loss = DistilJob().get()
if step % 10 == 0:
print('step/epoch_size:{}/{} epoch:{}'.format(step,epoch_size,epoch))
print('loss:',loss['loss'].mean())
# DistilJob().async_get(metric.metric_cb(step, epoch=epoch))

# DistilJob().get(metric.metric_cb(step))

# global_step += 1
# if (global_step + 1) % args.model_save_every_n_iter == 0:
# if (global_step + 1) % 1 == 0:
# print('global_step:',global_step)
# if not os.path.exists(args.model_save_dir):
# os.makedirs(args.model_save_dir)
# snapshot_save_path = os.path.join(
# args.model_save_dir, "snapshot_%d" % (global_step + 1)
# )
# print("Saving model to {}.".format(snapshot_save_path))
# check_point.save(snapshot_save_path)
#
print('EvalTrainJob...')
run_eval_job(StudentBertGlueEvalTrainJob, epoch_size, desc='train')
print('EvalValJob...')
result = run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')

save_model = False
if task_name in acc_tasks and result['accuracy'] > best_dev_acc:
best_dev_acc = result['accuracy']
save_model = True

# if task_name in corr_tasks and result['corr'] > best_dev_acc:
# best_dev_acc = result['corr']
# save_model = True

if task_name in mcc_tasks and result['matthews_corrcoef'] > best_dev_acc:
best_dev_acc = result['matthews_corrcoef']
save_model = True
print('Best result:', result)

if save_model:
if os.path.exists(args.model_save_dir):
import shutil
shutil.rmtree(args.model_save_dir)
if not os.path.exists(args.model_save_dir):
os.makedirs(args.model_save_dir)
snapshot_save_path = os.path.join(args.model_save_dir)
print("Saving best model to {}".format(snapshot_save_path))
check_point.save(snapshot_save_path)
flow.sync_default_session()

if args.save_last_snapshot:
snapshot_save_path = args.model_save_dir
if os.path.exists(args.model_save_dir):
import shutil
shutil.rmtree(args.model_save_dir)
print("Saving model to {}".format(snapshot_save_path))
check_point.save(snapshot_save_path)
flow.sync_default_session()

if args.serve_for_online:
print('Deleting the teacher params and the optimizer params from model_save_dir...')
remove_teacher_params(args.model_save_dir)

if args.do_eval:
print('Loading model...')
print(args.model_save_dir)

if not args.do_train:
check_point.load(args.model_save_dir)
print('Evaluation...')
run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')
# if args.save_last_snapshot:
# snapshot.save("last_snapshot")



if __name__ == "__main__":
main()

+ 62
- 0
model_compress/model_compress/distil/examples/knowledge_distillation/README.md View File

@@ -0,0 +1,62 @@
# Knowledge Distillation
["Distilling the knowledge in a neural network"](https://arxiv.org/abs/1503.02531)论文的实现

KD的思路是使用教师模型的softmax层输出logits作为“soft target”,使得student模型可以学习teacher模型的输出,达到student模型模仿teacher模型在预测层的表现的目的。

L_KD = αL_CE+(1-α)L_DS
- L_CE 为学生模型的输出logits和label的交叉熵。
- L_DS 为学生模型输出logits和教师模型输出logits的距离,比如可以用软softmax或者KL散度等计算。
- α用来调节两个loss的权重。
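The snippet below is a minimal NumPy sketch of this combined objective, assuming standard softmax cross entropy for both terms. The names `kd_loss`, `alpha` and `temperature` are illustrative only; the repository's own implementation lives in `knowledge_distill_util.py` (e.g. `pred_distill` and `soft_cross_entropy`).

```python
import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def kd_loss(student_logits, teacher_logits, labels, alpha=0.9, temperature=1.0):
    # L_CE: hard-label cross entropy between the student predictions and the labels.
    probs = softmax(student_logits)
    l_ce = -np.log(probs[np.arange(len(labels)), labels]).mean()
    # L_DS: soft cross entropy between temperature-scaled teacher and student distributions.
    t_probs = softmax(teacher_logits / temperature)
    s_log_probs = np.log(softmax(student_logits / temperature))
    l_ds = -(t_probs * s_log_probs).sum(axis=-1).mean()
    return alpha * l_ce + (1 - alpha) * l_ds

# Toy usage: 4 examples, 2 classes.
student = np.random.randn(4, 2)
teacher = np.random.randn(4, 2)
labels = np.array([0, 1, 1, 0])
print(kd_loss(student, teacher, labels, alpha=0.9))
```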

## 1. Dependencies
- Python 3.6
- oneflow-cu101 0.1.10

The full environment can be set up with the following commands:
```bash
conda create -n tinybert python=3.6
```

```bash
python3 -m pip install --find-links https://oneflow-inc.github.io/nightly oneflow_cu101 --user
```
> Note: for all of the following steps, the working directory is `model_compress/distil`

## 2. Data Preparation
See [here](../../README.md#22-数据获取) for how to obtain the data.

## 3. Fine-tuning the Teacher Model
See [here](../../README.md#23-微调教师模型) for how to fine-tune the teacher model.
## 4. Distilling into the Student Model
### 4.1 Training
Run the following script to distill the teacher model into the student model:
- DATA_ROOT: root path of the GLUE datasets
- dataset: task name
- FT_BERT_BASE_DIR: path of the teacher model fine-tuned on the specific task
- STUDENT_DIR: save path of the student model
- RESULT_DIR: save path of the evaluation-result JSON file (if RESULT_DIR="", it defaults to results_eval.json under the model save path)
- SERVE_FOR_ONLINE: whether the model will be deployed online (default SERVE_FOR_ONLINE='False'; if SERVE_FOR_ONLINE='True', variables irrelevant to serving, such as the teacher parameters and optimizer parameters, are removed from the model save path)

```bash
bash run_train_student_kd.sh
```

### 4.2 Evaluation
Run the following script to evaluate the student model:
- DATA_ROOT: root path of the GLUE datasets
- dataset: task name
- STUDENT_DIR: save path of the student model
- RESULT_DIR: save path of the evaluation-result JSON file (if RESULT_DIR="", it defaults to results_eval.json under the model save path)

A distilled student model (SST-2 dataset) can be downloaded here:

Download link: https://pan.baidu.com/s/1EgQyQgxAcFAG8Ch3-4VPaw  extraction code: 5k9p
```bash
bash run_eval_student_kd.sh
```
### 4.3 Results
On the SST-2 dev set:
- Accuracy: teacher acc 92.2% -> student acc 80.5%
- Model size: teacher 110M -> student 14.5M (↓7.5x)
- Inference time: teacher 4.04 s -> student 0.81 s (↓5.0x)

+ 498
- 0
model_compress/model_compress/distil/examples/knowledge_distillation/task_student_kd.py View File

@@ -0,0 +1,498 @@
"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import os
import math
import numpy as np
import sys

curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "./src")))

import oneflow as flow
import oneflow.typing as tp
from typing import Tuple, Any
from classifier import GlueBERT

from util import Snapshot, Summary, InitNodes, Metric, CreateOptimizer, GetFunctionConfig, getdirsize, \
remove_optimizer_params, remove_teacher_params

import config as configs
from sklearn.metrics import accuracy_score, matthews_corrcoef, precision_score, recall_score, f1_score
import argparse
import shutil
import tempfile
from knowledge_distill_util import BertForSequenceClassification, BertStudentForSequenceClassification, \
soft_cross_entropy, mseloss, layer_distill, att_distill, pred_distill
import time
import json


def str2bool(v):
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Unsupported value encountered.')


parser = configs.get_parser()
parser.add_argument("--task_name", type=str, default='CoLA')
parser.add_argument("--teacher_model", default=None, type=str, help="The teacher model dir.")
parser.add_argument("--student_model", default=None, type=str, help="The student model dir.")
parser.add_argument("--total_model", default=None, type=str, help="The student model dir.")

parser.add_argument('--num_epochs', type=int, default=3, help='number of epochs')
parser.add_argument("--train_data_dir", type=str, default=None)
parser.add_argument("--train_data_prefix", type=str, default='train.of_record-')
parser.add_argument("--train_example_num", type=int, default=88614,
help="example number in dataset")
parser.add_argument("--batch_size_per_device", type=int, default=32)
parser.add_argument("--train_data_part_num", type=int, default=1,
help="data part number in dataset")
parser.add_argument("--eval_data_dir", type=str, default=None)
parser.add_argument("--eval_data_prefix", type=str, default='eval.of_record-')
parser.add_argument("--eval_example_num", type=int, default=10833,
help="example number in dataset")
parser.add_argument("--eval_batch_size_per_device", type=int, default=64)
parser.add_argument("--eval_data_part_num", type=int, default=1,
help="data part number in dataset")
parser.add_argument("--result_dir", type=str, default="", help="the save directory of results")

#
parser.add_argument("--student_num_hidden_layers", type=int, default=24)
parser.add_argument("--student_num_attention_heads", type=int, default=16)
parser.add_argument("--student_max_position_embeddings", type=int, default=512)
parser.add_argument("--student_type_vocab_size", type=int, default=2)
parser.add_argument("--student_vocab_size", type=int, default=30522)
parser.add_argument("--student_attention_probs_dropout_prob", type=float, default=0.1)
parser.add_argument("--student_hidden_dropout_prob", type=float, default=0.1)
parser.add_argument("--student_hidden_size_per_head", type=int, default=64)
parser.add_argument("--student_hidden_size", type=int, default=768)

parser.add_argument("--teacher_num_hidden_layers", type=int, default=24)
parser.add_argument("--teacher_num_attention_heads", type=int, default=16)
parser.add_argument("--teacher_max_position_embeddings", type=int, default=512)
parser.add_argument("--teacher_type_vocab_size", type=int, default=2)
parser.add_argument("--teacher_vocab_size", type=int, default=30522)
parser.add_argument("--teacher_attention_probs_dropout_prob", type=float, default=0.1)
parser.add_argument("--teacher_hidden_dropout_prob", type=float, default=0.1)
parser.add_argument("--teacher_hidden_size_per_head", type=int, default=64)
parser.add_argument("--teacher_hidden_size", type=int, default=768)

parser.add_argument("--kd_alpha", type=float, default=0.9)

parser.add_argument('--temperature', type=float, default=1.)
parser.add_argument('--aug_train', type=str2bool, nargs='?', const=False, help='using augmented training set?')

parser.add_argument('--serve_for_online', type=str2bool, nargs='?', const=False,
help='if serving online, the teacher params and optimizer params will be deleted from model_save_dir after training')

args = parser.parse_args()

task_name = args.task_name.lower()

if args.aug_train:
args.train_data_dir = args.train_data_dir.replace('train', 'train_aug')

batch_size = args.num_nodes * args.gpu_num_per_node * args.batch_size_per_device
eval_batch_size = args.num_nodes * args.gpu_num_per_node * args.eval_batch_size_per_device

epoch_size = math.ceil(args.train_example_num / batch_size)
num_eval_steps = math.ceil(args.eval_example_num / eval_batch_size)
args.iter_num = epoch_size * args.num_epochs
configs.print_args(args)

glue_output_modes = {
"cola": "classification",
"mnli": "classification",
"mnli-mm": "classification",
"mrpc": "classification",
"sst-2": "classification",
"sts-b": "regression",
"qqp": "classification",
"qnli": "classification",
"rte": "classification",
"wnli": "classification",
}

acc_tasks = ["mnli", "mrpc", "sst-2", "qqp", "qnli", "rte"]
corr_tasks = ["sts-b"]
mcc_tasks = ["cola"]

output_mode = glue_output_modes[args.task_name.lower()]


def BertDecoder(
data_dir, batch_size, data_part_num, seq_length, part_name_prefix, shuffle=True
):
with flow.scope.placement("cpu", "0:0"):
ofrecord = flow.data.ofrecord_reader(data_dir,
batch_size=batch_size,
data_part_num=data_part_num,
part_name_prefix=part_name_prefix,
random_shuffle=shuffle,
shuffle_after_epoch=shuffle)
blob_confs = {}

def _blob_conf(name, shape, dtype=flow.int32):
blob_confs[name] = flow.data.OFRecordRawDecoder(ofrecord, name, shape=shape, dtype=dtype)

_blob_conf("input_ids", [seq_length])
_blob_conf("input_mask", [seq_length])
_blob_conf("segment_ids", [seq_length])
_blob_conf("label_ids", [1])
_blob_conf("is_real_example", [1])

return blob_confs


def get_tensor_data(
batch_size,
data_part_num,
data_dir,
part_name_prefix,
shuffle=True
):
decoders = BertDecoder(
data_dir, batch_size, data_part_num, args.seq_length, part_name_prefix, shuffle=shuffle
)
return decoders


def BuildBert(
batch_size,
data_part_num,
data_dir,
part_name_prefix,
shuffle=True
):
hidden_size = args.hidden_size ##64 * args.num_attention_heads # , H = 64, size per head
args.hidden_size_per_head = hidden_size / args.num_attention_heads
# intermediate_size = hidden_size * 4
intermediate_size = 1200

decoders = BertDecoder(
data_dir, batch_size, data_part_num, args.seq_length, part_name_prefix, shuffle=shuffle
)
# is_real_example = decoders['is_real_example']

loss, logits = GlueBERT(
decoders['input_ids'],
decoders['input_mask'],
decoders['segment_ids'],
decoders['label_ids'],
args.vocab_size,
seq_length=args.seq_length,
hidden_size=hidden_size,
num_hidden_layers=args.num_hidden_layers,
num_attention_heads=args.num_attention_heads,
intermediate_size=intermediate_size,
hidden_act="gelu",
hidden_dropout_prob=args.hidden_dropout_prob,
attention_probs_dropout_prob=args.attention_probs_dropout_prob,
max_position_embeddings=args.max_position_embeddings,
type_vocab_size=args.type_vocab_size,
initializer_range=0.02,
)
return loss, logits, decoders['label_ids']


def student_model(input_ids, input_mask, segment_ids, is_train=True):
hidden_size = args.student_hidden_size ##64 * args.num_attention_heads # , H = 64, size per head
args.student_hidden_size_per_head = hidden_size / args.student_num_attention_heads
# intermediate_size = hidden_size * 4
intermediate_size = 1200

logits, reps, atts = BertStudentForSequenceClassification(
input_ids_blob=input_ids,
input_mask_blob=input_mask,
token_type_ids_blob=segment_ids,
label_blob=None,
vocab_size=args.student_vocab_size,
seq_length=args.seq_length,
hidden_size=hidden_size,
num_hidden_layers=args.student_num_hidden_layers,
num_attention_heads=args.student_num_attention_heads,
intermediate_size=intermediate_size,
hidden_act="gelu",
hidden_dropout_prob=args.student_hidden_dropout_prob,
attention_probs_dropout_prob=args.student_attention_probs_dropout_prob,
max_position_embeddings=args.student_max_position_embeddings,
type_vocab_size=args.student_type_vocab_size,
initializer_range=0.02,
is_student=True,
fit_size=args.teacher_hidden_size,
is_train=is_train
)
return logits, reps, atts


def teacher_model(input_ids, input_mask, segment_ids, is_train):
teacher_hidden_size = args.teacher_hidden_size ##64 * args.num_attention_heads # , H = 64, size per head
args.teacher_hidden_size_per_head = teacher_hidden_size / args.teacher_num_attention_heads
intermediate_size = teacher_hidden_size * 4
logits, reps, atts = BertForSequenceClassification(
input_ids_blob=input_ids,
input_mask_blob=input_mask,
token_type_ids_blob=segment_ids,
label_blob=None,
vocab_size=args.vocab_size,
seq_length=args.seq_length,
hidden_size=teacher_hidden_size,
num_hidden_layers=args.teacher_num_hidden_layers,
num_attention_heads=args.teacher_num_attention_heads,
intermediate_size=intermediate_size,
hidden_act="gelu",
hidden_dropout_prob=args.teacher_hidden_dropout_prob,
attention_probs_dropout_prob=args.teacher_attention_probs_dropout_prob,
max_position_embeddings=args.teacher_max_position_embeddings,
type_vocab_size=args.teacher_type_vocab_size,
initializer_range=0.02,
is_student=False,
is_train=is_train
)
return logits, reps, atts


def watch_handler(y: tp.Numpy):
print("out:", y)


@flow.global_function(type='train', function_config=GetFunctionConfig(args))
def DistilJob():
train_dataset = get_tensor_data(
batch_size,
args.train_data_part_num,
args.train_data_dir,
args.train_data_prefix,
)
student_logits, student_reps, student_atts = student_model(train_dataset['input_ids'], train_dataset['input_mask'],
train_dataset['segment_ids'], is_train=True)

teacher_logits, teacher_reps, teacher_atts = teacher_model(train_dataset['input_ids'], train_dataset['input_mask'],
train_dataset['segment_ids'], is_train=False)

if output_mode == "classification":
cls_loss = pred_distill(args, student_logits, teacher_logits)
elif output_mode == "regression":
"""
todo
loss_mse = MSELoss()
cls_loss = loss_mse(student_logits.view(-1), label_ids.view(-1))
"""
pass
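# NOTE (illustrative only): for a regression task such as STS-B, cls_loss would be
# a mean-squared-error term instead, e.g. between student_logits and the regression
# targets as outlined in the todo above. The imported mseloss helper is a plausible
# candidate for this, assuming it computes an MSE between two blobs.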

loss_ce = flow.nn.sparse_softmax_cross_entropy_with_logits(
logits=student_logits, labels=train_dataset['label_ids']
)
# flow.watch(student_logits, watch_handler)
# flow.watch(train_dataset['label_ids'], watch_handler)
loss = loss_ce * args.kd_alpha + cls_loss * (1 - args.kd_alpha)

flow.losses.add_loss(loss)

opt = CreateOptimizer(args)
opt.minimize(loss)

return {'loss': loss}


#
@flow.global_function(type='predict', function_config=GetFunctionConfig(args))
def StudentBertGlueEvalTrainJob():
train_dataset = get_tensor_data(
batch_size,
args.train_data_part_num,
args.train_data_dir,
args.train_data_prefix,
shuffle=False
)
student_logits, student_reps, student_atts = student_model(train_dataset['input_ids'], train_dataset['input_mask'],
train_dataset['segment_ids'], is_train=False)

return student_logits, train_dataset['label_ids']


@flow.global_function(type='predict', function_config=GetFunctionConfig(args))
def StudentBertGlueEvalValJob():
# 8551 or 1042
dev_dataset = get_tensor_data(
eval_batch_size,
args.eval_data_part_num,
args.eval_data_dir,
args.eval_data_prefix,
shuffle=False
)
student_logits, student_reps, student_atts = student_model(dev_dataset['input_ids'], dev_dataset['input_mask'],
dev_dataset['segment_ids'], is_train=False)

return student_logits, dev_dataset['label_ids']


def run_eval_job(eval_job_func, num_steps, desc='train'):
labels = []
predictions = []
start_time = time.time()
for index in range(num_steps):
logits, label = eval_job_func().get()
predictions.extend(list(logits.numpy().argmax(axis=1)))
labels.extend(list(label))
end_time = time.time()
cost_time = end_time - start_time
print('cost time: {} s'.format(cost_time))

model_size = getdirsize(args.model_save_dir)
print('model_size: %d Mbytes' % (model_size / 1024 / 1024)) # Mbytes

accuracy = accuracy_score(labels, predictions)
mcc = matthews_corrcoef(labels, predictions)
precision = precision_score(labels, predictions)
recall = recall_score(labels, predictions)
f_1 = f1_score(labels, predictions)
save_dict = {"accuracy": "%.2f" % accuracy,
"MCC": "%.2f" % mcc,
"precision": "%.2f" % precision,
"recall": "%.2f" % recall,
"f_1": "%.2f" % f_1,
"modelSize": "%d" % (model_size / 1024 / 1024),
"reasoningTime": "%.2f" % (args.eval_example_num / cost_time)} # sample/second

if args.result_dir == "":
args.result_dir = args.model_save_dir
if not os.path.exists(args.result_dir):
os.makedirs(args.result_dir)
with open(os.path.join(args.result_dir, 'results_{}.json'.format(desc)), "w") as f:
json.dump(save_dict, f)

def metric_fn(predictions, labels):
return {
"accuracy": accuracy,
"matthews_corrcoef": mcc,
"precision": precision,
"recall": recall,
"f1": f_1,
}

metric_dict = metric_fn(predictions, labels)
print(desc, ', '.join('{}: {:.3f}'.format(k, v) for k, v in metric_dict.items()))
return metric_dict


def CopyFile(filepath, newPath):
fileNames = os.listdir(filepath)
for file in fileNames:
newDir = os.path.join(filepath, file)
if os.path.isfile(newDir):
# print(newDir)
newFile = os.path.join(newPath, file)
shutil.copyfile(newDir, newFile)
else:
if not os.path.exists(os.path.join(newPath, file)):
os.makedirs(os.path.join(newPath, file))
CopyFile(newDir, os.path.join(newPath, file))


def main():
flow.config.gpu_device_num(args.gpu_num_per_node)
flow.env.log_dir(args.log_dir)

InitNodes(args)

check_point = flow.train.CheckPoint()

summary = Summary(args.log_dir, args)
if not os.path.exists(args.model_save_dir):
os.makedirs(args.model_save_dir)
if args.do_train:
print('Loading model...')
check_point.load(args.teacher_model)

print('Start training...')
global_step = 0
best_dev_acc = 0.0
for epoch in range(args.num_epochs):
metric = Metric(desc='finetune', print_steps=args.loss_print_every_n_iter, summary=summary,
batch_size=batch_size, keys=['loss'])

for step in range(epoch_size):
DistilJob().async_get(metric.metric_cb(step, epoch=epoch))
global_step += 1
# if (global_step + 1) % args.model_save_every_n_iter == 0:
# if not os.path.exists(args.model_save_dir):
# os.makedirs(args.model_save_dir)
# snapshot_save_path = os.path.join(
# args.model_save_dir, "snapshot_%d" % (global_step + 1)
# )
# print("Saving model to {}.".format(snapshot_save_path))
# check_point.save(snapshot_save_path)

print('EvalTrainJob...')
run_eval_job(StudentBertGlueEvalTrainJob, epoch_size, desc='train')
print('EvalValJob...')
result = run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')

save_model = False
if task_name in acc_tasks and result['accuracy'] > best_dev_acc:
best_dev_acc = result['accuracy']
save_model = True

# if task_name in corr_tasks and result['corr'] > best_dev_acc:
# best_dev_acc = result['corr']
# save_model = True

if task_name in mcc_tasks and result['matthews_corrcoef'] > best_dev_acc:
best_dev_acc = result['matthews_corrcoef']
save_model = True
print('Best result:', result)

if save_model:
if os.path.exists(args.model_save_dir):
import shutil
shutil.rmtree(args.model_save_dir)
if not os.path.exists(args.model_save_dir):
os.makedirs(args.model_save_dir)
snapshot_save_path = os.path.join(args.model_save_dir)
print("Saving best model to {}".format(snapshot_save_path))
check_point.save(snapshot_save_path)
flow.sync_default_session()

if args.save_last_snapshot:
snapshot_save_path = args.model_save_dir
if os.path.exists(args.model_save_dir):
import shutil
shutil.rmtree(args.model_save_dir)
print("Saving model to {}".format(snapshot_save_path))
check_point.save(snapshot_save_path)
flow.sync_default_session()

if args.serve_for_online:
print('Deleting the teacher params and the optimizer params from model_save_dir...')
remove_teacher_params(args.model_save_dir)

if args.do_eval:
print('Loading model...')
print(args.model_save_dir)

if not args.do_train:
check_point.load(args.model_save_dir)
print('Evaluation...')
run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')
# if args.save_last_snapshot:
# snapshot.save("last_snapshot")


if __name__ == "__main__":
main()

+ 54
- 0
model_compress/model_compress/distil/examples/teacher_bert/README.md View File

@@ -0,0 +1,54 @@
# BERT Teacher Model
Fine-tune BERT on the GLUE text-classification datasets to serve as the teacher model for knowledge distillation.

## 1. Dependencies
- Python 3.6
- oneflow-cu101 0.1.10

The full environment can be set up with the following commands:
```bash
conda create -n tinybert python=3.6
```

```bash
python3 -m pip install --find-links https://oneflow-inc.github.io/nightly oneflow_cu101 --user
```
> Note: for all of the following steps, the working directory is `model_compress/distil`

## 2. Data Preparation
See [here](../../README.md#22-数据获取) for how to obtain the data.

## 3. Fine-tuning the Teacher Model
The pre-trained BERT model can be downloaded here:
Link: https://pan.baidu.com/s/1jfTUY7ygcZZOJzjfrgUL8Q  extraction code: 6b87

After downloading, place it under `model_compress/models/uncased_L-12_H-768_A-12_oneflow`
#### 3.1 Training
- Run the following script to fine-tune the teacher model:
- DATA_ROOT: root path of the GLUE datasets
- dataset: task name
- MODEL_SAVE_DIR: save path of the model
- RESULT_DIR: save path of the evaluation-result JSON file (if RESULT_DIR="", it defaults to results_eval.json under the model save path)
- SERVE_FOR_ONLINE: whether the model will be deployed online (default SERVE_FOR_ONLINE='False'; if SERVE_FOR_ONLINE='True', variables irrelevant to serving, such as the optimizer parameters, are removed from the model save path)

```bash
bash run_train_teacher.sh
```
- Our fine-tuned teacher models can be downloaded here: link: https://pan.baidu.com/s/1jiOTSPBmmBoij0UwPO6UKw  extraction code: 9xkp
- They were fine-tuned on the SST-2, QQP, MRPC, RTE and CoLA datasets
- Place them under `"model_compress/distil/models/finetuned_teacher/"`.
- Performance on the dev sets of these datasets: SST-2: 92.2%, QQP: 91.1%, MRPC: 89.2%, RTE: 69.8%, CoLA: 58.5%
- Evaluation metrics:
  - Accuracy: SST-2, MRPC, QQP, RTE
  - MCC (Matthews correlation coefficient): CoLA
#### 3.2 Evaluation
- After fine-tuning, run the following script to evaluate the teacher model:
- DATA_ROOT: root path of the GLUE datasets
- dataset: task name
- TEACHER_MODEL_DIR: path of the teacher model
- RESULT_DIR: save path of the evaluation-result JSON file (if RESULT_DIR="", it defaults to results_eval.json under the model save path)

```bash
bash run_eval_teacher.sh
```

+ 311
- 0
model_compress/model_compress/distil/examples/teacher_bert/task_teacher.py View File

@@ -0,0 +1,311 @@
"""
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import os
import math
import numpy as np

import oneflow as flow
import sys

curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "./src")))

from classifier import GlueBERT
from util import Snapshot, Summary, InitNodes, Metric, CreateOptimizer, GetFunctionConfig, getdirsize, remove_optimizer_params, remove_teacher_params

import config as configs
from sklearn.metrics import accuracy_score, matthews_corrcoef, precision_score, recall_score, f1_score
import argparse
import time
import json
import shutil

def str2bool(v):
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Unsupported value encountered.')

parser = configs.get_parser()
parser.add_argument("--task_name", type=str, default='CoLA')
parser.add_argument('--num_epochs', type=int, default=3, help='number of epochs')
parser.add_argument("--train_data_dir", type=str, default=None)
parser.add_argument("--train_data_prefix", type=str, default='train.of_record-')
parser.add_argument("--train_example_num", type=int, default=88614,
help="example number in dataset")
parser.add_argument("--batch_size_per_device", type=int, default=32)
parser.add_argument("--train_data_part_num", type=int, default=1,
help="data part number in dataset")
parser.add_argument("--eval_data_dir", type=str, default=None)
parser.add_argument("--eval_data_prefix", type=str, default='eval.of_record-')
parser.add_argument("--eval_example_num", type=int, default=10833,
help="example number in dataset")
parser.add_argument("--eval_batch_size_per_device", type=int, default=64)
parser.add_argument("--eval_data_part_num", type=int, default=1,
help="data part number in dataset")
parser.add_argument("--result_dir", type=str, default="", help="the save directory of results")

parser.add_argument('--serve_for_online', type=str2bool, nargs='?', const=False, help='if True, the teacher params and optimizer params will be deleted from model_save_dir after training')

args = parser.parse_args()

task_name = args.task_name.lower()

batch_size = args.num_nodes * args.gpu_num_per_node * args.batch_size_per_device
eval_batch_size = args.num_nodes * args.gpu_num_per_node * args.eval_batch_size_per_device

epoch_size = math.ceil(args.train_example_num / batch_size)
num_eval_steps = math.ceil(args.eval_example_num / eval_batch_size)
args.iter_num = epoch_size * args.num_epochs
configs.print_args(args)

glue_output_modes = {
"cola": "classification",
"mnli": "classification",
"mnli-mm": "classification",
"mrpc": "classification",
"sst-2": "classification",
"sts-b": "regression",
"qqp": "classification",
"qnli": "classification",
"rte": "classification",
"wnli": "classification",
}

acc_tasks = ["mnli", "mrpc", "sst-2", "qqp", "qnli", "rte"]
corr_tasks = ["sts-b"]
mcc_tasks = ["cola"]


def BertDecoder(
data_dir, batch_size, data_part_num, seq_length, part_name_prefix, shuffle=True
):
with flow.scope.placement("cpu", "0:0"):
ofrecord = flow.data.ofrecord_reader(data_dir,
batch_size=batch_size,
data_part_num=data_part_num,
part_name_prefix=part_name_prefix,
random_shuffle=shuffle,
shuffle_after_epoch=shuffle)
blob_confs = {}
def _blob_conf(name, shape, dtype=flow.int32):
blob_confs[name] = flow.data.OFRecordRawDecoder(ofrecord, name, shape=shape, dtype=dtype)

_blob_conf("input_ids", [seq_length])
_blob_conf("input_mask", [seq_length])
_blob_conf("segment_ids", [seq_length])
_blob_conf("label_ids", [1])
_blob_conf("is_real_example", [1])

return blob_confs


def BuildBert(
batch_size,
data_part_num,
data_dir,
part_name_prefix,
shuffle=True
):
hidden_size = 64 * args.num_attention_heads # , H = 64, size per head
intermediate_size = hidden_size * 4
# intermediate_size=1200
decoders = BertDecoder(
data_dir, batch_size, data_part_num, args.seq_length, part_name_prefix, shuffle=shuffle
)
#is_real_example = decoders['is_real_example']

loss, logits = GlueBERT(
decoders['input_ids'],
decoders['input_mask'],
decoders['segment_ids'],
decoders['label_ids'],
args.vocab_size,
seq_length=args.seq_length,
hidden_size=hidden_size,
num_hidden_layers=args.num_hidden_layers,
num_attention_heads=args.num_attention_heads,
intermediate_size=intermediate_size,
hidden_act="gelu",
hidden_dropout_prob=args.hidden_dropout_prob,
attention_probs_dropout_prob=args.attention_probs_dropout_prob,
max_position_embeddings=args.max_position_embeddings,
type_vocab_size=args.type_vocab_size,
initializer_range=0.02,
)
return loss, logits, decoders['label_ids']


@flow.global_function(type='train', function_config=GetFunctionConfig(args))
def BertGlueFinetuneJob():
loss, logits, _ = BuildBert(
batch_size,
args.train_data_part_num,
args.train_data_dir,
args.train_data_prefix,
)
flow.losses.add_loss(loss)
opt = CreateOptimizer(args)
opt.minimize(loss)
return {'loss': loss}


@flow.global_function(type='predict', function_config=GetFunctionConfig(args))
def BertGlueEvalTrainJob():
_, logits, label_ids = BuildBert(
batch_size,
args.train_data_part_num,
args.train_data_dir,
args.train_data_prefix,
shuffle=False
)
return logits, label_ids


@flow.global_function(type='predict', function_config=GetFunctionConfig(args))
def BertGlueEvalValJob():
#8551 or 1042
_, logits, label_ids = BuildBert(
eval_batch_size,
args.eval_data_part_num,
args.eval_data_dir,
args.eval_data_prefix,
shuffle=False
)
return logits, label_ids

def run_eval_job(eval_job_func, num_steps, desc='train'):
labels = []
predictions = []
start_time = time.time()
for index in range(num_steps):
logits, label = eval_job_func().get()
predictions.extend(list(logits.numpy().argmax(axis=1)))
labels.extend(list(label))
end_time = time.time()
cost_time = end_time-start_time
print('cost time: {} s'.format(cost_time))

model_size = getdirsize(args.model_save_dir)
print('model_size: %d Mbytes' % (model_size/1024/1024)) # Mbytes

accuracy = accuracy_score(labels, predictions)
mcc = matthews_corrcoef(labels, predictions)
precision = precision_score(labels, predictions)
recall = recall_score(labels, predictions)
f_1 = f1_score(labels, predictions)
save_dict = {"accuracy":"%.2f" % accuracy,
"MCC":"%.2f" % mcc,
"precision": "%.2f" % precision,
"recall": "%.2f" % recall,
"f_1": "%.2f" % f_1,
"modelSize":"%d" % (model_size/1024/1024),
"reasoningTime":"%.2f" % (args.eval_example_num / cost_time)} # sample/second

if args.result_dir == "":
args.result_dir = args.model_save_dir
if not os.path.exists(args.result_dir):
os.makedirs(args.result_dir)
with open(os.path.join(args.result_dir, 'results_{}.json'.format(desc)), "w") as f:
json.dump(save_dict, f)

def metric_fn(predictions, labels):
return {
"accuracy": accuracy,
"matthews_corrcoef": mcc,
"precision": precision,
"recall": recall,
"f1": f_1,
}

metric_dict = metric_fn(predictions, labels)
print(desc, ', '.join('{}: {:.3f}'.format(k, v) for k, v in metric_dict.items()))
#pd.DataFrame({'predictions': predictions, 'labels': labels}).to_csv('predictions_{0}.csv'.format(step), index=False)
return metric_dict

def main():
flow.config.gpu_device_num(args.gpu_num_per_node)
flow.env.log_dir(args.log_dir)

InitNodes(args)
if args.do_train:
snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

summary = Summary(args.log_dir, args)
best_dev_acc = 0.0
best_result = {}
for epoch in range(args.num_epochs):
metric = Metric(desc='finetune', print_steps=args.loss_print_every_n_iter, summary=summary,
batch_size=batch_size, keys=['loss'])

for step in range(epoch_size):
BertGlueFinetuneJob().async_get(metric.metric_cb(step, epoch=epoch))
#if 1: #step % args.loss_print_every_n_iter == 0:

run_eval_job(BertGlueEvalTrainJob, epoch_size, desc='train')
result = run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')

save_model = False
if task_name in acc_tasks and result['accuracy'] > best_dev_acc:
best_dev_acc = result['accuracy']
best_result = result
save_model = True
print('Best result:', result)

# if task_name in corr_tasks and result['corr'] > best_dev_acc:
# best_dev_acc = result['corr']
# best_result = result
# save_model = True
#print('Best result:', result)

if task_name in mcc_tasks and result['matthews_corrcoef'] > best_dev_acc:
best_dev_acc = result['matthews_corrcoef']
best_result = result
save_model = True
print('Best result:', result)

if save_model:
if not os.path.exists(args.model_save_dir):
os.makedirs(args.model_save_dir)
# snapshot_save_path = os.path.join(args.model_save_dir)
# print("Saving best model to {}".format(snapshot_save_path))
snapshot.save('best')
flow.sync_default_session()
print('Best result:',best_result )
print("Saving best model to "+os.path.join(args.model_save_dir,'snapshot_best'))

if args.serve_for_online:
print('Deleting the optimizer params from model_save_dir...')
remove_optimizer_params(os.path.join(args.model_save_dir,'snapshot_best'))

# if args.save_last_snapshot:
# snapshot.save("last_snapshot")
if args.do_eval:
print('Loading model...')
print(args.model_save_dir)
if not args.do_train:
check_point = flow.train.CheckPoint()
check_point.load(args.model_save_dir)
print('Evaluation...')
run_eval_job(BertGlueEvalValJob, num_eval_steps, desc='eval')


if __name__ == "__main__":
main()

+ 203
- 0
model_compress/model_compress/distil/examples/tinybert/README.md View File

@@ -0,0 +1,203 @@
# TinyBERT
["TinyBERT: Distilling BERT for Natural Language Understanding"](https://arxiv.org/abs/1909.10351)论文的实现

## 1. Dependencies
- Python 3.6
- oneflow-cu101 0.1.10

The full environment can be set up with the following commands:
```bash
conda create -n tinybert python=3.6
```

```bash
python3 -m pip install --find-links https://oneflow-inc.github.io/nightly oneflow_cu101 --user
```
> Note: for all of the following steps, the working directory is `model_compress/distil`
## 2. General Distillation (optional)
In the general distillation stage, the pre-trained BERT-base serves as the teacher model, and knowledge distillation is performed on a large-scale text corpus to obtain a general TinyBERT.
This lets TinyBERT learn general-purpose semantic representations, improves its generalization ability, and provides a good initialization for the subsequent task-specific distillation.

General distillation consists of two steps:
(1) corpus preprocessing, and (2) general distillation.

### 2.1 Corpus Preprocessing
Prepare a large-scale corpus, e.g. the [WikiText-2 dataset](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/). It can be downloaded with the following commands:
```
cd data
wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
unzip wikitext-103-raw-v1.zip
rm wikitext-103-raw-v1.zip
```
Run the following command to preprocess the training data:
- CORPUS_RAW: the large-scale corpus, e.g. Wikipedia
- BERT_BASE_DIR: the teacher model type
- OUTPUT_DIR: save path of the processed corpus
Run directly:
```bash
bash run_pregenerate_training_data.sh
```
or run:
```bash
CORPUS_RAW='./data/wikitext-103-raw/wiki.train.raw'
BERT_BASE_DIR='bert-base-uncased'
OUTPUT_DIR='./data/pretrain_data_json'

python pregenerate_training_data.py \
--train_corpus $CORPUS_RAW \
--bert_model $BERT_BASE_DIR \
--do_lower_case \
--epochs_to_generate 3 \
--output_dir $OUTPUT_DIR
```

### 2.2 General Distillation
Convert the PyTorch general TinyBERT model into the OneFlow model format:
PyTorch general TinyBERT -> TensorFlow general TinyBERT -> OneFlow general TinyBERT

#### Step 1:
- Download the already-trained general TinyBERT model from the [TinyBERT page](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/TinyBERT).

- Use the provided convert_bert_pytorch_checkpoint_to_original_tf.py script to convert it to the TensorFlow model format. The conversion is as follows:

```
python convert_bert_pytorch_checkpoint_to_original_tf.py --model_name='./models/2nd_General_TinyBERT_4L_312D' --pytorch_model_path='./models/2nd_General_TinyBERT_4L_312D/pytorch_model.bin' --tf_cache_dir='./models/2nd_General_TinyBERT_4L_312D_tf'
```
- Then use the provided convert_tf_ckpt_to_of.py script to convert it to the OneFlow model format. The conversion is as follows:

```
cd ./models/2nd_General_TinyBERT_4L_312D_tf/
cat > checkpoint <<ONEFLOW
model_checkpoint_path: "bert_model.ckpt"
all_model_checkpoint_paths: "bert_model.ckpt"
ONEFLOW
```
This command creates a checkpoint file in the extracted directory and writes the following content into it:

model_checkpoint_path: "bert_model.ckpt"
all_model_checkpoint_paths: "bert_model.ckpt"

At this point, the TensorFlow model directory to be converted is ready; its structure is as follows:

```
2nd_General_TinyBERT_4L_312D_tf
├── bert_config.json
├── bert_model.ckpt.data-00000-of-00001
├── bert_model.ckpt.index
├── checkpoint
└── vocab.txt
```
#### Step 2:
Next, use convert_tf_ckpt_to_of.py to convert the TensorFlow model into a OneFlow model:

```
python convert_tf_ckpt_to_of.py \
--tf_checkpoint_path ./models/2nd_General_TinyBERT_4L_312D_tf \
--of_dump_path ./models/2nd_General_TinyBERT_4L_312D_oneflow
```

The command above saves the converted OneFlow-format model under the `./2nd_General_TinyBERT_4L_312D_oneflow` directory for use in the subsequent fine-tuning.


**You can also directly download the two sizes of general TinyBERT that we provide: General_TinyBERT(4layer-312dim) and General_TinyBERT(6layer-768dim)**

The download address is:

Link: https://pan.baidu.com/s/1vZDILxXi-uxo2v3zFlWL3A  extraction code: kpia

Download them and place them under the `'./models'` path, e.g. `'./models/2nd_General_TinyBERT_4L_312D_oneflow'` and `'./models/2nd_General_TinyBERT_6L_768D_oneflow'`



## 3. Data Augmentation (optional)
Data augmentation is an important step in TinyBERT. Through it, TinyBERT sees more task-related examples, which further improves the generalization ability of the student model and helps TinyBERT match the performance of BERT-base, or even surpass it on some tasks.

### 3.1 Downloading the GLUE datasets
Running the following script downloads all datasets of the GLUE tasks; they are automatically downloaded and extracted into the directory given by '--data_dir=data'.

```bash
python ../../src/download_glue_data.py --data_dir data/glue_data --tasks all
```

TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "SNLI", "QNLI", "RTE", "WNLI", "diagnostic"]

By default the script above downloads all GLUE task datasets; you can also use '--tasks=TASKS' to download only specific datasets.

To build the OFRecord datasets, refer to [Loading and preparing OneFlow datasets](https://github.com/Oneflow-Inc/oneflow-documentation/blob/master/cn/docs/extended_topics/how_to_make_ofdataset.md), or generate them by running:
```
bash glue_process.sh
```

**Or directly download the converted OFRecord GLUE datasets:**
Link: https://pan.baidu.com/s/1TuDJpJ8z9zJvvhqjjXiGDg  extraction code: phyf

### 3.2 Downloading the GloVe embeddings
The data augmentation method used by TinyBERT combines a pre-trained BERT with GloVe embeddings to perform word-level replacement.
Download the GloVe embeddings with the following commands and place them under the 'model_compress/distil/glove' directory:
```
cd glove
wget http://nlp.stanford.edu/data/glove.840B.300d.zip
unzip glove.840B.300d.zip
rm glove.840B.300d.zip
```

### 3.3 Running data augmentation
Run the following script to perform data augmentation:
``` bash
bash run_data_augmentation.sh
```
The augmented dataset train_aug.tsv is saved automatically under the corresponding GLUE task dataset directory. A simplified sketch of the word-replacement loop follows below.
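The sketch below is a simplified, self-contained illustration of the replacement loop that `data_augmentation.py` performs with the N (number of augmented copies), M (candidates per position) and p (replacement probability) parameters. In the real pipeline the candidate words for each position come from BERT's masked-LM predictions for single word-piece words and from GloVe nearest neighbours otherwise; `augment_sentence` and its arguments are illustrative names, not part of the repository.

```python
import random

def augment_sentence(tokens, candidates, n_aug=20, p_replace=0.4, max_tries=1000, seed=0):
    """Generate up to n_aug distinct augmented variants of a tokenized sentence."""
    rng = random.Random(seed)
    augmented = [" ".join(tokens)]          # keep the original sentence first
    tries = 0
    while len(augmented) < n_aug + 1 and tries < max_tries:
        tries += 1
        new_sent = list(tokens)
        for idx, words in candidates.items():
            # Replace the word at position idx with probability p_replace.
            if words and rng.random() < p_replace:
                new_sent[idx] = rng.choice(words)
        sent = " ".join(new_sent)
        if sent not in augmented:           # only keep new, distinct sentences
            augmented.append(sent)
    return augmented

# Toy usage: candidate replacements keyed by token index (up to M candidates each).
print(augment_sentence(
    ["the", "movie", "was", "great"],
    {1: ["film", "picture"], 3: ["wonderful", "fantastic", "good"]},
    n_aug=5, p_replace=0.4))
```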


## 4. Task-specific Distillation
In task-specific distillation, the general TinyBERT obtained above is fine-tuned again; fine-tuning on a specific task further improves TinyBERT. Task-specific distillation consists of the following steps:
(1) fine-tune the teacher BERT, then (2) fine-tune the student TinyBERT, which includes layer-to-layer distillation, attention distillation and soft-label distillation.

### 4.1 Fine-tuning the teacher model (BERT)
The pre-trained BERT model can be downloaded here:
- Link: https://pan.baidu.com/s/1jfTUY7ygcZZOJzjfrgUL8Q  extraction code: 6b87
- After downloading, place it under `./models/uncased_L-12_H-768_A-12_oneflow`

See [here](../../README.md#23-微调教师模型) for how to fine-tune the teacher model.
- Our fine-tuned teacher models can be downloaded here: link: https://pan.baidu.com/s/1jiOTSPBmmBoij0UwPO6UKw  extraction code: 9xkp
- They were fine-tuned on the SST-2, QQP, MRPC, RTE and CoLA datasets
- Place them under `"model_compress/distil/models/finetuned_teacher/"`.
- Performance on the dev sets of these datasets: SST-2: 92.2%, QQP: 91.1%, MRPC: 89.2%, RTE: 69.8%, CoLA: 58.5%
- Evaluation metrics:
  - Accuracy: SST-2, MRPC, QQP, RTE
  - MCC (Matthews correlation coefficient): CoLA

### 4.2 Fine-tuning the student model (TinyBERT)
Run the following script to distill the teacher model into the student model:
- DATA_ROOT: root path of the GLUE datasets
- dataset: task name
- FT_BERT_BASE_DIR: path of the teacher model fine-tuned on the specific task
- TMP_STUDENT_DIR: path of the temporary student model
- STUDENT_DIR: save path of the student model
- RESULT_DIR: save path of the evaluation-result JSON file (if RESULT_DIR="", it defaults to results_eval.json under the model save path)
- SERVE_FOR_ONLINE: whether the model will be deployed online (default SERVE_FOR_ONLINE='False'; if SERVE_FOR_ONLINE='True', variables irrelevant to serving, such as the teacher parameters and optimizer parameters, are removed from the model save path)

Run directly:
```bash
bash run_train_student_tinybert.sh
```

The resulting student TinyBERT can be downloaded here:
- Download link: https://pan.baidu.com/s/1nOAZHd3wLmyVw2vTJB7KfQ  extraction code: ma65
- Place it under `./models/student_model/SST-2/tinybert_epoch-4_lr-2e-5_wd-0.0001`

### 4.3 Evaluation
Run the following script to evaluate performance on the GLUE tasks:
- DATA_ROOT: root path of the GLUE datasets
- dataset: task name
- STUDENT_DIR: save path of the student model (the distilled student model for the SST-2 dataset can be downloaded from the link above)
- RESULT_DIR: save path of the evaluation-result JSON file (if RESULT_DIR="", it defaults to results_eval.json under the model save path)

```bash
bash run_eval_student_tinybert.sh
```

### 4.4 Results
On the SST-2 dev set:
- Accuracy: teacher acc 92.2% -> student acc 91.3%
- Model size: teacher 110M -> student 14.5M (↓7.5x)
- Inference time: teacher 4.04 s -> student 0.65 s (↓6.2x)

+ 393
- 0
model_compress/model_compress/distil/examples/tinybert/data_augmentation.py View File

@@ -0,0 +1,393 @@
# coding: utf-8
"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import random
import sys
import os
import unicodedata
import re
import logging
import csv
import argparse
import numpy as np
import sys

curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "./src")))
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../src")))
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../../src")))

sys.path.append(rootPath)

import tokenization
from maskedBert import maskedBert
import oneflow as flow
import config as configs
from util import Snapshot, Summary, InitNodes, Metric, CreateOptimizer, GetFunctionConfig
import math
import oneflow.typing as tp

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt='%m/%d/%Y %H:%M:%S',
level=logging.INFO)

logger = logging.getLogger(__name__)

StopWordsList = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours',
'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself',
'they', 'them', 'their', 'theirs', 'themselves', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be',
'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because',
'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',
'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here',
'there', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so',
'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've',
'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven',
"haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't", "'s", "'re"]

parser = configs.get_parser()
parser.add_argument("--pretrained_bert_model", default="bert-base-uncased", type=str,
help="Downloaded pretrained model (bert-base-uncased) is under this folder")
parser.add_argument("--glove_embs", default="./glove/glove.840B.300d.txt", type=str,
help="Glove word embeddings file")
parser.add_argument("--glue_dir", default="./data", type=str,
help="GLUE data dir")
parser.add_argument("--task_name", default="MRPC", type=str,
help="Task(eg. CoLA, SST-2) that we want to do data augmentation for its train set")
parser.add_argument("--N", default=30, type=int,
help="How many times is the corpus expanded?")
parser.add_argument("--M", default=15, type=int,
help="Choose from M most-likely words in the corresponding position")
parser.add_argument("--p", default=0.4, type=float,
help="Threshold probability p to replace current word")

parser.add_argument(
'--vocab_file',
help='The vocabulary file that the BERT model was trained on.',
default=None,
type=str,
required=True
)

parser.add_argument("--eval_data_dir", type=str, default=None)
parser.add_argument("--eval_data_prefix", type=str, default='eval.of_record-')
parser.add_argument("--eval_batch_size_per_device", type=int, default=64)
parser.add_argument("--eval_data_part_num", type=int, default=1,
help="data part number in dataset")

args = parser.parse_args()

# batch_size = args.num_nodes * args.gpu_num_per_node * args.batch_size_per_device
# eval_batch_size = args.num_nodes * args.gpu_num_per_node * args.eval_batch_size_per_device
eval_batch_size = 1
# epoch_size = math.ceil(args.train_example_num / batch_size)
# num_eval_steps = math.ceil(args.eval_example_num / eval_batch_size)
# args.iter_num = epoch_size * args.num_epochs
configs.print_args(args)


def strip_accents(text):
"""
Strip accents from input String.

:param text: The input string.
:type text: String.

:returns: The processed String.
:rtype: String.
"""
try:
text = unicode(text, 'utf-8')
except (TypeError, NameError):
# unicode is a default on python 3
pass
text = unicodedata.normalize('NFD', text)
text = text.encode('ascii', 'ignore')
text = text.decode("utf-8")
return str(text)


# a valid string contains only lowercase letters a-z
def _is_valid(string):
return True if not re.search('[^a-z]', string) else False


def _read_tsv(input_file, quotechar=None):
"""Reads a tab separated value file."""
with open(input_file, "r", encoding="utf-8") as f:
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
lines = []
for line in reader:
if sys.version_info[0] == 2:
line = list(unicode(cell, 'utf-8') for cell in line)
lines.append(line)
return lines


def prepare_embedding_retrieval(glove_file, vocab_size=100000):
cnt = 0
words = []
embeddings = {}

# only read first 100,000 words for fast retrieval
with open(glove_file, 'r', encoding='utf-8') as fin:
for line in fin:
items = line.strip().split(' ')
words.append(items[0])
embeddings[items[0]] = [float(x) for x in items[1:]]

cnt += 1
if cnt == vocab_size:
break

vocab = {w: idx for idx, w in enumerate(words)}
ids_to_tokens = {idx: w for idx, w in enumerate(words)}

vector_dim = len(embeddings[ids_to_tokens[0]])
emb_matrix = np.zeros((vocab_size, vector_dim))
for word, v in embeddings.items():
if word == '<unk>':
continue
emb_matrix[vocab[word], :] = v

# normalize each word vector
d = (np.sum(emb_matrix ** 2, 1) ** 0.5)
emb_norm = (emb_matrix.T / d).T
return emb_norm, vocab, ids_to_tokens

BATCH_SIZE = 1
@flow.global_function(type="predict", function_config=GetFunctionConfig(args))
def eval_job(
input_ids: tp.Numpy.Placeholder((BATCH_SIZE, args.seq_length), dtype=flow.int32),
input_mask: tp.Numpy.Placeholder((BATCH_SIZE, args.seq_length), dtype=flow.int32),
segment_ids: tp.Numpy.Placeholder((BATCH_SIZE, args.seq_length), dtype=flow.int32),
mask_id: tp.Numpy.Placeholder((BATCH_SIZE, 1), dtype=flow.int32),
) -> tp.Numpy:
# with flow.scope.placement("gpu", "0:0"):
hidden_size = 64 * args.num_attention_heads # , H = 64, size per head
intermediate_size = hidden_size * 4
outputs = maskedBert(
input_ids,
input_mask,
segment_ids,
mask_id,
args.vocab_size,
seq_length=args.seq_length,
hidden_size=hidden_size,
num_hidden_layers=args.num_hidden_layers,
num_attention_heads=args.num_attention_heads,
intermediate_size=intermediate_size,
hidden_act="gelu",
hidden_dropout_prob=args.hidden_dropout_prob,
attention_probs_dropout_prob=args.attention_probs_dropout_prob,
max_position_embeddings=args.max_position_embeddings,
type_vocab_size=args.type_vocab_size,
initializer_range=0.02,
)
return outputs

class DataAugmentor(object):
def __init__(self, tokenizer, emb_norm, vocab, ids_to_tokens, M, N, p):
# self.model = model
self.tokenizer = tokenizer
self.emb_norm = emb_norm
self.vocab = vocab
self.ids_to_tokens = ids_to_tokens
self.M = M
self.N = N
self.p = p

def _word_distance(self, word):
if word not in self.vocab.keys():
return []
word_idx = self.vocab[word]
word_emb = self.emb_norm[word_idx]

dist = np.dot(self.emb_norm, word_emb.T)
dist[word_idx] = -np.Inf

candidate_ids = np.argsort(-dist)[:self.M]
return [self.ids_to_tokens[idx] for idx in candidate_ids][:self.M]

def _masked_language_model(self, sent, word_pieces, mask_id):
tokenized_text = self.tokenizer.tokenize(sent)
tokenized_text = ['[CLS]'] + tokenized_text
tokenized_len = len(tokenized_text)

tokenized_text = word_pieces + ['[SEP]'] + tokenized_text[1:] + ['[SEP]']

if len(tokenized_text) > 512:
tokenized_text = tokenized_text[:512]

token_ids = self.tokenizer.convert_tokens_to_ids(tokenized_text)
segments_ids = [0] * (tokenized_len + 1) + [1] * (len(tokenized_text) - tokenized_len - 1)
input_mask = [1] * len(token_ids)

# Zero-pad up to the sequence length.
while len(token_ids) < args.seq_length:
token_ids.append(0)
input_mask.append(0)
segments_ids.append(0)

token_ids = np.array(token_ids).reshape(1,args.seq_length).astype(np.int32)
input_mask = np.array(input_mask).reshape(1,args.seq_length).astype(np.int32)
segments_ids = np.array(segments_ids).reshape(1,args.seq_length).astype(np.int32)
mask_id = np.array(mask_id).reshape(1,1).astype(np.int32)
# print('token_ids:',token_ids)
# print('mask_id:',mask_id)
outputs = eval_job(token_ids,input_mask,segments_ids,mask_id)
# print(outputs)
predictions = outputs
predictions = np.array(predictions)
# print('predictions:',predictions)
word_candidates = np.argsort(-predictions)[0][:self.M].tolist()

word_candidates = self.tokenizer.convert_ids_to_tokens(word_candidates)

return list(filter(lambda x: x.find("##"), word_candidates))

def _word_augment(self, sentence, mask_token_idx, mask_token):
word_pieces = self.tokenizer.tokenize(sentence)
word_pieces = ['[CLS]'] + word_pieces
tokenized_len = len(word_pieces)

token_idx = -1
for i in range(1, tokenized_len):
if "##" not in word_pieces[i]:
token_idx = token_idx + 1
if token_idx < mask_token_idx:
word_piece_ids = []
elif token_idx == mask_token_idx:
word_piece_ids = [i]
else:
break
else:
word_piece_ids.append(i)

if len(word_piece_ids) == 1:
word_pieces[word_piece_ids[0]] = '[MASK]'
candidate_words = self._masked_language_model(
sentence, word_pieces, word_piece_ids[0])
elif len(word_piece_ids) > 1:
candidate_words = self._word_distance(mask_token)
else:
logger.info("invalid input sentence!")
candidate_words = []
if len(candidate_words) == 0:
candidate_words.append(mask_token)

return candidate_words

def augment(self, sent):
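# TinyBERT-style augmentation: every valid, non-stopword token gets up to M
# replacement candidates (from the masked BERT model when it maps to a single
# WordPiece, otherwise from GloVe nearest neighbours), then N new sentences are
# sampled by replacing each such token with probability p.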
candidate_sents = [sent]

tokens = self.tokenizer.basic_tokenizer.tokenize(sent)
candidate_words = {}
for (idx, word) in enumerate(tokens):
if _is_valid(word) and word not in StopWordsList:
candidate_words[idx] = self._word_augment(sent, idx, word)
logger.info(candidate_words)
cnt = 0
while cnt < self.N:
new_sent = list(tokens)
for idx in candidate_words.keys():
candidate_word = random.choice(candidate_words[idx])

x = random.random()
if x < self.p:
new_sent[idx] = candidate_word

if " ".join(new_sent) not in candidate_sents:
candidate_sents.append(' '.join(new_sent))
cnt += 1

return candidate_sents


class AugmentProcessor(object):
def __init__(self, augmentor, glue_dir, task_name):
self.augmentor = augmentor
self.glue_dir = glue_dir
self.task_name = task_name
self.augment_ids = {'MRPC': [3, 4], 'MNLI': [8, 9], 'CoLA': [3], 'SST-2': [0],
'STS-B': [7, 8], 'QQP': [3, 4], 'QNLI': [1, 2], 'RTE': [1, 2]}

self.filter_flags = { 'MRPC': True, 'MNLI': True, 'CoLA': False, 'SST-2': True,
'STS-B': True, 'QQP': True, 'QNLI': True, 'RTE': True}
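# augment_ids maps each task to the 0-based column index(es) of the sentence fields
# to augment in its train.tsv; filter_flags marks tasks whose train.tsv starts with a
# header row that must be copied through unchanged.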

assert self.task_name in self.augment_ids

def read_augment_write(self):
task_dir = os.path.join(self.glue_dir, self.task_name)
train_samples = _read_tsv(os.path.join(task_dir, "train.tsv"))
output_filename = os.path.join(task_dir, "train_aug.tsv")

augment_ids_ = self.augment_ids[self.task_name]
filter_flag = self.filter_flags[self.task_name]

with open(output_filename, 'w', newline='', encoding="utf-8") as f:
writer = csv.writer(f, delimiter="\t")
for (i, line) in enumerate(train_samples):
if i == 0 and filter_flag:
writer.writerow(line)
continue

for augment_id in augment_ids_:
sent = line[augment_id]
augmented_sents = self.augmentor.augment(sent)
for augment_sent in augmented_sents:
line[augment_id] = augment_sent
writer.writerow(line)

if (i+1) % 1000 == 0:
logger.info("Having been processing {} examples".format(str(i+1)))

def main():
# logger.info(args)
flow.config.gpu_device_num(args.gpu_num_per_node)
flow.env.log_dir(args.log_dir)

default_params = {
"CoLA": {"N": 30},
"MNLI": {"N": 10},
"MRPC": {"N": 30},
"SST-2": {"N": 20},
"STS-b": {"N": 30},
"QQP": {"N": 10},
"QNLI": {"N": 20},
"RTE": {"N": 30}
}

if args.task_name in default_params:
args.N = default_params[args.task_name]["N"]

# Prepare data augmentor
tokenizer = tokenization.FullTokenizer(
vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

InitNodes(args)
snapshot = Snapshot(args.model_save_dir, args.model_load_dir)

emb_norm, vocab, ids_to_tokens = prepare_embedding_retrieval(args.glove_embs)

data_augmentor = DataAugmentor(tokenizer, emb_norm, vocab, ids_to_tokens, args.M, args.N, args.p)

# Do data augmentation
processor = AugmentProcessor(data_augmentor, args.glue_dir, args.task_name)
processor.read_augment_write()


if __name__ == "__main__":
main()

+ 224
- 0
model_compress/model_compress/distil/examples/tinybert/maskedBert.py View File

@@ -0,0 +1,224 @@
"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import oneflow as flow
import sys
import os
curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "./src")))

import bert as bert_util
import oneflow.core.operator.op_conf_pb2 as op_conf_util


def maskedBert(
input_ids_blob,
input_mask_blob,
token_type_ids_blob,
masked_lm_positions_blob,
# masked_lm_positions_blob,
# masked_lm_ids_blob,
vocab_size,
seq_length=512,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
# max_predictions_per_seq=20,
initializer_range=0.02,
):
backbone = bert_util.BertBackbone(
input_ids_blob=input_ids_blob,
input_mask_blob=input_mask_blob,
token_type_ids_blob=token_type_ids_blob,
vocab_size=vocab_size,
seq_length=seq_length,
hidden_size=hidden_size,
num_hidden_layers=num_hidden_layers,
num_attention_heads=num_attention_heads,
intermediate_size=intermediate_size,
hidden_act=hidden_act,
hidden_dropout_prob=hidden_dropout_prob,
attention_probs_dropout_prob=attention_probs_dropout_prob,
max_position_embeddings=max_position_embeddings,
type_vocab_size=type_vocab_size,
initializer_range=initializer_range,
)

predictions = _AddMaskedLanguageModel(
input_blob=backbone.sequence_output(),
output_weights_blob=backbone.embedding_table(),
positions_blob=masked_lm_positions_blob,
seq_length=seq_length,
hidden_size=hidden_size,
vocab_size=vocab_size,
hidden_act=bert_util.GetActivation(hidden_act),
initializer_range=initializer_range,
)
pooled_output = PooledOutput(
backbone.sequence_output(), hidden_size, initializer_range
)
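# Note: pooled_output is computed but not used further here; only the masked-LM
# logits are returned, which is all the data augmentation step needs.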
return predictions


def PooledOutput(sequence_output, hidden_size, initializer_range):
with flow.scope.namespace("bert-pooler"):
first_token_tensor = flow.slice(sequence_output, [None, 0, 0], [None, 1, -1])
first_token_tensor = flow.reshape(first_token_tensor, [-1, hidden_size])
pooled_output = bert_util._FullyConnected(
first_token_tensor,
input_size=hidden_size,
units=hidden_size,
weight_initializer=bert_util.CreateInitializer(initializer_range),
name="dense",
)
pooled_output = flow.math.tanh(pooled_output)
return pooled_output


def _AddMaskedLanguageModelLoss(
input_blob,
output_weights_blob,
positions_blob,
label_id_blob,
label_weight_blob,
seq_length,
hidden_size,
vocab_size,
max_predictions_per_seq,
hidden_act,
initializer_range,
):
with flow.scope.namespace("other"):
sum_label_weight_blob = flow.math.reduce_sum(label_weight_blob, axis=[-1])
ones = sum_label_weight_blob * 0.0 + 1.0
sum_label_weight_blob = flow.math.reduce_sum(sum_label_weight_blob)
batch_size = flow.math.reduce_sum(ones)
sum_label_weight_blob = sum_label_weight_blob / batch_size
with flow.scope.namespace("cls-predictions"):
input_blob = _GatherIndexes(input_blob, positions_blob, seq_length, hidden_size)
with flow.scope.namespace("transform"):
if callable(hidden_act):
act_fn = op_conf_util.kNone
else:
act_fn = hidden_act
input_blob = bert_util._FullyConnected(
input_blob,
input_size=hidden_size,
units=hidden_size,
activation=act_fn,
weight_initializer=bert_util.CreateInitializer(initializer_range),
name="dense",
)
if callable(hidden_act):
input_blob = hidden_act(input_blob)
input_blob = bert_util._LayerNorm(input_blob, hidden_size)
output_bias = flow.get_variable(
name="output_bias",
shape=[vocab_size],
dtype=input_blob.dtype,
initializer=flow.constant_initializer(1.0),
)
logit_blob = flow.matmul(input_blob, output_weights_blob, transpose_b=True)
logit_blob = flow.nn.bias_add(logit_blob, output_bias)
label_id_blob = flow.reshape(label_id_blob, [-1])
pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
logits=logit_blob, labels=label_id_blob
)
pre_example_loss = flow.reshape(pre_example_loss, [-1, max_predictions_per_seq])
numerator = pre_example_loss * label_weight_blob
with flow.scope.namespace("loss"):
numerator = flow.math.reduce_sum(numerator, axis=[-1])
denominator = sum_label_weight_blob + 1e-5
loss = numerator / denominator
return loss, pre_example_loss, logit_blob

def _AddMaskedLanguageModel(
input_blob,
output_weights_blob,
positions_blob,
seq_length,
hidden_size,
vocab_size,
hidden_act,
initializer_range,
):
with flow.scope.namespace("cls-predictions"):
# gather the encoder outputs at the masked positions
input_blob = _GatherIndexes(input_blob, positions_blob, seq_length, hidden_size)
# apply a non-linear transform before the output layer; it is only used during pre-training
with flow.scope.namespace("transform"):
if callable(hidden_act):
act_fn = op_conf_util.kNone
else:
act_fn = hidden_act
input_blob = bert_util._FullyConnected(
input_blob,
input_size=hidden_size,
units=hidden_size,
activation=act_fn,
weight_initializer=bert_util.CreateInitializer(initializer_range),
name="dense",
)
if callable(hidden_act):
input_blob = hidden_act(input_blob)
input_blob = bert_util._LayerNorm(input_blob, hidden_size)
# output_weights is tied to the input word-embedding table
# add a separate output bias here
output_bias = flow.get_variable(
name="output_bias",
shape=[vocab_size],
dtype=input_blob.dtype,
initializer=flow.constant_initializer(1.0),
)
logit_blob = flow.matmul(input_blob, output_weights_blob, transpose_b=True)
logit_blob = flow.nn.bias_add(logit_blob, output_bias)

return logit_blob


def _GatherIndexes(sequence_blob, positions_blob, seq_length, hidden_size):
output = flow.gather(
params=sequence_blob, indices=positions_blob, axis=2, batch_dims=2
)
output = flow.reshape(output, [-1, hidden_size])
return output


def _AddNextSentenceOutput(input_blob, label_blob, hidden_size, initializer_range):
with flow.scope.namespace("cls-seq_relationship"):
output_weight_blob = flow.get_variable(
name="output_weights",
shape=[2, hidden_size],
dtype=input_blob.dtype,
initializer=bert_util.CreateInitializer(initializer_range),
)
output_bias_blob = flow.get_variable(
name="output_bias",
shape=[2],
dtype=input_blob.dtype,
initializer=flow.constant_initializer(0.0),
)
logit_blob = flow.matmul(input_blob, output_weight_blob, transpose_b=True)
logit_blob = flow.nn.bias_add(logit_blob, output_bias_blob)
pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
logits=logit_blob, labels=label_blob
)
loss = pre_example_loss
return loss, pre_example_loss, logit_blob

+ 407
- 0
model_compress/model_compress/distil/examples/tinybert/pregenerate_training_data.py View File

@@ -0,0 +1,407 @@
# coding=utf-8
"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import json
import collections
import logging
import os
import shelve
from argparse import ArgumentParser
from pathlib import Path
from tqdm import tqdm, trange
from tempfile import TemporaryDirectory
from multiprocessing import Pool

import numpy as np
from random import random, randrange, randint, shuffle, choice

import sys

curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "./src")))

from glue_ofrecord import tokenization

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt='%m/%d/%Y %H:%M:%S',
level=logging.INFO)
logger = logging.getLogger(__name__)


class DocumentDatabase:
def __init__(self, reduce_memory=False):
if reduce_memory:
self.temp_dir = TemporaryDirectory()
self.working_dir = Path(self.temp_dir.name)
self.document_shelf_filepath = self.working_dir / 'shelf.db'
self.document_shelf = shelve.open(str(self.document_shelf_filepath),
flag='n', protocol=-1)
self.documents = None
else:
self.documents = []
self.document_shelf = None
self.document_shelf_filepath = None
self.temp_dir = None
self.doc_lengths = []
self.doc_cumsum = None
self.cumsum_max = None
self.reduce_memory = reduce_memory

def add_document(self, document):
if not document:
return
if self.reduce_memory:
current_idx = len(self.doc_lengths)
self.document_shelf[str(current_idx)] = document
else:
self.documents.append(document)
self.doc_lengths.append(len(document))

def _precalculate_doc_weights(self):
self.doc_cumsum = np.cumsum(self.doc_lengths)
self.cumsum_max = self.doc_cumsum[-1]

def sample_doc(self, current_idx, sentence_weighted=True):
# Uses the current iteration counter to ensure we don't sample the same doc twice
if sentence_weighted:
# With sentence weighting, we sample docs proportionally to their sentence length
if self.doc_cumsum is None or len(self.doc_cumsum) != len(self.doc_lengths):
self._precalculate_doc_weights()
rand_start = self.doc_cumsum[current_idx]
rand_end = rand_start + self.cumsum_max - self.doc_lengths[current_idx]
sentence_index = randrange(rand_start, rand_end) % self.cumsum_max
sampled_doc_index = np.searchsorted(self.doc_cumsum, sentence_index, side='right')
else:
# If we don't use sentence weighting, then every doc has an equal chance to be chosen
sampled_doc_index = (current_idx + randrange(1, len(self.doc_lengths))) % len(self.doc_lengths)
assert sampled_doc_index != current_idx
if self.reduce_memory:
return self.document_shelf[str(sampled_doc_index)]
else:
return self.documents[sampled_doc_index]

def __len__(self):
return len(self.doc_lengths)

def __getitem__(self, item):
if self.reduce_memory:
return self.document_shelf[str(item)]
else:
return self.documents[item]

def __enter__(self):
return self

def __exit__(self, exc_type, exc_val, traceback):
if self.document_shelf is not None:
self.document_shelf.close()
if self.temp_dir is not None:
self.temp_dir.cleanup()


def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens):
"""Truncates a pair of sequences to a maximum sequence length. Lifted from Google's BERT repo."""
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_num_tokens:
break

trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
assert len(trunc_tokens) >= 1

# We want to sometimes truncate from the front and sometimes from the
# back to add more randomness and avoid biases.
if random() < 0.5:
del trunc_tokens[0]
else:
trunc_tokens.pop()


MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
["index", "label"])


def create_masked_lm_predictions(tokens, masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list):
"""Creates the predictions for the masked LM objective. This is mostly copied from the Google BERT repo, but
with several refactors to clean it up and remove a lot of unnecessary variables."""
cand_indices = []
for (i, token) in enumerate(tokens):
if token == "[CLS]" or token == "[SEP]":
continue
# Whole Word Masking means that we mask all of the WordPieces corresponding
# to an original word at once. When a word has been split into WordPieces,
# the first token does not have any marker and any subsequent tokens are
# prefixed with ##. So whenever we see the ## token, we
# append it to the previous set of word indexes.
#
# Note that Whole Word Masking does *not* change the training code
# at all -- we still predict each WordPiece independently, softmaxed
# over the entire vocabulary.
if (whole_word_mask and len(cand_indices) >= 1 and token.startswith("##")):
cand_indices[-1].append(i)
else:
cand_indices.append([i])

num_to_mask = min(max_predictions_per_seq,
max(1, int(round(len(tokens) * masked_lm_prob))))
shuffle(cand_indices)
masked_lms = []
covered_indexes = set()
for index_set in cand_indices:
if len(masked_lms) >= num_to_mask:
break
# If adding a whole-word mask would exceed the maximum number of
# predictions, then just skip this candidate.
if len(masked_lms) + len(index_set) > num_to_mask:
continue
is_any_index_covered = False
for index in index_set:
if index in covered_indexes:
is_any_index_covered = True
break
if is_any_index_covered:
continue
for index in index_set:
covered_indexes.add(index)

# 80% of the time, replace with [MASK]
if random() < 0.8:
masked_token = "[MASK]"
else:
# 10% of the time, keep original
if random() < 0.5:
masked_token = tokens[index]
# 10% of the time, replace with random word
else:
masked_token = choice(vocab_list)
masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
tokens[index] = masked_token

assert len(masked_lms) <= num_to_mask
masked_lms = sorted(masked_lms, key=lambda x: x.index)
mask_indices = [p.index for p in masked_lms]
masked_token_labels = [p.label for p in masked_lms]

return tokens, mask_indices, masked_token_labels


def create_instances_from_document(
doc_database, doc_idx, max_seq_length, short_seq_prob,
masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list, bi_text=True):
"""This code is mostly a duplicate of the equivalent function from Google BERT's repo.
However, we make some changes and improvements. Sampling is improved and no longer requires a loop in this function.
Also, documents are sampled proportionally to the number of sentences they contain, which means each sentence
(rather than each document) has an equal chance of being sampled as a false example for the NextSentence task."""
document = doc_database[doc_idx]
# Account for [CLS], [SEP], [SEP]
max_num_tokens = max_seq_length - 3

# We *usually* want to fill up the entire sequence since we are padding
# to `max_seq_length` anyways, so short sequences are generally wasted
# computation. However, we *sometimes*
# (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
# sequences to minimize the mismatch between pre-training and fine-tuning.
# The `target_seq_length` is just a rough target however, whereas
# `max_seq_length` is a hard limit.
target_seq_length = max_num_tokens
if random() < short_seq_prob:
target_seq_length = randint(2, max_num_tokens)

# We DON'T just concatenate all of the tokens from a document into a long
# sequence and choose an arbitrary split point because this would make the
# next sentence prediction task too easy. Instead, we split the input into
# segments "A" and "B" based on the actual "sentences" provided by the user
# input.
instances = []
current_chunk = []
current_length = 0
i = 0
while i < len(document):
segment = document[i]
current_chunk.append(segment)
current_length += len(segment)
if i == len(document) - 1 or current_length >= target_seq_length:
if current_chunk:
# `a_end` is how many segments from `current_chunk` go into the `A`
# (first) sentence.
a_end = 1
if len(current_chunk) >= 2:
a_end = randrange(1, len(current_chunk))

tokens_a = []
for j in range(a_end):
tokens_a.extend(current_chunk[j])

tokens_b = []

# Random next
if bi_text and (len(current_chunk) == 1 or random() < 0.5):
is_random_next = True
target_b_length = target_seq_length - len(tokens_a)

# Sample a random document, with longer docs being sampled more frequently
random_document = doc_database.sample_doc(current_idx=doc_idx, sentence_weighted=True)

random_start = randrange(0, len(random_document))
for j in range(random_start, len(random_document)):
tokens_b.extend(random_document[j])
if len(tokens_b) >= target_b_length:
break
# We didn't actually use these segments so we "put them back" so
# they don't go to waste.
num_unused_segments = len(current_chunk) - a_end
i -= num_unused_segments
# Actual next
else:
is_random_next = False
for j in range(a_end, len(current_chunk)):
tokens_b.extend(current_chunk[j])

if not tokens_a or len(tokens_a) == 0:
tokens_a = ["."]

if not tokens_b or len(tokens_b) == 0:
tokens_b = ["."]

assert len(tokens_a) >= 1
assert len(tokens_b) >= 1

truncate_seq_pair(tokens_a, tokens_b, max_num_tokens)

tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
# The segment IDs are 0 for the [CLS] token, the A tokens and the first [SEP]
# They are 1 for the B tokens and the final [SEP]
segment_ids = [0 for _ in range(len(tokens_a) + 2)] + [1 for _ in range(len(tokens_b) + 1)]

tokens, masked_lm_positions, masked_lm_labels = create_masked_lm_predictions(
tokens, masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list)

instance = {
"tokens": tokens,
"segment_ids": segment_ids,
"is_random_next": is_random_next,
"masked_lm_positions": masked_lm_positions,
"masked_lm_labels": masked_lm_labels}

instances.append(instance)
current_chunk = []
current_length = 0
i += 1

return instances


def create_training_file(docs, vocab_list, args, epoch_num, bi_text=True):
epoch_filename = args.output_dir / "epoch_{}.json".format(epoch_num)
num_instances = 0
with epoch_filename.open('w') as epoch_file:
for doc_idx in trange(len(docs), desc="Document"):
doc_instances = create_instances_from_document(
docs, doc_idx, max_seq_length=args.max_seq_len, short_seq_prob=args.short_seq_prob,
masked_lm_prob=args.masked_lm_prob, max_predictions_per_seq=args.max_predictions_per_seq,
whole_word_mask=args.do_whole_word_mask, vocab_list=vocab_list, bi_text=bi_text)
doc_instances = [json.dumps(instance) for instance in doc_instances]
for instance in doc_instances:
epoch_file.write(instance + '\n')
num_instances += 1
metrics_filename = args.output_dir / "epoch_{}_metrics.json".format(epoch_num)
with metrics_filename.open('w') as metrics_file:
metrics = {
"num_training_examples": num_instances,
"max_seq_len": args.max_seq_len
}
metrics_file.write(json.dumps(metrics))

return epoch_filename, metrics_filename
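# Each line of epoch_{N}.json is one JSON-encoded instance; an illustrative example
# (field values are made up) looks like:
#   {"tokens": ["[CLS]", "the", "[MASK]", "sat", "[SEP]", ...], "segment_ids": [0, 0, ...],
#    "is_random_next": false, "masked_lm_positions": [2], "masked_lm_labels": ["cat"]}
# epoch_{N}_metrics.json records num_training_examples and max_seq_len.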


def main():
parser = ArgumentParser()
parser.add_argument('--train_corpus', type=Path, required=True)
parser.add_argument("--output_dir", type=Path, required=True)
# parser.add_argument("--bert_model", type=str, required=True)
parser.add_argument(
'--vocab_file',
help='The vocabulary file that the BERT model was trained on.',
default=None,
type=str,
required=True
)
parser.add_argument("--do_lower_case", action="store_true")
parser.add_argument("--do_whole_word_mask", action="store_true",
help="Whether to use whole word masking rather than per-WordPiece masking.")
parser.add_argument("--reduce_memory", action="store_true",
help="Reduce memory usage for large datasets by keeping data on disc rather than in memory")

parser.add_argument("--num_workers", type=int, default=1,
help="The number of workers to use to write the files")
parser.add_argument("--epochs_to_generate", type=int, default=3,
help="Number of epochs of data to pregenerate")
parser.add_argument("--max_seq_len", type=int, default=128)
parser.add_argument("--short_seq_prob", type=float, default=0.1,
help="Probability of making a short sentence as a training example")
parser.add_argument("--masked_lm_prob", type=float, default=0.0,
help="Probability of masking each token for the LM task") # no [mask] symbol in corpus
parser.add_argument("--max_predictions_per_seq", type=int, default=20,
help="Maximum number of tokens to mask in each sequence")
parser.add_argument('--oneseq', action='store_true')

args = parser.parse_args()

if args.num_workers > 1 and args.reduce_memory:
raise ValueError("Cannot use multiple workers while reducing memory")

# tokenizer = BertTokenizer.from_pretrained(args.bert_model) #, do_lower_case=args.do_lower_case
tokenizer = tokenization.FullTokenizer(
vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

vocab_list = list(tokenizer.vocab.keys())
doc_num = 0
with DocumentDatabase(reduce_memory=args.reduce_memory) as docs:
with args.train_corpus.open(encoding="utf-8") as f:
doc = []
for line in tqdm(f, desc="Loading Dataset", unit=" lines"):
line = line.strip()
if line == "":
docs.add_document(doc)
doc = []
doc_num += 1
if doc_num % 100 == 0:
logger.info('loaded {} docs!'.format(doc_num))
else:
tokens = tokenizer.tokenize(line)
doc.append(tokens)
if doc:
docs.add_document(doc) # If the last doc didn't end on a newline, make sure it still gets added
if len(docs) <= 1:
exit("ERROR: No document breaks were found in the input file! These are necessary to allow the script to "
"ensure that random NextSentences are not sampled from the same document. Please add blank lines to "
"indicate breaks between documents in your input file. If your dataset does not contain multiple "
"documents, blank lines can be inserted at any natural boundary, such as the ends of chapters, "
"sections or paragraphs.")

args.output_dir.mkdir(exist_ok=True)

if args.num_workers > 1:
writer_workers = Pool(min(args.num_workers, args.epochs_to_generate))
arguments = [(docs, vocab_list, args, idx) for idx in range(args.epochs_to_generate)]
writer_workers.starmap(create_training_file, arguments)
else:
for epoch in trange(args.epochs_to_generate, desc="Epoch"):
bi_text = not args.oneseq
epoch_file, metric_file = create_training_file(docs, vocab_list, args, epoch, bi_text=bi_text)


if __name__ == '__main__':
main()

+ 32
- 0
model_compress/model_compress/distil/examples/tinybert/run_data_augmentation.sh View File

@@ -0,0 +1,32 @@
# Data augmentation aims to expand the task-specific training set.
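# For each training sentence, the augmented variants are written as extra rows to
# ${GLUE_DIR}/${TASK_NAME}/train_aug.tsv (see AugmentProcessor.read_augment_write
# in data_augmentation.py).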

nvidia-smi

PRETRAINED_MODEL='../../models/uncased_L-12_H-768_A-12_oneflow' # the BERT-base teacher model
VOCAB_FILE='../../src/glue_ofrecord/vocab.txt'

GLOVE_EMB='../../glove/glove.840B.300d.txt'
GLUE_DIR='../../data/glue_data'
TASK_NAME=SST-2

GPU=0
CUDA_VISIBLE_DEVICES=$GPU python3 data_augmentation.py \
--model_load_dir=${PRETRAINED_MODEL} \
--model_save_dir=./snapshots \
--vocab_file $VOCAB_FILE \
--do_lower_case \
--glove_embs $GLOVE_EMB \
--glue_dir $GLUE_DIR \
--task_name $TASK_NAME \
--log_dir=./log \
--save_last_snapshot=True \
--gpu_num_per_node=1 \
--seq_length=512 \
--num_hidden_layers=12 \
--num_attention_heads=12 \
--max_position_embeddings=512 \
--type_vocab_size=2 \
--vocab_size=30522 \
--attention_probs_dropout_prob=0.1 \
--hidden_dropout_prob=0.1 \
--hidden_size_per_head=64

+ 16
- 0
model_compress/model_compress/distil/examples/tinybert/run_pregenerate_training_data.sh View File

@@ -0,0 +1,16 @@
nvidia-smi

# tqdm
# wikitext-103-raw, wikitext-2-raw
CORPUS_RAW='./data/wikitext-103-raw/wiki.train.raw'
VOCAB_FILE='./glue_ofrecord/vocab.txt'
OUTPUT_DIR='./data/pretrain_data_json'
GPU=0

CUDA_VISIBLE_DEVICES=$GPU python pregenerate_training_data.py \
--train_corpus $CORPUS_RAW \
--vocab_file $VOCAB_FILE \
--do_lower_case \
--epochs_to_generate 3 \
--output_dir $OUTPUT_DIR
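# Outputs: epoch_{i}.json (one masked-LM instance per line) and epoch_{i}_metrics.json
# for each generated epoch, written under $OUTPUT_DIR.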


+ 516
- 0
model_compress/model_compress/distil/examples/tinybert/task_student_tinybert.py View File

@@ -0,0 +1,516 @@
"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import os
import math
import numpy as np
import sys

curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "./src")))

import oneflow as flow
import oneflow.typing as tp
from classifier import GlueBERT
from util import Snapshot, Summary, InitNodes, Metric, CreateOptimizer, GetFunctionConfig, getdirsize, \
remove_optimizer_params, remove_teacher_params

import config as configs
from sklearn.metrics import accuracy_score, matthews_corrcoef, precision_score, recall_score, f1_score
import argparse
import shutil
import tempfile
import time
import json
from knowledge_distill_util import BertForSequenceClassification, BertStudentForSequenceClassification, \
soft_cross_entropy, mseloss, layer_distill, att_distill, pred_distill


def str2bool(v):
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Unsupported value encountered.')


parser = configs.get_parser()
parser.add_argument("--task_name", type=str, default='CoLA')
parser.add_argument("--teacher_model", default=None, type=str, help="The teacher model dir.")
parser.add_argument("--student_model", default=None, type=str, help="The student model dir.")
parser.add_argument("--total_model", default=None, type=str, help="The student model dir.")

parser.add_argument('--num_epochs', type=int, default=3, help='number of epochs')
parser.add_argument("--train_data_dir", type=str, default=None)
parser.add_argument("--train_data_prefix", type=str, default='train.of_record-')
parser.add_argument("--train_example_num", type=int, default=88614,
help="example number in dataset")
parser.add_argument("--batch_size_per_device", type=int, default=32)
parser.add_argument("--train_data_part_num", type=int, default=1,
help="data part number in dataset")
parser.add_argument("--eval_data_dir", type=str, default=None)
parser.add_argument("--eval_data_prefix", type=str, default='eval.of_record-')
parser.add_argument("--eval_example_num", type=int, default=10833,
help="example number in dataset")
parser.add_argument("--eval_batch_size_per_device", type=int, default=64)
parser.add_argument("--eval_data_part_num", type=int, default=1,
help="data part number in dataset")
parser.add_argument("--result_dir", type=str, default="", help="the save directory of results")

#
parser.add_argument("--student_num_hidden_layers", type=int, default=24)
parser.add_argument("--student_num_attention_heads", type=int, default=16)
parser.add_argument("--student_max_position_embeddings", type=int, default=512)
parser.add_argument("--student_type_vocab_size", type=int, default=2)
parser.add_argument("--student_vocab_size", type=int, default=30522)
parser.add_argument("--student_attention_probs_dropout_prob", type=float, default=0.1)
parser.add_argument("--student_hidden_dropout_prob", type=float, default=0.1)
parser.add_argument("--student_hidden_size_per_head", type=int, default=64)
parser.add_argument("--student_hidden_size", type=int, default=768)

parser.add_argument("--teacher_num_hidden_layers", type=int, default=24)
parser.add_argument("--teacher_num_attention_heads", type=int, default=16)
parser.add_argument("--teacher_max_position_embeddings", type=int, default=512)
parser.add_argument("--teacher_type_vocab_size", type=int, default=2)
parser.add_argument("--teacher_vocab_size", type=int, default=30522)
parser.add_argument("--teacher_attention_probs_dropout_prob", type=float, default=0.1)
parser.add_argument("--teacher_hidden_dropout_prob", type=float, default=0.1)
parser.add_argument("--teacher_hidden_size_per_head", type=int, default=64)
parser.add_argument("--teacher_hidden_size", type=int, default=768)

parser.add_argument('--intermediate_distill', type=str2bool, nargs='?', const=True,
help='distill attention, intermediate and embedding information')
parser.add_argument('--pred_distill', type=str2bool, nargs='?', const=True, help='distill prediction layer')
parser.add_argument('--temperature', type=float, default=1.)
parser.add_argument('--aug_train', type=str2bool, nargs='?', const=False, help='using augmented training set?')

parser.add_argument('--serve_for_online', type=str2bool, nargs='?', const=False,
help='if serving online, the teacher params and optimizer params are deleted from model_save_dir after training')

args = parser.parse_args()

task_name = args.task_name.lower()

if args.aug_train:
args.train_data_dir = args.train_data_dir.replace('train', 'train_aug')

batch_size = args.num_nodes * args.gpu_num_per_node * args.batch_size_per_device
eval_batch_size = args.num_nodes * args.gpu_num_per_node * args.eval_batch_size_per_device

epoch_size = math.ceil(args.train_example_num / batch_size)
num_eval_steps = math.ceil(args.eval_example_num / eval_batch_size)
args.iter_num = epoch_size * args.num_epochs
configs.print_args(args)

glue_output_modes = {
"cola": "classification",
"mnli": "classification",
"mnli-mm": "classification",
"mrpc": "classification",
"sst-2": "classification",
"sts-b": "regression",
"qqp": "classification",
"qnli": "classification",
"rte": "classification",
"wnli": "classification",
}

acc_tasks = ["mnli", "mrpc", "sst-2", "qqp", "qnli", "rte"]
corr_tasks = ["sts-b"]
mcc_tasks = ["cola"]

output_mode = glue_output_modes[args.task_name.lower()]


def BertDecoder(
data_dir, batch_size, data_part_num, seq_length, part_name_prefix, shuffle=True
):
with flow.scope.placement("cpu", "0:0"):
ofrecord = flow.data.ofrecord_reader(data_dir,
batch_size=batch_size,
data_part_num=data_part_num,
part_name_prefix=part_name_prefix,
random_shuffle=shuffle,
shuffle_after_epoch=shuffle)
blob_confs = {}

def _blob_conf(name, shape, dtype=flow.int32):
blob_confs[name] = flow.data.OFRecordRawDecoder(ofrecord, name, shape=shape, dtype=dtype)

_blob_conf("input_ids", [seq_length])
_blob_conf("input_mask", [seq_length])
_blob_conf("segment_ids", [seq_length])
_blob_conf("label_ids", [1])
_blob_conf("is_real_example", [1])

return blob_confs


def get_tensor_data(
batch_size,
data_part_num,
data_dir,
part_name_prefix,
shuffle=True
):
decoders = BertDecoder(
data_dir, batch_size, data_part_num, args.seq_length, part_name_prefix, shuffle=shuffle
)
return decoders


def BuildBert(
batch_size,
data_part_num,
data_dir,
part_name_prefix,
shuffle=True
):
hidden_size = args.hidden_size  # typically 64 * args.num_attention_heads (64 per head)
args.hidden_size_per_head = hidden_size / args.num_attention_heads
# intermediate_size = hidden_size * 4
intermediate_size = 1200

decoders = BertDecoder(
data_dir, batch_size, data_part_num, args.seq_length, part_name_prefix, shuffle=shuffle
)
# is_real_example = decoders['is_real_example']

loss, logits = GlueBERT(
decoders['input_ids'],
decoders['input_mask'],
decoders['segment_ids'],
decoders['label_ids'],
args.vocab_size,
seq_length=args.seq_length,
hidden_size=hidden_size,
num_hidden_layers=args.num_hidden_layers,
num_attention_heads=args.num_attention_heads,
intermediate_size=intermediate_size,
hidden_act="gelu",
hidden_dropout_prob=args.hidden_dropout_prob,
attention_probs_dropout_prob=args.attention_probs_dropout_prob,
max_position_embeddings=args.max_position_embeddings,
type_vocab_size=args.type_vocab_size,
initializer_range=0.02,
)
return loss, logits, decoders['label_ids']


def student_model(input_ids, input_mask, segment_ids, is_train=True):
# with flow.scope.placement("gpu", "0:0"):
# hidden_size = 64 * args.student_num_attention_heads # , H = 64, size per head
hidden_size = args.student_hidden_size  # typically 64 * args.student_num_attention_heads (64 per head)
args.student_hidden_size_per_head = hidden_size / args.student_num_attention_heads
# intermediate_size = hidden_size * 4
intermediate_size = 1200
logits, reps, atts = BertStudentForSequenceClassification(
input_ids_blob=input_ids,
input_mask_blob=input_mask,
token_type_ids_blob=segment_ids,
label_blob=None,
vocab_size=args.student_vocab_size,
seq_length=args.seq_length,
hidden_size=hidden_size,
num_hidden_layers=args.student_num_hidden_layers,
num_attention_heads=args.student_num_attention_heads,
intermediate_size=intermediate_size,
hidden_act="gelu",
hidden_dropout_prob=args.student_hidden_dropout_prob,
attention_probs_dropout_prob=args.student_attention_probs_dropout_prob,
max_position_embeddings=args.student_max_position_embeddings,
type_vocab_size=args.student_type_vocab_size,
initializer_range=0.02,
is_student=True,
fit_size=args.teacher_hidden_size,
is_train=is_train
)
return logits, reps, atts


def teacher_model(input_ids, input_mask, segment_ids, is_train):
# hidden_size = 64 * args.teacher_num_attention_heads # , H = 64, size per head
teacher_hidden_size = args.teacher_hidden_size  # typically 64 * args.teacher_num_attention_heads (64 per head)
args.teacher_hidden_size_per_head = teacher_hidden_size / args.teacher_num_attention_heads
intermediate_size = teacher_hidden_size * 4
logits, reps, atts = BertForSequenceClassification(
input_ids_blob=input_ids,
input_mask_blob=input_mask,
token_type_ids_blob=segment_ids,
label_blob=None,
vocab_size=args.vocab_size,
seq_length=args.seq_length,
hidden_size=teacher_hidden_size,
num_hidden_layers=args.teacher_num_hidden_layers,
num_attention_heads=args.teacher_num_attention_heads,
intermediate_size=intermediate_size,
hidden_act="gelu",
hidden_dropout_prob=args.teacher_hidden_dropout_prob,
attention_probs_dropout_prob=args.teacher_attention_probs_dropout_prob,
max_position_embeddings=args.teacher_max_position_embeddings,
type_vocab_size=args.teacher_type_vocab_size,
initializer_range=0.02,
is_student=False,
is_train=is_train
)
return logits, reps, atts


@flow.global_function(type='train', function_config=GetFunctionConfig(args))
def DistilJob():
train_dataset = get_tensor_data(
batch_size,
args.train_data_part_num,
args.train_data_dir,
args.train_data_prefix,
)
student_logits, student_reps, student_atts = student_model(train_dataset['input_ids'], train_dataset['input_mask'],
train_dataset['segment_ids'], is_train=True)

teacher_logits, teacher_reps, teacher_atts = teacher_model(train_dataset['input_ids'], train_dataset['input_mask'],
train_dataset['segment_ids'], is_train=False)

loss = 0.
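# Total distillation loss (TinyBERT-style): --intermediate_distill adds MSE terms over
# hidden states (layer_distill) and attention maps (att_distill); --pred_distill adds a
# soft cross-entropy between student and teacher logits (pred_distill).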
if args.intermediate_distill:
rep_loss = layer_distill(args, student_reps, teacher_reps)
att_loss = att_distill(args, student_atts, teacher_atts)
loss += att_loss + rep_loss
if args.pred_distill:
if output_mode == "classification":
cls_loss = pred_distill(args, student_logits, teacher_logits)
elif output_mode == "regression":
"""
todo
loss_mse = MSELoss()
cls_loss = loss_mse(student_logits.view(-1), label_ids.view(-1))
"""
cls_loss = 0.  # TODO: regression distillation (MSE between student and teacher logits) is not implemented yet
loss += cls_loss
# loss_ce = flow.nn.sparse_softmax_cross_entropy_with_logits(
# logits=student_logits, labels=train_dataset['label_ids']
# )
# loss = loss_ce
flow.losses.add_loss(loss)

opt = CreateOptimizer(args)
opt.minimize(loss)

return {'loss': loss}


#
@flow.global_function(type='predict', function_config=GetFunctionConfig(args))
def StudentBertGlueEvalTrainJob():
train_dataset = get_tensor_data(
batch_size,
args.train_data_part_num,
args.train_data_dir,
args.train_data_prefix,
shuffle=False
)
student_logits, student_reps, student_atts = student_model(train_dataset['input_ids'], train_dataset['input_mask'],
train_dataset['segment_ids'], is_train=False)

return student_logits, train_dataset['label_ids']


@flow.global_function(type='predict', function_config=GetFunctionConfig(args))
def StudentBertGlueEvalValJob():
dev_dataset = get_tensor_data(
eval_batch_size,
args.eval_data_part_num,
args.eval_data_dir,
args.eval_data_prefix,
shuffle=False
)
student_logits, student_reps, student_atts = student_model(dev_dataset['input_ids'], dev_dataset['input_mask'],
dev_dataset['segment_ids'], is_train=False)

return student_logits, dev_dataset['label_ids']


#
def run_eval_job(eval_job_func, num_steps, desc='train'):
labels = []
predictions = []
start_time = time.time()
for index in range(num_steps):
logits, label = eval_job_func().get()
predictions.extend(list(logits.numpy().argmax(axis=1)))
labels.extend(list(label))
end_time = time.time()
cost_time = end_time - start_time
print('cost time: {} s'.format(cost_time))

model_size = getdirsize(args.model_save_dir)
print('model_size: %d Mbytes' % (model_size / 1024 / 1024)) # Mbytes

accuracy = accuracy_score(labels, predictions)
mcc = matthews_corrcoef(labels, predictions)
precision = precision_score(labels, predictions)
recall = recall_score(labels, predictions)
f_1 = f1_score(labels, predictions)
save_dict = {"accuracy": "%.2f" % accuracy,
"MCC": "%.2f" % mcc,
"precision": "%.2f" % precision,
"recall": "%.2f" % recall,
"f_1": "%.2f" % f_1,
"modelSize": "%d" % (model_size / 1024 / 1024),
"reasoningTime": "%.2f" % (args.eval_example_num / cost_time)} # sample/second

if args.result_dir == "":
args.result_dir = args.model_save_dir
if not os.path.exists(args.result_dir):
os.makedirs(args.result_dir)
with open(os.path.join(args.result_dir, 'results_{}.json'.format(desc)), "w") as f:
json.dump(save_dict, f)

def metric_fn(predictions, labels):
return {
"accuracy": accuracy,
"matthews_corrcoef": mcc,
"precision": precision,
"recall": recall,
"f1": f_1,
}

metric_dict = metric_fn(predictions, labels)
print(desc, ', '.join('{}: {:.3f}'.format(k, v) for k, v in metric_dict.items()))
return metric_dict


def CopyFile(filepath, newPath):
fileNames = os.listdir(filepath)
for file in fileNames:
newDir = os.path.join(filepath, file)
if os.path.isfile(newDir):
# print(newDir)
newFile = os.path.join(newPath, file)
shutil.copyfile(newDir, newFile)
else:
if not os.path.exists(os.path.join(newPath, file)):
os.makedirs(os.path.join(newPath, file))
CopyFile(newDir, os.path.join(newPath, file))


def main():
flow.config.gpu_device_num(args.gpu_num_per_node)
flow.env.log_dir(args.log_dir)

InitNodes(args)

check_point = flow.train.CheckPoint()

summary = Summary(args.log_dir, args)
if not os.path.exists(args.model_save_dir):
os.makedirs(args.model_save_dir)
if args.do_train:
print('Combining two models into one dir')
if not os.path.exists('./tmp'):
os.makedirs('./tmp')

args.total_model = tempfile.mkdtemp(dir='./tmp')
CopyFile(args.student_model, args.total_model)
CopyFile(args.teacher_model, args.total_model)
print('Loading model...')
check_point.load(args.total_model)
# # check_point.load(args.teacher_model)
# # check_point.load(args.student_model)
#
print('Start training...')
global_step = 0
best_dev_acc = 0.0
for epoch in range(args.num_epochs):
metric = Metric(desc='finetune', print_steps=args.loss_print_every_n_iter, summary=summary,
batch_size=batch_size, keys=['loss'])

for step in range(epoch_size):
DistilJob().async_get(metric.metric_cb(step, epoch=epoch))
global_step += 1
# if (global_step + 1) % args.model_save_every_n_iter == 0:
# if not os.path.exists(args.model_save_dir):
# os.makedirs(args.model_save_dir)
# snapshot_save_path = os.path.join(
# args.model_save_dir, "snapshot_%d" % (global_step + 1)
# )
# print("Saving model to {}.".format(snapshot_save_path))
# check_point.save(snapshot_save_path)

# if args.pred_distill:
print('EvalTrainJob...')
run_eval_job(StudentBertGlueEvalTrainJob, epoch_size, desc='train')
print('EvalValJob...')
result = run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')
if not args.pred_distill:
save_model = True
else:
save_model = False
if task_name in acc_tasks and result['accuracy'] > best_dev_acc:
best_dev_acc = result['accuracy']
save_model = True

# if task_name in corr_tasks and result['corr'] > best_dev_acc:
# best_dev_acc = result['corr']
# save_model = True

if task_name in mcc_tasks and result['matthews_corrcoef'] > best_dev_acc:
best_dev_acc = result['matthews_corrcoef']
save_model = True
print('Best result:', result)

if save_model:
if os.path.exists(args.model_save_dir):
import shutil
shutil.rmtree(args.model_save_dir)
if not os.path.exists(args.model_save_dir):
os.makedirs(args.model_save_dir)
snapshot_save_path = os.path.join(args.model_save_dir)
print("Saving best model to {}".format(snapshot_save_path))
check_point.save(snapshot_save_path)
flow.sync_default_session()

if args.save_last_snapshot:
snapshot_save_path = args.model_save_dir
if os.path.exists(args.model_save_dir):
import shutil
shutil.rmtree(args.model_save_dir)
print("Saving model to {}".format(snapshot_save_path))
check_point.save(snapshot_save_path)
flow.sync_default_session()

if global_step >= 100:
# remove tmp total models
print('Removing the tmp models...')
import shutil
shutil.rmtree(args.total_model)

if args.serve_for_online:
print('Deleting the teacher params and the optimizer params from model_save_dir...')
remove_teacher_params(args.model_save_dir)

if args.do_eval:
print('Loading model...')
print(args.model_save_dir)

if not args.do_train:
check_point.load(args.model_save_dir)
print('Evaluation...')
run_eval_job(StudentBertGlueEvalValJob, num_eval_steps, desc='eval')
# if args.save_last_snapshot:
# snapshot.save("last_snapshot")


if __name__ == "__main__":
main()

+ 8
- 0
model_compress/model_compress/distil/glue_process.sh View File

@@ -0,0 +1,8 @@
# Copyright (c) The Tianshu Platform Authors.
# Licensed under the Apache License

# --aug_train True
TASK=SST-2
python ./src/glue_ofrecord/glue_process.py --data_dir ./data/glue_data/${TASK} --output_dir ./data/glue_ofrecord_test/${TASK} \
--vocab_file ./glue_ofrecord/vocab.txt --do_lower_case True --max_seq_length 128 \
--do_train True --do_eval True --do_predict True --task=${TASK}
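# Converts the raw GLUE tsv files into OFRecord datasets under --output_dir.
# Note: the other scripts in this repo read from ./data/glue_ofrecord, while this
# example writes to ./data/glue_ofrecord_test.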

+ 4
- 0
model_compress/model_compress/distil/run_download_glue_data.sh View File

@@ -0,0 +1,4 @@
# Copyright (c) The Tianshu Platform Authors.
# Licensed under the Apache License

python ../src/download_glue_data.py --data_dir ../data/glue_data --tasks all

+ 95
- 0
model_compress/model_compress/distil/run_eval_student_bert_pkd.sh View File

@@ -0,0 +1,95 @@
# Copyright (c) The Tianshu Platform Authors.
# Licensed under the Apache License

dataset=SST-2

# ofrecord dataset dir
DATA_ROOT=./data/glue_ofrecord

# which GPU to use
GPU=0

if [ $dataset = "CoLA" ]; then
train_example_num=8551
eval_example_num=1043
test_example_num=1063
elif [ $dataset = "MRPC" ]; then
train_example_num=3668
eval_example_num=408
test_example_num=1725
elif [ $dataset = "SST-2" ]; then
train_example_num=67349
eval_example_num=872
test_example_num=1821
elif [ $dataset = "QQP" ]; then
train_example_num=363849
eval_example_num=40430
test_example_num=0
elif [ $dataset = "MNLI" ]; then
train_example_num=392702
eval_example_num=9815
test_example_num=0
elif [ $dataset = "WNLI" ]; then
train_example_num=635
eval_example_num=71
test_example_num=0
elif [ $dataset = "RTE" ]; then
train_example_num=2490
eval_example_num=277
test_example_num=0
else
echo "dataset must be GLUE such as 'CoLA','MRPC','SST-2','QQP','MNLI','WNLI','STS-B',"
exit
fi

LAYER_NUM=3
KD_ALPHA=0.2
KD_BETA=10
epoch=4   # assumed from the epoch-4 tag in STUDENT_DIR below; the original script never defines ${epoch}

train_data_dir=$DATA_ROOT/${dataset}/train
eval_data_dir=$DATA_ROOT/${dataset}/eval

# saved student model dir
STUDENT_DIR="./models/student_model/SST-2/bert-pkd_3_epoch-4_lr-2e-5_wd-0.0001_kd_alpha-0.2_kd_beta-10"
RESULT_DIR=""

CUDA_VISIBLE_DEVICES=$GPU python3 ./examples/bert-pkd/task_student_bert-pkd.py \
--do_train='False' \
--do_eval='True' \
--model=Glue_$dataset \
--task_name=$dataset \
--gpu_num_per_node=1 \
--num_epochs=${epoch} \
--train_data_dir=$train_data_dir \
--train_example_num=$train_example_num \
--eval_data_dir=$eval_data_dir \
--eval_example_num=$eval_example_num \
--batch_size_per_device=32 \
--eval_batch_size_per_device=32 \
--loss_print_every_n_iter 10 \
--log_dir=./log \
--model_save_dir=${STUDENT_DIR} \
--result_dir=${RESULT_DIR} \
--seq_length=128 \
--student_num_hidden_layers=${LAYER_NUM} \
--student_num_attention_heads=12 \
--student_max_position_embeddings=512 \
--student_type_vocab_size=2 \
--student_vocab_size=30522 \
--student_attention_probs_dropout_prob=0.1 \
--student_hidden_dropout_prob=0.1 \
--student_hidden_size_per_head=64 \
--student_hidden_size=768 \
--teacher_num_hidden_layers=12 \
--teacher_num_attention_heads=12 \
--teacher_max_position_embeddings=512 \
--teacher_type_vocab_size=2 \
--teacher_vocab_size=30522 \
--teacher_attention_probs_dropout_prob=0.1 \
--teacher_hidden_dropout_prob=0.1 \
--teacher_hidden_size_per_head=64 \
--teacher_hidden_size=768 \
--model_save_every_n_iter=50000 \
--kd_alpha=${KD_ALPHA} \
--kd_beta=${KD_BETA} \
--from_scratch='False'

+ 94
- 0
model_compress/model_compress/distil/run_eval_student_distilled_lstm.sh View File

@@ -0,0 +1,94 @@
# Copyright (c) The Tianshu Platform Authors.
# Licensed under the Apache License

dataset=SST-2

# ofrecord dataset dir
DATA_ROOT=./data/glue_ofrecord

# which GPU to use
GPU=0

if [ $dataset = "CoLA" ]; then
train_example_num=8551
eval_example_num=1043
test_example_num=1063
elif [ $dataset = "MRPC" ]; then
train_example_num=3668
eval_example_num=408
test_example_num=1725
elif [ $dataset = "SST-2" ]; then
train_example_num=67349
eval_example_num=872
test_example_num=1821
elif [ $dataset = "QQP" ]; then
train_example_num=363849
eval_example_num=40430
test_example_num=0
elif [ $dataset = "MNLI" ]; then
train_example_num=392702
eval_example_num=9815
test_example_num=0
elif [ $dataset = "WNLI" ]; then
train_example_num=635
eval_example_num=71
test_example_num=0
elif [ $dataset = "RTE" ]; then
train_example_num=2490
eval_example_num=277
test_example_num=0
else
echo "dataset must be GLUE such as 'CoLA','MRPC','SST-2','QQP','MNLI','WNLI','STS-B',"
exit
fi

KD_ALPHA=0.7
STUDENT_DIR="./models/student_model/SST-2/bert-lstm_32-distl_epoch-5_lr-1e-4_wd-0.0001_kd_alpha-0.7"
RESULT_DIR=""

train_data_dir=$DATA_ROOT/${dataset}/train
train_data_dir_lstm=$DATA_ROOT/${dataset}_lstm_32/train

eval_data_dir=$DATA_ROOT/${dataset}/eval
eval_data_dir_lstm=$DATA_ROOT/${dataset}_lstm_32/eval

CUDA_VISIBLE_DEVICES=$GPU python3 ./examples/distilled-bilstm/task_student_kd_lstm.py \
--do_train='False' \
--do_eval='True' \
--model=Glue_$dataset \
--task_name=$dataset \
--gpu_num_per_node=1 \
--train_data_dir=$train_data_dir \
--train_data_dir_lstm=${train_data_dir_lstm} \
--train_example_num=$train_example_num \
--eval_data_dir=$eval_data_dir \
--eval_data_dir_lstm=$eval_data_dir_lstm \
--eval_example_num=$eval_example_num \
--batch_size_per_device=32 \
--eval_batch_size_per_device=32 \
--loss_print_every_n_iter 1 \
--log_dir=./log \
--model_save_dir=${STUDENT_DIR} \
--result_dir=${RESULT_DIR} \
--seq_length=128 \
--student_seq_length=32 \
--student_num_hidden_layers=4 \
--student_num_attention_heads=12 \
--student_max_position_embeddings=512 \
--student_type_vocab_size=2 \
--student_vocab_size=10002 \
--student_attention_probs_dropout_prob=0.1 \
--student_hidden_dropout_prob=0.1 \
--student_hidden_size_per_head=26 \
--student_hidden_size=300 \
--teacher_num_hidden_layers=12 \
--teacher_num_attention_heads=12 \
--teacher_max_position_embeddings=512 \
--teacher_type_vocab_size=2 \
--teacher_vocab_size=30522 \
--teacher_attention_probs_dropout_prob=0.1 \
--teacher_hidden_dropout_prob=0.1 \
--teacher_hidden_size_per_head=64 \
--teacher_hidden_size=768 \
--model_save_every_n_iter=50000 \
--kd_alpha=${KD_ALPHA}

+ 88
- 0
model_compress/model_compress/distil/run_eval_student_kd.sh View File

@@ -0,0 +1,88 @@
# Copyright (c) The Tianshu Platform Authors.
# Licensed under the Apache License

dataset=SST-2

# ofrecord dataset dir
DATA_ROOT=./data/glue_ofrecord

# which GPU to use
GPU=0

if [ $dataset = "CoLA" ]; then
train_example_num=8551
eval_example_num=1043
test_example_num=1063
elif [ $dataset = "MRPC" ]; then
train_example_num=3668
eval_example_num=408
test_example_num=1725
elif [ $dataset = "SST-2" ]; then
train_example_num=67349
eval_example_num=872
test_example_num=1821
elif [ $dataset = "QQP" ]; then
train_example_num=363849
eval_example_num=40430
test_example_num=0
elif [ $dataset = "MNLI" ]; then
train_example_num=392702
eval_example_num=9815
test_example_num=0
elif [ $dataset = "WNLI" ]; then
train_example_num=635
eval_example_num=71
test_example_num=0
elif [ $dataset = "RTE" ]; then
train_example_num=2490
eval_example_num=277
test_example_num=0
else
echo "dataset must be GLUE such as 'CoLA','MRPC','SST-2','QQP','MNLI','WNLI','STS-B',"
exit
fi

KD_ALPHA=0.8
STUDENT_DIR="./models/student_model/SST-2/bert-kd-distl_epoch-4_lr-2e-5_wd-0.0001_kd_alpha-0.8"
RESULT_DIR=""

train_data_dir=$DATA_ROOT/${dataset}/train
eval_data_dir=$DATA_ROOT/${dataset}/eval

CUDA_VISIBLE_DEVICES=$GPU python3 ./examples/knowledge_distillation/task_student_kd.py \
--do_train='False' \
--do_eval='True' \
--model=Glue_$dataset \
--task_name=$dataset \
--gpu_num_per_node=1 \
--train_data_dir=$train_data_dir \
--train_example_num=$train_example_num \
--eval_data_dir=$eval_data_dir \
--eval_example_num=$eval_example_num \
--batch_size_per_device=32 \
--eval_batch_size_per_device=32 \
--loss_print_every_n_iter 10 \
--log_dir=./log \
--model_save_dir=${STUDENT_DIR} \
--result_dir=${RESULT_DIR} \
--seq_length=128 \
--student_num_hidden_layers=4 \
--student_num_attention_heads=12 \
--student_max_position_embeddings=512 \
--student_type_vocab_size=2 \
--student_vocab_size=30522 \
--student_attention_probs_dropout_prob=0.1 \
--student_hidden_dropout_prob=0.1 \
--student_hidden_size_per_head=26 \
--student_hidden_size=312 \
--teacher_num_hidden_layers=12 \
--teacher_num_attention_heads=12 \
--teacher_max_position_embeddings=512 \
--teacher_type_vocab_size=2 \
--teacher_vocab_size=30522 \
--teacher_attention_probs_dropout_prob=0.1 \
--teacher_hidden_dropout_prob=0.1 \
--teacher_hidden_size_per_head=64 \
--teacher_hidden_size=768 \
--model_save_every_n_iter=50000 \
--kd_alpha=${KD_ALPHA}

+ 89
- 0
model_compress/model_compress/distil/run_eval_student_tinybert.sh View File

@@ -0,0 +1,89 @@
# Copyright (c) The Tianshu Platform Authors.
# Licensed under the Apache License

dataset=SST-2

# ofrecord dataset dir
DATA_ROOT=./data/glue_ofrecord

# which GPU to use
GPU=0

# saved student model dir
STUDENT_DIR="./models/student_model/SST-2/tinybert_epoch-4_lr-2e-5_wd-0.0001"
RESULT_DIR=""

if [ $dataset = "CoLA" ]; then
train_example_num=8551
eval_example_num=1043
test_example_num=1063
elif [ $dataset = "MRPC" ]; then
train_example_num=3668
eval_example_num=408
test_example_num=1725
elif [ $dataset = "SST-2" ]; then
train_example_num=67349
eval_example_num=872
test_example_num=1821
elif [ $dataset = "QQP" ]; then
train_example_num=363849
eval_example_num=40430
test_example_num=0
elif [ $dataset = "MNLI" ]; then
train_example_num=392702
eval_example_num=9815
test_example_num=0
elif [ $dataset = "WNLI" ]; then
train_example_num=635
eval_example_num=71
test_example_num=0
elif [ $dataset = "RTE" ]; then
train_example_num=2490
eval_example_num=277
test_example_num=0
else
echo "dataset must be GLUE such as 'CoLA','MRPC','SST-2','QQP','MNLI','WNLI','STS-B',"
exit
fi

train_data_dir=$DATA_ROOT/${dataset}/train
eval_data_dir=$DATA_ROOT/${dataset}/eval

CUDA_VISIBLE_DEVICES=$GPU python3 ./examples/tinybert/task_student_tinybert.py \
--do_train='False' \
--do_eval='True' \
--model=Glue_$dataset \
--task_name=$dataset \
--gpu_num_per_node=1 \
--train_data_dir=$train_data_dir \
--train_example_num=$train_example_num \
--eval_data_dir=$eval_data_dir \
--eval_example_num=$eval_example_num \
--batch_size_per_device=32 \
--eval_batch_size_per_device=32 \
--loss_print_every_n_iter 10 \
--log_dir=./log \
--model_save_dir=${STUDENT_DIR} \
--result_dir=${RESULT_DIR} \
--seq_length=128 \
--student_num_hidden_layers=4 \
--student_num_attention_heads=12 \
--student_max_position_embeddings=512 \
--student_type_vocab_size=2 \
--student_vocab_size=30522 \
--student_attention_probs_dropout_prob=0.1 \
--student_hidden_dropout_prob=0.1 \
--student_hidden_size_per_head=26 \
--student_hidden_size=312 \
--teacher_num_hidden_layers=12 \
--teacher_num_attention_heads=12 \
--teacher_max_position_embeddings=512 \
--teacher_type_vocab_size=2 \
--teacher_vocab_size=30522 \
--teacher_attention_probs_dropout_prob=0.1 \
--teacher_hidden_dropout_prob=0.1 \
--teacher_hidden_size_per_head=64 \
--teacher_hidden_size=768 \
--model_save_every_n_iter=50000 \
--intermediate_distill='True' \
--pred_distill='True'

+ 86
- 0
model_compress/model_compress/distil/run_eval_teacher.sh View File

@@ -0,0 +1,86 @@
# Copyright (c) The Tianshu Platform Authors.
# Licensed under the Apache License

# ofrecord dataset dir
DATA_ROOT=./data/glue_ofrecord

# choose dataset: 'CoLA', 'MRPC', 'SST-2'
dataset=SST-2

# which GPU to use
GPU=0

if [ $dataset = "CoLA" ]; then
train_example_num=8551
eval_example_num=1043
test_example_num=1063
elif [ $dataset = "MRPC" ]; then
train_example_num=3668
eval_example_num=408
test_example_num=1725
elif [ $dataset = "SST-2" ]; then
train_example_num=67349
eval_example_num=872
test_example_num=1821
elif [ $dataset = "QQP" ]; then
train_example_num=363849
eval_example_num=40430
test_example_num=0
elif [ $dataset = "MNLI" ]; then
train_example_num=392702
eval_example_num=9815
test_example_num=0
elif [ $dataset = "WNLI" ]; then
train_example_num=635
eval_example_num=71
test_example_num=0
elif [ $dataset = "RTE" ]; then
train_example_num=2490
eval_example_num=277
test_example_num=0
elif [ $dataset = "QNLI" ]; then
train_example_num=104743
eval_example_num=5463
test_example_num=0
else
echo "dataset must be GLUE such as 'CoLA','MRPC','SST-2','QQP','MNLI','WNLI','',"
exit
fi

TEACHER_MODEL_DIR="./models/finetuned_teacher/SST-2_epoch-3_lr-2e-5_wd-0.0001/snapshot_best"
#TEACHER_MODEL_DIR="./models/finetuned_teacher/RTE_epoch-5_lr-3e-5_wd-0.0001/snapshot_best"
#TEACHER_MODEL_DIR="./models/finetuned_teacher/MRPC_epoch-5_lr-1e-5_wd-0.001/snapshot_best"
#TEACHER_MODEL_DIR="./models/finetuned_teacher/CoLA_epoch-5_lr-1e-5_wd-0.01/snapshot_best"
#TEACHER_MODEL_DIR="./models/finetuned_teacher/QQP_epoch-5_lr-2e-5_wd-0.0001/snapshot_best"

RESULT_DIR="./models/finetuned_teacher/SST-2_epoch-3_lr-2e-5_wd-0.0001/snapshot_best"

train_data_dir=$DATA_ROOT/${dataset}/train
eval_data_dir=$DATA_ROOT/${dataset}/eval

CUDA_VISIBLE_DEVICES=$GPU python3 examples/teacher_bert/task_teacher.py \
--do_train='False' \
--do_eval='True' \
--model=Glue_$dataset \
--task_name=$dataset \
--gpu_num_per_node=1 \
--train_data_dir=$train_data_dir \
--train_example_num=$train_example_num \
--eval_data_dir=$eval_data_dir \
--eval_example_num=$eval_example_num \
--batch_size_per_device=32 \
--eval_batch_size_per_device=32 \
--loss_print_every_n_iter 20 \
--log_dir=./log \
--model_save_dir=${TEACHER_MODEL_DIR} \
--result_dir=${RESULT_DIR} \
--save_last_snapshot=False \
--seq_length=128 \
--num_hidden_layers=12 \
--num_attention_heads=12 \
--max_position_embeddings=512 \
--type_vocab_size=2 \
--vocab_size=30522 \
--attention_probs_dropout_prob=0.1 \
--hidden_dropout_prob=0.1 \
--hidden_size_per_head=64

+ 99
- 0
model_compress/model_compress/distil/run_eval_theseus.sh View File

@@ -0,0 +1,99 @@
# Copyright (c) The Tianshu Platform Authors.
# Licensed under the Apache License

# pretrained model dir
# PRETRAINED_MODEL=/remote-home/my/Projects/bert_theseus/BERT/uncased_L-12_H-768_A-12_oneflow
# PRETRAINED_MODEL=/remote-home/my/Projects/bert_theseus/BERT-theseus/log/MRPC_uncased_L-12_H-768_A-12_oneflow_v1/snapshot_last_snapshot

# ofrecord dataset dir
DATA_ROOT=/usr/local/glue_ofrecord
GPU_ID=0

# choose a dataset, e.g. 'CoLA', 'MRPC' or 'SST-2'
dataset=SST-2
#dataset=MRPC
if [ $dataset = "CoLA" ]; then
train_example_num=8551
eval_example_num=1043
test_example_num=1063
epoch=1
wd=0.0001
elif [ $dataset = "MRPC" ]; then
train_example_num=3668
eval_example_num=408
test_example_num=1725
epoch=1
wd=0.0001
elif [ $dataset = "SST-2" ]; then
train_example_num=67349
eval_example_num=872
test_example_num=1821
elif [ $dataset = "QQP" ]; then
train_example_num=363849
eval_example_num=40430
test_example_num=0
learning_rate=2e-5
epoch=1
wd=0.0001
elif [ $dataset = "MNLI" ]; then
train_example_num=392702
eval_example_num=9815
test_example_num=0
learning_rate=2e-5
epoch=1
wd=0.0001
elif [ $dataset = "WNLI" ]; then
train_example_num=635
eval_example_num=71
test_example_num=0
learning_rate=2e-5
epoch=1
wd=0.0001
elif [ $dataset = "RTE" ]; then
train_example_num=2490
eval_example_num=277
test_example_num=0
learning_rate=2e-5
epoch=1
wd=0.0001
else
echo "dataset must be GLUE such as 'CoLA','MRPC','SST-2','QQP','MNLI','WNLI','STS-B',"
exit
fi


train_data_dir=$DATA_ROOT/${dataset}/train
eval_data_dir=$DATA_ROOT/${dataset}/eval
model_load_dir=./log/${dataset}_bert_theseus_uncased_L-12_H-768_A-12_oneflow_v1/snapshot_last_snapshot
# mkdir -p ${model_save_dir}

# compression settings; compress_ratio should match the value used when training the Theseus student
compress_ratio=4
replace_prob=1.0

CUDA_VISIBLE_DEVICES=$GPU_ID python3 ./theseus/run_classifier.py \
--do_train=false \
--do_eval=True \
--model=Glue_$dataset \
--task_name=$dataset \
--gpu_num_per_node=1 \
--num_epochs=$epoch \
--train_data_dir=$train_data_dir \
--train_example_num=$train_example_num \
--eval_data_dir=$eval_data_dir \
--eval_example_num=$eval_example_num \
--model_load_dir=${model_load_dir} \
--batch_size_per_device=32 \
--eval_batch_size_per_device=4 \
--loss_print_every_n_iter 20 \
--log_dir=./log \
--save_last_snapshot=True \
--seq_length=128 \
--num_hidden_layers=4 \
--num_attention_heads=12 \
--max_position_embeddings=512 \
--type_vocab_size=2 \
--vocab_size=30522 \
--attention_probs_dropout_prob=0.1 \
--hidden_dropout_prob=0.1 \
--hidden_size_per_head=64 \
--compress_ratio $compress_ratio \
--replace_prob $replace_prob

+ 130
- 0
model_compress/model_compress/distil/run_train_student_bert_pkd.sh View File

@@ -0,0 +1,130 @@
# Copyright (c) The OneFlow Authors.
# Licensed under the Apache License

# Script for knowledge distillation with the BERT-PKD algorithm.

# ofrecord dataset dir
DATA_ROOT=$1

# saved student model dir
STUDENT_DIR="$2/student_model"

# training log output dir
TRAIN_LOG_DIR=$3

# inference JSON result output dir
RESULT_DIR=$4

dataset=$5
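
# Illustrative example invocation (placeholder paths; adjust to your setup):
#   bash run_train_student_bert_pkd.sh ./data/glue_ofrecord ./output ./log ./result SST-2
# Note: the fine-tuned teacher (FT_BERT_BASE_DIR) and temp student (TMP_STUDENT_DIR) paths below are hard-coded.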

# fine-tuned teacher model dir
FT_BERT_BASE_DIR="/usr/local/output/model/before/snapshot_best"

# temp student model dir
TMP_STUDENT_DIR="./models/bert_pkd_3/${dataset}"

train_data_dir=$DATA_ROOT/${dataset}/train
eval_data_dir=$DATA_ROOT/${dataset}/eval

# which GPU to use
GPU=0

if [ $dataset = "CoLA" ]; then
train_example_num=8551
eval_example_num=1043
test_example_num=1063
learning_rate=5e-5
wd=0.0001
epoch=100
elif [ $dataset = "MRPC" ]; then
train_example_num=3668
eval_example_num=408
test_example_num=1725
learning_rate=2e-5
epoch=5
wd=0.001
elif [ $dataset = "SST-2" ]; then
train_example_num=67349
eval_example_num=872
test_example_num=1821
learning_rate=2e-5
epoch=4
wd=0.0001
elif [ $dataset = "QQP" ]; then
train_example_num=363849
eval_example_num=40430
test_example_num=0
learning_rate=2e-5
epoch=5
wd=0.0001
elif [ $dataset = "MNLI" ]; then
train_example_num=392702
eval_example_num=9815
test_example_num=0
learning_rate=2e-5
epoch=5
wd=0.0001
elif [ $dataset = "WNLI" ]; then
train_example_num=635
eval_example_num=71
test_example_num=0
learning_rate=2e-5
epoch=5
wd=0.0001
elif [ $dataset = "RTE" ]; then
train_example_num=2490
eval_example_num=277
test_example_num=0
learning_rate=3e-5
epoch=5
wd=0.0001
else
echo "dataset must be GLUE such as 'CoLA','MRPC','SST-2','QQP','MNLI','WNLI','STS-B',"
exit
fi

CUDA_VISIBLE_DEVICES=$GPU python3 ./examples/bert-pkd/task_student_bert-pkd.py \
--do_train='True' \
--do_eval='True' \
--serve_for_online='True' \
--model=Glue_$dataset \
--task_name=$dataset \
--gpu_num_per_node=1 \
--num_epochs=${epoch} \
--train_data_dir=$train_data_dir \
--train_example_num=$train_example_num \
--eval_data_dir=$eval_data_dir \
--eval_example_num=$eval_example_num \
--teacher_model=${FT_BERT_BASE_DIR} \
--student_model=${TMP_STUDENT_DIR} \
--batch_size_per_device=32 \
--eval_batch_size_per_device=32 \
--loss_print_every_n_iter 10 \
--log_dir=${TRAIN_LOG_DIR} \
--model_save_dir=${STUDENT_DIR} \
--result_dir=${RESULT_DIR} \
--seq_length=128 \
--student_num_hidden_layers=3 \
--student_num_attention_heads=12 \
--student_max_position_embeddings=512 \
--student_type_vocab_size=2 \
--student_vocab_size=30522 \
--student_attention_probs_dropout_prob=0.1 \
--student_hidden_dropout_prob=0.1 \
--student_hidden_size_per_head=64 \
--student_hidden_size=768 \
--teacher_num_hidden_layers=12 \
--teacher_num_attention_heads=12 \
--teacher_max_position_embeddings=512 \
--teacher_type_vocab_size=2 \
--teacher_vocab_size=30522 \
--teacher_attention_probs_dropout_prob=0.1 \
--teacher_hidden_dropout_prob=0.1 \
--teacher_hidden_size_per_head=64 \
--teacher_hidden_size=768 \
--learning_rate=$learning_rate \
--model_save_every_n_iter=50000 \
--weight_decay_rate $wd \
--kd_alpha=0.2 \
--kd_beta=10 \
--from_scratch='False'

+ 130
- 0
model_compress/model_compress/distil/run_train_student_distilled_lstm.sh View File

@@ -0,0 +1,130 @@
# Copyright (c) The Tianshu Platform Authors.
# Licensed under the Apache License

# Script for knowledge distillation with the Distilled-BiLSTM algorithm.

# ofrecord dataset dir
DATA_ROOT=$1

# saved student model dir
STUDENT_DIR="$2/student_model"

# training log output dir
TRAIN_LOG_DIR=$3

# inference JSON result output dir
RESULT_DIR=$4

dataset=$5
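
# Illustrative example invocation (placeholder paths; adjust to your setup):
#   bash run_train_student_distilled_lstm.sh ./data/glue_ofrecord ./output ./log ./result SST-2
# Note: DATA_ROOT must also contain the ${dataset}_lstm_32 ofrecord dirs used by the LSTM student.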

# fine-tuned teacher model dir
FT_BERT_BASE_DIR="/usr/local/output/model/before/snapshot_best"

train_data_dir=$DATA_ROOT/${dataset}/train
train_data_dir_lstm=$DATA_ROOT/${dataset}_lstm_32/train

eval_data_dir=$DATA_ROOT/${dataset}/eval
eval_data_dir_lstm=$DATA_ROOT/${dataset}_lstm_32/eval

# which GPU to use
GPU=0

if [ $dataset = "CoLA" ]; then
train_example_num=8551
eval_example_num=1043
test_example_num=1063
learning_rate=5e-5
wd=0.0001
epoch=100
elif [ $dataset = "MRPC" ]; then
train_example_num=3668
eval_example_num=408
test_example_num=1725
learning_rate=5e-6
epoch=30
wd=0.001
elif [ $dataset = "SST-2" ]; then
train_example_num=67349
eval_example_num=872
test_example_num=1821
learning_rate=1e-4
epoch=5
wd=0.0001
elif [ $dataset = "QQP" ]; then
train_example_num=363849
eval_example_num=40430
test_example_num=0
learning_rate=7e-5
epoch=10
wd=0.0001
elif [ $dataset = "MNLI" ]; then
train_example_num=392702
eval_example_num=9815
test_example_num=0
learning_rate=2e-5
epoch=5
wd=0.0001
elif [ $dataset = "WNLI" ]; then
train_example_num=635
eval_example_num=71
test_example_num=0
learning_rate=2e-5
epoch=5
wd=0.0001
elif [ $dataset = "RTE" ]; then
train_example_num=2490
eval_example_num=277
test_example_num=0
learning_rate=2e-5
epoch=30
wd=0.0001
else
echo "dataset must be GLUE such as 'CoLA','MRPC','SST-2','QQP','MNLI','WNLI','STS-B',"
exit
fi

CUDA_VISIBLE_DEVICES=$GPU python3 ./examples/distilled-bilstm/task_student_kd_lstm.py \
--do_train='True' \
--do_eval='True' \
--serve_for_online='True' \
--model=Glue_$dataset \
--task_name=$dataset \
--gpu_num_per_node=1 \
--num_epochs=${epoch} \
--train_data_dir=$train_data_dir \
--train_data_dir_lstm=${train_data_dir_lstm} \
--train_example_num=$train_example_num \
--eval_data_dir=$eval_data_dir \
--eval_data_dir_lstm=$eval_data_dir_lstm \
--eval_example_num=$eval_example_num \
--teacher_model=${FT_BERT_BASE_DIR} \
--batch_size_per_device=32 \
--eval_batch_size_per_device=32 \
--loss_print_every_n_iter 1 \
--log_dir=./log \
--model_save_dir=${STUDENT_DIR} \
--result_dir=${RESULT_DIR} \
--seq_length=128 \
--student_seq_length=32 \
--student_num_hidden_layers=4 \
--student_num_attention_heads=12 \
--student_max_position_embeddings=512 \
--student_type_vocab_size=2 \
--student_vocab_size=10002 \
--student_attention_probs_dropout_prob=0.1 \
--student_hidden_dropout_prob=0.1 \
--student_hidden_size_per_head=26 \
--student_hidden_size=300 \
--teacher_num_hidden_layers=12 \
--teacher_num_attention_heads=12 \
--teacher_max_position_embeddings=512 \
--teacher_type_vocab_size=2 \
--teacher_vocab_size=30522 \
--teacher_attention_probs_dropout_prob=0.1 \
--teacher_hidden_dropout_prob=0.1 \
--teacher_hidden_size_per_head=64 \
--teacher_hidden_size=768 \
--learning_rate=$learning_rate \
--model_save_every_n_iter=50000 \
--weight_decay_rate=$wd \
--kd_alpha=0.7

+ 128
- 0
model_compress/model_compress/distil/run_train_student_kd.sh View File

@@ -0,0 +1,128 @@
# Copyright (c) The OneFlow Authors.
# Licensed under the Apache License

# Script for knowledge distillation with the KD algorithm.

# ofrecord dataset dir
DATA_ROOT=$1

# saved student model dir
STUDENT_DIR="$2/student_model"

# training log output dir
TRAIN_LOG_DIR=$3

# inference JSON result output dir
RESULT_DIR=$4

dataset=$5
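
# Illustrative example invocation (placeholder paths; adjust to your setup):
#   bash run_train_student_kd.sh ./data/glue_ofrecord ./output ./log ./result SST-2
# Note: the fine-tuned teacher (FT_BERT_BASE_DIR) and temp student (TMP_STUDENT_DIR) paths below are hard-coded.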

# fine-tuned teacher model dir
FT_BERT_BASE_DIR="/usr/local/output/model/before/snapshot_best"

# temp student model dir
TMP_STUDENT_DIR="./models/bert_pkd_3/${dataset}"

train_data_dir=$DATA_ROOT/${dataset}/train
eval_data_dir=$DATA_ROOT/${dataset}/eval

# which GPU to use
GPU=0

if [ $dataset = "CoLA" ]; then
train_example_num=8551
eval_example_num=1043
test_example_num=1063
learning_rate=5e-5
wd=0.0001
epoch=70
elif [ $dataset = "MRPC" ]; then
train_example_num=3668
eval_example_num=408
test_example_num=1725
learning_rate=2e-5
epoch=5
wd=0.001
elif [ $dataset = "SST-2" ]; then
train_example_num=67349
eval_example_num=872
test_example_num=1821
learning_rate=2e-5
epoch=4
wd=0.0001
elif [ $dataset = "QQP" ]; then
train_example_num=363849
eval_example_num=40430
test_example_num=0
learning_rate=5e-5
epoch=5
wd=0.0001
elif [ $dataset = "MNLI" ]; then
train_example_num=392702
eval_example_num=9815
test_example_num=0
learning_rate=2e-5
epoch=5
wd=0.0001
elif [ $dataset = "WNLI" ]; then
train_example_num=635
eval_example_num=71
test_example_num=0
learning_rate=2e-5
epoch=5
wd=0.0001
elif [ $dataset = "RTE" ]; then
train_example_num=2490
eval_example_num=277
test_example_num=0
learning_rate=2e-5
epoch=5
wd=0.0001
else
echo "dataset must be GLUE such as 'CoLA','MRPC','SST-2','QQP','MNLI','WNLI','STS-B',"
exit
fi

CUDA_VISIBLE_DEVICES=$GPU python3 ./examples/knowledge_distillation/task_student_kd.py \
--do_train='True' \
--do_eval='True' \
--serve_for_online='True' \
--model=Glue_${dataset} \
--task_name=${dataset} \
--gpu_num_per_node=1 \
--num_epochs=${epoch} \
--train_data_dir=$train_data_dir \
--train_example_num=$train_example_num \
--eval_data_dir=$eval_data_dir \
--eval_example_num=$eval_example_num \
--teacher_model=${FT_BERT_BASE_DIR} \
--student_model=${TMP_STUDENT_DIR} \
--batch_size_per_device=32 \
--eval_batch_size_per_device=32 \
--loss_print_every_n_iter 10 \
--log_dir=${TRAIN_LOG_DIR} \
--result_dir=${RESULT_DIR} \
--model_save_dir=${STUDENT_DIR} \
--seq_length=128 \
--student_num_hidden_layers=4 \
--student_num_attention_heads=12 \
--student_max_position_embeddings=512 \
--student_type_vocab_size=2 \
--student_vocab_size=30522 \
--student_attention_probs_dropout_prob=0.1 \
--student_hidden_dropout_prob=0.1 \
--student_hidden_size_per_head=26 \
--student_hidden_size=312 \
--teacher_num_hidden_layers=12 \
--teacher_num_attention_heads=12 \
--teacher_max_position_embeddings=512 \
--teacher_type_vocab_size=2 \
--teacher_vocab_size=30522 \
--teacher_attention_probs_dropout_prob=0.1 \
--teacher_hidden_dropout_prob=0.1 \
--teacher_hidden_size_per_head=64 \
--teacher_hidden_size=768 \
--learning_rate=$learning_rate \
--model_save_every_n_iter=50000 \
--weight_decay_rate=$wd \
--kd_alpha=0.8

+ 128
- 0
model_compress/model_compress/distil/run_train_student_tinybert.sh View File

@@ -0,0 +1,128 @@
# Copyright (c) The Tianshu Platform Authors.
# Licensed under the Apache License

# Script for knowledge distillation with the TinyBERT algorithm.

# ofrecord dataset dir
DATA_ROOT=$1

# saved student model dir
STUDENT_DIR="$2/student_model"

# training log output dir
TRAIN_LOG_DIR=$3

# inference JSON result output dir
RESULT_DIR=$4

dataset=$5
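
# Illustrative example invocation (placeholder paths; adjust to your setup):
#   bash run_train_student_tinybert.sh ./data/glue_ofrecord ./output ./log ./result SST-2
# Note: the fine-tuned teacher (FT_BERT_BASE_DIR) and the general-distilled TinyBERT student
# (TMP_STUDENT_DIR) paths below are hard-coded and must exist.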

# fine-tuned teacher model dir
FT_BERT_BASE_DIR="/usr/local/output/model/before/snapshot_best"

TMP_STUDENT_DIR="./models/2nd_General_TinyBERT_4L_312D_oneflow"

train_data_dir=$DATA_ROOT/${dataset}/train
eval_data_dir=$DATA_ROOT/${dataset}/eval

# which GPU to use
GPU=0

if [ $dataset = "CoLA" ]; then
train_example_num=8551
eval_example_num=1043
test_example_num=1063
learning_rate=7e-5
wd=0.0001
epoch=100
elif [ $dataset = "MRPC" ]; then
train_example_num=3668
eval_example_num=408
test_example_num=1725
learning_rate=2e-5
epoch=30
wd=0.001
elif [ $dataset = "SST-2" ]; then
train_example_num=67349
eval_example_num=872
test_example_num=1821
learning_rate=2e-5
epoch=4
wd=0.0001
elif [ $dataset = "QQP" ]; then
train_example_num=363849
eval_example_num=40430
test_example_num=0
learning_rate=1e-4
epoch=5
wd=0.0001
elif [ $dataset = "MNLI" ]; then
train_example_num=392702
eval_example_num=9815
test_example_num=0
learning_rate=2e-5
epoch=5
wd=0.0001
elif [ $dataset = "WNLI" ]; then
train_example_num=635
eval_example_num=71
test_example_num=0
learning_rate=2e-5
epoch=5
wd=0.0001
elif [ $dataset = "RTE" ]; then
train_example_num=2490
eval_example_num=277
test_example_num=0
learning_rate=2e-5
epoch=5
wd=0.0001
else
echo "dataset must be GLUE such as 'CoLA','MRPC','SST-2','QQP','MNLI','WNLI','STS-B',"
exit
fi

CUDA_VISIBLE_DEVICES=$GPU python3 ./examples/tinybert/task_student_tinybert.py \
--do_train='True' \
--do_eval='True' \
--serve_for_online='True' \
--model=Glue_$dataset \
--task_name=$dataset \
--gpu_num_per_node=1 \
--num_epochs=${epoch} \
--train_data_dir=$train_data_dir \
--train_example_num=$train_example_num \
--eval_data_dir=$eval_data_dir \
--eval_example_num=$eval_example_num \
--teacher_model=${FT_BERT_BASE_DIR} \
--student_model=${TMP_STUDENT_DIR} \
--batch_size_per_device=32 \
--eval_batch_size_per_device=32 \
--loss_print_every_n_iter 10 \
--log_dir=./log \
--model_save_dir=${STUDENT_DIR} \
--result_dir=${RESULT_DIR} \
--seq_length=128 \
--student_num_hidden_layers=4 \
--student_num_attention_heads=12 \
--student_max_position_embeddings=512 \
--student_type_vocab_size=2 \
--student_vocab_size=30522 \
--student_attention_probs_dropout_prob=0.1 \
--student_hidden_dropout_prob=0.1 \
--student_hidden_size_per_head=26 \
--student_hidden_size=312 \
--teacher_num_hidden_layers=12 \
--teacher_num_attention_heads=12 \
--teacher_max_position_embeddings=512 \
--teacher_type_vocab_size=2 \
--teacher_vocab_size=30522 \
--teacher_attention_probs_dropout_prob=0.1 \
--teacher_hidden_dropout_prob=0.1 \
--teacher_hidden_size_per_head=64 \
--teacher_hidden_size=768 \
--learning_rate=$learning_rate \
--model_save_every_n_iter=50000 \
--weight_decay_rate $wd \
--pred_distill='True' \
--intermediate_distill='True'

+ 116
- 0
model_compress/model_compress/distil/run_train_teacher.sh View File

@@ -0,0 +1,116 @@
# Copyright (c) The OneFlow Authors.
# Licensed under the Apache License
# Fine-tune Teacher model.
# ofrecord dataset dir
DATA_ROOT=$1

# pretrained model dir
PRETRAINED_MODEL=$2

TRAIN_LOG_DIR=$3

RESULT_DIR=$4

# choose dataset
dataset=$5
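
# Illustrative example invocation (placeholder paths; the pretrained BERT dir is only an assumed example):
#   bash run_train_teacher.sh ./data/glue_ofrecord ./models/uncased_L-12_H-768_A-12_oneflow ./log ./result SST-2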

train_data_dir=$DATA_ROOT/${dataset}/train
eval_data_dir=$DATA_ROOT/${dataset}/eval

MODEL_SAVE_DIR="/usr/local/output/model/before"

# which GPU to use
GPU=0

if [ $dataset = "CoLA" ]; then
train_example_num=8551
eval_example_num=1043
test_example_num=1063
learning_rate=1e-5
EPOCH=5
wd=0.01
elif [ $dataset = "MRPC" ]; then
train_example_num=3668
eval_example_num=408
test_example_num=1725
learning_rate=3e-5
EPOCH=5
wd=0.001
elif [ $dataset = "SST-2" ]; then
train_example_num=67349
eval_example_num=872
test_example_num=1821
learning_rate=2e-5
EPOCH=3
wd=0.0001
elif [ $dataset = "QQP" ]; then
train_example_num=363849
eval_example_num=40430
test_example_num=0
learning_rate=2e-5
EPOCH=5
wd=0.0001
elif [ $dataset = "MNLI" ]; then
train_example_num=392702
eval_example_num=9815
test_example_num=0
learning_rate=2e-5
EPOCH=5
wd=0.0001
elif [ $dataset = "WNLI" ]; then
train_example_num=635
eval_example_num=71
test_example_num=0
learning_rate=2e-5
EPOCH=5
wd=0.0001
elif [ $dataset = "RTE" ]; then
train_example_num=2490
eval_example_num=277
test_example_num=0
learning_rate=3e-5
EPOCH=5
wd=0.0001
elif [ $dataset == "QNLI" ]; then
train_example_num=104743
eval_example_num=5463
test_example_num=0
learning_rate=2e-5
EPOCH=5
wd=0.0001
else
echo "dataset must be GLUE such as 'CoLA','MRPC','SST-2','QQP','MNLI','WNLI'"
exit
fi

CUDA_VISIBLE_DEVICES=$GPU python3 examples/teacher_bert/task_teacher.py \
--do_train='True' \
--do_eval='True' \
--serve_for_online='True' \
--model=Glue_${dataset} \
--task_name=${dataset} \
--gpu_num_per_node=1 \
--num_epochs=${EPOCH} \
--train_data_dir=$train_data_dir \
--train_example_num=$train_example_num \
--eval_data_dir=$eval_data_dir \
--eval_example_num=$eval_example_num \
--model_load_dir=${PRETRAINED_MODEL} \
--batch_size_per_device=32 \
--eval_batch_size_per_device=32 \
--loss_print_every_n_iter 20 \
--log_dir=${TRAIN_LOG_DIR} \
--result_dir=${RESULT_DIR} \
--model_save_dir=${MODEL_SAVE_DIR} \
--save_last_snapshot=True \
--seq_length=128 \
--num_hidden_layers=12 \
--num_attention_heads=12 \
--max_position_embeddings=512 \
--type_vocab_size=2 \
--vocab_size=30522 \
--attention_probs_dropout_prob=0.1 \
--hidden_dropout_prob=0.1 \
--hidden_size_per_head=64 \
--learning_rate $learning_rate \
--weight_decay_rate $wd

+ 158
- 0
model_compress/model_compress/distil/run_train_theseus.sh View File

@@ -0,0 +1,158 @@
# ofrecord dataset dir
DATA_ROOT=$1

# saved student model dir
STUDENT_DIR="$2/student_v2"

# training log output dir
TRAIN_LOG_DIR=$3

# inference JSON result output dir
RESULT_DIR=$4

BERT_BASE_DIR="/usr/local/output/model/before/snapshot_best"
#BERT_BASE_DIR="/usr/local/Oneflow-Model-Compression/model_compress/distil/models/SST-2_epoch-3_lr-2e-5_wd-0.0001/snapshot_best"
INIT_STUDENT_DIR="$2/student_init"
ONE_TRAIN_MODEL="$2/student_v1"

dataset=$5
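
# Illustrative example invocation (placeholder paths; adjust to your setup):
#   bash run_train_theseus.sh ./data/glue_ofrecord ./output/theseus ./log ./result SST-2
# Note: the fine-tuned teacher path (BERT_BASE_DIR) below is hard-coded.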

train_data_dir=$DATA_ROOT/${dataset}/train
eval_data_dir=$DATA_ROOT/${dataset}/eval

# which GPU to use
GPU=0

#dataset=MRPC
if [ $dataset = "CoLA" ]; then
train_example_num=8551
eval_example_num=1043
test_example_num=1063
learning_rate=1e-5
EPOCH=5
wd=0.01
elif [ $dataset = "MRPC" ]; then
train_example_num=3668
eval_example_num=408
test_example_num=1725
learning_rate=1e-5
EPOCH=5
wd=0.001
elif [ $dataset = "SST-2" ]; then
train_example_num=67349
eval_example_num=872
test_example_num=1821
learning_rate=1e-5
EPOCH=5
wd=0.0001
elif [ $dataset = "QQP" ]; then
train_example_num=363849
eval_example_num=40430
test_example_num=0
learning_rate=1e-5
EPOCH=5
wd=0.0001
elif [ $dataset = "MNLI" ]; then
train_example_num=392702
eval_example_num=9815
test_example_num=0
learning_rate=1e-5
EPOCH=5
wd=0.0001
elif [ $dataset = "WNLI" ]; then
train_example_num=635
eval_example_num=71
test_example_num=0
learning_rate=1e-5
EPOCH=5
wd=0.0001
elif [ $dataset = "RTE" ]; then
train_example_num=2490
eval_example_num=277
test_example_num=0
learning_rate=1e-5
EPOCH=5
wd=0.0001
else
echo "dataset must be GLUE such as 'CoLA','MRPC','SST-2','QQP','MNLI','WNLI','STS-B',"
exit
fi

mkdir -p ${INIT_STUDENT_DIR}

# LAYER_LIST="0,1,2,3,4,5"
python3 ./theseus/init_stu.py \
--teacher_model=${BERT_BASE_DIR} \
--student_model=${INIT_STUDENT_DIR} \
--layer_list="0,1,2"

mkdir -p ${ONE_TRAIN_MODEL}

CUDA_VISIBLE_DEVICES=$GPU python3 ./theseus/run_classifier.py \
--do_train=True \
--model=Glue_$dataset \
--task_name=$dataset \
--gpu_num_per_node=1 \
--num_epochs=${EPOCH} \
--train_data_dir=$train_data_dir \
--train_example_num=$train_example_num \
--eval_data_dir=$eval_data_dir \
--eval_example_num=$eval_example_num \
--model_load_dir=${INIT_STUDENT_DIR} \
--batch_size_per_device=32 \
--eval_batch_size_per_device=4 \
--loss_print_every_n_iter 20 \
--log_dir=${TRAIN_LOG_DIR} \
--result_dir=${RESULT_DIR} \
--model_save_dir=${ONE_TRAIN_MODEL} \
--save_last_snapshot=True \
--seq_length=128 \
--num_hidden_layers=12 \
--num_attention_heads=12 \
--max_position_embeddings=512 \
--type_vocab_size=2 \
--vocab_size=30522 \
--attention_probs_dropout_prob=0.1 \
--hidden_dropout_prob=0.1 \
--hidden_size_per_head=64 \
--learning_rate $learning_rate \
--weight_decay_rate $wd \
--compress_ratio=4 \
--replace_prob=0.5 \
| tee -a ${ONE_TRAIN_MODEL}/train_log.txt

mkdir -p ${STUDENT_DIR}

CUDA_VISIBLE_DEVICES=$GPU python3 ./theseus/run_classifier.py \
--do_train=True \
--model=Glue_$dataset \
--task_name=$dataset \
--gpu_num_per_node=1 \
--num_epochs=${EPOCH} \
--train_data_dir=$train_data_dir \
--train_example_num=$train_example_num \
--eval_data_dir=$eval_data_dir \
--eval_example_num=$eval_example_num \
--model_load_dir=${ONE_TRAIN_MODEL}/snapshot_last_snapshot \
--batch_size_per_device=32 \
--eval_batch_size_per_device=4 \
--loss_print_every_n_iter 200 \
--log_dir=${TRAIN_LOG_DIR} \
--result_dir=${RESULT_DIR} \
--model_save_dir=${STUDENT_DIR} \
--save_last_snapshot=True \
--seq_length=128 \
--num_hidden_layers=12 \
--num_attention_heads=12 \
--max_position_embeddings=512 \
--type_vocab_size=2 \
--vocab_size=30522 \
--attention_probs_dropout_prob=0.1 \
--hidden_dropout_prob=0.1 \
--hidden_size_per_head=64 \
--learning_rate=1e-5 \
--weight_decay_rate $wd \
--compress_ratio=4 \
--replace_prob=1.0 \
| tee -a ${STUDENT_DIR}/train_log.txt

+ 376
- 0
model_compress/model_compress/distil/src/bert.py View File

@@ -0,0 +1,376 @@
"""
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import oneflow as flow
import oneflow.core.common.data_type_pb2 as data_type_util
import oneflow.core.operator.op_conf_pb2 as op_conf_util
import math

class BertBackbone(object):

def __init__(self,
input_ids_blob,
input_mask_blob,
token_type_ids_blob,
vocab_size,
seq_length=512,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
initializer_range=0.02,
do_return_all_layers=True,
do_return_attentions=False,
is_train=True):

with flow.scope.namespace("bert"):
with flow.scope.namespace("embeddings"):
(self.embedding_output_, self.embedding_table_) = _EmbeddingLookup(
input_ids_blob=input_ids_blob,
vocab_size=vocab_size,
embedding_size=hidden_size,
initializer_range=initializer_range,
word_embedding_name="word_embeddings",
is_train=is_train)
self.embedding_output_ = _EmbeddingPostprocessor(
input_blob=self.embedding_output_,
seq_length=seq_length,
embedding_size=hidden_size,
use_token_type=True,
token_type_ids_blob=token_type_ids_blob,
token_type_vocab_size=type_vocab_size,
token_type_embedding_name="token_type_embeddings",
use_position_embeddings=True,
position_embedding_name="position_embeddings",
initializer_range=initializer_range,
max_position_embeddings=max_position_embeddings,
dropout_prob=hidden_dropout_prob,
is_train=is_train)
with flow.scope.namespace("encoder"):
attention_mask_blob = _CreateAttentionMaskFromInputMask(
input_mask_blob, from_seq_length=seq_length, to_seq_length=seq_length)
outputs = _TransformerModel(
input_blob=self.embedding_output_,
attention_mask_blob=attention_mask_blob,
seq_length=seq_length,
hidden_size=hidden_size,
num_hidden_layers=num_hidden_layers,
num_attention_heads=num_attention_heads,
intermediate_size=intermediate_size,
intermediate_act_fn=GetActivation(hidden_act),
hidden_dropout_prob=hidden_dropout_prob,
attention_probs_dropout_prob=attention_probs_dropout_prob,
initializer_range=initializer_range,
do_return_all_layers=True,
do_return_attentions=True,
is_train=is_train)
self.all_encoder_layers_ = outputs[0]
self.all_attention_probs_ = outputs[1]
self.sequence_output_ = self.all_encoder_layers_[-1]

def embedding_output(self): return self.embedding_output_
def all_encoder_layers(self): return self.all_encoder_layers_
def all_attention_probs(self): return self.all_attention_probs_
def sequence_output(self): return self.sequence_output_
def embedding_table(self): return self.embedding_table_


def CreateInitializer(std):
return flow.truncated_normal(std)

def _Gelu(in_blob):
return flow.math.gelu(in_blob)

def _TransformerModel(input_blob,
attention_mask_blob,
seq_length,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
intermediate_act_fn=_Gelu,
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
initializer_range=0.02,
do_return_all_layers=False,
do_return_attentions=False,
is_train=True):

assert hidden_size % num_attention_heads == 0
attention_head_size = int(hidden_size / num_attention_heads)
input_width = hidden_size
prev_output_blob = flow.reshape(input_blob, (-1, input_width))
all_layer_output_blobs = []
all_attention_prob_blobs = []
for layer_idx in range(num_hidden_layers):
with flow.scope.namespace("layer_%d"%layer_idx):
layer_input_blob = prev_output_blob
with flow.scope.namespace("attention"):
with flow.scope.namespace("self"):
attention_output_blob, attention_probs_blob = _AttentionLayer(
from_blob=layer_input_blob,
to_blob=layer_input_blob,
attention_mask_blob=attention_mask_blob,
num_attention_heads=num_attention_heads,
size_per_head=attention_head_size,
attention_probs_dropout_prob=attention_probs_dropout_prob,
initializer_range=initializer_range,
do_return_2d_tensor=True,
from_seq_length=seq_length,
to_seq_length=seq_length,
is_train=is_train)
all_attention_prob_blobs.append(attention_probs_blob)
with flow.scope.namespace("output"):
attention_output_blob = _FullyConnected(
attention_output_blob,
input_size=num_attention_heads * attention_head_size,
units=hidden_size,
weight_initializer=CreateInitializer(initializer_range),
name='dense',
is_train=is_train)
attention_output_blob = _Dropout(attention_output_blob, hidden_dropout_prob)
attention_output_blob = attention_output_blob + layer_input_blob
attention_output_blob = _LayerNorm(attention_output_blob, hidden_size,is_train=is_train)
with flow.scope.namespace("intermediate"):
if callable(intermediate_act_fn):
act_fn = op_conf_util.kNone
else:
act_fn = intermediate_act_fn
intermediate_output_blob = _FullyConnected(
attention_output_blob,
input_size=num_attention_heads * attention_head_size,
units=intermediate_size,
activation=act_fn,
weight_initializer=CreateInitializer(initializer_range),
name='dense',
is_train=is_train)
if callable(intermediate_act_fn):
intermediate_output_blob = intermediate_act_fn(intermediate_output_blob)
with flow.scope.namespace("output"):
layer_output_blob = _FullyConnected(
intermediate_output_blob,
input_size=intermediate_size,
units=hidden_size,
weight_initializer=CreateInitializer(initializer_range),
name='dense',
is_train=is_train)
layer_output_blob = _Dropout(layer_output_blob, hidden_dropout_prob)
layer_output_blob = layer_output_blob + attention_output_blob
layer_output_blob = _LayerNorm(layer_output_blob, hidden_size,is_train=is_train)
prev_output_blob = layer_output_blob
all_layer_output_blobs.append(layer_output_blob)

input_shape = (-1, seq_length, hidden_size)
if do_return_all_layers:
final_output_blobs = []
for layer_output_blob in all_layer_output_blobs:
final_output_blob = flow.reshape(layer_output_blob, input_shape)
final_output_blobs.append(final_output_blob)
if not do_return_attentions:
return final_output_blobs
else:
return final_output_blobs, all_attention_prob_blobs
else:
final_output_blob = flow.reshape(prev_output_blob, input_shape)
return [final_output_blob]

def _AttentionLayer(from_blob,
to_blob,
attention_mask_blob,
num_attention_heads=1,
size_per_head=512,
query_act=op_conf_util.kNone,
key_act=op_conf_util.kNone,
value_act=op_conf_util.kNone,
attention_probs_dropout_prob=0.0,
initializer_range=0.02,
do_return_2d_tensor=False,
batch_size=None,
from_seq_length=None,
to_seq_length=None,
is_train=True):

def TransposeForScores(input_blob, num_attention_heads, seq_length, width):
output_blob = flow.reshape(input_blob, [-1, seq_length, num_attention_heads, width])
output_blob = flow.transpose(output_blob, perm=[0, 2, 1, 3])
return output_blob

from_blob_2d = flow.reshape(from_blob, [-1, num_attention_heads * size_per_head])
to_blob_2d = flow.reshape(to_blob, [-1, num_attention_heads * size_per_head])

query_blob = _FullyConnected(
from_blob_2d,
input_size=num_attention_heads * size_per_head,
units=num_attention_heads * size_per_head,
activation=query_act,
name="query",
weight_initializer=CreateInitializer(initializer_range),
is_train=is_train)

key_blob = _FullyConnected(
to_blob_2d,
input_size=num_attention_heads * size_per_head,
units=num_attention_heads * size_per_head,
activation=key_act,
name="key",
weight_initializer=CreateInitializer(initializer_range),
is_train=is_train)

value_blob = _FullyConnected(
to_blob_2d,
input_size=num_attention_heads * size_per_head,
units=num_attention_heads * size_per_head,
activation=value_act,
name="value",
weight_initializer=CreateInitializer(initializer_range),
is_train=is_train)

query_blob = TransposeForScores(query_blob, num_attention_heads, from_seq_length, size_per_head)
key_blob = TransposeForScores(key_blob, num_attention_heads, to_seq_length, size_per_head)

attention_scores_blob = flow.matmul(query_blob, key_blob, transpose_b=True)
attention_scores_blob = attention_scores_blob * (1.0 / math.sqrt(float(size_per_head)))

attention_mask_blob = flow.reshape(attention_mask_blob, [-1, 1, from_seq_length, to_seq_length])
attention_mask_blob = flow.cast(attention_mask_blob, dtype=flow.float)
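# masked positions (mask == 0) receive a -10000 additive bias so softmax drives their attention weights to ~0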
addr_blob = (attention_mask_blob - 1.0) * 10000.0

attention_scores_blob = attention_scores_blob + addr_blob
attention_probs_blob = flow.nn.softmax(attention_scores_blob)
attention_probs_blob = _Dropout(attention_probs_blob, attention_probs_dropout_prob)

value_blob = flow.reshape(value_blob, [-1, to_seq_length, num_attention_heads, size_per_head])
value_blob = flow.transpose(value_blob, perm=[0, 2, 1, 3])
context_blob = flow.matmul(attention_probs_blob, value_blob)
context_blob = flow.transpose(context_blob, perm=[0, 2, 1, 3])

if do_return_2d_tensor:
context_blob = flow.reshape(context_blob, [-1, num_attention_heads * size_per_head])
else:
context_blob = flow.reshape(context_blob, [-1, from_seq_length, num_attention_heads * size_per_head])

return context_blob,attention_probs_blob

def _FullyConnected(input_blob, input_size, units, activation=None, name=None,
weight_initializer=None,is_train=True):
weight_blob = flow.get_variable(
name=name + '-weight',
shape=[input_size, units],
dtype=input_blob.dtype,
trainable=is_train,
initializer=weight_initializer)
bias_blob = flow.get_variable(
name=name + '-bias',
shape=[units],
dtype=input_blob.dtype,
trainable=is_train,
initializer=flow.constant_initializer(0.0))
output_blob = flow.matmul(input_blob, weight_blob)
output_blob = flow.nn.bias_add(output_blob, bias_blob)
return output_blob

def _Dropout(input_blob, dropout_prob):
if dropout_prob == 0.0:
return input_blob
return flow.nn.dropout(input_blob, rate=dropout_prob)


def _LayerNorm(input_blob, hidden_size,is_train=True):
return flow.layers.layer_norm(input_blob, name='LayerNorm', begin_norm_axis=-1, begin_params_axis=-1,trainable=is_train)

def _CreateAttentionMaskFromInputMask(to_mask_blob, from_seq_length, to_seq_length):
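# broadcast the [batch, to_seq_length] input mask to a [batch, from_seq_length, to_seq_length] attention mask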
output = flow.cast(to_mask_blob, dtype=flow.float)
output = flow.reshape(output, [-1, 1, to_seq_length])
zeros = flow.constant(0.0, dtype=flow.float, shape=[from_seq_length, to_seq_length])
output = zeros + output
return output


def _EmbeddingPostprocessor(input_blob,
seq_length,
embedding_size,
use_token_type=False,
token_type_ids_blob=None,
token_type_vocab_size=16,
token_type_embedding_name="token_type_embeddings",
use_position_embeddings=True,
position_embedding_name="position_embeddings",
initializer_range=0.02,
max_position_embeddings=512,
dropout_prob=0.1,
is_train=True):
output = input_blob

if use_token_type:
assert token_type_ids_blob is not None
token_type_table = flow.get_variable(name=token_type_embedding_name,
shape=[token_type_vocab_size, embedding_size],
dtype=input_blob.dtype,
trainable=is_train,
initializer=CreateInitializer(initializer_range))
token_type_embeddings = flow.gather(params=token_type_table, indices=token_type_ids_blob, axis=0)
output = output + token_type_embeddings

if use_position_embeddings:
position_table = flow.get_variable(name=position_embedding_name,
shape=[1, max_position_embeddings, embedding_size],
dtype=input_blob.dtype,
trainable=is_train,
initializer=CreateInitializer(initializer_range))
assert seq_length <= max_position_embeddings
if seq_length != max_position_embeddings:
position_table = flow.slice(position_table, begin=[None, 0, 0], size=[None, seq_length, -1])
output = output + position_table

output = _LayerNorm(output, embedding_size, is_train=is_train)
output = _Dropout(output, dropout_prob)

return output


def _EmbeddingLookup(input_ids_blob,
vocab_size,
embedding_size=128,
initializer_range=0.02,
word_embedding_name="word_embeddings",
is_train=True):
embedding_table = flow.get_variable(name=word_embedding_name, shape=[vocab_size, embedding_size],
dtype=flow.float,
trainable=is_train,
initializer=CreateInitializer(initializer_range))
output = flow.gather(params=embedding_table, indices=input_ids_blob, axis=0)
return output, embedding_table

def GetActivation(name):
if name == 'linear':
return None
elif name == 'relu':
return flow.math.relu
elif name == 'tanh':
return flow.math.tanh
elif name == 'gelu':
return flow.math.gelu
else:
raise Exception("unsupported activation")


+ 116
- 0
model_compress/model_compress/distil/src/classifier.py View File

@@ -0,0 +1,116 @@
"""
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import oneflow as flow
import bert as bert_util
import oneflow.core.operator.op_conf_pb2 as op_conf_util


def GlueBERT(
input_ids_blob,
input_mask_blob,
token_type_ids_blob,
label_blob,
vocab_size,
seq_length=512,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
initializer_range=0.02,
label_num=2,
replace_prob=None,
):
backbone = bert_util.BertBackbone(
input_ids_blob=input_ids_blob,
input_mask_blob=input_mask_blob,
token_type_ids_blob=token_type_ids_blob,
vocab_size=vocab_size,
seq_length=seq_length,
hidden_size=hidden_size,
num_hidden_layers=num_hidden_layers,
num_attention_heads=num_attention_heads,
intermediate_size=intermediate_size,
hidden_act=hidden_act,
hidden_dropout_prob=hidden_dropout_prob,
attention_probs_dropout_prob=attention_probs_dropout_prob,
max_position_embeddings=max_position_embeddings,
type_vocab_size=type_vocab_size,
initializer_range=initializer_range,
)
pooled_output = PooledOutput(
sequence_output=backbone.sequence_output(),
hidden_size=hidden_size,
initializer_range=initializer_range
)
loss, _, logit_blob = _AddClassficationLoss(
input_blob=pooled_output,
label_blob=label_blob,
hidden_size=hidden_size,
label_num=label_num,
initializer_range=initializer_range,
scope_name='classification'
)

return loss, logit_blob


def PooledOutput(sequence_output, hidden_size, initializer_range):
with flow.scope.namespace("bert-pooler"):
first_token_tensor = flow.slice(
sequence_output, [None, 0, 0], [None, 1, -1])
first_token_tensor = flow.reshape(
first_token_tensor, [-1, hidden_size])
pooled_output = bert_util._FullyConnected(
first_token_tensor,
input_size=hidden_size,
units=hidden_size,
weight_initializer=bert_util.CreateInitializer(initializer_range),
name="dense",
)
pooled_output = flow.math.tanh(pooled_output)
return pooled_output


def _AddClassficationLoss(input_blob, label_blob, hidden_size, label_num, initializer_range,
scope_name='classification'):
with flow.scope.namespace(scope_name):
output_weight_blob = flow.get_variable(
name="output_weights",
shape=[label_num, hidden_size],
dtype=input_blob.dtype,
# initializer=bert_util.CreateInitializer(initializer_range),
initializer=flow.random_normal_initializer(
mean=0.0, stddev=initializer_range, seed=None, dtype=None)
)
output_bias_blob = flow.get_variable(
name="output_bias",
shape=[label_num],
dtype=input_blob.dtype,
initializer=flow.constant_initializer(0.0),
)
logit_blob = flow.matmul(
input_blob, output_weight_blob, transpose_b=True)
logit_blob = flow.nn.bias_add(logit_blob, output_bias_blob)
pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
logits=logit_blob, labels=label_blob
)
loss = pre_example_loss
return loss, pre_example_loss, logit_blob

+ 107
- 0
model_compress/model_compress/distil/src/config.py View File

@@ -0,0 +1,107 @@
"""
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import argparse
from datetime import datetime


def str_list(x):
return x.split(',')

def int_list(x):
return list(map(int, x.split(',')))

def float_list(x):
return list(map(float, x.split(',')))

def str2bool(v):
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Unsupported value encountered.')

def get_parser(parser=None):

parser = argparse.ArgumentParser(description="flags for bert")

parser.add_argument('--do_train', type=str2bool, nargs='?', const=True, help='train or not')
parser.add_argument('--do_eval', type=str2bool, nargs='?', const=True, help='eval or not')
# resource
parser.add_argument("--model", type=str, default='BERT Pretrain')
parser.add_argument("--gpu_num_per_node", type=int, default=1)
parser.add_argument('--num_nodes', type=int, default=1,
help='node/machine number for training')
parser.add_argument('--node_ips', type=str_list, default=['192.168.1.13', '192.168.1.14'],
help='node ip list for training, divided by ",", length >= num_nodes')
parser.add_argument("--ctrl_port", type=int, default=50051, help='ctrl_port for multinode job')

# train
parser.add_argument("--learning_rate", type=float, default=1e-4, help="Learning rate")
parser.add_argument("--weight_decay_rate", type=float, default=0.01, help="weight decay rate")
parser.add_argument("--warmup_proportion", type=float, default=0.1)
parser.add_argument('--use_fp16', type=str2bool, nargs='?', default='False', const=True,
help='use fp16 or not')
parser.add_argument('--use_xla', type=str2bool, nargs='?', const=True,
help='whether to use XLA')

# log and resore/save
parser.add_argument("--loss_print_every_n_iter", type=int, default=10, required=False,
help="print loss every n iteration")
parser.add_argument("--model_save_every_n_iter", type=int, default=10000, required=False,
help="save model every n iteration",)
parser.add_argument("--model_save_dir", type=str,
default="./output/model_save-{}".format(str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))),
required=False, help="model save directory")
parser.add_argument("--save_last_snapshot", type=bool, default=False, required=False,
help="save model snapshot for last iteration")
parser.add_argument("--model_load_dir", type=str, default=None, help="model load directory")
parser.add_argument("--log_dir", type=str, default="./output", help="log info save directory")

# bert backbone
parser.add_argument('--do_lower_case', type=str2bool, nargs='?', const=True, default='True')
parser.add_argument("--seq_length", type=int, default=512)
parser.add_argument("--max_predictions_per_seq", type=int, default=80)
parser.add_argument("--num_hidden_layers", type=int, default=24)
parser.add_argument("--num_attention_heads", type=int, default=16)
parser.add_argument("--max_position_embeddings", type=int, default=512)
parser.add_argument("--type_vocab_size", type=int, default=2)
parser.add_argument("--vocab_size", type=int, default=30522)
parser.add_argument("--attention_probs_dropout_prob", type=float, default=0.1)
parser.add_argument("--hidden_dropout_prob", type=float, default=0.1)
parser.add_argument("--hidden_size_per_head", type=int, default=64)
parser.add_argument("--hidden_size", type=int, default=768)

return parser


def print_args(args):
print("=".ljust(66, "="))
print("Running {}: num_gpu_per_node = {}, num_nodes = {}.".format(
args.model, args.gpu_num_per_node, args.num_nodes))
print("=".ljust(66, "="))
for arg in vars(args):
print("{} = {}".format(arg, getattr(args, arg)))
print("-".ljust(66, "-"))
print("Time stamp: {}".format(
str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))))


if __name__ == '__main__':
parser = get_parser()
args = parser.parse_args()
print_args(args)

+ 121
- 0
model_compress/model_compress/distil/src/convert_bert_pytorch_checkpoint_to_original_tf.py View File

@@ -0,0 +1,121 @@
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint."""

import argparse
import os

import numpy as np
import tensorflow as tf
import torch

from transformers import BertModel,BertConfig
from modeling import TinyBertForSequenceClassification

def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name: str):

"""
:param model:BertModel Pytorch model instance to be converted
:param ckpt_dir: Tensorflow model directory
:param model_name: model name
:return:

Currently supported HF models:
Y BertModel
N BertForMaskedLM
N BertForPreTraining
N BertForMultipleChoice
N BertForNextSentencePrediction
N BertForSequenceClassification
N BertForQuestionAnswering
"""

tensors_to_transpose = ("dense.weight", "attention.self.query", "attention.self.key", "attention.self.value")

var_map = (
("layer.", "layer_"),
("word_embeddings.weight", "word_embeddings"),
("position_embeddings.weight", "position_embeddings"),
("token_type_embeddings.weight", "token_type_embeddings"),
(".", "/"),
("LayerNorm/weight", "LayerNorm/gamma"),
("LayerNorm/bias", "LayerNorm/beta"),
("weight", "kernel"),
("classifier", "classification-output"),
)

if not os.path.isdir(ckpt_dir):
os.makedirs(ckpt_dir)

state_dict = model.state_dict()
print('torch state_dict.keys(): ',state_dict.keys())
def to_tf_var_name(name: str):
for patt, repl in iter(var_map):
name = name.replace(patt, repl)
# return "bert/{}".format(name)
return "{}".format(name)

def create_tf_var(tensor: np.ndarray, name: str, session: tf.Session):
tf_dtype = tf.dtypes.as_dtype(tensor.dtype)
tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer())
session.run(tf.variables_initializer([tf_var]))
session.run(tf_var)
return tf_var

tf.reset_default_graph()
with tf.Session() as session:
for var_name in state_dict:
tf_name = to_tf_var_name(var_name)
torch_tensor = state_dict[var_name].numpy()
if any([x in var_name for x in tensors_to_transpose]):
torch_tensor = torch_tensor.T
tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session)
tf.keras.backend.set_value(tf_var, torch_tensor)
tf_weight = session.run(tf_var)
print("Successfully created {}: {}".format(tf_name, np.allclose(tf_weight, torch_tensor)))

saver = tf.train.Saver(tf.trainable_variables())
saver.save(session, os.path.join(ckpt_dir, 'bert_model' + ".ckpt"))


def main(raw_args=None):
parser = argparse.ArgumentParser()
parser.add_argument("--model_name", type=str, required=True, help="model name e.g. bert-base-uncased")
parser.add_argument(
"--cache_dir", type=str, default=None, required=False, help="Directory containing pytorch model"
)
parser.add_argument("--pytorch_model_path", type=str, required=True, help="/path/to/<pytorch-model-name>.bin")
parser.add_argument("--model_config_path", type=str, required=True, help="/path/to/<pytorch-model-name>")
parser.add_argument("--tf_cache_dir", type=str, required=True, help="Directory in which to save tensorflow model")
args = parser.parse_args(raw_args)

# model = BertModel.from_pretrained(
# pretrained_model_name_or_path=args.model_name,
# state_dict=torch.load(args.pytorch_model_path),
# cache_dir=args.cache_dir,
# )
num_labels=2
student_config = BertConfig.from_pretrained(args.model_config_path, num_labels=num_labels)

model = TinyBertForSequenceClassification.from_pretrained(args.model_config_path, config=student_config)
# model = BertModel.from_pretrained(
# state_dict=torch.load(args.pytorch_model_path)
# )
convert_pytorch_checkpoint_to_tf(model=model, ckpt_dir=args.tf_cache_dir, model_name=args.model_name)


if __name__ == "__main__":
main()

+ 90
- 0
model_compress/model_compress/distil/src/convert_tf_ckpt_to_of.py View File

@@ -0,0 +1,90 @@
"""
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
"""Convert tensorflow checkpoint to oneflow snapshot"""

import re
import argparse
import tensorflow as tf
import numpy as np
import os

parser = argparse.ArgumentParser()

## Required parameters
parser.add_argument("--tf_checkpoint_path",
default = None,
type = str,
required = True,
help = "Path the TensorFlow checkpoint path.")
parser.add_argument("--of_dump_path",
default = None,
type = str,
required = True,
help = "Path to the output OneFlow model.")

#args = parser.parse_args()
args, unknown = parser.parse_known_args()
print(args)

# parse unknown arguments for extra weights
extra_weights = {}
for u in unknown:
w = u.split("=")
assert len(w) == 2
if len(w) == 2:
extra_weights[w[0]] = float(w[1])


def _write_blob(folder, blob):
os.makedirs(folder, exist_ok=True)
filename = os.path.join(folder, "out")
f = open(filename, 'wb')
f.write(blob.tobytes())
f.close()
print(filename, blob.shape)

def _SaveWeightBlob2File(blob, folder):
_write_blob(folder, blob)

for weight, default_value in extra_weights.items():
d = np.full_like(blob, default_value)
_write_blob(folder + weight, d)

def convert():
path = args.tf_checkpoint_path
init_vars = tf.train.list_variables(path)
for name, shape in init_vars:
array = tf.train.load_variable(path, name)
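# OneFlow snapshot layout: one folder per variable, named "<op>-<blob>" with TF's '/' flattened to '-', holding a single binary file "out"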

sep = name.rfind('/')
blob_name = name[sep + 1:]
op_name = name[:sep].replace('/', '-')

if blob_name == "kernel":
blob_name = "weight"
elif blob_name in ['adam_m', 'adam_v']:
print("find m, v weights")

folder_name = op_name+"-"+blob_name
folder = os.path.join(args.of_dump_path, folder_name)
#print("saved to:", folder)

_SaveWeightBlob2File(array, folder)


if __name__ == "__main__":
convert()


+ 90
- 0
model_compress/model_compress/distil/src/convert_tf_ckpt_to_of_student.py View File

@@ -0,0 +1,90 @@
"""
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
"""Convert tensorflow checkpoint to oneflow snapshot"""

import re
import argparse
import tensorflow as tf
import numpy as np
import os

parser = argparse.ArgumentParser()

## Required parameters
parser.add_argument("--tf_checkpoint_path",
default = None,
type = str,
required = True,
help = "Path the TensorFlow checkpoint path.")
parser.add_argument("--of_dump_path",
default = None,
type = str,
required = True,
help = "Path to the output OneFlow model.")

#args = parser.parse_args()
args, unknown = parser.parse_known_args()
print(args)

# parse unknown arguments for extra weights
extra_weights = {}
for u in unknown:
w = u.split("=")
assert len(w) == 2
if len(w) == 2:
extra_weights[w[0]] = float(w[1])


def _write_blob(folder, blob):
os.makedirs(folder, exist_ok=True)
filename = os.path.join(folder, "out")
f = open(filename, 'wb')
f.write(blob.tobytes())
f.close()
print(filename, blob.shape)

def _SaveWeightBlob2File(blob, folder):
_write_blob(folder, blob)

for weight, default_value in extra_weights.items():
d = np.full_like(blob, default_value)
_write_blob(folder + weight, d)

def convert():
path = args.tf_checkpoint_path
init_vars = tf.train.list_variables(path)
for name, shape in init_vars:
array = tf.train.load_variable(path, name)
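# same layout as convert_tf_ckpt_to_of.py, but each variable folder is prefixed with "student-" so the weights load under the student scope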

sep = name.rfind('/')
blob_name = name[sep + 1:]
op_name = name[:sep].replace('/', '-')

if blob_name == "kernel":
blob_name = "weight"
elif blob_name in ['adam_m', 'adam_v']:
print("find m, v weights")

folder_name = 'student'+'-'+op_name+"-"+blob_name
folder = os.path.join(args.of_dump_path, folder_name)
#print("saved to:", folder)

_SaveWeightBlob2File(array, folder)


if __name__ == "__main__":
convert()


+ 141
- 0
model_compress/model_compress/distil/src/download_glue_data.py View File

@@ -0,0 +1,141 @@
''' Script for downloading all GLUE data.

Note: for legal reasons, we are unable to host MRPC.
You can either use the version hosted by the SentEval team, which is already tokenized,
or you can download the original data from (https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B-3604ED519838/MSRParaphraseCorpus.msi) and extract the data from it manually.
For Windows users, you can run the .msi file. For Mac and Linux users, consider an external library such as 'cabextract' (see below for an example).
You should then rename and place specific files in a folder (see below for an example).

mkdir MRPC
cabextract MSRParaphraseCorpus.msi -d MRPC
cat MRPC/_2DEC3DBE877E4DB192D17C0256E90F1D | tr -d $'\r' > MRPC/msr_paraphrase_train.txt
cat MRPC/_D7B391F9EAFF4B1B8BCE8F21B20B1B61 | tr -d $'\r' > MRPC/msr_paraphrase_test.txt
rm MRPC/_*
rm MSRParaphraseCorpus.msi

1/30/19: It looks like SentEval is no longer hosting their extracted and tokenized MRPC data, so you'll need to download the data from the original source for now.
2/11/19: It looks like SentEval actually *is* hosting the extracted data. Hooray!
'''
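
# Example invocation (both flags are optional; the values shown are the defaults):
#   python3 download_glue_data.py --data_dir glue_data --tasks all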

import os
import sys
import shutil
import argparse
import tempfile
import urllib.request
import zipfile

TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "SNLI", "QNLI", "RTE", "WNLI", "diagnostic"]
TASK2PATH = {"CoLA":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4',
"SST":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8',
"MRPC":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc',
"QQP":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQQP.zip?alt=media&token=700c6acf-160d-4d89-81d1-de4191d02cb5',
"STS":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSTS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5',
"MNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce',
"SNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSNLI.zip?alt=media&token=4afcfbb2-ff0c-4b2d-a09a-dbf07926f4df',
"QNLI": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLIv2.zip?alt=media&token=6fdcf570-0fc5-4631-8456-9505272d1601',
"RTE":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FRTE.zip?alt=media&token=5efa7e85-a0bb-4f19-8ea2-9e1840f077fb',
"WNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf',
"diagnostic":'https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D'}

MRPC_TRAIN = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt'
MRPC_TEST = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt'

def download_and_extract(task, data_dir):
print("Downloading and extracting %s..." % task)
data_file = "%s.zip" % task
urllib.request.urlretrieve(TASK2PATH[task], data_file)
with zipfile.ZipFile(data_file) as zip_ref:
zip_ref.extractall(data_dir)
os.remove(data_file)
print("\tCompleted!")

def format_mrpc(data_dir, path_to_data):
print("Processing MRPC...")
mrpc_dir = os.path.join(data_dir, "MRPC")
if not os.path.isdir(mrpc_dir):
os.mkdir(mrpc_dir)
if path_to_data:
mrpc_train_file = os.path.join(path_to_data, "msr_paraphrase_train.txt")
mrpc_test_file = os.path.join(path_to_data, "msr_paraphrase_test.txt")
else:
print("Local MRPC data not specified, downloading data from %s" % MRPC_TRAIN)
mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
urllib.request.urlretrieve(MRPC_TRAIN, mrpc_train_file)
urllib.request.urlretrieve(MRPC_TEST, mrpc_test_file)
assert os.path.isfile(mrpc_train_file), "Train data not found at %s" % mrpc_train_file
assert os.path.isfile(mrpc_test_file), "Test data not found at %s" % mrpc_test_file
urllib.request.urlretrieve(TASK2PATH["MRPC"], os.path.join(mrpc_dir, "dev_ids.tsv"))

dev_ids = []
with open(os.path.join(mrpc_dir, "dev_ids.tsv"), encoding="utf8") as ids_fh:
for row in ids_fh:
dev_ids.append(row.strip().split('\t'))

with open(mrpc_train_file, encoding="utf8") as data_fh, \
open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding="utf8") as train_fh, \
open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding="utf8") as dev_fh:
header = data_fh.readline()
train_fh.write(header)
dev_fh.write(header)
for row in data_fh:
label, id1, id2, s1, s2 = row.strip().split('\t')
if [id1, id2] in dev_ids:
dev_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
else:
train_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))

with open(mrpc_test_file, encoding="utf8") as data_fh, \
open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding="utf8") as test_fh:
header = data_fh.readline()
test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
for idx, row in enumerate(data_fh):
label, id1, id2, s1, s2 = row.strip().split('\t')
test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))
print("\tCompleted!")

def download_diagnostic(data_dir):
print("Downloading and extracting diagnostic...")
if not os.path.isdir(os.path.join(data_dir, "diagnostic")):
os.mkdir(os.path.join(data_dir, "diagnostic"))
data_file = os.path.join(data_dir, "diagnostic", "diagnostic.tsv")
urllib.request.urlretrieve(TASK2PATH["diagnostic"], data_file)
print("\tCompleted!")
return

def get_tasks(task_names):
task_names = task_names.split(',')
if "all" in task_names:
tasks = TASKS
else:
tasks = []
for task_name in task_names:
assert task_name in TASKS, "Task %s not found!" % task_name
tasks.append(task_name)
return tasks

def main(arguments):
parser = argparse.ArgumentParser()
parser.add_argument('--data_dir', help='directory to save data to', type=str, default='glue_data')
parser.add_argument('--tasks', help='tasks to download data for as a comma separated string',
type=str, default='all')
parser.add_argument('--path_to_mrpc', help='path to directory containing extracted MRPC data, msr_paraphrase_train.txt and msr_paraphrase_test.txt',
type=str, default='')
args = parser.parse_args(arguments)

if not os.path.isdir(args.data_dir):
os.mkdir(args.data_dir)
tasks = get_tasks(args.tasks)

for task in tasks:
if task == 'MRPC':
format_mrpc(args.data_dir, args.path_to_mrpc)
elif task == 'diagnostic':
download_diagnostic(args.data_dir)
else:
download_and_extract(task, args.data_dir)


if __name__ == '__main__':
sys.exit(main(sys.argv[1:]))
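# Example usage (a minimal sketch; the script name and local paths are assumptions, the
# flags are the ones defined in main() above):
#   python download_glue_data.py --data_dir glue_data --tasks all
#   python download_glue_data.py --data_dir glue_data --tasks MRPC --path_to_mrpc /path/to/MRPC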

+ 793
- 0
model_compress/model_compress/distil/src/glue_ofrecord/glue_process.py View File

@@ -0,0 +1,793 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import oneflow.core.record.record_pb2 as of_record
import collections
import csv
import os
import tokenization
import logging_setup
import struct
from parse_args import parse_args

# import pandas as pd
# import tensorflow as tf


if __name__ == '__main__':
logger = logging_setup.setup_logger(__name__)
else:
logger = logging_setup.setup_multiprocessing_logger()


class InputExample(object):
"""A single training/test example for simple sequence classification."""

def __init__(self, guid, text_a, text_b=None, label=None):
"""Constructs a InputExample.
Args:
guid: Unique id for the example.
text_a: string. The untokenized text of the first sequence. For single
sequence tasks, only this sequence must be specified.
text_b: (Optional) string. The untokenized text of the second sequence.
Only must be specified for sequence pair tasks.
label: (Optional) string. The label of the example. This should be
specified for train and dev examples, but not for test examples.
"""
self.guid = guid
self.text_a = text_a
self.text_b = text_b
self.label = label


class PaddingInputExample(object):
"""Fake example so the num input examples is a multiple of the batch size.
When running eval/predict on the TPU, we need to pad the number of examples
to be a multiple of the batch size, because the TPU requires a fixed batch
size. The alternative is to drop the last batch, which is bad because it means
the entire output data won't be generated.
We use this class instead of `None` because treating `None` as padding
batches could cause silent errors.
"""


class InputFeatures(object):
"""A single set of features of data."""

def __init__(self,
input_ids,
input_mask,
segment_ids,
label_id,
is_real_example=True):
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.label_id = label_id
self.is_real_example = is_real_example


class DataProcessor(object):
"""Base class for data converters for sequence classification data sets."""

def get_train_examples(self, data_dir):
"""Gets a collection of `InputExample`s for the train set."""
raise NotImplementedError()

def get_dev_examples(self, data_dir):
"""Gets a collection of `InputExample`s for the dev set."""
raise NotImplementedError()

def get_test_examples(self, data_dir):
"""Gets a collection of `InputExample`s for prediction."""
raise NotImplementedError()

def get_labels(self):
"""Gets the list of labels for this data set."""
raise NotImplementedError()

@classmethod
def _read_tsv(cls, input_file, quotechar=None):
"""Reads a tab separated value file."""
with open(input_file, "r", encoding='utf-8') as f:
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
lines = []
for line in reader:
lines.append(line)
return lines


class XnliProcessor(DataProcessor):
"""Processor for the XNLI data set."""

def __init__(self):
self.language = "zh"

def get_train_examples(self, data_dir):
"""See base class."""
lines = self._read_tsv(
os.path.join(data_dir, "multinli",
"multinli.train.%s.tsv" % self.language))
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "train-%d" % (i)
text_a = tokenization.convert_to_unicode(line[0])
text_b = tokenization.convert_to_unicode(line[1])
label = tokenization.convert_to_unicode(line[2])
if label == tokenization.convert_to_unicode("contradictory"):
label = tokenization.convert_to_unicode("contradiction")
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples

def get_dev_examples(self, data_dir):
"""See base class."""
lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "dev-%d" % (i)
language = tokenization.convert_to_unicode(line[0])
if language != tokenization.convert_to_unicode(self.language):
continue
text_a = tokenization.convert_to_unicode(line[6])
text_b = tokenization.convert_to_unicode(line[7])
label = tokenization.convert_to_unicode(line[1])
examples.append(
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples

def get_labels(self):
"""See base class."""
return ["contradiction", "entailment", "neutral"]



class MrpcProcessor(DataProcessor):
"""Processor for the MRPC data set (GLUE version)."""

def get_example_from_tensor_dict(self, tensor_dict):
"""See base class."""
return InputExample(
tensor_dict["idx"].numpy(),
tensor_dict["sentence1"].numpy().decode("utf-8"),
tensor_dict["sentence2"].numpy().decode("utf-8"),
str(tensor_dict["label"].numpy()),
)

def get_train_examples(self, data_dir):
"""See base class."""
logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv")))
return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

def get_aug_examples(self, data_dir):
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug")

def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

def get_labels(self):
"""See base class."""
return ["0", "1"]

def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, i)
text_a = line[3]
text_b = line[4]
label = line[0]
examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples


class MnliProcessor(DataProcessor):
"""Processor for the MultiNLI data set (GLUE version)."""

def get_example_from_tensor_dict(self, tensor_dict):
"""See base class."""
return InputExample(
tensor_dict["idx"].numpy(),
tensor_dict["premise"].numpy().decode("utf-8"),
tensor_dict["hypothesis"].numpy().decode("utf-8"),
str(tensor_dict["label"].numpy()),
)

def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched")

def get_aug_examples(self, data_dir):
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug")

def get_labels(self):
"""See base class."""
return ["contradiction", "entailment", "neutral"]

def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, line[0])
text_a = line[8]
text_b = line[9]
label = line[-1]
examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples


class MnliMismatchedProcessor(MnliProcessor):
"""Processor for the MultiNLI Mismatched data set (GLUE version)."""

def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_matched")


class ColaProcessor(DataProcessor):
"""Processor for the CoLA data set (GLUE version)."""

def get_example_from_tensor_dict(self, tensor_dict):
"""See base class."""
return InputExample(
tensor_dict["idx"].numpy(),
tensor_dict["sentence"].numpy().decode("utf-8"),
None,
str(tensor_dict["label"].numpy()),
)

def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

def get_aug_examples(self, data_dir):
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug")

def get_labels(self):
"""See base class."""
return ["0", "1"]

def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
guid = "%s-%s" % (set_type, i)
text_a = line[3]
label = line[1]
examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
return examples


class Sst2Processor(DataProcessor):
"""Processor for the SST-2 data set (GLUE version)."""

def get_example_from_tensor_dict(self, tensor_dict):
"""See base class."""
return InputExample(
tensor_dict["idx"].numpy(),
tensor_dict["sentence"].numpy().decode("utf-8"),
None,
str(tensor_dict["label"].numpy()),
)

def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

def get_test_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

def get_aug_examples(self, data_dir):
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug")

def get_labels(self):
"""See base class."""
return ["0", "1"]

def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, i)
text_a = line[0]
label = line[1]
examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
return examples


class StsbProcessor(DataProcessor):
"""Processor for the STS-B data set (GLUE version)."""

def get_example_from_tensor_dict(self, tensor_dict):
"""See base class."""
return InputExample(
tensor_dict["idx"].numpy(),
tensor_dict["sentence1"].numpy().decode("utf-8"),
tensor_dict["sentence2"].numpy().decode("utf-8"),
str(tensor_dict["label"].numpy()),
)

def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

def get_aug_examples(self, data_dir):
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug")

def get_labels(self):
"""See base class."""
return [None]

def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, line[0])
text_a = line[7]
text_b = line[8]
label = line[-1]
examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples


class QqpProcessor(DataProcessor):
"""Processor for the QQP data set (GLUE version)."""

def get_example_from_tensor_dict(self, tensor_dict):
"""See base class."""
return InputExample(
tensor_dict["idx"].numpy(),
tensor_dict["question1"].numpy().decode("utf-8"),
tensor_dict["question2"].numpy().decode("utf-8"),
str(tensor_dict["label"].numpy()),
)

def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

def get_aug_examples(self, data_dir):
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug")

def get_labels(self):
"""See base class."""
return ["0", "1"]

def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, line[0])
try:
text_a = line[3]
text_b = line[4]
label = line[5]
except IndexError:
continue
examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples


class QnliProcessor(DataProcessor):
"""Processor for the QNLI data set (GLUE version)."""

def get_example_from_tensor_dict(self, tensor_dict):
"""See base class."""
return InputExample(
tensor_dict["idx"].numpy(),
tensor_dict["question"].numpy().decode("utf-8"),
tensor_dict["sentence"].numpy().decode("utf-8"),
str(tensor_dict["label"].numpy()),
)

def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev_matched")

def get_aug_examples(self, data_dir):
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug")

def get_labels(self):
"""See base class."""
return ["entailment", "not_entailment"]

def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, line[0])
text_a = line[1]
text_b = line[2]
label = line[-1]
examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples


class RteProcessor(DataProcessor):
"""Processor for the RTE data set (GLUE version)."""

def get_example_from_tensor_dict(self, tensor_dict):
"""See base class."""
return InputExample(
tensor_dict["idx"].numpy(),
tensor_dict["sentence1"].numpy().decode("utf-8"),
tensor_dict["sentence2"].numpy().decode("utf-8"),
str(tensor_dict["label"].numpy()),
)

def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

def get_aug_examples(self, data_dir):
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "aug")

def get_labels(self):
"""See base class."""
return ["entailment", "not_entailment"]

def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, line[0])
text_a = line[1]
text_b = line[2]
label = line[-1]
examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples


class WnliProcessor(DataProcessor):
"""Processor for the WNLI data set (GLUE version)."""

def get_example_from_tensor_dict(self, tensor_dict):
"""See base class."""
return InputExample(
tensor_dict["idx"].numpy(),
tensor_dict["sentence1"].numpy().decode("utf-8"),
tensor_dict["sentence2"].numpy().decode("utf-8"),
str(tensor_dict["label"].numpy()),
)

def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

def get_labels(self):
"""See base class."""
return ["0", "1"]

def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, line[0])
text_a = line[1]
text_b = line[2]
label = line[-1]
examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""

# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
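# A worked example of the heuristic above (token lists and max_length are illustrative):
#   tokens_a = ["the", "quick", "brown", "fox"]; tokens_b = ["jumps"]
#   _truncate_seq_pair(tokens_a, tokens_b, 4)
#   # pops from the longer list until the total length is <= 4:
#   # tokens_a == ["the", "quick", "brown"], tokens_b == ["jumps"]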


def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer):
"""Converts a single `InputExample` into a single `InputFeatures`."""

if isinstance(example, PaddingInputExample):
return InputFeatures(
input_ids=[0] * max_seq_length,
input_mask=[0] * max_seq_length,
segment_ids=[0] * max_seq_length,
label_id=0,
is_real_example=False)

label_map = {}
for (i, label) in enumerate(label_list):
label_map[label] = i

tokens_a = tokenizer.tokenize(example.text_a)
tokens_b = None
if example.text_b:
tokens_b = tokenizer.tokenize(example.text_b)

if tokens_b:
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
else:
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > max_seq_length - 2:
tokens_a = tokens_a[0:(max_seq_length - 2)]

# The convention in BERT is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens = []
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in tokens_a:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)

if tokens_b:
for token in tokens_b:
tokens.append(token)
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)

input_ids = tokenizer.convert_tokens_to_ids(tokens)

# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)

# Zero-pad up to the sequence length.
while len(input_ids) < max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)

assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length

label_id = label_map[example.label]
if ex_index < 5:
logger.info("*** Example ***")
logger.info("guid: %s" % (example.guid))
logger.info("tokens: %s" % " ".join([tokenization.printable_text(x) for x in tokens]))
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
logger.info("label: %s (id = %d)" % (example.label, label_id))

feature = InputFeatures(
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
label_id=label_id,
is_real_example=True)
return feature


def file_based_convert_examples_to_features(
examples, label_list, max_seq_length, tokenizer, output_file):
"""Convert a set of `InputExample`s to a TFRecord file."""

# writer = tf.python_io.TFRecordWriter(output_file)
writer = open(output_file, 'ab')

total_written = 0
for (ex_index, example) in enumerate(examples):
if ex_index % 10000 == 0:
logger.info("Writing example %d of %d" % (ex_index, len(examples)))

feature = convert_single_example(ex_index, example, label_list,
max_seq_length, tokenizer)

def create_int32_feature(values):
return of_record.Feature(int32_list=of_record.Int32List(value=values))

sample = of_record.OFRecord(
feature={
"input_ids": create_int32_feature(feature.input_ids),
"input_mask": create_int32_feature(feature.input_mask),
"segment_ids": create_int32_feature(feature.segment_ids),
"label_ids": create_int32_feature([feature.label_id]),
"is_real_example": create_int32_feature([int(feature.is_real_example)])
}
)

writer.write(struct.pack("q", sample.ByteSize()))
writer.write(sample.SerializeToString())
if ex_index % 10000 == (len(examples) - 1) % 10000:
logger.info('Wrote instances %d/%d to "%s"', ex_index, len(examples), output_file)

total_written += 1

writer.close()
logger.info('Wrote a total of %d instances to output file "%s"', total_written, output_file)
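# A minimal sketch of reading one record back from the file written above, assuming the
# length-prefixed framing used by this writer (an int64 byte count followed by the
# serialized OFRecord message); the file name is illustrative:
#   with open("train.of_record-0", "rb") as f:
#       size = struct.unpack("q", f.read(8))[0]
#       record = of_record.OFRecord()
#       record.ParseFromString(f.read(size))
#       print(record.feature["input_ids"].int32_list.value[:10])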


def glue_process(args):
processors = {
"xnli": XnliProcessor,
"cola": ColaProcessor,
"mnli": MnliProcessor,
"mnli-mm": MnliMismatchedProcessor,
"mrpc": MrpcProcessor,
"sst-2": Sst2Processor,
"sts-b": StsbProcessor,
"qqp": QqpProcessor,
"qnli": QnliProcessor,
"rte": RteProcessor,
"wnli": WnliProcessor,
}

if not args.do_train and not args.do_eval and not args.do_predict:
raise ValueError(
"At least one of `do_train`, `do_eval` or `do_predict' or `aug_train' must be True.")

os.makedirs(args.output_dir, exist_ok=True)

task_name = args.task_name.lower()

if task_name not in processors:
raise ValueError("Task not found: %s" % (task_name))

processor = processors[task_name]()

label_list = processor.get_labels()

tokenizer = tokenization.FullTokenizer(
vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

total_examples = {}
if args.do_train:
train_examples = processor.get_train_examples(args.data_dir)
total_examples['train'] = len(train_examples)
os.makedirs(os.path.join(args.output_dir,'train'), exist_ok=True)

train_file = os.path.join(args.output_dir,'train', "train.of_record-0")
file_based_convert_examples_to_features(
train_examples, label_list, args.max_seq_length, tokenizer, train_file)

if args.do_eval:
eval_examples = processor.get_dev_examples(args.data_dir)
total_examples['eval'] = len(eval_examples)

os.makedirs(os.path.join(args.output_dir,'eval'), exist_ok=True)

eval_file = os.path.join(args.output_dir,'eval' ,"eval.of_record-0")
file_based_convert_examples_to_features(
eval_examples, label_list, args.max_seq_length, tokenizer, eval_file)

if args.do_predict:
try:
predict_examples = processor.get_test_examples(args.data_dir)
total_examples['test'] = len(predict_examples)

os.makedirs(os.path.join(args.output_dir,'test'), exist_ok=True)

predict_file = os.path.join(args.output_dir,'test' ,"predict.of_record-0")
file_based_convert_examples_to_features(predict_examples, label_list, args.max_seq_length, tokenizer,
predict_file)
except Exception as e:
print(e)
if args.aug_train:
train_aug_examples = processor.get_aug_examples(args.data_dir)
os.makedirs(os.path.join(args.output_dir,'train'), exist_ok=True)

train_aug_file = os.path.join(args.output_dir,'train' ,"train_aug.of_record-0")
file_based_convert_examples_to_features(train_aug_examples, label_list, args.max_seq_length, tokenizer,
train_aug_file)
print('task_name:',task_name)
print(total_examples)

def main():
args = parse_args()
glue_process(args)


if __name__ == '__main__':
main()

+ 339
- 0
model_compress/model_compress/distil/src/glue_ofrecord/glue_process_lstm.py View File

@@ -0,0 +1,339 @@
# coding: utf-8
import os
import numpy as np
import pickle as pkl
from tqdm import tqdm
import time
from datetime import timedelta
import csv
import sys
import codecs
import logging_setup
import struct
from parse_args import parse_args
import oneflow.core.record.record_pb2 as of_record

MAX_VOCAB_SIZE = 10000  # maximum vocabulary size
UNK, PAD = '<UNK>', '<PAD>'  # unknown token and padding token

if __name__ == '__main__':
logger = logging_setup.setup_logger(__name__)
else:
logger = logging_setup.setup_multiprocessing_logger()

def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()

def SST2_Processor(path):
examples = []
with open(path, 'r', encoding='UTF-8') as f:
i=0
for line in tqdm(f):
if i==0:
i += 1
continue
try:
lin = line.strip()
if not lin:
continue
text_a, label = lin.split('\t')
text_b = None
examples.append([text_a, text_b, label])

except Exception as e:
print(e)
return examples

def CoLA_Processor(path):
examples = []
with open(path, 'r', encoding='UTF-8') as f:
i=0
for line in tqdm(f):
try:
lin = line.strip().split('\t')
if not lin:
continue
text_a = lin[3]
text_b = None
label = lin[1]

examples.append([text_a, text_b, label])
except Exception as e:
print(e)
return examples

def QQP_Processor(path):
examples = []
with open(path, 'r', encoding='UTF-8') as f:
i=0
for line in tqdm(f):
if i==0:
i += 1
continue
try:
lin = line.strip().split('\t')
if not lin:
continue
text_a = lin[3]
text_b = lin[4]
label = lin[5]

examples.append([text_a,text_b,label])
except Exception as e:
print(e)
return examples

def RTE_Processor(path):
examples = []
with open(path, 'r', encoding='UTF-8') as f:
i=0
for line in tqdm(f):
if i==0:
i += 1
continue
try:
lin = line.strip().split('\t')
if not lin:
continue
text_a = lin[1]
text_b = lin[2]
label = lin[-1]

examples.append([text_a,text_b,label])
except Exception as e:
print(e)
return examples

def MRPC_Processor(path):
examples = []
with open(path, 'r', encoding='UTF-8') as f:
i=0
for line in tqdm(f):
if i==0:
i += 1
continue
try:
lin = line.strip().split('\t')
if not lin:
continue
text_a = lin[3]
text_b = lin[4]
label = lin[0]
examples.append([text_a,text_b,label])
except Exception as e:
print(e)
return examples

def convert_single_example(examples,tokenizer, pad_size, vocab):
contents = []
for example in examples:
text_a = example[0]
text_b = example[1]
label = example[2]

words_line = []
tokens_a = tokenizer(text_a)

if text_b:
tokens_b = tokenizer(text_b)
_truncate_seq_pair(tokens_a, tokens_b, pad_size - 1)
token = tokens_a + [PAD] + tokens_b
else:
token = tokens_a

seq_len = len(token)
if pad_size:
if len(token) < pad_size:
token.extend([PAD] * (pad_size - len(token)))
else:
token = token[:pad_size]
seq_len = pad_size
# word to id
for word in token:
words_line.append(vocab.get(word, vocab.get(UNK)))
contents.append((words_line, label, seq_len))
return contents

def build_vocab(dataset, file_path, tokenizer, max_size, min_freq):
vocab_dic = {}
if dataset == 'SST-2':
examples = SST2_Processor(file_path)
elif dataset == 'CoLA':
examples = CoLA_Processor(file_path)
elif dataset == 'MRPC':
examples = MRPC_Processor(file_path)
elif dataset == 'QQP':
examples = QQP_Processor(file_path)
elif dataset == 'RTE':
examples = RTE_Processor(file_path)
else:
print('Error: dataset %s is not supported' % dataset)

print('Building vocab ...')
for example in tqdm(examples):
text_a = example[0]
text_b = example[1]
if text_b:
text = text_a + text_b
else:
text = text_a
for word in tokenizer(text):
vocab_dic[word] = vocab_dic.get(word, 0) + 1
vocab_list = sorted([_ for _ in vocab_dic.items() if _[1] >= min_freq], key=lambda x: x[1], reverse=True)[:max_size]
vocab_dic = {word_count[0]: idx for idx, word_count in enumerate(vocab_list)}
vocab_dic.update({UNK: len(vocab_dic), PAD: len(vocab_dic) + 1})
return vocab_dic
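# A minimal usage sketch (the data path is illustrative): build a word-level vocabulary
# from the SST-2 training file and inspect its size.
#   tokenizer = lambda x: x.split(' ')
#   vocab = build_vocab('SST-2', '../../data/glue_data/SST-2/train.tsv', tokenizer,
#                       max_size=MAX_VOCAB_SIZE, min_freq=1)
#   # vocab maps each word to an integer id, with UNK and PAD appended at the end
#   print(len(vocab))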


def build_dataset(dataset, config, ues_word):
if ues_word:
tokenizer = lambda x: x.split(' ')  # split on spaces, word-level
else:
tokenizer = lambda x: [y for y in x] # char-level
if os.path.exists(config.vocab_path):
vocab = pkl.load(open(config.vocab_path, 'rb'))
else:
vocab = build_vocab(dataset, config.train_path, tokenizer=tokenizer, max_size=MAX_VOCAB_SIZE, min_freq=1)
pkl.dump(vocab, open(config.vocab_path, 'wb'))
print(f"Vocab size: {len(vocab)}")

def load_dataset(dataset, tokenizer, path, pad_size=32):
if dataset=='SST-2':
examples = SST2_Processor(path)
elif dataset=='CoLA':
examples = CoLA_Processor(path)
elif dataset=='MRPC':
examples = MRPC_Processor(path)
elif dataset=='QQP':
examples = QQP_Processor(path)
elif dataset == 'RTE':
examples = RTE_Processor(path)
else:
print('Error: dataset %s is not supported' % dataset)
contents = convert_single_example(examples,tokenizer,pad_size,vocab)
return contents

train = load_dataset(dataset,tokenizer, config.train_path, config.pad_size)
dev = load_dataset(dataset,tokenizer, config.dev_path, config.pad_size)
# test = load_dataset(config.test_path, config.pad_size)
return vocab, train, dev


def get_time_dif(start_time):
"""获取已使用时间"""
end_time = time.time()
time_dif = end_time - start_time
return timedelta(seconds=int(round(time_dif)))

def file_based_convert_examples_to_features(
examples, label_list, max_seq_length, output_file):
"""Convert a set of `InputExample`s to a TFRecord file."""

# writer = tf.python_io.TFRecordWriter(output_file)
writer = open(output_file, 'ab')

total_written = 0
for (ex_index, example) in enumerate(examples):
if ex_index % 10000 == 0:
logger.info("Writing example %d of %d" % (ex_index, len(examples)))

label_map = {}
for (i, label) in enumerate(label_list):
label_map[label] = i

input_mask = [1] * example[2] + [0] * (max_seq_length - example[2])
segment_ids = [1] * max_seq_length
assert len(input_mask)==max_seq_length

label_id = label_map[example[1]]
def create_int32_feature(values):
return of_record.Feature(int32_list=of_record.Int32List(value=values))

sample = of_record.OFRecord(
feature={
"input_ids": create_int32_feature(example[0]),
"input_mask": create_int32_feature(input_mask),
"segment_ids": create_int32_feature(segment_ids),
"label_ids": create_int32_feature([label_id]),
"is_real_example": create_int32_feature([int(True)])
}
)

writer.write(struct.pack("q", sample.ByteSize()))
writer.write(sample.SerializeToString())
if ex_index % 10000 == (len(examples) - 1) % 10000:
logger.info('Wrote instances %d/%d to "%s"', ex_index, len(examples), output_file)

total_written += 1

writer.close()
logger.info('Wrote a total of %d instances to output file "%s"', total_written, output_file)

class Config(object):
vocab_path = ''
train_path = ''
dev_path = ''
pad_size = 32

if __name__ == "__main__":
'''Build the vocabulary and convert the GLUE data into OFRecord files for the LSTM student.'''
# Change the directories and file names below as needed.
config = Config()
dataset = "MRPC"
train_dir = "../../data/glue_data/{}/train.tsv".format(dataset)
dev_dir = "../../data/glue_data/{}/dev.tsv".format(dataset)

vocab_dir = "../../data/glue_ofrecord/{}_lstm_32".format(dataset)
pretrain_dir = ""
emb_dim = 300
if os.path.exists(os.path.join(vocab_dir,'vocab.pkl')):
word_to_id = pkl.load(open(os.path.join(vocab_dir,'vocab.pkl'), 'rb'))
else:
tokenizer = lambda x: x.split(' ')  # build the vocabulary at word level (words in the dataset are separated by spaces)
# tokenizer = lambda x: [y for y in x]  # build the vocabulary at character level
word_to_id = build_vocab(dataset, train_dir, tokenizer=tokenizer, max_size=MAX_VOCAB_SIZE, min_freq=1)
os.makedirs(vocab_dir, exist_ok=True)
pkl.dump(word_to_id, open(os.path.join(vocab_dir,'vocab.pkl'), 'wb'))

# print(word_to_id)
# print(len(word_to_id))

output_dir = '../../data/glue_ofrecord/{}_lstm_32'.format(dataset)
total_examples = {}
max_seq_length= 32
config.vocab_path = os.path.join(vocab_dir,'vocab.pkl')
config.train_path = train_dir
config.dev_path = dev_dir
config.pad_size = max_seq_length
if dataset == 'RTE':
label_list = ["entailment", "not_entailment"]
elif dataset in ['SST-2', 'MRPC', 'QQP', 'CoLA']:
label_list = ["0", "1"]
elif dataset == 'MNLI':
label_list = ["contradiction", "entailment", "neutral"]
else:
print('Error: dataset %s is not supported' % dataset)

# print(config.vocab_path)
_,train_dataset,dev_dataset = build_dataset(dataset=dataset, config=config,ues_word='True')
# print(dev_dataset[0])

os.makedirs(os.path.join(output_dir, 'eval'), exist_ok=True)
dev_file = os.path.join(output_dir, 'eval', "eval.of_record-0")
file_based_convert_examples_to_features(dev_dataset,label_list,config.pad_size,dev_file)

os.makedirs(os.path.join(output_dir, 'train'), exist_ok=True)
train_file = os.path.join(output_dir, 'train', "train.of_record-0")
file_based_convert_examples_to_features(train_dataset,label_list,config.pad_size,train_file)

+ 15
- 0
model_compress/model_compress/distil/src/glue_ofrecord/list_files.py View File

@@ -0,0 +1,15 @@
import os

def list_files(path, filename=None):
fullname = path
if filename:
fullname = os.path.join(path, filename)
files = []
if os.path.isfile(fullname):
return [fullname]
elif os.path.isdir(fullname):
for sub in os.listdir(fullname):
files.extend(list_files(fullname, sub))
return files
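# A minimal usage sketch (the directory is illustrative): recursively collect every file
# under an OFRecord output directory.
#   print(list_files("../../data/glue_ofrecord/MRPC_lstm_32"))
#   # -> e.g. ['../../data/glue_ofrecord/MRPC_lstm_32/train/train.of_record-0', ...]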

+ 31
- 0
model_compress/model_compress/distil/src/glue_ofrecord/logging_setup.py View File

@@ -0,0 +1,31 @@
import sys
import logging
import multiprocessing

def setup_logger(name):
# Manually clear root loggers to prevent any module that may have called
# logging.basicConfig() from blocking our logging setup
# logging.root.handlers = []
FORMAT = '%(asctime)s [%(levelname)s] %(filename)s:%(lineno)d: %(message)s'
# DATEFMT = '%Y-%m-%d,%H:%M:%S.%f'
logging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout)
logger = logging.getLogger(name)
return logger

def setup_multiprocessing_logger():
logger = multiprocessing.get_logger()
logger.setLevel(logging.INFO)
if not logger.handlers:
logger._rudimentary_setup = True
logfile = sys.__stdout__
if hasattr(logfile, "write"):
handler = logging.StreamHandler(logfile)
else:
handler = logging.FileHandler(logfile)
# formatter = logging.Formatter('%(asctime)s [%(levelname)s] [%(processName)s] '
# '%(filename)s:%(lineno)d: %(message)s')
formatter = logging.Formatter('%(asctime)s [%(levelname)s] [%(processName)s] %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)

return logger

+ 71
- 0
model_compress/model_compress/distil/src/glue_ofrecord/parse_args.py View File

@@ -0,0 +1,71 @@
import argparse


def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument(
'--data_dir',
help='Input glue task directories.',
default=None,
type=str,
required=True
)
parser.add_argument(
'--output_dir',
help='Output the directory of oneflow record files.',
default=None,
type=str,
required=True
)
parser.add_argument(
'--vocab_file',
help='The vocabulary file that the BERT model was trained on.',
default=None,
type=str,
required=True
)
parser.add_argument(
'--do_lower_case',
help='Whether to lower case the input text. Should be True for uncased '
'models and False for cased models.',
default=None,
type=bool
)
parser.add_argument(
'--max_seq_length',
help='Maximum sequence length.',
default=128,
type=int
)
parser.add_argument(
'--do_train',
help='Whether to process the training data',
default=None,
type=bool
)
parser.add_argument(
'--do_eval',
help='Whether to process the validation data',
default=None,
type=bool
)
parser.add_argument(
'--do_predict',
help='Whether to process the prediction data',
default=None,
type=bool
)
parser.add_argument(
'--aug_train',
help='Whether to process the augmented training data',
default=None,
type=bool
)
parser.add_argument(
'--task_name',
help='The task of glue to be processed',
default='cola',
type=str
)

return parser.parse_args()
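# Example invocation of glue_process.py, which consumes these arguments (a sketch; the
# data, output and vocabulary paths are assumptions):
#   python glue_process.py \
#       --data_dir ../../data/glue_data/MRPC \
#       --output_dir ../../data/glue_ofrecord/MRPC \
#       --vocab_file vocab.txt \
#       --do_lower_case True --max_seq_length 128 \
#       --do_train True --do_eval True --do_predict True \
#       --task_name mrpc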

+ 347
- 0
model_compress/model_compress/distil/src/glue_ofrecord/tokenization.py View File

@@ -0,0 +1,347 @@
# coding=utf-8
# Copyright 2018 Oneflow Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import unicodedata
import six


def convert_to_unicode(text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text.decode("utf-8", "ignore")
elif isinstance(text, unicode):
return text
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")


def printable_text(text):
"""Returns text encoded in a way suitable for print."""

# These functions want `str` for both Python2 and Python3, but in one case
# it's a Unicode string and in the other it's a byte string.
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text
elif isinstance(text, unicode):
return text.encode("utf-8")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")


def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
index = 0
with open(vocab_file, "r", encoding='utf-8') as reader:
while True:
token = convert_to_unicode(reader.readline())
if not token:
break
token = token.strip()
vocab[token] = index
index += 1
return vocab


def convert_by_vocab(vocab, items):
"""Converts a sequence of [tokens|ids] using the vocab."""
output = []
for item in items:
output.append(vocab[item])
return output


def convert_tokens_to_ids(vocab, tokens):
return convert_by_vocab(vocab, tokens)


def convert_ids_to_tokens(inv_vocab, ids):
return convert_by_vocab(inv_vocab, ids)


def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a peice of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens


class FullTokenizer(object):
"""Runs end-to-end tokenziation."""

def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

def tokenize(self, text):
split_tokens = []
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)

return split_tokens

def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)

def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)
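# A minimal usage sketch (the vocabulary path is illustrative and the output assumes the
# corresponding word pieces exist in the vocabulary):
#   tokenizer = FullTokenizer(vocab_file="vocab.txt", do_lower_case=True)
#   tokens = tokenizer.tokenize("unaffable")        # e.g. ["un", "##aff", "##able"]
#   ids = tokenizer.convert_tokens_to_ids(tokens)   # corresponding vocabulary ids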


class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

def __init__(self, do_lower_case=True):
"""Constructs a BasicTokenizer.

Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case

def tokenize(self, text):
"""Tokenizes a piece of text."""
text = convert_to_unicode(text)
text = self._clean_text(text)

# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
text = self._tokenize_chinese_chars(text)

orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))

output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens

def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)

def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1

return ["".join(x) for x in output]

def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)

def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True

return False

def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)


class WordpieceTokenizer(object):
"""Runs WordPiece tokenziation."""

def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word

def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.

This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.

For example:
input = "unaffable"
output = ["un", "##aff", "##able"]

Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer`.

Returns:
A list of wordpiece tokens.
"""

text = convert_to_unicode(text)

output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue

is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end

if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens


def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False


def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False


def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False

+ 30522
- 0
model_compress/model_compress/distil/src/glue_ofrecord/vocab.txt
File diff suppressed because it is too large
View File


+ 382
- 0
model_compress/model_compress/distil/src/knowledge_distill_util.py View File

@@ -0,0 +1,382 @@
"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import os
import math
import oneflow as flow
import oneflow.typing as tp
from typing import Tuple,Any

import bert as bert_util

def BertForSequenceClassification(
input_ids_blob,
input_mask_blob,
token_type_ids_blob,
label_blob,
vocab_size,
seq_length=512,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
initializer_range=0.02,
label_num=2,
is_student=False,
fit_size=768,
is_train=False
):
# with flow.scope.namespace('teacher'):
backbone = bert_util.BertBackbone(
input_ids_blob=input_ids_blob,
input_mask_blob=input_mask_blob,
token_type_ids_blob=token_type_ids_blob,
vocab_size=vocab_size,
seq_length=seq_length,
hidden_size=hidden_size,
num_hidden_layers=num_hidden_layers,
num_attention_heads=num_attention_heads,
intermediate_size=intermediate_size,
hidden_act=hidden_act,
hidden_dropout_prob=hidden_dropout_prob,
attention_probs_dropout_prob=attention_probs_dropout_prob,
max_position_embeddings=max_position_embeddings,
type_vocab_size=type_vocab_size,
initializer_range=initializer_range,
is_train=is_train
)
pooled_output = PooledOutput(
sequence_output=backbone.sequence_output(),
hidden_size=hidden_size,
initializer_range=initializer_range,
is_train=is_train
)
logit_blob = _AddClassfication(
input_blob=pooled_output,
label_blob=label_blob,
hidden_size=hidden_size,
label_num=label_num,
initializer_range=initializer_range,
scope_name='classification',
is_train=is_train
)
sequence_output = backbone.all_encoder_layers()
att_output = backbone.all_attention_probs()
embed_output = backbone.embedding_output()
sequence_output.insert(0,embed_output)
# print(logit_blob.shape)
# print(len(sequence_output))

# print(sequence_output.shape)

tmp = []
if is_student:
for s_id, sequence_layer in enumerate(sequence_output):
tmp.append(
fit_dense(
input_blob=sequence_layer,
hidden_size=hidden_size,
label_num=fit_size,
initializer_range=initializer_range,
scope_name='fit_dense',
is_train=is_train
))
sequence_output = tmp

return logit_blob, sequence_output, att_output
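# Descriptive note: the function above returns the classification logits, the list of
# hidden states (embedding output followed by every encoder layer, optionally projected
# to fit_size by fit_dense when is_student is True), and the per-layer attention
# probabilities; the latter two are consumed by the distillation losses below.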

def BertStudentForSequenceClassification(
input_ids_blob,
input_mask_blob,
token_type_ids_blob,
label_blob,
vocab_size,
seq_length=512,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
initializer_range=0.02,
label_num=2,
is_student=False,
fit_size=768,
is_train=True
):
with flow.scope.namespace('student'):
backbone = bert_util.BertBackbone(
input_ids_blob=input_ids_blob,
input_mask_blob=input_mask_blob,
token_type_ids_blob=token_type_ids_blob,
vocab_size=vocab_size,
seq_length=seq_length,
hidden_size=hidden_size,
num_hidden_layers=num_hidden_layers,
num_attention_heads=num_attention_heads,
intermediate_size=intermediate_size,
hidden_act=hidden_act,
hidden_dropout_prob=hidden_dropout_prob,
attention_probs_dropout_prob=attention_probs_dropout_prob,
max_position_embeddings=max_position_embeddings,
type_vocab_size=type_vocab_size,
initializer_range=initializer_range,
is_train=is_train
)
pooled_output = PooledOutput(
sequence_output=backbone.sequence_output(),
hidden_size=hidden_size,
initializer_range=initializer_range,
is_train=is_train
)
logit_blob = _AddClassfication(
input_blob=pooled_output,
label_blob=label_blob,
hidden_size=hidden_size,
label_num=label_num,
initializer_range=initializer_range,
scope_name='classification',
is_train=is_train
)
sequence_output = backbone.all_encoder_layers()
att_output = backbone.all_attention_probs()
embed_output = backbone.embedding_output()
sequence_output.insert(0, embed_output)
# print(logit_blob.shape)
# print(len(sequence_output))
# print(sequence_output.shape)

tmp = []
if is_student:
for s_id, sequence_layer in enumerate(sequence_output):
tmp.append(
fit_dense(
input_blob=sequence_layer,
hidden_size=hidden_size,
label_num=fit_size,
initializer_range=initializer_range,
scope_name='fit_dense',
is_train=is_train
))
sequence_output = tmp

return logit_blob, sequence_output, att_output

def CreateInitializer(std):
return flow.truncated_normal(std)


def _EmbeddingLookup(input_ids_blob,
vocab_size,
embedding_size=128,
initializer_range=0.02,
word_embedding_name="word_embeddings",
is_train=True):
embedding_table = flow.get_variable(name=word_embedding_name, shape=[vocab_size, embedding_size],
dtype=flow.float,
trainable=is_train,
initializer=CreateInitializer(initializer_range))
output = flow.gather(params=embedding_table, indices=input_ids_blob, axis=0)
return output, embedding_table

def watch_diff_handler(blob: tp.Numpy):
print("watch_diff_handler:", blob, blob.shape, blob.dtype)

def watch_handler(y: tp.Numpy):
print("out:",y)

from lstm import lstm,Blstm
def LSTMStudentForSequenceClassification(
input_ids_blob,
input_mask_blob,
token_type_ids_blob,
label_blob,
vocab_size,
seq_length=512,
hidden_size=300,
intermediate_size=400,
num_hidden_layers=1,
hidden_dropout_prob=0.5,
initializer_range=0.25,
label_num=2,
is_student=True,
is_train=True
):
with flow.scope.namespace('student'):
with flow.scope.namespace("embeddings"):
(embedding_output_, embedding_table_) = _EmbeddingLookup(
input_ids_blob=input_ids_blob,
vocab_size=vocab_size+1,
embedding_size=hidden_size,
word_embedding_name="word_embeddings",
is_train=is_train)

with flow.scope.namespace('lstm'):
output = lstm(embedding_output_, hidden_size, return_sequence=False, is_train=is_train)
output = flow.layers.dense(inputs=output,units=intermediate_size,activation=flow.nn.relu,kernel_initializer=CreateInitializer(initializer_range),trainable=is_train,name='FC1')
output = _Dropout(output, hidden_dropout_prob)
logit_blob = flow.layers.dense(inputs=output,units=label_num,kernel_initializer=CreateInitializer(initializer_range),trainable=is_train,name='FC2')
return logit_blob

def PooledOutput(sequence_output, hidden_size, initializer_range, is_train):
with flow.scope.namespace("bert-pooler"):
first_token_tensor = flow.slice(
sequence_output, [None, 0, 0], [None, 1, -1])
first_token_tensor = flow.reshape(
first_token_tensor, [-1, hidden_size])
pooled_output = bert_util._FullyConnected(
first_token_tensor,
input_size=hidden_size,
units=hidden_size,
weight_initializer=bert_util.CreateInitializer(initializer_range),
name="dense",
is_train=is_train
)
pooled_output = flow.math.tanh(pooled_output)
return pooled_output


def _AddClassfication(input_blob, label_blob, hidden_size, label_num, initializer_range,
scope_name='classification',is_train=True):
with flow.scope.namespace(scope_name):
output_weight_blob = flow.get_variable(
name="output_weights",
shape=[label_num, hidden_size],
dtype=input_blob.dtype,
# initializer=bert_util.CreateInitializer(initializer_range),
initializer=flow.random_normal_initializer(
mean=0.0, stddev=initializer_range, seed=None, dtype=None),
trainable=is_train
)
output_bias_blob = flow.get_variable(
name="output_bias",
shape=[label_num],
dtype=input_blob.dtype,
initializer=flow.constant_initializer(0.0),
trainable=is_train
)
logit_blob = flow.matmul(
input_blob, output_weight_blob, transpose_b=True)
logit_blob = flow.nn.bias_add(logit_blob, output_bias_blob)
# pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
# logits=logit_blob, labels=label_blob
# )
# loss = pre_example_loss
# return loss, pre_example_loss, logit_blob
return logit_blob

def _Dropout(input_blob, dropout_prob):
if dropout_prob == 0.0:
return input_blob
return flow.nn.dropout(input_blob, rate=dropout_prob)


def fit_dense(input_blob, hidden_size, label_num, initializer_range,
scope_name='fit_dense',is_train=True):
with flow.scope.namespace(scope_name):
in_shape = input_blob.shape
in_num_axes = len(in_shape)
assert in_num_axes >= 2

input_blob = (
flow.reshape(input_blob, (-1, in_shape[-1])) if in_num_axes > 2 else input_blob
)

output_weight_blob = flow.get_variable(
name="weight",
shape=[label_num, hidden_size],
dtype=input_blob.dtype,
# initializer=bert_util.CreateInitializer(initializer_range),
initializer=flow.random_normal_initializer(
mean=0.0, stddev=initializer_range, seed=None, dtype=None),
trainable=is_train
)
output_bias_blob = flow.get_variable(
name="bias",
shape=[label_num],
dtype=input_blob.dtype,
initializer=flow.constant_initializer(0.0),
trainable=is_train
)
logit_blob = flow.matmul(
input_blob, output_weight_blob, transpose_b=True)
logit_blob = flow.nn.bias_add(logit_blob, output_bias_blob)
logit_blob = (
flow.reshape(logit_blob, in_shape[:-1] + (label_num,)) if in_num_axes > 2 else logit_blob
)
# pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
# logits=logit_blob, labels=label_blob
# )
# loss = pre_example_loss
# return loss, pre_example_loss, logit_blob
return logit_blob



def soft_cross_entropy(predicts, targets):
student_likelihood = flow.math.log(flow.nn.softmax(predicts, axis=-1))
targets_prob = flow.nn.softmax(targets, axis=-1)
tmp = flow.math.multiply(flow.math.negative(targets_prob), student_likelihood)
res = flow.math.reduce_mean(tmp)
return res

def mseloss(rep1, rep2):
return flow.math.reduce_mean(flow.math.square(rep1-rep2))

def layer_distill(args, student_reps, teacher_reps):
rep_loss = 0.
teacher_layer_num = len(teacher_reps) - 1
student_layer_num = len(student_reps) - 1

assert teacher_layer_num % student_layer_num == 0
layers_per_block = int(teacher_layer_num / student_layer_num)

new_teacher_reps = [teacher_reps[i * layers_per_block] for i in range(student_layer_num + 1)]
new_student_reps = student_reps

for student_rep, teacher_rep in zip(new_student_reps, new_teacher_reps):
tmp_loss = mseloss(student_rep, teacher_rep)
rep_loss += tmp_loss
return rep_loss


def att_distill(args, student_atts, teacher_atts):
att_loss = 0.
teacher_layer_num = len(teacher_atts)
student_layer_num = len(student_atts)

assert teacher_layer_num % student_layer_num == 0
layers_per_block = int(teacher_layer_num / student_layer_num)
new_teacher_atts = [teacher_atts[i * layers_per_block + layers_per_block - 1] for i in range(student_layer_num)]

for student_att, teacher_att in zip(student_atts, new_teacher_atts):
student_att = flow.where(student_att <= flow.constant(-1e2,dtype=flow.float), flow.zeros_like(student_att), student_att)
teacher_att = flow.where(teacher_att <= flow.constant(-1e2,dtype=flow.float), flow.zeros_like(teacher_att), teacher_att)

tmp_loss = mseloss(student_att, teacher_att)
att_loss += tmp_loss

return att_loss

def pred_distill(args, student_logits, teacher_logits):
soft_loss = soft_cross_entropy(student_logits / args.temperature,
teacher_logits / args.temperature)
return soft_loss
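
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original script): one common way the
# three distillation losses defined above are combined in a TinyBERT-style
# setup. `args` only needs to carry `temperature`; the equal weighting and the
# `pred_weight` knob are assumptions for illustration, not the repo's defaults.
def combined_distill_loss(args, student_logits, teacher_logits,
                          student_reps, teacher_reps,
                          student_atts, teacher_atts, pred_weight=1.0):
    rep_loss = layer_distill(args, student_reps, teacher_reps)      # hidden-state (layer-to-layer) loss
    att_loss = att_distill(args, student_atts, teacher_atts)        # attention-map loss
    soft_loss = pred_distill(args, student_logits, teacher_logits)  # soft-label loss
    return rep_loss + att_loss + pred_weight * soft_loss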


+ 311
- 0
model_compress/model_compress/distil/src/lstm.py View File

@@ -0,0 +1,311 @@
"""
Copyright 2020 Tianshu AI Platform. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import time
import argparse
from datetime import datetime

import test_global_storage
import oneflow as flow
import numpy as np
np.set_printoptions(suppress=True)

def _FullyConnected(input_blob,weight_blob,bias_blob):

output_blob = flow.matmul(input_blob, weight_blob)
if bias_blob is not None:
output_blob = flow.nn.bias_add(output_blob, bias_blob)
return output_blob


def lstm(input, units, return_sequence=False, initial_state=None, direction='forward', layer_index=0, is_train=True):
'''
input: sequence input tensor with shape [batch_size, sequence_length, embedding_size]
units: number of hidden units
'''
batch_size=input.shape[0]
seq_len=input.shape[1]
input_size = input.shape[2]
dtype = flow.float32
with flow.scope.namespace('layer'+str(layer_index)):
with flow.scope.namespace(direction):
weight_blob_i = flow.get_variable(
name='input' + '-weight',
shape=[input_size, units],
dtype=dtype,
trainable=is_train,
initializer=flow.glorot_normal_initializer())

weight_blob_ih = flow.get_variable(
name='input' + '-h-weight',
shape=[units, units],
dtype=dtype,
trainable=is_train,
initializer=flow.glorot_normal_initializer())

bias_blob_i = flow.get_variable(
name='input' + '-bias',
shape=[units],
dtype=dtype,
trainable=is_train,
initializer=flow.constant_initializer(0.0))

weight_blob_f = flow.get_variable(
name='forget' + '-weight',
shape=[input_size, units],
dtype=dtype,
trainable=is_train,
initializer=flow.glorot_normal_initializer())

weight_blob_fh = flow.get_variable(
name='forget' + '-h-weight',
shape=[units, units],
dtype=dtype,
trainable=is_train,
initializer=flow.glorot_normal_initializer())

bias_blob_f = flow.get_variable(
name='forget' + '-bias',
shape=[units],
dtype=dtype,
trainable=is_train,
initializer=flow.constant_initializer(0.0))

weight_blob_c = flow.get_variable(
name='cell' + '-weight',
shape=[input_size, units],
dtype=dtype,
trainable=is_train,
initializer=flow.glorot_normal_initializer())

weight_blob_ch = flow.get_variable(
name='cell' + '-h-weight',
shape=[units, units],
dtype=dtype,
trainable=is_train,
initializer=flow.glorot_normal_initializer())

bias_blob_c = flow.get_variable(
name='cell' + '-bias',
shape=[units],
dtype=dtype,
trainable=is_train,
initializer=flow.constant_initializer(0.0))

weight_blob_o = flow.get_variable(
name='output' + '-weight',
shape=[input_size, units],
dtype=dtype,
trainable=is_train,
initializer=flow.glorot_normal_initializer())

weight_blob_oh = flow.get_variable(
name='output' + '-h-weight',
shape=[units, units],
dtype=dtype,
trainable=is_train,
initializer=flow.glorot_normal_initializer())

bias_blob_o = flow.get_variable(
name='output' + '-bias',
shape=[units],
dtype=dtype,
trainable=is_train,
initializer=flow.constant_initializer(0.0))
flow.watch(weight_blob_i, test_global_storage.Setter("weight_blob_i"))
flow.watch(weight_blob_f, test_global_storage.Setter("weight_blob_f"))
flow.watch(weight_blob_c, test_global_storage.Setter("weight_blob_c"))
flow.watch(weight_blob_o, test_global_storage.Setter("weight_blob_o"))

flow.watch(weight_blob_ih, test_global_storage.Setter("weight_blob_ih"))
flow.watch(weight_blob_fh, test_global_storage.Setter("weight_blob_fh"))
flow.watch(weight_blob_ch, test_global_storage.Setter("weight_blob_ch"))
flow.watch(weight_blob_oh, test_global_storage.Setter("weight_blob_oh"))

flow.watch(bias_blob_i, test_global_storage.Setter("bias_blob_i"))
flow.watch(bias_blob_f, test_global_storage.Setter("bias_blob_f"))
flow.watch(bias_blob_c, test_global_storage.Setter("bias_blob_c"))
flow.watch(bias_blob_o, test_global_storage.Setter("bias_blob_o"))

def step_function(input,states):
hx=states[0]
cx=states[1]

x_i = _FullyConnected(input,weight_blob_i,bias_blob_i) # input gate
mark_int=x_i
x_f = _FullyConnected(input,weight_blob_f,bias_blob_f) # forget gate
x_c = _FullyConnected(input,weight_blob_c,bias_blob_c) # cell state
x_o = _FullyConnected(input,weight_blob_o,bias_blob_o) # output gate

h_i = _FullyConnected(hx,weight_blob_ih,None)
h_f = _FullyConnected(hx,weight_blob_fh,None)
h_c = _FullyConnected(hx,weight_blob_ch,None)
h_o = _FullyConnected(hx,weight_blob_oh,None)


x_i = x_i + h_i
x_f = x_f+h_f
x_c = x_c+h_c
x_o = x_o+h_o

x_i = flow.math.sigmoid(x_i)
x_f = flow.math.sigmoid(x_f)
cellgate = flow.math.tanh(x_c)
x_o = flow.math.sigmoid(x_o)

cy = x_f * cx + x_i * cellgate

hy = x_o * flow.math.tanh(cy)

return hy, (hy,cy)

if initial_state:
states=initial_state
else:
states=[flow.constant(0, dtype=flow.float32, shape=[batch_size,units]),flow.constant(0, dtype=flow.float32, shape=[batch_size,units])]
successive_outputs=[]
successive_states= []

for index in range(seq_len):
# print('time step:',index)
inp = flow.slice(input, [None, index, 0], [None, 1, input_size])
# print(inp.shape)
inp = flow.reshape(inp, [-1, input_size])
# print(inp.shape)
output, states = step_function(inp, states)

output = flow.reshape(output,[-1,1,units])
successive_outputs.append(output)
successive_states.append(states)
last_output = successive_outputs[-1]
new_states = successive_states[-1]
outputs = flow.concat(successive_outputs,axis=1)


if return_sequence:
return outputs
else:
return flow.reshape(last_output,[-1,units])

def Blstm(input,units,return_sequence=True,initial_state=None,layer_index=0,is_train=True):
# return_sequence should be True for Blstm currently
# forward and backward outputs are combined by element-wise addition

forward = lstm(input,units,return_sequence=return_sequence,initial_state=initial_state,direction='forward',layer_index=layer_index,is_train=is_train)

reverse_input = flow.reverse(input,axis=1)
backward = lstm(reverse_input,units,return_sequence=return_sequence,initial_state=initial_state,direction='backward',layer_index=layer_index,is_train=is_train)
backward = flow.reverse(backward,axis=1)

outputs = forward + backward

return outputs

def TestLstm():
func_config = flow.FunctionConfig()
func_config.default_data_type(flow.float32)

flow.config.gpu_device_num(1)

@flow.global_function(func_config)
def InferenceNet(sentence=flow.FixedTensorDef((32, 128, 312), dtype=flow.float32)):

output = lstm(sentence,512,return_sequence=False)
return output

flow.config.enable_debug_mode(True)
check_point = flow.train.CheckPoint()
check_point.init()
sentence_in = np.random.uniform(-10, 10, (32, 128, 312)).astype(np.float32)

output_of = InferenceNet(sentence_in).get()
print('output shape',output_of.numpy().shape)
print('lstm hello world')
# print('x_o',output_of[0].numpy())


# print('o',output_of[3].numpy())
#print('output shape:',output.numpy().shape)
# print('weight:',test_global_storage.Get("weight_blob_i") )
# print('weight:',test_global_storage.Get("weight_blob_ih").shape )
# print('lstm hello world')

# from tensorflow.keras import layers
# from tensorflow import keras
#
# inputs = keras.Input(shape=(14, 64))
# x = layers.LSTM(15,return_sequences=True,recurrent_activation ='sigmoid',name="lstm_one")(inputs)
#
# weight_blob_i = test_global_storage.Get("weight_blob_i")
# weight_blob_f = test_global_storage.Get("weight_blob_f")
# weight_blob_c = test_global_storage.Get("weight_blob_c")
# weight_blob_o = test_global_storage.Get("weight_blob_o")
# kernel_1 = np.concatenate( ( weight_blob_i,weight_blob_f,weight_blob_c,weight_blob_o) ,axis=1)
#
# weight_blob_ih = test_global_storage.Get("weight_blob_ih")
# weight_blob_fh = test_global_storage.Get("weight_blob_fh")
# weight_blob_ch = test_global_storage.Get("weight_blob_ch")
# weight_blob_oh = test_global_storage.Get("weight_blob_oh")
# kernel_2 = np.concatenate( ( weight_blob_ih,weight_blob_fh,weight_blob_ch,weight_blob_oh) ,axis=1)
#
# bias_blob_i = test_global_storage.Get("bias_blob_i")
# bias_blob_f = test_global_storage.Get("bias_blob_f")
# bias_blob_c = test_global_storage.Get("bias_blob_c")
# bias_blob_o = test_global_storage.Get("bias_blob_o")
# bias_1 = np.concatenate( ( bias_blob_i,bias_blob_f,bias_blob_c,bias_blob_o) )
#
# model = keras.Model(inputs,x)
# model.get_layer("lstm_one").set_weights([kernel_1,kernel_2,bias_1])
# output_tf = model.predict(sentence_in)
#
# print(output_of.numpy()[:,-1,:])
# print('-'*100)
# print(output_tf[:,-1,:])
# assert(np.allclose(output_of.numpy(),output_tf, rtol=1e-04,atol=1e-04))

def TestBlstm():
func_config = flow.FunctionConfig()
func_config.default_data_type(flow.float32)

flow.config.gpu_device_num(1)

@flow.global_function(func_config)
def InferenceNet(sentence=flow.FixedTensorDef((8,15,64), dtype=flow.float32)):

output = Blstm(sentence,15,return_sequence=True)
return output
flow.config.enable_debug_mode(True)
check_point = flow.train.CheckPoint()
check_point.init()
sentence_in = np.random.uniform(-10, 10, (8, 15, 64)).astype(np.float32)

output=InferenceNet(sentence_in).get()

print('output shape',output.numpy().shape)
print('blstm hello world')

if __name__ == "__main__":
TestLstm()
# TestBlstm()

+ 29
- 0
model_compress/model_compress/distil/src/test_global_storage.py View File

@@ -0,0 +1,29 @@
"""
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
global_storage = {}


def Get(name):
return global_storage.get(name).numpy()


def Setter(name):
global global_storage

def _set(x):
global_storage[name] = x

return _set

+ 400
- 0
model_compress/model_compress/distil/src/tokenization.py View File

@@ -0,0 +1,400 @@
"""
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

"""Tokenization classes."""

import collections
import re
import unicodedata
import six


# import tensorflow as tf


def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
"""Checks whether the casing config is consistent with the checkpoint name."""

# The casing has to be passed in by the user and there is no explicit check
# as to whether it matches the checkpoint. The casing information probably
# should have been stored in the bert_config.json file, but it's not, so
# we have to heuristically detect it to validate.

if not init_checkpoint:
return

m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
if m is None:
return

model_name = m.group(1)

lower_models = [
"uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
"multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12"
]

cased_models = [
"cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
"multi_cased_L-12_H-768_A-12"
]

is_bad_config = False
if model_name in lower_models and not do_lower_case:
is_bad_config = True
actual_flag = "False"
case_name = "lowercased"
opposite_flag = "True"

if model_name in cased_models and do_lower_case:
is_bad_config = True
actual_flag = "True"
case_name = "cased"
opposite_flag = "False"

if is_bad_config:
raise ValueError(
"You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
"However, `%s` seems to be a %s model, so you "
"should pass in `--do_lower_case=%s` so that the fine-tuning matches "
"how the model was pre-training. If this error is wrong, please "
"just comment out this check." % (actual_flag, init_checkpoint,
model_name, case_name, opposite_flag))


def convert_to_unicode(text):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text.decode("utf-8", "ignore")
elif isinstance(text, unicode):
return text
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")


def printable_text(text):
"""Returns text encoded in a way suitable for print or `tf.logging`."""

# These functions want `str` for both Python2 and Python3, but in one case
# it's a Unicode string and in the other it's a byte string.
if six.PY3:
if isinstance(text, str):
return text
elif isinstance(text, bytes):
return text.decode("utf-8", "ignore")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
elif six.PY2:
if isinstance(text, str):
return text
elif isinstance(text, unicode):
return text.encode("utf-8")
else:
raise ValueError("Unsupported string type: %s" % (type(text)))
else:
raise ValueError("Not running on Python2 or Python 3?")


def load_vocab(vocab_file):
"""Loads a vocabulary file into a dictionary."""
vocab = collections.OrderedDict()
index = 0
# with tf.gfile.GFile(vocab_file, "r") as reader:
with open(vocab_file, "r", encoding='utf-8') as reader:
while True:
token = convert_to_unicode(reader.readline())
if not token:
break
token = token.strip()
vocab[token] = index
index += 1
return vocab


def convert_by_vocab(vocab, items):
"""Converts a sequence of [tokens|ids] using the vocab."""
output = []
for item in items:
output.append(vocab[item])
return output


def convert_tokens_to_ids(vocab, tokens):
return convert_by_vocab(vocab, tokens)


def convert_ids_to_tokens(inv_vocab, ids):
return convert_by_vocab(inv_vocab, ids)


def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
if not text:
return []
tokens = text.split()
return tokens


class FullTokenizer(object):
"""Runs end-to-end tokenziation."""

def __init__(self, vocab_file, do_lower_case=True):
self.vocab = load_vocab(vocab_file)
self.inv_vocab = {v: k for k, v in self.vocab.items()}
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

def tokenize(self, text):
split_tokens = []
for token in self.basic_tokenizer.tokenize(text):
for sub_token in self.wordpiece_tokenizer.tokenize(token):
split_tokens.append(sub_token)

return split_tokens

def convert_tokens_to_ids(self, tokens):
return convert_by_vocab(self.vocab, tokens)

def convert_ids_to_tokens(self, ids):
return convert_by_vocab(self.inv_vocab, ids)


class BasicTokenizer(object):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

def __init__(self, do_lower_case=True):
"""Constructs a BasicTokenizer.

Args:
do_lower_case: Whether to lower case the input.
"""
self.do_lower_case = do_lower_case

def tokenize(self, text):
"""Tokenizes a piece of text."""
text = convert_to_unicode(text)
text = self._clean_text(text)

# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
text = self._tokenize_chinese_chars(text)

orig_tokens = whitespace_tokenize(text)
split_tokens = []
for token in orig_tokens:
if self.do_lower_case:
token = token.lower()
token = self._run_strip_accents(token)
split_tokens.extend(self._run_split_on_punc(token))

output_tokens = whitespace_tokenize(" ".join(split_tokens))
return output_tokens

def _run_strip_accents(self, text):
"""Strips accents from a piece of text."""
text = unicodedata.normalize("NFD", text)
output = []
for char in text:
cat = unicodedata.category(char)
if cat == "Mn":
continue
output.append(char)
return "".join(output)

def _run_split_on_punc(self, text):
"""Splits punctuation on a piece of text."""
chars = list(text)
i = 0
start_new_word = True
output = []
while i < len(chars):
char = chars[i]
if _is_punctuation(char):
output.append([char])
start_new_word = True
else:
if start_new_word:
output.append([])
start_new_word = False
output[-1].append(char)
i += 1

return ["".join(x) for x in output]

def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)

def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True

return False

def _clean_text(self, text):
"""Performs invalid character removal and whitespace cleanup on text."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xfffd or _is_control(char):
continue
if _is_whitespace(char):
output.append(" ")
else:
output.append(char)
return "".join(output)


class WordpieceTokenizer(object):
"""Runs WordPiece tokenziation."""

def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
self.vocab = vocab
self.unk_token = unk_token
self.max_input_chars_per_word = max_input_chars_per_word

def tokenize(self, text):
"""Tokenizes a piece of text into its word pieces.

This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.

For example:
input = "unaffable"
output = ["un", "##aff", "##able"]

Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer`.

Returns:
A list of wordpiece tokens.
"""

text = convert_to_unicode(text)

output_tokens = []
for token in whitespace_tokenize(text):
chars = list(token)
if len(chars) > self.max_input_chars_per_word:
output_tokens.append(self.unk_token)
continue

is_bad = False
start = 0
sub_tokens = []
while start < len(chars):
end = len(chars)
cur_substr = None
while start < end:
substr = "".join(chars[start:end])
if start > 0:
substr = "##" + substr
if substr in self.vocab:
cur_substr = substr
break
end -= 1
if cur_substr is None:
is_bad = True
break
sub_tokens.append(cur_substr)
start = end

if is_bad:
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
return output_tokens


def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False


def _is_control(char):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False


def _is_punctuation(char):
"""Checks whether `chars` is a punctuation character."""
cp = ord(char)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
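
# ---------------------------------------------------------------------------
# Illustrative usage sketch (added for documentation, not part of the original
# file). "vocab.txt" is a hypothetical path to a BERT-style WordPiece vocab.
if __name__ == "__main__":
    tokenizer = FullTokenizer(vocab_file="vocab.txt", do_lower_case=True)
    tokens = tokenizer.tokenize("John's unaffable reply")
    print(tokens)                                   # wordpieces, e.g. ["john", "'", "s", "un", "##aff", "##able", ...]
    print(tokenizer.convert_tokens_to_ids(tokens))  # ids looked up from vocab.txt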

+ 220
- 0
model_compress/model_compress/distil/src/util.py View File

@@ -0,0 +1,220 @@
"""
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import os
import time
import numpy as np
from collections import OrderedDict
import pandas as pd
from datetime import datetime
import oneflow as flow
import shutil

def InitNodes(args):
if args.num_nodes > 1:
assert args.num_nodes <= len(args.node_ips)
flow.env.ctrl_port(args.ctrl_port)
nodes = []
for ip in args.node_ips[:args.num_nodes]:
addr_dict = {}
addr_dict["addr"] = ip
nodes.append(addr_dict)

flow.env.machine(nodes)


class Snapshot(object):
def __init__(self, model_save_dir, model_load_dir):
self._model_save_dir = model_save_dir
self._check_point = flow.train.CheckPoint()
if model_load_dir:
assert os.path.isdir(model_load_dir)
print("Restoring model from {}.".format(model_load_dir))
self._check_point.load(model_load_dir)
else:
self._check_point.init()
self.save('initial_model')
print("Init model on demand.")

def save(self, name):
snapshot_save_path = os.path.join(self._model_save_dir, "snapshot_{}".format(name))
if not os.path.exists(snapshot_save_path):
os.makedirs(snapshot_save_path)
print("Saving model to {}.".format(snapshot_save_path))
self._check_point.save(snapshot_save_path)


class Summary(object):
def __init__(self, log_dir, config, filename='summary.csv'):
self._filename = filename
self._log_dir = log_dir
if not os.path.exists(log_dir): os.makedirs(log_dir)
self._metrics = pd.DataFrame({"legend": "cfg", "value": str(config)}, index=[0])

def scalar(self, legend, iter, value, **kwargs):
kwargs['legend'] = legend
kwargs['iter'] = int(iter)
kwargs['value'] = value
df = pd.DataFrame(kwargs, index=[0])
self._metrics = pd.concat([self._metrics, df], axis=0, sort=False)
self.save()

def save(self):
save_path = os.path.join(self._log_dir, self._filename)
self._metrics.to_csv(save_path, index=False)


class StopWatch(object):
def __init__(self):
pass

def start(self):
self.start_time = time.time()
self.last_split = self.start_time

def split(self):
now = time.time()
duration = now - self.last_split
self.last_split = now
return duration

def stop(self):
self.stop_time = time.time()

def duration(self):
return self.stop_time - self.start_time


class Metric(object):
def __init__(self, summary=None, desc='train', print_steps=-1, batch_size=256, keys=[]):
r"""accumulate and calculate metric

Args:
summary: A `Summary` object to write in.
desc: `str` general description of the metric to show
print_steps: `Int` print metrics every nth steps
batch_size: `Int` batch size per step
keys: keys in callback outputs
Returns:
"""
self.summary = summary
self.save_summary = isinstance(self.summary, Summary)
self.desc = desc
self.print_steps = print_steps
assert batch_size > 0
self.batch_size = batch_size

assert isinstance(keys, (list, tuple))
self.keys = keys
self.metric_dict = OrderedDict()
self.metric_dict['step'] = 0

self.timer = StopWatch()
self.timer.start()
self._clear()

def _clear(self):
for key in self.keys:
self.metric_dict[key] = 0.0
self.metric_dict['throughput'] = 0.0
self.num_samples = 0.0

def update_and_save(self, key, value, step, **kwargs):
self.metric_dict[key] = value
if self.save_summary:
self.summary.scalar(self.desc + "_" + key, step, value, **kwargs)

def metric_cb(self, step=0, **kwargs):
def callback(outputs):
if step == 0: self._clear()

for key in self.keys:
self.metric_dict[key] += outputs[key].sum()

self.num_samples += self.batch_size

if (step + 1) % self.print_steps == 0:
self.metric_dict['step'] = step
for k, v in kwargs.items():
self.metric_dict[k] = v
throughput = self.num_samples / self.timer.split()
self.update_and_save('throughput', throughput, step)
for key in self.keys:
value = self.metric_dict[key] / self.num_samples
self.update_and_save(key, value, step, **kwargs)
print(', '.join(('{}: {}' if type(v) is int else '{}: {:.3f}').format(k, v) \
for k, v in self.metric_dict.items()), time.time())
self._clear()

return callback

def CreateOptimizer(args):
warmup_batches = int(args.iter_num * args.warmup_proportion)
lr_warmup = flow.optimizer.warmup.linear(warmup_batches, 0)
lr_scheduler = flow.optimizer.PolynomialSchduler(args.learning_rate, args.iter_num, 0.0,
warmup=lr_warmup)
return flow.optimizer.AdamW(lr_scheduler, epsilon=1e-6, weight_decay=args.weight_decay_rate,
weight_decay_excludes=["bias", "LayerNorm", "layer_norm"],
grad_clipping=flow.optimizer.grad_clipping.by_global_norm(1.0))

def GetFunctionConfig(args):
config = flow.function_config()
config.enable_auto_mixed_precision(args.use_fp16)
if args.use_xla:
config.use_xla_jit(True)
return config

def getdirsize(dir):
size = 0
for root, dirs, files in os.walk(dir):
for name in files:
if str(root[-2:]) == '-v' or str(root[-2:]) == '-m':
pass
else:
tmp = os.path.getsize(os.path.join(root, name))
size += tmp
# size += sum([os.path.getsize(os.path.join(root, name)) for name in files])
return size

def remove_optimizer_params(model_dir):
# delete the optimizer params from model_save_dir
for a, b, c in os.walk(model_dir):
for subdir in b:
if str(subdir[-2:]) == '-v' or str(subdir[-2:]) == '-m':
shutil.rmtree(os.path.join(model_dir, subdir))

def remove_teacher_params(model_dir):
# delete the teacher params from model_save_dir
# delete the optimizer params from model_save_dir
for a, b, c in os.walk(model_dir):
for subdir in b:
if subdir[:7]!='student':
shutil.rmtree(os.path.join(model_dir,subdir))
elif str(subdir[-2:]) == '-v' or str(subdir[-2:]) == '-m':
shutil.rmtree(os.path.join(model_dir, subdir))

glue_tasks_num_labels = {
"cola": 2,
"mnli": 3,
"mrpc": 2,
"sst-2": 2,
"sts-b": 1,
"qqp": 2,
"qnli": 2,
"rte": 2,
"wnli": 2,
}
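
# ---------------------------------------------------------------------------
# Illustrative sketch (added for documentation): how Summary and Metric are
# typically wired around a training job. `BertGlueFinetuneJob`, the argument
# names and the `async_get` call pattern are assumptions drawn from the run
# scripts in this repo, not definitions made in this file.
#
#   summary = Summary(args.log_dir, args)
#   metric = Metric(summary=summary, desc='train',
#                   print_steps=args.loss_print_every_n_iter,
#                   batch_size=args.batch_size_per_device, keys=['loss'])
#   for step in range(args.iter_num):
#       BertGlueFinetuneJob().async_get(metric.metric_cb(step))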


+ 204
- 0
model_compress/model_compress/distil/theseus/README.md View File

@@ -0,0 +1,204 @@
# Knowledge Distillation Quick Start

## 1. Introduction
Knowledge distillation: learning a small student model from a large, knowledge-rich teacher model through a set of optimization objectives.

The 炼知 (Lianzhi) platform provides 4 knowledge-distillation operators, as well as a number of knowledge-distillation models reproduced with OneFlow operators, together with usage examples.
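
As a rough reference, the classic soft-label objective (a general sketch only; the exact losses and weightings used by the algorithms below differ) is

$$L = (1-\alpha)\,L_{\mathrm{CE}}\big(y,\sigma(z_s)\big) + \alpha T^{2}\,\mathrm{KL}\big(\sigma(z_t/T)\,\|\,\sigma(z_s/T)\big)$$

where $z_s$ and $z_t$ are the student and teacher logits, $\sigma$ is the softmax, $T$ is the distillation temperature, and $\alpha$ balances the hard-label and soft-label terms.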
<table>
<thead>
<tr>
<th>Type</th>
<th>Distillation model</th>
<th><a href="../../../docs/API_knowledge_distill.md" target="_blank">Main operators</a></th>
<th>Documentation</th>
</tr>
</thead>
<tbody>
<tr>
<td rowspan="2">Soft-label distillation</td>
<td>KD</td>
<td>Soft-label distillation</td>
<td><a href="./examples/knowledge_distillation/README.md" target="_blank">Link</a></td>
</tr>
<tr>
<td>Distilled-BiLSTM</td>
<td>Soft-label distillation, distilling BERT into a BiLSTM</td>
<td><a href="./examples/distilled-bilstm/README.md" target="_blank">Link</a></td>
</tr>
<tr>
<td rowspan="2">Distillation from other knowledge</td>
<td>BERT-PKD</td>
<td>Soft-label distillation + layer-to-layer distillation</td>
<td><a href="./examples/bert-pkd/README.md" target="_blank">Link</a></td>
</tr>
<tr>
<td>TinyBERT</td>
<td>Soft-label distillation + layer-to-layer distillation + attention distillation</td>
<td><a href="./examples/tinybert/README.md" target="_blank">Link</a></td>
</tr>
<tr>
<td>Module replacement</td>
<td>BERT-Theseus</td>
<td>Replaces the original BERT modules with Theseus modules according to a given probability, and trains the resulting model</td>
<td><a href="./examples/xxx/README.md" target="_blank">Link</a></td>
</tr>
</tbody>
</table>

## 2. Usage
### 2.1 Dependencies
- Python 3.6
- oneflow-cu101 0.1.10
- numpy 1.19.2

The complete environment can be installed with the following commands:
```bash
conda create -n distil python=3.6
```

```
python3 -m pip install --find-links https://oneflow-inc.github.io/nightly oneflow_cu101 --user
```
### 2.2 Getting the data
Knowledge distillation mainly targets NLP tasks; the platform evaluates the different algorithms on the GLUE benchmark datasets.

All GLUE datasets can be downloaded with the script below; they will be downloaded and extracted automatically into the directory given by '--data_dir=data'.

```
bash run_download_glue_data.sh
```
or
```bash
python ../src/download_glue_data.py --data_dir data/glue_data --tasks all
```

TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "SNLI", "QNLI", "RTE", "WNLI", "diagnostic"]

The script above downloads all GLUE task datasets by default; you can also restrict the download to specific datasets with '--tasks=TASKS'.

Refer to [Loading and Preparing OneFlow Datasets](https://github.com/Oneflow-Inc/oneflow-documentation/blob/master/cn/docs/extended_topics/how_to_make_ofdataset.md) to build the OFRecord dataset, or run the following command to generate the OFRecord dataset:
```
bash glue_process.sh
```

**Alternatively, download the converted OFRecord GLUE datasets directly and place them under the corresponding directory (data/glue_ofrecord):**
Link: https://pan.baidu.com/s/1TuDJpJ8z9zJvvhqjjXiGDg  Extraction code: phyf


### 2.3 Fine-tuning the teacher model
Download the pre-trained BERT model from:
Link: https://pan.baidu.com/s/1jfTUY7ygcZZOJzjfrgUL8Q  Extraction code: 6b87

After downloading, place it under `./models/uncased_L-12_H-768_A-12_oneflow`
#### 2.3.1 Training
- Run the following script to fine-tune the teacher model:
- DATA_ROOT: root path of the GLUE datasets
- dataset: task name
- MODEL_SAVE_DIR: directory where the model is saved

```bash
bash run_train_teacher.sh
```
#### 2.3.2 Evaluation
- After fine-tuning, run the following script to evaluate the teacher model:
- DATA_ROOT: root path of the GLUE datasets
- dataset: task name
- TEACHER_MODEL_DIR: path of the teacher model

```bash
bash run_eval_teacher.sh
```
### 2.4 Distilling into a student model
#### 2.4.1 Training
Run the following scripts to distill the teacher model into a student model:
- DATA_ROOT: root path of the GLUE datasets
- dataset: task name
- FT_BERT_BASE_DIR: path of the teacher model fine-tuned on the target task
- TMP_STUDENT_DIR: path of the temporary student model (if needed)
- STUDENT_DIR: directory where the student model is saved

- The different knowledge distillation algorithms:
- KD
```bash
bash run_train_student_kd.sh
```
- Distilled-BiLSTM
```bash
bash run_train_student_distilled_lstm.sh
```
- BERT-PKD
```bash
bash run_train_student_bert_pkd.sh
```
>Note: BERT-PKD can be randomly initialized, or initialized from the teacher BERT's intermediate layers; see [here](./examples/bert-pkd/README.md#41-教师模型中间层保存与转换) for the detailed steps.
- TinyBERT
```bash
bash run_train_student_tinybert.sh
```
- BERT-of-Theseus
```bash
bash run_bert_theseus.sh ${GPU_ID} ${SAVE_TAG} {PHRASE1_LR} ${PHRASE2_LR} ${PHRASE1_REPLACE_RATE} ${COMPRESS_RATIO}
example: bash run_bert_theseus.sh 0 1 1e-5 1e-5 0.5 4
```
- GPU_ID: id of the GPU used for training
- SAVE_TAG: identifier tag for the saved model files
- PHRASE1_LR: learning rate of phase 1 of BERT-of-Theseus
- PHRASE2_LR: learning rate of phase 2 of BERT-of-Theseus
- PHRASE1_REPLACE_RATE: probability of replacing a BERT module with a Theseus module during phase 1
- COMPRESS_RATIO: compression ratio; for example, COMPRESS_RATIO=4 compresses the 12-layer BERT-Base to 3 layers
- BERT-of-Theseus requires a model already fine-tuned on the target dataset as input
- Set dataset= at line 25 of run_bert_theseus.sh to the dataset you need; the current default is SST-2
- Point both PRETRAINED_MODEL and BERT_BASE_DIR for that dataset to the folder containing the model you fine-tuned above.
- The default save paths are:
- Phase 1: ./log/${dataset}_bert_theseus_uncased_L-12_H-768_A-12_oneflow_v${SAVE_TAG}s1
- Phase 2: ./log/${dataset}_bert_theseus_uncased_L-12_H-768_A-12_oneflow_v${SAVE_TAG}s2
> For BERT-style models the maximum sequence length is set to 128; for LSTM-style models it is set to 32, with a vocabulary size of 10000

#### 2.4.2 Evaluation
Run the following scripts to evaluate the student models:
- DATA_ROOT: root path of the GLUE datasets
- dataset: task name
- STUDENT_DIR: directory of the saved student model; download links for the distilled student models (SST-2 dataset) are given below

- The different knowledge distillation algorithms:
- KD
Download link: https://pan.baidu.com/s/1EgQyQgxAcFAG8Ch3-4VPaw  Extraction code: 5k9p
```bash
bash run_eval_student_kd.sh
```
- Distilled-BiLSTM
Download link: https://pan.baidu.com/s/1M4XzB2DnLikglxVFvhnYpw  Extraction code: hqhj
```bash
bash run_eval_student_distilled_lstm.sh
```
- BERT-PKD
- Initialized from the teacher's intermediate layers, download link: https://pan.baidu.com/s/1l7vXn-3U05Hzl0RXCJPiLg  Extraction code: 33dk
- Randomly initialized, download link: https://pan.baidu.com/s/1m46j57Tova_yaGLabAqUIw  Extraction code: pdx4
```bash
bash run_eval_student_bert_pkd.sh
```

- TinyBERT
Download link: https://pan.baidu.com/s/1nOAZHd3wLmyVw2vTJB7KfQ  Extraction code: ma65
```bash
bash run_eval_student_tinybert.sh
```
- BERT-of-Theseus
```bash
bash eval_bert_theseus.sh ${GPU_ID} ${VERSION}
example: bash eval_bert_theseus.sh 0 1s1
```



+ 0
- 0
model_compress/model_compress/distil/theseus/__init__.py View File


+ 341
- 0
model_compress/model_compress/distil/theseus/bert.py View File

@@ -0,0 +1,341 @@
"""
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import oneflow as flow
import oneflow.core.common.data_type_pb2 as data_type_util
import oneflow.core.operator.op_conf_pb2 as op_conf_util
import math

class BertBackbone(object):

def __init__(self,
input_ids_blob,
input_mask_blob,
token_type_ids_blob,
vocab_size,
seq_length=512,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
initializer_range=0.02):

with flow.scope.namespace("bert"):
with flow.scope.namespace("embeddings"):
(self.embedding_output_, self.embedding_table_) = _EmbeddingLookup(
input_ids_blob=input_ids_blob,
vocab_size=vocab_size,
embedding_size=hidden_size,
initializer_range=initializer_range,
word_embedding_name="word_embeddings")
self.embedding_output_ = _EmbeddingPostprocessor(
input_blob=self.embedding_output_,
seq_length=seq_length,
embedding_size=hidden_size,
use_token_type=True,
token_type_ids_blob=token_type_ids_blob,
token_type_vocab_size=type_vocab_size,
token_type_embedding_name="token_type_embeddings",
use_position_embeddings=True,
position_embedding_name="position_embeddings",
initializer_range=initializer_range,
max_position_embeddings=max_position_embeddings,
dropout_prob=hidden_dropout_prob)
with flow.scope.namespace("encoder"):
attention_mask_blob = _CreateAttentionMaskFromInputMask(
input_mask_blob, from_seq_length=seq_length, to_seq_length=seq_length)
self.all_encoder_layers_ = _TransformerModel(
input_blob=self.embedding_output_,
attention_mask_blob=attention_mask_blob,
seq_length=seq_length,
hidden_size=hidden_size,
num_hidden_layers=num_hidden_layers,
num_attention_heads=num_attention_heads,
intermediate_size=intermediate_size,
intermediate_act_fn=GetActivation(hidden_act),
hidden_dropout_prob=hidden_dropout_prob,
attention_probs_dropout_prob=attention_probs_dropout_prob,
initializer_range=initializer_range,
do_return_all_layers=False)
self.sequence_output_ = self.all_encoder_layers_[-1]

def embedding_output(self): return self.embedding_output_
def all_encoder_layers(self): return self.all_encoder_layers_
def sequence_output(self): return self.sequence_output_
def embedding_table(self): return self.embedding_table_

def CreateInitializer(std):
return flow.truncated_normal(std)

def _Gelu(in_blob):
return flow.math.gelu(in_blob)

def _TransformerModel(input_blob,
attention_mask_blob,
seq_length,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
intermediate_act_fn=_Gelu,
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
initializer_range=0.02,
do_return_all_layers=False):

assert hidden_size % num_attention_heads == 0
attention_head_size = int(hidden_size / num_attention_heads)
input_width = hidden_size
prev_output_blob = flow.reshape(input_blob, (-1, input_width))
all_layer_output_blobs = []
for layer_idx in range(num_hidden_layers):
with flow.scope.namespace("layer_%d"%layer_idx):
layer_input_blob = prev_output_blob
with flow.scope.namespace("attention"):
with flow.scope.namespace("self"):
attention_output_blob = _AttentionLayer(
from_blob=layer_input_blob,
to_blob=layer_input_blob,
attention_mask_blob=attention_mask_blob,
num_attention_heads=num_attention_heads,
size_per_head=attention_head_size,
attention_probs_dropout_prob=attention_probs_dropout_prob,
initializer_range=initializer_range,
do_return_2d_tensor=True,
from_seq_length=seq_length,
to_seq_length=seq_length)
with flow.scope.namespace("output"):
attention_output_blob = _FullyConnected(
attention_output_blob,
input_size=num_attention_heads * attention_head_size,
units=hidden_size,
weight_initializer=CreateInitializer(initializer_range),
name='dense')
attention_output_blob = _Dropout(attention_output_blob, hidden_dropout_prob)
attention_output_blob = attention_output_blob + layer_input_blob
attention_output_blob = _LayerNorm(attention_output_blob, hidden_size)
with flow.scope.namespace("intermediate"):
if callable(intermediate_act_fn):
act_fn = op_conf_util.kNone
else:
act_fn = intermediate_act_fn
intermediate_output_blob = _FullyConnected(
attention_output_blob,
input_size=num_attention_heads * attention_head_size,
units=intermediate_size,
activation=act_fn,
weight_initializer=CreateInitializer(initializer_range),
name='dense')
if callable(intermediate_act_fn):
intermediate_output_blob = intermediate_act_fn(intermediate_output_blob)
with flow.scope.namespace("output"):
layer_output_blob = _FullyConnected(
intermediate_output_blob,
input_size=intermediate_size,
units=hidden_size,
weight_initializer=CreateInitializer(initializer_range),
name='dense')
layer_output_blob = _Dropout(layer_output_blob, hidden_dropout_prob)
layer_output_blob = layer_output_blob + attention_output_blob
layer_output_blob = _LayerNorm(layer_output_blob, hidden_size)
prev_output_blob = layer_output_blob
all_layer_output_blobs.append(layer_output_blob)

input_shape = (-1, seq_length, hidden_size)
if do_return_all_layers:
final_output_blobs = []
for layer_output_blob in all_layer_output_blobs:
final_output_blob = flow.reshape(layer_output_blob, input_shape)
final_output_blobs.append(final_output_blob)
return final_output_blobs
else:
final_output_blob = flow.reshape(prev_output_blob, input_shape)
return [final_output_blob]

def _AttentionLayer(from_blob,
to_blob,
attention_mask_blob,
num_attention_heads=1,
size_per_head=512,
query_act=op_conf_util.kNone,
key_act=op_conf_util.kNone,
value_act=op_conf_util.kNone,
attention_probs_dropout_prob=0.0,
initializer_range=0.02,
do_return_2d_tensor=False,
batch_size=None,
from_seq_length=None,
to_seq_length=None):

def TransposeForScores(input_blob, num_attention_heads, seq_length, width):
output_blob = flow.reshape(input_blob, [-1, seq_length, num_attention_heads, width])
output_blob = flow.transpose(output_blob, perm=[0, 2, 1, 3])
return output_blob

from_blob_2d = flow.reshape(from_blob, [-1, num_attention_heads * size_per_head])
to_blob_2d = flow.reshape(to_blob, [-1, num_attention_heads * size_per_head])

query_blob = _FullyConnected(
from_blob_2d,
input_size=num_attention_heads * size_per_head,
units=num_attention_heads * size_per_head,
activation=query_act,
name="query",
weight_initializer=CreateInitializer(initializer_range))

key_blob = _FullyConnected(
to_blob_2d,
input_size=num_attention_heads * size_per_head,
units=num_attention_heads * size_per_head,
activation=key_act,
name="key",
weight_initializer=CreateInitializer(initializer_range))

value_blob = _FullyConnected(
to_blob_2d,
input_size=num_attention_heads * size_per_head,
units=num_attention_heads * size_per_head,
activation=value_act,
name="value",
weight_initializer=CreateInitializer(initializer_range))

query_blob = TransposeForScores(query_blob, num_attention_heads, from_seq_length, size_per_head)
key_blob = TransposeForScores(key_blob, num_attention_heads, to_seq_length, size_per_head)

attention_scores_blob = flow.matmul(query_blob, key_blob, transpose_b=True)
attention_scores_blob = attention_scores_blob * (1.0 / math.sqrt(float(size_per_head)))

attention_mask_blob = flow.reshape(attention_mask_blob, [-1, 1, from_seq_length, to_seq_length])
attention_mask_blob = flow.cast(attention_mask_blob, dtype=flow.float)
addr_blob = (attention_mask_blob - 1.0) * 10000.0

attention_scores_blob = attention_scores_blob + addr_blob
attention_probs_blob = flow.nn.softmax(attention_scores_blob)
attention_probs_blob = _Dropout(attention_probs_blob, attention_probs_dropout_prob)

value_blob = flow.reshape(value_blob, [-1, to_seq_length, num_attention_heads, size_per_head])
value_blob = flow.transpose(value_blob, perm=[0, 2, 1, 3])
context_blob = flow.matmul(attention_probs_blob, value_blob)
context_blob = flow.transpose(context_blob, perm=[0, 2, 1, 3])

if do_return_2d_tensor:
context_blob = flow.reshape(context_blob, [-1, num_attention_heads * size_per_head])
else:
context_blob = flow.reshape(context_blob, [-1, from_seq_length, num_attention_heads * size_per_head])
return context_blob

def _FullyConnected(input_blob, input_size, units, activation=None, name=None,
weight_initializer=None, is_train=True):
weight_blob = flow.get_variable(
name=name + '-weight',
shape=[input_size, units],
dtype=input_blob.dtype,
trainable=is_train,
initializer=weight_initializer)
bias_blob = flow.get_variable(
name=name + '-bias',
shape=[units],
dtype=input_blob.dtype,
trainable=is_train,
initializer=flow.constant_initializer(0.0))
output_blob = flow.matmul(input_blob, weight_blob)
output_blob = flow.nn.bias_add(output_blob, bias_blob)
return output_blob

def _Dropout(input_blob, dropout_prob):
if dropout_prob == 0.0:
return input_blob
return flow.nn.dropout(input_blob, rate=dropout_prob)


def _LayerNorm(input_blob, hidden_size):
return flow.layers.layer_norm(input_blob, name='LayerNorm', begin_norm_axis=-1, begin_params_axis=-1)


def _CreateAttentionMaskFromInputMask(to_mask_blob, from_seq_length, to_seq_length):
output = flow.cast(to_mask_blob, dtype=flow.float)
output = flow.reshape(output, [-1, 1, to_seq_length])
zeros = flow.constant(0.0, dtype=flow.float, shape=[from_seq_length, to_seq_length])
output = zeros + output
return output


def _EmbeddingPostprocessor(input_blob,
seq_length,
embedding_size,
use_token_type=False,
token_type_ids_blob=None,
token_type_vocab_size=16,
token_type_embedding_name="token_type_embeddings",
use_position_embeddings=True,
position_embedding_name="position_embeddings",
initializer_range=0.02,
max_position_embeddings=512,
dropout_prob=0.1):
output = input_blob

if use_token_type:
assert token_type_ids_blob is not None
token_type_table = flow.get_variable(name=token_type_embedding_name,
shape=[token_type_vocab_size, embedding_size],
dtype=input_blob.dtype,
initializer=CreateInitializer(initializer_range))
token_type_embeddings = flow.gather(params=token_type_table, indices=token_type_ids_blob, axis=0)
output = output + token_type_embeddings

if use_position_embeddings:
position_table = flow.get_variable(name=position_embedding_name,
shape=[1, max_position_embeddings, embedding_size],
dtype=input_blob.dtype,
initializer=CreateInitializer(initializer_range))
assert seq_length <= max_position_embeddings
if seq_length != max_position_embeddings:
position_table = flow.slice(position_table, begin=[None, 0, 0], size=[None, seq_length, -1])
output = output + position_table

output = _LayerNorm(output, embedding_size)
output = _Dropout(output, dropout_prob)

return output


def _EmbeddingLookup(input_ids_blob,
vocab_size,
embedding_size=128,
initializer_range=0.02,
word_embedding_name="word_embeddings"):
embedding_table = flow.get_variable(name=word_embedding_name, shape=[vocab_size, embedding_size],
dtype=flow.float,
initializer=CreateInitializer(initializer_range))
output = flow.gather(params=embedding_table, indices=input_ids_blob, axis=0)
return output, embedding_table

def GetActivation(name):
if name == 'linear':
return None
elif name == 'relu':
return flow.math.relu
elif name == 'tanh':
return flow.math.tanh
elif name == 'gelu':
return flow.math.gelu
else:
raise Exception("unsupported activation")
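
# ---------------------------------------------------------------------------
# Illustrative smoke test (added for documentation; it mirrors TestLstm in
# ../src/lstm.py and is not part of the original file). Shapes and
# hyper-parameters below are assumptions chosen to match BERT-Base.
def _TestBertBackbone():
    import numpy as np  # local import; bert.py itself does not need numpy

    flow.config.gpu_device_num(1)
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float32)

    @flow.global_function(func_config)
    def BackboneJob(input_ids=flow.FixedTensorDef((2, 128), dtype=flow.int32),
                    input_mask=flow.FixedTensorDef((2, 128), dtype=flow.int32),
                    segment_ids=flow.FixedTensorDef((2, 128), dtype=flow.int32)):
        backbone = BertBackbone(input_ids, input_mask, segment_ids,
                                vocab_size=30522, seq_length=128, hidden_size=768,
                                num_hidden_layers=12, num_attention_heads=12,
                                intermediate_size=3072)
        # activations of the last encoder layer, [batch, seq_length, hidden_size]
        return backbone.sequence_output()

    check_point = flow.train.CheckPoint()
    check_point.init()
    input_ids = np.random.randint(0, 30522, (2, 128)).astype(np.int32)
    input_mask = np.ones((2, 128), dtype=np.int32)
    segment_ids = np.zeros((2, 128), dtype=np.int32)
    print('sequence output shape:', BackboneJob(input_ids, input_mask, segment_ids).get().numpy().shape)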


+ 408
- 0
model_compress/model_compress/distil/theseus/bert_theseus.py View File

@@ -0,0 +1,408 @@
"""
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import oneflow as flow
import oneflow.core.common.data_type_pb2 as data_type_util
import oneflow.core.operator.op_conf_pb2 as op_conf_util
import math

class BertTheseusBackbone(object):

def __init__(self,
input_ids_blob,
input_mask_blob,
token_type_ids_blob,
vocab_size,
seq_length=512,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
initializer_range=0.02,
replace_prob=None,
compress_ratio=1):

with flow.scope.namespace("bert"):
with flow.scope.namespace("embeddings"):
(self.embedding_output_, self.embedding_table_) = _EmbeddingLookup(
input_ids_blob=input_ids_blob,
vocab_size=vocab_size,
embedding_size=hidden_size,
initializer_range=initializer_range,
word_embedding_name="word_embeddings",
is_train=False)
self.embedding_output_ = _EmbeddingPostprocessor(
input_blob=self.embedding_output_,
seq_length=seq_length,
embedding_size=hidden_size,
use_token_type=True,
token_type_ids_blob=token_type_ids_blob,
token_type_vocab_size=type_vocab_size,
token_type_embedding_name="token_type_embeddings",
use_position_embeddings=True,
position_embedding_name="position_embeddings",
initializer_range=initializer_range,
max_position_embeddings=max_position_embeddings,
dropout_prob=hidden_dropout_prob,
is_train=False)
with flow.scope.namespace("encoder"):
attention_mask_blob = _CreateAttentionMaskFromInputMask(
input_mask_blob, from_seq_length=seq_length, to_seq_length=seq_length)
self.all_encoder_layers_ = _TransformerModel(
input_blob=self.embedding_output_,
attention_mask_blob=attention_mask_blob,
seq_length=seq_length,
hidden_size=hidden_size,
num_hidden_layers=num_hidden_layers,
num_attention_heads=num_attention_heads,
intermediate_size=intermediate_size,
intermediate_act_fn=GetActivation(hidden_act),
hidden_dropout_prob=hidden_dropout_prob,
attention_probs_dropout_prob=attention_probs_dropout_prob,
initializer_range=initializer_range,
do_return_all_layers=False,
replace_prob=replace_prob,
compress_ratio=compress_ratio)
self.sequence_output_ = self.all_encoder_layers_[-1]

def embedding_output(self): return self.embedding_output_
def all_encoder_layers(self): return self.all_encoder_layers_
def sequence_output(self): return self.sequence_output_
def embedding_table(self): return self.embedding_table_

def CreateInitializer(std):
return flow.truncated_normal(std)

def _Gelu(in_blob):
return flow.math.gelu(in_blob)

def _TransformerModel(input_blob,
attention_mask_blob,
seq_length,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
intermediate_act_fn=_Gelu,
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
initializer_range=0.02,
do_return_all_layers=False,
replace_prob=0.0,
compress_ratio=1):

# print('| transformer num hidden layers: ', num_hidden_layers)
assert hidden_size % num_attention_heads == 0
attention_head_size = int(hidden_size / num_attention_heads)
input_width = hidden_size
prev_output_blob = flow.reshape(input_blob, (-1, input_width))
# all_layer_output_blobs = []

per_add_teacher_layers = compress_ratio
per_add_student_layers = 1
teacher_layer_idx = student_layer_idx = 0

def add_teacher_layer(base_teacher_layer_idx, sub_teacher_output_blob):
for add_teacher_layer_idx in range(per_add_teacher_layers):
sub_teacher_output_blob = addOnelayer(
layer_idx=base_teacher_layer_idx+add_teacher_layer_idx,
prev_output_blob=sub_teacher_output_blob,
attention_mask_blob=attention_mask_blob,
num_attention_heads=num_attention_heads,
attention_head_size=attention_head_size,
attention_probs_dropout_prob=attention_probs_dropout_prob,
initializer_range=initializer_range, seq_length=seq_length, hidden_size=hidden_size,
hidden_dropout_prob=hidden_dropout_prob,
intermediate_act_fn=intermediate_act_fn,
intermediate_size=intermediate_size, namescope_prefix='', is_train=False)
return sub_teacher_output_blob

def add_student_layer(base_student_layer_idx, sub_student_output_blob):
# with flow.scope.namespace("student"):
sub_student_output_blob = addOnelayer(
base_student_layer_idx, sub_student_output_blob, attention_mask_blob,
num_attention_heads, attention_head_size,
attention_probs_dropout_prob, initializer_range, seq_length, hidden_size, hidden_dropout_prob,
intermediate_act_fn, intermediate_size, namescope_prefix='student-', is_train=True)
return sub_student_output_blob

while teacher_layer_idx < num_hidden_layers:
with flow.scope.placement("cpu", "0:0"):
sample = flow.random.coin_flip(name='layer{}_replacing_prob'.format(teacher_layer_idx), probability=replace_prob)
sample = sample.with_distribute(flow.distribute.broadcast())

prev_output_blob = flow.where(
sample,
x=add_student_layer(student_layer_idx, prev_output_blob),
y=add_teacher_layer(teacher_layer_idx, prev_output_blob),
name='where_layer{}'.format(teacher_layer_idx)
)

teacher_layer_idx += per_add_teacher_layers
student_layer_idx += per_add_student_layers
# print('| current teacher_layer: ', teacher_layer_idx)
# print('| current student_layer: ', student_layer_idx)
# print('| num_hidden_layers: ', num_hidden_layers)

input_shape = (-1, seq_length, hidden_size)
final_output_blob = flow.reshape(prev_output_blob, input_shape)
return [final_output_blob]
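
# Worked example (added for documentation, restating the loop above): with
# num_hidden_layers=12 and compress_ratio=4, per_add_teacher_layers is 4, so the
# loop runs 3 times; each iteration flips a coin with probability `replace_prob`
# and, on success, runs one trainable student layer in place of a block of
# 4 frozen teacher layers (otherwise the 4 teacher layers run unchanged). This is
# how COMPRESS_RATIO=4 compresses a 12-layer BERT-Base to a 3-layer student.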


def addOnelayer(layer_idx, prev_output_blob, attention_mask_blob, num_attention_heads, attention_head_size,
attention_probs_dropout_prob, initializer_range, seq_length, hidden_size, hidden_dropout_prob,
intermediate_act_fn, intermediate_size, namescope_prefix='', is_train=True):
# print('| {} | addOnelayer {}'.format(namescope_prefix, layer_idx))
with flow.scope.namespace("{}layer_{}".format(namescope_prefix, layer_idx)):
layer_input_blob = prev_output_blob
with flow.scope.namespace("attention"):
with flow.scope.namespace("self"):
attention_output_blob = _AttentionLayer(
from_blob=layer_input_blob,
to_blob=layer_input_blob,
attention_mask_blob=attention_mask_blob,
num_attention_heads=num_attention_heads,
size_per_head=attention_head_size,
attention_probs_dropout_prob=attention_probs_dropout_prob,
initializer_range=initializer_range,
do_return_2d_tensor=True,
from_seq_length=seq_length,
to_seq_length=seq_length,
is_train=is_train)
with flow.scope.namespace("output"):
attention_output_blob = _FullyConnected(
attention_output_blob,
input_size=num_attention_heads * attention_head_size,
units=hidden_size,
weight_initializer=CreateInitializer(initializer_range),
name='dense',
is_train=is_train)
attention_output_blob = _Dropout(attention_output_blob, hidden_dropout_prob)
attention_output_blob = attention_output_blob + layer_input_blob
attention_output_blob = _LayerNorm(attention_output_blob, hidden_size)
with flow.scope.namespace("intermediate"):
if callable(intermediate_act_fn):
act_fn = op_conf_util.kNone
else:
act_fn = intermediate_act_fn
intermediate_output_blob = _FullyConnected(
attention_output_blob,
input_size=num_attention_heads * attention_head_size,
units=intermediate_size,
activation=act_fn,
weight_initializer=CreateInitializer(initializer_range),
name='dense',
is_train=is_train)
if callable(intermediate_act_fn):
intermediate_output_blob = intermediate_act_fn(intermediate_output_blob)
with flow.scope.namespace("output"):
layer_output_blob = _FullyConnected(
intermediate_output_blob,
input_size=intermediate_size,
units=hidden_size,
weight_initializer=CreateInitializer(initializer_range),
name='dense',
is_train=is_train)
layer_output_blob = _Dropout(layer_output_blob, hidden_dropout_prob)
layer_output_blob = layer_output_blob + attention_output_blob
layer_output_blob = _LayerNorm(layer_output_blob, hidden_size)
output_blob = layer_output_blob
return output_blob


def _AttentionLayer(from_blob,
to_blob,
attention_mask_blob,
num_attention_heads=1,
size_per_head=512,
query_act=op_conf_util.kNone,
key_act=op_conf_util.kNone,
value_act=op_conf_util.kNone,
attention_probs_dropout_prob=0.0,
initializer_range=0.02,
do_return_2d_tensor=False,
batch_size=None,
from_seq_length=None,
to_seq_length=None,
is_train=True):

def TransposeForScores(input_blob, num_attention_heads, seq_length, width):
output_blob = flow.reshape(input_blob, [-1, seq_length, num_attention_heads, width])
output_blob = flow.transpose(output_blob, perm=[0, 2, 1, 3])
return output_blob

from_blob_2d = flow.reshape(from_blob, [-1, num_attention_heads * size_per_head])
to_blob_2d = flow.reshape(to_blob, [-1, num_attention_heads * size_per_head])

query_blob = _FullyConnected(
from_blob_2d,
input_size=num_attention_heads * size_per_head,
units=num_attention_heads * size_per_head,
activation=query_act,
name="query",
is_train=is_train,
weight_initializer=CreateInitializer(initializer_range))

key_blob = _FullyConnected(
to_blob_2d,
input_size=num_attention_heads * size_per_head,
units=num_attention_heads * size_per_head,
activation=key_act,
name="key",
is_train=is_train,
weight_initializer=CreateInitializer(initializer_range))

value_blob = _FullyConnected(
to_blob_2d,
input_size=num_attention_heads * size_per_head,
units=num_attention_heads * size_per_head,
activation=value_act,
name="value",
is_train=is_train,
weight_initializer=CreateInitializer(initializer_range))

query_blob = TransposeForScores(query_blob, num_attention_heads, from_seq_length, size_per_head)
key_blob = TransposeForScores(key_blob, num_attention_heads, to_seq_length, size_per_head)

attention_scores_blob = flow.matmul(query_blob, key_blob, transpose_b=True)
attention_scores_blob = attention_scores_blob * (1.0 / math.sqrt(float(size_per_head)))

    # broadcast the mask to [batch, 1, from_seq, to_seq] and turn it into an additive
    # bias: positions to attend to get 0, masked positions get -10000 before the softmax
    attention_mask_blob = flow.reshape(attention_mask_blob, [-1, 1, from_seq_length, to_seq_length])
    attention_mask_blob = flow.cast(attention_mask_blob, dtype=flow.float)
    adder_blob = (attention_mask_blob - 1.0) * 10000.0

    attention_scores_blob = attention_scores_blob + adder_blob
attention_probs_blob = flow.nn.softmax(attention_scores_blob)
attention_probs_blob = _Dropout(attention_probs_blob, attention_probs_dropout_prob)

value_blob = flow.reshape(value_blob, [-1, to_seq_length, num_attention_heads, size_per_head])
value_blob = flow.transpose(value_blob, perm=[0, 2, 1, 3])
context_blob = flow.matmul(attention_probs_blob, value_blob)
context_blob = flow.transpose(context_blob, perm=[0, 2, 1, 3])

if do_return_2d_tensor:
context_blob = flow.reshape(context_blob, [-1, num_attention_heads * size_per_head])
else:
context_blob = flow.reshape(context_blob, [-1, from_seq_length, num_attention_heads * size_per_head])
return context_blob
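

def _demo_scaled_dot_product_attention_sketch():
    # NumPy sketch (illustration only) of the core of _AttentionLayer:
    # scores = Q K^T / sqrt(d); masked key positions are pushed to ~-10000
    # before the softmax so they receive ~0 attention weight. Shapes are made up.
    import numpy as np
    rng = np.random.default_rng(0)
    batch, heads, seq, d = 1, 2, 4, 8
    q = rng.standard_normal((batch, heads, seq, d))
    k = rng.standard_normal((batch, heads, seq, d))
    v = rng.standard_normal((batch, heads, seq, d))
    mask = np.array([[1, 1, 1, 0]], dtype=np.float32)            # last token is padding

    scores = q @ np.swapaxes(k, -1, -2) / np.sqrt(d)             # [batch, heads, seq, seq]
    scores = scores + (mask[:, None, None, :] - 1.0) * 10000.0   # same trick as adder_blob
    probs = np.exp(scores - scores.max(-1, keepdims=True))
    probs = probs / probs.sum(-1, keepdims=True)
    context = probs @ v                                          # [batch, heads, seq, d]
    print(np.round(probs[0, 0, 0], 3), context.shape)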

def _FullyConnected(input_blob, input_size, units, activation=None, name=None,
weight_initializer=None, is_train=True):
weight_blob = flow.get_variable(
name=name + '-weight',
shape=[input_size, units],
dtype=input_blob.dtype,
trainable=is_train,
initializer=weight_initializer)
bias_blob = flow.get_variable(
name=name + '-bias',
shape=[units],
dtype=input_blob.dtype,
trainable=is_train,
initializer=flow.constant_initializer(0.0))
output_blob = flow.matmul(input_blob, weight_blob)
output_blob = flow.nn.bias_add(output_blob, bias_blob)
return output_blob

def _Dropout(input_blob, dropout_prob):
if dropout_prob == 0.0:
return input_blob
return flow.nn.dropout(input_blob, rate=dropout_prob)


def _LayerNorm(input_blob, hidden_size):
    # hidden_size is kept for call-site symmetry; layer_norm normalizes over the last axis
    return flow.layers.layer_norm(input_blob, name='LayerNorm', begin_norm_axis=-1, begin_params_axis=-1)


def _CreateAttentionMaskFromInputMask(to_mask_blob, from_seq_length, to_seq_length):
output = flow.cast(to_mask_blob, dtype=flow.float)
output = flow.reshape(output, [-1, 1, to_seq_length])
zeros = flow.constant(0.0, dtype=flow.float, shape=[from_seq_length, to_seq_length])
output = zeros + output
return output
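

def _demo_attention_mask_broadcast_sketch():
    # NumPy sketch (illustration only): a per-token key mask of shape [batch, to_seq]
    # is broadcast against a [from_seq, to_seq] block of zeros, which is exactly how
    # _CreateAttentionMaskFromInputMask builds the [batch, from_seq, to_seq] mask.
    import numpy as np
    to_mask = np.array([[1, 1, 0]], dtype=np.float32)     # [batch=1, to_seq=3]
    from_seq, to_seq = 3, 3
    zeros = np.zeros((from_seq, to_seq), dtype=np.float32)
    mask = zeros + to_mask.reshape(-1, 1, to_seq)         # -> [1, from_seq, to_seq]
    print(mask[0])                                        # every query row sees the same key mask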


def _EmbeddingPostprocessor(input_blob,
seq_length,
embedding_size,
use_token_type=False,
token_type_ids_blob=None,
token_type_vocab_size=16,
token_type_embedding_name="token_type_embeddings",
use_position_embeddings=True,
position_embedding_name="position_embeddings",
initializer_range=0.02,
max_position_embeddings=512,
dropout_prob=0.1,
is_train=True):
output = input_blob

if use_token_type:
assert token_type_ids_blob is not None
token_type_table = flow.get_variable(name=token_type_embedding_name,
shape=[token_type_vocab_size, embedding_size],
dtype=input_blob.dtype,
trainable=is_train,
initializer=CreateInitializer(initializer_range))
token_type_embeddings = flow.gather(params=token_type_table, indices=token_type_ids_blob, axis=0)
output = output + token_type_embeddings

if use_position_embeddings:
position_table = flow.get_variable(name=position_embedding_name,
shape=[1, max_position_embeddings, embedding_size],
dtype=input_blob.dtype,
trainable=is_train,
initializer=CreateInitializer(initializer_range))
assert seq_length <= max_position_embeddings
if seq_length != max_position_embeddings:
position_table = flow.slice(position_table, begin=[None, 0, 0], size=[None, seq_length, -1])
output = output + position_table

output = _LayerNorm(output, embedding_size)
output = _Dropout(output, dropout_prob)

return output
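

def _demo_embedding_postprocessor_sketch():
    # NumPy sketch (illustration only) of what _EmbeddingPostprocessor adds on top of
    # the word embeddings: a token-type embedding gathered per position plus a learned
    # position table sliced down to the actual sequence length. Sizes are made up.
    import numpy as np
    rng = np.random.default_rng(0)
    batch, seq, emb, max_pos, type_vocab = 2, 4, 8, 16, 2
    word_emb = rng.standard_normal((batch, seq, emb))
    token_type_ids = np.zeros((batch, seq), dtype=np.int64)
    token_type_table = rng.standard_normal((type_vocab, emb))
    position_table = rng.standard_normal((1, max_pos, emb))

    out = word_emb + token_type_table[token_type_ids]     # gather on axis 0
    out = out + position_table[:, :seq, :]                # slice then broadcast-add
    print(out.shape)                                      # (2, 4, 8)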


def _EmbeddingLookup(input_ids_blob,
vocab_size,
embedding_size=128,
initializer_range=0.02,
word_embedding_name="word_embeddings",
is_train=True):
embedding_table = flow.get_variable(name=word_embedding_name, shape=[vocab_size, embedding_size],
dtype=flow.float,
trainable=is_train,
initializer=CreateInitializer(initializer_range))
output = flow.gather(params=embedding_table, indices=input_ids_blob, axis=0)
return output, embedding_table

def GetActivation(name):
if name == 'linear':
return None
elif name == 'relu':
return flow.math.relu
elif name == 'tanh':
return flow.math.tanh
elif name == 'gelu':
return flow.math.gelu
else:
        raise ValueError("unsupported activation: {}".format(name))
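

def _demo_get_activation_usage():
    # Usage sketch (illustration only): string names map to OneFlow math ops and
    # 'linear' maps to None. Callers such as addOnelayer apply a callable activation
    # *after* the dense layer rather than passing it into _FullyConnected.
    act = GetActivation('gelu')            # -> flow.math.gelu, a Python callable
    assert callable(act)
    assert GetActivation('linear') is None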


+126 -0 model_compress/model_compress/distil/theseus/classifier.py

@@ -0,0 +1,126 @@
"""
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import oneflow as flow
import bert as bert_util
import bert_theseus as bert_theseus_util
import oneflow.core.operator.op_conf_pb2 as op_conf_util


def GlueBERT(
input_ids_blob,
input_mask_blob,
token_type_ids_blob,
label_blob,
vocab_size,
seq_length=512,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
initializer_range=0.02,
label_num=2,
replace_prob=0.0,
compress_ratio=1
):
# print('| replace_prob: {} | compress_ratio: {}'.format(replace_prob, compress_ratio))
backbone = bert_theseus_util.BertTheseusBackbone(
input_ids_blob=input_ids_blob,
input_mask_blob=input_mask_blob,
token_type_ids_blob=token_type_ids_blob,
vocab_size=vocab_size,
seq_length=seq_length,
hidden_size=hidden_size,
num_hidden_layers=num_hidden_layers,
num_attention_heads=num_attention_heads,
intermediate_size=intermediate_size,
hidden_act=hidden_act,
hidden_dropout_prob=hidden_dropout_prob,
attention_probs_dropout_prob=attention_probs_dropout_prob,
max_position_embeddings=max_position_embeddings,
type_vocab_size=type_vocab_size,
initializer_range=initializer_range,
replace_prob=replace_prob,
compress_ratio=compress_ratio
)
pooled_output = PooledOutput(
sequence_output=backbone.sequence_output(),
hidden_size=hidden_size,
initializer_range=initializer_range,
is_train=False
)
loss, _, logit_blob = _AddClassficationLoss(
input_blob=pooled_output,
label_blob=label_blob,
hidden_size=hidden_size,
label_num=label_num,
initializer_range=initializer_range,
scope_name='classification',
is_train=False
)

return loss, logit_blob


def PooledOutput(sequence_output, hidden_size, initializer_range, is_train=True):
with flow.scope.namespace("bert-pooler"):
first_token_tensor = flow.slice(
sequence_output, [None, 0, 0], [None, 1, -1])
first_token_tensor = flow.reshape(
first_token_tensor, [-1, hidden_size])
pooled_output = bert_util._FullyConnected(
first_token_tensor,
input_size=hidden_size,
units=hidden_size,
weight_initializer=bert_util.CreateInitializer(initializer_range),
name="dense",
is_train=is_train
)
pooled_output = flow.math.tanh(pooled_output)
return pooled_output


def _AddClassficationLoss(input_blob, label_blob, hidden_size, label_num, initializer_range,
scope_name='classification', is_train=True):
with flow.scope.namespace(scope_name):
output_weight_blob = flow.get_variable(
name="output_weights",
shape=[label_num, hidden_size],
dtype=input_blob.dtype,
# initializer=bert_util.CreateInitializer(initializer_range),
initializer=flow.random_normal_initializer(
mean=0.0, stddev=initializer_range, seed=None, dtype=None),
trainable=is_train
)
output_bias_blob = flow.get_variable(
name="output_bias",
shape=[label_num],
dtype=input_blob.dtype,
initializer=flow.constant_initializer(0.0),
trainable=is_train
)
logit_blob = flow.matmul(
input_blob, output_weight_blob, transpose_b=True)
logit_blob = flow.nn.bias_add(logit_blob, output_bias_blob)
pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
logits=logit_blob, labels=label_blob
)
loss = pre_example_loss
return loss, pre_example_loss, logit_blob
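

def _demo_classification_loss_sketch():
    # NumPy sketch (illustration only) of _AddClassficationLoss: logits are
    # pooled_output @ W^T + b, followed by per-example sparse softmax cross-entropy.
    # All values below are made up.
    import numpy as np
    rng = np.random.default_rng(0)
    batch, hidden_size, label_num = 3, 8, 2
    pooled = rng.standard_normal((batch, hidden_size))
    weight = rng.standard_normal((label_num, hidden_size)) * 0.02
    bias = np.zeros(label_num)
    labels = np.array([0, 1, 1])

    logits = pooled @ weight.T + bias
    log_probs = logits - np.log(np.exp(logits).sum(-1, keepdims=True))
    loss = -log_probs[np.arange(batch), labels]       # one loss value per example
    print(loss.shape)                                 # (3,)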

+110 -0 model_compress/model_compress/distil/theseus/config.py

@@ -0,0 +1,110 @@
"""
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import argparse
from datetime import datetime


def str_list(x):
return x.split(',')


def int_list(x):
return list(map(int, x.split(',')))


def float_list(x):
return list(map(float, x.split(',')))


def str2bool(v):
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Unsupported value encountered.')


def get_parser(parser=None):
parser = argparse.ArgumentParser(description="flags for bert")

parser.add_argument('--do_train', type=str2bool, nargs='?', const=True, help='train or not')
parser.add_argument('--do_eval', type=str2bool, nargs='?', const=True, help='eval or not')
    # resource
parser.add_argument("--model", type=str, default='BERT Pretrain')
parser.add_argument("--gpu_num_per_node", type=int, default=1)
parser.add_argument('--num_nodes', type=int, default=1,
help='node/machine number for training')
parser.add_argument('--node_ips', type=str_list, default=['192.168.1.13', '192.168.1.14'],
                        help='node ip list for training, divided by ",", length >= num_nodes')

# train
parser.add_argument("--learning_rate", type=float, default=1e-4, help="Learning rate")
parser.add_argument("--weight_decay_rate", type=float, default=0.01, help="weight decay rate")
parser.add_argument("--warmup_proportion", type=float, default=0.1)
parser.add_argument('--use_fp16', type=str2bool, nargs='?', default='False', const=True,
                        help='use fp16 or not')

    # log and restore/save
parser.add_argument("--loss_print_every_n_iter", type=int, default=10, required=False,
help="print loss every n iteration")
parser.add_argument("--model_save_every_n_iter", type=int, default=10000, required=False,
help="save model every n iteration", )
parser.add_argument("--model_save_dir", type=str,
default="./output/model_save-{}".format(str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))),
required=False, help="model save directory")
parser.add_argument("--result_dir", type=str, default="./result", required=False, help="result save directory")
parser.add_argument("--save_last_snapshot", type=bool, default=False, required=False,
help="save model snapshot for last iteration")
parser.add_argument("--model_load_dir", type=str, default=None, help="model load directory")
parser.add_argument("--log_dir", type=str, default="./output", help="log info save directory")

# bert backbone
parser.add_argument('--do_lower_case', type=str2bool, nargs='?', const=True, default='True')
parser.add_argument("--seq_length", type=int, default=512)
parser.add_argument("--max_predictions_per_seq", type=int, default=80)
parser.add_argument("--num_hidden_layers", type=int, default=24)
parser.add_argument("--num_attention_heads", type=int, default=16)
parser.add_argument("--max_position_embeddings", type=int, default=512)
parser.add_argument("--type_vocab_size", type=int, default=2)
parser.add_argument("--vocab_size", type=int, default=30522)
parser.add_argument("--attention_probs_dropout_prob", type=float, default=0.1)
parser.add_argument("--hidden_dropout_prob", type=float, default=0.1)
parser.add_argument("--hidden_size_per_head", type=int, default=64)

parser.add_argument("--replace_prob", type=float, default=0.0)
parser.add_argument("--compress_ratio", type=int, default=1)

return parser


def print_args(args):
print("=".ljust(66, "="))
print("Running {}: num_gpu_per_node = {}, num_nodes = {}.".format(
args.model, args.gpu_num_per_node, args.num_nodes))
print("=".ljust(66, "="))
for arg in vars(args):
print("{} = {}".format(arg, getattr(args, arg)))
print("-".ljust(66, "-"))
print("Time stamp: {}".format(
str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))))
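

def _demo_parse_flags():
    # Usage sketch (illustration only): the flag values below are made up; they are
    # parsed exactly as a theseus run script would pass them on the command line.
    parser = get_parser()
    args, _ = parser.parse_known_args([
        '--do_train', 'true',
        '--seq_length', '128',
        '--num_hidden_layers', '12',
        '--replace_prob', '0.5',
        '--compress_ratio', '4',
    ])
    print_args(args)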


if __name__ == '__main__':
parser = get_parser()
args = parser.parse_args()
print_args(args)

+90 -0 model_compress/model_compress/distil/theseus/convert_tf_ckpt_to_of.py

@@ -0,0 +1,90 @@
"""
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
"""Convert tensorflow checkpoint to oneflow snapshot"""

import re
import argparse
import tensorflow as tf
import numpy as np
import os

parser = argparse.ArgumentParser()

## Required parameters
parser.add_argument("--tf_checkpoint_path",
default = None,
type = str,
required = True,
help = "Path the TensorFlow checkpoint path.")
parser.add_argument("--of_dump_path",
default = None,
type = str,
required = True,
help = "Path to the output OneFlow model.")

#args = parser.parse_args()
args, unknown = parser.parse_known_args()
print(args)

# parse unknown "name=value" arguments as extra weights to materialize
extra_weights = {}
for u in unknown:
    w = u.split("=")
    assert len(w) == 2, "extra weight arguments must look like name=value"
    extra_weights[w[0]] = float(w[1])


def _write_blob(folder, blob):
    os.makedirs(folder, exist_ok=True)
    filename = os.path.join(folder, "out")
    with open(filename, 'wb') as f:
        f.write(blob.tobytes())
    print(filename, blob.shape)

def _SaveWeightBlob2File(blob, folder):
_write_blob(folder, blob)

for weight, default_value in extra_weights.items():
d = np.full_like(blob, default_value)
_write_blob(folder + weight, d)

def convert():
path = args.tf_checkpoint_path
init_vars = tf.train.list_variables(path)
for name, shape in init_vars:
array = tf.train.load_variable(path, name)

        # TF checkpoint names look like "bert/encoder/layer_0/.../query/kernel";
        # OneFlow stores each variable in a folder named "bert-encoder-layer_0-...-weight"
        sep = name.rfind('/')
        blob_name = name[sep + 1:]
        op_name = name[:sep].replace('/', '-')

        if blob_name == "kernel":
            blob_name = "weight"
        elif blob_name in ['adam_m', 'adam_v']:
            print("found Adam m/v weights:", name)

folder_name = op_name+"-"+blob_name
folder = os.path.join(args.of_dump_path, folder_name)
#print("saved to:", folder)

_SaveWeightBlob2File(array, folder)
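

def _demo_name_mapping():
    # Standalone illustration (hypothetical variable name, no checkpoint needed) of
    # the TF -> OneFlow naming convention applied inside convert(): slashes become
    # dashes and "kernel" becomes "weight", giving the per-variable folder name.
    name = "bert/encoder/layer_0/attention/self/query/kernel"
    sep = name.rfind('/')
    blob_name = name[sep + 1:]
    op_name = name[:sep].replace('/', '-')
    if blob_name == "kernel":
        blob_name = "weight"
    print(op_name + "-" + blob_name)   # bert-encoder-layer_0-attention-self-query-weight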


if __name__ == "__main__":
convert()


+103 -0 model_compress/model_compress/distil/theseus/init_stu.py

@@ -0,0 +1,103 @@
"""
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import os
import argparse
import shutil
import re


def str2bool(v):
if v.lower() in ('yes', 'true', 't', 'y', '1'):
return True
elif v.lower() in ('no', 'false', 'f', 'n', '0'):
return False
else:
raise argparse.ArgumentTypeError('Unsupported value encountered.')


parser = argparse.ArgumentParser()
parser.add_argument("--teacher_model", default=None, type=str, help="The teacher model dir.")
parser.add_argument("--student_model", default=None, type=str, help="The student model dir.")
parser.add_argument("--layer_list", default="2,6,10", type=str,
help="the set of intermediate layers to distill knowledge from")
args = parser.parse_args()

# parse the comma-separated layer list, e.g. "2,6,10" -> [2, 6, 10]
args.layer_list = [int(i) for i in args.layer_list.split(',')]
args.layer_num = len(args.layer_list)

student_filelist = []


def subString(template):
rule = r'bert-encoder-layer_(.*?)-'
slotList = re.findall(rule, template)
return slotList


def CopyFile(filepath, newPath):
if not os.path.exists(newPath):
os.makedirs(newPath)
fileNames = os.listdir(filepath)
for file in fileNames:
newDir = os.path.join(filepath,file)
if os.path.isfile(newDir):
newFile = os.path.join(newPath, file)
shutil.copyfile(newDir, newFile)
else:
if not os.path.exists(os.path.join(newPath, file)):
os.makedirs(os.path.join(newPath, file))
CopyFile(newDir, os.path.join(newPath, file))


if not os.path.exists(args.student_model):
os.makedirs(args.student_model)


for _root, dirnames, _files in os.walk(args.teacher_model):
    for subdir in dirnames:
        if subdir[-2:] in ('-v', '-m'):   # skip optimizer (Adam m/v) state directories
            continue
teacher_layer_num = subString(subdir)
# print('| teacher_layer_num: {}'.format(teacher_layer_num))
if len(teacher_layer_num) == 0:
teacher_model_subdir = os.path.join(args.teacher_model, subdir)
student_model_subdir = os.path.join(args.student_model, subdir)
# print('| teacher model subdir: {} | student model subdir: {}'.format(
# teacher_model_subdir, student_model_subdir))
CopyFile(teacher_model_subdir, student_model_subdir)
else:
teacher_layer_num = int(teacher_layer_num[0])
teacher_model_source_subdir = os.path.join(args.teacher_model, subdir)
teacher_model_target_subdir = os.path.join(args.student_model, subdir)
CopyFile(teacher_model_source_subdir, teacher_model_target_subdir)
# print('| teacher_layer_num: {}'.format(teacher_layer_num))
# print(subdir, subdir.split('layer', 1))
prefix, suffix = subdir.split('layer', 1)
student_subdir = prefix + 'student-layer' + suffix
# student_subdir = 'student-' + subdir
# print('| student_subdir: ', student_subdir)
if teacher_layer_num in args.layer_list:
student_layer_num = args.layer_list.index(teacher_layer_num)
rule = r'bert-encoder-layer_(.*?)-'
x = re.sub(rule, 'bert-encoder-layer_{}-'.format(str(student_layer_num)), student_subdir)
# print('| x: ', x)
teacher_model_subdir = os.path.join(args.teacher_model, subdir)
student_model_subdir = os.path.join(args.student_model, x)
# print('| teacher model subdir: {} | student model subdir: {}'.format(teacher_model_subdir,
# student_model_subdir))
CopyFile(teacher_model_subdir, student_model_subdir)
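

def _demo_layer_renaming():
    # Standalone illustration with a hypothetical directory name: subString() pulls
    # out the teacher layer index, and splitting on 'layer' inserts the 'student-'
    # prefix used for the student copies of the encoder layer variables.
    subdir = 'bert-encoder-layer_6-attention-self-query-weight'
    print(subString(subdir))                    # ['6'] -> weight of teacher layer 6
    prefix, suffix = subdir.split('layer', 1)
    print(prefix + 'student-layer' + suffix)
    # -> bert-encoder-student-layer_6-attention-self-query-weight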

+189 -0 model_compress/model_compress/distil/theseus/pretrain.py

@@ -0,0 +1,189 @@
"""
Copyright 2020 The OneFlow Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import oneflow as flow
import bert as bert_util
import oneflow.core.operator.op_conf_pb2 as op_conf_util


def PreTrain(
input_ids_blob,
input_mask_blob,
token_type_ids_blob,
masked_lm_positions_blob,
masked_lm_ids_blob,
masked_lm_weights_blob,
next_sentence_label_blob,
vocab_size,
seq_length=512,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=16,
max_predictions_per_seq=20,
initializer_range=0.02,
):
backbone = bert_util.BertBackbone(
input_ids_blob=input_ids_blob,
input_mask_blob=input_mask_blob,
token_type_ids_blob=token_type_ids_blob,
vocab_size=vocab_size,
seq_length=seq_length,
hidden_size=hidden_size,
num_hidden_layers=num_hidden_layers,
num_attention_heads=num_attention_heads,
intermediate_size=intermediate_size,
hidden_act=hidden_act,
hidden_dropout_prob=hidden_dropout_prob,
attention_probs_dropout_prob=attention_probs_dropout_prob,
max_position_embeddings=max_position_embeddings,
type_vocab_size=type_vocab_size,
initializer_range=initializer_range,
)

(lm_loss, _, _) = _AddMaskedLanguageModelLoss(
input_blob=backbone.sequence_output(),
output_weights_blob=backbone.embedding_table(),
positions_blob=masked_lm_positions_blob,
label_id_blob=masked_lm_ids_blob,
label_weight_blob=masked_lm_weights_blob,
seq_length=seq_length,
hidden_size=hidden_size,
vocab_size=vocab_size,
max_predictions_per_seq=max_predictions_per_seq,
hidden_act=bert_util.GetActivation(hidden_act),
initializer_range=initializer_range,
)
pooled_output = PooledOutput(
backbone.sequence_output(), hidden_size, initializer_range
)
(ns_loss, _, _) = _AddNextSentenceOutput(
input_blob=pooled_output,
label_blob=next_sentence_label_blob,
hidden_size=hidden_size,
initializer_range=initializer_range,
)
with flow.scope.namespace("cls-loss"):
total_loss = lm_loss + ns_loss
return total_loss, lm_loss, ns_loss


def PooledOutput(sequence_output, hidden_size, initializer_range):
with flow.scope.namespace("bert-pooler"):
first_token_tensor = flow.slice(sequence_output, [None, 0, 0], [None, 1, -1])
first_token_tensor = flow.reshape(first_token_tensor, [-1, hidden_size])
pooled_output = bert_util._FullyConnected(
first_token_tensor,
input_size=hidden_size,
units=hidden_size,
weight_initializer=bert_util.CreateInitializer(initializer_range),
name="dense",
)
pooled_output = flow.math.tanh(pooled_output)
return pooled_output


def _AddMaskedLanguageModelLoss(
input_blob,
output_weights_blob,
positions_blob,
label_id_blob,
label_weight_blob,
seq_length,
hidden_size,
vocab_size,
max_predictions_per_seq,
hidden_act,
initializer_range,
):
with flow.scope.namespace("other"):
sum_label_weight_blob = flow.math.reduce_sum(label_weight_blob, axis=[-1])
ones = sum_label_weight_blob * 0.0 + 1.0
sum_label_weight_blob = flow.math.reduce_sum(sum_label_weight_blob)
batch_size = flow.math.reduce_sum(ones)
sum_label_weight_blob = sum_label_weight_blob / batch_size
with flow.scope.namespace("cls-predictions"):
input_blob = _GatherIndexes(input_blob, positions_blob, seq_length, hidden_size)
with flow.scope.namespace("transform"):
if callable(hidden_act):
act_fn = op_conf_util.kNone
else:
act_fn = hidden_act
input_blob = bert_util._FullyConnected(
input_blob,
input_size=hidden_size,
units=hidden_size,
activation=act_fn,
weight_initializer=bert_util.CreateInitializer(initializer_range),
name="dense",
)
if callable(hidden_act):
input_blob = hidden_act(input_blob)
input_blob = bert_util._LayerNorm(input_blob, hidden_size)
output_bias = flow.get_variable(
name="output_bias",
shape=[vocab_size],
dtype=input_blob.dtype,
initializer=flow.constant_initializer(1.0),
)
logit_blob = flow.matmul(input_blob, output_weights_blob, transpose_b=True)
logit_blob = flow.nn.bias_add(logit_blob, output_bias)
label_id_blob = flow.reshape(label_id_blob, [-1])
pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
logits=logit_blob, labels=label_id_blob
)
pre_example_loss = flow.reshape(pre_example_loss, [-1, max_predictions_per_seq])
numerator = pre_example_loss * label_weight_blob
with flow.scope.namespace("loss"):
numerator = flow.math.reduce_sum(numerator, axis=[-1])
denominator = sum_label_weight_blob + 1e-5
loss = numerator / denominator
return loss, pre_example_loss, logit_blob
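

def _demo_masked_lm_loss_weighting_sketch():
    # NumPy sketch (illustration only) of the normalization above: per-position losses
    # are weighted by label_weights (1 for real masked positions, 0 for padding) and
    # divided by the average number of masked positions per example. Numbers are made up.
    import numpy as np
    per_example_loss = np.array([[2.0, 1.0, 0.7],
                                 [1.5, 0.5, 0.0]])             # [batch, max_predictions]
    label_weights = np.array([[1.0, 1.0, 0.0],
                              [1.0, 1.0, 1.0]])
    numerator = (per_example_loss * label_weights).sum(-1)      # [batch]
    denominator = label_weights.sum() / label_weights.shape[0] + 1e-5
    print(numerator / denominator)                              # approximately [1.2, 0.8]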


def _GatherIndexes(sequence_blob, positions_blob, seq_length, hidden_size):
output = flow.gather(
params=sequence_blob, indices=positions_blob, axis=2, batch_dims=2
)
output = flow.reshape(output, [-1, hidden_size])
return output


def _AddNextSentenceOutput(input_blob, label_blob, hidden_size, initializer_range):
with flow.scope.namespace("cls-seq_relationship"):
output_weight_blob = flow.get_variable(
name="output_weights",
shape=[2, hidden_size],
dtype=input_blob.dtype,
initializer=bert_util.CreateInitializer(initializer_range),
)
output_bias_blob = flow.get_variable(
name="output_bias",
shape=[2],
dtype=input_blob.dtype,
initializer=flow.constant_initializer(0.0),
)
logit_blob = flow.matmul(input_blob, output_weight_blob, transpose_b=True)
logit_blob = flow.nn.bias_add(logit_blob, output_bias_blob)
pre_example_loss = flow.nn.sparse_softmax_cross_entropy_with_logits(
logits=logit_blob, labels=label_blob
)
loss = pre_example_loss
return loss, pre_example_loss, logit_blob

Some files were not shown because too many files changed in this diff
