Browse Source

Merge pull request #242 from fastnlp/dev0.5.0

0.5.0 ready to release!
tags/v0.5.0
yhcc GitHub 5 years ago
parent
commit
c20d250cf4
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
100 changed files with 5242 additions and 2697 deletions
  1. +1
    -0
      .coverage
  2. +2
    -0
      .gitignore
  3. +3
    -1
      .travis.yml
  4. +25
    -18
      README.md
  5. +2
    -2
      docs/Makefile
  6. +0
    -1
      docs/README.md
  7. +158
    -0
      docs/count.py
  8. +11
    -7
      docs/source/conf.py
  9. +3
    -3
      docs/source/fastNLP.core.batch.rst
  10. +3
    -3
      docs/source/fastNLP.core.callback.rst
  11. +3
    -3
      docs/source/fastNLP.core.const.rst
  12. +3
    -3
      docs/source/fastNLP.core.dataset.rst
  13. +3
    -3
      docs/source/fastNLP.core.field.rst
  14. +3
    -3
      docs/source/fastNLP.core.instance.rst
  15. +3
    -3
      docs/source/fastNLP.core.losses.rst
  16. +3
    -3
      docs/source/fastNLP.core.metrics.rst
  17. +3
    -3
      docs/source/fastNLP.core.optimizer.rst
  18. +1
    -4
      docs/source/fastNLP.core.rst
  19. +3
    -3
      docs/source/fastNLP.core.sampler.rst
  20. +3
    -3
      docs/source/fastNLP.core.tester.rst
  21. +3
    -3
      docs/source/fastNLP.core.trainer.rst
  22. +3
    -3
      docs/source/fastNLP.core.utils.rst
  23. +3
    -3
      docs/source/fastNLP.core.vocabulary.rst
  24. +4
    -5
      docs/source/fastNLP.embeddings.bert_embedding.rst
  25. +4
    -5
      docs/source/fastNLP.embeddings.char_embedding.rst
  26. +6
    -0
      docs/source/fastNLP.embeddings.contextual_embedding.rst
  27. +4
    -5
      docs/source/fastNLP.embeddings.elmo_embedding.rst
  28. +2
    -3
      docs/source/fastNLP.embeddings.embedding.rst
  29. +3
    -4
      docs/source/fastNLP.embeddings.rst
  30. +4
    -5
      docs/source/fastNLP.embeddings.stack_embedding.rst
  31. +4
    -5
      docs/source/fastNLP.embeddings.static_embedding.rst
  32. +2
    -3
      docs/source/fastNLP.embeddings.utils.rst
  33. +0
    -7
      docs/source/fastNLP.io.base_loader.rst
  34. +7
    -0
      docs/source/fastNLP.io.data_bundle.rst
  35. +0
    -7
      docs/source/fastNLP.io.data_loader.rst
  36. +0
    -7
      docs/source/fastNLP.io.dataset_loader.rst
  37. +5
    -5
      docs/source/fastNLP.io.embed_loader.rst
  38. +7
    -0
      docs/source/fastNLP.io.file_utils.rst
  39. +7
    -0
      docs/source/fastNLP.io.loader.rst
  40. +5
    -5
      docs/source/fastNLP.io.model_io.rst
  41. +7
    -0
      docs/source/fastNLP.io.pipe.rst
  42. +8
    -7
      docs/source/fastNLP.io.rst
  43. +7
    -0
      docs/source/fastNLP.io.utils.rst
  44. +6
    -0
      docs/source/fastNLP.models.bert.rst
  45. +4
    -5
      docs/source/fastNLP.models.biaffine_parser.rst
  46. +4
    -5
      docs/source/fastNLP.models.cnn_text_classification.rst
  47. +3
    -4
      docs/source/fastNLP.models.rst
  48. +4
    -5
      docs/source/fastNLP.models.sequence_labeling.rst
  49. +2
    -3
      docs/source/fastNLP.models.snli.rst
  50. +4
    -5
      docs/source/fastNLP.models.star_transformer.rst
  51. +1
    -3
      docs/source/fastNLP.modules.decoder.rst
  52. +2
    -3
      docs/source/fastNLP.modules.encoder.rst
  53. +4
    -6
      docs/source/fastNLP.modules.rst
  54. +6
    -0
      docs/source/fastNLP.modules.utils.rst
  55. +6
    -7
      docs/source/fastNLP.rst
  56. BIN
      docs/source/figures/workflow.png
  57. +0
    -1
      docs/source/modules.rst
  58. BIN
      docs/source/quickstart/cn_cls_example.png
  59. +368
    -0
      docs/source/quickstart/文本分类.rst
  60. +220
    -0
      docs/source/tutorials/extend_1_bert_embedding.rst
  61. +0
    -0
      docs/source/tutorials/extend_2_fitlog.rst
  62. +132
    -0
      docs/source/tutorials/tutorial_10_callback.rst
  63. +47
    -47
      docs/source/tutorials/tutorial_1_data_preprocess.rst
  64. +0
    -224
      docs/source/tutorials/tutorial_2_load_dataset.rst
  65. +129
    -0
      docs/source/tutorials/tutorial_2_vocabulary.rst
  66. +370
    -131
      docs/source/tutorials/tutorial_3_embedding.rst
  67. +210
    -0
      docs/source/tutorials/tutorial_4_load_dataset.rst
  68. +0
    -267
      docs/source/tutorials/tutorial_4_loss_optimizer.rst
  69. +0
    -250
      docs/source/tutorials/tutorial_5_datasetiter.rst
  70. +237
    -0
      docs/source/tutorials/tutorial_5_loss_optimizer.rst
  71. +413
    -0
      docs/source/tutorials/tutorial_6_datasetiter.rst
  72. +0
    -114
      docs/source/tutorials/tutorial_6_seq_labeling.rst
  73. +0
    -0
      docs/source/tutorials/tutorial_7_metrics.rst
  74. +3
    -3
      docs/source/tutorials/tutorial_8_modules_models.rst
  75. +0
    -67
      docs/source/tutorials/tutorial_9_callback.rst
  76. +187
    -0
      docs/source/tutorials/tutorial_9_seq_labeling.rst
  77. +0
    -3
      docs/source/user/docs_in_code.rst
  78. +2
    -1
      docs/source/user/installation.rst
  79. +5
    -115
      docs/source/user/quickstart.rst
  80. +14
    -9
      docs/source/user/tutorials.rst
  81. +29
    -13
      fastNLP/__init__.py
  82. +72
    -9
      fastNLP/core/__init__.py
  83. +155
    -0
      fastNLP/core/_logger.py
  84. +26
    -7
      fastNLP/core/_parallel_utils.py
  85. +124
    -55
      fastNLP/core/batch.py
  86. +333
    -120
      fastNLP/core/callback.py
  87. +37
    -12
      fastNLP/core/const.py
  88. +169
    -122
      fastNLP/core/dataset.py
  89. +447
    -0
      fastNLP/core/dist_trainer.py
  90. +155
    -131
      fastNLP/core/field.py
  91. +22
    -13
      fastNLP/core/instance.py
  92. +67
    -102
      fastNLP/core/losses.py
  93. +299
    -247
      fastNLP/core/metrics.py
  94. +48
    -38
      fastNLP/core/optimizer.py
  95. +19
    -15
      fastNLP/core/predictor.py
  96. +10
    -17
      fastNLP/core/sampler.py
  97. +78
    -48
      fastNLP/core/tester.py
  98. +183
    -141
      fastNLP/core/trainer.py
  99. +148
    -108
      fastNLP/core/utils.py
  100. +107
    -73
      fastNLP/core/vocabulary.py

+ 1
- 0
.coverage
File diff suppressed because it is too large
View File


+ 2
- 0
.gitignore View File

@@ -14,3 +14,5 @@ caches
.fitlog
logs/
.fitconfig

docs/build

+ 3
- 1
.travis.yml View File

@@ -4,11 +4,13 @@ python:
# command to install dependencies
install:
- pip install --quiet -r requirements.txt
- pip install --quiet fitlog
- pip install pytest>=3.6
- pip install pytest-cov
# command to run tests
script:
- pytest --cov=./ test/
- python -m spacy download en
- pytest --cov=fastNLP test/

after_success:
- bash <(curl -s https://codecov.io/bash)

+ 25
- 18
README.md View File

@@ -6,11 +6,12 @@
![Hex.pm](https://img.shields.io/hexpm/l/plug.svg)
[![Documentation Status](https://readthedocs.org/projects/fastnlp/badge/?version=latest)](http://fastnlp.readthedocs.io/?badge=latest)

fastNLP 是一款轻量级的 NLP 处理套件。你既可以使用它快速地完成一个序列标注([NER](reproduction/seqence_labelling/ner)、POS-Tagging等)、中文分词、[文本分类](reproduction/text_classification)、[Matching](reproduction/matching)、[指代消解](reproduction/coreference_resolution)、[摘要](reproduction/Summarization)等任务; 也可以使用它构建许多复杂的网络模型,进行科研。它具有如下的特性:
fastNLP 是一款轻量级的 NLP 工具包。你既可以使用它快速地完成一个序列标注([NER](reproduction/sequence_labelling/ner)、POS-Tagging等)、中文分词、[文本分类](reproduction/text_classification)、[Matching](reproduction/matching)、[指代消解](reproduction/coreference_resolution)、[摘要](reproduction/Summarization)等任务; 也可以使用它快速构建许多复杂的网络模型,进行科研。它具有如下的特性:

- 统一的Tabular式数据容器,让数据预处理过程简洁明了。内置多种数据集的DataSet Loader,省去预处理代码;
- 统一的Tabular式数据容器,让数据预处理过程简洁明了。内置多种数据集的Loader和Pipe,省去预处理代码;
- 多种训练、测试组件,例如训练器Trainer;测试器Tester;以及各种评测metrics等等;
- 各种方便的NLP工具,例如预处理embedding加载(包括ELMo和BERT); 中间数据cache等;
- 部分[数据集与预训练模型](https://docs.qq.com/sheet/DVnpkTnF6VW9UeXdh?c=A1A0A0)的自动下载
- 详尽的中文[文档](https://fastnlp.readthedocs.io/)、[教程](https://fastnlp.readthedocs.io/zh/latest/user/tutorials.html)以供查阅;
- 提供诸多高级模块,例如Variational LSTM, Transformer, CRF等;
- 在序列标注、中文分词、文本分类、Matching、指代消解、摘要等任务上封装了各种模型可供直接使用,详细内容见 [reproduction](reproduction) 部分;
@@ -27,6 +28,7 @@ fastNLP 依赖以下包:
+ nltk>=3.4.1
+ requests
+ spacy
+ prettytable>=0.7.2

其中torch的安装可能与操作系统及 CUDA 的版本相关,请参见 [PyTorch 官网](https://pytorch.org/) 。
在依赖包安装完成后,您可以在命令行执行如下指令完成安装
@@ -36,24 +38,30 @@ pip install fastNLP
python -m spacy download en
```

目前使用pip安装fastNLP的版本是0.4.1,有较多功能仍未更新,最新内容以master分支为准。
fastNLP0.5.0版本将在近期推出,请密切关注。


## fastNLP教程

### 快速入门

- [0. 快速入门](https://fastnlp.readthedocs.io/zh/latest/user/quickstart.html)

### 详细使用教程

- [1. 使用DataSet预处理文本](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_1_data_preprocess.html)
- [2. 使用DataSetLoader加载数据集](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_2_load_dataset.html)
- [2. 使用Vocabulary转换文本与index](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_2_vocabulary.html)
- [3. 使用Embedding模块将文本转成向量](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_3_embedding.html)
- [4. 动手实现一个文本分类器I-使用Trainer和Tester快速训练和测试](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_4_loss_optimizer.html)
- [5. 动手实现一个文本分类器II-使用DataSetIter实现自定义训练过程](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_5_datasetiter.html)
- [6. 快速实现序列标注模型](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_6_seq_labeling.html)
- [7. 使用Modules和Models快速搭建自定义模型](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_7_modules_models.html)
- [8. 使用Metric快速评测你的模型](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_8_metrics.html)
- [9. 使用Callback自定义你的训练过程](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_9_callback.html)
- [10. 使用fitlog 辅助 fastNLP 进行科研](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_10_fitlog.html)
- [4. 使用Loader和Pipe加载并处理数据集](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_4_load_dataset.html)
- [5. 动手实现一个文本分类器I-使用Trainer和Tester快速训练和测试](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_5_loss_optimizer.html)
- [6. 动手实现一个文本分类器II-使用DataSetIter实现自定义训练过程](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_6_datasetiter.html)
- [7. 使用Metric快速评测你的模型](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_7_metrics.html)
- [8. 使用Modules和Models快速搭建自定义模型](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_8_modules_models.html)
- [9. 快速实现序列标注模型](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_9_seq_labeling.html)
- [10. 使用Callback自定义你的训练过程](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_10_callback.html)

### 扩展教程

- [Extend-1. BertEmbedding的各种用法](https://fastnlp.readthedocs.io/zh/latest/tutorials/extend_1_bert_embedding.html)
- [Extend-2. 使用fitlog 辅助 fastNLP 进行科研](https://fastnlp.readthedocs.io/zh/latest/tutorials/extend_2_fitlog.html)


## 内置组件
@@ -79,19 +87,19 @@ fastNLP 在 embeddings 模块中内置了几种不同的embedding:静态embedd
<tr>
<td> encoder </td>
<td> 将输入编码为具有具有表示能力的向量 </td>
<td> embedding, RNN, CNN, transformer
<td> Embedding, RNN, CNN, Transformer, ...
</tr>
<tr>
<td> decoder </td>
<td> 将具有某种表示意义的向量解码为需要的输出形式 </td>
<td> MLP, CRF </td>
<td> MLP, CRF, ... </td>
</tr>
</table>


## 项目结构

<img src="./docs/source/figures/workflow.png" width="60%" height="60%">
![](./docs/source/figures/workflow.png)

fastNLP的大致工作流程如上图所示,而项目结构如下:

@@ -118,11 +126,10 @@ fastNLP的大致工作流程如上图所示,而项目结构如下:
</tr>
<tr>
<td><b> fastNLP.io </b></td>
<td> 实现了读写功能,包括数据读入,模型读写等 </td>
<td> 实现了读写功能,包括数据读入与预处理,模型读写,数据与模型自动下载等 </td>
</tr>
</table>


<hr>

*In memory of @FengZiYjun. May his soul rest in peace. We will miss you very very much!*

+ 2
- 2
docs/Makefile View File

@@ -14,13 +14,13 @@ help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

apidoc:
$(SPHINXAPIDOC) -efM -o source ../$(SPHINXPROJ)
$(SPHINXAPIDOC) -efM -o source ../$(SPHINXPROJ) && python3 format.py

server:
cd build/html && python -m http.server

dev:
rm -rf build/html && make html && make server
rm -rf build && make html && make server

.PHONY: help Makefile



+ 0
- 1
docs/README.md View File

@@ -32,7 +32,6 @@ Serving HTTP on 0.0.0.0 port 8000 (http://0.0.0.0:8000/) ...

我们在[这里](./source/user/example.rst)列举了fastNLP文档经常用到的reStructuredText语法(网页查看请结合Raw模式),
您可以通过阅读它进行快速上手。FastNLP大部分的文档都是写在代码中通过Sphinx工具进行抽取生成的,
您还可以参考这篇[未完成的文章](./source/user/docs_in_code.rst)了解代码内文档编写的规范。

## 文档维护人员



+ 158
- 0
docs/count.py View File

@@ -0,0 +1,158 @@
import inspect
import os
import sys


def _colored_string(string: str, color: str or int) -> str:
"""在终端中显示一串有颜色的文字
:param string: 在终端中显示的文字
:param color: 文字的颜色
:return:
"""
if isinstance(color, str):
color = {
"black": 30, "Black": 30, "BLACK": 30,
"red": 31, "Red": 31, "RED": 31,
"green": 32, "Green": 32, "GREEN": 32,
"yellow": 33, "Yellow": 33, "YELLOW": 33,
"blue": 34, "Blue": 34, "BLUE": 34,
"purple": 35, "Purple": 35, "PURPLE": 35,
"cyan": 36, "Cyan": 36, "CYAN": 36,
"white": 37, "White": 37, "WHITE": 37
}[color]
return "\033[%dm%s\033[0m" % (color, string)


def gr(string, flag):
if flag:
return _colored_string(string, "green")
else:
return _colored_string(string, "red")


def find_all_modules():
modules = {}
children = {}
to_doc = set()
root = '../fastNLP'
for path, dirs, files in os.walk(root):
for file in files:
if file.endswith('.py'):
name = ".".join(path.split('/')[1:])
if file.split('.')[0] != "__init__":
name = name + '.' + file.split('.')[0]
__import__(name)
m = sys.modules[name]
modules[name] = m
try:
m.__all__
except:
print(name, "__all__ missing")
continue
if m.__doc__ is None:
print(name, "__doc__ missing")
continue
if "undocumented" not in m.__doc__:
to_doc.add(name)
for module in to_doc:
t = ".".join(module.split('.')[:-1])
if t in to_doc:
if t not in children:
children[t] = set()
children[t].add(module)
for m in children:
children[m] = sorted(children[m])
return modules, to_doc, children


def create_rst_file(modules, name, children):
m = modules[name]
with open("./source/" + name + ".rst", "w") as fout:
t = "=" * len(name)
fout.write(name + "\n")
fout.write(t + "\n")
fout.write("\n")
fout.write(".. automodule:: " + name + "\n")
if name != "fastNLP.core" and len(m.__all__) > 0:
fout.write(" :members: " + ", ".join(m.__all__) + "\n")
short = name[len("fastNLP."):]
if not (short.startswith('models') or short.startswith('modules') or short.startswith('embeddings')):
fout.write(" :inherited-members:\n")
fout.write("\n")
if name in children:
fout.write("子模块\n------\n\n.. toctree::\n :maxdepth: 1\n\n")
for module in children[name]:
fout.write(" " + module + "\n")


def check_file(m, name):
names = name.split('.')
test_name = "test." + ".".join(names[1:-1]) + ".test_" + names[-1]
try:
__import__(test_name)
tm = sys.modules[test_name]
except ModuleNotFoundError:
tm = None
tested = tm is not None
funcs = {}
classes = {}
for item, obj in inspect.getmembers(m):
if inspect.isclass(obj) and obj.__module__ == name and not obj.__name__.startswith('_'):
this = (obj.__doc__ is not None, tested and obj.__name__ in dir(tm), {})
for i in dir(obj):
func = getattr(obj, i)
if inspect.isfunction(func) and not i.startswith('_'):
this[2][i] = (func.__doc__ is not None, False)
classes[obj.__name__] = this
if inspect.isfunction(obj) and obj.__module__ == name and not obj.__name__.startswith('_'):
this = (obj.__doc__ is not None, tested and obj.__name__ in dir(tm)) # docs
funcs[obj.__name__] = this
return funcs, classes


def check_files(modules, out=None):
for name in sorted(modules.keys()):
print(name, file=out)
funcs, classes = check_file(modules[name], name)
if out is None:
for f in funcs:
print("%-30s \t %s \t %s" % (f, gr("文档", funcs[f][0]), gr("测试", funcs[f][1])))
for c in classes:
print("%-30s \t %s \t %s" % (c, gr("文档", classes[c][0]), gr("测试", classes[c][1])))
methods = classes[c][2]
for f in methods:
print(" %-28s \t %s" % (f, gr("文档", methods[f][0])))
else:
for f in funcs:
if not funcs[f][0]:
print("缺少文档 %s" % (f), file=out)
if not funcs[f][1]:
print("缺少测试 %s" % (f), file=out)
for c in classes:
if not classes[c][0]:
print("缺少文档 %s" % (c), file=out)
if not classes[c][1]:
print("缺少测试 %s" % (c), file=out)
methods = classes[c][2]
for f in methods:
if not methods[f][0]:
print("缺少文档 %s" % (c + "." + f), file=out)
print(file=out)


def main():
sys.path.append("..")
print(_colored_string('Getting modules...', "Blue"))
modules, to_doc, children = find_all_modules()
print(_colored_string('Done!', "Green"))
print(_colored_string('Creating rst files...', "Blue"))
for name in to_doc:
create_rst_file(modules, name, children)
print(_colored_string('Done!', "Green"))
print(_colored_string('Checking all files...', "Blue"))
check_files(modules, out=open("results.txt", "w"))
print(_colored_string('Done!', "Green"))


if __name__ == "__main__":
main()

+ 11
- 7
docs/source/conf.py View File

@@ -24,9 +24,9 @@ copyright = '2018, xpqiu'
author = 'xpqiu'

# The short X.Y version
version = '0.4.5'
version = '0.5.0'
# The full version, including alpha/beta/rc tags
release = '0.4.5'
release = '0.5.0'

# -- General configuration ---------------------------------------------------

@@ -48,12 +48,14 @@ extensions = [
autodoc_default_options = {
'member-order': 'bysource',
'special-members': '__init__',
'undoc-members': True,
'undoc-members': False,
}

autoclass_content = "class"

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# template_bridge
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
@@ -113,7 +115,7 @@ html_static_path = ['_static']
# -- Options for HTMLHelp output ---------------------------------------------

# Output file base name for HTML help builder.
htmlhelp_basename = 'fastNLPdoc'
htmlhelp_basename = 'fastNLP doc'

# -- Options for LaTeX output ------------------------------------------------

@@ -166,10 +168,12 @@ texinfo_documents = [

# -- Extension configuration -------------------------------------------------
def maybe_skip_member(app, what, name, obj, skip, options):
if name.startswith("_"):
return True
if obj.__doc__ is None:
return True
if name == "__init__":
return False
if name.startswith("_"):
return True
return False




+ 3
- 3
docs/source/fastNLP.core.batch.rst View File

@@ -2,6 +2,6 @@ fastNLP.core.batch
==================

.. automodule:: fastNLP.core.batch
:members:
:undoc-members:
:show-inheritance:
:members: BatchIter, DataSetIter, TorchLoaderIter
:inherited-members:

+ 3
- 3
docs/source/fastNLP.core.callback.rst View File

@@ -2,6 +2,6 @@ fastNLP.core.callback
=====================

.. automodule:: fastNLP.core.callback
:members:
:undoc-members:
:show-inheritance:
:members: Callback, GradientClipCallback, EarlyStopCallback, FitlogCallback, EvaluateCallback, LRScheduler, ControlC, LRFinder, TensorboardCallback, WarmupCallback, SaveModelCallback, CallbackException, EarlyStopError
:inherited-members:

+ 3
- 3
docs/source/fastNLP.core.const.rst View File

@@ -2,6 +2,6 @@ fastNLP.core.const
==================

.. automodule:: fastNLP.core.const
:members:
:undoc-members:
:show-inheritance:
:members: Const
:inherited-members:

+ 3
- 3
docs/source/fastNLP.core.dataset.rst View File

@@ -2,6 +2,6 @@ fastNLP.core.dataset
====================

.. automodule:: fastNLP.core.dataset
:members:
:undoc-members:
:show-inheritance:
:members: DataSet
:inherited-members:

+ 3
- 3
docs/source/fastNLP.core.field.rst View File

@@ -2,6 +2,6 @@ fastNLP.core.field
==================

.. automodule:: fastNLP.core.field
:members:
:undoc-members:
:show-inheritance:
:members: Padder, AutoPadder, EngChar2DPadder
:inherited-members:

+ 3
- 3
docs/source/fastNLP.core.instance.rst View File

@@ -2,6 +2,6 @@ fastNLP.core.instance
=====================

.. automodule:: fastNLP.core.instance
:members:
:undoc-members:
:show-inheritance:
:members: Instance
:inherited-members:

+ 3
- 3
docs/source/fastNLP.core.losses.rst View File

@@ -2,6 +2,6 @@ fastNLP.core.losses
===================

.. automodule:: fastNLP.core.losses
:members:
:undoc-members:
:show-inheritance:
:members: LossBase, LossFunc, LossInForward, CrossEntropyLoss, BCELoss, L1Loss, NLLLoss
:inherited-members:

+ 3
- 3
docs/source/fastNLP.core.metrics.rst View File

@@ -2,6 +2,6 @@ fastNLP.core.metrics
====================

.. automodule:: fastNLP.core.metrics
:members:
:undoc-members:
:show-inheritance:
:members: MetricBase, AccuracyMetric, SpanFPreRecMetric, ExtractiveQAMetric
:inherited-members:

+ 3
- 3
docs/source/fastNLP.core.optimizer.rst View File

@@ -2,6 +2,6 @@ fastNLP.core.optimizer
======================

.. automodule:: fastNLP.core.optimizer
:members:
:undoc-members:
:show-inheritance:
:members: Optimizer, SGD, Adam, AdamW
:inherited-members:

+ 1
- 4
docs/source/fastNLP.core.rst View File

@@ -2,12 +2,9 @@ fastNLP.core
============

.. automodule:: fastNLP.core
:members:
:undoc-members:
:show-inheritance:

子模块
----------
------

.. toctree::
:maxdepth: 1


+ 3
- 3
docs/source/fastNLP.core.sampler.rst View File

@@ -2,6 +2,6 @@ fastNLP.core.sampler
====================

.. automodule:: fastNLP.core.sampler
:members:
:undoc-members:
:show-inheritance:
:members: Sampler, BucketSampler, SequentialSampler, RandomSampler
:inherited-members:

+ 3
- 3
docs/source/fastNLP.core.tester.rst View File

@@ -2,6 +2,6 @@ fastNLP.core.tester
===================

.. automodule:: fastNLP.core.tester
:members:
:undoc-members:
:show-inheritance:
:members: Tester
:inherited-members:

+ 3
- 3
docs/source/fastNLP.core.trainer.rst View File

@@ -2,6 +2,6 @@ fastNLP.core.trainer
====================

.. automodule:: fastNLP.core.trainer
:members:
:undoc-members:
:show-inheritance:
:members: Trainer
:inherited-members:

+ 3
- 3
docs/source/fastNLP.core.utils.rst View File

@@ -2,6 +2,6 @@ fastNLP.core.utils
==================

.. automodule:: fastNLP.core.utils
:members:
:undoc-members:
:show-inheritance:
:members: cache_results, seq_len_to_mask, get_seq_len
:inherited-members:

+ 3
- 3
docs/source/fastNLP.core.vocabulary.rst View File

@@ -2,6 +2,6 @@ fastNLP.core.vocabulary
=======================

.. automodule:: fastNLP.core.vocabulary
:members:
:undoc-members:
:show-inheritance:
:members: Vocabulary, VocabularyOption
:inherited-members:

+ 4
- 5
docs/source/fastNLP.embeddings.bert_embedding.rst View File

@@ -1,7 +1,6 @@
fastNLP.embeddings.bert\_embedding
==================================
fastNLP.embeddings.bert_embedding
=================================

.. automodule:: fastNLP.embeddings.bert_embedding
:members:
:undoc-members:
:show-inheritance:
:members: BertEmbedding, BertWordPieceEncoder


+ 4
- 5
docs/source/fastNLP.embeddings.char_embedding.rst View File

@@ -1,7 +1,6 @@
fastNLP.embeddings.char\_embedding
==================================
fastNLP.embeddings.char_embedding
=================================

.. automodule:: fastNLP.embeddings.char_embedding
:members:
:undoc-members:
:show-inheritance:
:members: CNNCharEmbedding, LSTMCharEmbedding


+ 6
- 0
docs/source/fastNLP.embeddings.contextual_embedding.rst View File

@@ -0,0 +1,6 @@
fastNLP.embeddings.contextual_embedding
=======================================

.. automodule:: fastNLP.embeddings.contextual_embedding
:members: ContextualEmbedding


+ 4
- 5
docs/source/fastNLP.embeddings.elmo_embedding.rst View File

@@ -1,7 +1,6 @@
fastNLP.embeddings.elmo\_embedding
==================================
fastNLP.embeddings.elmo_embedding
=================================

.. automodule:: fastNLP.embeddings.elmo_embedding
:members:
:undoc-members:
:show-inheritance:
:members: ElmoEmbedding


+ 2
- 3
docs/source/fastNLP.embeddings.embedding.rst View File

@@ -2,6 +2,5 @@ fastNLP.embeddings.embedding
============================

.. automodule:: fastNLP.embeddings.embedding
:members:
:undoc-members:
:show-inheritance:
:members: Embedding, TokenEmbedding


+ 3
- 4
docs/source/fastNLP.embeddings.rst View File

@@ -2,18 +2,17 @@ fastNLP.embeddings
==================

.. automodule:: fastNLP.embeddings
:members:
:undoc-members:
:show-inheritance:
:members: Embedding, TokenEmbedding, StaticEmbedding, ElmoEmbedding, BertEmbedding, BertWordPieceEncoder, StackEmbedding, LSTMCharEmbedding, CNNCharEmbedding, get_embeddings

子模块
----------
------

.. toctree::
:maxdepth: 1

fastNLP.embeddings.bert_embedding
fastNLP.embeddings.char_embedding
fastNLP.embeddings.contextual_embedding
fastNLP.embeddings.elmo_embedding
fastNLP.embeddings.embedding
fastNLP.embeddings.stack_embedding


+ 4
- 5
docs/source/fastNLP.embeddings.stack_embedding.rst View File

@@ -1,7 +1,6 @@
fastNLP.embeddings.stack\_embedding
===================================
fastNLP.embeddings.stack_embedding
==================================

.. automodule:: fastNLP.embeddings.stack_embedding
:members:
:undoc-members:
:show-inheritance:
:members: StackEmbedding


+ 4
- 5
docs/source/fastNLP.embeddings.static_embedding.rst View File

@@ -1,7 +1,6 @@
fastNLP.embeddings.static\_embedding
====================================
fastNLP.embeddings.static_embedding
===================================

.. automodule:: fastNLP.embeddings.static_embedding
:members:
:undoc-members:
:show-inheritance:
:members: StaticEmbedding


+ 2
- 3
docs/source/fastNLP.embeddings.utils.rst View File

@@ -2,6 +2,5 @@ fastNLP.embeddings.utils
========================

.. automodule:: fastNLP.embeddings.utils
:members:
:undoc-members:
:show-inheritance:
:members: get_embeddings


+ 0
- 7
docs/source/fastNLP.io.base_loader.rst View File

@@ -1,7 +0,0 @@
fastNLP.io.base\_loader
=======================

.. automodule:: fastNLP.io.base_loader
:members:
:undoc-members:
:show-inheritance:

+ 7
- 0
docs/source/fastNLP.io.data_bundle.rst View File

@@ -0,0 +1,7 @@
fastNLP.io.data_bundle
======================

.. automodule:: fastNLP.io.data_bundle
:members: DataBundle
:inherited-members:


+ 0
- 7
docs/source/fastNLP.io.data_loader.rst View File

@@ -1,7 +0,0 @@
fastNLP.io.data\_loader
==========================

.. automodule:: fastNLP.io.data_loader
:members:
:undoc-members:
:show-inheritance:

+ 0
- 7
docs/source/fastNLP.io.dataset_loader.rst View File

@@ -1,7 +0,0 @@
fastNLP.io.dataset\_loader
==========================

.. automodule:: fastNLP.io.dataset_loader
:members:
:undoc-members:
:show-inheritance:

+ 5
- 5
docs/source/fastNLP.io.embed_loader.rst View File

@@ -1,7 +1,7 @@
fastNLP.io.embed\_loader
========================
fastNLP.io.embed_loader
=======================

.. automodule:: fastNLP.io.embed_loader
:members:
:undoc-members:
:show-inheritance:
:members: EmbedLoader, EmbeddingOption
:inherited-members:

+ 7
- 0
docs/source/fastNLP.io.file_utils.rst View File

@@ -0,0 +1,7 @@
fastNLP.io.file_utils
=====================

.. automodule:: fastNLP.io.file_utils
:members: cached_path, get_filepath, get_cache_path, split_filename_suffix, get_from_cache
:inherited-members:


+ 7
- 0
docs/source/fastNLP.io.loader.rst View File

@@ -0,0 +1,7 @@
fastNLP.io.loader
=================

.. automodule:: fastNLP.io.loader
:members: Loader, YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, ChnSentiCorpLoader, THUCNewsLoader, WeiboSenti100kLoader, ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader, MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader, CSVLoader, JsonLoader, CWSLoader, MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader, CNXNLILoader, BQCorpusLoader, LCQMCLoader, CoReferenceLoader
:inherited-members:


+ 5
- 5
docs/source/fastNLP.io.model_io.rst View File

@@ -1,7 +1,7 @@
fastNLP.io.model\_io
====================
fastNLP.io.model_io
===================

.. automodule:: fastNLP.io.model_io
:members:
:undoc-members:
:show-inheritance:
:members: ModelLoader, ModelSaver
:inherited-members:

+ 7
- 0
docs/source/fastNLP.io.pipe.rst View File

@@ -0,0 +1,7 @@
fastNLP.io.pipe
===============

.. automodule:: fastNLP.io.pipe
:members: Pipe, CWSPipe, YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, THUCNewsPipe, WeiboSenti100kPipe, Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, WeiboNERPipe, PeopleDailyPipe, Conll2003Pipe, MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, CNXNLIBertPipe, BQCorpusBertPipe, LCQMCBertPipe, MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe, LCQMCPipe, CNXNLIPipe, BQCorpusPipe, RenamePipe, GranularizePipe, MachingTruncatePipe, CoReferencePipe
:inherited-members:


+ 8
- 7
docs/source/fastNLP.io.rst View File

@@ -2,18 +2,19 @@ fastNLP.io
==========

.. automodule:: fastNLP.io
:members:
:undoc-members:
:show-inheritance:
:members: DataBundle, EmbedLoader, Loader, YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, ChnSentiCorpLoader, THUCNewsLoader, WeiboSenti100kLoader, ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader, MsraNERLoader, WeiboNERLoader, PeopleDailyNERLoader, CSVLoader, JsonLoader, CWSLoader, MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader, CNXNLILoader, BQCorpusLoader, LCQMCLoader, Pipe, YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, THUCNewsPipe, WeiboSenti100kPipe, Conll2003Pipe, Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, PeopleDailyPipe, WeiboNERPipe, CWSPipe, MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe, ModelLoader, ModelSaver
:inherited-members:

子模块
----------
------

.. toctree::
:maxdepth: 1

fastNLP.io.base_loader
fastNLP.io.data_bundle
fastNLP.io.embed_loader
fastNLP.io.dataset_loader
fastNLP.io.data_loader
fastNLP.io.file_utils
fastNLP.io.loader
fastNLP.io.model_io
fastNLP.io.pipe
fastNLP.io.utils

+ 7
- 0
docs/source/fastNLP.io.utils.rst View File

@@ -0,0 +1,7 @@
fastNLP.io.utils
================

.. automodule:: fastNLP.io.utils
:members: check_loader_paths
:inherited-members:


+ 6
- 0
docs/source/fastNLP.models.bert.rst View File

@@ -0,0 +1,6 @@
fastNLP.models.bert
===================

.. automodule:: fastNLP.models.bert
:members: BertForSequenceClassification, BertForSentenceMatching, BertForMultipleChoice, BertForTokenClassification, BertForQuestionAnswering


+ 4
- 5
docs/source/fastNLP.models.biaffine_parser.rst View File

@@ -1,7 +1,6 @@
fastNLP.models.biaffine\_parser
===============================
fastNLP.models.biaffine_parser
==============================

.. automodule:: fastNLP.models.biaffine_parser
:members:
:undoc-members:
:show-inheritance:
:members: BiaffineParser, GraphParser


+ 4
- 5
docs/source/fastNLP.models.cnn_text_classification.rst View File

@@ -1,7 +1,6 @@
fastNLP.models.cnn\_text\_classification
========================================
fastNLP.models.cnn_text_classification
======================================

.. automodule:: fastNLP.models.cnn_text_classification
:members:
:undoc-members:
:show-inheritance:
:members: CNNText


+ 3
- 4
docs/source/fastNLP.models.rst View File

@@ -2,16 +2,15 @@ fastNLP.models
==============

.. automodule:: fastNLP.models
:members:
:undoc-members:
:show-inheritance:
:members: CNNText, SeqLabeling, AdvSeqLabel, ESIM, StarTransEnc, STSeqLabel, STNLICls, STSeqCls, BiaffineParser, GraphParser, BertForSequenceClassification, BertForSentenceMatching, BertForMultipleChoice, BertForTokenClassification, BertForQuestionAnswering

子模块
----------
------

.. toctree::
:maxdepth: 1

fastNLP.models.bert
fastNLP.models.biaffine_parser
fastNLP.models.cnn_text_classification
fastNLP.models.sequence_labeling


+ 4
- 5
docs/source/fastNLP.models.sequence_labeling.rst View File

@@ -1,7 +1,6 @@
fastNLP.models.sequence\_labeling
=================================
fastNLP.models.sequence_labeling
================================

.. automodule:: fastNLP.models.sequence_labeling
:members:
:undoc-members:
:show-inheritance:
:members: SeqLabeling, AdvSeqLabel, BiLSTMCRF


+ 2
- 3
docs/source/fastNLP.models.snli.rst View File

@@ -2,6 +2,5 @@ fastNLP.models.snli
===================

.. automodule:: fastNLP.models.snli
:members:
:undoc-members:
:show-inheritance:
:members: ESIM


+ 4
- 5
docs/source/fastNLP.models.star_transformer.rst View File

@@ -1,7 +1,6 @@
fastNLP.models.star\_transformer
================================
fastNLP.models.star_transformer
===============================

.. automodule:: fastNLP.models.star_transformer
:members:
:undoc-members:
:show-inheritance:
:members: StarTransEnc, STNLICls, STSeqCls, STSeqLabel


+ 1
- 3
docs/source/fastNLP.modules.decoder.rst View File

@@ -2,7 +2,5 @@ fastNLP.modules.decoder
=======================

.. automodule:: fastNLP.modules.decoder
:members:
:undoc-members:
:show-inheritance:
:members: MLP, ConditionalRandomField, viterbi_decode, allowed_transitions


+ 2
- 3
docs/source/fastNLP.modules.encoder.rst View File

@@ -2,6 +2,5 @@ fastNLP.modules.encoder
=======================

.. automodule:: fastNLP.modules.encoder
:members:
:undoc-members:
:show-inheritance:
:members: ConvolutionCharEncoder, LSTMCharEncoder, ConvMaxpool, LSTM, StarTransformer, TransformerEncoder, VarRNN, VarLSTM, VarGRU, MaxPool, MaxPoolWithMask, KMaxPool, AvgPool, AvgPoolWithMask, MultiHeadAttention, BiAttention, SelfAttention


+ 4
- 6
docs/source/fastNLP.modules.rst View File

@@ -2,16 +2,14 @@ fastNLP.modules
===============

.. automodule:: fastNLP.modules
:members:
:undoc-members:
:show-inheritance:
:members: ConvolutionCharEncoder, LSTMCharEncoder, ConvMaxpool, LSTM, StarTransformer, TransformerEncoder, VarRNN, VarLSTM, VarGRU, MaxPool, MaxPoolWithMask, KMaxPool, AvgPool, AvgPoolWithMask, MultiHeadAttention, MLP, ConditionalRandomField, viterbi_decode, allowed_transitions, TimestepDropout

子模块
-----------
------

.. toctree::
:titlesonly:
:maxdepth: 1

fastNLP.modules.decoder
fastNLP.modules.encoder
fastNLP.modules.encoder
fastNLP.modules.utils

+ 6
- 0
docs/source/fastNLP.modules.utils.rst View File

@@ -0,0 +1,6 @@
fastNLP.modules.utils
=====================

.. automodule:: fastNLP.modules.utils
:members: initial_parameter, summary


+ 6
- 7
docs/source/fastNLP.rst View File

@@ -1,13 +1,12 @@
API 文档
===============
fastNLP
=======

.. automodule:: fastNLP
:members:
:undoc-members:
:show-inheritance:
:members: Instance, FieldArray, DataSetIter, BatchIter, TorchLoaderIter, Vocabulary, DataSet, Const, Trainer, Tester, Callback, GradientClipCallback, EarlyStopCallback, FitlogCallback, EvaluateCallback, LRScheduler, ControlC, LRFinder, TensorboardCallback, WarmupCallback, SaveModelCallback, CallbackException, EarlyStopError, Padder, AutoPadder, EngChar2DPadder, AccuracyMetric, SpanFPreRecMetric, ExtractiveQAMetric, Optimizer, SGD, Adam, AdamW, Sampler, SequentialSampler, BucketSampler, RandomSampler, LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, LossInForward, cache_results, logger
:inherited-members:

内部模块
-----------
模块
------

.. toctree::
:maxdepth: 1


BIN
docs/source/figures/workflow.png View File

Before After
Width: 2400  |  Height: 1798  |  Size: 250 kB Width: 1267  |  Height: 1107  |  Size: 47 kB

+ 0
- 1
docs/source/modules.rst View File

@@ -2,7 +2,6 @@ fastNLP
=======

.. toctree::
:titlesonly:
:maxdepth: 4

fastNLP

BIN
docs/source/quickstart/cn_cls_example.png View File

Before After
Width: 722  |  Height: 317  |  Size: 162 kB

+ 368
- 0
docs/source/quickstart/文本分类.rst View File

@@ -0,0 +1,368 @@
文本分类(Text classification)
=============================

文本分类任务是将一句话或一段话划分到某个具体的类别。比如垃圾邮件识别,文本情绪分类等。

.. code-block:: text

1, 商务大床房,房间很大,床有2M宽,整体感觉经济实惠不错!

其中开头的1是只这条评论的标签,表示是正面的情绪。我们将使用到的数据可以通过 `此链接 <http://212.129.155.247/dataset/chn_senti_corp.zip>`_
下载并解压,当然也可以通过fastNLP自动下载该数据。

数据中的内容如下图所示。接下来,我们将用fastNLP在这个数据上训练一个分类网络。

.. figure:: ./cn_cls_example.png
:alt: jupyter

jupyter

步骤
----

一共有以下的几个步骤:

1. `读取数据 <#id4>`_

2. `预处理数据 <#id5>`_

3. `选择预训练词向量 <#id6>`_

4. `创建模型 <#id7>`_

5. `训练模型 <#id8>`_

(1) 读取数据
~~~~~~~~~~~~~~~~~~~~

fastNLP提供多种数据的自动下载与自动加载功能,对于这里我们要用到的数据,我们可以用 :class:`~fastNLP.io.Loader` 自动下载并加载该数据。
更多有关Loader的使用可以参考 :mod:`~fastNLP.io.loader`

.. code-block:: python

from fastNLP.io import ChnSentiCorpLoader
loader = ChnSentiCorpLoader() # 初始化一个中文情感分类的loader
data_dir = loader.download() # 这一行代码将自动下载数据到默认的缓存地址, 并将该地址返回
data_bundle = loader.load(data_dir) # 这一行代码将从{data_dir}处读取数据至DataBundle


DataBundle的相关介绍,可以参考 :class:`~fastNLP.io.DataBundle` 。我们可以打印该data\_bundle的基本信息。

.. code-block:: python

print(data_bundle)


.. code-block:: text

In total 3 datasets:
dev has 1200 instances.
train has 9600 instances.
test has 1200 instances.
In total 0 vocabs:


可以看出,该data\_bundle中一个含有三个 :class:`~fastNLP.DataSet` 。通过下面的代码,我们可以查看DataSet的基本情况

.. code-block:: python

print(data_bundle.get_dataset('train')[:2]) # 查看Train集前两个sample


.. code-block:: text

+-----------------------------+--------+
| raw_chars | target |
+-----------------------------+--------+
| 选择珠江花园的原因就是方... | 1 |
| 15.4寸笔记本的键盘确实爽... | 1 |
+-----------------------------+--------+

(2) 预处理数据
~~~~~~~~~~~~~~~~~~~~

在NLP任务中,预处理一般包括:

(a) 将一整句话切分成汉字或者词;

(b) 将文本转换为index

fastNLP中也提供了多种数据集的处理类,这里我们直接使用fastNLP的ChnSentiCorpPipe。更多关于Pipe的说明可以参考 :mod:`~fastNLP.io.pipe` 。

.. code-block:: python

from fastNLP.io import ChnSentiCorpPipe

pipe = ChnSentiCorpPipe()
data_bundle = pipe.process(data_bundle) # 所有的Pipe都实现了process()方法,且输入输出都为DataBundle类型

print(data_bundle) # 打印data_bundle,查看其变化


.. code-block:: text

In total 3 datasets:
dev has 1200 instances.
train has 9600 instances.
test has 1200 instances.
In total 2 vocabs:
chars has 4409 entries.
target has 2 entries.



可以看到除了之前已经包含的3个 :class:`~fastNLP.DataSet` ,还新增了两个 :class:`~fastNLP.Vocabulary` 。我们可以打印DataSet中的内容

.. code-block:: python

print(data_bundle.get_dataset('train')[:2])


.. code-block:: text

+-----------------+--------+-----------------+---------+
| raw_chars | target | chars | seq_len |
+-----------------+--------+-----------------+---------+
| 选择珠江花园... | 0 | [338, 464, 1... | 106 |
| 15.4寸笔记本... | 0 | [50, 133, 20... | 56 |
+-----------------+--------+-----------------+---------+


新增了一列为数字列表的chars,以及变为数字的target列。可以看出这两列的名称和刚好与data\_bundle中两个Vocabulary的名称是一致的,我们可以打印一下Vocabulary看一下里面的内容。

.. code-block:: python

char_vocab = data_bundle.get_vocab('chars')
print(char_vocab)


.. code-block:: text

Vocabulary(['选', '择', '珠', '江', '花']...)


Vocabulary是一个记录着词语与index之间映射关系的类,比如

.. code-block:: python

index = char_vocab.to_index('选')
print("'选'的index是{}".format(index)) # 这个值与上面打印出来的第一个instance的chars的第一个index是一致的
print("index:{}对应的汉字是{}".format(index, char_vocab.to_word(index)))


.. code-block:: text

'选'的index是338
index:338对应的汉字是选


(3) 选择预训练词向量
~~~~~~~~~~~~~~~~~~~~

由于Word2vec, Glove, Elmo,
Bert等预训练模型可以增强模型的性能,所以在训练具体任务前,选择合适的预训练词向量非常重要。
在fastNLP中我们提供了多种Embedding使得加载这些预训练模型的过程变得更加便捷。
这里我们先给出一个使用word2vec的中文汉字预训练的示例,之后再给出一个使用Bert的文本分类。
这里使用的预训练词向量为'cn-fastnlp-100d',fastNLP将自动下载该embedding至本地缓存,
fastNLP支持使用名字指定的Embedding以及相关说明可以参见 :mod:`fastNLP.embeddings`

.. code-block:: python

from fastNLP.embeddings import StaticEmbedding

word2vec_embed = StaticEmbedding(char_vocab, model_dir_or_name='cn-char-fastnlp-100d')


.. code-block:: text

Found 4321 out of 4409 compound in the pre-training embedding.

(4) 创建模型
~~~~~~~~~~~~

.. code-block:: python

from torch import nn
from fastNLP.modules import LSTM
import torch
# 定义模型
class BiLSTMMaxPoolCls(nn.Module):
def __init__(self, embed, num_classes, hidden_size=400, num_layers=1, dropout=0.3):
super().__init__()
self.embed = embed
self.lstm = LSTM(self.embed.embedding_dim, hidden_size=hidden_size//2, num_layers=num_layers,
batch_first=True, bidirectional=True)
self.dropout_layer = nn.Dropout(dropout)
self.fc = nn.Linear(hidden_size, num_classes)
def forward(self, chars, seq_len): # 这里的名称必须和DataSet中相应的field对应,比如之前我们DataSet中有chars,这里就必须为chars
# chars:[batch_size, max_len]
# seq_len: [batch_size, ]
chars = self.embed(chars)
outputs, _ = self.lstm(chars, seq_len)
outputs = self.dropout_layer(outputs)
outputs, _ = torch.max(outputs, dim=1)
outputs = self.fc(outputs)
return {'pred':outputs} # [batch_size,], 返回值必须是dict类型,且预测值的key建议设为pred
# 初始化模型
model = BiLSTMMaxPoolCls(word2vec_embed, len(data_bundle.get_vocab('target')))

(5) 训练模型
~~~~~~~~~~~~

fastNLP提供了Trainer对象来组织训练过程,包括完成loss计算(所以在初始化Trainer的时候需要指定loss类型),梯度更新(所以在初始化Trainer的时候需要提供优化器optimizer)以及在验证集上的性能验证(所以在初始化时需要提供一个Metric)

.. code-block:: python

from fastNLP import Trainer
from fastNLP import CrossEntropyLoss
from torch.optim import Adam
from fastNLP import AccuracyMetric
loss = CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)
metric = AccuracyMetric()
device = 0 if torch.cuda.is_available() else 'cpu' # 如果有gpu的话在gpu上运行,训练速度会更快
trainer = Trainer(train_data=data_bundle.get_dataset('train'), model=model, loss=loss,
optimizer=optimizer, batch_size=32, dev_data=data_bundle.get_dataset('dev'),
metrics=metric, device=device)
trainer.train() # 开始训练,训练完成之后默认会加载在dev上表现最好的模型
# 在测试集上测试一下模型的性能
from fastNLP import Tester
print("Performance on test is:")
tester = Tester(data=data_bundle.get_dataset('test'), model=model, metrics=metric, batch_size=64, device=device)
tester.test()


.. code-block:: text

input fields after batch(if batch size is 2):
target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2])
chars: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 106])
seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2])
target fields after batch(if batch size is 2):
target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2])
seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2])
Evaluate data in 0.01 seconds!
training epochs started 2019-09-03-23-57-10

Evaluate data in 0.43 seconds!
Evaluation on dev at Epoch 1/10. Step:300/3000:
AccuracyMetric: acc=0.81

Evaluate data in 0.44 seconds!
Evaluation on dev at Epoch 2/10. Step:600/3000:
AccuracyMetric: acc=0.8675

Evaluate data in 0.44 seconds!
Evaluation on dev at Epoch 3/10. Step:900/3000:
AccuracyMetric: acc=0.878333

....

Evaluate data in 0.48 seconds!
Evaluation on dev at Epoch 9/10. Step:2700/3000:
AccuracyMetric: acc=0.8875

Evaluate data in 0.43 seconds!
Evaluation on dev at Epoch 10/10. Step:3000/3000:
AccuracyMetric: acc=0.895833
In Epoch:7/Step:2100, got best dev performance:
AccuracyMetric: acc=0.8975
Reloaded the best model.

Evaluate data in 0.34 seconds!
[tester]
AccuracyMetric: acc=0.8975

{'AccuracyMetric': {'acc': 0.8975}}



使用Bert进行文本分类
~~~~~~~~~~~~~~~~~~~~

.. code-block:: python

# 只需要切换一下Embedding即可
from fastNLP.embeddings import BertEmbedding
# 这里为了演示一下效果,所以默认Bert不更新权重
bert_embed = BertEmbedding(char_vocab, model_dir_or_name='cn', auto_truncate=True, requires_grad=False)
model = BiLSTMMaxPoolCls(bert_embed, len(data_bundle.get_vocab('target')), )
import torch
from fastNLP import Trainer
from fastNLP import CrossEntropyLoss
from torch.optim import Adam
from fastNLP import AccuracyMetric
loss = CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=2e-5)
metric = AccuracyMetric()
device = 0 if torch.cuda.is_available() else 'cpu' # 如果有gpu的话在gpu上运行,训练速度会更快
trainer = Trainer(train_data=data_bundle.get_dataset('train'), model=model, loss=loss,
optimizer=optimizer, batch_size=16, dev_data=data_bundle.get_dataset('test'),
metrics=metric, device=device, n_epochs=3)
trainer.train() # 开始训练,训练完成之后默认会加载在dev上表现最好的模型
# 在测试集上测试一下模型的性能
from fastNLP import Tester
print("Performance on test is:")
tester = Tester(data=data_bundle.get_dataset('test'), model=model, metrics=metric, batch_size=64, device=device)
tester.test()


.. code-block:: text

loading vocabulary file ~/.fastNLP/embedding/bert-chinese-wwm/vocab.txt
Load pre-trained BERT parameters from file ~/.fastNLP/embedding/bert-chinese-wwm/chinese_wwm_pytorch.bin.
Start to generating word pieces for word.
Found(Or segment into word pieces) 4286 words out of 4409.
input fields after batch(if batch size is 2):
target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2])
chars: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 106])
seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2])
target fields after batch(if batch size is 2):
target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2])
seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2])
Evaluate data in 0.05 seconds!
training epochs started 2019-09-04-00-02-37

Evaluate data in 15.89 seconds!
Evaluation on dev at Epoch 1/3. Step:1200/3600:
AccuracyMetric: acc=0.9

Evaluate data in 15.92 seconds!
Evaluation on dev at Epoch 2/3. Step:2400/3600:
AccuracyMetric: acc=0.904167

Evaluate data in 15.91 seconds!
Evaluation on dev at Epoch 3/3. Step:3600/3600:
AccuracyMetric: acc=0.918333

In Epoch:3/Step:3600, got best dev performance:
AccuracyMetric: acc=0.918333
Reloaded the best model.
Performance on test is:

Evaluate data in 29.24 seconds!
[tester]
AccuracyMetric: acc=0.919167

{'AccuracyMetric': {'acc': 0.919167}}



+ 220
- 0
docs/source/tutorials/extend_1_bert_embedding.rst View File

@@ -0,0 +1,220 @@
==============================
BertEmbedding的各种用法
==============================

Bert自从在 `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`_
中被提出后,因其性能卓越受到了极大的关注,在这里我们展示一下在fastNLP中如何使用Bert进行各类任务。其中中文Bert我们使用的模型的权重来自于
`中文Bert预训练 <https://github.com/ymcui/Chinese-BERT-wwm>`_ 。

为了方便大家的使用,fastNLP提供了预训练的Embedding权重及数据集的自动下载,支持自动下载的Embedding和数据集见
`数据集 <https://docs.qq.com/sheet/DVnpkTnF6VW9UeXdh?tab=fed5xh&c=D42A0AC0>`_ 。或您可从 :doc:`/tutorials/tutorial_3_embedding` 与
:doc:`/tutorials/tutorial_4_load_dataset` 了解更多相关信息。

----------------------------------
中文任务
----------------------------------
下面我们将介绍通过使用Bert来进行文本分类, 中文命名实体识别, 文本匹配, 中文问答。

1. 使用Bert进行文本分类
----------------------------------
文本分类是指给定一段文字,判定其所属的类别。例如下面的文本情感分类

.. code-block:: text

1, 商务大床房,房间很大,床有2M宽,整体感觉经济实惠不错!

这里我们使用fastNLP提供自动下载的微博分类进行测试

.. code-block:: python

from fastNLP.io import WeiboSenti100kPipe

data_bundle =WeiboSenti100kPipe().process_from_file()
data_bundle.rename_field('chars', 'words')

# 载入BertEmbedding
from fastNLP.embeddings import BertEmbedding

embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='cn-wwm', include_cls_sep=True)

# 载入模型
from fastNLP.models import BertForSequenceClassification

model = BertForSequenceClassification(embed, len(data_bundle.get_vocab('target')))

# 训练模型
from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam

trainer = Trainer(data_bundle.get_dataset('train'), model,
optimizer=Adam(model_params=model.parameters(), lr=2e-5),
loss=CrossEntropyLoss(), device=0,
batch_size=8, dev_data=data_bundle.get_dataset('dev'),
metrics=AccuracyMetric(), n_epochs=2, print_every=1)
trainer.train()

# 测试结果
from fastNLP import Tester

tester = Tester(data_bundle.get_dataset('test'), model, batch_size=128, metrics=AccuracyMetric())
tester.test()

输出结果::

In Epoch:1/Step:12499, got best dev performance:
AccuracyMetric: acc=0.9838
Reloaded the best model.
Evaluate data in 63.84 seconds!
[tester]
AccuracyMetric: acc=0.9815


2. 使用Bert进行命名实体识别
----------------------------------
命名实体识别是给定一句话,标记出其中的实体。一般序列标注的任务都使用conll格式,conll格式是至一行中通过制表符分隔不同的内容,使用空行分隔
两句话,例如下面的例子

.. code-block:: text

中 B-ORG
共 I-ORG
中 I-ORG
央 I-ORG
致 O
中 B-ORG
国 I-ORG
致 I-ORG
公 I-ORG
党 I-ORG
十 I-ORG
一 I-ORG
大 I-ORG
的 O
贺 O
词 O

这部分内容请参考 :doc:`快速实现序列标注模型 </tutorials/tutorial_9_seq_labeling>`


3. 使用Bert进行文本匹配
----------------------------------
文本匹配任务是指给定两句话判断他们的关系。比如,给定两句话判断前一句是否和后一句具有因果关系或是否是矛盾关系;或者给定两句话判断两句话是否
具有相同的意思。这里我们使用

.. code-block:: python

data_bundle = CNXNLIBertPipe().process_from_file(paths)
data_bundle.rename_field('chars', 'words')
print(data_bundle)

# 载入BertEmbedding
from fastNLP.embeddings import BertEmbedding

embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='cn-wwm', include_cls_sep=True)

# 载入模型
from fastNLP.models import BertForSentenceMatching

model = BertForSentenceMatching(embed, len(data_bundle.get_vocab('target')))

# 训练模型
from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam
from fastNLP.core.optimizer import AdamW
from fastNLP.core.callback import WarmupCallback

callbacks = [WarmupCallback(warmup=0.1, schedule='linear'), ]

trainer = Trainer(data_bundle.get_dataset('train'), model,
optimizer=AdamW(params=model.parameters(), lr=4e-5),
loss=CrossEntropyLoss(), device=0,
batch_size=8, dev_data=data_bundle.get_dataset('dev'),
metrics=AccuracyMetric(), n_epochs=5, print_every=1,
update_every=8, callbacks=callbacks)
trainer.train()

from fastNLP import Tester
tester = Tester(data_bundle.get_dataset('test'), model, batch_size=8, metrics=AccuracyMetric())
tester.test()

运行结果::

In Epoch:3/Step:73632, got best dev performance:
AccuracyMetric: acc=0.781928
Reloaded the best model.
Evaluate data in 18.54 seconds!
[tester]
AccuracyMetric: acc=0.783633


4. 使用Bert进行中文问答
----------------------------------
问答任务是给定一段内容,以及一个问题,需要从这段内容中找到答案。
例如::

"context": "锣鼓经是大陆传统器乐及戏曲里面常用的打击乐记谱方法,以中文字的声音模拟敲击乐的声音,纪录打击乐的各种不同的演奏方法。常
用的节奏型称为「锣鼓点」。而锣鼓是戏曲节奏的支柱,除了加强演员身段动作的节奏感,也作为音乐的引子和尾声,提示音乐的板式和速度,以及
作为唱腔和念白的伴奏,令诗句的韵律更加抑扬顿锉,段落分明。锣鼓的运用有约定俗成的程式,依照角色行当的身份、性格、情绪以及环境,配合
相应的锣鼓点。锣鼓亦可以模仿大自然的音响效果,如雷电、波浪等等。戏曲锣鼓所运用的敲击乐器主要分为鼓、锣、钹和板四类型:鼓类包括有单
皮鼓(板鼓)、大鼓、大堂鼓(唐鼓)、小堂鼓、怀鼓、花盆鼓等;锣类有大锣、小锣(手锣)、钲锣、筛锣、马锣、镗锣、云锣;钹类有铙钹、大
钹、小钹、水钹、齐钹、镲钹、铰子、碰钟等;打拍子用的檀板、木鱼、梆子等。因为京剧的锣鼓通常由四位乐师负责,又称为四大件,领奏的师
傅称为:「鼓佬」,其职责有如西方乐队的指挥,负责控制速度以及利用各种手势提示乐师演奏不同的锣鼓点。粤剧吸收了部份京剧的锣鼓,但以木鱼
和沙的代替了京剧的板和鼓,作为打拍子的主要乐器。以下是京剧、昆剧和粤剧锣鼓中乐器对应的口诀用字:",
"question": "锣鼓经是什么?",
"answers": [
{
"text": "大陆传统器乐及戏曲里面常用的打击乐记谱方法",
"answer_start": 4
},
{
"text": "大陆传统器乐及戏曲里面常用的打击乐记谱方法",
"answer_start": 4
},
{
"text": "大陆传统器乐及戏曲里面常用的打击乐记谱方法",
"answer_start": 4
}
]

您可以通过以下的代码训练 `CMRC2018 <https://github.com/ymcui/cmrc2018>`_

.. code-block:: python

from fastNLP.embeddings import BertEmbedding
from fastNLP.models import BertForQuestionAnswering
from fastNLP.core.losses import CMRC2018Loss
from fastNLP.core.metrics import CMRC2018Metric
from fastNLP.io.pipe.qa import CMRC2018BertPipe
from fastNLP import Trainer, BucketSampler
from fastNLP import WarmupCallback, GradientClipCallback
from fastNLP.core.optimizer import AdamW


data_bundle = CMRC2018BertPipe().process_from_file()
data_bundle.rename_field('chars', 'words')

print(data_bundle)

embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='cn', requires_grad=True, include_cls_sep=False, auto_truncate=True,
dropout=0.5, word_dropout=0.01)
model = BertForQuestionAnswering(embed)
loss = CMRC2018Loss()
metric = CMRC2018Metric()

wm_callback = WarmupCallback(schedule='linear')
gc_callback = GradientClipCallback(clip_value=1, clip_type='norm')
callbacks = [wm_callback, gc_callback]

optimizer = AdamW(model.parameters(), lr=5e-5)

trainer = Trainer(data_bundle.get_dataset('train'), model, loss=loss, optimizer=optimizer,
sampler=BucketSampler(seq_len_field_name='context_len'),
dev_data=data_bundle.get_dataset('dev'), metrics=metric,
callbacks=callbacks, device=0, batch_size=6, num_workers=2, n_epochs=2, print_every=1,
test_use_tqdm=False, update_every=10)
trainer.train(load_best_model=False)

训练结果(和论文中报道的基本一致)::

In Epoch:2/Step:1692, got best dev performance:
CMRC2018Metric: f1=85.61, em=66.08



docs/source/tutorials/tutorial_10_fitlog.rst → docs/source/tutorials/extend_2_fitlog.rst View File


+ 132
- 0
docs/source/tutorials/tutorial_10_callback.rst View File

@@ -0,0 +1,132 @@
===================================================
使用 Callback 自定义你的训练过程
===================================================

- `什么是Callback`_
- `使用 Callback`_
- `fastNLP 中的 Callback`_
- `自定义 Callback`_


什么是Callback
---------------------

:class:`~fastNLP.core.callback.Callback` 是与 :class:`~fastNLP.core.trainer.Trainer` 紧密结合的模块,利用 Callback 可以在 :class:`~fastNLP.core.trainer.Trainer` 训练时,加入自定义的操作,比如梯度裁剪,学习率调节,测试模型的性能等。定义的 Callback 会在训练的特定阶段被调用。

fastNLP 中提供了很多常用的 :class:`~fastNLP.core.callback.Callback` ,开箱即用。


使用 Callback
---------------------

使用 Callback 很简单,将需要的 callback 按 list 存储,以对应参数 ``callbacks`` 传入对应的 Trainer。Trainer 在训练时就会自动执行这些 Callback 指定的操作了。


.. code-block:: python

from fastNLP import (Callback, EarlyStopCallback,
Trainer, CrossEntropyLoss, AccuracyMetric)
from fastNLP.models import CNNText
import torch.cuda

# prepare data
def get_data():
from fastNLP.io import ChnSentiCorpPipe as pipe
data = pipe().process_from_file()
print(data)
data.rename_field('chars', 'words')
train_data = data.get_dataset('train')
dev_data = data.get_dataset('dev')
test_data = data.get_dataset('test')
vocab = data.get_vocab('words')
tgt_vocab = data.get_vocab('target')
return train_data, dev_data, test_data, vocab, tgt_vocab

# prepare model
train_data, dev_data, _, vocab, tgt_vocab = get_data()
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model = CNNText((len(vocab),50), num_classes=len(tgt_vocab))

# define callback
callbacks=[EarlyStopCallback(5)]

# pass callbacks to Trainer
def train_with_callback(cb_list):
trainer = Trainer(
device=device,
n_epochs=3,
model=model,
train_data=train_data,
dev_data=dev_data,
loss=CrossEntropyLoss(),
metrics=AccuracyMetric(),
callbacks=cb_list,
check_code_level=-1
)
trainer.train()

train_with_callback(callbacks)



fastNLP 中的 Callback
---------------------

fastNLP 中提供了很多常用的 Callback,如梯度裁剪,训练时早停和测试验证集,fitlog 等等。具体 Callback 请参考 :mod:`fastNLP.core.callback`

.. code-block:: python

from fastNLP import EarlyStopCallback, GradientClipCallback, EvaluateCallback
callbacks = [
EarlyStopCallback(5),
GradientClipCallback(clip_value=5, clip_type='value'),
EvaluateCallback(dev_data)
]

train_with_callback(callbacks)

自定义 Callback
---------------------

这里我们以一个简单的 Callback作为例子,它的作用是打印每一个 Epoch 平均训练 loss。

1. 创建 Callback
要自定义 Callback,我们要实现一个类,继承 :class:`~fastNLP.core.callback.Callback` 。这里我们定义 ``MyCallBack`` ,继承 fastNLP.Callback 。

2. 指定 Callback 调用的阶段
Callback 中所有以 `on_` 开头的类方法会在 Trainer 的训练中在特定阶段调用。 如 on_train_begin() 会在训练开始时被调用,on_epoch_end()
会在每个 epoch 结束时调用。 具体有哪些类方法,参见 :class:`~fastNLP.core.callback.Callback` 文档。这里, MyCallBack 在求得loss时调用 on_backward_begin() 记录
当前 loss,在每一个 epoch 结束时调用 on_epoch_end() ,求当前 epoch 平均loss并输出。

3. 使用 Callback 的属性访问 Trainer 的内部信息
为了方便使用,可以使用 :class:`~fastNLP.core.callback.Callback` 的属性,访问 :class:`~fastNLP.core.trainer.Trainer` 中的对应信息,如 optimizer, epoch, n_epochs,分别对应训练时的优化器,
当前 epoch 数,和总 epoch 数。 具体可访问的属性,参见 :class:`~fastNLP.core.callback.Callback` 。这里, MyCallBack 为了求平均 loss ,需要知道当前 epoch 的总步
数,可以通过 self.step 属性得到当前训练了多少步。

.. code-block:: python

from fastNLP import Callback
from fastNLP import logger

class MyCallBack(Callback):
"""Print average loss in each epoch"""
def __init__(self):
super().__init__()
self.total_loss = 0
self.start_step = 0

def on_backward_begin(self, loss):
self.total_loss += loss.item()

def on_epoch_end(self):
n_steps = self.step - self.start_step
avg_loss = self.total_loss / n_steps
logger.info('Avg loss at epoch %d, %.6f', self.epoch, avg_loss)
self.start_step = self.step

callbacks = [MyCallBack()]
train_with_callback(callbacks)


+ 47
- 47
docs/source/tutorials/tutorial_1_data_preprocess.rst View File

@@ -1,24 +1,22 @@
==============================
使用DataSet预处理文本
fastNLP中的DataSet
==============================

:class:`~fastNLP.DataSet` 是fastNLP中用于承载数据的容器。可以将DataSet看做是一个表格,
每一行是一个sample (在fastNLP中被称为 :mod:`~fastNLP.core.instance` ),
每一列是一个feature (在fastNLP中称为 :mod:`~fastNLP.core.field` )
:class:`~fastNLP.DataSet` 是fastNLP用于承载数据的类,一般训练集、验证集和测试集会被加载为三个单独的 :class:`~fastNLP.DataSet` 对象。
:class:`~fastNLP.DataSet` 中的数据组织形式类似一个表格,比如下面 :class:`~fastNLP.DataSet` 一共有3列,列在fastNLP中被称为field

.. csv-table::
:header: "sentence", "words", "seq_len"
:header: "raw_chars", "chars", "seq_len"

"This is the first instance .", "[This, is, the, first, instance, .]", 6
"Second instance .", "[Second, instance, .]", 3
"历任公司副总经理、总工程师,", "[历 任 公 司 副 总 经 理 、 总 工 程 师 ,]", 6
"Third instance .", "[Third, instance, .]", 3
"...", "[...]", "..."

上面是一个样例数据中 DataSet 的存储结构。其中它的每一行是一个 :class:`~fastNLP.Instance` 对象; 每一列是一个 :class:`~fastNLP.FieldArray` 对象。
每一行是一个instance (在fastNLP中被称为 :mod:`~fastNLP.core.Instance` ),
每一列是一个field (在fastNLP中称为 :mod:`~fastNLP.core.FieldArray` )。

-----------------------------
数据集构建和删除
DataSet构建和删除
-----------------------------

我们使用传入字典的方式构建一个数据集,这是 :class:`~fastNLP.DataSet` 初始化的最基础的方式
@@ -26,11 +24,23 @@
.. code-block:: python

from fastNLP import DataSet
data = {'sentence':["This is the first instance .", "Second instance .", "Third instance ."],
data = {'raw_words':["This is the first instance .", "Second instance .", "Third instance ."],
'words': [['this', 'is', 'the', 'first', 'instance', '.'], ['Second', 'instance', '.'], ['Third', 'instance', '.']],
'seq_len': [6, 3, 3]}
dataset = DataSet(data)
# 传入的dict的每个key的value应该为具有相同长度的list
print(dataset)

输出为::

+------------------------------+------------------------------------------------+---------+
| raw_words | words | seq_len |
+------------------------------+------------------------------------------------+---------+
| This is the first instance . | ['this', 'is', 'the', 'first', 'instance', ... | 6 |
| Second instance . | ['Second', 'instance', '.'] | 3 |
| Third instance . | ['Third', 'instance', '.'] | 3 |
+------------------------------+------------------------------------------------+---------+


We can also add data to a dataset with the :func:`~fastNLP.DataSet.append` method

@@ -39,7 +49,7 @@
from fastNLP import DataSet
from fastNLP import Instance
dataset = DataSet()
instance = Instance(sentence="This is the first instance",
instance = Instance(raw_words="This is the first instance",
                    words=['this', 'is', 'the', 'first', 'instance', '.'],
                    seq_len=6)
dataset.append(instance)
@@ -52,10 +62,10 @@
from fastNLP import DataSet
from fastNLP import Instance
dataset = DataSet([
    Instance(sentence="This is the first instance",
    Instance(raw_words="This is the first instance",
             words=['this', 'is', 'the', 'first', 'instance', '.'],
             seq_len=6),
    Instance(sentence="Second instance .",
    Instance(raw_words="Second instance .",
             words=['Second', 'instance', '.'],
             seq_len=3)
])
@@ -82,7 +92,7 @@ FastNLP also provides several methods for deleting data, e.g. :func:`~fastNLP.DataSet.drop`
# delete the field named 'a'
dataset.delete_field('a')

-----------------------------
Simple data preprocessing
-----------------------------

@@ -106,51 +116,41 @@ FastNLP also provides several methods for deleting data, e.g. :func:`~fastNLP.DataSet.drop`
.. code-block:: python

from fastNLP import DataSet
data = {'sentence':["This is the first instance .", "Second instance .", "Third instance ."]}
data = {'raw_words':["This is the first instance .", "Second instance .", "Third instance ."]}
dataset = DataSet(data)

# split the sentences into words; see the DataSet.apply() method
dataset.apply(lambda ins: ins['sentence'].split(), new_field_name='words')
dataset.apply(lambda ins: ins['raw_words'].split(), new_field_name='words')

# or use DataSet.apply_field()
dataset.apply_field(lambda sent:sent.split(), field_name='sentence', new_field_name='words')
dataset.apply_field(lambda sent:sent.split(), field_name='raw_words', new_field_name='words')

# besides lambdas, a named function can be passed in as well
def get_words(instance):
    sentence = instance['sentence']
    sentence = instance['raw_words']
    words = sentence.split()
    return words
dataset.apply(get_words, new_field_name='words')

Besides processing datasets by hand, you can also use the various :class:`~fastNLP.io.base_loader.DataSetLoader` classes provided by fastNLP.
See the tutorial :doc:`Loading datasets with DataSetLoader </tutorials/tutorial_2_load_dataset>` for details.
Besides processing datasets by hand, you can also use the various :class:`~fastNLP.io.Loader` and :class:`~fastNLP.io.Pipe` classes provided by fastNLP.
See the tutorial :doc:`Processing data with Loader and Pipe </tutorials/tutorial_4_load_dataset>` for details.

-----------------------------------
DataSet and padding
Field naming conventions in fastNLP
-----------------------------------

In fastNLP, padding is bound to a :mod:`~fastNLP.core.field` , i.e. different :mod:`~fastNLP.core.field` s can be padded differently; in English tasks, for example, words and
characters usually need different padding. fastNLP implements this through subclasses of :class:`~fastNLP.Padder` .
By default every field uses an :class:`~fastNLP.AutoPadder`
. A Padder can be set in the following ways (setting the padder to None disables padding for that field).
In most cases an :class:`~fastNLP.AutoPadder` is all you need.
If neither :class:`~fastNLP.AutoPadder` nor :class:`~fastNLP.EngChar2DPadder` meets your needs,
you can also write your own :class:`~fastNLP.Padder` .
In English tasks, the field names commonly used in fastNLP are:

.. code-block:: python
- **raw_words**: the original str, e.g. "This is a demo sentence .". There may be several raw_words fields, e.g. in matching tasks, in which case they are named raw_words0, raw_words1. In conll format, the raw_words column may also take the form ["This", "is", "a", "demo", "sentence", "."].
- **words**: the tokenized words, e.g. ["This", "is", "a", "demo", "sentence"]. Since strs cannot be fed to a neural network directly, the content of words is usually converted to ints, e.g. [3, 10, 4, 2, 7, ...]. Multiple words columns are named words0, words1, ...
- **target**: the target value; a single value in classification, a sequence in sequence labeling.
- **seq_len**: usually the length of the words column.

from fastNLP import DataSet
from fastNLP import EngChar2DPadder
import random
dataset = DataSet()
max_chars, max_words, sent_num = 5, 10, 20
contents = [[
    [random.randint(1, 27) for _ in range(random.randint(1, max_chars))]
    for _ in range(random.randint(1, max_words))
] for _ in range(sent_num)]
# pass the padder at creation time
dataset.add_field('chars', contents, padder=EngChar2DPadder())
# or set it directly afterwards
dataset.set_padder('chars', EngChar2DPadder())
# the pad value can be set as well
dataset.set_pad_val('chars', -1)
In Chinese tasks, the field names commonly used in fastNLP are:

- **raw_words**: if the original character sequence already contains word boundaries, the column is called raw_words, e.g. "上海 浦东 开发 与 法制 建设 同步".
- **words**: the sequence of Chinese words, e.g. ["上海", "浦东", "开发", "与", "法制", "建设", ...] or [2, 3, 4, ...].
- **raw_chars**: the original, unsegmented character sequence, e.g. "这是一个示例。".
- **chars**: the sequence split into individual characters, e.g. ["这", "是", "一", "个", "示", "例", "。"]. Since a neural network cannot consume characters directly, this column is usually converted to ints, e.g. [3, 4, 5, 6, ...].
- **target**: the target value; a single value in classification, a sequence in sequence labeling.
- **seq_len**: the length of the input sequence.
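
As a quick illustration of these conventions, a Chinese classification DataSet built by hand might look like the following (a hedged sketch; the sentences and labels are invented):

.. code-block:: python

from fastNLP import DataSet

# field names follow the conventions above; the data is made up for illustration
data = {'raw_chars': ["这是一个示例。", "今天天气很好。"],
        'chars': [['这', '是', '一', '个', '示', '例', '。'],
                  ['今', '天', '天', '气', '很', '好', '。']],
        'target': ['neutral', 'positive'],
        'seq_len': [7, 7]}
dataset = DataSet(data)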

+ 0
- 224
docs/source/tutorials/tutorial_2_load_dataset.rst View File

@@ -1,224 +0,0 @@
===================================
Loading datasets with DataSetLoader
===================================

This section is a tutorial on how to load datasets.

Contents:

- `Part I: The dataset container`_
- `Part II: How datasets are used`_
- `Part III: DataSetLoaders for different data formats`_
- `Part IV: DataSetLoader examples`_
- `Part V: Dataset loaders bundled with fastNLP`_


-----------------------------
Part I: The dataset container
-----------------------------

In fastNLP, dataset information is stored in a :class:`~fastNLP.io.base_loader.DataBundle` .
The :class:`~fastNLP.io.base_loader.DataBundle` class holds two important members: `datasets` and `vocabs` .

`datasets` is a dict whose `key` is a dataset name (such as `train` , `dev` or `test` ) and whose `value` is a :class:`~fastNLP.DataSet` .

`vocabs` is a dict whose `key` is a vocabulary name (e.g. :attr:`fastNLP.Const.INPUT` , the vocabulary of the input text, or :attr:`fastNLP.Const.TARGET` , the vocabulary of the
gold target labels) and whose `value` is the vocabulary itself ( :class:`~fastNLP.Vocabulary` ).
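
Since the two members are plain dicts, accessing them is straightforward; a minimal sketch (assuming ``data`` is a DataBundle populated as described above):

.. code-block:: python

# `data` is assumed to be a DataBundle as described above
train_set = data.datasets['train']   # a fastNLP DataSet
word_vocab = data.vocabs['words']    # a fastNLP Vocabulary
print(len(train_set), len(word_vocab))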

------------------------------
Part II: How datasets are used
------------------------------

In fastNLP, :class:`~fastNLP.io.base_loader.DataSetLoader` is the base class for loading datasets.
:class:`~fastNLP.io.base_loader.DataSetLoader` defines the API every DataSetLoader needs; developers should subclass it to implement loaders for specific datasets.
A dataset's DataSetLoader should implement at least the following:

- a _load function: read one data file into a :class:`~fastNLP.DataSet`
- a load function (the base-class implementation may be reused): read one or more data files into one or more :class:`~fastNLP.DataSet` s
- a process function: read data from one or more files and turn it into a :class:`~fastNLP.io.DataBundle` that is ready for training

**\*the process function may call load or _load internally**

The :class:`~fastNLP.DataSet` returned by a DataSetLoader's _load or load function contains the raw text of the dataset, whereas in the
:class:`~fastNLP.io.DataBundle` returned by process, the `datasets` are already indexed and can be fed directly to a
:class:`~fastNLP.Trainer` .

--------------------------------------------------------
Part III: DataSetLoaders for different data formats
--------------------------------------------------------

:class:`~fastNLP.io.dataset_loader.CSVLoader`
reads CSV dataset files. For example:

.. code-block:: python

data_set_loader = CSVLoader(
    headers=('words', 'target'), sep='\t'
)
# each line of the CSV file fills the 'words' field with its first item
# and the 'target' field with its second; the items are separated by '\t'

data_set = data_set_loader._load('path/to/your/file')

A sample of the dataset looks like ::

But it does not leave you with much . 1
You could hate it for the same reason . 1
The performances are an absolute joy . 4


:class:`~fastNLP.io.dataset_loader.JsonLoader`
reads JSON dataset files; the data must be stored one JSON object per line, each object carrying the various attributes. For example:

.. code-block:: python

data_set_loader = JsonLoader(
    fields={'sentence1': 'words1', 'sentence2': 'words2', 'gold_label': 'target'}
)
# the values of 'sentence1', 'sentence2' and 'gold_label' in each Json object
# are assigned to the 'words1', 'words2' and 'target' fields respectively

data_set = data_set_loader._load('path/to/your/file')

A sample of the dataset looks like ::

{"annotator_labels": ["neutral"], "captionID": "3416050480.jpg#4", "gold_label": "neutral", "pairID": "3416050480.jpg#4r1n", "sentence1": "A person on a horse jumps over a broken down airplane.", "sentence1_binary_parse": "( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))", "sentence2": "A person is training his horse for a competition.", "sentence2_binary_parse": "( ( A person ) ( ( is ( ( training ( his horse ) ) ( for ( a competition ) ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (VP (VBG training) (NP (PRP$ his) (NN horse)) (PP (IN for) (NP (DT a) (NN competition))))) (. .)))"}
{"annotator_labels": ["contradiction"], "captionID": "3416050480.jpg#4", "gold_label": "contradiction", "pairID": "3416050480.jpg#4r1c", "sentence1": "A person on a horse jumps over a broken down airplane.", "sentence1_binary_parse": "( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))", "sentence2": "A person is at a diner, ordering an omelette.", "sentence2_binary_parse": "( ( A person ) ( ( ( ( is ( at ( a diner ) ) ) , ) ( ordering ( an omelette ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (PP (IN at) (NP (DT a) (NN diner))) (, ,) (S (VP (VBG ordering) (NP (DT an) (NN omelette))))) (. .)))"}
{"annotator_labels": ["entailment"], "captionID": "3416050480.jpg#4", "gold_label": "entailment", "pairID": "3416050480.jpg#4r1e", "sentence1": "A person on a horse jumps over a broken down airplane.", "sentence1_binary_parse": "( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))", "sentence2": "A person is outdoors, on a horse.", "sentence2_binary_parse": "( ( A person ) ( ( ( ( is outdoors ) , ) ( on ( a horse ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (ADVP (RB outdoors)) (, ,) (PP (IN on) (NP (DT a) (NN horse)))) (. .)))"}

------------------------------------------
Part IV: DataSetLoader examples
------------------------------------------

Take the matching task as an example:

:class:`~fastNLP.io.data_loader.MatchingLoader`
fastNLP ships a data-loading class for matching datasets: :class:`~fastNLP.io.data_loader.MatchingLoader` .

MatchingLoader wraps a function that further preprocesses the text in the dataset:
:meth:`~fastNLP.io.data_loader.MatchingLoader.process`
This function offers various preprocessing options, such as:
- whether to lowercase the text
- whether sequence-length information is needed, and of what kind
- whether to run BertTokenizer to obtain WordPiece information for the sequences
- and so on

See :meth:`fastNLP.io.MatchingLoader.process` for details.

:class:`~fastNLP.io.data_loader.SNLILoader`
A DataSetLoader for the SNLI dataset, which comes from the
`SNLI Data Set <https://nlp.stanford.edu/projects/snli/snli_1.0.zip>`_ .

With :class:`~fastNLP.io.data_loader.SNLILoader` , its :meth:`~fastNLP.io.data_loader.SNLILoader.process`
function reads and processes the dataset from text files with the following code:

.. code-block:: python

data = SNLILoader().process(
    paths='path/to/snli/data', to_lower=False, seq_len_type='seq_len',
    get_index=True, concat=False,
)
print(data)

The output is::

In total 3 datasets:
train has 549367 instances.
dev has 9842 instances.
test has 9824 instances.
In total 2 vocabs:
words has 43154 entries.
target has 3 entries.


Here data is a :class:`~fastNLP.io.base_loader.DataBundle` ; the contents of its ``datasets`` dict can be passed directly to a
:class:`~fastNLP.Trainer` or :class:`~fastNLP.Tester` for training or testing.

:class:`~fastNLP.io.data_loader.IMDBLoader`
Taking the IMDB dataset as an example, the :meth:`~fastNLP.io.data_loader.IMDBLoader.process` function of
:class:`~fastNLP.io.data_loader.IMDBLoader` reads the dataset from text files into memory with the following code:

.. code-block:: python

data = IMDBLoader().process(
    paths={'train': 'path/to/train/file', 'test': 'path/to/test/file'}
)
print(data)

The output is::

In total 3 datasets:
train has 22500 instances.
test has 25000 instances.
dev has 2500 instances.
In total 2 vocabs:
words has 82846 entries.
target has 2 entries.


Here the original train split has been divided into training and validation sets at a 9:1 ratio.


--------------------------------------------
Part V: Dataset loaders bundled with fastNLP
--------------------------------------------

The dataset loaders bundled with fastNLP cover several kinds of tasks:

- `Text classification tasks`_
- `Sequence labeling tasks`_
- `Matching tasks`_


Text classification tasks
-------------------------

========================== ==================================================================
Dataset name               Dataset loader
-------------------------- ------------------------------------------------------------------
IMDb :class:`~fastNLP.io.data_loader.IMDBLoader`
-------------------------- ------------------------------------------------------------------
SST :class:`~fastNLP.io.data_loader.SSTLoader`
-------------------------- ------------------------------------------------------------------
SST-2 :class:`~fastNLP.io.data_loader.SST2Loader`
-------------------------- ------------------------------------------------------------------
Yelp Polarity :class:`~fastNLP.io.data_loader.YelpLoader`
-------------------------- ------------------------------------------------------------------
Yelp Full :class:`~fastNLP.io.data_loader.YelpLoader`
-------------------------- ------------------------------------------------------------------
MTL16 :class:`~fastNLP.io.data_loader.MTL16Loader`
========================== ==================================================================



Sequence labeling tasks
-----------------------

========================== ==================================================================
Dataset name               Dataset loader
-------------------------- ------------------------------------------------------------------
Conll :class:`~fastNLP.io.data_loader.ConllLoader`
-------------------------- ------------------------------------------------------------------
Conll2003 :class:`~fastNLP.io.data_loader.Conll2003Loader`
-------------------------- ------------------------------------------------------------------
People's Daily corpus      :class:`~fastNLP.io.data_loader.PeopleDailyCorpusLoader`
========================== ==================================================================



Matching tasks
-------------------

========================== ==================================================================
Dataset name               Dataset loader
-------------------------- ------------------------------------------------------------------
SNLI :class:`~fastNLP.io.data_loader.SNLILoader`
-------------------------- ------------------------------------------------------------------
MultiNLI :class:`~fastNLP.io.data_loader.MNLILoader`
-------------------------- ------------------------------------------------------------------
QNLI :class:`~fastNLP.io.data_loader.QNLILoader`
-------------------------- ------------------------------------------------------------------
RTE :class:`~fastNLP.io.data_loader.RTELoader`
-------------------------- ------------------------------------------------------------------
Quora Pair Dataset :class:`~fastNLP.io.data_loader.QuoraLoader`
========================== ==================================================================


+ 129
- 0
docs/source/tutorials/tutorial_2_vocabulary.rst View File

@@ -0,0 +1,129 @@
==============================
Vocabulary in fastNLP
==============================

:class:`~fastNLP.Vocabulary` maps characters or words to indices and is used to convert text into indices.


Building a Vocabulary
-----------------------------

.. code-block:: python

from fastNLP import Vocabulary

vocab = Vocabulary()
vocab.add_word_lst(['复', '旦', '大', '学'])  # add new characters
vocab.add_word('上海')  # `上海` is added as a single token
vocab.to_index('复')  # should be 3
vocab.to_index('我')  # outputs 1; by default pad has index 0 and unk (unknown words) has index 1

# a target Vocabulary usually needs neither pad nor unk, which can be disabled at initialization
vocab = Vocabulary(unknown=None, padding=None)
vocab.add_word_lst(['positive', 'negative'])
vocab.to_index('positive')  # outputs 0
vocab.to_index('neutral')  # raises an error, since there is no unk to fall back on

Besides the approach above, a Vocabulary can also be built directly from a column of a :class:`~fastNLP.DataSet` (and that column converted to indices) with the functions below

.. code-block:: python

from fastNLP import Vocabulary
from fastNLP import DataSet

dataset = DataSet({
    'chars': [
        ['今', '天', '天', '气', '很', '好', '。'],
        ['被', '这', '部', '电', '影', '浪', '费', '了', '两', '个', '小', '时', '。']
    ],
    'target': ['neutral', 'negative']
})

vocab = Vocabulary()
vocab.from_dataset(dataset, field_name='chars')
vocab.index_dataset(dataset, field_name='chars')

target_vocab = Vocabulary(padding=None, unknown=None)
target_vocab.from_dataset(dataset, field_name='target')
target_vocab.index_dataset(dataset, field_name='target')
print(dataset)

The output is::

+---------------------------------------------------+--------+
| chars | target |
+---------------------------------------------------+--------+
| [4, 2, 2, 5, 6, 7, 3] | 0 |
| [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 3] | 1 |
+---------------------------------------------------+--------+


Usage tips
-----------------------------

When building a vocabulary from a DataSet with from_dataset(), pass the test and validation sets via the no_create_entry_dataset parameter, as shown below

.. code-block:: python

from fastNLP import Vocabulary
from fastNLP import DataSet

tr_data = DataSet({
    'chars': [
        ['今', '天', '心', '情', '很', '好', '。'],
        ['被', '这', '部', '电', '影', '浪', '费', '了', '两', '个', '小', '时', '。']
    ],
    'target': ['positive', 'negative']
})
dev_data = DataSet({
    'chars': [
        ['住', '宿', '条', '件', '还', '不', '错'],
        ['糟', '糕', '的', '天', '气', ',', '无', '法', '出', '行', '。']
    ],
    'target': ['positive', 'negative']
})

vocab = Vocabulary()
# pass the dev/test sets via no_create_entry_dataset when building the vocabulary
vocab.from_dataset(tr_data, field_name='chars', no_create_entry_dataset=[dev_data])


About the `no_create_entry` flag of :class:`~fastNLP.Vocabulary` : it is recommended to set it to True when adding words that come from the test or validation set, or to pass
those sets via the `no_create_entry_dataset` parameter. Its purpose is this: when the model uses a pretrained embedding (glove, word2vec, elmo or bert) that will be finetuned,
building the vocabulary from the training data alone means that words appearing only in test or dev cannot fully exploit the pretrained embedding (they would be treated as unk),
so taking test and dev into account when building the vocabulary gives better final results. Combined with the various Embeddings in fastNLP, the behavior is as follows: if a
word appears in train but not in the pretrained model, it gets a randomly initialized vector of its own, and if the embedding is finetuned this word may end up with a better
representation after updates; if a word appears only in dev or test, it should not get its own vector but should instead point at the unk vector (so that when the unk vector is
updated, the word uses the updated vector too). Hence, for a token marked no_create_entry, its representation is first looked up in the pretrained vocabulary: if found, that
representation is used; otherwise the token's representation is taken to be that of unk.

Below we illustrate the effect of this flag with a partial :class:`~fastNLP.embeddings.StaticEmbedding` example; if you are not yet familiar with
:class:`~fastNLP.embeddings.StaticEmbedding` , you may first read :doc:`Converting text to vectors with the Embedding module </tutorials/tutorial_3_embedding>` and come back to this part

.. code-block:: python

import torch
from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary

vocab = Vocabulary()
vocab.add_word('train')
vocab.add_word('only_in_train')  # appears only in train and is certainly absent from the pretrained vocabulary
vocab.add_word('test', no_create_entry=True)  # this word appears only in dev or test
vocab.add_word('only_in_test', no_create_entry=True)  # this word is absent from the pretrained vocabulary

embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')
print(embed(torch.LongTensor([vocab.to_index('train')])))
print(embed(torch.LongTensor([vocab.to_index('only_in_train')])))
print(embed(torch.LongTensor([vocab.to_index('test')])))
print(embed(torch.LongTensor([vocab.to_index('only_in_test')])))
print(embed(torch.LongTensor([vocab.unknown_idx])))

The output (each vector truncated)::

tensor([[ 0.9497,  0.3433,  0.8450, -0.8852, ...]], grad_fn=<EmbeddingBackward>)  # train: found in en-glove-6b-50d
tensor([[ 0.0540, -0.0557, -0.0514, -0.1688, ...]], grad_fn=<EmbeddingBackward>)  # only_in_train: not in en-glove-6b-50d, randomly initialized
tensor([[ 0.1318, -0.2552, -0.0679,  0.2619, ...]], grad_fn=<EmbeddingBackward>)  # test: found in en-glove-6b-50d
tensor([[0., 0., 0., 0., 0., ...]], grad_fn=<EmbeddingBackward>)  # only_in_test: not in en-glove-6b-50d, uses the unk vector
tensor([[0., 0., 0., 0., 0., ...]], grad_fn=<EmbeddingBackward>)  # unk: zero-initialized

Both train and test are found in the pretrained vectors, so each gets its own representation; only_in_train is not found, so StaticEmbedding
creates a new entry for it and it also gets a separate vector; only_in_test is not found either, so it is pointed at the unk value (fastNLP initializes unk with a zero vector),
identical to the unk representation in the last line.

+ 370
- 131
docs/source/tutorials/tutorial_3_embedding.rst View File

@@ -7,208 +7,447 @@
Contents:

- `Part I: Introduction to embeddings`_
- `Part II: Using randomly initialized embeddings`_
- `Part III: Using pretrained static embeddings`_
- `Part IV: Using pretrained contextual embeddings (ELMo & BERT)`_
- `Part V: Using character-level embeddings`_
- `Part VI: Stacking multiple embeddings`_
- `Part II: Using pretrained static embeddings`_
- `Part III: Using randomly initialized embeddings`_
- `Part IV: ELMo Embedding`_
- `Part V: Bert Embedding`_
- `Part VI: Using character-level embeddings`_
- `Part VII: Stacking multiple embeddings`_
- `Part VIII: Other notes on Embedding`_
- `Part IX: Tips for using StaticEmbedding`_




---------------------------------------
Part I: Introduction to embeddings
---------------------------------------

Like torch.nn.Embedding, a fastNLP embedding takes an indexed sequence as input and outputs the embedding of that sequence.
An embedding maps characters or words to real-valued vectors. Widely used pretrained embeddings include word2vec, fasttext, glove, character embeddings,
elmo and bert.
Using them, however, involves some loading work: pretrained word2vec, fasttext and glove carry representations for hundreds of thousands of words, while a typical task
only uses a few tens of thousands of them, so loading everything wastes memory and slows down training, and the words used in the experiment should be extracted from the
pretrained file; English elmo and character embeddings need words split into characters first; and using BERT further involves Byte Pair Encoding (BPE). To make all of
this easier, fastNLP unifies the use of the different embeddings through :class:`~fastNLP.Vocabulary` . The examples below illustrate how.

fastNLP's embeddings include both pretrained embeddings and randomly initialized embeddings.


Part II: Using pretrained static embeddings
-------------------------------------------
Part II: Using randomly initialized embeddings
----------------------------------------------

For randomly initialized embeddings, see :class:`~fastNLP.embeddings.embedding.Embedding` .
In fastNLP, pretrained word2vec, glove and fasttext are all loaded through :class:`~fastNLP.embeddings.StaticEmbedding` . For convenience, fastNLP can automatically download
and cache (by default under the ~/.fastNLP/embeddings folder) many static word vectors; the pretrained vectors available for automatic download are listed in the
`download document <https://docs.qq.com/sheet/DVnpkTnF6VW9UeXdh?c=A1A0A0>`_ .

.. code-block:: python

import torch
from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary

vocab = Vocabulary()
vocab.add_word_lst("this is a demo .".split())

embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')

words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo .".split()]])  # convert the text to indices
print(embed(words).size())  # StaticEmbedding is used much like pytorch's nn.Embedding

The output is::

torch.Size([1, 5, 50])

Once initialized, fastNLP's StaticEmbedding behaves much like pytorch's Embedding. Initializing a :class:`~fastNLP.embeddings.StaticEmbedding`
essentially extracts, from the word vectors given by model_dir_or_name, the vectors of the words in the :class:`~fastNLP.Vocabulary` .

Besides the embeddings provided for download, :class:`~fastNLP.embeddings.StaticEmbedding` can also load local pretrained word vectors in glove, word2vec or
fasttext format: simply set model_dir_or_name to the path of the local embedding file.
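
For instance, a local glove-format file could be loaded as follows (a hedged sketch; the path is a placeholder for your own file):

.. code-block:: python

# 'path/to/glove.6B.50d.txt' stands for a local file in glove text format
embed = StaticEmbedding(vocab, model_dir_or_name='path/to/glove.6B.50d.txt')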


Part III: Using randomly initialized embeddings
-----------------------------------------------

You can pass in the vocabulary size and the embedding dimension:
Sometimes a randomly initialized embedding is needed; this too can be obtained from :class:`~fastNLP.embeddings.StaticEmbedding` : simply set model_dir_or_name
to None and pass embedding_dim, as in the example below

.. code-block:: python

embed = Embedding(10000, 50)
import torch
from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary

An initialized weight matrix can also be passed in:
vocab = Vocabulary()
vocab.add_word_lst("this is a demo .".split())

embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=30)

words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo .".split()]])
print(embed(words).size())

The output is::

torch.Size([1, 5, 30])



Part IV: ELMo Embedding
-----------------------------------------------------------

fastNLP provides both ELMo and BERT embeddings: :class:`~fastNLP.embeddings.ElmoEmbedding`
and :class:`~fastNLP.embeddings.BertEmbedding` . The ElmoEmbedding models available for automatic download are listed in the
`download document <https://docs.qq.com/sheet/DVnpkTnF6VW9UeXdh?c=A1A0A0>`_ .

Similar to static embeddings, ELMo is used as follows:

.. code-block:: python

embed = Embedding(init_embed)
import torch
from fastNLP.embeddings import ElmoEmbedding
from fastNLP import Vocabulary

Here init_embed can be a torch.FloatTensor, a torch.nn.Embedding or a numpy.ndarray.
vocab = Vocabulary()
vocab.add_word_lst("this is a demo .".split())

embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=False)
words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo .".split()]])
print(embed(words).size())

--------------------------------------------
Part III: Using pretrained static embeddings
--------------------------------------------
The output is::

Before using a pretrained embedding, build a vocabulary :class:`~fastNLP.core.vocabulary.Vocabulary` from the contents of the dataset;
this vocabulary is passed in when the pretrained embedding class is initialized.
torch.Size([1, 5, 256])

In fastNLP we provide the class :class:`~fastNLP.embeddings.StaticEmbedding` .
Pretrained static embeddings can be loaded through :class:`~fastNLP.embeddings.StaticEmbedding` ,
for example:
Multiple ELMo layers can also be output; fastNLP concatenates the layers' results on the last dimension. The code below assumes the code above has already been run

.. code-block:: python

embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50', requires_grad=True)
embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=False, layers='1,2')
print(embed(words).size())

vocab is the vocabulary built from the dataset; model_dir_or_name can be a path or the name of an embedding model:
The output is::

1 If a path is passed, fastNLP reads the pretrained weight file at that path and loads the embedding from it (both glove-
and word2vec-style weight files are supported)
torch.Size([1, 5, 512])

2 If a model name is passed, fastNLP looks the embedding model up by name: if the model is found in the cache directory it is loaded
automatically; otherwise it is downloaded automatically. The cache directory can be customized via the environment variable ``FASTNLP_CACHE_DIR`` , e.g.::
Moreover, following `Deep contextualized word representations <https://arxiv.org/abs/1802.05365>`_ , learnable weights across layers can improve ELMo; in fastNLP the initialization below
fuses the outputs of the 3 layers additively with learnable weights.

$ FASTNLP_CACHE_DIR=~/fastnlp_cache_dir python your_python_file.py
.. code-block:: python

This command makes fastNLP look for models under `~/fastnlp_cache_dir` , and download them into that directory when they are not found
embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=True, layers='mix')
print(embed(words).size())  # the three layers' outputs are summed element-wise with the learned weights

The static embedding models currently supported are:
The output is::

========================== ================================
Model name                 Model
-------------------------- --------------------------------
en glove.840B.300d
-------------------------- --------------------------------
en-glove-840d-300 glove.840B.300d
-------------------------- --------------------------------
en-glove-6b-50 glove.6B.50d
-------------------------- --------------------------------
en-word2vec-300            Google word2vec, 300d
-------------------------- --------------------------------
en-fasttext                English fasttext, 300d
-------------------------- --------------------------------
cn                         Tencent Chinese word vectors, 200d
-------------------------- --------------------------------
cn-fasttext                Chinese fasttext, 300d
========================== ================================
torch.Size([1, 5, 256])



--------------------------------------------------------------
Part IV: Using pretrained contextual embeddings (ELMo & BERT)
Part V: Bert Embedding
--------------------------------------------------------------

fastNLP provides both ELMo and BERT embeddings: :class:`~fastNLP.embeddings.ElmoEmbedding`
and :class:`~fastNLP.embeddings.BertEmbedding` .
Although BERT is not strictly an embedding, wrapping it in the form of one greatly reduces the complexity of using it. The Bert Embedding models available for automatic
download are listed in the `download document <https://docs.qq.com/sheet/DVnpkTnF6VW9UeXdh?c=A1A0A0>`_ . The example below walks through the use of
BertEmbedding

Similar to static embeddings, ELMo is used as follows:
.. code-block:: python

import torch
from fastNLP.embeddings import BertEmbedding
from fastNLP import Vocabulary

vocab = Vocabulary()
vocab.add_word_lst("this is a demo .".split())

embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased')
words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo .".split()]])
print(embed(words).size())

The output is::

torch.Size([1, 5, 768])

You can request the output of specific layers, including multiple layers at once; the code below assumes the code above has already been run

.. code-block:: python

embed = ElmoEmbedding(vocab, model_dir_or_name='small', requires_grad=False)

The ElmoEmbedding models currently supported are:

========================== ================================
Model name                 Model
-------------------------- --------------------------------
small                      allennlp ELMo small
-------------------------- --------------------------------
medium                     allennlp ELMo medium
-------------------------- --------------------------------
original                   allennlp ELMo original
-------------------------- --------------------------------
5.5b-original              allennlp ELMo 5.5B original
========================== ================================

BERT embedding is used as follows:

.. code-block:: python

embed = BertEmbedding(
    vocab, model_dir_or_name='en-base-cased', requires_grad=False, layers='4,-2,-1'
)

The layers argument specifies which layers' encoder outputs to take.

The BertEmbedding models currently supported are:

========================== ====================================
Model name                 Model
-------------------------- ------------------------------------
en bert-base-cased
-------------------------- ------------------------------------
en-base-uncased bert-base-uncased
-------------------------- ------------------------------------
en-base-cased bert-base-cased
-------------------------- ------------------------------------
en-large-uncased bert-large-uncased
-------------------------- ------------------------------------
en-large-cased bert-large-cased
-------------------------- ------------------------------------
en-large-cased-wwm bert-large-cased-whole-word-mask
-------------------------- ------------------------------------
en-large-uncased-wwm bert-large-uncased-whole-word-mask
-------------------------- ------------------------------------
en-base-cased-mrpc bert-base-cased-finetuned-mrpc
-------------------------- ------------------------------------
multilingual bert-base-multilingual-cased
-------------------------- ------------------------------------
multilingual-base-uncased bert-base-multilingual-uncased
-------------------------- ------------------------------------
multilingual-base-cased bert-base-multilingual-cased
========================== ====================================
# use the outputs of the last two layers
embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='10,11')
print(embed(words).size())  # the results are concatenated on the last dimension

-----------------------------------------------------
Part V: Using character-level embeddings
The output is::

torch.Size([1, 5, 1536])

BERT also has two special tokens, [CLS] and [SEP]. By default they are inserted automatically and removed again after the computation, so the input and output sequence
lengths match. Some classification setups, however, require the [CLS] representation; in that case, declare at initialization that [CLS] is to be kept, as in the example below

.. code-block:: python

embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', include_cls_sep=True)
print(embed(words).size())  # the sequence dimension grows by 2
# take the sentence's [CLS] representation
cls_reps = embed(words)[:, 0]  # shape: [batch_size, 768]

The output is::

torch.Size([1, 7, 768])

In an English BERT model a single word may be split into several subwords; e.g. "fairness" is split into ``["fair", "##ness"]`` , so one word yields two outputs.
:class:`~fastNLP.embeddings.BertEmbedding` merges a word's subword representations into a single vector by pooling; pool_method selects the method, one of
"first" (use fair's representation as fairness's), "last" (use ##ness's representation as fairness's), "max" (elementwise max over fair and
##ness) and "avg" (elementwise average over fair and ##ness).

.. code-block:: python

embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', pool_method='max')
print(embed(words).size())

The output is::

torch.Size([1, 5, 768])

Moreover, following `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`_ ,
for tasks with two sentences (such as matching and Q&A), the sentences are joined with [SEP]; tokens of the first sentence get token embedding 0 and
tokens of the second get 1. BertEmbedding automatically detects the [SEP] in the middle of a sentence and sets the corresponding token_type_ids.

.. code-block:: python

vocab = Vocabulary()
vocab.add_word_lst("this is a demo . [SEP] another sentence .".split())

embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', pool_method='max')
words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo . [SEP] another sentence .".split()]])
print(embed(words).size())

The output is::

torch.Size([1, 9, 768])

With multiple [SEP]s, the token_type_id keeps alternating between 0 and 1. For example, for "first sentence [SEP] second sentence [SEP] third sentence" the
token_type_ids are [0, 0, 0, 1, 1, 1, 0, 0]. Note that [SEP] must be uppercase; [sep] is not recognized.
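
To make the alternation concrete, here is a hedged sketch reusing the pattern of the block above (the sentences are invented):

.. code-block:: python

vocab = Vocabulary()
vocab.add_word_lst("first sentence [SEP] second sentence [SEP] third sentence".split())

embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1')
words = torch.LongTensor([[vocab.to_index(w) for w in
                           "first sentence [SEP] second sentence [SEP] third sentence".split()]])
print(embed(words).size())  # expected: torch.Size([1, 8, 768])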

For more on :class:`~fastNLP.embeddings.BertEmbedding` , see :doc:`/tutorials/extend_1_bert_embedding`


Part VI: Using character-level embeddings
-----------------------------------------------------

Besides pretrained embeddings, fastNLP also provides CharEmbedding: :class:`~fastNLP.embeddings.CNNCharEmbedding` and
:class:`~fastNLP.embeddings.LSTMCharEmbedding` .
Besides pretrained embeddings, fastNLP also provides two character embeddings: :class:`~fastNLP.embeddings.CNNCharEmbedding` and
:class:`~fastNLP.embeddings.LSTMCharEmbedding` . Using a character embedding normally requires splitting words into characters during preprocessing, which
makes preprocessing rather tedious. In fastNLP, a character embedding too only needs a :class:`~fastNLP.Vocabulary` , the same
Vocabulary used by the other embeddings. Let's look at two examples.

An example of using CNNCharEmbedding:

.. code-block:: python

embed = CNNCharEmbedding(vocab, embed_size=100, char_emb_size=50)
import torch
from fastNLP.embeddings import CNNCharEmbedding
from fastNLP import Vocabulary

vocab = Vocabulary()
vocab.add_word_lst("this is a demo .".split())

# the character embedding dimension is 50; the returned embedding has dimension 64
embed = CNNCharEmbedding(vocab, embed_size=64, char_emb_size=50)
words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo .".split()]])
print(embed(words).size())

This means the CNNCharEmbedding's character embeddings have dimension 50 and the returned embedding has dimension 100.
The output is::

torch.Size([1, 5, 64])

Similarly to CNNCharEmbedding, an example of using LSTMCharEmbedding:

.. code-block:: python

embed = LSTMCharEmbedding(vocab, embed_size=100, char_emb_size=50)
import torch
from fastNLP.embeddings import LSTMCharEmbedding
from fastNLP import Vocabulary

This means the LSTMCharEmbedding's character embeddings have dimension 50 and the returned embedding has dimension 100.
vocab = Vocabulary()
vocab.add_word_lst("this is a demo .".split())

# the character embedding dimension is 50; the returned embedding has dimension 64
embed = LSTMCharEmbedding(vocab, embed_size=64, char_emb_size=50)
words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo .".split()]])
print(embed(words).size())

The output is::

torch.Size([1, 5, 64])


Part VII: Stacking multiple embeddings
-----------------------------------------------------
Part VI: Stacking multiple embeddings
-----------------------------------------------------

In fastNLP, :class:`~fastNLP.embeddings.StackEmbedding` stacks multiple embeddings
A character embedding alone is usually not very effective; it is combined with a word embedding. In fastNLP, embeddings can be stacked with :class:`~fastNLP.embeddings.StackEmbedding` ,
as in the example below

.. code-block:: python

import torch
from fastNLP.embeddings import StaticEmbedding, StackEmbedding, CNNCharEmbedding
from fastNLP import Vocabulary

vocab = Vocabulary()
vocab.add_word_lst("this is a demo .".split())

word_embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')
char_embed = CNNCharEmbedding(vocab, embed_size=64, char_emb_size=50)
embed = StackEmbedding([word_embed, char_embed])

words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo .".split()]])
print(embed(words).size())  # the output embedding dimension is 50+64=114

The output is::

torch.Size([1, 5, 114])

For example:
:class:`~fastNLP.embeddings.StaticEmbedding` , :class:`~fastNLP.embeddings.ElmoEmbedding` ,
:class:`~fastNLP.embeddings.CNNCharEmbedding` , :class:`~fastNLP.embeddings.BertEmbedding` and the rest can all be concatenated with one another.
:class:`~fastNLP.embeddings.StackEmbedding` is used like any other Embedding: feed it indices and it returns the corresponding representations. But the embeddings being
stacked must share the same :class:`~fastNLP.Vocabulary` , since only then is the same index guaranteed to refer to the same word or character



Part VIII: Other notes on Embedding
-----------------------------------------------------------

(1) Getting the dimension of the various Embeddings

.. code-block:: python

embed_1 = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50', requires_grad=True)
embed_2 = StaticEmbedding(vocab, model_dir_or_name='en-word2vec-300', requires_grad=True)
from fastNLP.embeddings import *

stack_embed = StackEmbedding([embed_1, embed_2])
vocab = Vocabulary()
vocab.add_word_lst("this is a demo .".split())

StackEmbedding concatenates the results of multiple embeddings; in the example above, stack_embed returns 350-dimensional embeddings.
static_embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')
print(static_embed.embedding_dim) # 50
char_embed = CNNCharEmbedding(vocab, embed_size=30)
print(char_embed.embedding_dim) # 30
elmo_embed_1 = ElmoEmbedding(vocab, model_dir_or_name='en-small', layers='2')
print(elmo_embed_1.embedding_dim) # 256
elmo_embed_2 = ElmoEmbedding(vocab, model_dir_or_name='en-small', layers='1,2')
print(elmo_embed_2.embedding_dim) # 512
bert_embed_1 = BertEmbedding(vocab, layers='-1', model_dir_or_name='en-base-cased')
print(bert_embed_1.embedding_dim) # 768
bert_embed_2 = BertEmbedding(vocab, layers='2,-1', model_dir_or_name='en-base-cased')
print(bert_embed_2.embedding_dim) # 1536
stack_embed = StackEmbedding([static_embed, char_embed])
print(stack_embed.embedding_dim) # 80

Beyond that, static embeddings can also be concatenated with context-dependent embeddings:
(2) Controlling whether an Embedding's weights are updated

.. code-block:: python

elmo_embedding = ElmoEmbedding(vocab, model_dir_or_name='medium', layers='0,1,2', requires_grad=False)
glove_embedding = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50', requires_grad=True)
from fastNLP.embeddings import *

vocab = Vocabulary()
vocab.add_word_lst("this is a demo .".split())

embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', requires_grad=True)  # set at initialization to be updated
embed.requires_grad = False  # freeze the BertEmbedding's weights

(3) word_dropout and dropout in the various Embeddings

Every Embedding in fastNLP accepts word_dropout and dropout parameters. word_dropout is the probability of replacing an input word with the unk index, which both
lets unk get trained and has some regularizing effect; dropout is the probability of zeroing dimensions of a word's representation once it has been obtained.

If you use a :class:`~fastNLP.embeddings.StackEmbedding` and need word_dropout, it is recommended to set word_dropout on the :class:`~fastNLP.embeddings.StackEmbedding` itself.
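
As a minimal sketch of what setting the two options looks like (the parameter values here are arbitrary):

.. code-block:: python

import torch
from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary

vocab = Vocabulary()
vocab.add_word_lst("this is a demo .".split())

# with probability 0.1 an input word is replaced by the unk index; during
# training, output dimensions are zeroed with probability 0.5 (arbitrary values)
embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=30,
                        word_dropout=0.1, dropout=0.5)
words = torch.LongTensor([[vocab.to_index(w) for w in "this is a demo .".split()]])
print(embed(words).size())  # torch.Size([1, 5, 30])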



Part IX: Tips for using StaticEmbedding
-----------------------------------------------------------

For English named entity recognition (NER), `Named Entity Recognition with Bidirectional LSTM-CNNs <http://xxx.itp.ac.cn/pdf/1511.08308.pdf>`_ showed that using a cnn character embedding together with a word embedding
improves NER considerably. As you saw in the previous part, fastNLP can combine a :class:`~fastNLP.embeddings.CNNCharEmbedding`
with a :class:`~fastNLP.embeddings.StaticEmbedding` into a :class:`~fastNLP.embeddings.StackEmbedding` . Used this way, preprocessing should
neither lowercase the text (the character embedding needs the case information in words) nor map words below some frequency threshold to unk (the
character embedding needs the surface forms); yet some pretrained word vectors used by :class:`~fastNLP.embeddings.StaticEmbedding` only contain lowercased
words, and some low-frequency words never appear in pretraining and need to be pruned. In short: (1) the character embedding must preserve case while the pretrained word vectors may not have it; (2)
the character embedding must keep all surface forms, while the static embedding needs a minimum frequency threshold to learn better representations.

(1) How fastNLP handles the case problem

fastNLP solves this with a lower parameter added to :class:`~fastNLP.embeddings.StaticEmbedding` , as the following example shows

.. code-block:: python

import torch
from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary

vocab = Vocabulary().add_word_lst("The the a A".split())
# a random StaticEmbedding is used below; the behavior is the same with pretrained word vectors
embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5)
print(embed(torch.LongTensor([vocab.to_index('The')])))
print(embed(torch.LongTensor([vocab.to_index('the')])))

The output is::

tensor([[-0.4685, 0.4572, 0.5159, -0.2618, -0.6871]], grad_fn=<EmbeddingBackward>)
tensor([[ 0.2615, 0.1490, -0.2491, 0.4009, -0.3842]], grad_fn=<EmbeddingBackward>)

可以看到"The"与"the"的vector是不一致的。但如果我们在初始化 :class:`~fastNLP.embeddings.StaticEmbedding` 将lower设置为True,效果将
如下所示

.. code-block:: python

import torch
from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary

vocab = Vocabulary().add_word_lst("The the a A".split())
# a random StaticEmbedding is used below; the behavior is the same with pretrained vectors
embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5, lower=True)
print(embed(torch.LongTensor([vocab.to_index('The')])))
print(embed(torch.LongTensor([vocab.to_index('the')])))

The output is::

tensor([[-0.2237, 0.6825, -0.3459, -0.1795, 0.7516]], grad_fn=<EmbeddingBackward>)
tensor([[-0.2237, 0.6825, -0.3459, -0.1795, 0.7516]], grad_fn=<EmbeddingBackward>)

可以看到"The"与"the"的vector是一致的。他们实际上也是引用的同一个vector。通过将lower设置为True,可以在 :class:`~fastNLP.embeddings.StaticEmbedding`
实现类似具备相同小写结果的词语引用同一个vector。

(2) How fastNLP handles the min_freq problem

fastNLP solves this with a min_freq parameter added to :class:`~fastNLP.embeddings.StaticEmbedding` , as the following example shows

.. code-block:: python

import torch
from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary

vocab = Vocabulary().add_word_lst("the the the a".split())
# a random StaticEmbedding is used below; the behavior is the same with pretrained vectors
embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5, min_freq=2)
print(embed(torch.LongTensor([vocab.to_index('the')])))
print(embed(torch.LongTensor([vocab.to_index('a')])))
print(embed(torch.LongTensor([vocab.unknown_idx])))

The output is::

tensor([[ 0.0454, 0.3375, 0.6758, -0.2026, -0.4715]], grad_fn=<EmbeddingBackward>)
tensor([[-0.7602, 0.0149, 0.2733, 0.3974, 0.7371]], grad_fn=<EmbeddingBackward>)
tensor([[-0.7602, 0.0149, 0.2733, 0.3974, 0.7371]], grad_fn=<EmbeddingBackward>)

The last line is the vector of the unknown value. a's vector is identical to unknown's: a's frequency is below 2, so it is pointed at the unknown
representation, while the meets the frequency threshold and keeps a representation of its own.

min_freq also takes lower into account, for example

.. code-block:: python

import torch
from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary

vocab = Vocabulary().add_word_lst("the the the a A".split())
# a random StaticEmbedding is used below; the behavior is the same with pretrained vectors
embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5, min_freq=2, lower=True)
print(embed(torch.LongTensor([vocab.to_index('the')])))
print(embed(torch.LongTensor([vocab.to_index('a')])))
print(embed(torch.LongTensor([vocab.to_index('A')])))
print(embed(torch.LongTensor([vocab.unknown_idx])))

The output is::

tensor([[-0.7453, -0.5542, 0.5039, 0.6195, -0.4723]], grad_fn=<EmbeddingBackward>) # the
tensor([[ 0.0170, -0.0995, -0.5743, -0.2469, -0.2095]], grad_fn=<EmbeddingBackward>) # a
tensor([[ 0.0170, -0.0995, -0.5743, -0.2469, -0.2095]], grad_fn=<EmbeddingBackward>) # A
tensor([[ 0.6707, -0.5786, -0.6967, 0.0111, 0.1209]], grad_fn=<EmbeddingBackward>) # unk

stack_embed = StackEmbedding([elmo_embedding, glove_embedding])
Note that a no longer shares a representation with unknown in the last line: both a and A count toward a's frequency, and A's representation is a's representation.
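
Putting the advice of this part together, a hedged sketch of the recommended combination (the parameter values are illustrative, not prescriptive):

.. code-block:: python

import torch
from fastNLP.embeddings import StaticEmbedding, CNNCharEmbedding, StackEmbedding
from fastNLP import Vocabulary

vocab = Vocabulary().add_word_lst("The the a A demo Demo".split())

# keep case and all surface forms for the character embedding, but
# lowercase and frequency-threshold the static embedding only
word_embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=50,
                             lower=True, min_freq=2)
char_embed = CNNCharEmbedding(vocab, embed_size=30)
embed = StackEmbedding([word_embed, char_embed])

words = torch.LongTensor([[vocab.to_index(w) for w in "The demo".split()]])
print(embed(words).size())  # torch.Size([1, 2, 80])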

+ 210
- 0
docs/source/tutorials/tutorial_4_load_dataset.rst View File

@@ -0,0 +1,210 @@
=====================================================
Loading and processing datasets with Loader and Pipe
=====================================================

This section is a tutorial on how to load datasets.

Contents:

- `Part I: The dataset container DataBundle`_
- `Part II: Loaders for the various datasets`_
- `Part III: Preprocessing datasets with Pipe`_
- `Part IV: Loaders and Pipes bundled with fastNLP`_
- `Part V: Basic Loaders for different file formats`_


Part I: The dataset container DataBundle
-----------------------------------------

Since, for a given task, the training, validation and test sets share one vocabulary and have the same target values, fastNLP uses a :class:`~fastNLP.io.DataBundle`
to hold a task's multiple :class:`~fastNLP.DataSet` s together with their vocabularies ( :class:`~fastNLP.Vocabulary` ). Examples of using :class:`~fastNLP.io.DataBundle`
follow below.

:class:`~fastNLP.io.DataBundle` is mainly used inside the various :class:`~fastNLP.io.Loader` s and :class:`~fastNLP.io.Pipe` s in fastNLP.
Let's first introduce :class:`~fastNLP.io.Loader` and :class:`~fastNLP.io.Pipe` .

Part II: Loaders for the various datasets
------------------------------------------

In fastNLP, every :class:`~fastNLP.io.Loader` documents both the data format it can read and the format of the :class:`~fastNLP.DataSet` it returns;
see e.g. :class:`~fastNLP.io.ChnSentiCorpLoader` .

- **download()**: automatically downloads the dataset to the cache (default: ~/.fastNLP/datasets/). For copyright and similar reasons, not every Loader implements this method. It returns the cache path of the downloaded files.
- **_load()**: reads one data file and returns a :class:`~fastNLP.DataSet` ; the format of the returned DataSet can be found in the Loader's documentation.
- **load()**: reads data from a file or folder into :class:`~fastNLP.DataSet` s and assembles them into a :class:`~fastNLP.io.DataBundle` . The accepted argument types are the following (a dict example is sketched after the first code block below):

- None: try to read the automatically cached data; only supported by Loaders that provide automatic download
- a folder path: by default, match the files whose names contain `train` , `test` or `dev` in that folder; if several files contain the same keyword, reading this way fails
- a dict, e.g. {'train':"/path/to/tr.conll", 'dev':"/to/validate.conll", "test":"/to/te.conll"}.

.. code-block:: python

from fastNLP.io import CWSLoader

loader = CWSLoader(dataset_name='pku')
data_bundle = loader.load()
print(data_bundle)

The output is::

In total 3 datasets:
dev has 1831 instances.
train has 17223 instances.
test has 1944 instances.

This says there are 3 datasets in total, where:

- the 3 datasets are named train, dev and test, with 17223, 1831 and 1944 instances respectively
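
As mentioned in the list above, load() also accepts a dict of paths; a minimal sketch (the paths are placeholders for your own files):

.. code-block:: python

from fastNLP.io import CWSLoader

# the paths below are hypothetical; point them at your own files
paths = {'train': 'path/to/train.txt',
         'dev': 'path/to/dev.txt',
         'test': 'path/to/test.txt'}
data_bundle = CWSLoader().load(paths)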

You can also take a DataSet out and print its contents

.. code-block:: python

tr_data = data_bundle.get_dataset('train')
print(tr_data[:2])

The output is::

+--------------------------------------------------------------------------------------+
| raw_words |
+--------------------------------------------------------------------------------------+
| 迈向 充满 希望 的 新 世纪 —— 一九九八年 新年 讲话 ( 附 图片 1 张 ) |
| 中共中央 总书记 、 国家 主席 江 泽民 |
+--------------------------------------------------------------------------------------+

Part III: Preprocessing datasets with Pipe
------------------------------------------
A :class:`~fastNLP.io.Loader` reads text data in, but the data cannot be used by a neural network directly; some preprocessing is still needed.

In fastNLP, subclasses of :class:`~fastNLP.io.Pipe` do the data preprocessing. :class:`~fastNLP.io.Loader` and :class:`~fastNLP.io.Pipe` generally come in one-to-one pairs, recognizable from their names;
for example, :class:`~fastNLP.io.CWSLoader` pairs with :class:`~fastNLP.io.CWSPipe` . A Pipe usually performs the following steps: (1) tokenize raw_words or
raw_chars into words or characters; (2) build a :class:`~fastNLP.Vocabulary` over the words or characters and convert them to indices; (3) build a
vocabulary over the target column and convert it to indices.

Every Pipe documents the :class:`~fastNLP.DataSet` it can process and the vocabularies of the :class:`~fastNLP.io.DataBundle` it returns;
see e.g. :class:`~fastNLP.io.OntoNotesNERPipe`

Every dataset Pipe contains the following two functions:

- process(): processes the input :class:`~fastNLP.io.DataBundle` and returns the processed :class:`~fastNLP.io.DataBundle` . The documentation of process states the DataSet formats the Pipe can handle.
- process_from_file(): takes the folder containing the dataset, reads the data with the corresponding Loader (so the argument types it supports are determined by the load function of that Loader), then calls the corresponding process function to preprocess the data; in effect, load and process in one call (a sketch follows this list).
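
A minimal sketch of process_from_file (the folder path is a placeholder):

.. code-block:: python

from fastNLP.io import CWSPipe

# read the data with the corresponding Loader and preprocess it with CWSPipe
# in a single call; 'path/to/cws/data' stands for your own data folder
data_bundle = CWSPipe().process_from_file('path/to/cws/data')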

Continuing the :class:`~fastNLP.io.CWSLoader` example above, let's demonstrate what :class:`~fastNLP.io.CWSPipe` does:

.. code-block:: python

from fastNLP.io import CWSPipe

data_bundle = CWSPipe().process(data_bundle)
print(data_bundle)

The output is::

In total 3 datasets:
dev has 1831 instances.
train has 17223 instances.
test has 1944 instances.
In total 2 vocabs:
chars has 4777 entries.
target has 4 entries.

This says there are 3 datasets and 2 vocabularies, where:

- the 3 datasets are named train, dev and test, with 17223, 1831 and 1944 instances respectively
- the 2 vocabularies are the chars vocabulary and the target vocabulary. The chars vocabulary is built from the sentence text and contains 4777 distinct characters; the target vocabulary is built from the target labels and contains 4 labels.

Compared with the DataBundle previously read by CWSLoader, two Vocabularies have been added. Let's print the processed DataSet

.. code-block:: python

tr_data = data_bundle.get_dataset('train')
print(tr_data[:2])

The output is::

+---------------------------------------------------+------------------------------------+------------------------------------+---------+
| raw_words | chars | target | seq_len |
+---------------------------------------------------+------------------------------------+------------------------------------+---------+
| 迈向 充满 希望 的 新 世纪 —— 一九九八年... | [1224, 178, 674, 544, 573, 435,... | [0, 1, 0, 1, 0, 1, 2, 2, 0, 1, ... | 29 |
| 中共中央 总书记 、 国家 主席 江 泽民 | [11, 212, 11, 335, 124, 256, 10... | [0, 3, 3, 1, 0, 3, 1, 2, 0, 1, ... | 15 |
+---------------------------------------------------+------------------------------------+------------------------------------+---------+

Two fields, chars and target, now hold ints. Their names are also the names of the Vocabularies in the DataBundle; a Vocabulary can be retrieved and inspected
with the following code

.. code-block:: python

vocab = data_bundle.get_vocab('target')
print(vocab)

The output is::

Vocabulary(['B', 'E', 'S', 'M']...)


Part IV: Loaders and Pipes bundled with fastNLP
------------------------------------------------

fastNLP bundles :class:`~fastNLP.io.Loader` s and :class:`~fastNLP.io.Pipe` s for many tasks/datasets, with automatic download; see the document
`Datasets <https://docs.qq.com/sheet/DVnpkTnF6VW9UeXdh?c=A1A0A0>`_ for details


Part V: Basic Loaders for different file formats
--------------------------------------------------------

Besides the task-specific Loaders above, CSV- and JSON-format Loaders are also provided

:class:`~fastNLP.io.loader.CSVLoader` reads CSV dataset files. For example:

.. code-block:: python

from fastNLP.io.loader import CSVLoader
data_set_loader = CSVLoader(
    headers=('raw_words', 'target'), sep='\t'
)
# each line of the CSV file fills the 'raw_words' field with its first item
# and the 'target' field with its second; the items are separated by '\t'

data_set = data_set_loader._load('path/to/your/file')

A sample of the file looks like ::

But it does not leave you with much . 1
You could hate it for the same reason . 1
The performances are an absolute joy . 4

The DataSet read in has the following fields

.. csv-table::
:header: raw_words, target

"But it does not leave you with much .", "1"
"You could hate it for the same reason .", "1"
"The performances are an absolute joy .", "4"

:class:`~fastNLP.io.JsonLoader` reads JSON dataset files; the data must be stored one JSON object per line, each object carrying the various attributes. For example:

.. code-block:: python

from fastNLP.io.loader import JsonLoader
loader = JsonLoader(
    fields={'sentence1': 'raw_words1', 'sentence2': 'raw_words2', 'gold_label': 'target'}
)
# the values of 'sentence1', 'sentence2' and 'gold_label' in each Json object
# are assigned to the 'raw_words1', 'raw_words2' and 'target' fields respectively

data_set = loader._load('path/to/your/file')

A sample of the dataset looks like ::

{"annotator_labels": ["neutral"], "captionID": "3416050480.jpg#4", "gold_label": "neutral", "pairID": "3416050480.jpg#4r1n", "sentence1": "A person on a horse jumps over a broken down airplane.", "sentence1_binary_parse": "( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))", "sentence2": "A person is training his horse for a competition.", "sentence2_binary_parse": "( ( A person ) ( ( is ( ( training ( his horse ) ) ( for ( a competition ) ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (VP (VBG training) (NP (PRP$ his) (NN horse)) (PP (IN for) (NP (DT a) (NN competition))))) (. .)))"}
{"annotator_labels": ["contradiction"], "captionID": "3416050480.jpg#4", "gold_label": "contradiction", "pairID": "3416050480.jpg#4r1c", "sentence1": "A person on a horse jumps over a broken down airplane.", "sentence1_binary_parse": "( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))", "sentence2": "A person is at a diner, ordering an omelette.", "sentence2_binary_parse": "( ( A person ) ( ( ( ( is ( at ( a diner ) ) ) , ) ( ordering ( an omelette ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (PP (IN at) (NP (DT a) (NN diner))) (, ,) (S (VP (VBG ordering) (NP (DT an) (NN omelette))))) (. .)))"}
{"annotator_labels": ["entailment"], "captionID": "3416050480.jpg#4", "gold_label": "entailment", "pairID": "3416050480.jpg#4r1e", "sentence1": "A person on a horse jumps over a broken down airplane.", "sentence1_binary_parse": "( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))", "sentence2": "A person is outdoors, on a horse.", "sentence2_binary_parse": "( ( A person ) ( ( ( ( is outdoors ) , ) ( on ( a horse ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (ADVP (RB outdoors)) (, ,) (PP (IN on) (NP (DT a) (NN horse)))) (. .)))"}

The DataSet read in has the following fields

.. csv-table::
:header: raw_words1, raw_words2, target

"A person on a horse jumps over a broken down airplane.", "A person is training his horse for a competition.", "neutral"
"A person on a horse jumps over a broken down airplane.", "A person is at a diner, ordering an omelette.", "contradiction"
"A person on a horse jumps over a broken down airplane.", "A person is outdoors, on a horse.", "entailment"

+ 0
- 267
docs/source/tutorials/tutorial_4_loss_optimizer.rst View File

@@ -1,267 +0,0 @@
=================================================================================
Building a text classifier I: quick training and testing with Trainer and Tester
=================================================================================

We use the same task as in :doc:`/user/quickstart` for a detailed walkthrough: given a piece of review text, predict whether its sentiment is positive (label=1),
negative (label=0) or neutral (label=2), using :class:`~fastNLP.Trainer` and :class:`~fastNLP.Tester` for quick training and testing.

---------------
Data processing
---------------

Reading the data
The :class:`~fastNLP.io.SSTLoader` class in fastNLP's :mod:`fastNLP.io` module makes it easy to read the SST dataset (source: https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip).
The dataset here is an object of fastNLP's :class:`~fastNLP.DataSet` class.

.. code-block:: python

from fastNLP.io import SSTLoader

loader = SSTLoader()
# all.txt is the concatenation of train.txt, dev.txt and test.txt from the downloaded data
dataset = loader.load("./trainDevTestTrees_PTB/trees/all.txt")
print(dataset[0])

The output is::

{'words': ['It', "'s", 'a', 'lovely', 'film', 'with', 'lovely', 'performances', 'by', 'Buy', 'and', 'Accorsi', '.'] type=list,
'target': positive type=str}

Besides reading data, fastNLP also provides Loader classes for other file types, Loaders for reading embeddings, and more; see :doc:`/fastNLP.io` .

Processing the data
We use the :meth:`~fastNLP.DataSet.apply` method of the :class:`~fastNLP.DataSet` class to convert the ``target`` :mod:`~fastNLP.core.field` to integers.

.. code-block:: python

def label_to_int(x):
    if x['target'] == "positive":
        return 1
    elif x['target'] == "negative":
        return 0
    else:
        return 2

# convert the labels to integers
dataset.apply(lambda x: label_to_int(x), new_field_name='target')

``words`` and ``target`` are already enough to train :class:`~fastNLP.models.CNNText` , but the documentation of
:class:`~fastNLP.models.CNNText` shows that :meth:`~fastNLP.models.CNNText.forward` also accepts an optional ``seq_len`` argument,
so we use the :meth:`~fastNLP.DataSet.apply_field` method to add a :mod:`~fastNLP.core.field` named ``seq_len`` .

.. code-block:: python

# add length information
dataset.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')

Note that :meth:`~fastNLP.DataSet.apply_field` is similar to :meth:`~fastNLP.DataSet.apply` ,
but the `lambda` passed to it operates on a single :mod:`~fastNLP.core.field` of an :class:`~fastNLP.Instance` ,
whereas the `lambda` passed to :meth:`~fastNLP.DataSet.apply` operates on the whole :class:`~fastNLP.Instance` .

.. note::
A `lambda` is an anonymous function, an important feature of Python. ``lambda x: len(x)`` does the same as the function below::

    def func_lambda(x):
        return len(x)

You can also pass more complex functions to :meth:`~fastNLP.DataSet.apply_field` and :meth:`~fastNLP.DataSet.apply`

Using Vocabulary
We then use the :class:`~fastNLP.Vocabulary` class to count the words appearing in the data, and :meth:`~fastNLP.Vocabulary.index_dataset`
to turn the word sequences into numeric sequences usable for training.

.. code-block:: python

from fastNLP import Vocabulary

# count words with Vocabulary and convert the word sequences to index sequences
vocab = Vocabulary(min_freq=2).from_dataset(dataset, field_name='words')
vocab.index_dataset(dataset, field_name='words',new_field_name='words')
print(dataset[0])

The output is::

{'words': [27, 9, 6, 913, 16, 18, 913, 124, 31, 5715, 5, 1, 2] type=list,
'target': 1 type=int,
'seq_len': 13 type=int}


------------------------------
Training with a built-in model
------------------------------

Input and output names of the built-in models
fastNLP ships several complete neural network models; see :doc:`/fastNLP.models` . We train with its :class:`~fastNLP.models.CNNText` model.
To use the built-in :class:`~fastNLP.models.CNNText` we must rename the :mod:`~fastNLP.core.field` s of the :class:`~fastNLP.DataSet` .
In this example the model inputs (the arguments of the forward method) are ``words`` and ``seq_len`` ; the prediction output is ``pred`` and the gold answer is ``target`` .
See :doc:`/fastNLP.core.const` for the exact naming conventions.

If you'd rather not consult the documentation, you can also use the :class:`~fastNLP.Const` class for the names. The code below shows the :meth:`~fastNLP.DataSet.rename_field`
method that renames a :mod:`~fastNLP.core.field` of a :class:`~fastNLP.DataSet` , and how the :class:`~fastNLP.Const` class is used.

.. code-block:: python

from fastNLP import Const

dataset.rename_field('words', Const.INPUT)
dataset.rename_field('seq_len', Const.INPUT_LEN)
dataset.rename_field('target', Const.TARGET)

print(Const.INPUT)
print(Const.INPUT_LEN)
print(Const.TARGET)
print(Const.OUTPUT)

The output is::

words
seq_len
target
pred
After renaming the :mod:`~fastNLP.core.field` s of the :class:`~fastNLP.DataSet` we still need to mark the inputs and targets used in training, with the
:meth:`~fastNLP.DataSet.set_input` and :meth:`~fastNLP.DataSet.set_target` functions.

.. code-block:: python

# set_input and set_target tell the model which data in the dataset are inputs and which are labels (target outputs)
dataset.set_input(Const.INPUT, Const.INPUT_LEN)
dataset.set_target(Const.TARGET)

Splitting the dataset
Besides modifying :mod:`~fastNLP.core.field` s, we can also split a :class:`~fastNLP.DataSet` for training, development and testing.
The following code shows how :meth:`~fastNLP.DataSet.split` is used

.. code-block:: python

train_dev_data, test_data = dataset.split(0.1)
train_data, dev_data = train_dev_data.split(0.1)
print(len(train_data), len(dev_data), len(test_data))

The output is::

9603 1067 1185

Evaluation metric
Training a model requires an evaluation metric; here we use accuracy. The `naming rules` for the arguments are similar to the above:
the ``pred`` argument is the name of a key in the dict returned by the model's forward method, and
the ``target`` argument is the name of the :mod:`~fastNLP.core.field` of the :class:`~fastNLP.DataSet` that serves as the label.

.. code-block:: python

from fastNLP import AccuracyMetric
# metrics=AccuracyMetric() is equivalent to the line below in this example
metrics=AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET)
Loss function
Training a model also requires a loss function. fastNLP provides four losses that can be imported and used directly:

* :class:`~fastNLP.CrossEntropyLoss` : wraps torch.nn.functional.cross_entropy() and returns the cross-entropy loss (applicable to multi-class classification)
* :class:`~fastNLP.BCELoss` : wraps torch.nn.functional.binary_cross_entropy() and returns the binary cross-entropy
* :class:`~fastNLP.L1Loss` : wraps torch.nn.functional.l1_loss() and returns the L1 loss
* :class:`~fastNLP.NLLLoss` : wraps torch.nn.functional.nll_loss() and returns the negative log-likelihood loss

Below is the cross-entropy loss commonly used in classification. Note its **initialization arguments** :
the ``pred`` argument is the name of a key in the dict returned by the model's forward method, and
the ``target`` argument is the name of the :mod:`~fastNLP.core.field` of the :class:`~fastNLP.DataSet` that serves as the label.
Here we let :class:`~fastNLP.Const` help with the naming; if the return values of your own model's forward method or
the :mod:`~fastNLP.core.field` names in your dataset differ from this example, set the ``pred`` and ``target`` arguments to values matching your code.

.. code-block:: python

from fastNLP import CrossEntropyLoss
# loss = CrossEntropyLoss() is equivalent to the line below in this example
loss = CrossEntropyLoss(pred=Const.OUTPUT, target=Const.TARGET)
Optimizer
Define the optimizer used when the model runs. You can use an optimizer wrapped by fastNLP:

* :class:`~fastNLP.SGD` : wraps the torch.optim.SGD optimizer
* :class:`~fastNLP.Adam` : wraps the torch.optim.Adam optimizer

You can also use a torch.optim.Optimizer optimizer directly, passing it in when instantiating the :class:`~fastNLP.Trainer` class

.. code-block:: python

import torch.optim as optim
from fastNLP import Adam

# define an optimizer with torch.optim
optimizer_1=optim.RMSprop(model_cnn.parameters(), lr=0.01, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False)
# define an optimizer with fastNLP's wrapped Adam
optimizer_2=Adam(lr=4e-3, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, model_params=model_cnn.parameters())

Quick training
Now we can import fastNLP's built-in text classification model :class:`~fastNLP.models.CNNText` and train it with a :class:`~fastNLP.Trainer` .
Instead of training with a :class:`~fastNLP.Trainer` , you can also write your own training loop with :class:`~fastNLP.DataSetIter` ; see :doc:`/tutorials/tutorial_5_datasetiter`

.. code-block:: python

from fastNLP.models import CNNText

# embedding dimension, number of training epochs and batch size
EMBED_DIM = 100
N_EPOCHS = 10
BATCH_SIZE = 16

# CNNText's first argument is a tuple that configures the model's embedding
# custom kernel_nums, kernel_sizes, padding and dropout values can also be passed in
model_cnn = CNNText((len(vocab), EMBED_DIM), num_classes=3, padding=2, dropout=0.1)

# if no optimizer is passed when defining the trainer, the default is torch.optim.Adam with lr=4e-3
# only optimizer_1 is used here; feel free to try optimizer_2 or another optimizer
# only loss is used here as the loss function; feel free to try other loss functions
trainer = Trainer(model=model_cnn, train_data=train_data, dev_data=dev_data, loss=loss, metrics=metrics,
                  optimizer=optimizer_1, n_epochs=N_EPOCHS, batch_size=BATCH_SIZE)
trainer.train()

The training output is::

input fields after batch(if batch size is 2):
words: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 40])
seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2])
target fields after batch(if batch size is 2):
target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2])

training epochs started 2019-07-08-15-44-48
Evaluation at Epoch 1/10. Step:601/6010. AccuracyMetric: acc=0.59044

Evaluation at Epoch 2/10. Step:1202/6010. AccuracyMetric: acc=0.599813

Evaluation at Epoch 3/10. Step:1803/6010. AccuracyMetric: acc=0.508903

Evaluation at Epoch 4/10. Step:2404/6010. AccuracyMetric: acc=0.596064

Evaluation at Epoch 5/10. Step:3005/6010. AccuracyMetric: acc=0.47985

Evaluation at Epoch 6/10. Step:3606/6010. AccuracyMetric: acc=0.589503

Evaluation at Epoch 7/10. Step:4207/6010. AccuracyMetric: acc=0.311153

Evaluation at Epoch 8/10. Step:4808/6010. AccuracyMetric: acc=0.549203

Evaluation at Epoch 9/10. Step:5409/6010. AccuracyMetric: acc=0.581068

Evaluation at Epoch 10/10. Step:6010/6010. AccuracyMetric: acc=0.523899


In Epoch:2/Step:1202, got best dev performance:AccuracyMetric: acc=0.599813
Reloaded the best model.

Quick testing
As the counterpart of :class:`~fastNLP.Trainer`, fastNLP also provides :class:`~fastNLP.Tester` for quick testing. It is used as follows.

.. code-block:: python

from fastNLP import Tester

tester = Tester(test_data, model_cnn, metrics=AccuracyMetric())
tester.test()

The test output is as follows::

[tester]
AccuracyMetric: acc=0.565401

+ 0
- 250
docs/source/tutorials/tutorial_5_datasetiter.rst View File

@@ -1,250 +0,0 @@
==============================================================================
Building a Text Classifier by Hand II: a Custom Training Loop with DataSetIter
==============================================================================

We walk through the same task as in :doc:`/user/quickstart` in more detail: given a piece of review text, predict whether its sentiment is positive (label=1),
negative (label=0), or neutral (label=2), writing our own training loop with the :class:`~fastNLP.DataSetIter` class.
Everything before the custom training loop is identical to :doc:`/tutorials/tutorial_4_loss_optimizer`; skip ahead if you have already read it.

---------------
Data processing
---------------

Loading data
We can use the :class:`~fastNLP.io.SSTLoader` class from fastNLP's :mod:`fastNLP.io` module to read the SST dataset easily (data source: https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip).
Here, dataset is an object of fastNLP's :class:`~fastNLP.DataSet` class.

.. code-block:: python

from fastNLP.io import SSTLoader

loader = SSTLoader()
# all.txt here is the concatenation of train.txt, dev.txt, and test.txt from the downloaded data
dataset = loader.load("./trainDevTestTrees_PTB/trees/all.txt")
print(dataset[0])

The output is::

    {'words': ['It', "'s", 'a', 'lovely', 'film', 'with', 'lovely', 'performances', 'by', 'Buy', 'and', 'Accorsi', '.'] type=list,
    'target': positive type=str}

Besides reading the data in, fastNLP also provides Loader classes for other file types, Loaders for reading embeddings, and more. See :doc:`/fastNLP.io` for details.

Data processing
We use the :meth:`~fastNLP.DataSet.apply` method of the :class:`~fastNLP.DataSet` class to convert the ``target`` :mod:`~fastNLP.core.field` to integers.

.. code-block:: python

def label_to_int(x):
if x['target']=="positive":
return 1
elif x['target']=="negative":
return 0
else:
return 2

# convert the label to an integer
dataset.apply(lambda x: label_to_int(x), new_field_name='target')

``words`` and ``target`` are already sufficient for training :class:`~fastNLP.models.CNNText`, but the documentation of
:class:`~fastNLP.models.CNNText` shows that :meth:`~fastNLP.models.CNNText.forward` also accepts an optional ``seq_len`` argument.
So we use :meth:`~fastNLP.DataSet.apply_field` to add a :mod:`~fastNLP.core.field` named ``seq_len``.

.. code-block:: python

# add length information
dataset.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')

Observe that :meth:`~fastNLP.DataSet.apply_field` is similar to :meth:`~fastNLP.DataSet.apply`,
but the `lambda` passed to it operates on a single :mod:`~fastNLP.core.field` of an :class:`~fastNLP.Instance`,
whereas the `lambda` passed to :meth:`~fastNLP.DataSet.apply` operates on the whole :class:`~fastNLP.Instance`.
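
For instance, the ``seq_len`` field above can be produced equally well with :meth:`~fastNLP.DataSet.apply`; a minimal sketch, reusing the dataset built above:

.. code-block:: python

    # equivalent to the apply_field call above: apply receives the whole
    # Instance, so we index the 'words' field ourselves
    dataset.apply(lambda ins: len(ins['words']), new_field_name='seq_len')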

.. note::
    A `lambda` is an anonymous function, an important feature of Python. ``lambda x: len(x)`` does the same as the following function::

def func_lambda(x):
return len(x)

You can also write more complex functions and pass them to :meth:`~fastNLP.DataSet.apply_field` and :meth:`~fastNLP.DataSet.apply`.

Using Vocabulary
We then use the :class:`~fastNLP.Vocabulary` class to collect the words that occur in the data, and use :meth:`~fastNLP.Vocabulary.index_dataset`
to convert the word sequences into number sequences usable for training.

.. code-block:: python

from fastNLP import Vocabulary

# use the Vocabulary class to collect words and convert word sequences into number sequences
vocab = Vocabulary(min_freq=2).from_dataset(dataset, field_name='words')
vocab.index_dataset(dataset, field_name='words',new_field_name='words')
print(dataset[0])

The output is::

{'words': [27, 9, 6, 913, 16, 18, 913, 124, 31, 5715, 5, 1, 2] type=list,
'target': 1 type=int,
'seq_len': 13 type=int}


------------------------------
Training with built-in models
------------------------------

Input and output naming for built-in models
fastNLP ships several complete neural network models; see :doc:`/fastNLP.models` for details. We train with :class:`~fastNLP.models.CNNText` here.
To use the built-in :class:`~fastNLP.models.CNNText`, we must rename the :mod:`~fastNLP.core.field` entries in the :class:`~fastNLP.DataSet`.
In this example the model inputs (the arguments of the forward method) are ``words`` and ``seq_len``; the prediction output is ``pred``; and the gold answer is ``target``.
See :doc:`/fastNLP.core.const` for the naming conventions.

If you would rather not consult the documentation, you can also use the :class:`~fastNLP.Const` class for naming. The code below shows the
:meth:`~fastNLP.DataSet.rename_field` method for renaming a :mod:`~fastNLP.core.field` in a :class:`~fastNLP.DataSet`, as well as how to use the :class:`~fastNLP.Const` class.

.. code-block:: python

from fastNLP import Const

dataset.rename_field('words', Const.INPUT)
dataset.rename_field('seq_len', Const.INPUT_LEN)
dataset.rename_field('target', Const.TARGET)

print(Const.INPUT)
print(Const.INPUT_LEN)
print(Const.TARGET)
print(Const.OUTPUT)

The output is::

words
seq_len
target
pred

After renaming the :mod:`~fastNLP.core.field` entries in the :class:`~fastNLP.DataSet`, we still have to declare the inputs and targets for training, using the
:meth:`~fastNLP.DataSet.set_input` and :meth:`~fastNLP.DataSet.set_target` functions.

.. code-block:: python

# use the dataset's set_input and set_target functions to tell the model which fields are inputs and which are labels (target outputs)
dataset.set_input(Const.INPUT, Const.INPUT_LEN)
dataset.set_target(Const.TARGET)

Dataset splitting
Besides modifying :mod:`~fastNLP.core.field`, we can also split a :class:`~fastNLP.DataSet` for training, development, and testing.
The following code shows how to use :meth:`~fastNLP.DataSet.split`.

.. code-block:: python

train_dev_data, test_data = dataset.split(0.1)
train_data, dev_data = train_dev_data.split(0.1)
print(len(train_data), len(dev_data), len(test_data))

The output is::

    9603 1067 1185

Evaluation metric
Training a model requires an evaluation metric; here we use accuracy. The `naming convention` for the arguments is the same as above:
the ``pred`` argument corresponds to a key in the dict returned by the model's forward method, and
the ``target`` argument corresponds to the name of the :mod:`~fastNLP.core.field` in the :class:`~fastNLP.DataSet` that serves as the label.

.. code-block:: python

from fastNLP import AccuracyMetric
# in this example, metrics=AccuracyMetric() is equivalent to the line below
metrics=AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET)


--------------------------------
Writing your own training loop
--------------------------------
If you want to write your own training loop, PyTorch style, you can use the code below as a reference.
It uses :class:`~fastNLP.DataSetIter`, provided by fastNLP, to obtain mini-batches of training data,
with :class:`~fastNLP.BucketSampler` passed as an argument to :class:`~fastNLP.DataSetIter` to choose the sampling strategy.

DataSetIter
fastNLP's :class:`~fastNLP.DataSetIter` class defines a batch and implements several batch-related features. Its initialization parameters are:

* dataset: a :class:`~fastNLP.DataSet` object, the dataset
* batch_size: the size of the batches to draw
* sampler: the :class:`~fastNLP.Sampler` to use; if None, :class:`~fastNLP.RandomSampler` is used (Default: None)
* as_numpy: if True, output batches as `numpy.array`, otherwise as `torch.Tensor` (Default: False)
* prefetch: if True, use multiple processes to prefetch the next batch (Default: False)

sampler
The samplers implemented in fastNLP are:

* :class:`~fastNLP.BucketSampler`: randomly draws elements of similar length [init parameters: num_buckets, the number of buckets; batch_size, the batch size; seq_len_field_name, the name of the :mod:`~fastNLP.core.field` in the dataset holding the sequence length]
* SequentialSampler: draws elements in order [no init parameters]
* RandomSampler: draws elements at random [no init parameters]

The following code uses BucketSampler as the input to initialize :class:`~fastNLP.DataSetIter` and writes its own training loop with :class:`~fastNLP.DataSetIter`.

.. code-block:: python

from fastNLP import BucketSampler
from fastNLP import DataSetIter
from fastNLP.models import CNNText
from fastNLP import Tester
import torch
import time

embed_dim = 100
model = CNNText((len(vocab),embed_dim), num_classes=3, padding=2, dropout=0.1)

def train(epoch, data, devdata):
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
lossfunc = torch.nn.CrossEntropyLoss()
batch_size = 32

# define a Batch: pass in the DataSet and specify batch_size and the batching rule:
# sequential (Sequential), random (Random), or batches of similar lengths (Bucket)
train_sampler = BucketSampler(batch_size=batch_size, seq_len_field_name='seq_len')
train_batch = DataSetIter(batch_size=batch_size, dataset=data, sampler=train_sampler)

start_time = time.time()
print("-"*5+"start training"+"-"*5)
for i in range(epoch):
loss_list = []
for batch_x, batch_y in train_batch:
optimizer.zero_grad()
output = model(batch_x['words'])
loss = lossfunc(output['pred'], batch_y['target'])
loss.backward()
optimizer.step()
loss_list.append(loss.item())

# if verbose is 0, calling the Tester object's test() prints nothing and returns the evaluation results; if 1, it prints the validation results and returns them
# after calling the Tester object's test(), call its _format_eval_results(res) function to format the validation results
tester_tmp = Tester(devdata, model, metrics=AccuracyMetric(), verbose=0)
res=tester_tmp.test()

print('Epoch {:d} Avg Loss: {:.2f}'.format(i, sum(loss_list) / len(loss_list)),end=" ")
print(tester_tmp._format_eval_results(res),end=" ")
print('{:d}ms'.format(round((time.time()-start_time)*1000)))
loss_list.clear()

train(10, train_data, dev_data)
# quick test with a Tester
tester = Tester(test_data, model, metrics=AccuracyMetric())
tester.test()

The output of this code is as follows::

-----start training-----
Epoch 0 Avg Loss: 1.09 AccuracyMetric: acc=0.480787 58989ms
Epoch 1 Avg Loss: 1.00 AccuracyMetric: acc=0.500469 118348ms
Epoch 2 Avg Loss: 0.93 AccuracyMetric: acc=0.536082 176220ms
Epoch 3 Avg Loss: 0.87 AccuracyMetric: acc=0.556701 236032ms
Epoch 4 Avg Loss: 0.78 AccuracyMetric: acc=0.562324 294351ms
Epoch 5 Avg Loss: 0.69 AccuracyMetric: acc=0.58388 353673ms
Epoch 6 Avg Loss: 0.60 AccuracyMetric: acc=0.574508 412106ms
Epoch 7 Avg Loss: 0.51 AccuracyMetric: acc=0.589503 471097ms
Epoch 8 Avg Loss: 0.44 AccuracyMetric: acc=0.581068 529174ms
Epoch 9 Avg Loss: 0.39 AccuracyMetric: acc=0.572634 586216ms
[tester]
AccuracyMetric: acc=0.527426



+ 237
- 0
docs/source/tutorials/tutorial_5_loss_optimizer.rst View File

@@ -0,0 +1,237 @@
==============================================================================
Building a Text Classifier by Hand I: Quick Training and Testing with Trainer and Tester
==============================================================================

We walk through the same task as in :doc:`/user/quickstart` in more detail: given a piece of review text, predict whether its sentiment is positive (label=0)
or negative (label=1), training and testing quickly with :class:`~fastNLP.Trainer` and :class:`~fastNLP.Tester`.

Loading and processing data
-----------------------------

Loading data
We can use the :class:`~fastNLP.io.SST2Pipe` class from fastNLP's :mod:`fastNLP.io` module to read and preprocess the SST-2 dataset easily. The
:meth:`~fastNLP.io.SST2Pipe.process_from_file` method of an :class:`~fastNLP.io.SST2Pipe` object preprocesses the SST-2 data it reads. Its paths parameter is the directory containing the files to process; if paths is None (the default), the dataset is downloaded automatically.
The method returns a :class:`~fastNLP.io.DataBundle` containing the SST-2 train, test, and dev sets together with the source-side and target-side vocabularies. The train, test, and dev sets contain four :mod:`~fastNLP.core.field` entries:

* raw_words: the original source sentence
* target: the label value
* words: raw_words after indexing
* seq_len: the sentence length

The code to read in the data is as follows:

.. code-block:: python

from fastNLP.io import SST2Pipe
pipe = SST2Pipe()
databundle = pipe.process_from_file()
vocab = databundle.get_vocab('words')
print(databundle)
print(databundle.get_dataset('train')[0])
print(databundle.get_vocab('words'))


The output is::

In total 3 datasets:
test has 1821 instances.
train has 67349 instances.
dev has 872 instances.
In total 2 vocabs:
words has 16293 entries.
target has 2 entries.

+-------------------------------------------+--------+--------------------------------------+---------+
| raw_words | target | words | seq_len |
+-------------------------------------------+--------+--------------------------------------+---------+
| hide new secretions from the parental ... | 1 | [4111, 98, 12010, 38, 2, 6844, 9042] | 7 |
+-------------------------------------------+--------+--------------------------------------+---------+
Vocabulary(['hide', 'new', 'secretions', 'from', 'the']...)

Besides Pipe classes for reading in data, fastNLP also provides Loader classes for reading and downloading data. See :doc:`/tutorials/tutorial_4_load_dataset` for the Pipes and Loaders of the various datasets and their usage.
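
As a sketch of the Loader side (both the use of :class:`~fastNLP.io.SST2Loader` here and its download-when-paths-is-None behaviour, mirroring the Pipe above, are assumptions, not verified API details):

.. code-block:: python

    from fastNLP.io import SST2Loader

    loader = SST2Loader()
    # assumption: with no path given, load() downloads the data first;
    # the returned DataBundle holds the raw, not-yet-indexed fields
    data_bundle = loader.load()
    print(data_bundle)
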
Dataset splitting
Since the SST-2 test set is unlabeled, we split off a portion of the training set to serve as the test set. The following code shows how to use :meth:`~fastNLP.DataSet.split`.

.. code-block:: python

train_data = databundle.get_dataset('train')
train_data, test_data = train_data.split(0.015)
dev_data = databundle.get_dataset('dev')
print(len(train_data),len(dev_data),len(test_data))

The output is::

    66339 872 1010

The dataset's :meth:`~fastNLP.DataSet.set_input` and :meth:`~fastNLP.DataSet.set_target` functions
During preprocessing, the :meth:`~fastNLP.io.SST2Pipe.process_from_file` method of the :class:`~fastNLP.io.SST2Pipe` class also sets the
`words` and `seq_len` :mod:`~fastNLP.core.field` entries of the train, test, and dev sets as input, and sets the `target` :mod:`~fastNLP.core.field`
as target. We can inspect these settings for each :mod:`~fastNLP.core.field` with the :meth:`~fastNLP.core.Dataset.print_field_meta` method of the
:class:`~fastNLP.core.Dataset` class, as follows:

.. code-block:: python

train_data.print_field_meta()

The output is::

+-------------+-----------+--------+-------+---------+
| field_names | raw_words | target | words | seq_len |
+-------------+-----------+--------+-------+---------+
| is_input | False | False | True | True |
| is_target | False | True | False | False |
| ignore_type | | False | False | False |
| pad_value | | 0 | 0 | 0 |
+-------------+-----------+--------+-------+---------+

Here is_input and is_target indicate whether a field is an input and a target, respectively. When ignore_type is true, fastNLP does not apply automatic
padding when :class:`~fastNLP.DataSetIter` draws batches; pad_value is the value used to pad the corresponding :mod:`~fastNLP.core.field`. Both are only
meaningful when the :mod:`~fastNLP.core.field` is set as input or target.

A :mod:`~fastNLP.core.field` with is_input true appears in the batch_x drawn by :class:`~fastNLP.DataSetIter` iteration, while one with is_target true
appears in the batch_y drawn by :class:`~fastNLP.DataSetIter` iteration.
For a detailed analysis see :doc:`Writing a custom training loop with DataSetIter </tutorials/tutorial_6_datasetiter>`.

Training with built-in models
------------------------------
Model definition and initialization
We can import fastNLP's built-in text classification model :class:`~fastNLP.models.CNNText` to define the model, as follows:

.. code-block:: python

from fastNLP.models import CNNText

# embedding dimension
EMBED_DIM = 100

# the first argument to CNNText is a tuple used to define the model's embedding
# custom values for kernel_nums, kernel_sizes, padding, and dropout can also be passed in
model_cnn = CNNText((len(vocab),EMBED_DIM), num_classes=2, dropout=0.1)

See :doc:`</tutorials/tutorial_8_modules_models>` for how to quickly build your own models with fastNLP.

Evaluation metric
Training a model requires an evaluation metric; here we use accuracy.

* The ``pred`` argument corresponds to a key in the dict returned by the model's forward method.
* The ``target`` argument corresponds to the name of the :mod:`~fastNLP.core.field` in the :class:`~fastNLP.DataSet` that serves as the label.

Here we use :class:`~fastNLP.Const` to help with the naming; if the return values of your own model's forward method or
the :mod:`~fastNLP.core.field` names in your dataset differ from this example, set ``pred`` and ``target`` to the values that match your code. The code is as follows:

.. code-block:: python

from fastNLP import AccuracyMetric
from fastNLP import Const
# in this example, metrics=AccuracyMetric() is equivalent to the line below
metrics=AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET)

Loss function
Training a model also requires a loss function. fastNLP provides four losses that can be imported and used directly:

* :class:`~fastNLP.CrossEntropyLoss`: wraps torch.nn.functional.cross_entropy() and returns the cross-entropy loss (applicable to multi-class classification)
* :class:`~fastNLP.BCELoss`: wraps torch.nn.functional.binary_cross_entropy() and returns the binary cross-entropy
* :class:`~fastNLP.L1Loss`: wraps torch.nn.functional.l1_loss() and returns the L1 loss
* :class:`~fastNLP.NLLLoss`: wraps torch.nn.functional.nll_loss() and returns the negative log-likelihood loss (see the sketch after this list)
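
As a sketch of one of the other wrappers: :class:`~fastNLP.NLLLoss` takes the same pred/target naming arguments, and, like torch.nn.functional.nll_loss, it expects the model to output log-probabilities (e.g. via torch.nn.functional.log_softmax).

.. code-block:: python

    from fastNLP import NLLLoss
    # same naming convention as the CrossEntropyLoss example below; only valid
    # if the model's forward returns log-probabilities under Const.OUTPUT
    loss_nll = NLLLoss(pred=Const.OUTPUT, target=Const.TARGET)
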
Below we use the cross-entropy loss commonly used in classification problems. Note its **initialization arguments**:

* The ``pred`` argument corresponds to a key in the dict returned by the model's forward method.
* The ``target`` argument corresponds to the name of the :mod:`~fastNLP.core.field` in the :class:`~fastNLP.DataSet` that serves as the label.

Here we use :class:`~fastNLP.Const` to help with the naming; if the return values of your own model's forward method or
the :mod:`~fastNLP.core.field` names in your dataset differ from this example, set ``pred`` and ``target`` to the values that match your code.

.. code-block:: python

from fastNLP import CrossEntropyLoss
# in this example, loss = CrossEntropyLoss() is equivalent to the line below
loss = CrossEntropyLoss(pred=Const.OUTPUT, target=Const.TARGET)

Besides the ready-made loss functions in fastNLP, you can also build your own loss function with fastNLP's LossFunc class, as follows:

.. code-block:: python

# this builds a loss-function object whose loss is computed by func: from the model's return values or
# the target=True fields of the DataSet, the entry named `pred` (Const.OUTPUT) is passed to func's
# `input` argument, and the entry named `target` (Const.TARGET) is passed to func's `target` argument
# below we build a cross-entropy loss ourselves; it behaves the same as using fastNLP's cross-entropy loss directly
import torch
from fastNLP import LossFunc
func = torch.nn.functional.cross_entropy
loss_func = LossFunc(func, input=Const.OUTPUT, target=Const.TARGET)

Optimizer
Define the optimizer used when running the model. You can use an optimizer from torch.optim directly and pass it in when instantiating the :class:`~fastNLP.Trainer` class.

.. code-block:: python

import torch.optim as optim

# define an optimizer with torch.optim
optimizer=optim.RMSprop(model_cnn.parameters(), lr=0.01, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False)
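
fastNLP also ships wrapped optimizers such as :class:`~fastNLP.SGD` and :class:`~fastNLP.Adam`; a minimal sketch of an equivalent Adam setup (model_params is the wrapper's counterpart of the parameters that torch.optim takes positionally):

.. code-block:: python

    from fastNLP import Adam

    # the fastNLP wrapper around torch.optim.Adam
    optimizer_2 = Adam(lr=4e-3, betas=(0.9, 0.999), eps=1e-08, weight_decay=0,
                       model_params=model_cnn.parameters())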

Quick training
Now we train the model defined above with :class:`~fastNLP.Trainer`.
Besides training with :class:`~fastNLP.Trainer`, you can also write your own training loop with :class:`~fastNLP.DataSetIter`; see :doc:`/tutorials/tutorial_6_datasetiter` for details.

.. code-block:: python

from fastNLP import Trainer
# number of training epochs and batch size
N_EPOCHS = 10
BATCH_SIZE = 16

# if no optimizer argument is passed when defining the trainer, the model defaults to torch.optim.Adam with learning rate lr=4e-3
# only loss is used as the loss function here; feel free to try other loss functions (such as the custom loss_func defined earlier)
trainer = Trainer(model=model_cnn, train_data=train_data, dev_data=dev_data, loss=loss, metrics=metrics,
optimizer=optimizer,n_epochs=N_EPOCHS, batch_size=BATCH_SIZE)
trainer.train()

The training output is as follows::

input fields after batch(if batch size is 2):
words: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 16])
seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2])
target fields after batch(if batch size is 2):
target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2])

training epochs started 2019-09-17-14-29-00

Evaluate data in 0.11 seconds!
Evaluation on dev at Epoch 1/10. Step:4147/41470:
AccuracyMetric: acc=0.762615

...

Evaluate data in 0.2 seconds!
Evaluation on dev at Epoch 10/10. Step:41470/41470:
AccuracyMetric: acc=0.769495

In Epoch:2/Step:8294, got best dev performance:
AccuracyMetric: acc=0.800459
Reloaded the best model.

Quick testing
As the counterpart of :class:`~fastNLP.Trainer`, fastNLP also provides :class:`~fastNLP.Tester` for quick testing. It is used as follows.

.. code-block:: python

from fastNLP import Tester

tester = Tester(test_data, model_cnn, metrics=AccuracyMetric())
tester.test()

The test output is as follows::

Evaluate data in 0.19 seconds!
[tester]
AccuracyMetric: acc=0.889109

+ 413
- 0
docs/source/tutorials/tutorial_6_datasetiter.rst View File

@@ -0,0 +1,413 @@
==============================================================================
Building a Text Classifier by Hand II: a Custom Training Loop with DataSetIter
==============================================================================

We walk through the same task as in :doc:`/user/quickstart` in more detail: given a piece of review text, predict whether its sentiment is positive (label=0)
or negative (label=1), writing our own training loop with the :class:`~fastNLP.DataSetIter` class.
Everything before the first look at DataSetIter is identical to :doc:`/tutorials/tutorial_5_loss_optimizer`; skip ahead if you have already read it.


Loading and preprocessing data
--------------------------------

Loading data
We can use the :class:`~fastNLP.io.SST2Pipe` class from fastNLP's :mod:`fastNLP.io` module to read and preprocess the SST-2 dataset easily. The
:meth:`~fastNLP.io.SST2Pipe.process_from_file` method of an :class:`~fastNLP.io.SST2Pipe` object preprocesses the SST-2 data it reads. Its paths parameter is the directory containing the files to process; if paths is None (the default), the dataset is downloaded automatically.
The method returns a :class:`~fastNLP.io.DataBundle` containing the SST-2 train, test, and dev sets together with the source-side and target-side vocabularies. The train, test, and dev sets contain four :mod:`~fastNLP.core.field` entries:

* raw_words: the original source sentence
* target: the label value
* words: raw_words after indexing
* seq_len: the sentence length

The code to read in the data is as follows:

.. code-block:: python

from fastNLP.io import SST2Pipe
pipe = SST2Pipe()
databundle = pipe.process_from_file()
vocab = databundle.vocabs['words']
print(databundle)
print(databundle.datasets['train'][0])
print(databundle.vocabs['words'])


The output is::

In total 3 datasets:
test has 1821 instances.
train has 67349 instances.
dev has 872 instances.
In total 2 vocabs:
words has 16293 entries.
target has 2 entries.

+-------------------------------------------+--------+--------------------------------------+---------+
| raw_words | target | words | seq_len |
+-------------------------------------------+--------+--------------------------------------+---------+
| hide new secretions from the parental ... | 1 | [4111, 98, 12010, 38, 2, 6844, 9042] | 7 |
+-------------------------------------------+--------+--------------------------------------+---------+
Vocabulary(['hide', 'new', 'secretions', 'from', 'the']...)

Besides Pipe classes for reading in data, fastNLP also provides Loader classes for reading and downloading data. See :doc:`/tutorials/tutorial_4_load_dataset` for the Pipes and Loaders of the various datasets and their usage.
Dataset splitting
Since the SST-2 test set is unlabeled, we split off a portion of the training set to serve as the test set. The following code shows how to use :meth:`~fastNLP.DataSet.split`.

.. code-block:: python

train_data = databundle.get_dataset('train')
train_data, test_data = train_data.split(0.015)
dev_data = databundle.get_dataset('dev')
print(len(train_data),len(dev_data),len(test_data))

The output is::

    66339 872 1010

The dataset's :meth:`~fastNLP.DataSet.set_input` and :meth:`~fastNLP.DataSet.set_target` functions
During preprocessing, the :meth:`~fastNLP.io.SST2Pipe.process_from_file` method of the :class:`~fastNLP.io.SST2Pipe` class also sets the
`words` and `seq_len` :mod:`~fastNLP.core.field` entries of the train, test, and dev sets as input, and sets the `target` :mod:`~fastNLP.core.field` as target.
We can inspect these settings for each :mod:`~fastNLP.core.field` with the :meth:`~fastNLP.core.Dataset.print_field_meta` method of the
:class:`~fastNLP.core.Dataset` class, as follows:

.. code-block:: python

train_data.print_field_meta()

The output is::

+-------------+-----------+--------+-------+---------+
| field_names | raw_words | target | words | seq_len |
+-------------+-----------+--------+-------+---------+
| is_input | False | False | True | True |
| is_target | False | True | False | False |
| ignore_type | | False | False | False |
| pad_value | | 0 | 0 | 0 |
+-------------+-----------+--------+-------+---------+

Here is_input and is_target indicate whether a field is an input and a target, respectively. When ignore_type is true, fastNLP does not apply automatic
padding when :class:`~fastNLP.DataSetIter` draws batches; pad_value is the value used to pad the corresponding :mod:`~fastNLP.core.field`. Both are only
meaningful when the :mod:`~fastNLP.core.field` is set as input or target.

A :mod:`~fastNLP.core.field` with is_input true appears in the batch_x drawn by :class:`~fastNLP.DataSetIter` iteration,
while one with is_target true appears in the batch_y drawn by :class:`~fastNLP.DataSetIter` iteration.
A detailed analysis follows in the DataSetIter walkthrough below.


Evaluation metric
Training a model requires an evaluation metric; here we use accuracy.

* The ``pred`` argument corresponds to a key in the dict returned by the model's forward method.
* The ``target`` argument corresponds to the name of the :mod:`~fastNLP.core.field` in the :class:`~fastNLP.DataSet` that serves as the label.

Here we use :class:`~fastNLP.Const` to help with the naming; if the return values of your own model's forward method or
the :mod:`~fastNLP.core.field` names in your dataset differ from this example, set ``pred`` and ``target`` to the values that match your code. The code is as follows:

.. code-block:: python

from fastNLP import AccuracyMetric
from fastNLP import Const
# in this example, metrics=AccuracyMetric() is equivalent to the line below
metrics=AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET)


A first look at DataSetIter
----------------------------

DataSetIter
fastNLP's :class:`~fastNLP.DataSetIter` class defines a batch and implements several batch-related features. Its initialization parameters are:

* dataset: a :class:`~fastNLP.DataSet` object, the dataset
* batch_size: the size of the batches to draw
* sampler: the :class:`~fastNLP.Sampler` to use; if None, :class:`~fastNLP.RandomSampler` is used (Default: None)
* as_numpy: if True, output batches as `numpy.array`, otherwise as `torch.Tensor` (Default: False)
* prefetch: if True, use multiple processes to prefetch the next batch (Default: False)

sampler
The samplers implemented in fastNLP are:

* :class:`~fastNLP.BucketSampler`: randomly draws elements of similar length [init parameters: num_buckets, the number of buckets; batch_size, the batch size; seq_len_field_name, the name of the :mod:`~fastNLP.core.field` in the dataset holding the sequence length]
* SequentialSampler: draws elements in order [no init parameters] (see the sketch after this list)
* RandomSampler: draws elements at random [no init parameters]
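
Swapping the sampling strategy only means passing a different sampler; a minimal sketch with SequentialSampler, reusing the dev_data split from above:

.. code-block:: python

    from fastNLP import SequentialSampler
    from fastNLP import DataSetIter

    # iterate the dev set in its original order instead of bucketing by length
    seq_batch = DataSetIter(dataset=dev_data, batch_size=8, sampler=SequentialSampler())
    for batch_x, batch_y in seq_batch:
        print(batch_x['words'].shape, batch_y['target'].shape)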

Padder
In fastNLP, padding is bound to a :mod:`~fastNLP.core.field`: different :mod:`~fastNLP.core.field` entries can be padded in different ways; in English tasks, for example,
words and characters usually need different padding. fastNLP handles this through subclasses of :class:`~fastNLP.Padder`.
By default every field uses :class:`~fastNLP.AutoPadder`, which is sufficient in most cases.
If :class:`~fastNLP.AutoPadder` or :class:`~fastNLP.EngChar2DPadder` does not meet your needs,
you can also write your own :class:`~fastNLP.Padder` (see the sketch below).
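
A sketch of attaching the built-in :class:`~fastNLP.EngChar2DPadder`; the ``chars`` field and the dataset variable here are hypothetical, assuming a field whose entries are lists of character-index lists (one per word) that need 2D padding:

.. code-block:: python

    from fastNLP import EngChar2DPadder

    # assumption: dataset has a 'chars' field with one character-index list per word
    padder = EngChar2DPadder(pad_val=0)
    dataset.set_padder('chars', padder)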

Automatic padding with DataSetIter
The following code shows simple usage of DataSetIter:

.. code-block:: python

from fastNLP import BucketSampler
from fastNLP import DataSetIter

tmp_data = dev_data[:10]
# define a Batch: pass in the DataSet and specify batch_size and the batching rule:
# sequential (Sequential), random (Random), or batches of similar lengths (Bucket)
sampler = BucketSampler(batch_size=2, seq_len_field_name='seq_len')
batch = DataSetIter(batch_size=2, dataset=tmp_data, sampler=sampler)
for batch_x, batch_y in batch:
print("batch_x: ",batch_x)
print("batch_y: ", batch_y)

The output is as follows::

batch_x: {'words': tensor([[ 4, 278, 686, 18, 7],
[15619, 3205, 5, 1676, 0]]), 'seq_len': tensor([5, 4])}
batch_y: {'target': tensor([1, 1])}
batch_x: {'words': tensor([[ 44, 753, 328, 181, 10, 15622, 16, 71, 8905, 9,
1218, 7, 0, 0, 0, 0, 0, 0, 0, 0],
[ 880, 97, 8, 1027, 12, 8068, 11, 13624, 8, 15620,
4, 674, 663, 15, 4, 1155, 241, 640, 418, 7]]), 'seq_len': tensor([12, 20])}
batch_y: {'target': tensor([1, 0])}
batch_x: {'words': tensor([[ 1046, 11114, 16, 105, 5, 4, 177, 1825, 1705, 3,
2, 18, 11, 4, 1019, 433, 144, 32, 246, 309,
7, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0],
[ 13, 831, 7747, 175, 3, 46, 6, 84, 5753, 15,
2178, 15, 62, 56, 407, 85, 1010, 4974, 26, 17,
13786, 3, 534, 3688, 15624, 38, 376, 8, 15625, 8,
1324, 4399, 7]]), 'seq_len': tensor([21, 33])}
batch_y: {'target': tensor([0, 1])}
batch_x: {'words': tensor([[ 14, 10, 438, 31, 78, 3, 78, 438, 7],
[ 14, 10, 4, 312, 5, 155, 1419, 610, 7]]), 'seq_len': tensor([9, 9])}
batch_y: {'target': tensor([1, 0])}
batch_x: {'words': tensor([[ 24, 96, 27, 45, 8, 337, 37, 240, 8, 2134,
2, 18, 10, 15623, 1422, 6, 60, 5, 388, 7],
[ 2, 156, 3, 4427, 3, 240, 3, 740, 5, 1137,
40, 42, 2428, 737, 2, 649, 10, 15621, 2286, 7]]), 'seq_len': tensor([20, 20])}
batch_y: {'target': tensor([0, 0])}

We can see that the :mod:`~fastNLP.core.field` entries set as input all appear in batch_x, while those set as target appear in batch_y. Also, for two pieces of data in the same batch_x, the shorter one is automatically padded to the length of the longer one; the default padding value is 0.

Changing the padding value of a Dataset
The default padding value can be changed with the :meth:`~fastNLP.core.Dataset.set_pad_val` method, as follows:

.. code-block:: python

tmp_data.set_pad_val('words',-1)
batch = DataSetIter(batch_size=2, dataset=tmp_data, sampler=sampler)
for batch_x, batch_y in batch:
print("batch_x: ",batch_x)
print("batch_y: ", batch_y)

The output is as follows::

batch_x: {'words': tensor([[15619, 3205, 5, 1676, -1],
[ 4, 278, 686, 18, 7]]), 'seq_len': tensor([4, 5])}
batch_y: {'target': tensor([1, 1])}
batch_x: {'words': tensor([[ 1046, 11114, 16, 105, 5, 4, 177, 1825, 1705, 3,
2, 18, 11, 4, 1019, 433, 144, 32, 246, 309,
7, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1],
[ 13, 831, 7747, 175, 3, 46, 6, 84, 5753, 15,
2178, 15, 62, 56, 407, 85, 1010, 4974, 26, 17,
13786, 3, 534, 3688, 15624, 38, 376, 8, 15625, 8,
1324, 4399, 7]]), 'seq_len': tensor([21, 33])}
batch_y: {'target': tensor([0, 1])}
batch_x: {'words': tensor([[ 14, 10, 4, 312, 5, 155, 1419, 610, 7],
[ 14, 10, 438, 31, 78, 3, 78, 438, 7]]), 'seq_len': tensor([9, 9])}
batch_y: {'target': tensor([0, 1])}
batch_x: {'words': tensor([[ 2, 156, 3, 4427, 3, 240, 3, 740, 5, 1137,
40, 42, 2428, 737, 2, 649, 10, 15621, 2286, 7],
[ 24, 96, 27, 45, 8, 337, 37, 240, 8, 2134,
2, 18, 10, 15623, 1422, 6, 60, 5, 388, 7]]), 'seq_len': tensor([20, 20])}
batch_y: {'target': tensor([0, 0])}
batch_x: {'words': tensor([[ 44, 753, 328, 181, 10, 15622, 16, 71, 8905, 9,
1218, 7, -1, -1, -1, -1, -1, -1, -1, -1],
[ 880, 97, 8, 1027, 12, 8068, 11, 13624, 8, 15620,
4, 674, 663, 15, 4, 1155, 241, 640, 418, 7]]), 'seq_len': tensor([12, 20])}
batch_y: {'target': tensor([1, 0])}

We can see that -1 is now used for padding.

Customized padding for a Dataset
If we want customized padding for particular :mod:`~fastNLP.core.field` entries, we can build our own Padder class and install it with the :meth:`~fastNLP.core.Dataset.set_padder` function. Below we demonstrate this by building a padder that pads data to a fixed length:

.. code-block:: python

from fastNLP.core.field import Padder
import numpy as np
class FixLengthPadder(Padder):
def __init__(self, pad_val=0, length=None):
super().__init__(pad_val=pad_val)
self.length = length
assert self.length is not None, "Creating FixLengthPadder with no specific length!"
def __call__(self, contents, field_name, field_ele_dtype, dim):
# compute the maximum length in the current contents
max_len = max(map(len, contents))
# raise an error if the maximum length in contents exceeds the padder's specified length
assert max_len <= self.length, "Fixed padder length smaller than actual length! with length {}".format(max_len)
array = np.full((len(contents), self.length), self.pad_val, dtype=field_ele_dtype)
for i, content_i in enumerate(contents):
array[i, :len(content_i)] = content_i
return array

# set the fixed length of FixLengthPadder to 40
tmp_padder = FixLengthPadder(pad_val=0,length=40)
# use the dataset's set_padder function to set the padder for the words field
tmp_data.set_padder('words',tmp_padder)
batch = DataSetIter(batch_size=2, dataset=tmp_data, sampler=sampler)
for batch_x, batch_y in batch:
print("batch_x: ",batch_x)
print("batch_y: ", batch_y)

The output is as follows::

batch_x: {'words': tensor([[ 4, 278, 686, 18, 7, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[15619, 3205, 5, 1676, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([5, 4])}
batch_y: {'target': tensor([1, 1])}
batch_x: {'words': tensor([[ 2, 156, 3, 4427, 3, 240, 3, 740, 5, 1137,
40, 42, 2428, 737, 2, 649, 10, 15621, 2286, 7,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[ 24, 96, 27, 45, 8, 337, 37, 240, 8, 2134,
2, 18, 10, 15623, 1422, 6, 60, 5, 388, 7,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([20, 20])}
batch_y: {'target': tensor([0, 0])}
batch_x: {'words': tensor([[ 13, 831, 7747, 175, 3, 46, 6, 84, 5753, 15,
2178, 15, 62, 56, 407, 85, 1010, 4974, 26, 17,
13786, 3, 534, 3688, 15624, 38, 376, 8, 15625, 8,
1324, 4399, 7, 0, 0, 0, 0, 0, 0, 0],
[ 1046, 11114, 16, 105, 5, 4, 177, 1825, 1705, 3,
2, 18, 11, 4, 1019, 433, 144, 32, 246, 309,
7, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([33, 21])}
batch_y: {'target': tensor([1, 0])}
batch_x: {'words': tensor([[ 14, 10, 4, 312, 5, 155, 1419, 610, 7, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0],
[ 14, 10, 438, 31, 78, 3, 78, 438, 7, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0]]), 'seq_len': tensor([9, 9])}
batch_y: {'target': tensor([0, 1])}
batch_x: {'words': tensor([[ 44, 753, 328, 181, 10, 15622, 16, 71, 8905, 9,
1218, 7, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[ 880, 97, 8, 1027, 12, 8068, 11, 13624, 8, 15620,
4, 674, 663, 15, 4, 1155, 241, 640, 418, 7,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([12, 20])}
batch_y: {'target': tensor([1, 0])}

Here all the `words` entries have been padded into lists of length 40.


Writing your own training loop with DataSetIter
------------------------------------------------
If you want to write your own training loop, PyTorch style, you can use the code below as a reference.
It uses :class:`~fastNLP.DataSetIter`, provided by fastNLP, to obtain mini-batches of training data,
with :class:`~fastNLP.BucketSampler` passed as an argument to :class:`~fastNLP.DataSetIter` to choose the sampling strategy.

The following code uses BucketSampler as the input to initialize :class:`~fastNLP.DataSetIter` and writes its own training loop with :class:`~fastNLP.DataSetIter`.

.. code-block:: python

from fastNLP import BucketSampler
from fastNLP import DataSetIter
from fastNLP.models import CNNText
from fastNLP import Tester
import torch
import time

embed_dim = 100
model = CNNText((len(vocab),embed_dim), num_classes=2, dropout=0.1)

def train(epoch, data, devdata):
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
lossfunc = torch.nn.CrossEntropyLoss()
batch_size = 32

# define a Batch: pass in the DataSet and specify batch_size and the batching rule:
# sequential (Sequential), random (Random), or batches of similar lengths (Bucket)
train_sampler = BucketSampler(batch_size=batch_size, seq_len_field_name='seq_len')
train_batch = DataSetIter(batch_size=batch_size, dataset=data, sampler=train_sampler)

start_time = time.time()
print("-"*5+"start training"+"-"*5)
for i in range(epoch):
loss_list = []
for batch_x, batch_y in train_batch:
optimizer.zero_grad()
output = model(batch_x['words'])
loss = lossfunc(output['pred'], batch_y['target'])
loss.backward()
optimizer.step()
loss_list.append(loss.item())

# if verbose is 0, calling the Tester object's test() prints nothing and returns the evaluation results; if 1, it prints the validation results and returns them
# after calling the Tester object's test(), call its _format_eval_results(res) function to format the validation results
tester_tmp = Tester(devdata, model, metrics=AccuracyMetric(), verbose=0)
res=tester_tmp.test()

print('Epoch {:d} Avg Loss: {:.2f}'.format(i, sum(loss_list) / len(loss_list)),end=" ")
print(tester_tmp._format_eval_results(res),end=" ")
print('{:d}ms'.format(round((time.time()-start_time)*1000)))
loss_list.clear()

train(10, train_data, dev_data)
# quick test with a Tester
tester = Tester(test_data, model, metrics=AccuracyMetric())
tester.test()

The output of this code is as follows::

-----start training-----

Evaluate data in 0.2 seconds!
Epoch 0 Avg Loss: 0.33 AccuracyMetric: acc=0.825688 48895ms

Evaluate data in 0.19 seconds!
Epoch 1 Avg Loss: 0.16 AccuracyMetric: acc=0.829128 102081ms

Evaluate data in 0.18 seconds!
Epoch 2 Avg Loss: 0.10 AccuracyMetric: acc=0.822248 152853ms

Evaluate data in 0.17 seconds!
Epoch 3 Avg Loss: 0.08 AccuracyMetric: acc=0.821101 200184ms

Evaluate data in 0.17 seconds!
Epoch 4 Avg Loss: 0.06 AccuracyMetric: acc=0.827982 253097ms

Evaluate data in 0.27 seconds!
Epoch 5 Avg Loss: 0.05 AccuracyMetric: acc=0.806193 303883ms

Evaluate data in 0.26 seconds!
Epoch 6 Avg Loss: 0.04 AccuracyMetric: acc=0.803899 392315ms

Evaluate data in 0.36 seconds!
Epoch 7 Avg Loss: 0.04 AccuracyMetric: acc=0.802752 527211ms

Evaluate data in 0.15 seconds!
Epoch 8 Avg Loss: 0.03 AccuracyMetric: acc=0.809633 661533ms

Evaluate data in 0.31 seconds!
Epoch 9 Avg Loss: 0.03 AccuracyMetric: acc=0.797018 812232ms

Evaluate data in 0.25 seconds!
[tester]
AccuracyMetric: acc=0.917822



+ 0
- 114
docs/source/tutorials/tutorial_6_seq_labeling.rst View File

@@ -1,114 +0,0 @@
==========================================
Building a Sequence Labeling Model Quickly
==========================================

This section shows how to implement sequence labeling with fastNLP. Using fastNLP's components you can complete sequence labeling tasks quickly and conveniently, with excellent results.
Before reading this tutorial we hope you are already familiar with the basics of fastNLP, including its core data structures, data preprocessing, and embeddings, and have a solid grasp of the earlier tutorials.
We will process the English CoNLL-03 dataset and show the whole training process for a named entity recognition task.

Loading data
===================================
fastNLP can load data of many types conveniently. For common datasets, loading methods are already implemented, including for CoNLL-03.
Dataloaders take DataSetLoader as their base class and can be adapted and applied to load other datasets.

.. code-block:: python

class Conll2003DataLoader(DataSetLoader):
def __init__(self, task:str='ner', encoding_type:str='bioes'):
assert task in ('ner', 'pos', 'chunk')
index = {'ner':3, 'pos':1, 'chunk':2}[task]
# ConllLoader is a built-in fastNLP class
self._loader = ConllLoader(headers=['raw_words', 'target'], indexes=[0, index])
self._tag_converters = None
if task in ('ner', 'chunk'):
# iob2 and iob2bioes unify and standardize the tags
self._tag_converters = [iob2]
if encoding_type == 'bioes':
self._tag_converters.append(iob2bioes)

def load(self, path: str):
dataset = self._loader.load(path)
def convert_tag_schema(tags):
for converter in self._tag_converters:
tags = converter(tags)
return tags
if self._tag_converters:
# apply the convert_tag_schema function with apply_field; anonymous functions are also supported
dataset.apply_field(convert_tag_schema, field_name=Const.TARGET, new_field_name=Const.TARGET)
return dataset

The output data format looks like::

    {'raw_words': ['on', 'Friday', ':'] type=list,
    'target': ['O', 'O', 'O'] type=list},


Data processing
----------------------------
We process the data further. The data and vocabularies are wrapped in a :class:`~fastNLP.DataBundle` object; data is an instance of DataBundle.
The data fed to the model includes char embeddings as well as word embeddings. In this part we build the vocabularies,
using fastNLP's Vocabulary class.

.. code-block:: python

word_vocab = Vocabulary(min_freq=2)
word_vocab.from_dataset(data.datasets['train'], field_name=Const.INPUT)
word_vocab.index_dataset(*data.datasets.values(),field_name=Const.INPUT, new_field_name=Const.INPUT)

Internally, the processed data object contains:

dataset
vocabs
dataset holds the train and test data, stored as DataSet objects.
vocabs holds the vocabularies for words, raw-words, and target.

Building the model
--------------------------------
We use a CNN-BiLSTM-CRF model for this task. For network construction, fastNLP model definitions inherit from PyTorch's :class:`nn.Module` class,
so you can define networks the PyTorch way. Pay attention to naming: fastNLP's standard names live in the :class:`~fastNLP.Const` class.

Training the model
First instantiate the model and load the required char embedding and word embedding. See the tutorials for loading embeddings,
or consult :mod:`~fastNLP.modules.encoder.embedding` for the loading methods of the embeddings you need.
fastNLP wraps the model training process in the :class:`~fastNLP.Trainer` class.
Adjust the trainer's parameters for the task at hand. Typically, a trainer instance needs: the training dataset, the model, an optimizer, a loss function, evaluation metrics, and parameters such as the number of training epochs and the batch size.

.. code-block:: python

# instantiate the model
model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type)
# define the optimizer
optimizer = Adam(model.parameters(), lr=0.005)
# define the evaluation metric
Metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type)
# instantiate the trainer
trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, dev_data=data.datasets['test'], batch_size=10, metrics=Metrics,callbacks=callbacks, n_epochs=100)
# start training
trainer.train()
The best parameter configuration is saved during training.
The training results are as follows::

Evaluation on DataSet test:
SpanFPreRecMetric: f=0.727661, pre=0.732293, rec=0.723088
Evaluation at Epoch 1/100. Step:1405/140500. SpanFPreRecMetric: f=0.727661, pre=0.732293, rec=0.723088
Evaluation on DataSet test:
SpanFPreRecMetric: f=0.784307, pre=0.779371, rec=0.789306
Evaluation at Epoch 2/100. Step:2810/140500. SpanFPreRecMetric: f=0.784307, pre=0.779371, rec=0.789306
Evaluation on DataSet test:
SpanFPreRecMetric: f=0.810068, pre=0.811003, rec=0.809136
Evaluation at Epoch 3/100. Step:4215/140500. SpanFPreRecMetric: f=0.810068, pre=0.811003, rec=0.809136
Evaluation on DataSet test:
SpanFPreRecMetric: f=0.829592, pre=0.84153, rec=0.817989
Evaluation at Epoch 4/100. Step:5620/140500. SpanFPreRecMetric: f=0.829592, pre=0.84153, rec=0.817989
Evaluation on DataSet test:
SpanFPreRecMetric: f=0.828789, pre=0.837096, rec=0.820644
Evaluation at Epoch 5/100. Step:7025/140500. SpanFPreRecMetric: f=0.828789, pre=0.837096, rec=0.820644



docs/source/tutorials/tutorial_8_metrics.rst → docs/source/tutorials/tutorial_7_metrics.rst View File


docs/source/tutorials/tutorial_7_modules_models.rst → docs/source/tutorials/tutorial_8_modules_models.rst View File

@@ -6,7 +6,6 @@
Below we introduce concrete ways to build models, in three sections.


------------------------------------
Using the models in fastNLP.models
------------------------------------

@@ -81,8 +80,9 @@ FastNLP 中内置的 models 如下表所示,您可以点击具体的名称查
:class:`~fastNLP.models.STNLICls`, a Star-Transformer model for natural language inference (NLI)
:class:`~fastNLP.models.STSeqCls`, a Star-Transformer model for classification tasks
:class:`~fastNLP.models.BiaffineParser`, an implementation of the Biaffine dependency parser network
:class:`~fastNLP.models.BiLSTMCRF`, sequence labeling with BiLSTM and CRF


------------------------------
Building models with torch.nn
------------------------------

@@ -137,7 +137,7 @@ FastNLP 完全支持使用 pyTorch 编写的模型,但与 pyTorch 中编写模
(dropout): Dropout(p=0.5)
)

----------------------------
Building models with modules
----------------------------


+ 0
- 67
docs/source/tutorials/tutorial_9_callback.rst View File

@@ -1,67 +0,0 @@
===================================================
Customizing Your Training Loop with Callbacks
===================================================

During training we often want to use tricks to improve model performance (such as adjusting the learning rate) or to print information about the run.
fastNLP provides the Callback class for this: it inserts code into the Trainer to carry out custom operations.

We walk through the same task as in :doc:`/user/quickstart` in more detail.
Given a piece of review text, predict whether its sentiment is positive (label=1), negative (label=0), or neutral (label=2), training and testing quickly with :class:`~fastNLP.Trainer` and :class:`~fastNLP.Tester`.
For data processing and the choice of loss and optimizer, see the other tutorials; here we only add learning-rate decay during training.

----------------------------
Building and using Callbacks
----------------------------

Creating a Callback
We can subclass fastNLP's :class:`~fastNLP.Callback` class to define our own callback.
Here we implement a callback that decays the learning rate linearly.

.. code-block:: python

import fastNLP

class LRDecay(fastNLP.Callback):
def __init__(self):
super(LRDecay, self).__init__()
self.base_lrs = []
self.delta = []

def on_train_begin(self):
# initialization; called only once, when training begins
self.base_lrs = [pg['lr'] for pg in self.optimizer.param_groups]
self.delta = [float(lr) / self.n_epochs for lr in self.base_lrs]

def on_epoch_end(self):
# update the learning rate at the end of each epoch
ep = self.epoch
lrs = [lr - d * ep for lr, d in zip(self.base_lrs, self.delta)]
self.change_lr(lrs)

def change_lr(self, lrs):
for pg, lr in zip(self.optimizer.param_groups, lrs):
pg['lr'] = lr

Here, all methods of :class:`~fastNLP.Callback` whose names start with ``on_`` are called at specific moments during :class:`~fastNLP.Trainer` training.
For example, on_train_begin() is called when training starts, and on_epoch_end() is called at the end of every epoch.
For the full list of such methods, see the documentation of :class:`~fastNLP.Callback`.

Also, for convenience, attributes of the :class:`~fastNLP.Trainer` can be accessed inside a :class:`~fastNLP.Callback`, such as optimizer, epoch, and step, which are the optimizer, the current epoch number, and the current total step count, respectively.
For the full list of accessible attributes, see the documentation of :class:`~fastNLP.Callback`.
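
A sketch of how those attributes read inside another callback (the epoch-timing idea itself is purely illustrative):

.. code-block:: python

    import time
    import fastNLP

    class EpochTimer(fastNLP.Callback):
        def on_epoch_begin(self):
            # record when the epoch starts
            self._epoch_start = time.time()

        def on_epoch_end(self):
            # self.epoch, self.n_epochs and self.step are provided by the Trainer
            print('epoch {}/{} took {:.1f}s (total steps: {})'.format(
                self.epoch, self.n_epochs, time.time() - self._epoch_start, self.step))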

Using a Callback
Once a :class:`~fastNLP.Callback` is defined, pass it to the Trainer's ``callbacks`` argument to use it during actual training.

.. code-block:: python

"""
data preprocessing, model definition, and so on
"""

trainer = fastNLP.Trainer(
model=model, train_data=train_data, dev_data=dev_data,
optimizer=optimizer, metrics=metrics,
batch_size=10, n_epochs=100,
callbacks=[LRDecay()])

trainer.train()

+ 187
- 0
docs/source/tutorials/tutorial_9_seq_labeling.rst View File

@@ -0,0 +1,187 @@
==========================================
Building a Sequence Labeling Model Quickly
==========================================

This section shows how to implement sequence labeling with fastNLP. Using fastNLP's components you can complete sequence labeling tasks quickly and conveniently, with excellent results.
Before reading this tutorial we hope you are already familiar with the basics of fastNLP, especially loading data and building models; this small task will make you further at home with fastNLP.

Named entity recognition (NER)
------------------------------------------

Named entity recognition extracts entities with special meaning or strong referential value from text, typically including person names, place names, organization names, and times.
For example, in

    我来自复旦大学。 (I am from Fudan University.)

"复旦大学" (Fudan University) is an organization name: named entity recognition should recognize that these four characters form one unit belonging to the organization category. In practice this is
cast as a sequence labeling problem.

For the sentence "我来自复旦大学", the prediction target is [O, O, O, B-ORG, I-ORG, I-ORG, I-ORG], where O means out (not part of an entity), B-ORG marks the beginning (Begin) of an ORG
(short for organization) entity, and I-ORG marks the inside (Inside) of an ORG entity.
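
A tiny sketch to make the character-to-tag alignment concrete (plain Python, nothing fastNLP-specific):

.. code-block:: python

    sentence = '我来自复旦大学'
    tags = ['O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG']

    # pair each character with its tag: 复/B-ORG starts the organization
    # name and the following characters continue it with I-ORG
    for char, tag in zip(sentence, tags):
        print(char, tag)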

In this tutorial we will use fastNLP to write a model that can perform this task.

Loading data
------------------------------------------
Data loading in fastNLP is handled by the two base classes Loader and Pipe working together; see :doc:`Loading and processing data with Loader and Pipe </tutorials/tutorial_4_load_dataset>`
to learn how to use fastNLP's data loading functions. Below we demonstrate sequence labeling in fastNLP with the Weibo NER task.

.. code-block:: python

from fastNLP.io import WeiboNERPipe
data_bundle = WeiboNERPipe().process_from_file()
print(data_bundle.get_dataset('train')[:2])

The printed data is as follows::

+-------------------------------------------------+------------------------------------------+------------------------------------------+---------+
| raw_chars | target | chars | seq_len |
+-------------------------------------------------+------------------------------------------+------------------------------------------+---------+
| ['一', '节', '课', '的', '时', '间', '真', '... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, ... | [8, 211, 775, 3, 49, 245, 89, 26, 101... | 16 |
| ['回', '复', '支', '持', ',', '赞', '成', '... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | [116, 480, 127, 109, 2, 446, 134, 2, ... | 59 |
+-------------------------------------------------+------------------------------------------+------------------------------------------+---------+


Building the model
--------------------------------

First choose the embedding type to use. For more on embeddings see :doc:`Converting text to vectors with the Embedding module </tutorials/tutorial_3_embedding>`.
Here we use Chinese character embeddings pretrained with word2vec.

.. code-block:: python

from fastNLP.embeddings import StaticEmbedding

embed = StaticEmbedding(vocab=data_bundle.get_vocab('chars'), model_dir_or_name='cn-char-fastnlp-100d')

With the embedding chosen, we can use fastNLP's built-in :class:`fastNLP.models.BiLSTMCRF` as the model.

.. code-block:: python

from fastNLP.models import BiLSTMCRF

data_bundle.rename_field('chars', 'words') # the forward function of BiLSTMCRF accepts words, not chars, so this column must be renamed
model = BiLSTMCRF(embed=embed, num_classes=len(data_bundle.get_vocab('target')), num_layers=1, hidden_size=200, dropout=0.5,
target_vocab=data_bundle.get_vocab('target'))

Next we choose the metric used to evaluate the model and the optimizer used for optimization.

.. code-block:: python

from fastNLP import SpanFPreRecMetric
from torch.optim import Adam
from fastNLP import LossInForward

metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'))
optimizer = Adam(model.parameters(), lr=1e-2)
loss = LossInForward()

Training with the Trainer

.. code-block:: python

from fastNLP import Trainer
import torch

device= 0 if torch.cuda.is_available() else 'cpu'
trainer = Trainer(data_bundle.get_dataset('train'), model, loss=loss, optimizer=optimizer,
dev_data=data_bundle.get_dataset('dev'), metrics=metric, device=device)
trainer.train()

The training output is::

input fields after batch(if batch size is 2):
target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26])
seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2])
words: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26])
target fields after batch(if batch size is 2):
target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26])
seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2])

training epochs started 2019-09-25-10-43-09
Evaluate data in 0.62 seconds!
Evaluation on dev at Epoch 1/10. Step:43/430:
SpanFPreRecMetric: f=0.070352, pre=0.100962, rec=0.053985

...

Evaluate data in 0.61 seconds!
Evaluation on dev at Epoch 10/10. Step:430/430:
SpanFPreRecMetric: f=0.51223, pre=0.581699, rec=0.457584


In Epoch:7/Step:301, got best dev performance:
SpanFPreRecMetric: f=0.515528, pre=0.65098, rec=0.426735
Reloaded the best model.

After training finishes, the model's performance on the test set can be measured with :class:`~fastNLP.Tester`.

.. code-block:: python

from fastNLP import Tester

tester = Tester(data_bundle.get_dataset('test'), model, metrics=metric)
tester.test()

The output is::

[tester]
SpanFPreRecMetric: f=0.482399, pre=0.530086, rec=0.442584


Sequence labeling with the stronger BERT
-----------------------------------------

To use BERT for a task in fastNLP, you only need to switch to :class:`fastNLP.embeddings.BertEmbedding`.

.. code-block:: python

from fastNLP.io import WeiboNERPipe
data_bundle = WeiboNERPipe().process_from_file()
data_bundle.rename_field('chars', 'words')

from fastNLP.embeddings import BertEmbedding
embed = BertEmbedding(vocab=data_bundle.get_vocab('words'), model_dir_or_name='cn')
model = BiLSTMCRF(embed=embed, num_classes=len(data_bundle.get_vocab('target')), num_layers=1, hidden_size=200, dropout=0.5,
target_vocab=data_bundle.get_vocab('target'))

from fastNLP import SpanFPreRecMetric
from torch.optim import Adam
from fastNLP import LossInForward
metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'))
optimizer = Adam(model.parameters(), lr=2e-5)
loss = LossInForward()

from fastNLP import Trainer
import torch
device= 0 if torch.cuda.is_available() else 'cpu'
trainer = Trainer(data_bundle.get_dataset('train'), model, loss=loss, optimizer=optimizer, batch_size=12,
dev_data=data_bundle.get_dataset('dev'), metrics=metric, device=device)
trainer.train()

from fastNLP import Tester
tester = Tester(data_bundle.get_dataset('test'), model, metrics=metric)
tester.test()

The output is::

training epochs started 2019-09-25-07-15-43
Evaluate data in 2.02 seconds!
Evaluation on dev at Epoch 1/10. Step:113/1130:
SpanFPreRecMetric: f=0.0, pre=0.0, rec=0.0

...

Evaluate data in 2.17 seconds!
Evaluation on dev at Epoch 10/10. Step:1130/1130:
SpanFPreRecMetric: f=0.647332, pre=0.589852, rec=0.717224

In Epoch:6/Step:678, got best dev performance:
SpanFPreRecMetric: f=0.669963, pre=0.645238, rec=0.696658
Reloaded the best model.

Evaluate data in 1.82 seconds!
[tester]
SpanFPreRecMetric: f=0.641774, pre=0.626424, rec=0.657895

We can see that using BERT brings a clear improvement: the score rises from 48.2 to 64.1.

+ 0
- 3
docs/source/user/docs_in_code.rst View File

@@ -1,3 +0,0 @@
=============================
Writing Documentation in Code
=============================

+ 2
- 1
docs/source/user/installation.rst View File

@@ -13,8 +13,9 @@ fastNLP 依赖如下包::
nltk>=3.4.1
requests
spacy
prettytable>=0.7.2

The installation of torch may depend on your operating system and CUDA version; see the `PyTorch website <https://pytorch.org/get-started/locally/>`_.
The installation of torch may depend on your operating system and CUDA version; see the `PyTorch website <https://pytorch.org/>`_.
Once the dependencies are installed, you can complete the installation by running the following command in the command line

.. code:: shell


+ 5
- 115
docs/source/user/quickstart.rst View File

@@ -2,123 +2,13 @@
Quick Start
===============

This is a simple classification task (data source: `kaggle <https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews>`_).
Given a piece of text, predict which of the labels 0-4 it has.
If you want to solve a class of NLP problems quickly with fastNLP, you can consult one of the following tutorials.

We can use the :class:`~fastNLP.io.CSVLoader` class from fastNLP's io module to read our data from a csv file easily.
.. toctree::
:maxdepth: 1

.. code-block:: python
/quickstart/文本分类

from fastNLP.io import CSVLoader

loader = CSVLoader(headers=('raw_sentence', 'label'), sep='\t')
dataset = loader.load("./sample_data/tutorial_sample_dataset.csv")
These tutorials briefly introduce the fastNLP workflow; for more tutorials and analysis see :doc:`/user/tutorials`

At this point `dataset[0]` has the following value; each entry in the dataset contains two fields, ``raw_sentence`` and ``label``, both of type ``str``::

{'raw_sentence': A series of escapades demonstrating the adage that what is good for the
goose is also good for the gander , some of which occasionally amuses but none of which
amounts to much of a story . type=str,
'label': 1 type=str}


We use the :meth:`~fastNLP.DataSet.apply` method of the :class:`~fastNLP.DataSet` class to lowercase the letters in ``raw_sentence`` and tokenize the sentences.

.. code-block:: python

dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')
dataset.apply(lambda x: x['sentence'].split(), new_field_name='words', is_input=True)

Then we use the :class:`~fastNLP.Vocabulary` class to collect the words occurring in the data and convert the word sequences into number sequences usable for training.

.. code-block:: python

from fastNLP import Vocabulary
vocab = Vocabulary(min_freq=2).from_dataset(dataset, field_name='words')
vocab.index_dataset(dataset, field_name='words',new_field_name='words')

Meanwhile, we also convert the labels from str to numbers and set them as the gold answer ``target`` for training.

.. code-block:: python

dataset.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)

Now we can import fastNLP's built-in text classification model :class:`~fastNLP.models.CNNText`.


.. code-block:: python

from fastNLP.models import CNNText
model = CNNText((len(vocab),50), num_classes=5, dropout=0.1)

The network structure of :class:`~fastNLP.models.CNNText` is as follows::

CNNText(
(embed): Embedding(
177, 50
(dropout): Dropout(p=0.0)
)
(conv_pool): ConvMaxpool(
(convs): ModuleList(
(0): Conv1d(50, 3, kernel_size=(3,), stride=(1,), padding=(2,))
(1): Conv1d(50, 4, kernel_size=(4,), stride=(1,), padding=(2,))
(2): Conv1d(50, 5, kernel_size=(5,), stride=(1,), padding=(2,))
)
)
(dropout): Dropout(p=0.1)
(fc): Linear(in_features=12, out_features=5, bias=True)
)

Next we use the :meth:`~fastNLP.DataSet.split` method of the :class:`~fastNLP.DataSet` class to divide the dataset into ``train_data`` and ``dev_data``,
used for training and validation respectively.

.. code-block:: python

train_data, dev_data = dataset.split(0.2)

Finally we train with fastNLP's :class:`~fastNLP.Trainer`, passing in the model ``model``, the training set ``train_data``,
the validation set ``dev_data``, the loss function ``loss``, and the metric ``metrics``.
The loss function used is fastNLP's :class:`~fastNLP.CrossEntropyLoss`;
the metric used is fastNLP's :class:`~fastNLP.AccuracyMetric` accuracy metric.

.. code-block:: python

from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric

trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data,
loss=CrossEntropyLoss(), metrics=AccuracyMetric())
trainer.train()

The training output is as follows::

input fields after batch(if batch size is 2):
words: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26])
target fields after batch(if batch size is 2):
target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2])

training epochs started 2019-05-09-10-59-39
Evaluation at Epoch 1/10. Step:2/20. AccuracyMetric: acc=0.333333

Evaluation at Epoch 2/10. Step:4/20. AccuracyMetric: acc=0.533333

Evaluation at Epoch 3/10. Step:6/20. AccuracyMetric: acc=0.533333

Evaluation at Epoch 4/10. Step:8/20. AccuracyMetric: acc=0.533333

Evaluation at Epoch 5/10. Step:10/20. AccuracyMetric: acc=0.6

Evaluation at Epoch 6/10. Step:12/20. AccuracyMetric: acc=0.8

Evaluation at Epoch 7/10. Step:14/20. AccuracyMetric: acc=0.8

Evaluation at Epoch 8/10. Step:16/20. AccuracyMetric: acc=0.733333

Evaluation at Epoch 9/10. Step:18/20. AccuracyMetric: acc=0.733333

Evaluation at Epoch 10/10. Step:20/20. AccuracyMetric: acc=0.733333


In Epoch:6/Step:12, got best dev performance:AccuracyMetric: acc=0.8
Reloaded the best model.

This tutorial only briefly introduces the fastNLP workflow; for more tutorials and analysis see :doc:`/user/tutorials`

+ 14
- 9
docs/source/user/tutorials.rst View File

@@ -1,4 +1,4 @@
========================
========================
fastNLP Usage Tutorials
========================

@@ -8,13 +8,18 @@ fastNLP 详细使用教程
:maxdepth: 1

Preprocessing text with DataSet </tutorials/tutorial_1_data_preprocess>
Loading datasets with DataSetLoader </tutorials/tutorial_2_load_dataset>
Converting between text and indices with Vocabulary </tutorials/tutorial_2_vocabulary>
Converting text to vectors with the Embedding module </tutorials/tutorial_3_embedding>
Building a text classifier I: quick training and testing with Trainer and Tester </tutorials/tutorial_4_loss_optimizer>
Building a text classifier II: a custom training loop with DataSetIter </tutorials/tutorial_5_datasetiter>
Building a sequence labeling model quickly </tutorials/tutorial_6_seq_labeling>
Building custom models quickly with Modules and Models </tutorials/tutorial_7_modules_models>
Evaluating your model quickly with Metrics </tutorials/tutorial_8_metrics>
Customizing your training loop with Callbacks </tutorials/tutorial_9_callback>
Using fitlog with fastNLP for research </tutorials/tutorial_10_fitlog>
Loading and processing datasets with Loader and Pipe </tutorials/tutorial_4_load_dataset>
Building a text classifier I: quick training and testing with Trainer and Tester </tutorials/tutorial_5_loss_optimizer>
Building a text classifier II: a custom training loop with DataSetIter </tutorials/tutorial_6_datasetiter>
Evaluating your model quickly with Metrics </tutorials/tutorial_7_metrics>
Building custom models quickly with Modules and Models </tutorials/tutorial_8_modules_models>
Building a sequence labeling model quickly </tutorials/tutorial_9_seq_labeling>
Customizing your training loop with Callbacks </tutorials/tutorial_10_callback>

.. toctree::
:maxdepth: 1

Further reading 1: the many uses of BertEmbedding </tutorials/extend_1_bert_embedding>
Further reading 2: using fitlog with fastNLP for research </tutorials/extend_2_fitlog>

+ 29
- 13
fastNLP/__init__.py View File

@@ -2,22 +2,22 @@
fastNLP consists of the submodules :mod:`~fastNLP.core`, :mod:`~fastNLP.io`, :mod:`~fastNLP.embeddings`, :mod:`~fastNLP.modules`,
:mod:`~fastNLP.models`, and so on; you can consult the documentation of each module.

- :mod:`~fastNLP.core` is the core module of fastNLP, including components such as DataSet, Trainer, and Tester. See the documentation :doc:`/fastNLP.core`
- :mod:`~fastNLP.io` implements input and output, including reading datasets and saving/loading models. See the documentation :doc:`/fastNLP.io`
- :mod:`~fastNLP.embeddings` provides the various embeddings needed to build complex network models. See the documentation :doc:`/fastNLP.embeddings`
- :mod:`~fastNLP.modules` contains many components for building neural network models, helping users assemble the networks they need quickly. See the documentation :doc:`/fastNLP.modules`
- :mod:`~fastNLP.models` contains complete network models implemented with fastNLP, including common models such as :class:`~fastNLP.models.CNNText` and :class:`~fastNLP.models.SeqLabeling`. See the documentation :doc:`fastNLP.models`
- :mod:`~fastNLP.core` is the core module of fastNLP, including components such as DataSet, Trainer, and Tester. See the documentation :mod:`fastNLP.core`
- :mod:`~fastNLP.io` implements input and output, including reading datasets and saving/loading models. See the documentation :mod:`fastNLP.io`
- :mod:`~fastNLP.embeddings` provides the various embeddings needed to build complex network models. See the documentation :mod:`fastNLP.embeddings`
- :mod:`~fastNLP.modules` contains many components for building neural network models, helping users assemble the networks they need quickly. See the documentation :mod:`fastNLP.modules`
- :mod:`~fastNLP.models` contains complete network models implemented with fastNLP, including common models such as :class:`~fastNLP.models.CNNText` and :class:`~fastNLP.models.SeqLabeling`. See the documentation :mod:`fastNLP.models`

The most commonly used fastNLP components can be imported directly from the fastNLP package; their documentation is as follows:
"""
__all__ = [
"Instance",
"FieldArray",
"DataSetIter",
"BatchIter",
"TorchLoaderIter",
"Vocabulary",
"DataSet",
"Const",
@@ -28,9 +28,16 @@ __all__ = [
"Callback",
"GradientClipCallback",
"EarlyStopCallback",
"TensorboardCallback",
"FitlogCallback",
"EvaluateCallback",
"LRScheduler",
"ControlC",
"LRFinder",
"TensorboardCallback",
"WarmupCallback",
'SaveModelCallback',
"CallbackException",
"EarlyStopError",
"Padder",
"AutoPadder",
@@ -43,6 +50,7 @@ __all__ = [
"Optimizer",
"SGD",
"Adam",
"AdamW",
"Sampler",
"SequentialSampler",
@@ -51,16 +59,24 @@ __all__ = [
"LossFunc",
"CrossEntropyLoss",
"L1Loss", "BCELoss",
"L1Loss",
"BCELoss",
"NLLLoss",
"LossInForward",
"cache_results"
"cache_results",
'logger'
]
__version__ = '0.4.5'

from .core import *
import sys

from . import embeddings
from . import models
from . import modules
from . import embeddings
from .io import data_loader
from .core import *
from .doc_utils import doc_process
from .io import loader, pipe

doc_process(sys.modules[__name__])

+ 72
- 9
fastNLP/core/__init__.py View File

@@ -8,23 +8,86 @@ core 模块里实现了 fastNLP 的核心框架,常用的功能都可以从 fa
# import DataSetIter from batch, a submodule of core
from fastNLP.core.batch import DataSetIter

For the most common functionality you only need to look at :doc:`fastNLP`. To learn what each submodule does in detail, you can find each submodule's documentation below.
For the most common functionality you only need to look at :mod:`fastNLP`. To learn what each submodule does in detail, you can find each submodule's documentation below.

.. todo::
    describe the division of labor among core's submodules (probably not very necessary)
"""
__all__ = [
"DataSet",
"Instance",
"FieldArray",
"Padder",
"AutoPadder",
"EngChar2DPadder",
"Vocabulary",
"DataSetIter",
"BatchIter",
"TorchLoaderIter",
"Const",
"Tester",
"Trainer",
"cache_results",
"seq_len_to_mask",
"get_seq_len",
"logger",
"Callback",
"GradientClipCallback",
"EarlyStopCallback",
"FitlogCallback",
"EvaluateCallback",
"LRScheduler",
"ControlC",
"LRFinder",
"TensorboardCallback",
"WarmupCallback",
'SaveModelCallback',
"CallbackException",
"EarlyStopError",
"LossFunc",
"CrossEntropyLoss",
"L1Loss",
"BCELoss",
"NLLLoss",
"LossInForward",
"CMRC2018Loss",
"AccuracyMetric",
"SpanFPreRecMetric",
"CMRC2018Metric",

"Optimizer",
"SGD",
"Adam",
"AdamW",
"SequentialSampler",
"BucketSampler",
"RandomSampler",
"Sampler",
]

from ._logger import logger
from .batch import DataSetIter, BatchIter, TorchLoaderIter
from .callback import Callback, GradientClipCallback, EarlyStopCallback, TensorboardCallback, LRScheduler, ControlC
from .callback import Callback, GradientClipCallback, EarlyStopCallback, FitlogCallback, EvaluateCallback, \
LRScheduler, ControlC, LRFinder, TensorboardCallback, WarmupCallback, SaveModelCallback, CallbackException, \
EarlyStopError
from .const import Const
from .dataset import DataSet
from .field import FieldArray, Padder, AutoPadder, EngChar2DPadder
from .instance import Instance
from .losses import LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, LossInForward
from .metrics import AccuracyMetric, SpanFPreRecMetric, ExtractiveQAMetric
from .optimizer import Optimizer, SGD, Adam
from .losses import LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, LossInForward, CMRC2018Loss
from .metrics import AccuracyMetric, SpanFPreRecMetric, CMRC2018Metric
from .optimizer import Optimizer, SGD, Adam, AdamW
from .sampler import SequentialSampler, BucketSampler, RandomSampler, Sampler
from .tester import Tester
from .trainer import Trainer
from .utils import cache_results, seq_len_to_mask
from .utils import cache_results, seq_len_to_mask, get_seq_len
from .vocabulary import Vocabulary

+ 155
- 0
fastNLP/core/_logger.py View File

@@ -0,0 +1,155 @@
"""undocumented"""

__all__ = [
'logger',
]

import logging
import logging.config
import os
import sys
import warnings

ROOT_NAME = 'fastNLP'

try:
import fitlog
except ImportError:
fitlog = None
try:
from tqdm.auto import tqdm
except ImportError:
tqdm = None

if tqdm is not None:
class TqdmLoggingHandler(logging.Handler):
def __init__(self, level=logging.INFO):
super().__init__(level)
def emit(self, record):
try:
msg = self.format(record)
tqdm.write(msg)
self.flush()
except (KeyboardInterrupt, SystemExit):
raise
except:
self.handleError(record)
else:
class TqdmLoggingHandler(logging.StreamHandler):
def __init__(self, level=logging.INFO):
super().__init__(sys.stdout)
self.setLevel(level)


def _get_level(level):
if isinstance(level, int):
pass
else:
level = level.lower()
level = {'info': logging.INFO, 'debug': logging.DEBUG,
'warn': logging.WARN, 'warning': logging.WARN,
'error': logging.ERROR}[level]
return level


def _add_file_handler(logger, path, level='INFO'):
for h in logger.handlers:
if isinstance(h, logging.FileHandler):
if os.path.abspath(path) == h.baseFilename:
# file path already added
return
# File Handler
if os.path.exists(path):
assert os.path.isfile(path)
warnings.warn('log already exists in {}'.format(path))
dirname = os.path.abspath(os.path.dirname(path))
os.makedirs(dirname, exist_ok=True)
file_handler = logging.FileHandler(path, mode='a')
file_handler.setLevel(_get_level(level))
file_formatter = logging.Formatter(fmt='%(asctime)s - %(module)s - [%(levelname)s] - %(message)s',
datefmt='%Y/%m/%d %H:%M:%S')
file_handler.setFormatter(file_formatter)
logger.addHandler(file_handler)


def _set_stdout_handler(logger, stdout='tqdm', level='INFO'):
level = _get_level(level)
if stdout not in ['none', 'plain', 'tqdm']:
raise ValueError('stdout must be one of {}'.format(['none', 'plain', 'tqdm']))
# make sure to initialize logger only once
stream_handler = None
for i, h in enumerate(logger.handlers):
if isinstance(h, (logging.StreamHandler, TqdmLoggingHandler)):
stream_handler = h
break
if stream_handler is not None:
logger.removeHandler(stream_handler)
# Stream Handler
if stdout == 'plain':
stream_handler = logging.StreamHandler(sys.stdout)
elif stdout == 'tqdm':
stream_handler = TqdmLoggingHandler(level)
else:
stream_handler = None
if stream_handler is not None:
stream_formatter = logging.Formatter('%(message)s')
stream_handler.setLevel(level)
stream_handler.setFormatter(stream_formatter)
logger.addHandler(stream_handler)


class FastNLPLogger(logging.getLoggerClass()):
def __init__(self, name):
super().__init__(name)
def add_file(self, path='./log.txt', level='INFO'):
"""add log output file and level"""
_add_file_handler(self, path, level)
def set_stdout(self, stdout='tqdm', level='INFO'):
"""set stdout format and level"""
_set_stdout_handler(self, stdout, level)


logging.setLoggerClass(FastNLPLogger)


# print(logging.getLoggerClass())
# print(logging.getLogger())

def _init_logger(path=None, stdout='tqdm', level='INFO'):
"""initialize logger"""
level = _get_level(level)
# logger = logging.getLogger()
logger = logging.getLogger(ROOT_NAME)
logger.propagate = False
logger.setLevel(level)
_set_stdout_handler(logger, stdout, level)
# File Handler
if path is not None:
_add_file_handler(logger, path, level)
return logger


def _get_logger(name=None, level='INFO'):
level = _get_level(level)
if name is None:
name = ROOT_NAME
assert isinstance(name, str)
if not name.startswith(ROOT_NAME):
name = '{}.{}'.format(ROOT_NAME, name)
logger = logging.getLogger(name)
logger.setLevel(level)
return logger


logger = _init_logger(path=None)
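Below, a minimal usage sketch of the module-level logger defined in this file (the package-root import follows the ``__all__`` export shown earlier in this diff)::

    from fastNLP import logger

    logger.set_stdout('tqdm', level='INFO')      # console output routed through tqdm.write
    logger.add_file('./log.txt', level='DEBUG')  # additionally append records to ./log.txt
    logger.info('training started')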

+ 26
- 7
fastNLP/core/_parallel_utils.py View File

@@ -1,10 +1,14 @@
"""undocumented"""

__all__ = []

import threading

import torch
from torch import nn
from torch.nn.parallel.parallel_apply import get_a_var

from torch.nn.parallel.scatter_gather import scatter_kwargs, gather
from torch.nn.parallel.replicate import replicate
from torch.nn.parallel.scatter_gather import scatter_kwargs, gather


def parallel_apply(modules, func_name, inputs, kwargs_tup=None, devices=None):
@@ -26,11 +30,11 @@ def parallel_apply(modules, func_name, inputs, kwargs_tup=None, devices=None):
assert len(modules) == len(devices)
else:
devices = [None] * len(modules)
lock = threading.Lock()
results = {}
grad_enabled = torch.is_grad_enabled()
def _worker(i, module, input, kwargs, device=None):
torch.set_grad_enabled(grad_enabled)
if device is None:
@@ -46,20 +50,20 @@ def parallel_apply(modules, func_name, inputs, kwargs_tup=None, devices=None):
except Exception as e:
with lock:
results[i] = e
if len(modules) > 1:
threads = [threading.Thread(target=_worker,
args=(i, module, input, kwargs, device))
for i, (module, input, kwargs, device) in
enumerate(zip(modules, inputs, kwargs_tup, devices))]
for thread in threads:
thread.start()
for thread in threads:
thread.join()
else:
_worker(0, modules[0], inputs[0], kwargs_tup[0], devices[0])
outputs = []
for i in range(len(inputs)):
output = results[i]
@@ -78,6 +82,7 @@ def _data_parallel_wrapper(func_name, device_ids, output_device):
:param output_device: the output_device of nn.DataParallel
:return:
"""
def wrapper(network, *inputs, **kwargs):
inputs, kwargs = scatter_kwargs(inputs, kwargs, device_ids, dim=0)
if len(device_ids) == 1:
@@ -85,4 +90,18 @@ def _data_parallel_wrapper(func_name, device_ids, output_device):
replicas = replicate(network, device_ids[:len(inputs)])
outputs = parallel_apply(replicas, func_name, inputs, kwargs, device_ids[:len(replicas)])
return gather(outputs, output_device)
return wrapper


def _model_contains_inner_module(model):
"""

:param nn.Module model: the model object; checks whether it contains model.module internally. Mostly used to check whether the model is nn.DataParallel or
nn.parallel.DistributedDataParallel. The innermost model's functions are mainly needed when matching forward arguments.
:return: bool
"""
if isinstance(model, nn.Module):
if isinstance(model, (nn.DataParallel, nn.parallel.DistributedDataParallel)):
return True
return False
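A short demonstration of the helper above, with a toy module (illustrative only; the private import path mirrors this file)::

    import torch.nn as nn
    from fastNLP.core._parallel_utils import _model_contains_inner_module

    model = nn.Linear(10, 2)
    wrapped = nn.DataParallel(model)
    _model_contains_inner_module(model)    # False: a plain nn.Module
    _model_contains_inner_module(wrapped)  # True: the real model sits at wrapped.module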

+ 124
- 55
fastNLP/core/batch.py View File

@@ -9,14 +9,15 @@ __all__ = [
]

import atexit
from numbers import Number

import numpy as np
import torch
import torch.utils.data
from numbers import Number

from .sampler import SequentialSampler
from ._logger import logger
from .dataset import DataSet
from .sampler import SequentialSampler

_python_is_exit = False

@@ -48,6 +49,11 @@ class DataSetGetter:
return len(self.dataset)

def collate_fn(self, batch: list):
"""

:param batch: [[idx1, x_dict1, y_dict1], [idx2, x_dict2, y_dict2], [xx, xx, xx]]
:return:
"""
# TODO: support defining collate_fn on the DataSet, because sometimes different fields need to be fused, e.g. the BERT scenario
batch_x = {n:[] for n in self.inputs.keys()}
batch_y = {n:[] for n in self.targets.keys()}
@@ -70,7 +76,7 @@ class DataSetGetter:
try:
data, flag = _to_tensor(data, f.dtype)
except TypeError as e:
print(f"Field {n} cannot be converted to torch.tensor.")
logger.error(f"Field {n} cannot be converted to torch.tensor.")
raise e
batch_dict[n] = data
return batch_dict
@@ -93,37 +99,68 @@ class DataSetGetter:

class SamplerAdapter(torch.utils.data.Sampler):
def __init__(self, sampler, dataset):
super().__init__(dataset)
self.sampler = sampler
self.dataset = dataset

def __len__(self):
return len(self.dataset)

def __iter__(self):
return iter(self.sampler(self.dataset))


class BatchIter:
def __init__(self):
self.dataiter = None
self.num_batches = None
def __init__(self, dataset, batch_size=1, sampler=None,
num_workers=0, pin_memory=False, drop_last=False,
timeout=0, worker_init_fn=None, collate_fn=None):
if not isinstance(sampler, torch.utils.data.Sampler):
self.sampler = SamplerAdapter(sampler=sampler or SequentialSampler(), dataset=dataset)
else:
self.sampler = sampler
if collate_fn is None:
# PyTorch <= 1.1 does not allow setting collate_fn=None
self.dataiter = torch.utils.data.DataLoader(
dataset=dataset, batch_size=batch_size, sampler=self.sampler,
num_workers=num_workers,
pin_memory=pin_memory, drop_last=drop_last,
timeout=timeout, worker_init_fn=worker_init_fn)
else:
self.dataiter = torch.utils.data.DataLoader(
dataset=dataset, batch_size=batch_size, sampler=self.sampler,
collate_fn=collate_fn, num_workers=num_workers,
pin_memory=pin_memory, drop_last=drop_last,
timeout=timeout, worker_init_fn=worker_init_fn)

# use the sampler's count as the ground truth, because with DistributedSampler each process does not consume all of the data
self.num_batches = self.get_num_batches(len(self.dataiter.sampler), batch_size, drop_last)
self.batch_size = batch_size
self.cur_batch_indices = None
self.batch_size = None

def init_iter(self):
pass

@staticmethod
def get_num_batches(num_samples, batch_size, drop_last):
"""
Compute the number of batches.

:param int num_samples:
:param int batch_size:
:param bool drop_last: if the last batch has fewer than batch_size samples, whether to drop it.
:return:
"""
num_batches = num_samples // batch_size
if not drop_last and (num_samples % batch_size > 0):
num_batches += 1
return num_batches
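A quick worked example of the count above (drop_last decides the fate of the final partial batch)::

    # 10 samples with batch_size 3: 10 // 3 = 3 full batches, 1 sample left over
    BatchIter.get_num_batches(10, 3, drop_last=False)  # -> 4
    BatchIter.get_num_batches(10, 3, drop_last=True)   # -> 3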

def __iter__(self):
self.init_iter()
for indices, batch_x, batch_y in self.dataiter:
self.cur_batch_indices = indices
yield batch_x, batch_y

def get_batch_indices(self):
"""
Get the indices of the batch that has just been yielded.

:return:
"""
return self.cur_batch_indices

def __len__(self):
@@ -136,8 +173,6 @@ class BatchIter:

class DataSetIter(BatchIter):
"""
Alias: :class:`fastNLP.DataSetIter` :class:`fastNLP.core.batch.DataSetIter`

DataSetIter fetches data from a `DataSet` in a given order, ``batch_size`` items at a time,
and assembles them into `x` and `y`::

@@ -146,60 +181,94 @@ class DataSetIter(BatchIter):
for batch_x, batch_y in batch:
# do stuff ...

:param dataset: a :class:`~fastNLP.DataSet` object, the dataset
:param int batch_size: size of each batch to fetch
:param sampler: the :class:`~fastNLP.Sampler` to use. If ``None``, :class:`~fastNLP.SequentialSampler` is used.

Default: ``None``
:param bool as_numpy: if ``True``, batches are output as numpy.array, otherwise as :class:`torch.Tensor`.

Default: ``False``
:param int num_workers: how many subprocesses are used to preprocess the data
:param bool pin_memory: whether to put the produced tensors in pinned memory; may speed things up.
:param bool drop_last: if the last batch has fewer than batch_size samples, drop it
:param timeout:
:param worker_init_fn: called at each worker's startup, with one argument: the worker's index.
"""
def __init__(self, dataset, batch_size=1, sampler=None, as_numpy=False,
num_workers=0, pin_memory=False, drop_last=False,
timeout=0, worker_init_fn=None):
super().__init__()
timeout=0, worker_init_fn=None, collate_fn=None):
"""
:param dataset: a :class:`~fastNLP.DataSet` object, the dataset
:param int batch_size: size of each batch to fetch
:param sampler: the :class:`~fastNLP.Sampler` to use. If ``None``, :class:`~fastNLP.SequentialSampler` is used.
Default: ``None``
:param bool as_numpy: if ``True``, batches are output as numpy.array, otherwise as :class:`torch.Tensor`.

Default: ``False``
:param int num_workers: how many subprocesses are used to preprocess the data
:param bool pin_memory: whether to put the produced tensors in pinned memory; may speed things up.
:param bool drop_last: if the last batch has fewer than batch_size samples, drop it
:param timeout: timeout value for producing one batch
:param worker_init_fn: called at each worker's startup, with one argument: the worker's index.
:param collate_fn: function used to assemble samples into a batch
"""
assert isinstance(dataset, DataSet)
sampler = SamplerAdapter(sampler=sampler or SequentialSampler(), dataset=dataset)
dataset = DataSetGetter(dataset, as_numpy)
collate_fn = dataset.collate_fn if hasattr(dataset, 'collate_fn') else None
self.dataiter = torch.utils.data.DataLoader(
collate_fn = dataset.collate_fn if collate_fn is None else collate_fn
super().__init__(
dataset=dataset, batch_size=batch_size, sampler=sampler,
collate_fn=collate_fn, num_workers=num_workers,
pin_memory=pin_memory, drop_last=drop_last,
timeout=timeout, worker_init_fn=worker_init_fn)
self.num_batches = self.get_num_batches(len(dataset), batch_size, drop_last)
self.batch_size = batch_size


class TorchLoaderIter(BatchIter):
def __init__(self, dataset):
super().__init__()
assert isinstance(dataset, torch.utils.data.DataLoader)
self.dataiter = dataset
self.num_batches = self.get_num_batches(len(dataset), dataset.batch_size, dataset.drop_last)
self.batch_size = dataset.batch_size
num_workers=num_workers, pin_memory=pin_memory,
drop_last=drop_last, timeout=timeout, worker_init_fn=worker_init_fn,
collate_fn=collate_fn
)

def __iter__(self):
self.init_iter()
for indices, batch_x, batch_y in self.dataiter:
self.cur_batch_indices = indices
yield batch_x, batch_y

class OnlineDataGetter:
# TODO
pass

class TorchLoaderIter(BatchIter):
"""
Similar to DataSetIter, but for pytorch DataSet objects.
Wrap a pytorch DataSet with TorchLoaderIter and then pass it to the Trainer.

class OnlineDataIter(BatchIter):
# TODO
def __init__(self, dataset, batch_size=1, buffer_size=10000, sampler=None, as_numpy=False,
"""
def __init__(self, dataset, batch_size=1, sampler=None,
num_workers=0, pin_memory=False, drop_last=False,
timeout=0, worker_init_fn=None, **kwargs):
super().__init__()
timeout=0, worker_init_fn=None, collate_fn=None):
"""

:param dataset: a :class:`~fastNLP.DataSet` object, the dataset
:param int batch_size: size of each batch to fetch
:param sampler: the :class:`~fastNLP.Sampler` to use. If ``None``, :class:`~fastNLP.SequentialSampler` is used.

Default: ``None``
:param int num_workers: how many subprocesses are used to preprocess the data
:param bool pin_memory: whether to put the produced tensors in pinned memory; may speed things up.
:param bool drop_last: if the last batch has fewer than batch_size samples, drop it
:param timeout: timeout value for producing one batch
:param worker_init_fn: called at each worker's startup, with one argument: the worker's index.
:param collate_fn: function used to assemble samples into a batch"""
assert len(dataset) > 0
ins = dataset[0]
assert len(ins) == 2 and \
isinstance(ins[0], dict) and \
isinstance(ins[1], dict), 'DataSet should return two dict, as X and Y'

super().__init__(
dataset=dataset, batch_size=batch_size, sampler=sampler,
num_workers=num_workers, pin_memory=pin_memory,
drop_last=drop_last, timeout=timeout, worker_init_fn=worker_init_fn,
collate_fn=collate_fn
)

def __iter__(self):
self.init_iter()
for batch_x, batch_y in self.dataiter:
self.cur_batch_indices = None
yield batch_x, batch_y
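A runnable sketch of the iteration pattern shared by both iterators, shown here with DataSetIter on a toy DataSet (field names are illustrative)::

    from fastNLP import DataSet, Instance, DataSetIter

    ds = DataSet()
    ds.append(Instance(words=[1, 2, 3], target=0))
    ds.append(Instance(words=[4, 5], target=1))
    ds.set_input('words')
    ds.set_target('target')
    for batch_x, batch_y in DataSetIter(ds, batch_size=2):
        print(batch_x['words'].shape, batch_y['target'])  # padded tensor, target tensor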


def _to_tensor(batch, field_dtype):
"""

:param batch: np.array()
:param field_dtype: the data type
:return: batch, flag. If the input data can be converted to a tensor, the returned batch is that tensor and flag is True; if the input data cannot be converted to a tensor,
the returned batch is the original data and flag is False
"""
try:
if field_dtype is not None and isinstance(field_dtype, type)\
and issubclass(field_dtype, Number) \


+ 333
- 120
fastNLP/core/callback.py View File

@@ -4,7 +4,7 @@ The callback module implements many callback classes of fastNLP, used to enhance :class:
Although the Trainer itself already integrates some functionality, it is still not enough to cover everything that may be needed during training,
such as negative sampling, learning rate decay, early stop, etc.
To solve this problem, fastNLP introduces the callback mechanism: :class:`~fastNLP.Callback` is a collection of functions that run at specific stages of the Trainer's training process.
For detailed documentation of :class:`~fastNLP.Trainer`, see :doc:`the trainer module<fastNLP.core.trainer>`
For detailed documentation of :class:`~fastNLP.Trainer`, see :mod:`the trainer module<fastNLP.core.trainer>`

We divide the :meth:`~fastNLP.Trainer.train` function into the following stages; at each stage the corresponding calls are triggered::

@@ -51,22 +51,28 @@ callback模块实现了 fastNLP 中的许多 callback 类,用于增强 :class:
"""
__all__ = [
"Callback",

"GradientClipCallback",
"EarlyStopCallback",
"TensorboardCallback",
"FitlogCallback",
"EvaluateCallback",
"LRScheduler",
"ControlC",
"LRFinder",
"TensorboardCallback",
"WarmupCallback",
"SaveModelCallback",
"CallbackException",
"EarlyStopError"
]

import os
import sys
from copy import deepcopy

import torch
from copy import deepcopy
import sys

from .utils import _save_model

try:
@@ -76,23 +82,27 @@ try:
except:
tensorboardX_flag = False

from ..io.model_io import ModelSaver, ModelLoader
from .dataset import DataSet
from .tester import Tester
from ._logger import logger
from .utils import _check_fp16

try:
import fitlog
except:
pass

try:
from apex import amp
except:
amp = None


class Callback(object):
"""
Alias: :class:`fastNLP.Callback` :class:`fastNLP.core.callback.Callback`

Callback is the class designed in fastNLP to enhance :class:`~fastNLP.Trainer`.
If a Callback is passed to the Trainer, the Trainer will call the Callback's functions at the corresponding stages;
the exact timing of the calls can be found in :doc:`the trainer module<fastNLP.core.trainer>`.
the exact timing of the calls can be found in :mod:`the trainer module<fastNLP.core.trainer>`.
This is the base class of Callback; every callback must inherit from this class

"""
@@ -100,7 +110,8 @@ class Callback(object):
def __init__(self):
super(Callback, self).__init__()
self._trainer = None # reassigned inside the Trainer
self._disabled = False

@property
def trainer(self):
"""
@@ -158,7 +169,19 @@ class Callback(object):
def batch_per_epoch(self):
"""每个epoch一共有多少个batch,只有在on_epoch_begin之后才能调用该属性。"""
return self._trainer.batch_per_epoch

@property
def is_master(self):
return self._trainer.is_master

@property
def disabled(self):
return self._disabled

@property
def logger(self):
return getattr(self._trainer, 'logger', logger)

def on_train_begin(self):
"""
Called before the training process begins.
@@ -281,6 +304,8 @@ def _transfer(func):
def wrapper(manager, *arg):
returns = []
for callback in manager.callbacks:
if callback.disabled:
continue
returns.append(getattr(callback, func.__name__)(*arg))
return returns
@@ -288,31 +313,39 @@ def _transfer(func):


class CallbackManager(Callback):
"""
Callback manager class for internal use
"""
def __init__(self, env, callbacks=None):
"""
Callback manager class for internal use

:param dict env: The key is the name of the Trainer attribute(str). The value is the attribute itself.
:param List[Callback] callbacks:
"""
super(CallbackManager, self).__init__()
# set attribute of trainer environment
self._env = env
self.callbacks = []
if callbacks is not None:
if isinstance(callbacks, list):
if all([isinstance(cb, Callback) for cb in callbacks]) is True:
self.callbacks.extend(callbacks)
else:
obj = [cb for cb in callbacks if not isinstance(cb, Callback)][0]  # pick the first non-Callback so the error reports its type
raise TypeError(f"Expect sub-classes of Callback. Got {type(obj)}")
if callbacks:
self.callbacks = self.prepare_callbacks(callbacks)

def prepare_callbacks(self, callbacks):
if not callbacks:
return []
if isinstance(callbacks, list):
if all([isinstance(cb, Callback) for cb in callbacks]) is True:
pass
else:
raise TypeError(f"Expect callbacks in CallbackManager(callbacks) to be list. Got {type(callbacks)}.")
for env_name, env_val in env.items():
for callback in self.callbacks:
obj = [cb for cb in callbacks if not isinstance(cb, Callback)][0]  # pick the first non-Callback so the error reports its type
raise TypeError(f"Expect sub-classes of Callback. Got {type(obj)}")
else:
raise TypeError(f"Expect callbacks in CallbackManager(callbacks) to be list. Got {type(callbacks)}.")

for env_name, env_val in self._env.items():
for callback in callbacks:
setattr(callback, '_' + env_name, env_val) # Callback.trainer
return callbacks

@_transfer
def on_train_begin(self):
pass
@@ -352,6 +385,10 @@ class CallbackManager(Callback):
@_transfer
def on_valid_end(self, eval_result, metric_key, optimizer, is_better_eval):
pass

@_transfer
def on_validation(self):
pass
@_transfer
def on_epoch_end(self):
@@ -366,28 +403,53 @@ class CallbackManager(Callback):
pass


class DistCallbackManager(CallbackManager):
def __init__(self, env, callbacks_all=None, callbacks_master=None):
super(DistCallbackManager, self).__init__(env)
assert 'trainer' in env
self._trainer = env['trainer']
self.callbacks_master = []
self.callbacks_all = []
self.add_callback(callbacks_all, master=False)
self.add_callback(callbacks_master, master=True)

def patch_callback(self, callbacks, disabled):
if not callbacks:
return
if not isinstance(callbacks, (list, tuple)):
callbacks = [callbacks]
for cb in callbacks:
cb._disabled = disabled

def add_callback(self, cb, master=False):
if master:
self.patch_callback(cb, not self.is_master)
self.callbacks_master += self.prepare_callbacks(cb)
else:
self.callbacks_all += self.prepare_callbacks(cb)
self.callbacks = self.callbacks_all + self.callbacks_master


class GradientClipCallback(Callback):
"""
Alias: :class:`fastNLP.GradientClipCallback` :class:`fastNLP.core.callback.GradientClipCallback`

Before each backward pass, clip the parameters' gradients into a certain range.

:param None,torch.Tensor,List[torch.Tensor] parameters: usually obtained via model.parameters().
If None, all parameters of the Trainer's model are clipped by default
:param float clip_value: restrict the gradient to [-clip_value, clip_value]. clip_value should be a positive number
:param str clip_type: supports 'norm' and 'value',
two modes::

1 'norm', rescale the gradient's norm into [-clip_value, clip_value]
2 'value', clamp the gradient into [-clip_value, clip_value];
gradients smaller than -clip_value are set to -clip_value;
gradients larger than clip_value are set to clip_value.

"""
def __init__(self, parameters=None, clip_value=1, clip_type='norm'):
"""
:param None,torch.Tensor,List[torch.Tensor] parameters: usually obtained via model.parameters().
If None, all parameters of the Trainer's model are clipped by default
:param float clip_value: restrict the gradient to [-clip_value, clip_value]. clip_value should be a positive number
:param str clip_type: supports 'norm' and 'value',
two modes::
1 'norm', rescale the gradient's norm into [-clip_value, clip_value]
2 'value', clamp the gradient into [-clip_value, clip_value];
gradients smaller than -clip_value are set to -clip_value;
gradients larger than clip_value are set to clip_value.
"""
super().__init__()
from torch import nn
@@ -403,21 +465,25 @@ class GradientClipCallback(Callback):
def on_backward_end(self):
if self.step%self.update_every==0:
if self.parameters is None:
self.clip_fun(self.model.parameters(), self.clip_value)
if getattr(self.trainer, 'fp16', ''):
_check_fp16()
self.clip_fun(amp.master_params(self.optimizer), self.clip_value)
else:
self.clip_fun(self.model.parameters(), self.clip_value)
else:
self.clip_fun(self.parameters, self.clip_value)
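A minimal sketch of wiring the callback above into a Trainer (the callbacks argument is the standard hookup)::

    from fastNLP import GradientClipCallback

    # clamp every gradient element into [-5, 5] before each update
    clip_cb = GradientClipCallback(parameters=None, clip_value=5, clip_type='value')
    # then pass callbacks=[clip_cb] when constructing the Trainer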


class EarlyStopCallback(Callback):
"""
Alias: :class:`fastNLP.EarlyStopCallback` :class:`fastNLP.core.callback.EarlyStopCallback`
Stop training after this many epochs without improvement; related class :class:`EarlyStopError`

:param int patience: the number of epochs
Stop training after this many epochs without improvement; related class :class:`~fastNLP.core.callback.EarlyStopError`
"""
def __init__(self, patience):
"""
:param int patience: the number of epochs
"""
super(EarlyStopCallback, self).__init__()
self.patience = patience
self.wait = 0
@@ -434,52 +500,54 @@ class EarlyStopCallback(Callback):
def on_exception(self, exception):
if isinstance(exception, EarlyStopError):
print("Early Stopping triggered in epoch {}!".format(self.epoch))
logger.info("Early Stopping triggered in epoch {}!".format(self.epoch))
else:
raise exception # re-raise unfamiliar errors


class FitlogCallback(Callback):
"""
Alias: :class:`fastNLP.FitlogCallback` :class:`fastNLP.core.callback.FitlogCallback`

This callback writes the loss and progress to fitlog; if the Trainer has dev data, the dev results are automatically written to the log. It also supports passing in
one (or more) test datasets for testing (only usable when the trainer has dev data); after each evaluation on dev, these datasets are evaluated as well,
and the results are written to fitlog. The results for these datasets are reported against the best result on dev, i.e. if dev achieved its best in epoch 3, then
the results recorded in fitlog for these datasets come from epoch 3.

:param ~fastNLP.DataSet,Dict[~fastNLP.DataSet] data: a DataSet object; the Trainer's metrics will be used to evaluate the data. If you need to pass multiple
DataSets, pass them as a dict whose keys are handed to fitlog as the names of the corresponding datasets. If tester is not None, data must be passed
as a dict. If only a DataSet is passed, it is named test
:param ~fastNLP.Tester tester: a Tester object, called at on_valid_end. The DataSet inside the tester will be referred to as `test`
:param int log_loss_every: record the loss every this many steps (the average loss over those batches is recorded); if the dataset is large, set this value
rather large, otherwise the log file becomes huge. Defaults to 0, i.e. do not record the loss.
:param int verbose: whether to print evaluation results in the terminal; 0 means do not print.
:param bool log_exception: whether fitlog records information about raised exceptions
one (or more) test datasets for testing (only usable when the trainer has dev data); after each evaluation on dev, these datasets are evaluated as well,
and the results are written to fitlog. The results for these datasets are reported against the best result on dev, i.e. if dev achieved its best in epoch 3, then
the results recorded in fitlog for these datasets come from epoch 3.
"""

def __init__(self, data=None, tester=None, log_loss_every=0, verbose=0, log_exception=False):
"""
:param ~fastNLP.DataSet,Dict[~fastNLP.DataSet] data: a DataSet object; the Trainer's metrics will be used to evaluate the data. If you need to pass
multiple DataSets, pass them as a dict whose keys are handed to fitlog as the names of the corresponding datasets. Result names for data are prefixed with 'data'.
:param ~fastNLP.Tester,Dict[~fastNLP.Tester] tester: a Tester object, called at on_valid_end. Result names for the tester are prefixed with 'tester'
:param int log_loss_every: record the loss every this many steps (the average loss over those batches is recorded); if the dataset is large, set this value
rather large, otherwise the log file becomes huge. Defaults to 0, i.e. do not record the loss.
:param int verbose: whether to print evaluation results in the terminal; 0 means do not print.
:param bool log_exception: whether fitlog records information about raised exceptions
"""
super().__init__()
self.datasets = {}
self.testers = {}
self._log_exception = log_exception
assert isinstance(log_loss_every, int) and log_loss_every>=0
if tester is not None:
assert isinstance(tester, Tester), "Only fastNLP.Tester allowed."
assert isinstance(data, dict) or data is None, "If tester is not None, only dict[DataSet] allowed for data."
if data is not None:
assert 'test' not in data, "Cannot use `test` as DataSet key, when tester is passed."
setattr(tester, 'verbose', 0)
self.testers['test'] = tester
if isinstance(tester, dict):
for name, test in tester.items():
if not isinstance(test, Tester):
raise TypeError(f"{name} in tester is not a valid fastNLP.Tester.")
self.testers['tester-' + name] = test
if isinstance(tester, Tester):
self.testers['tester-test'] = tester
for tester in self.testers.values():
setattr(tester, 'verbose', 0)

if isinstance(data, dict):
for key, value in data.items():
assert isinstance(value, DataSet), f"Only DataSet object is allowed, not {type(value)}."
for key, value in data.items():
self.datasets[key] = value
self.datasets['data-' + key] = value
elif isinstance(data, DataSet):
self.datasets['test'] = data
else:
self.datasets['data-test'] = data
elif data is not None:
raise TypeError("data receives dict[DataSet] or DataSet object.")
self.verbose = verbose
@@ -492,8 +560,11 @@ class FitlogCallback(Callback):
if len(self.datasets) > 0:
for key, data in self.datasets.items():
tester = Tester(data=data, model=self.model, batch_size=self.batch_size, metrics=self.trainer.metrics,
verbose=0)
tester = Tester(data=data, model=self.model,
batch_size=self.trainer.kwargs.get('dev_batch_size', self.batch_size),
metrics=self.trainer.metrics,
verbose=0,
use_tqdm=self.trainer.test_use_tqdm)
self.testers[key] = tester
fitlog.add_progress(total_steps=self.n_steps)
@@ -516,7 +587,7 @@ class FitlogCallback(Callback):
try:
eval_result = tester.test()
if self.verbose != 0:
self.pbar.write("Evaluation on DataSet {}:".format(key))
self.pbar.write("FitlogCallback evaluation on {}:".format(key))
self.pbar.write(tester._format_eval_results(eval_result))
fitlog.add_metric(eval_result, name=key, step=self.step, epoch=self.epoch)
if better_result:
@@ -533,17 +604,75 @@ class FitlogCallback(Callback):
fitlog.add_other(repr(exception), name='except_info')


class LRScheduler(Callback):
class EvaluateCallback(Callback):
"""
Using this Callback lets the Trainer evaluate other datasets besides dev, such as a test set. Before each validation on dev, the data held by the EvaluateCallback
is evaluated first.
"""
Alias: :class:`fastNLP.LRScheduler` :class:`fastNLP.core.callback.LRScheduler`

A wrapper around a PyTorch LR Scheduler so that it can be used by the Trainer
def __init__(self, data=None, tester=None):
"""
:param ~fastNLP.DataSet,Dict[~fastNLP.DataSet] data: a DataSet object; the Trainer's metric will be used to evaluate the data. If you need to pass multiple
DataSets, pass them as a dict.
:param ~fastNLP.Tester,Dict[~fastNLP.DataSet] tester: a Tester object; by using a Tester, the evaluation metric can differ from the one in
the Trainer.
"""
super().__init__()
self.datasets = {}
self.testers = {}
if tester is not None:
if isinstance(tester, dict):
for name, test in tester.items():
if not isinstance(test, Tester):
raise TypeError(f"{name} in tester is not a valid fastNLP.Tester.")
self.testers['tester-' + name] = test
if isinstance(tester, Tester):
self.testers['tester-test'] = tester
for tester in self.testers.values():
setattr(tester, 'verbose', 0)

if isinstance(data, dict):
for key, value in data.items():
assert isinstance(value, DataSet), f"Only DataSet object is allowed, not {type(value)}."
for key, value in data.items():
self.datasets['data-' + key] = value
elif isinstance(data, DataSet):
self.datasets['data-test'] = data
elif data is not None:
raise TypeError("data receives dict[DataSet] or DataSet object.")

def on_train_begin(self):
if len(self.datasets) > 0 and self.trainer.dev_data is None:
raise RuntimeError("Trainer has no dev data, you cannot pass extra DataSet to do evaluation.")

if len(self.datasets) > 0:
for key, data in self.datasets.items():
tester = Tester(data=data, model=self.model,
batch_size=self.trainer.kwargs.get('dev_batch_size', self.batch_size),
metrics=self.trainer.metrics, verbose=0,
use_tqdm=self.trainer.test_use_tqdm)
self.testers[key] = tester

:param torch.optim.lr_scheduler._LRScheduler lr_scheduler: the PyTorch lr_scheduler
def on_valid_end(self, eval_result, metric_key, optimizer, better_result):
if len(self.testers) > 0:
for key, tester in self.testers.items():
try:
eval_result = tester.test()
self.logger.info("EvaluateCallback evaluation on {}:".format(key))
self.logger.info(tester._format_eval_results(eval_result))
except Exception:
self.logger.error("Exception happens when evaluate on DataSet named `{}`.".format(key))


class LRScheduler(Callback):
"""
A wrapper around a PyTorch LR Scheduler so that it can be used by the Trainer
"""
def __init__(self, lr_scheduler):
"""
:param torch.optim.lr_scheduler._LRScheduler lr_scheduler: the PyTorch lr_scheduler
"""
super(LRScheduler, self).__init__()
import torch.optim
if isinstance(lr_scheduler, torch.optim.lr_scheduler._LRScheduler):
@@ -557,13 +686,13 @@ class LRScheduler(Callback):

class ControlC(Callback):
"""
Alias: :class:`fastNLP.ControlC` :class:`fastNLP.core.callback.ControlC`

:param bool quit_all: if True, exit the whole program when control+C is detected; otherwise only exit the Trainer
The behavior when control+C is detected
"""
def __init__(self, quit_all):
"""
:param bool quit_all: if True, exit the whole program when control+C is detected; otherwise only exit the Trainer
"""
super(ControlC, self).__init__()
if type(quit_all) != bool:
raise ValueError("In KeyBoardInterrupt, quit_all arguemnt must be a bool.")
@@ -581,12 +710,14 @@ class ControlC(Callback):


class SmoothValue(object):
"""work for LRFinder"""
def __init__(self, beta: float):
self.beta, self.n, self.mov_avg = beta, 0, 0
self.smooth = None
def add_value(self, val: float) -> None:
"Add `val` to calculate updated smoothed value."
"""Add `val` to calculate updated smoothed value."""
self.n += 1
self.mov_avg = self.beta * self.mov_avg + (1 - self.beta) * val
self.smooth = self.mov_avg / (1 - self.beta ** self.n)
@@ -594,16 +725,15 @@ class SmoothValue(object):

class LRFinder(Callback):
"""
Alias: :class:`fastNLP.LRFinder` :class:`fastNLP.core.callback.LRFinder`

Use the first epoch to find the best learning rate, and apply it from the second epoch on

:param float start_lr: lower bound of the learning rate
:param float end_lr: upper bound of the learning rate
"""
def __init__(self, start_lr=1e-6, end_lr=10):
"""
:param float start_lr: lower bound of the learning rate
:param float end_lr: upper bound of the learning rate
"""
super(LRFinder, self).__init__()
self.start_lr, self.end_lr = start_lr, end_lr
@@ -614,8 +744,7 @@ class LRFinder(Callback):
self.smooth_value = SmoothValue(0.8)
self.opt = None
self.find = None
self.loader = ModelLoader()

@property
def lr_gen(self):
scale = (self.end_lr - self.start_lr) / self.batch_per_epoch
@@ -630,7 +759,7 @@ class LRFinder(Callback):
self.opt = self.trainer.optimizer # pytorch optimizer
self.opt.param_groups[0]["lr"] = self.start_lr
# save model
ModelSaver("tmp").save_pytorch(self.trainer.model, param_only=True)
torch.save(self.model.state_dict(), 'tmp')
self.find = True
def on_backward_begin(self, loss):
@@ -659,14 +788,14 @@ class LRFinder(Callback):
self.opt.param_groups[0]["lr"] = self.best_lr
self.find = False
# reset model
ModelLoader().load_pytorch(self.trainer.model, "tmp")
states = torch.load('tmp')
self.model.load_state_dict(states)
os.remove('tmp')
self.pbar.write("Model reset. \nFind best lr={}".format(self.best_lr))


class TensorboardCallback(Callback):
"""
Alias: :class:`fastNLP.TensorboardCallback` :class:`fastNLP.core.callback.TensorboardCallback`

Accepts one or more of the following strings as arguments:
- "model"
- "loss"
@@ -674,7 +803,7 @@ class TensorboardCallback(Callback):
.. warning::
fastNLP has stopped maintaining this feature; please wait for the next fastNLP release that is compatible with PyTorch 1.1,
or use fitlog, which integrates closely with fastNLP (see :doc:`/tutorials/tutorial_10_fitlog`).
or use fitlog, which integrates closely with fastNLP (see :doc:`/tutorials/tutorial_11_fitlog`).
"""
@@ -741,14 +870,17 @@ class TensorboardCallback(Callback):

class WarmupCallback(Callback):
"""
Adjusts the learning rate according to a certain schedule.

:param int,float warmup: if warmup is an int, the learning rate changes according to the schedule strategy before that step; if warmup is a float,
such as 0.1, the learning rate is adjusted by the schedule strategy during the first 10% of the steps.
:param str schedule: how to adjust. linear: rise to the specified learning rate (taken from the Trainer's optimizer) during the first warmup steps, then
fall to 0 during the remaining steps; constant: rise to the specified learning rate during the first warmup steps, then keep it for the remaining steps.
The learning rate rises from 0 to the configured learning rate at a certain pace.
"""
def __init__(self, warmup=0.1, schedule='constant'):
"""
:param int,float warmup: if warmup is an int, the learning rate changes according to the schedule strategy before that step; if warmup is a float,
such as 0.1, the learning rate is adjusted by the schedule strategy during the first 10% of the steps.
:param str schedule: how to adjust.
linear: rise to the specified learning rate (taken from the Trainer's optimizer) during the first warmup steps, then fall to 0 during the remaining steps;
constant: rise to the specified learning rate during the first warmup steps, then keep the learning rate for the remaining steps.
"""
super().__init__()
self.warmup = max(warmup, 0.)
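A one-line sketch of the schedule above::

    from fastNLP import WarmupCallback

    # rise linearly over the first 10% of steps, then decay linearly to 0
    warmup_cb = WarmupCallback(warmup=0.1, schedule='linear')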

@@ -790,23 +922,26 @@ class WarmupCallback(Callback):
class SaveModelCallback(Callback):
"""
Since the Trainer only saves the best model during training, this callback enables several other ways of storing results.
A folder named after the timestamp when training started is created under save_dir, and multiple models are stored inside it
-save_dir
-2019-07-03-15-06-36
-epoch:0_step:20_{metric_key}:{evaluate_performance}.pt # metric is the given metric_key, evaluate_performance is the performance
-epoch:1_step:40_{metric_key}:{evaluate_performance}.pt
-2019-07-03-15-10-00
-epoch:0_step:20_{metric_key}:{evaluate_performance}.pt # metric is the given metric_key, evaluate_performance is the performance
:param str save_dir: the directory to store models in; a timestamp-named directory is created under it to hold the models
:param int top: keep the models with the top-N dev performance. -1 keeps all models.
:param bool only_param: whether to save only the model's weights.
:param save_on_exception: when an exception occurs, whether to save a copy of the model at that moment. The model name is epoch:x_step:x_Exception:{exception_name}.
A folder named after the timestamp when training started is created under save_dir, and multiple models are stored inside it::
-save_dir
-2019-07-03-15-06-36
-epoch:0_step:20_{metric_key}:{evaluate_performance}.pt # metric is the given metric_key, evaluate_performance is the performance
-epoch:1_step:40_{metric_key}:{evaluate_performance}.pt
-2019-07-03-15-10-00
-epoch:0_step:20_{metric_key}:{evaluate_performance}.pt # metric is the given metric_key, evaluate_performance is the performance
"""
def __init__(self, save_dir, top=3, only_param=False, save_on_exception=False):
"""
:param str save_dir: the directory to store models in; a timestamp-named directory is created under it to hold the models. If save_dir does not exist it will be created automatically
:param int top: keep the models with the top-N dev performance. -1 keeps all models.
:param bool only_param: whether to save only the model's weights.
:param save_on_exception: when an exception occurs, whether to save a copy of the model at that moment. The model name is epoch:x_step:x_Exception:{exception_name}.
"""
super().__init__()

if not os.path.isdir(save_dir):
raise IsADirectoryError("{} is not a directory.".format(save_dir))
os.makedirs(save_dir, exist_ok=True)
self.save_dir = save_dir
if top < 0:
self.top = sys.maxsize
@@ -844,35 +979,37 @@ class SaveModelCallback(Callback):
return save_pair, delete_pair

def _save_this_model(self, metric_value):
name = "epoch:{}_step:{}_{}:{:.6f}.pt".format(self.epoch, self.step, self.trainer.metric_key, metric_value)
name = "epoch-{}_step-{}_{}-{:.6f}.pt".format(self.epoch, self.step, self.trainer.metric_key, metric_value)
save_pair, delete_pair = self._insert_into_ordered_save_models((metric_value, name))
if save_pair:
try:
_save_model(self.model, model_name=name, save_dir=self.save_dir, only_param=self.only_param)
except Exception as e:
print(f"The following exception:{e} happens when save model to {self.save_dir}.")
logger.error(f"The following exception:{e} happens when save model to {self.save_dir}.")
if delete_pair:
try:
delete_model_path = os.path.join(self.save_dir, delete_pair[1])
if os.path.exists(delete_model_path):
os.remove(delete_model_path)
except Exception as e:
print(f"Fail to delete model {name} at {self.save_dir} caused by exception:{e}.")
logger.error(f"Fail to delete model {name} at {self.save_dir} caused by exception:{e}.")

def on_exception(self, exception):
if self.save_on_exception:
name = "epoch:{}_step:{}_Exception:{}.pt".format(self.epoch, self.step, exception.__class__.__name__)
name = "epoch-{}_step-{}_Exception-{}.pt".format(self.epoch, self.step, exception.__class__.__name__)
_save_model(self.model, model_name=name, save_dir=self.save_dir, only_param=self.only_param)
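A minimal sketch of the checkpointing callback above (``./models`` is a placeholder directory)::

    from fastNLP import SaveModelCallback

    # keep the 3 best checkpoints under ./models/<start-timestamp>/,
    # plus a rescue checkpoint if training crashes
    save_cb = SaveModelCallback(save_dir='./models', top=3, save_on_exception=True)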


class CallbackException(BaseException):
"""
When you need to break out of training from a callback, raise a CallbackException and catch it in on_exception.

:param str msg: the exception message.
"""
def __init__(self, msg):
"""
:param str msg: the exception message.
"""
super(CallbackException, self).__init__(msg)


@@ -884,3 +1021,79 @@ class EarlyStopError(CallbackException):
def __init__(self, msg):
super(EarlyStopError, self).__init__(msg)


class EchoCallback(Callback):
"""
Used for testing distributed training
"""
def __init__(self, name, out=sys.stdout):
super(EchoCallback, self).__init__()
self.name = name
self.out = out # deprecated

def __getattribute__(self, item):
if item.startswith('on_'):
logger.info('{}.{} has been called at pid: {}'.format(self.name, item, os.getpid()))
return super(EchoCallback, self).__getattribute__(item)


class _TesterCallback(Callback):
def __init__(self, data, model, metrics, metric_key=None, batch_size=16, num_workers=None):
super(_TesterCallback, self).__init__()
if hasattr(model, 'module'):
# for data parallel model
model = model.module
self.tester = Tester(data, model,
metrics=metrics, batch_size=batch_size,
num_workers=num_workers, verbose=0)
if metric_key is not None:
self.metric_key, self.increase_better = self._parse_metric_key(metric_key)
else:
self.metric_key = None
self.increase_better = True
self.score = None

def on_valid_begin(self):
cur_score = self.tester.test()
eval_str = "Evaluation at Epoch {}/{}. Step:{}/{}. - {}".format(
self.epoch, self.n_epochs, self.step, self.n_steps,
self.tester._format_eval_results(cur_score))
self.logger.info(eval_str)
is_better = self.compare_better(cur_score)
if is_better:
self.score = cur_score
return cur_score, is_better

@staticmethod
def _get_score(metric_dict, key):
for metric in metric_dict.items():
if key in metric:
return metric[key]
return None

@staticmethod
def _parse_metric_key(metric_key):
# parse metric_key
# increase_better is True. It means the exp result gets better if the indicator increases.
# It is true by default.
increase_better = False if metric_key[0] == "-" else True
metric_key = metric_key[1:] if metric_key[0] == "+" or metric_key[0] == "-" else metric_key
return metric_key, increase_better

def compare_better(self, a):
if self.score is None:
return True
if self.metric_key is None:
metric_key = list(list(self.score.values())[0].keys())[0]
self.metric_key, self.increase_better = self._parse_metric_key(metric_key)
k = self.metric_key
score = self._get_score(self.score, k)
new_score = self._get_score(a, k)
if score is None or new_score is None:
return False
if self.increase_better:
return score <= new_score
else:
return score >= new_score

+ 37
- 12
fastNLP/core/const.py View File

@@ -1,3 +1,12 @@
r"""
Field naming in the fastNLP package follows a certain convention, which is defined by the fastNLP.Const class.
"""

__all__ = [
"Const"
]


class Const:
"""
Field-name constants in fastNLP.
@@ -7,12 +16,14 @@ class Const:
The full list::

INPUT       the model's sequence input       words (plural: words1, words2)
CHAR_INPUT  the model's character input      chars (plural: chars1, chars2)
INPUT_LEN   sequence length                  seq_len (plural: seq_len1, seq_len2)
OUTPUT      model output                     pred (plural: pred1, pred2)
TARGET      ground-truth target              target (plural: target1, target2)
LOSS        loss function                    loss (plural: loss1, loss2)
INPUT       the model's sequence input       words (with multiple words columns, use words1, words2, ... in order)
CHAR_INPUT  the model's character input      chars (with multiple chars columns, use chars1, chars2 in order)
INPUT_LEN   sequence length                  seq_len (with multiple seq_len columns, use seq_len1, seq_len2 in order)
OUTPUT      model output                     pred (with multiple pred columns, use pred1, pred2 in order)
TARGET      ground-truth target              target (with multiple target columns, use target1, target2 in order)
LOSS        loss function                    loss (with multiple loss columns, use loss1, loss2 in order)
RAW_WORD    the raw words                    raw_words (with multiple raw_words columns, use raw_words1, raw_words2 in order)
RAW_CHAR    the raw characters               raw_chars (with multiple raw_chars columns, use raw_chars1, raw_chars2 in order)

"""
INPUT = 'words'
@@ -21,37 +32,51 @@ class Const:
OUTPUT = 'pred'
TARGET = 'target'
LOSS = 'loss'

RAW_WORD = 'raw_words'
RAW_CHAR = 'raw_chars'
@staticmethod
def INPUTS(i):
"""得到第 i 个 ``INPUT`` 的命名"""
i = int(i) + 1
return Const.INPUT + str(i)
@staticmethod
def CHAR_INPUTS(i):
"""得到第 i 个 ``CHAR_INPUT`` 的命名"""
i = int(i) + 1
return Const.CHAR_INPUT + str(i)

@staticmethod
def RAW_WORDS(i):
"""得到第 i 个 ``RAW_WORDS`` 的命名"""
i = int(i) + 1
return Const.RAW_WORD + str(i)
@staticmethod
def RAW_CHARS(i):
"""得到第 i 个 ``RAW_CHARS`` 的命名"""
i = int(i) + 1
return Const.RAW_CHAR + str(i)
@staticmethod
def INPUT_LENS(i):
"""得到第 i 个 ``INPUT_LEN`` 的命名"""
i = int(i) + 1
return Const.INPUT_LEN + str(i)

@staticmethod
def OUTPUTS(i):
"""得到第 i 个 ``OUTPUT`` 的命名"""
i = int(i) + 1
return Const.OUTPUT + str(i)

@staticmethod
def TARGETS(i):
"""得到第 i 个 ``TARGET`` 的命名"""
i = int(i) + 1
return Const.TARGET + str(i)
@staticmethod
def LOSSES(i):
"""得到第 i 个 ``LOSS`` 的命名"""


+ 169
- 122
fastNLP/core/dataset.py View File

@@ -86,7 +86,7 @@
dataset.append(Instance(sentence=sent, label=label))

.. note::
To load data of a specific dataset directly, see :doc:`/tutorials/tutorial_2_load_dataset`
To load data of a specific dataset directly, see :doc:`/tutorials/tutorial_4_load_dataset`

2.2 Processing the contents of a DataSet
----------------------------------------
@@ -288,29 +288,33 @@ __all__ = [
]

import _pickle as pickle
import warnings
from copy import deepcopy

import numpy as np
from prettytable import PrettyTable

from ._logger import logger
from .const import Const
from .field import AppendToTargetOrInputException
from .field import AutoPadder
from .field import FieldArray
from .field import SetInputOrTargetException
from .instance import Instance
from .utils import _get_func_signature
from .field import AppendToTargetOrInputException
from .field import SetInputOrTargetException
from .utils import pretty_table_printer

class DataSet(object):
"""
Alias: :class:`fastNLP.DataSet` :class:`fastNLP.core.dataset.DataSet`

fastNLP's data container; see the documentation :doc:`fastNLP.core.dataset` for detailed usage
:param data: if a dict, the value of each key should be a list of equal length; if a list,
each element should be an :class:`~fastNLP.Instance` with the same fields.

fastNLP's data container; see the documentation :mod:`fastNLP.core.dataset` for detailed usage
"""

def __init__(self, data=None):
"""
:param data: if a dict, the value of each key should be a list of equal length; if a list,
each element should be an :class:`~fastNLP.Instance` with the same fields.
"""
self.field_arrays = {}
if data is not None:
if isinstance(data, dict):
@@ -324,41 +328,48 @@ class DataSet(object):
for ins in data:
assert isinstance(ins, Instance), "Must be Instance type, not {}.".format(type(ins))
self.append(ins)
else:
raise ValueError("data only be dict or list type.")
def __contains__(self, item):
return item in self.field_arrays
def __iter__(self):
def iter_func():
for idx in range(len(self)):
yield self[idx]
return iter_func()
def _inner_iter(self):
class Iter_ptr:
def __init__(self, dataset, idx):
self.dataset = dataset
self.idx = idx
def __getitem__(self, item):
assert item in self.dataset.field_arrays, "no such field:{} in Instance {}".format(item, self.dataset[
self.idx])
assert self.idx < len(self.dataset.field_arrays[item]), "index:{} out of range".format(self.idx)
return self.dataset.field_arrays[item][self.idx]

def __setitem__(self, key, value):
raise TypeError("You cannot modify value directly.")

def items(self):
ins = self.dataset[self.idx]
return ins.items()

def __repr__(self):
return self.dataset[self.idx].__repr__()
def inner_iter_func():
for idx in range(len(self)):
yield Iter_ptr(self, idx)
return inner_iter_func()
def __getitem__(self, idx):
"""给定int的index,返回一个Instance; 给定slice,返回包含这个slice内容的新的DataSet。

@@ -391,20 +402,20 @@ class DataSet(object):
return dataset
else:
raise KeyError("Unrecognized type {} for idx in __getitem__ method".format(type(idx)))
def __getattr__(self, item):
# Not tested. Don't use !!
if item == "field_arrays":
raise AttributeError
if isinstance(item, str) and item in self.field_arrays:
return self.field_arrays[item]
def __setstate__(self, state):
self.__dict__ = state
def __getstate__(self):
return self.__dict__
def __len__(self):
"""Fetch the length of the dataset.

@@ -414,16 +425,66 @@ class DataSet(object):
return 0
field = iter(self.field_arrays.values()).__next__()
return len(field)
def __inner_repr__(self):
if len(self) < 20:
return ",\n".join([ins.__repr__() for ins in self])
else:
return self[:5].__inner_repr__() + "\n...\n" + self[-5:].__inner_repr__()

def __repr__(self):
return "DataSet(" + self.__inner_repr__() + ")"
return str(pretty_table_printer(self))

def print_field_meta(self):
"""
Print the meta information of the current fields, with output like the following::

+-------------+-------+-------+
| field_names | x | y |
+=============+=======+=======+
| is_input | True | False |
| is_target | False | False |
| ignore_type | False | |
| pad_value | 0 | |
+-------------+-------+-------+

:param field_names: names of the fields in the DataSet
:param is_input: whether the field is an input
:param is_target: whether the field is a target
:param ignore_type: whether to ignore the field's type; generally only meaningful when the field is at least an input or a target
:param pad_value: the pad value of the field; only meaningful when the field is an input or a target
:return:
"""
if len(self.field_arrays)>0:
field_names = ['field_names']
is_inputs = ['is_input']
is_targets = ['is_target']
pad_values = ['pad_value']
ignore_types = ['ignore_type']

for name, field_array in self.field_arrays.items():
field_names.append(name)
if field_array.is_input:
is_inputs.append(True)
else:
is_inputs.append(False)
if field_array.is_target:
is_targets.append(True)
else:
is_targets.append(False)

if (field_array.is_input or field_array.is_target) and field_array.padder is not None:
pad_values.append(field_array.padder.get_pad_val())
else:
pad_values.append(' ')

if field_array._ignore_type:
ignore_types.append(True)
elif field_array.is_input or field_array.is_target:
ignore_types.append(False)
else:
ignore_types.append(' ')
table = PrettyTable(field_names=field_names)
fields = [is_inputs, is_targets, ignore_types, pad_values]
for field in fields:
table.add_row(field)
logger.info(table)
return table

def append(self, instance):
"""
Append an instance object to the end of the DataSet.
@@ -446,9 +507,9 @@ class DataSet(object):
try:
self.field_arrays[name].append(field)
except AppendToTargetOrInputException as e:
print(f"Cannot append to field:{name}.")
logger.error(f"Cannot append to field:{name}.")
raise e
def add_fieldarray(self, field_name, fieldarray):
"""
Add a fieldarray to the DataSet.
@@ -463,7 +524,7 @@ class DataSet(object):
raise RuntimeError(f"The field to add must have the same size as dataset. "
f"Dataset size {len(self)} != field size {len(fieldarray)}")
self.field_arrays[field_name] = fieldarray
def add_field(self, field_name, fields, padder=AutoPadder(), is_input=False, is_target=False, ignore_type=False):
"""
Add a new field
@@ -475,19 +536,19 @@ class DataSet(object):
:param bool is_target: whether the newly added field is a target
:param bool ignore_type: whether to skip type checking on the newly added field
"""
if len(self.field_arrays) != 0:
if len(self) != len(fields):
raise RuntimeError(f"The field to add must have the same size as dataset. "
f"Dataset size {len(self)} != field size {len(fields)}")
self.field_arrays[field_name] = FieldArray(field_name, fields, is_target=is_target, is_input=is_input,
padder=padder, ignore_type=ignore_type)
def delete_instance(self, index):
"""
Delete the index-th instance

:param int index: index of the instance to delete, starting from 0
:param int index: index of the instance to delete; numbering starts from 0
"""
assert isinstance(index, int), "Only integer supported."
if len(self) <= index:
@@ -497,7 +558,8 @@ class DataSet(object):
else:
for field in self.field_arrays.values():
field.pop(index)
return self

def delete_field(self, field_name):
"""
Delete the field named field_name
@@ -505,7 +567,22 @@ class DataSet(object):
:param str field_name: name of the field to delete.
"""
self.field_arrays.pop(field_name)
return self

def copy_field(self, field_name, new_field_name):
"""
Deep-copy the field named field_name to new_field_name

:param str field_name: the field to copy.
:param str new_field_name: name of the field produced by the copy
:return: self
"""
if not self.has_field(field_name):
raise KeyError(f"Field:{field_name} not found in DataSet.")
fieldarray = deepcopy(self.get_field(field_name))
self.add_fieldarray(field_name=new_field_name, fieldarray=fieldarray)
return self

def has_field(self, field_name):
"""
Check whether the DataSet contains a field named field_name
@@ -516,7 +593,7 @@ class DataSet(object):
if isinstance(field_name, str):
return field_name in self.field_arrays
return False
def get_field(self, field_name):
"""
Get the field named field_name
@@ -527,7 +604,7 @@ class DataSet(object):
if field_name not in self.field_arrays:
raise KeyError("Field name {} not found in DataSet".format(field_name))
return self.field_arrays[field_name]
def get_all_fields(self):
"""
Return a dict whose keys are field names and whose values are the corresponding :class:`~fastNLP.FieldArray`
@@ -535,7 +612,7 @@ class DataSet(object):
:return dict: the dict described above
"""
return self.field_arrays
def get_field_names(self) -> list:
"""
Return a list containing the names of all fields
@@ -543,7 +620,7 @@ class DataSet(object):
:return list: the list described above
"""
return sorted(self.field_arrays.keys())
def get_length(self):
"""
Get the number of elements in the DataSet

:return: int: the number of Instances in the DataSet.
"""
return len(self)
def rename_field(self, old_name, new_name):
def rename_field(self, field_name, new_field_name):
"""
Rename a field.

:param str old_name: the original field name.
:param str new_name: rename it to new_name.
:param str field_name: the original field name.
:param str new_field_name: rename it to new_field_name.
"""
if old_name in self.field_arrays:
self.field_arrays[new_name] = self.field_arrays.pop(old_name)
self.field_arrays[new_name].name = new_name
if field_name in self.field_arrays:
self.field_arrays[new_field_name] = self.field_arrays.pop(field_name)
self.field_arrays[new_field_name].name = new_field_name
else:
raise KeyError("DataSet has no field named {}.".format(old_name))
raise KeyError("DataSet has no field named {}.".format(field_name))
return self
def set_target(self, *field_names, flag=True):
def set_target(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True):
"""
Set the fields in field_names as target

@@ -577,19 +654,23 @@ class DataSet(object):

:param str field_names: names of the fields
:param bool flag: set the target status of field_name to flag
:param bool use_1st_ins_infer_dim_type: if True, do not check whether all data in this column have the same dimension and type; directly use the first
row of data to infer the type and dimension of this column's data.
"""
assert isinstance(flag, bool), "Only bool type supported."
for name in field_names:
if name in self.field_arrays:
try:
self.field_arrays[name]._use_1st_ins_infer_dim_type = bool(use_1st_ins_infer_dim_type)
self.field_arrays[name].is_target = flag
except SetInputOrTargetException as e:
print(f"Cannot set field:{name} as target.")
logger.error(f"Cannot set field:{name} as target.")
raise e
else:
raise KeyError("{} is not a valid field name.".format(name))
def set_input(self, *field_names, flag=True):
return self

def set_input(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True):
"""
Set the fields in field_names as input::

@@ -598,17 +679,21 @@ class DataSet(object):

:param str field_names: names of the fields
:param bool flag: set the input status of field_name to flag
:param bool use_1st_ins_infer_dim_type: if True, do not check whether all data in this column have the same dimension and type; directly use the first
row of data to infer the type and dimension of this column's data.
"""
for name in field_names:
if name in self.field_arrays:
try:
self.field_arrays[name]._use_1st_ins_infer_dim_type = bool(use_1st_ins_infer_dim_type)
self.field_arrays[name].is_input = flag
except SetInputOrTargetException as e:
print(f"Cannot set field:{name} as input, exception happens at the {e.index} value.")
logger.error(f"Cannot set field:{name} as input, exception happens at the {e.index} value.")
raise e
else:
raise KeyError("{} is not a valid field name.".format(name))
return self

def set_ignore_type(self, *field_names, flag=True):
"""
Set fields to the ignore-type state. When a field has ignore_type set, no type checking is performed when it is set as target or input,
@@ -624,7 +709,8 @@ class DataSet(object):
self.field_arrays[name].ignore_type = flag
else:
raise KeyError("{} is not a valid field name.".format(name))
return self

def set_padder(self, field_name, padder):
"""
Set a padder for field_name::
@@ -639,7 +725,8 @@ class DataSet(object):
if field_name not in self.field_arrays:
raise KeyError("There is no field named {}.".format(field_name))
self.field_arrays[field_name].set_padder(padder)
return self

def set_pad_val(self, field_name, pad_val):
"""
Set the pad_val for a given field.
@@ -650,7 +737,8 @@ class DataSet(object):
if field_name not in self.field_arrays:
raise KeyError("There is no field named {}.".format(field_name))
self.field_arrays[field_name].set_pad_val(pad_val)
return self

def get_input_name(self):
"""
Return the names of all fields whose is_input is set to True
@@ -658,7 +746,7 @@ class DataSet(object):
:return list: the elements are the names of the fields set as input
"""
return [name for name, field in self.field_arrays.items() if field.is_input]
def get_target_name(self):
"""
Return the names of all fields whose is_target is set to True
@@ -666,7 +754,7 @@ class DataSet(object):
:return list: the elements are the names of the fields set as target
"""
return [name for name, field in self.field_arrays.items() if field.is_target]
def apply_field(self, func, field_name, new_field_name=None, **kwargs):
"""
Pass the field named `field_name` of each instance in the DataSet to func, and collect its return value.
@@ -695,16 +783,16 @@ class DataSet(object):
results.append(func(ins[field_name]))
except Exception as e:
if idx != -1:
print("Exception happens at the `{}`th instance.".format(idx))
logger.error("Exception happens at the `{}`th(from 1) instance.".format(idx + 1))
raise e
if not (new_field_name is None) and len(list(filter(lambda x: x is not None, results))) == 0: # all None
raise ValueError("{} always return None.".format(_get_func_signature(func=func)))
if new_field_name is not None:
self._add_apply_field(results, new_field_name, kwargs)
return results
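A small sketch of apply_field on a toy DataSet (field names are illustrative)::

    from fastNLP import DataSet

    ds = DataSet({'raw_words': ['hello world', 'good morning everyone']})
    ds.apply_field(lambda x: x.split(), field_name='raw_words', new_field_name='words')
    ds.add_seq_len('words')  # adds a seq_len field holding len(words) per instance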
def _add_apply_field(self, results, new_field_name, kwargs):
"""
Add results as a new field, with field name new_field_name
@@ -736,7 +824,7 @@ class DataSet(object):
self.add_field(field_name=new_field_name, fields=results, is_input=extra_param.get("is_input", None),
is_target=extra_param.get("is_target", None),
ignore_type=extra_param.get("ignore_type", False))
def apply(self, func, new_field_name=None, **kwargs):
"""
Pass each instance in the DataSet to func and collect its return value.
@@ -760,20 +848,21 @@ class DataSet(object):
results = []
for idx, ins in enumerate(self._inner_iter()):
results.append(func(ins))
except Exception as e:
except BaseException as e:
if idx != -1:
print("Exception happens at the `{}`th instance.".format(idx))
logger.error("Exception happens at the `{}`th instance.".format(idx))
raise e

# results = [func(ins) for ins in self._inner_iter()]
if not (new_field_name is None) and len(list(filter(lambda x: x is not None, results))) == 0: # all None
raise ValueError("{} always return None.".format(_get_func_signature(func=func)))
if new_field_name is not None:
self._add_apply_field(results, new_field_name, kwargs)
return results

def add_seq_len(self, field_name:str, new_field_name='seq_len'):
def add_seq_len(self, field_name: str, new_field_name=Const.INPUT_LEN):
"""
Apply len() directly to each element of field_name, take the result as the sequence length, and store it in the seq_len field.

@@ -810,7 +899,7 @@ class DataSet(object):
return dataset
else:
return DataSet()
def split(self, ratio, shuffle=True):
"""
Split the DataSet according to the ratio and return two DataSets
@@ -836,51 +925,9 @@ class DataSet(object):
for field_name in self.field_arrays:
train_set.field_arrays[field_name].to(self.field_arrays[field_name])
dev_set.field_arrays[field_name].to(self.field_arrays[field_name])
return train_set, dev_set
@classmethod
def read_csv(cls, csv_path, headers=None, sep=",", dropna=True):
r"""
.. warning::
This method will be removed in the next version; please use :class:`fastNLP.io.CSVLoader`
Read data in csv format from the path csv_path.

:param str csv_path: where to read the csv file from
:param list[str] headers: if None, the first line of the csv file is used as the header; if a list(str) is passed, the number of its elements must
equal the number of elements in each row of the csv file.
:param str sep: the delimiter
:param bool dropna: whether to skip rows whose length does not match the header.
:return: the loaded :class:`~fastNLP.DataSet`.
"""
warnings.warn('DataSet.read_csv is deprecated, use CSVLoader instead',
category=DeprecationWarning)
with open(csv_path, "r", encoding='utf-8') as f:
start_idx = 0
if headers is None:
headers = f.readline().rstrip('\r\n')
headers = headers.split(sep)
start_idx += 1
else:
assert isinstance(headers, (list, tuple)), "headers should be list or tuple, not {}.".format(
type(headers))
_dict = {}
for col in headers:
_dict[col] = []
for line_idx, line in enumerate(f, start_idx):
contents = line.rstrip('\r\n').split(sep)
if len(contents) != len(headers):
if dropna:
continue
else:
# TODO change error type
raise ValueError("Line {} has {} parts, while header has {} parts." \
.format(line_idx, len(contents), len(headers)))
for header, content in zip(headers, contents):
_dict[header].append(content)
return cls(_dict)
def save(self, path):
"""
Save the DataSet.
@@ -889,7 +936,7 @@ class DataSet(object):
"""
with open(path, 'wb') as f:
pickle.dump(self, f)
@staticmethod
def load(path):
r"""


+ 447
- 0
fastNLP/core/dist_trainer.py View File

@@ -0,0 +1,447 @@
"""undocumented
Distributed training code under active development
"""
import logging
import os
import time
from datetime import datetime

import torch
import torch.cuda
import torch.distributed as dist
import torch.optim
from pkg_resources import parse_version
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm

from ._logger import logger
from .batch import DataSetIter, BatchIter
from .callback import DistCallbackManager, CallbackException
from .callback import _TesterCallback
from .dataset import DataSet
from .losses import _prepare_losser
from .optimizer import Optimizer
from .utils import _build_args
from .utils import _check_fp16
from .utils import _get_func_signature
from .utils import _move_dict_value_to_device

try:
from apex import amp
except:
amp = None

__all__ = [
'get_local_rank',
'DistTrainer',
]

def get_local_rank():
"""
Return the local rank of the current process, from 0 to N-1, where N is the total number of distributed processes
"""
if 'LOCAL_RANK' in os.environ:
return int(os.environ['LOCAL_RANK'])
from argparse import ArgumentParser
parser = ArgumentParser()
parser.add_argument('--local_rank', type=int)
args, _ = parser.parse_known_args()
if 'local_rank' in args and args.local_rank:
os.environ['LOCAL_RANK'] = str(args.local_rank) # for multiple calls for this function
return args.local_rank
raise RuntimeError('Please use "python -m torch.distributed.launch --nproc_per_node=N train_script.py"')
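For reference, a launch sketch matching the hint above (script name and process count are placeholders)::

    # one process per GPU; torch.distributed.launch sets LOCAL_RANK for get_local_rank()
    # python -m torch.distributed.launch --nproc_per_node=2 train_script.py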


class DistTrainer():
"""
A distributed Trainer that supports distributed training and mixed-precision training. For the underlying mechanism, please read the official pytorch documentation.

Note: with the distributed Trainer, several processes execute the training code at the same time. So before turning single-process training code into multi-process code,
please check carefully that the synchronization and mutual-exclusion operations in the training code behave correctly (such as model saving, log printing, etc.)
"""
def __init__(self, train_data, model, optimizer=None, loss=None,
callbacks_all=None, callbacks_master=None,
batch_size_per_gpu=8, n_epochs=1,
num_workers=1, drop_last=False,
dev_data=None, metrics=None, metric_key=None,
update_every=1, print_every=10, validate_every=-1,
save_every=-1, save_path=None, device='auto',
fp16='', backend=None, init_method=None, use_tqdm=True):
"""

:param train_data: the training set, of type :class:`~fastNLP.DataSet`.
:param nn.modules model: the model to be trained
:param optimizer: a `torch.optim.Optimizer` optimizer. If None, the Trainer uses the default Adam(model.parameters(), lr=4e-3)
:param loss: the :class:`~fastNLP.core.losses.LossBase` object to use. If None, :class:`~fastNLP.LossInForward` is used by default
:param list callbacks_all: callbacks used to adjust the training process, applied in all training processes.
For available callbacks see :mod:`the callback module <fastNLP.core.callback>`
:param list callbacks_master: callbacks used to adjust the training process, applied only in one process (the Master process).
For available callbacks see :mod:`the callback module <fastNLP.core.callback>`
:param int batch_size_per_gpu: batch size of each process during training.
:param int n_epochs: how many epochs of optimization to run.
:param num_workers: int, how many threads are used for data padding.
:param drop_last: if the last batch does not contain exactly batch_size samples, drop it
:param dev_data: the DataSet used for validation, of type :class:`~fastNLP.DataSet`.
:param metrics: evaluation metrics for validation. Can be a single :class:`Metric<fastNLP.core.metrics.MetricBase>`,
or several :class:`Metric<fastNLP.core.metrics.MetricBase>` passed in a list.
If validation achieves a better result (with several Metrics, the first one in the list is decisive) and save_path is not None,
the current model is saved. For the kinds of Metric see :mod:`the metrics module <fastNLP.core.metrics>`. Only effective when dev_data is passed.
:param str,None metric_key: a :class:`Metric<fastNLP.core.metrics.MetricBase>` sometimes has several indicators,
for example :class:`~fastNLP.core.metrics.SpanFPreRecMetric` contains 'f', 'pre', 'rec'. In that case you need
to specify which indicator is decisive. Also, some indicators are better when smaller, e.g. a language model's perplexity; in that case prefix the key with '-' to
indicate that smaller values are better during validation (e.g. "-ppl"). Only effective when dev_data is passed.
:param update_every: int, update the gradients every this many steps. Used to accumulate gradients, e.g. when a batch_size of 128 is needed but setting it to 128 directly
does not fit in memory; setting batch_size=32, update_every=4 achieves the same goal. Has no effect when optimizer is None.
:param int print_every: how many backward passes between updates of the loss shown by tqdm; if use_tqdm=False, how many backward passes between printed losses.
:param int validate_every: validate on the dev set every this many steps; if -1, validate once at the end of each epoch. Only effective when dev_data is passed.
:param int save_every: save the model every this many steps; if -1, save once at the end of each epoch. Only effective when save_path is passed.
:param str,None save_path: the path to save models to; if the path does not exist, the folders are created automatically. If None, no model is saved. If dev_data is None, the model
of the last iteration is saved. Not only the parameters but also the model structure is saved. Even with DataParallel, only the model itself is saved here.
:param str device: specify the device; can be gpu, cpu or auto
:param str fp16: specify the optimization level of half-precision training; can be O1, O2 or O3; an empty string disables half precision.
:param backend: specify the distributed backend; see the pytorch documentation for details
:param init_method: specify the distributed initialization method; see the pytorch documentation for details
:param bool use_tqdm: whether to use tqdm to display training progress; if False, the loss is printed in the terminal.
"""
assert device in ['auto', 'cuda', 'cpu'], "Please set correct device in [auto', 'cuda', 'cpu']"
if device == 'auto':
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if backend is None:
backend = 'nccl' if device == 'cuda' else 'gloo'

# init distributed
if device == 'cuda':
torch.cuda.set_device(get_local_rank())
self.device = torch.device("cuda", get_local_rank())
else:
self.device = torch.device(device)

dist.init_process_group(backend=backend, init_method=init_method)
self.world_size = dist.get_world_size()
self.rank = dist.get_rank() # unique id for each process

self.model = model
self.train_data = train_data
self.batch_size_per_gpu = int(batch_size_per_gpu)
self.n_epochs = int(n_epochs)
self.num_data_workers = int(num_workers)
self.drop_last = drop_last
self.update_every = int(update_every)
self.print_every = int(print_every)
self.validate_every = int(validate_every)
self.save_every = int(save_every)
self.save_path = save_path
self.losser = _prepare_losser(loss)
self.fp16 = fp16
self.init_method = init_method
self.backend = backend
self.local_rank = get_local_rank()
self._forward_func = model.forward
self.callback_manager = DistCallbackManager(
env={"trainer": self}, callbacks_all=callbacks_all,
callbacks_master=callbacks_master)
self.test_manager = DistCallbackManager(env={'trainer': self})
self.metric_key = metric_key
self.use_tqdm = use_tqdm

model.to(self.device)
optimizer = self._get_optimizer(optimizer)

# init fp16, must before DataParallel init
if len(self.fp16):
assert isinstance(self.fp16, str), "Please set Apex AMP optimization level selected in ['O0', 'O1', 'O2', 'O3']"
_check_fp16()
assert device == 'cuda', "Amp requires cuda device"
model, optimizer = amp.initialize(model, optimizer, opt_level=self.fp16)

# init DataParallel
if parse_version(torch.__version__)>=parse_version('1.1'):
self.model = DDP(model, device_ids=[self.local_rank],
output_device=self.local_rank, find_unused_parameters=True)
else:
self.model = DDP(model, device_ids=[self.local_rank],
output_device=self.local_rank)

self.optimizer = optimizer
self.sampler = DistributedSampler(self.train_data)
self.data_iterator = self._get_data_iter(self.train_data)
self.batch_size = self.world_size * self.batch_size_per_gpu
self.n_steps = self._get_n_steps()

# for evaluation, only run eval on master proc
if dev_data and metrics:
cb = _TesterCallback(
dev_data, model, metrics,
batch_size=batch_size_per_gpu, num_workers=num_workers)
self.test_manager.add_callback([cb], master=False)

# Setup logging
dist.barrier()
self.start_time = datetime.now().strftime('%m_%d_%Y-%H_%M')
if self.save_path:
self.cp_save_path = os.path.join(self.save_path, 'checkpoints')
else:
self.cp_save_path = None

# use INFO in the master, WARN for others
logger.setLevel(logging.INFO if self.is_master else logging.WARNING)
self.logger = logger
self.logger.info("Setup Distributed Trainer")
self.logger.warning("Process pid: {}, rank: {}, local rank: {}, device: {}, fp16: {}".format(
os.getpid(), self.rank, self.local_rank, self.device, self.fp16 if self.fp16 else False))
self.logger.info("Num of processes: {}".format(self.world_size))
self.logger.info("Use device: {}".format(device))
self.logger.info("Training with fp16: {}, optimization level: {}".format(
len(self.fp16) > 0, self.fp16 if self.fp16 else None))

def _get_n_steps(self):
batch_size = self.world_size * self.batch_size_per_gpu
return (len(self.train_data) // batch_size + int(
len(self.train_data) % batch_size != 0)) * int(self.drop_last == 0) * self.n_epochs

def _get_data_iter(self, dataset):
if isinstance(dataset, DataSet):
return DataSetIter(
dataset=dataset, batch_size=self.batch_size_per_gpu,
num_workers=self.num_data_workers, sampler=self.sampler,
drop_last=self.drop_last
)
elif isinstance(dataset, BatchIter):
return dataset
else:
raise TypeError("train_data type {} not support".format(type(dataset)))

def _get_optimizer(self, optimizer):
if isinstance(optimizer, torch.optim.Optimizer):
return optimizer
elif isinstance(optimizer, Optimizer):
return optimizer.construct_from_pytorch(self.model.parameters())
elif optimizer is None:
return torch.optim.Adam(self.model.parameters(), lr=4e-3)
else:
raise TypeError("optimizer can only be torch.optim.Optimizer type, not {}.".format(type(optimizer)))

@property
def is_master(self):
"""是否是主进程"""
return self.rank == 0

def train(self, load_best_model=True, on_exception='auto'):
"""
使用该函数使Trainer开始训练。

:param str on_exception: 在训练过程遭遇exception,并被 :py:class:Callback 的on_exception()处理后,是否继续抛出异常。
支持'ignore','raise', 'auto': 'ignore'将捕获异常,写在Trainer.train()后面的代码将继续运行; 'raise'将异常抛出;
'auto'将ignore以下两种Exception: CallbackException与KeyboardInterrupt, raise其它exception.
:return dict: 返回一个字典类型的数据,
内含以下内容::

seconds: float, 表示训练时长
以下三个内容只有在提供了dev_data的情况下会有。
best_eval: Dict of Dict, 表示evaluation的结果。第一层的key为Metric的名称,
第二层的key为具体的Metric
best_epoch: int,在第几个epoch取得的最佳值
best_step: int, 在第几个step(batch)更新取得的最佳值

"""
try:
self.logger.info("###### Training epochs started ######")
self.logger.info('Total epochs: %d'% self.n_epochs)
self.logger.info('Total steps: %d'% self.n_steps)
self.logger.info('Num instances per GPU %d'% self.batch_size_per_gpu)
self.logger.info('Total batch_size: %d'% self.batch_size_per_gpu * dist.get_world_size())
self.logger.info('Total num of samples: %d'% len(self.train_data))
self.logger.info("Num of callbacks for all workers: {}".format(
len(self.callback_manager.callbacks_all)))
self.logger.info("Num of callbacks for master workers: {}".format(
len(self.callback_manager.callbacks_master)))
self.logger.info("Callbacks for all workers: {}".format(
[repr(cb) for cb in self.callback_manager.callbacks_all]))
self.logger.info("Callbacks for master workers: {}".format(
[repr(cb) for cb in self.callback_manager.callbacks_master]))

start_time = time.time()
results = {}
if self.n_epochs <= 0:
self.logger.info("Training epoch is {}, nothing was done.".format(self.n_epochs))
results['seconds'] = 0.
return results

try:
self.callback_manager.on_train_begin()
self._train()
self.callback_manager.on_train_end()

except BaseException as e:
self.callback_manager.on_exception(e)
if on_exception == 'auto':
if not isinstance(e, (CallbackException, KeyboardInterrupt)):
raise e
else:
self.logger.info('Catch {}, ignored.'.format(e.__class__.__name__))
elif on_exception == 'raise':
raise e

results['seconds'] = round(time.time() - start_time, 2)
self.logger.info("###### Train finished ######")
self.logger.info('Total train time: {} seconds.'. format(results['seconds']))
if load_best_model and self.cp_save_path and len(self.test_manager.callbacks):
self.load_check_point('best')
finally:
pass
dist.barrier()
return results

def _train(self):
if not self.use_tqdm:
from .utils import _pseudo_tqdm as inner_tqdm
else:
inner_tqdm = tqdm

self.step = 0
self.epoch = 0
self.pbar = inner_tqdm(total=self.n_steps, postfix='loss:{0:<6.5f}',
leave=False, dynamic_ncols=True, disable=not self.is_master)
pbar = self.pbar
avg_loss = 0
data_iterator = self.data_iterator
self.model.zero_grad()
for epoch in range(1, self.n_epochs + 1):
self.epoch = epoch
pbar.set_description_str(desc="Epoch {}/{}".format(epoch, self.n_epochs))
# early stopping
self.callback_manager.on_epoch_begin()
for batch_x, batch_y in data_iterator:
self.model.train()
self.step += 1
_move_dict_value_to_device(batch_x, batch_y, device=self.device)
indices = data_iterator.get_batch_indices()
# negative sampling; replace unknown; re-weight batch_y
self.callback_manager.on_batch_begin(batch_x, batch_y, indices)
prediction = self._data_forward(self.model, batch_x)

# edit prediction
self.callback_manager.on_loss_begin(batch_y, prediction)
loss = self._compute_loss(prediction, batch_y)
avg_loss += loss.item()

# Is loss NaN or inf? requires_grad = False
self.callback_manager.on_backward_begin(loss)

if self.fp16:
with amp.scale_loss(loss, self.optimizer) as scale_loss:
scale_loss.backward()
else:
loss.backward()

self.callback_manager.on_backward_end()

self._update()
self.callback_manager.on_step_end()

if self.step % self.print_every == 0:
avg_loss = float(avg_loss) / self.print_every
print_output = "loss:{:<6.5f}".format(avg_loss)
pbar.update(self.print_every)
pbar.set_postfix_str(print_output)
avg_loss = 0

self.callback_manager.on_batch_end()

if (self.validate_every > 0 and self.step % self.validate_every == 0):
self._do_validation()

if self.cp_save_path and \
self.save_every > 0 and \
self.step % self.save_every == 0:
self.save_check_point()

# ================= mini-batch end ==================== #
if self.validate_every < 0:
self._do_validation()

if self.save_every < 0 and self.cp_save_path:
self.save_check_point()
# lr decay; early stopping
self.callback_manager.on_epoch_end()
# =============== epochs end =================== #
pbar.close()
self.pbar = None
# ============ tqdm end ============== #

def _update(self):
"""Perform weight update on a model.

"""
if self.step % self.update_every == 0:
self.optimizer.step()
self.model.zero_grad()

def _data_forward(self, network, x):
x = _build_args(self._forward_func, **x)
y = network(**x)
if not isinstance(y, dict):
raise TypeError(
f"The return value of {_get_func_signature(self._forward_func)} should be dict, got {type(y)}.")
return y

def _compute_loss(self, predict, truth):
"""Compute loss given prediction and ground truth.

:param predict: prediction dict, produced by model.forward
:param truth: ground truth dict, produced by batch_y
:return: a scalar
"""
loss = self.losser(predict, truth)
if self.update_every > 1:
loss = loss / self.update_every
if loss.dim() > 0:
loss = loss.mean()
return loss

def save_check_point(self, name=None, only_params=False):
"""保存当前模型"""
# only master save models
if self.is_master:
if name is None:
name = 'checkpoint-{}.bin'.format(self.step)
os.makedirs(self.cp_save_path, exist_ok=True)
path = os.path.join(self.cp_save_path, name)
self.logger.info("Save checkpoint to {}".format(path))
model_to_save = self.model.module
if only_params:
model_to_save = model_to_save.state_dict()
torch.save(model_to_save, path)

def load_check_point(self, name):
path = os.path.join(self.cp_save_path, name)
self.logger.info('reload best model from %s', path)
model_load = torch.load(path, map_location='cpu')
if not isinstance(model_load, dict):
model_load = model_load.state_dict()
self.model.module.load_state_dict(model_load)

def _do_validation(self):
self.callback_manager.on_valid_begin()
# do evaluate on all nodes
eval_res = self.test_manager.on_valid_begin()
eval_res = list(filter(lambda x: x is not None, eval_res))
if len(eval_res):
eval_res, is_better = list(zip(*eval_res))
else:
eval_res, is_better = None, None
# save better model on master node
if self.is_master and is_better is not None and self.cp_save_path:
for i, better_flag in enumerate(is_better):
if better_flag:
# TODO to support multiple datasets to evaluate
self.save_check_point('best')
break
self.callback_manager.on_valid_end(
eval_res, self.metric_key, self.optimizer, is_better)
dist.barrier()

def close(self):
"""关闭Trainer,销毁进程"""
dist.destroy_process_group()

+ 155
- 131
fastNLP/core/field.py View File

@@ -1,73 +1,91 @@
"""
.. todo::
doc
"""

__all__ = [
"Padder",
"AutoPadder",
"EngChar2DPadder",
]

from numbers import Number
import torch
import numpy as np
from typing import Any
from abc import abstractmethod
from copy import deepcopy
from collections import Counter
from copy import deepcopy
from numbers import Number
from typing import Any

import numpy as np
import torch

from ._logger import logger
from .utils import _is_iterable


class SetInputOrTargetException(Exception):
def __init__(self, msg, index=None, field_name=None):
super().__init__(msg)
self.msg = msg
self.index = index # 标示在哪个数据遭遇到问题了
self.field_name = field_name # 标示当前field的名称
self.field_name = field_name # 标示当前field的名称


class AppendToTargetOrInputException(Exception):
def __init__(self, msg, index=None, field_name=None):
super().__init__(msg)
self.msg = msg
self.index = index # 标示在哪个数据遭遇到问题了
self.field_name = field_name # 标示当前field的名称
self.field_name = field_name # 标示当前field的名称


class FieldArray:
def __init__(self, name, content, is_target=False, is_input=False, padder=None, ignore_type=False):
if len(content)==0:
def __init__(self, name, content, is_target=False, is_input=False, padder=None, ignore_type=False,
use_1st_ins_infer_dim_type=True):
if len(content) == 0:
raise RuntimeError("Empty fieldarray is not allowed.")
_content = content
try:
_content = list(_content)
except BaseException as e:
print(f"Cannot convert content(of type:{type(content)}) into list.")
logger.error(f"Cannot convert content(of type:{type(content)}) into list.")
raise e
self.name = name
self.content = _content
self._ignore_type = ignore_type
# 根据input的情况设置input,target等
self._cell_ndim = None # 多少维度
self._cell_ndim = None # 多少维度, 如果value是1, dim为0; 如果value是[1, 2], dim=2
self.dtype = None # 最内层的element都是什么类型的
self._use_1st_ins_infer_dim_type = bool(use_1st_ins_infer_dim_type)
self._is_input = False
self._is_target = False
if is_input:
self.is_input = is_input
if is_target:
self.is_target = is_target
if padder is None:
padder = AutoPadder(pad_val=0)
else:
assert isinstance(padder, Padder), "padder must be of type fastNLP.Padder."
padder = deepcopy(padder)
self.set_padder(padder)
@property
def ignore_type(self):
return self._ignore_type
@ignore_type.setter
def ignore_type(self, value):
if value:
self._cell_ndim = None
self.dtype = None
self._ignore_type = value
@property
def is_input(self):
return self._is_input
@is_input.setter
def is_input(self, value):
"""
@@ -77,16 +95,16 @@ class FieldArray:
if value is True and \
self._is_target is False and \
self._ignore_type is False:
self._check_dtype_and_ndim()
self._check_dtype_and_ndim(only_check_1st_ins_dim_type=self._use_1st_ins_infer_dim_type)
if value is False and self._is_target is False:
self.dtype = None
self._cell_ndim = None
self._is_input = value
@property
def is_target(self):
return self._is_target
@is_target.setter
def is_target(self, value):
"""
@@ -95,70 +113,82 @@ class FieldArray:
if value is True and \
self._is_input is False and \
self._ignore_type is False:
self._check_dtype_and_ndim()
self._check_dtype_and_ndim(only_check_1st_ins_dim_type=self._use_1st_ins_infer_dim_type)
if value is False and self._is_input is False:
self.dtype = None
self._cell_ndim = None
self._is_target = value
def _check_dtype_and_ndim(self):
def _check_dtype_and_ndim(self, only_check_1st_ins_dim_type=True):
"""
检查当前content所有的element是否是同一个类型,且是否每个元素具有相同的维度。通过的话,设置_cell_ndim与_ele_type属性;没有
通过将直接报错.

:param bool only_check_1st_ins_dim_type: 是否只检查第一个元素的type和dim
:return:
"""
cell_0 = self.content[0]
index = 0
try:
type_0, dim_0 = _get_ele_type_and_dim(cell_0)
for cell in self.content[1:]:
index += 1
type_i, dim_i = _get_ele_type_and_dim(cell)
if type_i!=type_0:
raise SetInputOrTargetException("Type:{} in index {} is different from the first element with type:{}."
".".format(type_i, index, type_0))
if dim_0!=dim_i:
raise SetInputOrTargetException("Dimension:{} in index {} is different from the first element with "
"dimension:{}.".format(dim_i, index, dim_0))
if not only_check_1st_ins_dim_type:
for cell in self.content[1:]:
index += 1
type_i, dim_i = _get_ele_type_and_dim(cell)
if type_i != type_0:
raise SetInputOrTargetException(
"Type:{} in index {} is different from the first element with type:{}."
".".format(type_i, index, type_0))
if dim_0 != dim_i:
raise SetInputOrTargetException(
"Dimension:{} in index {} is different from the first element with "
"dimension:{}.".format(dim_i, index, dim_0))
self._cell_ndim = dim_0
self.dtype = type_0
except SetInputOrTargetException as e:
e.index = index
raise e
def append(self, val:Any):
def append(self, val: Any):
"""
:param val: 把该val append到fieldarray。
:return:
"""
if (self._is_target or self._is_input) and self._ignore_type is False:
if (self._is_target or self._is_input) and self._ignore_type is False and not self._use_1st_ins_infer_dim_type:
type_, dim_ = _get_ele_type_and_dim(val)
if self.dtype!=type_:
if self.dtype != type_:
raise AppendToTargetOrInputException(f"Value(type:{type_}) are of different types with "
f"previous values(type:{self.dtype}).")
if self._cell_ndim!=dim_:
if self._cell_ndim != dim_:
raise AppendToTargetOrInputException(f"Value(dim:{dim_}) are of different dimensions with "
f"previous values(dim:{self._cell_ndim}).")
self.content.append(val)
else:
self.content.append(val)

def pop(self, index):
"""
删除该field中index处的元素
:param int index: 从0开始的数据下标。
:return:
"""
self.content.pop(index)
def __getitem__(self, indices):
return self.get(indices, pad=False)

def __setitem__(self, idx, val):
assert isinstance(idx, int)
if (self._is_target or self._is_input) and self.ignore_type is False: # 需要检测类型
type_, dim_ = _get_ele_type_and_dim(val)
if self.dtype!=type_:
if self.dtype != type_:
raise RuntimeError(f"Value(type:{type_}) are of different types with "
f"other values(type:{self.dtype}).")
if self._cell_ndim!=dim_:
f"other values(type:{self.dtype}).")
if self._cell_ndim != dim_:
raise RuntimeError(f"Value(dim:{dim_}) are of different dimensions with "
f"previous values(dim:{self._cell_ndim}).")
f"previous values(dim:{self._cell_ndim}).")
self.content[idx] = val
def get(self, indices, pad=True):
"""
根据给定的indices返回内容
@@ -171,16 +201,16 @@ class FieldArray:
return self.content[indices]
if self.is_input is False and self.is_target is False:
raise RuntimeError("Please specify either is_input or is_target to True for {}".format(self.name))
contents = [self.content[i] for i in indices]
if self.padder is None or pad is False:
return np.array(contents)
else:
return self.pad(contents)
def pad(self, contents):
return self.padder(contents, field_name=self.name, field_ele_dtype=self.dtype, dim=self._cell_ndim)
def set_padder(self, padder):
"""
设置padder,在这个field进行pad的时候用这个padder进行pad,如果为None则不进行pad。
@@ -192,7 +222,7 @@ class FieldArray:
self.padder = deepcopy(padder)
else:
self.padder = None
def set_pad_val(self, pad_val):
"""
修改padder的pad_val.
@@ -202,7 +232,7 @@ class FieldArray:
if self.padder is not None:
self.padder.set_pad_val(pad_val)
return self
def __len__(self):
"""
Returns the size of FieldArray.
@@ -210,7 +240,7 @@ class FieldArray:
:return int length:
"""
return len(self.content)
def to(self, other):
"""
将other的属性复制给本FieldArray(other必须为FieldArray类型).
@@ -220,15 +250,15 @@ class FieldArray:
:return: :class:`~fastNLP.FieldArray`
"""
assert isinstance(other, FieldArray), "Only supports fastNLP.FieldArray type, not {}.".format(type(other))
self.ignore_type = other.ignore_type
self.is_input = other.is_input
self.is_target = other.is_target
self.padder = other.padder
return self
def split(self, sep:str=None, inplace:bool=True):
def split(self, sep: str = None, inplace: bool = True):
"""
依次对自身的元素使用.split()方法,应该只有当本field的元素为str时,该方法才有用。将返回值

@@ -241,11 +271,11 @@ class FieldArray:
try:
new_contents.append(cell.split(sep))
except Exception as e:
print(f"Exception happens when process value in index {index}.")
logger.error(f"Exception happens when process value in index {index}.")
raise e
return self._after_process(new_contents, inplace=inplace)
def int(self, inplace:bool=True):
def int(self, inplace: bool = True):
"""
将本field中的值调用int(cell). 支持field中内容为以下两种情况(1)['1', '2', ...](即field中每个值为str的),
(2) [['1', '2', ..], ['3', ..], ...](即field中每个值为一个list,list中的值会被依次转换。)
@@ -261,10 +291,10 @@ class FieldArray:
else:
new_contents.append(int(cell))
except Exception as e:
print(f"Exception happens when process value in index {index}.")
print(e)
logger.error(f"Exception happens when process value in index {index}.")
raise e
return self._after_process(new_contents, inplace=inplace)
def float(self, inplace=True):
"""
将本field中的值调用float(cell). 支持field中内容为以下两种情况(1)['1', '2', ...](即field中每个值为str的),
@@ -281,10 +311,10 @@ class FieldArray:
else:
new_contents.append(float(cell))
except Exception as e:
print(f"Exception happens when process value in index {index}.")
logger.error(f"Exception happens when process value in index {index}.")
raise e
return self._after_process(new_contents, inplace=inplace)
def bool(self, inplace=True):
"""
将本field中的值调用bool(cell). 支持field中内容为以下两种情况(1)['1', '2', ...](即field中每个值为str的),
@@ -301,11 +331,11 @@ class FieldArray:
else:
new_contents.append(bool(cell))
except Exception as e:
print(f"Exception happens when process value in index {index}.")
logger.error(f"Exception happens when process value in index {index}.")
raise e
return self._after_process(new_contents, inplace=inplace)
def lower(self, inplace=True):
"""
将本field中的值调用cell.lower(). 支持field中内容为以下两种情况(1)['1', '2', ...](即field中每个值为str的),
@@ -322,10 +352,10 @@ class FieldArray:
else:
new_contents.append(cell.lower())
except Exception as e:
print(f"Exception happens when process value in index {index}.")
logger.error(f"Exception happens when process value in index {index}.")
raise e
return self._after_process(new_contents, inplace=inplace)
def upper(self, inplace=True):
"""
将本field中的值调用cell.lower(). 支持field中内容为以下两种情况(1)['1', '2', ...](即field中每个值为str的),
@@ -342,10 +372,10 @@ class FieldArray:
else:
new_contents.append(cell.upper())
except Exception as e:
print(f"Exception happens when process value in index {index}.")
logger.error(f"Exception happens when process value in index {index}.")
raise e
return self._after_process(new_contents, inplace=inplace)
def value_count(self):
"""
返回该field下不同value的数量。多用于统计label数量
@@ -353,17 +383,18 @@ class FieldArray:
:return: Counter, key是label,value是出现次数
"""
count = Counter()
def cum(cell):
if _is_iterable(cell) and not isinstance(cell, str):
for cell_ in cell:
cum(cell_)
else:
count[cell] += 1
for cell in self.content:
cum(cell)
return count
def _after_process(self, new_contents, inplace):
"""
当调用处理函数之后,决定是否要替换field。
@@ -378,14 +409,14 @@ class FieldArray:
self.is_input = self.is_input
self.is_target = self.is_input
except SetInputOrTargetException as e:
print("The newly generated field cannot be set as input or target.")
logger.error("The newly generated field cannot be set as input or target.")
raise e
return self
else:
return new_contents


def _get_ele_type_and_dim(cell:Any, dim=0):
def _get_ele_type_and_dim(cell: Any, dim=0):
"""
识别cell的类别与dimension的数量

@@ -401,13 +432,13 @@ def _get_ele_type_and_dim(cell:Any, dim=0):
elif isinstance(cell, list):
dim += 1
res = [_get_ele_type_and_dim(cell_i, dim) for cell_i in cell]
types = set([i for i,j in res])
dims = set([j for i,j in res])
if len(types)>1:
types = set([i for i, j in res])
dims = set([j for i, j in res])
if len(types) > 1:
raise SetInputOrTargetException("Mixed types detected: {}.".format(list(types)))
elif len(types)==0:
elif len(types) == 0:
raise SetInputOrTargetException("Empty value encountered.")
if len(dims)>1:
if len(dims) > 1:
raise SetInputOrTargetException("Mixed dimension detected: {}.".format(list(dims)))
return types.pop(), dims.pop()
elif isinstance(cell, torch.Tensor):
@@ -418,55 +449,47 @@ def _get_ele_type_and_dim(cell:Any, dim=0):
# 否则需要继续往下iterate
dim += 1
res = [_get_ele_type_and_dim(cell_i, dim) for cell_i in cell]
types = set([i for i,j in res])
dims = set([j for i,j in res])
if len(types)>1:
types = set([i for i, j in res])
dims = set([j for i, j in res])
if len(types) > 1:
raise SetInputOrTargetException("Mixed types detected: {}.".format(list(types)))
elif len(types)==0:
elif len(types) == 0:
raise SetInputOrTargetException("Empty value encountered.")
if len(dims)>1:
if len(dims) > 1:
raise SetInputOrTargetException("Mixed dimension detected: {}.".format(list(dims)))
return types.pop(), dims.pop()
else: # 包含tuple, set, dict以及其它的类型
else: # 包含tuple, set, dict以及其它的类型
raise SetInputOrTargetException(f"Cannot process type:{type(cell)}.")


def _is_iterable(value):
# 检查是否是iterable的, duck typing
try:
iter(value)
return True
except BaseException as e:
return False


class Padder:
"""
别名::class:`fastNLP.Padder` :class:`fastNLP.core.field.Padder`

所有padder都需要继承这个类,并覆盖__call__方法。
用于对batch进行padding操作。传入的element是inplace的,即直接修改element可能导致数据变化,建议inplace修改之前deepcopy一份。

.. py:function:: __call__(self, contents, field_name, field_ele_dtype):
"""
def __init__(self, pad_val=0, **kwargs):
"""
传入的是List内容。假设有以下的DataSet。

:param List[Any] contents: 传入的element是inplace的,即直接修改element可能导致数据变化,建议inplace修改之前
deepcopy一份。
:param str, field_name: field的名称。
:param np.int64,np.float64,np.str,None, field_ele_dtype: 该field的内层元素的类型。如果该field的ignore_type为True,该这个值为None。
:return: np.array([padded_element])

"""

def __init__(self, pad_val=0, **kwargs):
"""
self.pad_val = pad_val
def set_pad_val(self, pad_val):
self.pad_val = pad_val

def get_pad_val(self):
return self.pad_val

@abstractmethod
def __call__(self, contents, field_name, field_ele_dtype, dim:int):
def __call__(self, contents, field_name, field_ele_dtype, dim: int):
"""
传入的是List内容。假设有以下的DataSet。

@@ -512,8 +535,6 @@ class Padder:

class AutoPadder(Padder):
"""
别名::class:`fastNLP.AutoPadder` :class:`fastNLP.core.field.AutoPadder`

根据contents的数据自动判定是否需要做padding。

1 如果元素类型(元素类型是指field中最里层元素的数据类型, 可以通过FieldArray.dtype查看,比如['This', 'is', ...]的元素类
@@ -533,23 +554,24 @@ class AutoPadder(Padder):

3 其它情况不进行处理,返回一个np.array类型。
"""
def __init__(self, pad_val=0):
super().__init__(pad_val=pad_val)
def __call__(self, contents, field_name, field_ele_dtype, dim):
if field_ele_dtype:
if dim>3:
if dim > 3:
return np.array(contents)
if isinstance(field_ele_dtype, type) and \
(issubclass(field_ele_dtype, np.number) or issubclass(field_ele_dtype, Number)):
if dim==0:
if dim == 0:
array = np.array(contents, dtype=field_ele_dtype)
elif dim==1:
elif dim == 1:
max_len = max(map(len, contents))
array = np.full((len(contents), max_len), self.pad_val, dtype=field_ele_dtype)
for i, content_i in enumerate(contents):
array[i, :len(content_i)] = content_i
elif dim==2:
elif dim == 2:
max_len = max(map(len, contents))
max_word_len = max([max([len(content_ii) for content_ii in content_i]) for
content_i in contents])
@@ -559,20 +581,21 @@ class AutoPadder(Padder):
array[i, j, :len(content_ii)] = content_ii
else:
shape = np.shape(contents)
if len(shape)==4: # 说明各dimension是相同的大小
if len(shape) == 4: # 说明各dimension是相同的大小
array = np.array(contents, dtype=field_ele_dtype)
else:
raise RuntimeError(f"Field:{field_name} has 3 dimensions, every sample should have the same shape.")
raise RuntimeError(
f"Field:{field_name} has 3 dimensions, every sample should have the same shape.")
return array
elif str(field_ele_dtype).startswith('torch'):
if dim==0:
if dim == 0:
tensor = torch.tensor(contents).to(field_ele_dtype)
elif dim==1:
elif dim == 1:
max_len = max(map(len, contents))
tensor = torch.full((len(contents), max_len), fill_value=self.pad_val, dtype=field_ele_dtype)
for i, content_i in enumerate(contents):
tensor[i, :len(content_i)] = torch.tensor(content_i)
elif dim==2:
tensor[i, :len(content_i)] = content_i.clone().detach()
elif dim == 2:
max_len = max(map(len, contents))
max_word_len = max([max([len(content_ii) for content_ii in content_i]) for
content_i in contents])
@@ -580,18 +603,21 @@ class AutoPadder(Padder):
dtype=field_ele_dtype)
for i, content_i in enumerate(contents):
for j, content_ii in enumerate(content_i):
tensor[i, j, :len(content_ii)] = torch.tensor(content_ii)
tensor[i, j, :len(content_ii)] = content_ii.clone().detach()
else:
shapes = set([np.shape(content_i) for content_i in contents])
if len(shapes)>1:
raise RuntimeError(f"Field:{field_name} has 3 dimensions, every sample should have the same shape.")
if len(shapes) > 1:
raise RuntimeError(
f"Field:{field_name} has 3 dimensions, every sample should have the same shape.")
shape = shapes.pop()
if len(shape)==3:
tensor = torch.full([len(contents)]+list(shape), fill_value=self.pad_val, dtype=field_ele_dtype)
if len(shape) == 3:
tensor = torch.full([len(contents)] + list(shape), fill_value=self.pad_val,
dtype=field_ele_dtype)
for i, content_i in enumerate(contents):
tensor[i] = torch.tensor(content_i, dtype=field_ele_dtype)
tensor[i] = content_i.clone().detach().to(field_ele_dtype)
else:
raise RuntimeError(f"Field:{field_name} has 3 dimensions, every sample should have the same shape.")
raise RuntimeError(
f"Field:{field_name} has 3 dimensions, every sample should have the same shape.")
return tensor
else:
return np.array(contents) # 不进行任何操作
@@ -601,8 +627,6 @@ class AutoPadder(Padder):

class EngChar2DPadder(Padder):
"""
别名::class:`fastNLP.EngChar2DPadder` :class:`fastNLP.core.field.EngChar2DPadder`

用于为英语执行character级别的2D padding操作。对应的field内容应该类似[['T', 'h', 'i', 's'], ['a'], ['d', 'e', 'm', 'o']],
但这个Padder只能处理index为int的情况。

@@ -622,7 +646,7 @@ class EngChar2DPadder(Padder):
dataset.set_padder('chars', padder) # chars这个field的设置为了EnChar2DPadder

"""
def __init__(self, pad_val=0, pad_length=0):
"""
:param pad_val: int, pad的位置使用该index
@@ -630,9 +654,9 @@ class EngChar2DPadder(Padder):
都pad或截取到该长度.
"""
super().__init__(pad_val=pad_val)
self.pad_length = pad_length
def __call__(self, contents, field_name, field_ele_dtype, dim):
"""
期望输入类似于
@@ -651,7 +675,7 @@ class EngChar2DPadder(Padder):
raise TypeError('dtype of Field:{} should be np.int64 or np.float64 to do 2D padding, get {}.'.format(
field_name, field_ele_dtype
))
assert dim==2, f"Field:{field_name} has {dim}, EngChar2DPadder only supports input with 2 dimensions."
assert dim == 2, f"Field:{field_name} has {dim}, EngChar2DPadder only supports input with 2 dimensions."
if self.pad_length < 1:
max_char_length = max([max(len(char_lst) for char_lst in word_lst) for word_lst in contents])
else:
@@ -659,12 +683,12 @@ class EngChar2DPadder(Padder):
max_sent_length = max(len(word_lst) for word_lst in contents)
batch_size = len(contents)
dtype = type(contents[0][0][0])
padded_array = np.full((batch_size, max_sent_length, max_char_length), fill_value=self.pad_val,
dtype=dtype)
for b_idx, word_lst in enumerate(contents):
for c_idx, char_lst in enumerate(word_lst):
chars = char_lst[:max_char_length]
padded_array[b_idx, c_idx, :len(chars)] = chars
return padded_array

+ 22
- 13
fastNLP/core/instance.py View File

@@ -1,17 +1,18 @@
"""
instance 模块实现了Instance 类在fastNLP中对应sample。一个sample可以认为是一个Instance类型的对象。
便于理解的例子可以参考文档 :doc:`fastNLP.core.dataset` 中的表格
便于理解的例子可以参考文档 :mod:`fastNLP.core.dataset` 中的表格

"""

__all__ = [
"Instance"
]

from .utils import pretty_table_printer


class Instance(object):
"""
别名::class:`fastNLP.Instance` :class:`fastNLP.core.instance.Instance`

Instance是fastNLP中对应一个sample的类。每个sample在fastNLP中是一个Instance对象。
Instance一般与 :class:`~fastNLP.DataSet` 一起使用, Instance的初始化如下面的Example所示::
@@ -22,11 +23,11 @@ class Instance(object):
>>>ins.add_field("field_3", [3, 3, 3])
>>>ins = Instance(**{'x1': 1, 'x2':np.zeros((3, 4))})
"""
def __init__(self, **fields):
self.fields = fields
def add_field(self, field_name, field):
"""
向Instance中增加一个field
@@ -35,18 +36,26 @@ class Instance(object):
:param Any field: 新增field的内容
"""
self.fields[field_name] = field

def items(self):
"""
返回一个迭代器,迭代器返回两个内容,第一个内容是field_name, 第二个内容是field_value
:return: 一个迭代器
"""
return self.fields.items()

def __contains__(self, item):
return item in self.fields

def __getitem__(self, name):
if name in self.fields:
return self.fields[name]
else:
raise KeyError("{} not found".format(name))
def __setitem__(self, name, field):
return self.add_field(name, field)
def __repr__(self):
s = '\''
return "{" + ",\n".join(
"\'" + field_name + "\': " + str(self.fields[field_name]) + \
f" type={(str(type(self.fields[field_name]))).split(s)[1]}" for field_name in self.fields) + "}"
return str(pretty_table_printer(self))

+ 67
- 102
fastNLP/core/losses.py View File

@@ -11,7 +11,10 @@ __all__ = [
"CrossEntropyLoss",
"BCELoss",
"L1Loss",
"NLLLoss"
"NLLLoss",

"CMRC2018Loss"

]

import inspect
@@ -20,7 +23,6 @@ from collections import defaultdict
import torch
import torch.nn.functional as F

from ..core.const import Const
from .utils import _CheckError
from .utils import _CheckRes
from .utils import _build_args
@@ -28,6 +30,7 @@ from .utils import _check_arg_dict_list
from .utils import _check_function_or_method
from .utils import _get_func_signature
from .utils import seq_len_to_mask
from ..core.const import Const


class LossBase(object):
@@ -166,8 +169,6 @@ class LossBase(object):

class LossFunc(LossBase):
"""
别名::class:`fastNLP.LossFunc` :class:`fastNLP.core.losses.LossFunc`

提供给用户使用自定义损失函数的类

:param func: 用户自行定义的损失函数,应当为一个函数或者callable(func)为True的ojbect
@@ -199,13 +200,15 @@ class LossFunc(LossBase):

class CrossEntropyLoss(LossBase):
"""
别名::class:`fastNLP.CrossEntropyLoss` :class:`fastNLP.core.losses.CrossEntropyLoss`

交叉熵损失函数
:param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred`
:param target: 参数映射表中 `target` 的映射关系,None表示映射关系为 `target` -> `target`
:param seq_len: 句子的长度, 长度之外的token不会计算loss。。
:param seq_len: 句子的长度, 长度之外的token不会计算loss。
:param int class_in_dim: 在序列标注的场景中,pred可能的shape为(batch_size, max_len, num_classes)
或(batch_size, num_classes, max_len), CrossEntropyLoss需要知道哪一维是class的维度以计算loss。如果为-1,就根据pred的第
二维是否等于target的第二维来判断是否需要交换pred的第二维和第三维,因为target的第二维是length的维度,如果这一维度上和pred相等,
那么pred可能第二维也是长度维(存在误判的可能,如果有误判的情况,请显示设置该值)。其它大于0的值则认为该维度是class的维度。
:param padding_idx: padding的index,在计算loss时将忽略target中标号为padding_idx的内容, 可以通过该值代替
传入seq_len.
:param str reduction: 支持 `mean` ,`sum` 和 `none` .
@@ -216,21 +219,25 @@ class CrossEntropyLoss(LossBase):
"""
def __init__(self, pred=None, target=None, seq_len=None, padding_idx=-100, reduction='mean'):
def __init__(self, pred=None, target=None, seq_len=None, class_in_dim=-1, padding_idx=-100, reduction='mean'):
super(CrossEntropyLoss, self).__init__()
self._init_param_map(pred=pred, target=target, seq_len=seq_len)
self.padding_idx = padding_idx
assert reduction in ('mean', 'sum', 'none')
self.reduction = reduction
self.class_in_dim = class_in_dim
def get_loss(self, pred, target, seq_len=None):
if pred.dim() > 2:
if pred.size(1) != target.size(1):
pred = pred.transpose(1, 2)
if self.class_in_dim == -1:
if pred.size(1) != target.size(1): # 有可能顺序替换了
pred = pred.transpose(1, 2)
else:
pred = pred.tranpose(-1, pred)
pred = pred.reshape(-1, pred.size(-1))
target = target.reshape(-1)
if seq_len is not None:
mask = seq_len_to_mask(seq_len).reshape(-1).eq(0)
if seq_len is not None and target.dim()>1:
mask = seq_len_to_mask(seq_len, max_len=target.size(1)).reshape(-1).eq(0)
target = target.masked_fill(mask, self.padding_idx)

return F.cross_entropy(input=pred, target=target,
@@ -239,8 +246,6 @@ class CrossEntropyLoss(LossBase):

class L1Loss(LossBase):
"""
别名::class:`fastNLP.L1Loss` :class:`fastNLP.core.losses.L1Loss`

L1损失函数
:param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred`
@@ -261,8 +266,6 @@ class L1Loss(LossBase):

class BCELoss(LossBase):
"""
别名::class:`fastNLP.BCELoss` :class:`fastNLP.core.losses.BCELoss`

二分类交叉熵损失函数
:param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred`
@@ -282,18 +285,18 @@ class BCELoss(LossBase):

class NLLLoss(LossBase):
"""
别名::class:`fastNLP.NLLLoss` :class:`fastNLP.core.losses.NLLLoss`
负对数似然损失函数
:param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred`
:param target: 参数映射表中 `target` 的映射关系,None表示映射关系为 `target` -> `target`
:param ignore_idx: ignore的index,在计算loss时将忽略target中标号为ignore_idx的内容, 可以通过该值代替
传入seq_len.
:param str reduction: 支持 `mean` ,`sum` 和 `none` .
"""
def __init__(self, pred=None, target=None, ignore_idx=-100, reduction='mean'):
"""
:param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred`
:param target: 参数映射表中 `target` 的映射关系,None表示映射关系为 `target` -> `target`
:param ignore_idx: ignore的index,在计算loss时将忽略target中标号为ignore_idx的内容, 可以通过该值代替
传入seq_len.
:param str reduction: 支持 `mean` ,`sum` 和 `none` .
"""
super(NLLLoss, self).__init__()
self._init_param_map(pred=pred, target=target)
assert reduction in ('mean', 'sum', 'none')
@@ -306,14 +309,14 @@ class NLLLoss(LossBase):

class LossInForward(LossBase):
"""
别名::class:`fastNLP.LossInForward` :class:`fastNLP.core.losses.LossInForward`

从forward()函数返回结果中获取loss
:param str loss_key: 在forward函数中loss的键名,默认为loss
"""
def __init__(self, loss_key=Const.LOSS):
"""
:param str loss_key: 在forward函数中loss的键名,默认为loss
"""
super().__init__()
if not isinstance(loss_key, str):
raise TypeError(f"Only str allowed for loss_key, got {type(loss_key)}.")
@@ -344,90 +347,52 @@ class LossInForward(LossBase):
return loss


def _prepare_losser(losser):
if losser is None:
losser = LossInForward()
return losser
elif isinstance(losser, LossBase):
return losser
else:
raise TypeError(f"Type of loss should be `fastNLP.LossBase`, got {type(losser)}")


def squash(predict, truth, **kwargs):
"""To reshape tensors in order to fit loss functions in PyTorch.

:param predict: Tensor, model output
:param truth: Tensor, truth from dataset
:param kwargs: extra arguments
:return predict , truth: predict & truth after processing
class CMRC2018Loss(LossBase):
"""
return predict.view(-1, predict.size()[-1]), truth.view(-1, )


def unpad(predict, truth, **kwargs):
"""To process padded sequence output to get true loss.
用于计算CMRC2018中文问答任务。

:param predict: Tensor, [batch_size , max_len , tag_size]
:param truth: Tensor, [batch_size , max_len]
:param kwargs: kwargs["lens"] is a list or LongTensor, with size [batch_size]. The i-th element is true lengths of i-th sequence.

:return predict , truth: predict & truth after processing
"""
if kwargs.get("lens") is None:
return predict, truth
lens = torch.LongTensor(kwargs["lens"])
lens, idx = torch.sort(lens, descending=True)
predict = torch.nn.utils.rnn.pack_padded_sequence(predict[idx], lens, batch_first=True).data
truth = torch.nn.utils.rnn.pack_padded_sequence(truth[idx], lens, batch_first=True).data
return predict, truth

def __init__(self, target_start=None, target_end=None, context_len=None, pred_start=None, pred_end=None,
reduction='mean'):
super().__init__()

def unpad_mask(predict, truth, **kwargs):
"""To process padded sequence output to get true loss.
assert reduction in ('mean', 'sum')

:param predict: Tensor, [batch_size , max_len , tag_size]
:param truth: Tensor, [batch_size , max_len]
:param kwargs: kwargs["lens"] is a list or LongTensor, with size [batch_size]. The i-th element is true lengths of i-th sequence.
self._init_param_map(target_start=target_start, target_end=target_end, context_len=context_len,
pred_start=pred_start, pred_end=pred_end)
self.reduction = reduction

:return predict , truth: predict & truth after processing
"""
if kwargs.get("lens") is None:
return predict, truth
mas = make_mask(kwargs["lens"], truth.size()[1])
return mask(predict, truth, mask=mas)
def get_loss(self, target_start, target_end, context_len, pred_start, pred_end):
"""

:param target_start: batch_size
:param target_end: batch_size
:param context_len: batch_size
:param pred_start: batch_size x max_len
:param pred_end: batch_size x max_len
:return:
"""
batch_size, max_len = pred_end.size()
mask = seq_len_to_mask(context_len, max_len).eq(0)

def mask(predict, truth, **kwargs):
"""To select specific elements from Tensor. This method calls ``squash()``.
pred_start = pred_start.masked_fill(mask, float('-inf'))
pred_end = pred_end.masked_fill(mask, float('-inf'))

:param predict: Tensor, [batch_size , max_len , tag_size]
:param truth: Tensor, [batch_size , max_len]
:param kwargs: extra arguments, kwargs["mask"]: ByteTensor, [batch_size , max_len], the mask Tensor. The position that is 1 will be selected.
start_loss = F.cross_entropy(pred_start, target_start, reduction='sum')
end_loss = F.cross_entropy(pred_end, target_end, reduction='sum')

:return predict , truth: predict & truth after processing
"""
if kwargs.get("mask") is None:
return predict, truth
mask = kwargs["mask"]
predict, truth = squash(predict, truth)
mask = mask.view(-1, )
predict = torch.masked_select(predict.permute(1, 0), mask).view(predict.size()[-1], -1).permute(1, 0)
truth = torch.masked_select(truth, mask)
return predict, truth
loss = start_loss + end_loss

if self.reduction == 'mean':
loss = loss / batch_size

def make_mask(lens, tar_len):
"""To generate a mask over a sequence.
return loss/2

:param lens: list or LongTensor, [batch_size]
:param tar_len: int
:return mask: ByteTensor
"""
lens = torch.LongTensor(lens)
mask = [torch.ge(lens, i + 1) for i in range(tar_len)]
mask = torch.stack(mask, 1)
return mask
def _prepare_losser(losser):
if losser is None:
losser = LossInForward()
return losser
elif isinstance(losser, LossBase):
return losser
else:
raise TypeError(f"Type of loss should be `fastNLP.LossBase`, got {type(losser)}")

+ 299
- 247
fastNLP/core/metrics.py View File

@@ -6,11 +6,15 @@ __all__ = [
"MetricBase",
"AccuracyMetric",
"SpanFPreRecMetric",
"ExtractiveQAMetric"
"CMRC2018Metric"
]

import inspect
import warnings
from abc import abstractmethod
from collections import defaultdict
from typing import Union
import re

import numpy as np
import torch
@@ -22,7 +26,6 @@ from .utils import _check_arg_dict_list
from .utils import _get_func_signature
from .utils import seq_len_to_mask
from .vocabulary import Vocabulary
from abc import abstractmethod


class MetricBase(object):
@@ -114,10 +117,11 @@ class MetricBase(object):
self.get_metric将统计当前的评价指标并返回评价结果, 返回值需要是一个dict, key是指标名称,value是指标的值

"""
def __init__(self):
self._param_map = {} # key is param in function, value is input param.
self._checked = False
self._metric_name = self.__class__.__name__

@property
def param_map(self):
@@ -135,7 +139,25 @@ class MetricBase(object):
@abstractmethod
def get_metric(self, reset=True):
raise NotImplemented

def set_metric_name(self, name: str):
"""
设置metric的名称,默认是Metric的class name.

:param str name:
:return: self
"""
self._metric_name = name
return self

def get_metric_name(self):
"""
返回metric的名称
:return:
"""
return self._metric_name

def _init_param_map(self, key_map=None, **kwargs):
"""检查key_map和其他参数map,并将这些映射关系添加到self._param_map

@@ -168,7 +190,7 @@ class MetricBase(object):
for value, key_set in value_counter.items():
if len(key_set) > 1:
raise ValueError(f"Several parameters:{key_set} are provided with one output {value}.")
# check consistence between signature and _param_map
func_spect = inspect.getfullargspec(self.evaluate)
func_args = [arg for arg in func_spect.args if arg != 'self']
@@ -177,7 +199,7 @@ class MetricBase(object):
raise NameError(
f"Parameter `{func_param}` is not in {_get_func_signature(self.evaluate)}. Please check the "
f"initialization parameters, or change its signature.")
def _fast_param_map(self, pred_dict, target_dict):
"""Only used as inner function. When the pred_dict, target is unequivocal. Don't need users to pass key_map.
such as pred_dict has one element, target_dict has one element
@@ -192,7 +214,7 @@ class MetricBase(object):
fast_param['target'] = list(target_dict.values())[0]
return fast_param
return fast_param
def __call__(self, pred_dict, target_dict):
"""
这个方法会调用self.evaluate 方法.
@@ -207,12 +229,12 @@ class MetricBase(object):
:param target_dict: DataSet.batch_y里的键-值对所组成的dict(即is_target=True的fields的内容)
:return:
"""
fast_param = self._fast_param_map(pred_dict, target_dict)
if fast_param:
self.evaluate(**fast_param)
return
if not self._checked:
if not callable(self.evaluate):
raise TypeError(f"{self.__class__.__name__}.evaluate has to be callable, not {type(self.evaluate)}.")
@@ -222,14 +244,14 @@ class MetricBase(object):
for func_arg, input_arg in self._param_map.items():
if func_arg not in func_args:
raise NameError(f"`{func_arg}` not in {_get_func_signature(self.evaluate)}.")
# 2. only part of the _param_map are passed, left are not
for arg in func_args:
if arg not in self._param_map:
self._param_map[arg] = arg # This param does not need mapping.
self._evaluate_args = func_args
self._reverse_param_map = {input_arg: func_arg for func_arg, input_arg in self._param_map.items()}
# need to wrap inputs in dict.
mapped_pred_dict = {}
mapped_target_dict = {}
@@ -238,7 +260,7 @@ class MetricBase(object):
mapped_pred_dict[mapped_arg] = pred_dict[input_arg]
if input_arg in target_dict:
mapped_target_dict[mapped_arg] = target_dict[input_arg]
# missing
if not self._checked:
duplicated = []
@@ -253,47 +275,46 @@ class MetricBase(object):
for idx, func_arg in enumerate(missing):
# Don't delete `` in this information, nor add ``
replaced_missing[idx] = f"{self._param_map[func_arg]}" + f"(assign to `{func_arg}` " \
f"in `{self.__class__.__name__}`)"
f"in `{self.__class__.__name__}`)"
check_res = _CheckRes(missing=replaced_missing,
unused=check_res.unused,
duplicated=duplicated,
required=check_res.required,
all_needed=check_res.all_needed,
varargs=check_res.varargs)
if check_res.missing or check_res.duplicated:
raise _CheckError(check_res=check_res,
func_signature=_get_func_signature(self.evaluate))
self._checked = True
refined_args = _build_args(self.evaluate, **mapped_pred_dict, **mapped_target_dict)
self.evaluate(**refined_args)
return


class AccuracyMetric(MetricBase):
"""
别名::class:`fastNLP.AccuracyMetric` :class:`fastNLP.core.metrics.AccuracyMetric`

准确率Metric(其它的Metric参见 :doc:`fastNLP.core.metrics` )
:param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred`
:param target: 参数映射表中 `target` 的映射关系,None表示映射关系为 `target` -> `target`
:param seq_len: 参数映射表中 `seq_len` 的映射关系,None表示映射关系为 `seq_len` -> `seq_len`
准确率Metric(其它的Metric参见 :mod:`fastNLP.core.metrics` )
"""

def __init__(self, pred=None, target=None, seq_len=None):
"""
:param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred`
:param target: 参数映射表中 `target` 的映射关系,None表示映射关系为 `target` -> `target`
:param seq_len: 参数映射表中 `seq_len` 的映射关系,None表示映射关系为 `seq_len` -> `seq_len`
"""

super().__init__()
self._init_param_map(pred=pred, target=target, seq_len=seq_len)
self.total = 0
self.acc_count = 0
def evaluate(self, pred, target, seq_len=None):
"""
evaluate函数将针对一个批次的预测结果做评价指标的累计
@@ -313,25 +334,28 @@ class AccuracyMetric(MetricBase):
if not isinstance(target, torch.Tensor):
raise TypeError(f"`target` in {_get_func_signature(self.evaluate)} must be torch.Tensor,"
f"got {type(target)}.")
if seq_len is not None and not isinstance(seq_len, torch.Tensor):
raise TypeError(f"`seq_lens` in {_get_func_signature(self.evaluate)} must be torch.Tensor,"
f"got {type(seq_len)}.")
if seq_len is not None:
masks = seq_len_to_mask(seq_len=seq_len)

if seq_len is not None and target.dim() > 1:
max_len = target.size(1)
masks = seq_len_to_mask(seq_len=seq_len, max_len=max_len)
else:
masks = None
if pred.size() == target.size():
if pred.dim() == target.dim():
pass
elif len(pred.size()) == len(target.size()) + 1:
elif pred.dim() == target.dim() + 1:
pred = pred.argmax(dim=-1)
if seq_len is None and target.dim() > 1:
warnings.warn("You are not passing `seq_len` to exclude pad when calculate accuracy.")
else:
raise RuntimeError(f"In {_get_func_signature(self.evaluate)}, when pred have "
f"size:{pred.size()}, target should have size: {pred.size()} or "
f"{pred.size()[:-1]}, got {target.size()}.")
target = target.to(pred)
if masks is not None:
self.acc_count += torch.sum(torch.eq(pred, target).masked_fill(masks.eq(0), 0)).item()
@@ -339,7 +363,7 @@ class AccuracyMetric(MetricBase):
else:
self.acc_count += torch.sum(torch.eq(pred, target)).item()
self.total += np.prod(list(pred.size()))
def get_metric(self, reset=True):
"""
get_metric函数将根据evaluate函数累计的评价指标统计量来计算最终的评价结果.
@@ -358,13 +382,14 @@ def _bmes_tag_to_spans(tags, ignore_labels=None):
"""
给定一个tags的lis,比如['S-song', 'B-singer', 'M-singer', 'E-singer', 'S-moive', 'S-actor']。
返回[('song', (0, 1)), ('singer', (1, 4)), ('moive', (4, 5)), ('actor', (5, 6))] (左闭右开区间)
也可以是单纯的['S', 'B', 'M', 'E', 'B', 'M', 'M',...]序列

:param tags: List[str],
:param ignore_labels: List[str], 在该list中的label将被忽略
:return: List[Tuple[str, List[int, int]]]. [(label,[start, end])]
"""
ignore_labels = set(ignore_labels) if ignore_labels else set()
spans = []
prev_bmes_tag = None
for idx, tag in enumerate(tags):
@@ -393,7 +418,7 @@ def _bmeso_tag_to_spans(tags, ignore_labels=None):
:return: List[Tuple[str, List[int, int]]]. [(label,[start, end])]
"""
ignore_labels = set(ignore_labels) if ignore_labels else set()
spans = []
prev_bmes_tag = None
for idx, tag in enumerate(tags):
@@ -455,7 +480,7 @@ def _bio_tag_to_spans(tags, ignore_labels=None):
:return: List[Tuple[str, List[int, int]]]. [(label,[start, end])]
"""
ignore_labels = set(ignore_labels) if ignore_labels else set()
spans = []
prev_bio_tag = None
for idx, tag in enumerate(tags):
@@ -473,10 +498,75 @@ def _bio_tag_to_spans(tags, ignore_labels=None):
return [(span[0], (span[1][0], span[1][1] + 1)) for span in spans if span[0] not in ignore_labels]


def _get_encoding_type_from_tag_vocab(tag_vocab: Union[Vocabulary, dict]) -> str:
"""
给定Vocabulary自动判断是哪种类型的encoding, 支持判断bmes, bioes, bmeso, bio

:param tag_vocab: 支持传入tag Vocabulary; 或者传入形如{0:"O", 1:"B-tag1"},即index在前,tag在后的dict。
:return:
"""
tag_set = set()
unk_token = '<unk>'
pad_token = '<pad>'
if isinstance(tag_vocab, Vocabulary):
unk_token = tag_vocab.unknown
pad_token = tag_vocab.padding
tag_vocab = tag_vocab.idx2word
for idx, tag in tag_vocab.items():
if tag in (unk_token, pad_token):
continue
tag = tag[:1].lower()
tag_set.add(tag)

bmes_tag_set = set('bmes')
if tag_set == bmes_tag_set:
return 'bmes'
bio_tag_set = set('bio')
if tag_set == bio_tag_set:
return 'bio'
bmeso_tag_set = set('bmeso')
if tag_set == bmeso_tag_set:
return 'bmeso'
bioes_tag_set = set('bioes')
if tag_set == bioes_tag_set:
return 'bioes'
raise RuntimeError("encoding_type cannot be inferred automatically. Only support "
"'bio', 'bmes', 'bmeso', 'bioes' type.")


def _check_tag_vocab_and_encoding_type(tag_vocab: Union[Vocabulary, dict], encoding_type: str):
"""
检查vocab中的tag是否与encoding_type是匹配的

:param tag_vocab: 支持传入tag Vocabulary; 或者传入形如{0:"O", 1:"B-tag1"},即index在前,tag在后的dict。
:param encoding_type: bio, bmes, bioes, bmeso
:return:
"""
tag_set = set()
unk_token = '<unk>'
pad_token = '<pad>'
if isinstance(tag_vocab, Vocabulary):
unk_token = tag_vocab.unknown
pad_token = tag_vocab.padding
tag_vocab = tag_vocab.idx2word
for idx, tag in tag_vocab.items():
if tag in (unk_token, pad_token):
continue
tag = tag[:1].lower()
tag_set.add(tag)

tags = encoding_type
for tag in tag_set:
assert tag in tags, f"{tag} is not a valid tag in encoding type:{encoding_type}. Please check your " \
f"encoding_type."
tags = tags.replace(tag, '') # 删除该值
if tags: # 如果不为空,说明出现了未使用的tag
warnings.warn(f"Tag:{tags} in encoding type:{encoding_type} is not presented in your Vocabulary. Check your "
"encoding_type.")


class SpanFPreRecMetric(MetricBase):
r"""
别名::class:`fastNLP.SpanFPreRecMetric` :class:`fastNLP.core.metrics.SpanFPreRecMetric`

在序列标注问题中,以span的方式计算F, pre, rec.
比如中文Part of speech中,会以character的方式进行标注,句子 `中国在亚洲` 对应的POS可能为(以BMES为例)
['B-NN', 'E-NN', 'S-DET', 'B-NN', 'E-NN']。该metric就是为类似情况下的F1计算。
@@ -499,34 +589,36 @@ class SpanFPreRecMetric(MetricBase):
'rec-label':xxx,
...
}

:param tag_vocab: 标签的 :class:`~fastNLP.Vocabulary` 。支持的标签为"B"(没有label);或"B-xxx"(xxx为某种label,比如POS中的NN),
在解码时,会将相同xxx的认为是同一个label,比如['B-NN', 'E-NN']会被合并为一个'NN'.
:param str pred: 用该key在evaluate()时从传入dict中取出prediction数据。 为None,则使用 `pred` 取数据
:param str target: 用该key在evaluate()时从传入dict中取出target数据。 为None,则使用 `target` 取数据
:param str seq_len: 用该key在evaluate()时从传入dict中取出sequence length数据。为None,则使用 `seq_len` 取数据。
:param str encoding_type: 目前支持bio, bmes, bmeso, bioes
:param list ignore_labels: str 组成的list. 这个list中的class不会被用于计算。例如在POS tagging时传入['NN'],则不会计算'NN'这
个label
:param bool only_gross: 是否只计算总的f1, precision, recall的值;如果为False,不仅返回总的f1, pre, rec, 还会返回每个
label的f1, pre, rec
:param str f_type: `micro` 或 `macro` . `micro` :通过先计算总体的TP,FN和FP的数量,再计算f, precision, recall; `macro` :
分布计算每个类别的f, precision, recall,然后做平均(各类别f的权重相同)
:param float beta: f_beta分数, :math:`f_{beta} = \frac{(1 + {beta}^{2})*(pre*rec)}{({beta}^{2}*pre + rec)}` .
常用为beta=0.5, 1, 2. 若为0.5则精确率的权重高于召回率;若为1,则两者平等;若为2,则召回率权重高于精确率。
"""
def __init__(self, tag_vocab, pred=None, target=None, seq_len=None, encoding_type='bio', ignore_labels=None,

def __init__(self, tag_vocab, pred=None, target=None, seq_len=None, encoding_type=None, ignore_labels=None,
only_gross=True, f_type='micro', beta=1):
encoding_type = encoding_type.lower()
r"""
:param tag_vocab: 标签的 :class:`~fastNLP.Vocabulary` 。支持的标签为"B"(没有label);或"B-xxx"(xxx为某种label,比如POS中的NN),
在解码时,会将相同xxx的认为是同一个label,比如['B-NN', 'E-NN']会被合并为一个'NN'.
:param str pred: 用该key在evaluate()时从传入dict中取出prediction数据。 为None,则使用 `pred` 取数据
:param str target: 用该key在evaluate()时从传入dict中取出target数据。 为None,则使用 `target` 取数据
:param str seq_len: 用该key在evaluate()时从传入dict中取出sequence length数据。为None,则使用 `seq_len` 取数据。
:param str encoding_type: 目前支持bio, bmes, bmeso, bioes。默认为None,通过tag_vocab自动判断.
:param list ignore_labels: str 组成的list. 这个list中的class不会被用于计算。例如在POS tagging时传入['NN'],则不会计算'NN'个label
:param bool only_gross: 是否只计算总的f1, precision, recall的值;如果为False,不仅返回总的f1, pre, rec, 还会返回每个label的f1, pre, rec
:param str f_type: `micro` 或 `macro` . `micro` :通过先计算总体的TP,FN和FP的数量,再计算f, precision, recall; `macro` : 分布计算每个类别的f, precision, recall,然后做平均(各类别f的权重相同)
:param float beta: f_beta分数, :math:`f_{beta} = \frac{(1 + {beta}^{2})*(pre*rec)}{({beta}^{2}*pre + rec)}` . 常用为 `beta=0.5, 1, 2` 若为0.5则精确率的权重高于召回率;若为1,则两者平等;若为2,则召回率权重高于精确率。
"""

if not isinstance(tag_vocab, Vocabulary):
raise TypeError("tag_vocab can only be fastNLP.Vocabulary, not {}.".format(type(tag_vocab)))
if f_type not in ('micro', 'macro'):
raise ValueError("f_type only supports `micro` or `macro`', got {}.".format(f_type))
self.encoding_type = encoding_type

if encoding_type:
encoding_type = encoding_type.lower()
_check_tag_vocab_and_encoding_type(tag_vocab, encoding_type)
self.encoding_type = encoding_type
else:
self.encoding_type = _get_encoding_type_from_tag_vocab(tag_vocab)

if self.encoding_type == 'bmes':
self.tag_to_span_func = _bmes_tag_to_spans
elif self.encoding_type == 'bio':
@@ -536,23 +628,23 @@ class SpanFPreRecMetric(MetricBase):
elif self.encoding_type == 'bioes':
self.tag_to_span_func = _bioes_tag_to_spans
else:
raise ValueError("Only support 'bio', 'bmes', 'bmeso' type.")
raise ValueError("Only support 'bio', 'bmes', 'bmeso', 'bioes' type.")
self.ignore_labels = ignore_labels
self.f_type = f_type
self.beta = beta
self.beta_square = self.beta ** 2
self.only_gross = only_gross
super().__init__()
self._init_param_map(pred=pred, target=target, seq_len=seq_len)
self.tag_vocab = tag_vocab
self._true_positives = defaultdict(int)
self._false_positives = defaultdict(int)
self._false_negatives = defaultdict(int)
def evaluate(self, pred, target, seq_len):
"""evaluate函数将针对一个批次的预测结果做评价指标的累计

@@ -567,11 +659,11 @@ class SpanFPreRecMetric(MetricBase):
if not isinstance(target, torch.Tensor):
raise TypeError(f"`target` in {_get_func_signature(self.evaluate)} must be torch.Tensor,"
f"got {type(target)}.")
if not isinstance(seq_len, torch.Tensor):
raise TypeError(f"`seq_lens` in {_get_func_signature(self.evaluate)} must be torch.Tensor,"
f"got {type(seq_len)}.")
if pred.size() == target.size() and len(target.size()) == 2:
pass
elif len(pred.size()) == len(target.size()) + 1 and len(target.size()) == 2:
@@ -584,20 +676,20 @@ class SpanFPreRecMetric(MetricBase):
raise RuntimeError(f"In {_get_func_signature(self.evaluate)}, when pred have "
f"size:{pred.size()}, target should have size: {pred.size()} or "
f"{pred.size()[:-1]}, got {target.size()}.")
batch_size = pred.size(0)
pred = pred.tolist()
target = target.tolist()
for i in range(batch_size):
pred_tags = pred[i][:int(seq_len[i])]
gold_tags = target[i][:int(seq_len[i])]
pred_str_tags = [self.tag_vocab.to_word(tag) for tag in pred_tags]
gold_str_tags = [self.tag_vocab.to_word(tag) for tag in gold_tags]
pred_spans = self.tag_to_span_func(pred_str_tags, ignore_labels=self.ignore_labels)
gold_spans = self.tag_to_span_func(gold_str_tags, ignore_labels=self.ignore_labels)
for span in pred_spans:
if span in gold_spans:
self._true_positives[span[0]] += 1
@@ -606,7 +698,7 @@ class SpanFPreRecMetric(MetricBase):
self._false_positives[span[0]] += 1
for span in gold_spans:
self._false_negatives[span[0]] += 1
def get_metric(self, reset=True):
"""get_metric函数将根据evaluate函数累计的评价指标统计量来计算最终的评价结果."""
evaluate_result = {}
@@ -624,7 +716,7 @@ class SpanFPreRecMetric(MetricBase):
f, pre, rec = self._compute_f_pre_rec(tp, fn, fp)
f_sum += f
pre_sum += pre
rec_sum + rec
rec_sum += rec
if not self.only_gross and tag != '':  # tag != '' guards against the case of no tag
f_key = 'f-{}'.format(tag)
pre_key = 'pre-{}'.format(tag)
@@ -632,12 +724,12 @@ class SpanFPreRecMetric(MetricBase):
evaluate_result[f_key] = f
evaluate_result[pre_key] = pre
evaluate_result[rec_key] = rec
if self.f_type == 'macro':
evaluate_result['f'] = f_sum / len(tags)
evaluate_result['pre'] = pre_sum / len(tags)
evaluate_result['rec'] = rec_sum / len(tags)
if self.f_type == 'micro':
f, pre, rec = self._compute_f_pre_rec(sum(self._true_positives.values()),
sum(self._false_negatives.values()),
@@ -645,17 +737,17 @@ class SpanFPreRecMetric(MetricBase):
evaluate_result['f'] = f
evaluate_result['pre'] = pre
evaluate_result['rec'] = rec
if reset:
self._true_positives = defaultdict(int)
self._false_positives = defaultdict(int)
self._false_negatives = defaultdict(int)
for key, value in evaluate_result.items():
evaluate_result[key] = round(value, 6)
return evaluate_result
def _compute_f_pre_rec(self, tp, fn, fp):
"""

@@ -667,7 +759,7 @@ class SpanFPreRecMetric(MetricBase):
pre = tp / (fp + tp + 1e-13)
rec = tp / (fn + tp + 1e-13)
f = (1 + self.beta_square) * pre * rec / (self.beta_square * pre + rec + 1e-13)
return f, pre, rec
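
# [illustrative sketch, not part of this diff] A standalone illustration of the
# micro/macro averaging that get_metric performs, reusing the same
# epsilon-smoothed formulas as _compute_f_pre_rec above; the per-label counts
# are hypothetical.
def _f_pre_rec(tp, fn, fp, beta_square=1.0):
    pre = tp / (fp + tp + 1e-13)
    rec = tp / (fn + tp + 1e-13)
    f = (1 + beta_square) * pre * rec / (beta_square * pre + rec + 1e-13)
    return f, pre, rec

tp = {'PER': 8, 'LOC': 2}  # true positives per label
fp = {'PER': 2, 'LOC': 2}  # false positives per label
fn = {'PER': 2, 'LOC': 6}  # false negatives per label

# micro: pool the counts over all labels first, then compute one f/pre/rec
micro_f, micro_pre, micro_rec = _f_pre_rec(sum(tp.values()), sum(fn.values()), sum(fp.values()))
# macro: compute f/pre/rec per label, then average with equal label weights
per_label = [_f_pre_rec(tp[t], fn[t], fp[t]) for t in tp]
macro_f, macro_pre, macro_rec = (sum(v) / len(v) for v in zip(*per_label))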


@@ -736,169 +828,129 @@ def _pred_topk(y_prob, k=1):
return y_pred_topk, y_prob_topk


class ExtractiveQAMetric(MetricBase):
r"""
Alias: :class:`fastNLP.ExtractiveQAMetric` :class:`fastNLP.core.metrics.ExtractiveQAMetric`
class CMRC2018Metric(MetricBase):
def __init__(self, answers=None, raw_chars=None, context_len=None, pred_start=None, pred_end=None):
super().__init__()
self._init_param_map(answers=answers, raw_chars=raw_chars, context_len=context_len, pred_start=pred_start,
pred_end=pred_end)
self.em = 0
self.total = 0
self.f1 = 0

Metric for extractive QA (e.g. SQuAD).
:param pred1: the mapping for `pred1` in the parameter map; None means the mapping is `pred1` -> `pred1`
:param pred2: the mapping for `pred2` in the parameter map; None means the mapping is `pred2` -> `pred2`
:param target1: the mapping for `target1` in the parameter map; None means the mapping is `target1` -> `target1`
:param target2: the mapping for `target2` in the parameter map; None means the mapping is `target2` -> `target2`
:param float beta: the f_beta score, :math:`f_{beta} = \frac{(1 + {beta}^{2})*(pre*rec)}{({beta}^{2}*pre + rec)}` .
Common values are beta=0.5, 1, 2. With 0.5 precision outweighs recall; with 1 they are weighted equally; with 2 recall outweighs precision.
:param bool right_open: if True, the start and end pointers mark a half-open interval [start, end); if False, a closed interval [start, end].
:param bool print_predict_stat: if True, print statistics on whether the predicted and gold answers are empty; if False, print nothing
"""
def __init__(self, pred1=None, pred2=None, target1=None, target2=None,
beta=1, right_open=True, print_predict_stat=False):
super(ExtractiveQAMetric, self).__init__()
self._init_param_map(pred1=pred1, pred2=pred2, target1=target1, target2=target2)
self.print_predict_stat = print_predict_stat
self.no_ans_correct = 0
self.no_ans_wrong = 0
self.has_ans_correct = 0
self.has_ans_wrong = 0
self.has_ans_f = 0.
self.no2no = 0
self.no2yes = 0
self.yes2no = 0
self.yes2yes = 0
self.f_beta = beta
self.right_open = right_open
def evaluate(self, pred1, pred2, target1, target2):
"""evaluate函数将针对一个批次的预测结果做评价指标的累计
def evaluate(self, answers, raw_chars, context_len, pred_start, pred_end):
"""

:param pred1: [batch] or [batch, seq_len], index where the predicted answer starts; 0 if the answer is empty as in SQuAD 2.0
:param pred2: [batch] or [batch, seq_len], index where the predicted answer ends; if the answer is empty as in SQuAD 2.0, it is 0 (closed interval) or 1 (half-open interval)
:param target1: [batch], index where the gold answer starts; 0 if the answer is empty as in SQuAD 2.0
:param target2: [batch], index where the gold answer ends; if the answer is empty as in SQuAD 2.0, it is 0 (closed interval) or 1 (half-open interval)
:return: None
:param list[str] answers: e.g. [["answer 1", "answer 2", "answer 3"], [...], ...]
:param list[str] raw_chars: [["这", "是", ...], [...]]
:param tensor context_len: lengths of the contexts, shape batch_size
:param tensor pred_start: batch_size x length
:param tensor pred_end: batch_size x length
:return:
"""
pred_start = pred1
pred_end = pred2
target_start = target1
target_end = target2
if len(pred_start.size()) == 2:
start_inference = pred_start.max(dim=-1)[1].cpu().tolist()
else:
start_inference = pred_start.cpu().tolist()
if len(pred_end.size()) == 2:
end_inference = pred_end.max(dim=-1)[1].cpu().tolist()
else:
end_inference = pred_end.cpu().tolist()
start, end = [], []
max_len = pred_start.size(1)
t_start = target_start.cpu().tolist()
t_end = target_end.cpu().tolist()
for s, e in zip(start_inference, end_inference):
start.append(min(s, e))
end.append(max(s, e))
for s, e, ts, te in zip(start, end, t_start, t_end):
if not self.right_open:
e += 1
te += 1
if ts == 0 and te == int(not self.right_open):
if s == 0 and e == int(not self.right_open):
self.no_ans_correct += 1
self.no2no += 1
else:
self.no_ans_wrong += 1
self.no2yes += 1
else:
if s == 0 and e == int(not self.right_open):
self.yes2no += 1
else:
self.yes2yes += 1
if s == ts and e == te:
self.has_ans_correct += 1
else:
self.has_ans_wrong += 1
a = [0] * s + [1] * (e - s) + [0] * (max_len - e)
b = [0] * ts + [1] * (te - ts) + [0] * (max_len - te)
a, b = torch.tensor(a), torch.tensor(b)
TP = int(torch.sum(a * b))
pre = TP / int(torch.sum(a)) if int(torch.sum(a)) > 0 else 0
rec = TP / int(torch.sum(b)) if int(torch.sum(b)) > 0 else 0
if pre + rec > 0:
f = (1 + (self.f_beta ** 2)) * pre * rec / ((self.f_beta ** 2) * pre + rec)
else:
f = 0
self.has_ans_f += f
batch_size, max_len = pred_start.size()
context_mask = seq_len_to_mask(context_len, max_len=max_len).eq(0)
pred_start.masked_fill_(context_mask, float('-inf'))
pred_end.masked_fill_(context_mask, float('-inf'))
max_pred_start, pred_start_index = pred_start.max(dim=-1, keepdim=True)  # batch_size x 1
pred_start_mask = pred_start.eq(max_pred_start).cumsum(dim=-1).eq(0)  # positions before the predicted start are masked; the end may only be at or after the start
pred_end.masked_fill_(pred_start_mask, float('-inf'))
pred_end_index = pred_end.argmax(dim=-1) + 1
pred_ans = []
for index, (start, end) in enumerate(zip(pred_start_index.flatten().tolist(), pred_end_index.tolist())):
pred_ans.append(''.join(raw_chars[index][start:end]))
for answer, pred_an in zip(answers, pred_ans):
pred_an = pred_an.strip()
self.f1 += _calc_cmrc2018_f1_score(answer, pred_an)
self.total += 1
self.em += _calc_cmrc2018_em_score(answer, pred_an)

def get_metric(self, reset=True):
"""get_metric函数将根据evaluate函数累计的评价指标统计量来计算最终的评价结果."""
evaluate_result = {}
if self.no_ans_correct + self.no_ans_wrong + self.has_ans_correct + self.no_ans_wrong <= 0:
return evaluate_result
evaluate_result['EM'] = 0
evaluate_result[f'f_{self.f_beta}'] = 0
flag = 0
if self.no_ans_correct + self.no_ans_wrong > 0:
evaluate_result[f'noAns-f_{self.f_beta}'] = \
round(100 * self.no_ans_correct / (self.no_ans_correct + self.no_ans_wrong), 3)
evaluate_result['noAns-EM'] = \
round(100 * self.no_ans_correct / (self.no_ans_correct + self.no_ans_wrong), 3)
evaluate_result[f'f_{self.f_beta}'] += evaluate_result[f'noAns-f_{self.f_beta}']
evaluate_result['EM'] += evaluate_result['noAns-EM']
flag += 1
if self.has_ans_correct + self.has_ans_wrong > 0:
evaluate_result[f'hasAns-f_{self.f_beta}'] = \
round(100 * self.has_ans_f / (self.has_ans_correct + self.has_ans_wrong), 3)
evaluate_result['hasAns-EM'] = \
round(100 * self.has_ans_correct / (self.has_ans_correct + self.has_ans_wrong), 3)
evaluate_result[f'f_{self.f_beta}'] += evaluate_result[f'hasAns-f_{self.f_beta}']
evaluate_result['EM'] += evaluate_result['hasAns-EM']
flag += 1
if self.print_predict_stat:
evaluate_result['no2no'] = self.no2no
evaluate_result['no2yes'] = self.no2yes
evaluate_result['yes2no'] = self.yes2no
evaluate_result['yes2yes'] = self.yes2yes
if flag <= 0:
return evaluate_result
evaluate_result[f'f_{self.f_beta}'] = round(evaluate_result[f'f_{self.f_beta}'] / flag, 3)
evaluate_result['EM'] = round(evaluate_result['EM'] / flag, 3)
eval_res = {'f1': round(self.f1 / self.total*100, 2), 'em': round(self.em / self.total*100, 2)}
if reset:
self.no_ans_correct = 0
self.no_ans_wrong = 0
self.has_ans_correct = 0
self.has_ans_wrong = 0
self.has_ans_f = 0.
self.no2no = 0
self.no2yes = 0
self.yes2no = 0
self.yes2yes = 0
return evaluate_result
self.em = 0
self.total = 0
self.f1 = 0
return eval_res
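
# [illustrative sketch, not part of this diff] How the constrained span decoding
# in CMRC2018Metric.evaluate above behaves: out-of-context positions are masked
# to -inf, the best start is picked, and the end is restricted to positions at
# or after that start. All scores below are hypothetical.
import torch

pred_start = torch.tensor([[0.1, 0.9, 0.3, 0.2]])  # batch_size=1, length=4
pred_end = torch.tensor([[0.5, 0.2, 0.8, 0.4]])
context_len = torch.tensor([3])  # only the first 3 positions belong to the context

# same effect as seq_len_to_mask(context_len, max_len=4).eq(0)
mask = torch.arange(4).unsqueeze(0).ge(context_len.unsqueeze(1))
pred_start = pred_start.masked_fill(mask, float('-inf'))
pred_end = pred_end.masked_fill(mask, float('-inf'))

max_start, start_idx = pred_start.max(dim=-1, keepdim=True)
before_start = pred_start.eq(max_start).cumsum(dim=-1).eq(0)  # True strictly before the chosen start
pred_end = pred_end.masked_fill(before_start, float('-inf'))
end_idx = pred_end.argmax(dim=-1) + 1  # exclusive end
# the predicted span here is raw_chars[0][1:3] (start=1, end=3)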

# split Chinese text into characters
def _cn_segmentation(in_str, rm_punc=False):
in_str = str(in_str).lower().strip()
segs_out = []
temp_str = ""
sp_char = {'-', ':', '_', '*', '^', '/', '\\', '~', '`', '+', '=', ',', '。', ':', '?', '!', '“', '”', ';', '’', '《',
'》', '……', '·', '、', '「', '」', '(', ')', '-', '~', '『', '』'}
for char in in_str:
if rm_punc and char in sp_char:
continue
if re.search(r'[\u4e00-\u9fa5]', char) or char in sp_char:
if temp_str != "":
ss = list(temp_str)
segs_out.extend(ss)
temp_str = ""
segs_out.append(char)
else:
temp_str += char

# handling last part
if temp_str != "":
ss = list(temp_str)
segs_out.extend(ss)

return segs_out


# remove punctuation
def _remove_punctuation(in_str):
in_str = str(in_str).lower().strip()
sp_char = ['-', ':', '_', '*', '^', '/', '\\', '~', '`', '+', '=',
',', '。', ':', '?', '!', '“', '”', ';', '’', '《', '》', '……', '·', '、',
'「', '」', '(', ')', '-', '~', '『', '』']
out_segs = []
for char in in_str:
if char in sp_char:
continue
else:
out_segs.append(char)
return ''.join(out_segs)


# find longest common string
def _find_lcs(s1, s2):
m = [[0 for i in range(len(s2) + 1)] for j in range(len(s1) + 1)]
mmax = 0
p = 0
for i in range(len(s1)):
for j in range(len(s2)):
if s1[i] == s2[j]:
m[i + 1][j + 1] = m[i][j] + 1
if m[i + 1][j + 1] > mmax:
mmax = m[i + 1][j + 1]
p = i + 1
return s1[p - mmax:p], mmax


def _calc_cmrc2018_f1_score(answers, prediction):
f1_scores = []
for ans in answers:
ans_segs = _cn_segmentation(ans, rm_punc=True)
prediction_segs = _cn_segmentation(prediction, rm_punc=True)
lcs, lcs_len = _find_lcs(ans_segs, prediction_segs)
if lcs_len == 0:
f1_scores.append(0)
continue
precision = 1.0 * lcs_len / len(prediction_segs)
recall = 1.0 * lcs_len / len(ans_segs)
f1 = (2 * precision * recall) / (precision + recall)
f1_scores.append(f1)
return max(f1_scores)


def _calc_cmrc2018_em_score(answers, prediction):
em = 0
for ans in answers:
ans_ = _remove_punctuation(ans)
prediction_ = _remove_punctuation(prediction)
if ans_ == prediction_:
em = 1
break
return em
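
# [illustrative sketch, not part of this diff] A small usage example of the two
# CMRC2018 scoring helpers above; the answer strings are hypothetical.
answers = ["北京大学", "北大"]
prediction = "北京大学。"
f1 = _calc_cmrc2018_f1_score(answers, prediction)  # 1.0: punctuation is stripped and the LCS covers the first answer
em = _calc_cmrc2018_em_score(answers, prediction)  # 1: exact match after punctuation removal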

+ 48
- 38
fastNLP/core/optimizer.py View File

@@ -9,21 +9,23 @@ __all__ = [
"AdamW"
]

import torch
import math

import torch
from torch.optim.optimizer import Optimizer as TorchOptimizer


class Optimizer(object):
"""
Alias: :class:`fastNLP.Optimizer` :class:`fastNLP.core.optimizer.Optimizer`

:param model_params: a generator. E.g. ``model.parameters()`` for PyTorch models.
:param kwargs: additional parameters.
Optimizer
"""
def __init__(self, model_params, **kwargs):
"""
:param model_params: a generator. E.g. ``model.parameters()`` for PyTorch models.
:param kwargs: additional parameters.
"""
if model_params is not None and not hasattr(model_params, "__next__"):
raise RuntimeError("model parameters should be a generator, rather than {}.".format(type(model_params)))
self.model_params = model_params
@@ -31,15 +33,18 @@ class Optimizer(object):
def construct_from_pytorch(self, model_params):
raise NotImplementedError
def _get_require_grads_param(self, params):

@staticmethod
def _get_require_grads_param(params):
"""
Remove the parameters in params that do not require gradients
:param iterable params: parameters
:return: list(nn.Parameters)
"""
return [param for param in params if param.requires_grad]


class NullOptimizer(Optimizer):
"""
Pass this optimizer when you do not want the Trainer to update the parameters; make sure the parameters are then updated via callbacks instead.
@@ -49,7 +54,7 @@ class NullOptimizer(Optimizer):
super().__init__(None)

def construct_from_pytorch(self, model_params):
pass
return self

def __getattr__(self, item):
def pass_func(*args, **kwargs):
@@ -60,14 +65,15 @@ class NullOptimizer(Optimizer):

class SGD(Optimizer):
"""
Alias: :class:`fastNLP.SGD` :class:`fastNLP.core.optimizer.SGD`

:param float lr: learning rate. Default: 0.01
:param float momentum: momentum. Default: 0
:param model_params: a generator. E.g. ``model.parameters()`` for PyTorch models.
SGD
"""
def __init__(self, lr=0.001, momentum=0, model_params=None):
"""
:param float lr: learning rate. Default: 0.01
:param float momentum: momentum. Default: 0
:param model_params: a generator. E.g. ``model.parameters()`` for PyTorch models.
"""
if not isinstance(lr, float):
raise TypeError("learning rate has to be float.")
super(SGD, self).__init__(model_params, lr=lr, momentum=momentum)
@@ -82,14 +88,18 @@ class SGD(Optimizer):

class Adam(Optimizer):
"""
Alias: :class:`fastNLP.Adam` :class:`fastNLP.core.optimizer.Adam`

:param float lr: learning rate
:param float weight_decay:
:param model_params: a generator. E.g. ``model.parameters()`` for PyTorch models.
Adam
"""
def __init__(self, lr=0.001, weight_decay=0, betas=(0.9, 0.999), eps=1e-8, amsgrad=False, model_params=None):
"""
:param float lr: learning rate
:param float weight_decay:
:param eps:
:param amsgrad:
:param model_params: a generator. E.g. ``model.parameters()`` for PyTorch models.
"""
if not isinstance(lr, float):
raise TypeError("learning rate has to be float.")
super(Adam, self).__init__(model_params, lr=lr, betas=betas, eps=eps, amsgrad=amsgrad,
@@ -105,9 +115,8 @@ class Adam(Optimizer):

class AdamW(TorchOptimizer):
r"""
Alias: :class:`fastNLP.AdamW` :class:`fastNLP.core.optimizer.AdamW`

An implementation of AdamW; it should appear in a later PyTorch release, https://github.com/pytorch/pytorch/pull/21250. Added here in advance
An implementation of AdamW; it is already available in PyTorch 1.2.0, https://github.com/pytorch/pytorch/pull/21250.
Added here for compatibility with older PyTorch versions
.. todo::
Translate into Chinese
@@ -115,27 +124,28 @@ class AdamW(TorchOptimizer):
The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_.
The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_.

:param params (iterable): iterable of parameters to optimize or dicts defining
parameter groups
:param lr (float, optional): learning rate (default: 1e-3)
:param betas (Tuple[float, float], optional): coefficients used for computing
running averages of gradient and its square (default: (0.9, 0.999))
:param eps (float, optional): term added to the denominator to improve
numerical stability (default: 1e-8)
:param weight_decay (float, optional): weight decay coefficient (default: 1e-2)
:param amsgrad (boolean, optional): whether to use the AMSGrad variant of this
algorithm from the paper `On the Convergence of Adam and Beyond`_
(default: False)

.. _Adam\: A Method for Stochastic Optimization:
https://arxiv.org/abs/1412.6980
.. _Decoupled Weight Decay Regularization:
https://arxiv.org/abs/1711.05101
.. _On the Convergence of Adam and Beyond:
https://openreview.net/forum?id=ryQu7f-RZ
.. _Adam\: A Method for Stochastic Optimization: https://arxiv.org/abs/1412.6980
.. _Decoupled Weight Decay Regularization: https://arxiv.org/abs/1711.05101
.. _On the Convergence of Adam and Beyond: https://openreview.net/forum?id=ryQu7f-RZ
"""

def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
weight_decay=1e-2, amsgrad=False):
"""
:param params (iterable): iterable of parameters to optimize or dicts defining
parameter groups
:param lr (float, optional): learning rate (default: 1e-3)
:param betas (Tuple[float, float], optional): coefficients used for computing
running averages of gradient and its square (default: (0.9, 0.999))
:param eps (float, optional): term added to the denominator to improve
numerical stability (default: 1e-8)
:param weight_decay (float, optional): weight decay coefficient (default: 1e-2)
:param amsgrad (boolean, optional): whether to use the AMSGrad variant of this
algorithm from the paper `On the Convergence of Adam and Beyond`_
(default: False)
"""
if not 0.0 <= lr:
raise ValueError("Invalid learning rate: {}".format(lr))
if not 0.0 <= eps:
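
# [illustrative sketch, not part of this diff] A brief usage example of the AdamW
# class above (the model and hyperparameters are hypothetical); AdamW decouples
# the weight decay from the gradient-based update.
import torch.nn as nn
from fastNLP import AdamW

model = nn.Linear(10, 2)
optimizer = AdamW(model.parameters(), lr=1e-3, betas=(0.9, 0.999),
                  eps=1e-8, weight_decay=1e-2)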


+ 19
- 15
fastNLP/core/predictor.py View File

@@ -1,13 +1,15 @@
"""
..todo::
Check whether this class is still needed
"""
"""undocumented"""

__all__ = [
"Predictor"
]

from collections import defaultdict

import torch

from . import DataSetIter
from . import DataSet
from . import DataSetIter
from . import SequentialSampler
from .utils import _build_args, _move_dict_value_to_device, _get_model_device

@@ -18,18 +20,20 @@ class Predictor(object):

Unlike the Tester, the Predictor does not care about performance metrics and only runs inference.
This is a high-level model wrapper invoked by fastNLP. It shares no operations with Trainer or Tester.

:param torch.nn.Module network: the model used to perform prediction
"""
def __init__(self, network):
"""
:param torch.nn.Module network: the model used to perform prediction
"""
if not isinstance(network, torch.nn.Module):
raise ValueError(
"Only fastNLP.models.BaseModel or torch.nn,Module is allowed, not {}".format(type(network)))
self.network = network
self.batch_size = 1
self.batch_output = []
def predict(self, data: DataSet, seq_len_field_name=None):
"""用已经训练好的模型进行inference.

@@ -41,27 +45,27 @@ class Predictor(object):
raise ValueError("Only Dataset class is allowed, not {}.".format(type(data)))
if seq_len_field_name is not None and seq_len_field_name not in data.field_arrays:
raise ValueError("Field name {} not found in DataSet {}.".format(seq_len_field_name, data))
prev_training = self.network.training
self.network.eval()
network_device = _get_model_device(self.network)
batch_output = defaultdict(list)
data_iterator = DataSetIter(data, batch_size=self.batch_size, sampler=SequentialSampler(), as_numpy=False)
if hasattr(self.network, "predict"):
predict_func = self.network.predict
else:
predict_func = self.network.forward
with torch.no_grad():
for batch_x, _ in data_iterator:
_move_dict_value_to_device(batch_x, _, device=network_device)
refined_batch_x = _build_args(predict_func, **batch_x)
prediction = predict_func(**refined_batch_x)
if seq_len_field_name is not None:
seq_lens = batch_x[seq_len_field_name].tolist()
for key, value in prediction.items():
value = value.cpu().numpy()
if len(value.shape) == 1 or (len(value.shape) == 2 and value.shape[1] == 1):
@@ -74,6 +78,6 @@ class Predictor(object):
batch_output[key].extend(tmp_batch)
else:
batch_output[key].append(value)
self.network.train(prev_training)
return batch_output
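
# [illustrative sketch, not part of this diff] A minimal Predictor run with a toy
# model; all names are hypothetical. forward must return a dict, and the DataSet's
# input fields must match its argument names.
import torch
from fastNLP import DataSet
from fastNLP.core.predictor import Predictor

class ToyModel(torch.nn.Module):
    def forward(self, x):
        return {'pred': x.sum(dim=-1)}

ds = DataSet({'x': [[1.0, 2.0], [3.0, 4.0]]})
ds.set_input('x')  # only input fields are passed to forward/predict
outputs = Predictor(ToyModel()).predict(ds)  # {'pred': [...]}, numpy values per batch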

+ 10
- 17
fastNLP/core/sampler.py View File

@@ -15,9 +15,6 @@ import numpy as np

class Sampler(object):
"""
Alias: :class:`fastNLP.Sampler` :class:`fastNLP.core.sampler.Sampler`

Base class for `Sampler` classes. Defines the order in which elements are drawn from the data

Subclasses must implement the ``__call__`` method: given a `DataSet` object, return the index sequence of its elements
@@ -25,16 +22,14 @@ class Sampler(object):
def __call__(self, data_set):
"""
:param DataSet data_set: the `DataSet` object to sample from
:return result: list(int), the index sequence; elements of ``data_set`` are drawn in the order given by ``result``
"""
:param DataSet data_set: the `DataSet` object to sample from
:return result: list(int), the index sequence; elements of ``data_set`` are drawn in the order given by ``result``
"""
raise NotImplementedError


class SequentialSampler(Sampler):
"""
Alias: :class:`fastNLP.SequentialSampler` :class:`fastNLP.core.sampler.SequentialSampler`
A `Sampler` that draws elements sequentially

"""
@@ -45,8 +40,6 @@ class SequentialSampler(Sampler):

class RandomSampler(Sampler):
"""
Alias: :class:`fastNLP.RandomSampler` :class:`fastNLP.core.sampler.RandomSampler`

A `Sampler` that draws elements in random order

"""
@@ -57,17 +50,17 @@ class RandomSampler(Sampler):

class BucketSampler(Sampler):
"""
Alias: :class:`fastNLP.BucketSampler` :class:`fastNLP.core.sampler.BucketSampler`

A bucketed `Random Sampler` that randomly draws elements of similar length

:param int num_buckets: the number of buckets
:param int batch_size: the batch size. Defaults to None; the Trainer sets it correctly when invoking BucketSampler. When used outside a Trainer, it
must be passed explicitly
:param str seq_len_field_name: the name of the `field` holding the sequence lengths
"""
def __init__(self, num_buckets=10, batch_size=None, seq_len_field_name='seq_len'):
"""
:param int num_buckets: the number of buckets
:param int batch_size: the batch size. Defaults to None; the Trainer sets it correctly when invoking BucketSampler. When used outside a Trainer, it
must be passed explicitly
:param str seq_len_field_name: the name of the `field` holding the sequence lengths
"""
self.num_buckets = num_buckets
self.batch_size = batch_size
self.seq_len_field_name = seq_len_field_name
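
# [illustrative sketch, not part of this diff] Usage of BucketSampler outside a
# Trainer (values hypothetical): batch_size must then be passed explicitly, and
# each batch draws elements of similar length from a random bucket.
from fastNLP import BucketSampler

sampler = BucketSampler(num_buckets=10, batch_size=32, seq_len_field_name='seq_len')
# e.g. DataSetIter(dataset, batch_size=32, sampler=sampler)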


+ 78
- 48
fastNLP/core/tester.py View File

@@ -27,14 +27,21 @@ The tester module implements the Tester class fastNLP needs; given data, a model
tester = Tester(dataset, model, metrics=AccuracyMetric())
eval_results = tester.test()

The metric mapping rules here are the same as in :class:`fastNLP.Trainer`; see section 1.3 of the :doc:`trainer module<fastNLP.core.trainer>` for usage.
The metric mapping rules here are the same as in :class:`fastNLP.Trainer`; see section 1.3 of the :mod:`trainer module<fastNLP.core.trainer>` for usage.
Before evaluation starts, the Tester calls model.eval() to signal the evaluation phase (turning off nn.Dropout() etc.); after evaluation it calls model.train() to restore training mode.


"""
import time

import torch
import torch.nn as nn

try:
from tqdm.auto import tqdm
except:
from .utils import _pseudo_tqdm as tqdm

from .batch import BatchIter, DataSetIter
from .dataset import DataSet
from .metrics import _prepare_metrics
@@ -47,7 +54,9 @@ from .utils import _get_func_signature
from .utils import _get_model_device
from .utils import _move_model_to_device
from ._parallel_utils import _data_parallel_wrapper
from ._parallel_utils import _model_contains_inner_module
from functools import partial
from ._logger import logger

__all__ = [
"Tester"
@@ -56,36 +65,35 @@ __all__ = [

class Tester(object):
"""
Alias: :class:`fastNLP.Tester` :class:`fastNLP.core.tester.Tester`

Tester runs performance tests given data, a model and metrics. Pass in the model, the data and the metrics to run validation.

:param ~fastNLP.DataSet data: the dataset to test on
:param torch.nn.module model: the model to use
:param ~fastNLP.core.metrics.MetricBase,List[~fastNLP.core.metrics.MetricBase] metrics: the metrics to use during testing
:param int batch_size: the batch_size to use during evaluation.
:param str,int,torch.device,list(int) device: the device to load the model onto. Defaults to None, i.e. the Tester does not
manage where the model computation takes place. The following inputs are supported:

1. str: ['cpu', 'cuda', 'cuda:0', 'cuda:1', ...] meaning, respectively, the cpu, the first visible GPU, the first visible GPU, and the second visible GPU;

2. torch.device: load the model onto the given torch.device.

3. int: train on the gpu whose device_id equals this value

4. list(int): if more than one device is given, wrap the model with torch.nn.DataParallel and use the given devices.

5. None: leave the model untouched; if the passed-in model is a torch.nn.DataParallel, this value must be None.

If the model predicts via predict(), multi-GPU (DataParallel) validation is not possible; only the model on the first GPU is used.
:param int verbose: if 0, print nothing; if 1, print the validation results.
"""
def __init__(self, data, model, metrics, batch_size=16, num_workers=0, device=None, verbose=1):
super(Tester, self).__init__()
def __init__(self, data, model, metrics, batch_size=16, num_workers=0, device=None, verbose=1, use_tqdm=True):
"""
if not isinstance(data, DataSet):
raise TypeError(f"The type of data must be `fastNLP.DataSet`, got `{type(data)}`.")
:param ~fastNLP.DataSet data: the dataset to test on
:param torch.nn.module model: the model to use
:param ~fastNLP.core.metrics.MetricBase,List[~fastNLP.core.metrics.MetricBase] metrics: the metrics to use during testing
:param int batch_size: the batch_size to use during evaluation.
:param str,int,torch.device,list(int) device: the device to load the model onto. Defaults to None, i.e. the Tester does not
manage where the model computation takes place. The following inputs are supported:
1. str: ['cpu', 'cuda', 'cuda:0', 'cuda:1', ...] meaning, respectively, the cpu, the first visible GPU, the first visible GPU, and the second visible GPU;
2. torch.device: load the model onto the given torch.device.
3. int: train on the gpu whose device_id equals this value
4. list(int): if more than one device is given, wrap the model with torch.nn.DataParallel and use the given devices.
5. None: leave the model untouched; if the passed-in model is a torch.nn.DataParallel, this value must be None.
If the model predicts via predict(), multi-GPU (DataParallel) validation is not possible; only the model on the first GPU is used.
:param int verbose: if 0, print nothing; if 1, print the validation results.
:param bool use_tqdm: whether to use tqdm to show testing progress; if False, nothing is shown.
"""
super(Tester, self).__init__()

if not isinstance(model, nn.Module):
raise TypeError(f"The type of model must be `torch.nn.Module`, got `{type(model)}`.")
@@ -95,6 +103,8 @@ class Tester(object):
self._model = _move_model_to_device(model, device=device)
self.batch_size = batch_size
self.verbose = verbose
self.use_tqdm = use_tqdm
self.logger = logger

if isinstance(data, DataSet):
self.data_iterator = DataSetIter(
@@ -106,19 +116,22 @@ class Tester(object):

# check predict
if (hasattr(self._model, 'predict') and callable(self._model.predict)) or \
(isinstance(self._model, nn.DataParallel) and hasattr(self._model.module, 'predict') and
callable(self._model.module.predict)):
(_model_contains_inner_module(self._model) and hasattr(self._model.module, 'predict') and
callable(self._model.module.predict)):
if isinstance(self._model, nn.DataParallel):
self._predict_func_wrapper = partial(_data_parallel_wrapper('predict',
self._model.device_ids,
self._model.output_device),
network=self._model.module)
self._predict_func = self._model.module.predict  # used for parameter matching
elif isinstance(self._model, nn.parallel.DistributedDataParallel):
self._predict_func = self._model.module.predict
self._predict_func_wrapper = self._model.module.predict  # used for the actual call
else:
self._predict_func = self._model.predict
self._predict_func_wrapper = self._model.predict
else:
if isinstance(self._model, nn.DataParallel):
if _model_contains_inner_module(model):
self._predict_func_wrapper = self._model.forward
self._predict_func = self._model.module.forward
else:
@@ -126,10 +139,9 @@ class Tester(object):
self._predict_func_wrapper = self._model.forward
def test(self):
"""开始进行验证,并返回验证结果。
r"""开始进行验证,并返回验证结果。

:return Dict[Dict] : a two-level nested dict whose first-level keys are the metric names and whose second level holds that metric's indicators.
An example for AccuracyMetric: {'AccuracyMetric': {'acc': 1.0}}.
:return Dict[Dict]: a two-level nested dict whose first-level keys are the metric names and whose second level holds that metric's indicators. An example for AccuracyMetric: {'AccuracyMetric': {'acc': 1.0}}.
"""
# turn on the testing mode; clean up the history
self._model_device = _get_model_device(self._model)
@@ -139,21 +151,39 @@ class Tester(object):
eval_results = {}
try:
with torch.no_grad():
for batch_x, batch_y in data_iterator:
_move_dict_value_to_device(batch_x, batch_y, device=self._model_device)
pred_dict = self._data_forward(self._predict_func, batch_x)
if not isinstance(pred_dict, dict):
raise TypeError(f"The return value of {_get_func_signature(self._predict_func)} "
f"must be `dict`, got {type(pred_dict)}.")
if not self.use_tqdm:
from .utils import _pseudo_tqdm as inner_tqdm
else:
inner_tqdm = tqdm
with inner_tqdm(total=len(data_iterator), leave=False, dynamic_ncols=True) as pbar:
pbar.set_description_str(desc="Test")

start_time = time.time()

for batch_x, batch_y in data_iterator:
_move_dict_value_to_device(batch_x, batch_y, device=self._model_device)
pred_dict = self._data_forward(self._predict_func, batch_x)
if not isinstance(pred_dict, dict):
raise TypeError(f"The return value of {_get_func_signature(self._predict_func)} "
f"must be `dict`, got {type(pred_dict)}.")
for metric in self.metrics:
metric(pred_dict, batch_y)

if self.use_tqdm:
pbar.update()

for metric in self.metrics:
metric(pred_dict, batch_y)
for metric in self.metrics:
eval_result = metric.get_metric()
if not isinstance(eval_result, dict):
raise TypeError(f"The return value of {_get_func_signature(metric.get_metric)} must be "
f"`dict`, got {type(eval_result)}")
metric_name = metric.__class__.__name__
eval_results[metric_name] = eval_result
eval_result = metric.get_metric()
if not isinstance(eval_result, dict):
raise TypeError(f"The return value of {_get_func_signature(metric.get_metric)} must be "
f"`dict`, got {type(eval_result)}")
metric_name = metric.get_metric_name()
eval_results[metric_name] = eval_result
pbar.close()
end_time = time.time()
test_str = f'Evaluate data in {round(end_time - start_time, 2)} seconds!'
if self.verbose >= 0:
self.logger.info(test_str)
except _CheckError as e:
prev_func_signature = _get_func_signature(self._predict_func)
_check_loss_evaluate(prev_func_signature=prev_func_signature, func_signature=e.func_signature,
@@ -161,7 +191,7 @@ class Tester(object):
dataset=self.data, check_level=0)
if self.verbose >= 1:
print("[tester] \n{}".format(self._format_eval_results(eval_results)))
logger.info("[tester] \n{}".format(self._format_eval_results(eval_results)))
self._mode(network, is_test=False)
return eval_results
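
# [illustrative sketch, not part of this diff] Usage of the updated Tester,
# assuming an existing `model` and `dev_data` as in the module docstring above;
# the new use_tqdm flag controls the progress bar added in this diff.
from fastNLP import Tester, AccuracyMetric

tester = Tester(data=dev_data, model=model, metrics=AccuracyMetric(),
                batch_size=16, use_tqdm=False)
eval_results = tester.test()  # e.g. {'AccuracyMetric': {'acc': 0.93}}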


+ 183
- 141
fastNLP/core/trainer.py View File

@@ -314,7 +314,7 @@ Example2.3
Here we defined our own callback by subclassing :class:`~fastNLP.Callback` and passed it to the :class:`~fastNLP.Trainer`
together with the built-in :class:`~fastNLP.EarlyStopCallback`, extending what the :class:`~fastNLP.Trainer` can do

fastNLP ships with many callbacks ready for use; see :doc:`fastNLP.core.callback` .
fastNLP ships with many callbacks ready for use; see :mod:`fastNLP.core.callback` .

"""
__all__ = [
@@ -336,7 +336,7 @@ except:
import warnings

from .batch import DataSetIter, BatchIter
from .callback import CallbackManager, CallbackException
from .callback import CallbackManager, CallbackException, Callback
from .dataset import DataSet
from .losses import _prepare_losser
from .metrics import _prepare_metrics
@@ -352,12 +352,11 @@ from .utils import _move_dict_value_to_device
from .utils import _get_func_signature
from .utils import _get_model_device
from .utils import _move_model_to_device

from ._parallel_utils import _model_contains_inner_module
from ._logger import logger

class Trainer(object):
"""
Alias: :class:`fastNLP.Trainer` :class:`fastNLP.core.trainer.Trainer`
In fastNLP, the Trainer organizes the training procedure of a single task, saving users from re-writing, for every training task,
(1) the epoch loop;
(2) splitting the data into batches;
@@ -365,88 +364,85 @@ class Trainer(object):
(4) validating on the dev set at the end of each epoch or after a given number of steps;
(5) saving the model whenever it achieves better validation performance; and so on.
See :doc:`fastNLP.core.trainer` for a detailed introduction
:param train_data: the training set, of type :class:`~fastNLP.DataSet`.
:param nn.modules model: the model to train
:param optimizer: a `torch.optim.Optimizer` optimizer. If None, the Trainer uses the default Adam(model.parameters(), lr=4e-3)
:param int batch_size: the batch size used for training and validation.
:param loss: the :class:`~fastNLP.core.losses.LossBase` object to use. If None, :class:`~fastNLP.LossInForward` is used by default
:param sampler: the order in which batches are generated, of type :class:`~fastNLP.Sampler`. If None, :class:`~fastNLP.RandomSampler` is used by default
:param drop_last: if the last batch does not contain exactly batch_size samples, drop it
:param num_workers: int, how many threads to use for padding the data.
:param update_every: int, update the gradients every this many steps. Intended for gradient accumulation: if e.g. a batch_size of 128
would not fit in memory, setting batch_size=32 and update_every=4 achieves the same effect. Has no effect when optimizer is None.
:param int n_epochs: how many epochs to train for.
:param int print_every: after how many backward passes to update the loss shown by tqdm; if use_tqdm=False, after how many backward passes to print the loss.
:param dev_data: the DataSet used for validation, of type :class:`~fastNLP.DataSet`.
:param metrics: the evaluation functions for validation. Either a single :class:`Metric<fastNLP.core.metrics.MetricBase>`
or several :class:`Metric<fastNLP.core.metrics.MetricBase>` passed as a list.
If validation yields a better result (judged by the first Metric in the list when there are several) and save_path is not None,
the current model is saved. See :doc:`metrics module <fastNLP.core.metrics>` for the available Metrics. Only effective when dev_data is given.
:param str,None metric_key: a :class:`Metric<fastNLP.core.metrics.MetricBase>` sometimes has several indicators,
e.g. :class:`~fastNLP.core.metrics.SpanFPreRecMetric` contains 'f', 'pre' and 'rec'; in that case the decisive
indicator must be specified. Some indicators are better the smaller they are, e.g. the perplexity of a language model; prefix such a
key with '-' to mark that smaller is better during validation (e.g. "-ppl"). Only effective when dev_data is given.
:param int validate_every: validate on the dev set every this many steps; if -1, validate once at the end of each epoch. Only effective when dev_data is given.
:param str,None save_path: the path to save the model to. If None, the model is not saved. If dev_data is None, the model of the last iteration is saved.
Not only the parameters but also the model structure is saved; even with DataParallel, only the model itself is saved here.
:param bool use_tqdm: whether to use tqdm to show training progress; if False, the loss is printed to the terminal instead.
:param str,int,torch.device,list(int) device: the device to load the model onto. Defaults to None, i.e. the Trainer does not
manage where the model computation takes place. The following inputs are supported:

1. str: ['cpu', 'cuda', 'cuda:0', 'cuda:1', ...] meaning, respectively, the cpu, the first visible GPU, the first visible GPU,
and the second visible GPU;

2. torch.device: load the model onto the given torch.device.

3. int: train on the gpu whose device_id equals this value

4. list(int): if more than one device is given, wrap the model with torch.nn.DataParallel and use the given devices.

5. None: leave the model untouched; if the passed-in model is a torch.nn.DataParallel, this value must be None.

Known possible issue: the Adagrad optimizer may not work properly with this parameter; please manage the model placement manually.

:param list(callbacks) callbacks: callbacks that adjust the training process; early stopping, negative sampling and the like
can be implemented through the callback mechanism. See :doc:`callback module <fastNLP.core.callback>` for the available callbacks
:param int check_code_level: level of code checking. -1: no check; 0: stop only on errors; 1: warn if some field is
unused; 2: raise an error if any field is unused. The check runs the code with a very small batch (2 samples by default); in theory
this does not modify any parameters and only verifies that the code runs. However, if (1) the model hard-codes batch_size to some fixed value, or
(2) the model counts its forward passes, one extra forward pass may be counted. In such cases set check_code_level to -1.
See :mod:`fastNLP.core.trainer` for a detailed introduction
"""
def __init__(self, train_data, model, optimizer=None, loss=None,
batch_size=32, sampler=None, drop_last=False, update_every=1,
num_workers=0, n_epochs=10, print_every=5,
dev_data=None, metrics=None, metric_key=None,
validate_every=-1, save_path=None, use_tqdm=True, device=None, prefetch=False,
callbacks=None, check_code_level=0):
if prefetch and num_workers==0:
num_workers = 1
if prefetch:
warnings.warn("prefetch is deprecated, will be removed in version 0.5.0, please use num_workers instead.")

validate_every=-1, save_path=None, use_tqdm=True, device=None,
callbacks=None, check_code_level=0, **kwargs):
"""
:param train_data: the training set, of type :class:`~fastNLP.DataSet`.
:param nn.modules model: the model to train
:param optimizer: a `torch.optim.Optimizer` optimizer. If None, the Trainer uses the default Adam(model.parameters(), lr=4e-3)
:param int batch_size: the batch size used for training and validation.
:param loss: the :class:`~fastNLP.core.losses.LossBase` object to use. If None, :class:`~fastNLP.LossInForward` is used by default
:param sampler: the order in which batches are generated, of type :class:`~fastNLP.Sampler`. If None, :class:`~fastNLP.RandomSampler` is used by default
:param drop_last: if the last batch does not contain exactly batch_size samples, drop it
:param num_workers: int, how many threads to use for padding the data.
:param update_every: int, update the gradients every this many steps. Intended for gradient accumulation: if e.g. a batch_size of 128
would not fit in memory, setting batch_size=32 and update_every=4 achieves the same effect. Has no effect when optimizer is None.
:param int n_epochs: how many epochs to train for.
:param int print_every: after how many backward passes to update the loss shown by tqdm; if use_tqdm=False, after how many backward passes to print the loss.
:param dev_data: the DataSet used for validation, of type :class:`~fastNLP.DataSet`.
:param metrics: the evaluation functions for validation. Either a single :class:`Metric<fastNLP.core.metrics.MetricBase>`
or several :class:`Metric<fastNLP.core.metrics.MetricBase>` passed as a list.
If validation yields a better result (judged by the first Metric in the list when there are several) and save_path is not None,
the current model is saved. See :mod:`metrics module <fastNLP.core.metrics>` for the available Metrics. Only effective when dev_data is given.
:param str,None metric_key: a :class:`Metric<fastNLP.core.metrics.MetricBase>` sometimes has several indicators,
e.g. :class:`~fastNLP.core.metrics.SpanFPreRecMetric` contains 'f', 'pre' and 'rec'; in that case the decisive
indicator must be specified. Some indicators are better the smaller they are, e.g. the perplexity of a language model; prefix such a
key with '-' to mark that smaller is better during validation (e.g. "-ppl"). Only effective when dev_data is given.
:param int validate_every: validate on the dev set every this many steps; if -1, validate once at the end of each epoch. Only effective when dev_data is given.
:param str,None save_path: the path to save the model to; if the path does not exist, the folder is created automatically. If None, the model is not saved. If dev_data is None,
the model of the last iteration is saved. Not only the parameters but also the model structure is saved; even with DataParallel, only the model itself is saved here.
:param bool use_tqdm: whether to use tqdm to show training progress; if False, the loss is printed to the terminal instead.
:param str,int,torch.device,list(int) device: the device to load the model onto. Defaults to None, i.e. the Trainer does not
manage where the model computation takes place. The following inputs are supported:
1. str: ['cpu', 'cuda', 'cuda:0', 'cuda:1', ...] meaning, respectively, the cpu, the first visible GPU, the first visible GPU,
and the second visible GPU;
2. torch.device: load the model onto the given torch.device.
3. int: train on the gpu whose device_id equals this value
4. list(int): if more than one device is given, wrap the model with torch.nn.DataParallel and use the given devices.
5. None: leave the model untouched; if the passed-in model is a torch.nn.DataParallel, this value must be None.
Known possible issue: the Adagrad optimizer may not work properly with this parameter; please manage the model placement manually.
:param list(callbacks) callbacks: callbacks that adjust the training process; early stopping, negative sampling and the like
can be implemented through the callback mechanism. See :mod:`callback module <fastNLP.core.callback>` for the available callbacks
:param int check_code_level: level of code checking. -1: no check; 0: stop only on errors; 1: warn if some field is
unused; 2: raise an error if any field is unused. The check runs the code with a very small batch (2 samples by default); in theory
this does not modify any parameters and only verifies that the code runs. However, if (1) the model hard-codes batch_size to some fixed value, or
(2) the model counts its forward passes, one extra forward pass may be counted. In such cases set check_code_level to -1.
"""
super(Trainer, self).__init__()
if not isinstance(model, nn.Module):
raise TypeError(f"The type of model must be torch.nn.Module, got {type(model)}.")
# check metrics and dev_data
if (not metrics) and dev_data is not None:
raise ValueError("No metric for dev_data evaluation.")
if metrics and (dev_data is None):
raise ValueError("No dev_data for evaluations, pass dev_data or set metrics to None. ")
# check update every
assert update_every >= 1, "update_every must be no less than 1."
self.update_every = int(update_every)
# check save_path
if not (save_path is None or isinstance(save_path, str)):
raise ValueError("save_path can only be None or `str`.")
# prepare evaluate
metrics = _prepare_metrics(metrics)
# parse metric_key
# increase_better is True. It means the exp result gets better if the indicator increases.
# It is true by default.
@@ -458,30 +454,70 @@ class Trainer(object):
self.metric_key = None
# prepare loss
losser = _prepare_losser(loss)
# sampler check
if sampler is not None and not isinstance(sampler, Sampler):
raise ValueError("The type of sampler should be fastNLP.BaseSampler, got {}.".format(type(sampler)))

if sampler is None:
sampler = RandomSampler()
elif hasattr(sampler, 'set_batch_size'):
sampler.set_batch_size(batch_size)
if isinstance(train_data, BatchIter):
if sampler is not None:
warnings.warn("sampler is ignored when train_data is a BatchIter.")
if num_workers>0:
warnings.warn("num_workers is ignored when train_data is BatchIter.")
if drop_last:
warnings.warn("drop_last is ignored when train_data is BatchIter.")

if isinstance(model, nn.parallel.DistributedDataParallel):  # the distributed case
# device must be None
if device is not None:
warnings.warn("device is ignored when model is nn.parallel.DistributedDataParallel.")
device = None
# the sampler must be a distributed one
if sampler is None:
sampler = torch.utils.data.DistributedSampler(train_data)
elif not isinstance(sampler, torch.utils.data.DistributedSampler):
raise TypeError("When using nn.parallel.DistributedDataParallel, "
"sampler must be None or torch.utils.data.DistributedSampler.")
# saving the model is not allowed
if save_path:
raise RuntimeError("Saving model in Distributed situation is not allowed right now.")
else:
# sampler check
if sampler is not None and not isinstance(sampler, (Sampler, torch.utils.data.Sampler)):
raise ValueError(f"The type of sampler should be fastNLP.BaseSampler or pytorch's Sampler, got {type(sampler)}")
if sampler is None:
sampler = RandomSampler()
elif hasattr(sampler, 'set_batch_size'):
sampler.set_batch_size(batch_size)

if isinstance(train_data, DataSet):
self.data_iterator = DataSetIter(
dataset=train_data, batch_size=batch_size, num_workers=num_workers, sampler=sampler, drop_last=drop_last)
elif isinstance(train_data, BatchIter):
self.data_iterator = train_data
train_data = train_data.dataset
else:
raise TypeError("train_data type {} not support".format(type(train_data)))

if check_code_level > -1 and isinstance(self.data_iterator, DataSetIter):
_check_code(dataset=train_data, model=model, losser=losser, metrics=metrics, dev_data=dev_data,
metric_key=self.metric_key, check_level=check_code_level,
batch_size=min(batch_size, DEFAULT_CHECK_BATCH_SIZE))
# _check_code is how fastNLP helps you verify that your code is correct. If you see this comment in an error traceback, check your code carefully
model.train()
self.model = _move_model_to_device(model, device=device)
if _model_contains_inner_module(self.model):
self._forward_func = self.model.module.forward
else:
self._forward_func = self.model.forward
if check_code_level > -1:
# _check_code is how fastNLP helps you verify that your code is correct. If you see this comment in an error traceback, check carefully
# whether your field names match the model's input names
dev_dataset = dev_data
if isinstance(dev_data, BatchIter):
dev_dataset = None
warnings.warn("dev_data is of BatchIter type, ignore validation checking.")
check_batch_size = min(batch_size, DEFAULT_CHECK_BATCH_SIZE)
if isinstance(self.model, nn.DataParallel):
_num_devices = len(self.model.device_ids)
if batch_size//_num_devices>1:  # if each of the multiple GPUs can take more than one sample, give each GPU two samples
check_batch_size = max(len(self.model.device_ids)*2, check_batch_size)
else:
check_batch_size = max(len(self.model.device_ids), check_batch_size)
_check_code(dataset=train_data, model=self.model, losser=losser, forward_func=self._forward_func, metrics=metrics,
dev_data=dev_dataset, metric_key=self.metric_key, check_level=check_code_level,
batch_size=check_batch_size)

self.train_data = train_data
self.dev_data = dev_data # If None, No validation.
@@ -496,8 +532,7 @@ class Trainer(object):
self.best_dev_epoch = None
self.best_dev_step = None
self.best_dev_perf = None
self.n_steps = (len(self.train_data) // self.batch_size + int(
len(self.train_data) % self.batch_size != 0)) * int(drop_last==0) * self.n_epochs
self.n_steps = len(self.data_iterator) * self.n_epochs

if isinstance(optimizer, torch.optim.Optimizer):
self.optimizer = optimizer
@@ -507,22 +542,32 @@ class Trainer(object):
self.optimizer = torch.optim.Adam(self.model.parameters(), lr=4e-3)
else:
raise TypeError("optimizer can only be torch.optim.Optimizer type, not {}.".format(type(optimizer)))

self.logger = logger

self.use_tqdm = use_tqdm
if 'test_use_tqdm' in kwargs:
self.test_use_tqdm = kwargs.get('test_use_tqdm')
else:
self.test_use_tqdm = self.use_tqdm
self.pbar = None
self.print_every = abs(self.print_every)

self.kwargs = kwargs
if self.dev_data is not None:
self.tester = Tester(model=self.model,
data=self.dev_data,
metrics=self.metrics,
batch_size=self.batch_size,
batch_size=kwargs.get("dev_batch_size", self.batch_size),
device=None,  # device was handled above
verbose=0)
verbose=0,
use_tqdm=self.test_use_tqdm)

self.step = 0
self.start_time = None # start timestamp

if isinstance(callbacks, Callback):
callbacks = [callbacks]

self.callback_manager = CallbackManager(env={"trainer": self},
callbacks=callbacks)

@@ -548,7 +593,7 @@ class Trainer(object):
"""
results = {}
if self.n_epochs <= 0:
print(f"training epoch is {self.n_epochs}, nothing was done.")
self.logger.info(f"training epoch is {self.n_epochs}, nothing was done.")
results['seconds'] = 0.
return results
try:
@@ -557,8 +602,8 @@ class Trainer(object):
self._load_best_model = load_best_model
self.start_time = str(datetime.now().strftime('%Y-%m-%d-%H-%M-%S'))
start_time = time.time()
print("training epochs started " + self.start_time, flush=True)
self.logger.info("training epochs started " + self.start_time)
try:
self.callback_manager.on_train_begin()
self._train()
@@ -571,11 +616,11 @@ class Trainer(object):
raise e
elif on_exception == 'raise':
raise e
if self.dev_data is not None and self.best_dev_perf is not None:
print(
"\nIn Epoch:{}/Step:{}, got best dev performance:".format(self.best_dev_epoch, self.best_dev_step) +
self.tester._format_eval_results(self.best_dev_perf), )
self.logger.info(
"\nIn Epoch:{}/Step:{}, got best dev performance:".format(self.best_dev_epoch, self.best_dev_step))
self.logger.info(self.tester._format_eval_results(self.best_dev_perf))
results['best_eval'] = self.best_dev_perf
results['best_epoch'] = self.best_dev_epoch
results['best_step'] = self.best_dev_step
@@ -583,27 +628,23 @@ class Trainer(object):
model_name = "best_" + "_".join([self.model.__class__.__name__, self.metric_key, self.start_time])
load_succeed = self._load_model(self.model, model_name)
if load_succeed:
print("Reloaded the best model.")
self.logger.info("Reloaded the best model.")
else:
print("Fail to reload best model.")
self.logger.info("Fail to reload best model.")
finally:
pass
results['seconds'] = round(time.time() - start_time, 2)
return results
def _train(self):
if not self.use_tqdm:
from fastNLP.core.utils import _pseudo_tqdm as inner_tqdm
from .utils import _pseudo_tqdm as inner_tqdm
else:
inner_tqdm = tqdm
self.step = 0
self.epoch = 0
start = time.time()
if isinstance(self.model, nn.DataParallel):
self._forward_func = self.model.module.forward
else:
self._forward_func = self.model.forward
with inner_tqdm(total=self.n_steps, postfix='loss:{0:<6.5f}', leave=False, dynamic_ncols=True) as pbar:
self.pbar = pbar
avg_loss = 0
@@ -621,21 +662,21 @@ class Trainer(object):
# negative sampling; replace unknown; re-weight batch_y
self.callback_manager.on_batch_begin(batch_x, batch_y, indices)
prediction = self._data_forward(self.model, batch_x)
# edit prediction
self.callback_manager.on_loss_begin(batch_y, prediction)
loss = self._compute_loss(prediction, batch_y).mean()
avg_loss += loss.item()
loss = loss / self.update_every
# Is loss NaN or inf? requires_grad = False
self.callback_manager.on_backward_begin(loss)
self._grad_backward(loss)
self.callback_manager.on_backward_end()
self._update()
self.callback_manager.on_step_end()
if self.step % self.print_every == 0:
avg_loss = float(avg_loss) / self.print_every
if self.use_tqdm:
@@ -649,36 +690,36 @@ class Trainer(object):
pbar.set_postfix_str(print_output)
avg_loss = 0
self.callback_manager.on_batch_end()
if ((self.validate_every > 0 and self.step % self.validate_every == 0) or
(self.validate_every < 0 and self.step % len(data_iterator) == 0)) \
and self.dev_data is not None:
eval_res = self._do_validation(epoch=epoch, step=self.step)
eval_str = "Evaluation at Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step,
self.n_steps) + \
self.tester._format_eval_results(eval_res)
pbar.write(eval_str + '\n')
eval_str = "Evaluation on dev at Epoch {}/{}. Step:{}/{}: ".format(epoch, self.n_epochs, self.step,
self.n_steps)
# pbar.write(eval_str + '\n')
self.logger.info(eval_str)
self.logger.info(self.tester._format_eval_results(eval_res)+'\n')
# ================= mini-batch end ==================== #
# lr decay; early stopping
self.callback_manager.on_epoch_end()
# =============== epochs end =================== #
pbar.close()
self.pbar = None
# ============ tqdm end ============== #
def _do_validation(self, epoch, step):
self.callback_manager.on_valid_begin()
res = self.tester.test()
is_better_eval = False
if self._better_eval_result(res):
if self.save_path is not None:
self._save_model(self.model,
"best_" + "_".join([self.model.__class__.__name__, self.metric_key, self.start_time]))
elif self._load_best_model:
self._best_model_states = {name: param.cpu().clone() for name, param in self.model.named_parameters()}
self._best_model_states = {name: param.cpu().clone() for name, param in self.model.state_dict().items()}
self.best_dev_perf = res
self.best_dev_epoch = epoch
self.best_dev_step = step
@@ -686,7 +727,7 @@ class Trainer(object):
# get validation results; adjust optimizer
self.callback_manager.on_valid_end(res, self.metric_key, self.optimizer, is_better_eval)
return res
def _mode(self, model, is_test=False):
"""Train mode or Test mode. This is for PyTorch currently.

@@ -698,14 +739,14 @@ class Trainer(object):
model.eval()
else:
model.train()
def _update(self):
"""Perform weight update on a model.

"""
if self.step % self.update_every == 0:
self.optimizer.step()
def _data_forward(self, network, x):
x = _build_args(self._forward_func, **x)
y = network(**x)
@@ -713,7 +754,7 @@ class Trainer(object):
raise TypeError(
f"The return value of {_get_func_signature(self._forward_func)} should be dict, got {type(y)}.")
return y
def _grad_backward(self, loss):
"""Compute gradient with link rules.

@@ -724,7 +765,7 @@ class Trainer(object):
if (self.step-1) % self.update_every == 0:
self.model.zero_grad()
loss.backward()
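
# [illustrative sketch, not part of this diff] The update_every contract that
# _update and _grad_backward above implement: zero the gradients at the start of
# each accumulation window, scale the loss, and step only every update_every
# batches. Model, data and hyperparameters are hypothetical.
import torch

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
update_every = 4  # accumulate gradients over 4 batches
for step in range(1, 9):
    if (step - 1) % update_every == 0:
        model.zero_grad()
    x, y = torch.randn(2, 4), torch.randn(2, 1)
    loss = torch.nn.functional.mse_loss(model(x), y) / update_every
    loss.backward()
    if step % update_every == 0:
        optimizer.step()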
def _compute_loss(self, predict, truth):
"""Compute loss given prediction and ground truth.

@@ -733,7 +774,7 @@ class Trainer(object):
:return: a scalar
"""
return self.losser(predict, truth)
def _save_model(self, model, model_name, only_param=False):
""" 存储不含有显卡信息的state_dict或model
:param model:
@@ -745,7 +786,7 @@ class Trainer(object):
model_path = os.path.join(self.save_path, model_name)
if not os.path.exists(self.save_path):
os.makedirs(self.save_path, exist_ok=True)
if isinstance(model, nn.DataParallel):
if _model_contains_inner_module(model):
model = model.module
if only_param:
state_dict = model.state_dict()
@@ -756,7 +797,7 @@ class Trainer(object):
model.cpu()
torch.save(model, model_path)
model.to(self._model_device)
def _load_model(self, model, model_name, only_param=False):
# return a bool indicating whether the model was reloaded successfully
if self.save_path is not None:
@@ -765,7 +806,7 @@ class Trainer(object):
states = torch.load(model_path)
else:
states = torch.load(model_path).state_dict()
if isinstance(model, nn.DataParallel):
if _model_contains_inner_module(model):
model.module.load_state_dict(states)
else:
model.load_state_dict(states)
@@ -774,7 +815,7 @@ class Trainer(object):
else:
return False
return True
def _better_eval_result(self, metrics):
"""Check if the current epoch yields better validation results.

@@ -800,6 +841,10 @@ class Trainer(object):
is_better = False
return is_better

@property
def is_master(self):
"""是否是主进程"""
return True

DEFAULT_CHECK_BATCH_SIZE = 2
DEFAULT_CHECK_NUM_BATCH = 2
@@ -821,14 +866,15 @@ def _get_value_info(_dict):
strs.append(_str)
return strs


from numbers import Number
from .batch import _to_tensor
def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_SIZE,
dev_data=None, metric_key=None,
check_level=0):


def _check_code(dataset, model, losser, metrics, forward_func, batch_size=DEFAULT_CHECK_BATCH_SIZE,
dev_data=None, metric_key=None, check_level=0):
# check get_loss 方法
model_devcie = _get_model_device(model=model)
model_device = _get_model_device(model=model)
def _iter():
start_idx = 0
while start_idx<len(dataset):
@@ -849,7 +895,7 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_
start_idx += batch_size

for batch_count, (batch_x, batch_y) in enumerate(_iter()):
_move_dict_value_to_device(batch_x, batch_y, device=model_devcie)
_move_dict_value_to_device(batch_x, batch_y, device=model_device)
# forward check
if batch_count == 0:
info_str = ""
@@ -867,16 +913,12 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_
info_str += '\n'
else:
info_str += 'There is no target field.'
print(info_str)
_check_forward_error(forward_func=model.forward, dataset=dataset,
logger.info(info_str)
_check_forward_error(forward_func=forward_func, dataset=dataset,
batch_x=batch_x, check_level=check_level)
if isinstance(model, nn.DataParallel):
forward_func = model.module.forward
else:
forward_func = model.forward
refined_batch_x = _build_args(forward_func, **batch_x)
pred_dict = model(**refined_batch_x)
func_signature = _get_func_signature(model.forward)
func_signature = _get_func_signature(forward_func)
if not isinstance(pred_dict, dict):
raise TypeError(f"The return value of {func_signature} should be `dict`, not `{type(pred_dict)}`.")
@@ -896,7 +938,7 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_
loss.backward()
except _CheckError as e:
# TODO: another error raised if _CheckError caught
pre_func_signature = _get_func_signature(model.forward)
pre_func_signature = _get_func_signature(forward_func)
_check_loss_evaluate(prev_func_signature=pre_func_signature, func_signature=e.func_signature,
check_res=e.check_res, pred_dict=pred_dict, target_dict=batch_y,
dataset=dataset, check_level=check_level)
@@ -906,7 +948,7 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_
if dev_data is not None:
tester = Tester(data=dev_data[:batch_size * DEFAULT_CHECK_NUM_BATCH], model=model, metrics=metrics,
batch_size=batch_size, verbose=-1)
batch_size=batch_size, verbose=-1, use_tqdm=False)
evaluate_results = tester.test()
_check_eval_results(metrics=evaluate_results, metric_key=metric_key, metric_list=metrics)



+ 148
- 108
fastNLP/core/utils.py View File

@@ -1,9 +1,11 @@
"""
The utils module implements many utilities needed inside and outside fastNLP. The one intended for users is the :func:`cache_results` decorator.
"""

__all__ = [
"cache_results",
"seq_len_to_mask",
"get_seq_len"
]

import _pickle
@@ -11,11 +13,16 @@ import inspect
import os
import warnings
from collections import Counter, namedtuple

import numpy as np
import torch
import torch.nn as nn
from typing import List
from ._logger import logger
from prettytable import PrettyTable
try:
from apex import amp
except:
amp = None

_CheckRes = namedtuple('_CheckRes', ['missing', 'unused', 'duplicated', 'required', 'all_needed',
'varargs'])
@@ -23,27 +30,27 @@ _CheckRes = namedtuple('_CheckRes', ['missing', 'unused', 'duplicated', 'require

class Option(dict):
"""a dict can treat keys as attributes"""
def __getattr__(self, item):
try:
return self.__getitem__(item)
except KeyError:
raise AttributeError(item)
def __setattr__(self, key, value):
if key.startswith('__') and key.endswith('__'):
raise AttributeError(key)
self.__setitem__(key, value)
def __delattr__(self, item):
try:
self.pop(item)
except KeyError:
raise AttributeError(item)
def __getstate__(self):
return self
def __setstate__(self, state):
self.update(state)
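
# [illustrative sketch, not part of this diff] The Option class above lets keys
# double as attributes:
opt = Option(lr=0.1)
opt.batch_size = 32  # equivalent to opt['batch_size'] = 32
assert opt.lr == opt['lr']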

@@ -62,11 +69,8 @@ def _prepare_cache_filepath(filepath):
os.makedirs(cache_dir)


# TODO: also save the arguments used when caching, and warn on load if they do not match.
def cache_results(_cache_fp, _refresh=False, _verbose=1):
"""
Alias: :class:`fastNLP.cache_results` :class:`fastNLP.core.uitls.cache_results`

cache_results is the decorator fastNLP provides for caching data. The following example shows how to use it::

import time
@@ -113,13 +117,13 @@ def cache_results(_cache_fp, _refresh=False, _verbose=1):
:param int _verbose: whether to print cache information.
:return:
"""
def wrapper_(func):
signature = inspect.signature(func)
for key, _ in signature.parameters.items():
if key in ('_cache_fp', '_refresh', '_verbose'):
raise RuntimeError("The function decorated by cache_results cannot have keyword `{}`.".format(key))
def wrapper(*args, **kwargs):
if '_cache_fp' in kwargs:
cache_filepath = kwargs.pop('_cache_fp')
@@ -137,16 +141,16 @@ def cache_results(_cache_fp, _refresh=False, _verbose=1):
else:
verbose = _verbose
refresh_flag = True
if cache_filepath is not None and refresh is False:
# load data
if os.path.exists(cache_filepath):
with open(cache_filepath, 'rb') as f:
results = _pickle.load(f)
if verbose == 1:
print("Read cache from {}.".format(cache_filepath))
logger.info("Read cache from {}.".format(cache_filepath))
refresh_flag = False
if refresh_flag:
results = func(*args, **kwargs)
if cache_filepath is not None:
@@ -155,12 +159,12 @@ def cache_results(_cache_fp, _refresh=False, _verbose=1):
_prepare_cache_filepath(cache_filepath)
with open(cache_filepath, 'wb') as f:
_pickle.dump(results, f)
print("Save cache to {}.".format(cache_filepath))
logger.info("Save cache to {}.".format(cache_filepath))
return results
return wrapper
return wrapper_
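
# [illustrative sketch, not part of this diff] Usage of the cache_results
# decorator above; the cache path and function are hypothetical. The first call
# runs the function and pickles the result; later calls read the cache unless
# _refresh=True is passed.
from fastNLP import cache_results

@cache_results('caches/demo.pkl')
def build_data():
    return list(range(10))  # stands in for expensive preprocessing

data = build_data()               # computed, then saved to caches/demo.pkl
data = build_data(_refresh=True)  # recomputed, cache overwritten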


@@ -189,49 +193,6 @@ def _save_model(model, model_name, save_dir, only_param=False):
model.to(_model_device)


# def save_pickle(obj, pickle_path, file_name):
# """Save an object into a pickle file.
#
# :param obj: an object
# :param pickle_path: str, the directory where the pickle file is to be saved
# :param file_name: str, the name of the pickle file. In general, it should be ended by "pkl".
# """
# if not os.path.exists(pickle_path):
# os.mkdir(pickle_path)
# print("make dir {} before saving pickle file".format(pickle_path))
# with open(os.path.join(pickle_path, file_name), "wb") as f:
# _pickle.dump(obj, f)
# print("{} saved in {}".format(file_name, pickle_path))
#
#
# def load_pickle(pickle_path, file_name):
# """Load an object from a given pickle file.
#
# :param pickle_path: str, the directory where the pickle file is.
# :param file_name: str, the name of the pickle file.
# :return obj: an object stored in the pickle
# """
# with open(os.path.join(pickle_path, file_name), "rb") as f:
# obj = _pickle.load(f)
# print("{} loaded from {}".format(file_name, pickle_path))
# return obj
#
#
# def pickle_exist(pickle_path, pickle_name):
# """Check if a given pickle file exists in the directory.
#
# :param pickle_path: the directory of target pickle file
# :param pickle_name: the filename of target pickle file
# :return: True if file exists else False
# """
# if not os.path.exists(pickle_path):
# os.makedirs(pickle_path)
# file_name = os.path.join(pickle_path, pickle_name)
# if os.path.exists(file_name):
# return True
# else:
# return False

def _move_model_to_device(model, device):
"""
Move the model to the given device
@@ -254,9 +215,9 @@ def _move_model_to_device(model, device):

:return: torch.nn.DataParallel or torch.nn.Module
"""
if isinstance(model, torch.nn.parallel.DistributedDataParallel):
raise RuntimeError("model of `torch.nn.parallel.DistributedDataParallel` is not supported right now.")
# if isinstance(model, torch.nn.parallel.DistributedDataParallel):
# raise RuntimeError("model of `torch.nn.parallel.DistributedDataParallel` is not supported right now.")
if device is None:
if isinstance(model, torch.nn.DataParallel):
model.cuda()
@@ -265,10 +226,10 @@ def _move_model_to_device(model, device):
if not torch.cuda.is_available() and (
device != 'cpu' or (isinstance(device, torch.device) and device.type != 'cpu')):
raise ValueError("There is no usable gpu. set `device` as `cpu` or `None`.")
if isinstance(model, torch.nn.DataParallel):
raise RuntimeError("When model is `torch.nn.DataParallel`, the device has to be `None`.")
if isinstance(device, int):
assert device > -1, "device can only be non-negative integer"
assert torch.cuda.device_count() > device, "Only has {} gpus, cannot use device {}.".format(
@@ -312,7 +273,7 @@ def _get_model_device(model):
"""
# TODO: this function carries some risk, since a model may have parameters that are not on the GPU (e.g. BertEmbedding), or parameters spread across GPUs
assert isinstance(model, nn.Module)
parameters = list(model.parameters())
if len(parameters) == 0:
return None
@@ -352,7 +313,6 @@ def _map_args(maps: dict, **kwargs):
output.update({name: val})
for keys in maps.keys():
if keys not in output.keys():
# TODO: add UNUSED warning.
pass
return output

@@ -473,10 +433,10 @@ def _move_dict_value_to_device(*args, device: torch.device, non_blocking=False):
"""
if not torch.cuda.is_available():
return
if not isinstance(device, torch.device):
raise TypeError(f"device must be `torch.device`, got `{type(device)}`")
for arg in args:
if isinstance(arg, dict):
for key, value in arg.items():
@@ -491,10 +451,10 @@ class _CheckError(Exception):

_CheckError. Used in losses.LossBase, metrics.MetricBase.
"""
def __init__(self, check_res: _CheckRes, func_signature: str):
errs = [f'Problems occurred when calling `{func_signature}`']
if check_res.varargs:
errs.append(f"\tvarargs: {check_res.varargs}(Does not support pass positional arguments, please delete it)")
if check_res.missing:
@@ -503,9 +463,9 @@ class _CheckError(Exception):
errs.append(f"\tduplicated param: {check_res.duplicated}")
if check_res.unused:
errs.append(f"\tunused param: {check_res.unused}")
Exception.__init__(self, '\n'.join(errs))
self.check_res = check_res
self.func_signature = func_signature

@@ -525,7 +485,7 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re
# if check_res.varargs:
# errs.append(f"\tvarargs: *{check_res.varargs}")
# suggestions.append(f"Does not support pass positional arguments, please delete *{check_res.varargs}.")
if check_res.unused:
for _unused in check_res.unused:
if _unused in target_dict:
@@ -536,7 +496,7 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re
unuseds.append(f"\tunused field: {_unused_field}")
if _unused_param:
unuseds.append(f"\tunused param: {_unused_param}") # output from predict or forward
module_name = func_signature.split('.')[0]
if check_res.missing:
errs.append(f"\tmissing param: {check_res.missing}")
@@ -557,7 +517,7 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re
mapped_missing.append(_miss)
else:
unmapped_missing.append(_miss)
for _miss in mapped_missing + unmapped_missing:
if _miss in dataset:
suggestions.append(f"Set `{_miss}` as target.")
@@ -570,29 +530,17 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re
else:
_tmp = f'Provide `{_miss}` in DataSet or output of {prev_func_signature}.'
suggestions.append(_tmp)
# for _miss in unmapped_missing:
# if _miss in dataset:
# suggestions.append(f"Set `{_miss}` as target.")
# else:
# _tmp = ''
# if check_res.unused:
# _tmp = f"Specify your assignment for `{input_func_map.get(_miss, _miss)}` when initialize {module_name}."
# if _tmp:
# _tmp += f' Or provide `{_miss}` in DataSet or output of {prev_func_signature}.'
# else:
# _tmp = f'Provide `{_miss}` in output of {prev_func_signature} or DataSet.'
# suggestions.append(_tmp)

if check_res.duplicated:
errs.append(f"\tduplicated param: {check_res.duplicated}.")
suggestions.append(f"Delete {check_res.duplicated} in the output of "
f"{prev_func_signature} or do not set {check_res.duplicated} as targets. ")

if len(errs) > 0:
errs.extend(unuseds)
elif check_level == STRICT_CHECK_LEVEL:
errs.extend(unuseds)

if len(errs) > 0:
errs.insert(0, f'Problems occurred when calling {func_signature}')
sugg_str = ""
@@ -619,11 +567,11 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re
def _check_forward_error(forward_func, batch_x, dataset, check_level):
check_res = _check_arg_dict_list(forward_func, batch_x)
func_signature = _get_func_signature(forward_func)
errs = []
suggestions = []
_unused = []
# if check_res.varargs:
# errs.append(f"\tvarargs: {check_res.varargs}")
# suggestions.append(f"Does not support pass positional arguments, please delete *{check_res.varargs}.")
@@ -644,14 +592,14 @@ def _check_forward_error(forward_func, batch_x, dataset, check_level):
# _tmp += f"Or you might find it in `unused field:`, you can use DataSet.rename_field() to " \
# f"rename the field in `unused field:`."
suggestions.append(_tmp)
if check_res.unused:
_unused = [f"\tunused field: {check_res.unused}"]
if len(errs) > 0:
errs.extend(_unused)
elif check_level == STRICT_CHECK_LEVEL:
errs.extend(_unused)
if len(errs) > 0:
errs.insert(0, f'Problems occurred when calling {func_signature}')
sugg_str = ""
@@ -699,7 +647,7 @@ def seq_len_to_mask(seq_len, max_len=None):
max_len = int(max_len) if max_len else int(seq_len.max())
broad_cast_seq_len = np.tile(np.arange(max_len), (len(seq_len), 1))
mask = broad_cast_seq_len < seq_len.reshape(-1, 1)
elif isinstance(seq_len, torch.Tensor):
assert seq_len.dim() == 1, f"seq_len can only have one dimension, got {seq_len.dim()}."
batch_size = seq_len.size(0)
@@ -708,7 +656,7 @@ def seq_len_to_mask(seq_len, max_len=None):
mask = broad_cast_seq_len.lt(seq_len.unsqueeze(1))
else:
raise TypeError("Only support 1-d numpy.ndarray or 1-d torch.Tensor.")
return mask
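As a quick illustration of the contract above, a minimal usage sketch (not part of the diff):

import torch
seq_len = torch.LongTensor([2, 4, 3])
mask = seq_len_to_mask(seq_len)             # shape (3, 4); max_len defaults to seq_len.max()
# mask[i, j] is True exactly when j < seq_len[i]
mask = seq_len_to_mask(seq_len, max_len=5)  # shape (3, 5) with an explicit max_len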


@@ -716,25 +664,25 @@ class _pseudo_tqdm:
"""
Prints progress information when tqdm cannot be imported, or when use_tqdm is set to False in Trainer.
"""
def __init__(self, **kwargs):
pass
self.logger = logger
def write(self, info):
print(info)
self.logger.info(info)
def set_postfix_str(self, info):
print(info)
self.logger.info(info)
def __getattr__(self, item):
def pass_func(*args, **kwargs):
pass
return pass_func
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
del self

@@ -788,3 +736,95 @@ def iob2bioes(tags: List[str]) -> List[str]:
else:
raise TypeError("Invalid IOB format.")
return new_tags
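For reference, a small sketch of the conversion this function performs:

iob2bioes(["B-PER", "I-PER", "O", "B-LOC"])
# -> ['B-PER', 'E-PER', 'O', 'S-LOC']: multi-token spans keep B/E boundaries, singletons become S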


def _is_iterable(value):
# Check whether the value is iterable (duck typing)
try:
iter(value)
return True
except BaseException as e:
return False
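Note that strings are also iterable, which is why the callers below pair this check with isinstance(x, str); a tiny sketch:

_is_iterable("abc")   # True - a str iterates over its characters
_is_iterable([1, 2])  # True
_is_iterable(42)      # False - iter() raises TypeError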


def get_seq_len(words, pad_value=0):
"""
Given a batch_size x max_len matrix of words, return the length of each sentence.

:param words: batch_size x max_len
:param pad_value: the value used for padding; positions equal to it do not count toward the length
:return: (batch_size,)
"""
mask = words.ne(pad_value)
return mask.sum(dim=-1)
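A minimal sketch of the intended use, assuming index 0 is the pad value:

import torch
words = torch.LongTensor([[3, 5, 0], [7, 0, 0]])  # two padded sentences
get_seq_len(words)  # tensor([2, 1])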


def pretty_table_printer(dataset_or_ins) -> PrettyTable:
"""
:param dataset_or_ins: a DataSet or an Instance, e.g.
ins = Instance(field_1=[1, 1, 1], field_2=[2, 2, 2], field_3=["a", "b", "c"])
+-----------+-----------+-----------------+
| field_1 | field_2 | field_3 |
+-----------+-----------+-----------------+
| [1, 1, 1] | [2, 2, 2] | ['a', 'b', 'c'] |
+-----------+-----------+-----------------+
:return: a pretty table, automatically truncated to fit the terminal size
"""
x = PrettyTable()
try:
sz = os.get_terminal_size()
column = sz.columns
row = sz.lines
except OSError:
column = 144
row = 11

if type(dataset_or_ins).__name__ == "DataSet":
x.field_names = list(dataset_or_ins.field_arrays.keys())
c_size = len(x.field_names)
for ins in dataset_or_ins:
x.add_row([sub_column(ins[k], column, c_size, k) for k in x.field_names])
row -= 1
if row < 0:
x.add_row(["..." for _ in range(c_size)])
break
elif type(dataset_or_ins).__name__ == "Instance":
x.field_names = list(dataset_or_ins.fields.keys())
c_size = len(x.field_names)
x.add_row([sub_column(dataset_or_ins[k], column, c_size, k) for k in x.field_names])

else:
raise Exception("only accept DataSet and Instance")
x.align = "l"

return x
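A brief usage sketch matching the docstring example above (pretty_table_printer is defined in this module):

from fastNLP import Instance
ins = Instance(field_1=[1, 1, 1], field_2=[2, 2, 2], field_3=["a", "b", "c"])
print(pretty_table_printer(ins))  # renders the table shown in the docstring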


def sub_column(string: str, c: int, c_size: int, title: str) -> str:
"""
:param string: the string to truncate
:param c: the number of terminal columns
:param c_size: the number of fields in the instance or dataset
:param title: the column title
:return: the column content, truncated if it is too long
"""
avg = max(int(c / c_size / 2), len(title))
string = str(string)
res = ""
counter = 0
for char in string:
if ord(char) > 255:
counter += 2
else:
counter += 1
res += char
if counter > avg:
res = res + "..."
break
return res
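For example, with an 80-column terminal and 4 fields, each column gets avg = max(80 // 4 // 2, len(title)) = 10 characters before truncation (characters above U+00FF count double):

sub_column("a" * 100, c=80, c_size=4, title="words")  # -> 'aaaaaaaaaaa...'
sub_column("short", c=80, c_size=4, title="words")    # -> 'short' (within the limit)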


def _check_fp16():
if amp is None:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
if not torch.backends.cudnn.enabled:
raise RuntimeError("Amp requires cudnn backend to be enabled.")

+107 -73   fastNLP/core/vocabulary.py

@@ -1,14 +1,21 @@
"""
.. todo::
doc
"""

__all__ = [
"Vocabulary",
"VocabularyOption",
]

from collections import Counter
from functools import partial
from functools import wraps
from collections import Counter, defaultdict

from ._logger import logger
from .dataset import DataSet
from .utils import Option
from functools import partial
import numpy as np
from .utils import _is_iterable


class VocabularyOption(Option):
@@ -32,7 +39,7 @@ def _check_build_vocab(func):
@wraps(func) # to solve missing docstring
def _wrapper(self, *args, **kwargs):
if self.word2idx is None or self.rebuild is True:
if self._word2idx is None or self.rebuild is True:
self.build_vocab()
return func(self, *args, **kwargs)
@@ -49,8 +56,8 @@ def _check_build_status(func):
if self.rebuild is False:
self.rebuild = True
if self.max_size is not None and len(self.word_count) >= self.max_size:
print("[Warning] Vocabulary has reached the max size {} when calling {} method. "
"Adding more words may cause unexpected behaviour of Vocabulary. ".format(
logger.info("[Warning] Vocabulary has reached the max size {} when calling {} method. "
"Adding more words may cause unexpected behaviour of Vocabulary. ".format(
self.max_size, func.__name__))
return func(self, *args, **kwargs)
@@ -59,8 +66,6 @@ def _check_build_status(func):

class Vocabulary(object):
"""
Alias :class:`fastNLP.Vocabulary` :class:`fastNLP.core.vocabulary.Vocabulary`
Builds, stores, and applies a one-to-one mapping from `str` to `int`::

vocab = Vocabulary()
@@ -68,32 +73,52 @@ class Vocabulary(object):
vocab.update(word_list)
vocab["word"] # str to int
vocab.to_word(5) # int to str

:param int max_size: the maximum size of the `Vocabulary`, i.e. the maximum number of words it can store.
    If ``None``, the size is unlimited. Default: ``None``
:param int min_freq: the minimum frequency a word must reach in the text to be recorded; should be >= 1.
    Words below this frequency are treated as `unknown`. If ``None``, every word in the text is recorded. Default: ``None``
:param str optional padding: the padding token. If set to ``None``, the vocabulary keeps no padding token and it
    is not counted in the vocabulary size; ``None`` is mostly used when building a Vocabulary for labels.
    Default: '<pad>'
:param str optional unknown: the unknown token; every unrecorded word is treated as unknown when converted to `int`.
    If set to ``None``, the vocabulary keeps no unknown token and it is not counted in the vocabulary size;
    ``None`` is mostly used when building a Vocabulary for labels.
    Default: '<unk>'
"""

def __init__(self, max_size=None, min_freq=None, padding='<pad>', unknown='<unk>'):
    """
    :param int max_size: the maximum size of the `Vocabulary`, i.e. the maximum number of words it can store.
        If ``None``, the size is unlimited. Default: ``None``
    :param int min_freq: the minimum frequency a word must reach in the text to be recorded; should be >= 1.
        Words below this frequency are treated as `unknown`. If ``None``, every word in the text is recorded. Default: ``None``
    :param str optional padding: the padding token. If set to ``None``, the vocabulary keeps no padding token and it
        is not counted in the vocabulary size; ``None`` is mostly used when building a Vocabulary for labels.
        Default: '<pad>'
    :param str optional unknown: the unknown token; every unrecorded word is treated as unknown when converted to `int`.
        If set to ``None``, the vocabulary keeps no unknown token and it is not counted in the vocabulary size;
        ``None`` is mostly used when building a Vocabulary for labels.
        Default: '<unk>'
    """
self.max_size = max_size
self.min_freq = min_freq
self.word_count = Counter()
self.unknown = unknown
self.padding = padding
self.word2idx = None
self.idx2word = None
self._word2idx = None
self._idx2word = None
self.rebuild = True
# Holds words that do not need their own entries; see the from_dataset() method for details
self._no_create_word = Counter()

@property
@_check_build_vocab
def word2idx(self):
return self._word2idx

@word2idx.setter
def word2idx(self, value):
self._word2idx = value

@property
@_check_build_vocab
def idx2word(self):
return self._idx2word

@idx2word.setter
def idx2word(self, value):
self._idx2word = value
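With word2idx and idx2word now exposed as properties guarded by _check_build_vocab, the vocabulary builds itself lazily on first access; a small sketch:

from fastNLP import Vocabulary
vocab = Vocabulary()
vocab.update(["apple", "banana"])
vocab.word2idx           # first access triggers build_vocab() automatically
vocab.to_index("apple")  # no explicit build_vocab() call is needed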

@_check_build_status
def update(self, word_lst, no_create_entry=False):
"""依次增加序列中词在词典中的出现频率
@@ -131,11 +156,11 @@ class Vocabulary(object):
"""
When a new word is added, check the _no_create_word setting.

:param str, List[str] word:
:param str List[str] word:
:param bool no_create_entry:
:return:
"""
if isinstance(word, str):
if isinstance(word, str) or not _is_iterable(word):
word = [word]
for w in word:
if no_create_entry and self.word_count.get(w, 0) == self._no_create_word.get(w, 0):
@@ -180,36 +205,36 @@ class Vocabulary(object):
Words already recorded in the vocabulary keep their corresponding `int`, however.

"""
if self.word2idx is None:
self.word2idx = {}
if self._word2idx is None:
self._word2idx = {}
if self.padding is not None:
self.word2idx[self.padding] = len(self.word2idx)
self._word2idx[self.padding] = len(self._word2idx)
if self.unknown is not None:
self.word2idx[self.unknown] = len(self.word2idx)
self._word2idx[self.unknown] = len(self._word2idx)
max_size = min(self.max_size, len(self.word_count)) if self.max_size else None
words = self.word_count.most_common(max_size)
if self.min_freq is not None:
words = filter(lambda kv: kv[1] >= self.min_freq, words)
if self.word2idx is not None:
words = filter(lambda kv: kv[0] not in self.word2idx, words)
start_idx = len(self.word2idx)
self.word2idx.update({w: i + start_idx for i, (w, _) in enumerate(words)})
if self._word2idx is not None:
words = filter(lambda kv: kv[0] not in self._word2idx, words)
start_idx = len(self._word2idx)
self._word2idx.update({w: i + start_idx for i, (w, _) in enumerate(words)})
self.build_reverse_vocab()
self.rebuild = False
return self
def build_reverse_vocab(self):
"""
Build the `index to word` dict from the `word to index` dict.

"""
self.idx2word = {i: w for w, i in self.word2idx.items()}
self._idx2word = {i: w for w, i in self._word2idx.items()}
return self
@_check_build_vocab
def __len__(self):
return len(self.word2idx)
return len(self._word2idx)
@_check_build_vocab
def __contains__(self, item):
@@ -219,7 +244,7 @@ class Vocabulary(object):
:param item: the word
:return: True or False
"""
return item in self.word2idx
return item in self._word2idx
def has_word(self, w):
"""
@@ -241,12 +266,12 @@ class Vocabulary(object):

vocab[w]
"""
if w in self.word2idx:
return self.word2idx[w]
if w in self._word2idx:
return self._word2idx[w]
if self.unknown is not None:
return self.word2idx[self.unknown]
return self._word2idx[self.unknown]
else:
raise ValueError("word {} not in vocabulary".format(w))
raise ValueError("word `{}` not in vocabulary".format(w))
@_check_build_vocab
def index_dataset(self, *datasets, field_name, new_field_name=None):
@@ -257,37 +282,47 @@ class Vocabulary(object):
vocab.index_dataset(train_data, dev_data, test_data, field_name='words')

:param ~fastNLP.DataSet,List[~fastNLP.DataSet] datasets: one or more datasets to index
:param str field_name: the field to index; if several DataSets are given, every DataSet must contain this field.
    Currently supports ``str`` , ``List[str]`` , ``List[List[str]]``
:param str new_field_name: the field_name in which to store the result. If ``None``, the original field is overwritten.
    Default: ``None``
:param list,str field_name: the field(s) to index; if several DataSets are given, every DataSet must contain these fields.
    Currently supports ``str`` , ``List[str]``
:param list,str new_field_name: the field_name(s) in which to store the result. If ``None``, the original field is overwritten.
    Default: ``None``.
"""
def index_instance(ins):
def index_instance(field):
"""
Handles several cases: str, 1d-list, 2d-list
:param field:
:return:
"""
field = ins[field_name]
if isinstance(field, str):
if isinstance(field, str) or not _is_iterable(field):
return self.to_index(field)
elif isinstance(field, list):
if not isinstance(field[0], list):
else:
if isinstance(field[0], str) or not _is_iterable(field[0]):
return [self.to_index(w) for w in field]
else:
if isinstance(field[0][0], list):
if not isinstance(field[0][0], str) and _is_iterable(field[0][0]):
raise RuntimeError("Only support field with 2 dimensions.")
return [[self.to_index(c) for c in w] for w in field]
if new_field_name is None:
new_field_name = field_name
new_field_name = new_field_name or field_name
if type(new_field_name) == type(field_name):
if isinstance(new_field_name, list):
assert len(new_field_name) == len(field_name), "new_field_name should have the same number of " \
                                               "elements as field_name."
elif isinstance(new_field_name, str):
field_name = [field_name]
new_field_name = [new_field_name]
else:
raise TypeError("field_name and new_field_name can only be str or List[str].")
for idx, dataset in enumerate(datasets):
if isinstance(dataset, DataSet):
try:
dataset.apply(index_instance, new_field_name=new_field_name)
for f_n, n_f_n in zip(field_name, new_field_name):
dataset.apply_field(index_instance, field_name=f_n, new_field_name=n_f_n)
except Exception as e:
print("When processing the `{}` dataset, the following error occurred.".format(idx))
logger.info("When processing the `{}` dataset, the following error occurred.".format(idx))
raise e
else:
raise RuntimeError("Only DataSet type is allowed.")
@@ -306,9 +341,8 @@ class Vocabulary(object):

:param ~fastNLP.DataSet,List[~fastNLP.DataSet] datasets: one or more datasets from which to build the vocabulary
:param str,List[str] field_name: may be ``str`` or ``List[str]`` .
    The field(s) used to build the vocabulary; one or more fields are supported.
    If several DataSets are given, every DataSet must contain these fields.
    Currently supported field structures: ``str`` , ``List[str]`` , ``list[List[str]]``
    The field(s) used to build the vocabulary; one or more fields are supported. If several DataSets are given,
    every DataSet must contain these fields. Currently supported field structures: ``str`` , ``List[str]``
:param no_create_entry_dataset: a DataSet, a List[DataSet], or None (default). This option is for the case where the
    downstream model will use pretrained embeddings (glove, word2vec, elmo, bert) and fine-tune them. Building the
    vocabulary from the train data alone prevents the test and dev data from fully exploiting the information in the
    pretrained embeddings, so taking test and dev into account when building the vocabulary gives better final results.
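A sketch of the recommended pattern described above (train_data, dev_data, and test_data are placeholder names):

from fastNLP import Vocabulary
vocab = Vocabulary()
vocab.from_dataset(train_data, field_name='words',
                   no_create_entry_dataset=[dev_data, test_data])
# dev/test words are recorded but flagged in _no_create_word, so a fine-tuned
# pretrained embedding can cover them without creating separate trainable entries.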
@@ -326,14 +360,14 @@ class Vocabulary(object):
def construct_vocab(ins, no_create_entry=False):
for fn in field_name:
field = ins[fn]
if isinstance(field, str):
if isinstance(field, str) or not _is_iterable(field):
self.add_word(field, no_create_entry=no_create_entry)
elif isinstance(field, (list, np.ndarray)):
if not isinstance(field[0], (list, np.ndarray)):
else:
if isinstance(field[0], str) or not _is_iterable(field[0]):
for word in field:
self.add_word(word, no_create_entry=no_create_entry)
else:
if isinstance(field[0][0], (list, np.ndarray)):
if not isinstance(field[0][0], str) and _is_iterable(field[0][0]):
raise RuntimeError("Only support field with 2 dimensions.")
for words in field:
for word in words:
@@ -343,8 +377,8 @@ class Vocabulary(object):
if isinstance(dataset, DataSet):
try:
dataset.apply(construct_vocab)
except Exception as e:
print("When processing the `{}` dataset, the following error occurred.".format(idx))
except BaseException as e:
logger.error("When processing the `{}` dataset, the following error occurred:".format(idx))
raise e
else:
raise TypeError("Only DataSet type is allowed.")
@@ -370,7 +404,7 @@ class Vocabulary(object):
def to_index(self, w):
"""
Convert a word to its index. A word not recorded in the vocabulary is treated as unknown; if ``unknown=None``, a``ValueError``is raised::
Convert a word to its index. A word not recorded in the vocabulary is treated as unknown; if ``unknown=None``, a ``ValueError`` is raised::

index = vocab.to_index('abc')
# equals to
@@ -389,7 +423,7 @@ class Vocabulary(object):
"""
if self.unknown is None:
return None
return self.word2idx[self.unknown]
return self._word2idx[self.unknown]
@property
@_check_build_vocab
@@ -399,7 +433,7 @@ class Vocabulary(object):
"""
if self.padding is None:
return None
return self.word2idx[self.padding]
return self._word2idx[self.padding]
@_check_build_vocab
def to_word(self, idx):
@@ -409,7 +443,7 @@ class Vocabulary(object):
:param int idx: the index
:return str word: the word
"""
return self.idx2word[idx]
return self._idx2word[idx]
def clear(self):
"""
@@ -418,8 +452,8 @@ class Vocabulary(object):
:return:
"""
self.word_count.clear()
self.word2idx = None
self.idx2word = None
self._word2idx = None
self._idx2word = None
self.rebuild = True
self._no_create_word.clear()
return self
@@ -430,8 +464,8 @@ class Vocabulary(object):
"""
len(self) # make sure vocab has been built
state = self.__dict__.copy()
# no need to pickle idx2word as it can be constructed from word2idx
del state['idx2word']
# no need to pickle _idx2word as it can be constructed from _word2idx
del state['_idx2word']
return state
def __setstate__(self, state):
@@ -446,5 +480,5 @@ class Vocabulary(object):
@_check_build_vocab
def __iter__(self):
for word, index in self.word2idx.items():
for word, index in self._word2idx.items():
yield word, index

Some files were not shown because too many files changed in this diff
