
init dev0.8.0

tags/v1.0.0alpha · yh_cc · 2 years ago · commit 916a2d2c30
100 changed files with 1 addition and 12,639 deletions
  1. .Jenkinsfile (+0, -47)
  2. .coverage (+0, -1)
  3. .gitignore (+0, -18)
  4. .travis.yml (+0, -30)
  5. MANIFEST.in (+0, -7)
  6. README.md (+1, -130)
  7. codecov.yml (+0, -5)
  8. docs/Makefile (+0, -33)
  9. docs/README.md (+0, -40)
  10. docs/check_tools.py (+0, -191)
  11. docs/requirements.txt (+0, -4)
  12. docs/source/_static/notebooks/extend_1_bert_embedding.ipynb (+0, -260)
  13. docs/source/_static/notebooks/tutorial_1_data_preprocess.ipynb (+0, -292)
  14. docs/source/_static/notebooks/tutorial_2_vocabulary.ipynb (+0, -343)
  15. docs/source/_static/notebooks/tutorial_3_embedding.ipynb (+0, -524)
  16. docs/source/_static/notebooks/tutorial_4_load_dataset.ipynb (+0, -309)
  17. docs/source/_static/notebooks/tutorial_5_loss_optimizer.ipynb (+0, -603)
  18. docs/source/_static/notebooks/tutorial_6_datasetiter.ipynb (+0, -681)
  19. docs/source/_static/notebooks/tutorial_7_metrics.ipynb (+0, -1206)
  20. docs/source/_static/notebooks/tutorial_8_modules_models.ipynb (+0, -1014)
  21. docs/source/_static/notebooks/tutorial_9_callback.ipynb (+0, -622)
  22. docs/source/_static/notebooks/序列标注.ipynb (+0, -912)
  23. docs/source/_static/notebooks/文本分类.ipynb (+0, -564)
  24. docs/source/conf.py (+0, -181)
  25. docs/source/fastNLP.core.batch.rst (+0, -7)
  26. docs/source/fastNLP.core.callback.rst (+0, -7)
  27. docs/source/fastNLP.core.const.rst (+0, -7)
  28. docs/source/fastNLP.core.dataset.rst (+0, -7)
  29. docs/source/fastNLP.core.field.rst (+0, -7)
  30. docs/source/fastNLP.core.instance.rst (+0, -7)
  31. docs/source/fastNLP.core.losses.rst (+0, -7)
  32. docs/source/fastNLP.core.metrics.rst (+0, -7)
  33. docs/source/fastNLP.core.optimizer.rst (+0, -7)
  34. docs/source/fastNLP.core.rst (+0, -25)
  35. docs/source/fastNLP.core.sampler.rst (+0, -7)
  36. docs/source/fastNLP.core.tester.rst (+0, -7)
  37. docs/source/fastNLP.core.trainer.rst (+0, -7)
  38. docs/source/fastNLP.core.utils.rst (+0, -7)
  39. docs/source/fastNLP.core.vocabulary.rst (+0, -7)
  40. docs/source/fastNLP.embeddings.bert_embedding.rst (+0, -6)
  41. docs/source/fastNLP.embeddings.char_embedding.rst (+0, -6)
  42. docs/source/fastNLP.embeddings.contextual_embedding.rst (+0, -6)
  43. docs/source/fastNLP.embeddings.elmo_embedding.rst (+0, -6)
  44. docs/source/fastNLP.embeddings.embedding.rst (+0, -6)
  45. docs/source/fastNLP.embeddings.rst (+0, -20)
  46. docs/source/fastNLP.embeddings.stack_embedding.rst (+0, -6)
  47. docs/source/fastNLP.embeddings.static_embedding.rst (+0, -6)
  48. docs/source/fastNLP.embeddings.utils.rst (+0, -6)
  49. docs/source/fastNLP.io.data_bundle.rst (+0, -7)
  50. docs/source/fastNLP.io.embed_loader.rst (+0, -7)
  51. docs/source/fastNLP.io.file_utils.rst (+0, -7)
  52. docs/source/fastNLP.io.loader.rst (+0, -7)
  53. docs/source/fastNLP.io.model_io.rst (+0, -7)
  54. docs/source/fastNLP.io.pipe.rst (+0, -7)
  55. docs/source/fastNLP.io.rst (+0, -20)
  56. docs/source/fastNLP.io.utils.rst (+0, -7)
  57. docs/source/fastNLP.models.bert.rst (+0, -6)
  58. docs/source/fastNLP.models.biaffine_parser.rst (+0, -6)
  59. docs/source/fastNLP.models.cnn_text_classification.rst (+0, -6)
  60. docs/source/fastNLP.models.rst (+0, -18)
  61. docs/source/fastNLP.models.sequence_labeling.rst (+0, -6)
  62. docs/source/fastNLP.models.snli.rst (+0, -6)
  63. docs/source/fastNLP.models.star_transformer.rst (+0, -6)
  64. docs/source/fastNLP.modules.decoder.rst (+0, -6)
  65. docs/source/fastNLP.modules.encoder.rst (+0, -6)
  66. docs/source/fastNLP.modules.rst (+0, -15)
  67. docs/source/fastNLP.modules.utils.rst (+0, -6)
  68. docs/source/fastNLP.rst (+0, -18)
  69. docs/source/figures/fitlogChart.png (BIN)
  70. docs/source/figures/fitlogTable.png (BIN)
  71. docs/source/figures/procedures.PNG (BIN)
  72. docs/source/figures/sequence_labeling.PNG (BIN)
  73. docs/source/figures/text_classification.png (BIN)
  74. docs/source/figures/workflow.png (BIN)
  75. docs/source/index.rst (+0, -56)
  76. docs/source/modules.rst (+0, -7)
  77. docs/source/tutorials/cn_cls_example.png (BIN)
  78. docs/source/tutorials/extend_1_bert_embedding.rst (+0, -231)
  79. docs/source/tutorials/extend_2_dist.rst (+0, -223)
  80. docs/source/tutorials/extend_3_fitlog.rst (+0, -122)
  81. docs/source/tutorials/tutorial_1_data_preprocess.rst (+0, -172)
  82. docs/source/tutorials/tutorial_2_vocabulary.rst (+0, -140)
  83. docs/source/tutorials/tutorial_3_embedding.rst (+0, -462)
  84. docs/source/tutorials/tutorial_4_load_dataset.rst (+0, -219)
  85. docs/source/tutorials/tutorial_5_loss_optimizer.rst (+0, -248)
  86. docs/source/tutorials/tutorial_6_datasetiter.rst (+0, -423)
  87. docs/source/tutorials/tutorial_7_metrics.rst (+0, -135)
  88. docs/source/tutorials/tutorial_8_modules_models.rst (+0, -193)
  89. docs/source/tutorials/tutorial_9_callback.rst (+0, -140)
  90. docs/source/tutorials/序列标注.rst (+0, -208)
  91. docs/source/tutorials/文本分类.rst (+0, -542)
  92. docs/source/user/api_update.rst (+0, -15)
  93. docs/source/user/example.rst (+0, -162)
  94. docs/source/user/installation.rst (+0, -24)
  95. docs/source/user/quickstart.rst (+0, -14)
  96. docs/source/user/tutorials.rst (+0, -25)
  97. fastNLP/__init__.py (+0, -98)
  98. fastNLP/core/__init__.py (+0, -112)
  99. fastNLP/core/_logger.py (+0, -179)
  100. fastNLP/core/_parallel_utils.py (+0, -107)

.Jenkinsfile (+0, -47)

@@ -1,47 +0,0 @@
pipeline {
    agent {
        docker {
            image 'ubuntu_tester'
            args '-u root:root -v ${JENKINS_HOME}/html/docs:/docs -v ${JENKINS_HOME}/html/_ci:/ci'
        }
    }
    environment {
        TRAVIS = 1
        PJ_NAME = 'fastNLP'
        POST_URL = 'https://open.feishu.cn/open-apis/bot/v2/hook/14719364-818d-4f88-9057-7c9f0eaaf6ae'
    }
    stages {
        stage('Package Installation') {
            steps {
                sh 'python setup.py install'
            }
        }
        stage('Parallel Stages') {
            parallel {
                stage('Document Building') {
                    steps {
                        sh 'cd docs && make prod'
                        sh 'rm -rf /docs/${PJ_NAME}'
                        sh 'mv docs/build/html /docs/${PJ_NAME}'
                    }
                }
                stage('Package Testing') {
                    steps {
                        sh 'pip install fitlog'
                        sh 'pytest ./tests --html=test_results.html --self-contained-html'
                    }
                }
            }
        }
    }
    post {
        failure {
            sh 'post 1'
        }
        success {
            sh 'post 0'
            sh 'post github'
        }
    }
}

.coverage (+0, -1)
File diff suppressed because it is too large


.gitignore (+0, -18)

@@ -1,18 +0,0 @@
.gitignore

.DS_Store
.ipynb_checkpoints
*.pyc
__pycache__
*.swp
.vscode/
.idea/**

caches

# fitlog
.fitlog
logs/
.fitconfig

docs/build

.travis.yml (+0, -30)

@@ -1,30 +0,0 @@
language: python
python:
  - "3.6"

env:
  - TRAVIS=1

# command to install dependencies
install:
  - pip install --quiet -r requirements.txt
  - pip install --quiet fitlog
  - pip install pytest>=3.6
  - pip install pytest-cov
# command to run tests
script:
  # - python -m spacy download en
  - pytest --cov=fastNLP tests/

after_success:
  - bash <(curl -s https://codecov.io/bash)

notifications:
  webhooks:
    urls:
      - https://open.feishu.cn/officialapp/notify/55ba4b15d04608e875c122f11484a4e2fa807c42b9ca074509bea654d1b99ca6
    on_success: always  # default: always
    on_failure: always  # default: always
    on_start: never     # default: never
    on_cancel: always   # default: always
    on_error: always    # default: always

MANIFEST.in (+0, -7)

@@ -1,7 +0,0 @@
include requirements.txt
include LICENSE
include README.md
prune tests/
prune reproduction/
prune fastNLP/api
prune fastNLP/automl

README.md (+1, -130)

@@ -6,133 +6,4 @@
![Hex.pm](https://img.shields.io/hexpm/l/plug.svg)
[![Documentation Status](https://readthedocs.org/projects/fastnlp/badge/?version=latest)](http://fastnlp.readthedocs.io/?badge=latest)


fastNLP is a lightweight natural language processing (NLP) toolkit. Its goal is to make it quick to implement NLP tasks and to build complex models.

fastNLP has the following features:

- A unified tabular data container that simplifies data preprocessing;
- Built-in Loaders and Pipes for many datasets, so no preprocessing code has to be written;
- A variety of handy NLP utilities, such as embedding loading (including ELMo and BERT) and caching of intermediate data;
- Automatic download of some [datasets and pretrained models](https://docs.qq.com/sheet/DVnpkTnF6VW9UeXdh?c=A1A0A0);
- Many neural network components and reproduced models (covering Chinese word segmentation, named entity recognition, dependency parsing, text classification, text matching, coreference resolution, summarization, and more);
- A Trainer with many built-in Callback functions for experiment logging, exception handling, and so on.
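
As a rough sketch of how these pieces fit together (API names follow the 0.x tutorials linked below — `DataSet`, `Vocabulary`, `CNNText`, `Trainer`; details such as the exact `CNNText` signature and the `set_input`/`set_target` calls should be read as illustrative assumptions rather than the authoritative API):

```python
from fastNLP import DataSet, Vocabulary, Trainer, CrossEntropyLoss, AccuracyMetric
from fastNLP.models import CNNText

# a toy dataset: raw sentences plus integer labels
dataset = DataSet({'raw_words': ["this is a demo .", "another demo sentence ."],
                   'target': [0, 1]})
dataset.apply(lambda ins: ins['raw_words'].split(), new_field_name='words')
dataset.apply(lambda ins: len(ins['words']), new_field_name='seq_len')

# build the vocabulary and convert words to indices
vocab = Vocabulary()
vocab.from_dataset(dataset, field_name='words')
vocab.index_dataset(dataset, field_name='words')

# mark which fields are fed to the model and which are used as labels
dataset.set_input('words', 'seq_len')
dataset.set_target('target')

# a built-in CNN text classifier; (vocab_size, embed_dim) initializes a random embedding
model = CNNText((len(vocab), 50), num_classes=2)

trainer = Trainer(dataset, model, loss=CrossEntropyLoss(), metrics=AccuracyMetric(),
                  dev_data=dataset, n_epochs=1)
trainer.train()
```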

## Installation

fastNLP depends on the following packages:

+ numpy>=1.14.2
+ torch>=1.0.0
+ tqdm>=4.28.1
+ nltk>=3.4.1
+ requests
+ spacy
+ prettytable>=0.7.2

The installation of torch may depend on your operating system and CUDA version; please refer to the [PyTorch website](https://pytorch.org/).
Once the dependencies are installed, you can complete the installation by running the following commands:

```shell
pip install fastNLP
python -m spacy download en
```


## fastNLP Tutorials
Chinese [documentation](http://www.fastnlp.top/docs/fastNLP/) and [tutorials](http://www.fastnlp.top/docs/fastNLP/user/quickstart.html)

### Quick Start

- [Quick-1. Text Classification](http://www.fastnlp.top/docs/fastNLP/tutorials/%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB.html)
- [Quick-2. Sequence Labeling](http://www.fastnlp.top/docs/fastNLP/tutorials/%E5%BA%8F%E5%88%97%E6%A0%87%E6%B3%A8.html)

### Detailed Tutorials

- [1. Preprocessing text with DataSet](http://www.fastnlp.top/docs/fastNLP/tutorials/tutorial_1_data_preprocess.html)
- [2. Converting between text and indices with Vocabulary](http://www.fastnlp.top/docs/fastNLP/tutorials/tutorial_2_vocabulary.html)
- [3. Turning text into vectors with the Embedding module](http://www.fastnlp.top/docs/fastNLP/tutorials/tutorial_3_embedding.html)
- [4. Loading and processing datasets with Loader and Pipe](http://www.fastnlp.top/docs/fastNLP/tutorials/tutorial_4_load_dataset.html)
- [5. Building a text classifier I: fast training and testing with Trainer and Tester](http://www.fastnlp.top/docs/fastNLP/tutorials/tutorial_5_loss_optimizer.html)
- [6. Building a text classifier II: a custom training loop with DataSetIter](http://www.fastnlp.top/docs/fastNLP/tutorials/tutorial_6_datasetiter.html)
- [7. Evaluating your model quickly with Metric](http://www.fastnlp.top/docs/fastNLP/tutorials/tutorial_7_metrics.html)
- [8. Building custom models quickly with Modules and Models](http://www.fastnlp.top/docs/fastNLP/tutorials/tutorial_8_modules_models.html)
- [9. Customizing your training process with Callback](http://www.fastnlp.top/docs/fastNLP/tutorials/tutorial_9_callback.html)

### Extended Tutorials

- [Extend-1. The many uses of BertEmbedding](http://www.fastnlp.top/docs/fastNLP/tutorials/extend_1_bert_embedding.html)
- [Extend-2. An introduction to distributed training](http://www.fastnlp.top/docs/fastNLP/tutorials/extend_2_dist.html)
- [Extend-3. Using fitlog with fastNLP for research](http://www.fastnlp.top/docs/fastNLP/tutorials/extend_3_fitlog.html)


## Built-in Components

Most neural networks used for NLP tasks can be viewed as consisting of word embeddings plus two kinds of modules: an encoder and a decoder.

Taking text classification as an example, the figure below shows the flow of a BiLSTM+Attention text classifier:


![](./docs/source/figures/text_classification.png)

fastNLP ships with several kinds of embeddings in the embeddings package: static embeddings (GloVe, word2vec), contextual embeddings (ELMo, BERT), and character embeddings (CNN- or LSTM-based CharEmbedding).

At the same time, the modules package provides many components of both kinds, helping users quickly assemble the networks they need. Their roles and common components are listed below; a small sketch of combining them follows the table.

<table>
<tr>
    <td><b> Type </b></td>
    <td><b> Role </b></td>
    <td><b> Examples </b></td>
</tr>
<tr>
    <td> encoder </td>
    <td> encodes the input into a vector with representational power </td>
    <td> Embedding, RNN, CNN, Transformer, ... </td>
</tr>
<tr>
    <td> decoder </td>
    <td> decodes a vector carrying some representation into the desired output form </td>
    <td> MLP, CRF, ... </td>
</tr>
</table>
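
A minimal sketch of the embeddings + encoder + decoder split (assuming `LSTM` and `MLP` can be imported from `fastNLP.modules`, as in the modules/models tutorial; the exact signatures here are illustrative assumptions, not a guaranteed API):

```python
import torch
from fastNLP import Vocabulary
from fastNLP.embeddings import StaticEmbedding
from fastNLP.modules import LSTM, MLP

vocab = Vocabulary()
vocab.add_word_lst("this is a demo .".split())

embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=50)  # embeddings
encoder = LSTM(input_size=50, hidden_size=64, bidirectional=True)         # encoder
decoder = MLP([128, 2])                                                   # decoder: 2 classes

words = torch.LongTensor([[vocab.to_index(w) for w in "this is a demo .".split()]])
hidden, _ = encoder(embed(words))        # [batch, seq_len, 2 * hidden]
logits = decoder(hidden.max(dim=1)[0])   # max-pool over time, then classify
print(logits.shape)                      # expected: torch.Size([1, 2])
```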


## Project Structure

<div align=center><img width="450" height="350" src="./docs/source/figures/workflow.png"/></div>



The overall fastNLP workflow is shown in the figure above, and the project is organized as follows:

<table>
<tr>
    <td><b> fastNLP </b></td>
    <td> an open-source natural language processing library </td>
</tr>
<tr>
    <td><b> fastNLP.core </b></td>
    <td> implements the core functionality, including data-handling components, the trainer, the tester, and so on </td>
</tr>
<tr>
    <td><b> fastNLP.models </b></td>
    <td> implements a number of complete neural network models </td>
</tr>
<tr>
    <td><b> fastNLP.modules </b></td>
    <td> implements the many components used to build neural network models </td>
</tr>
<tr>
    <td><b> fastNLP.embeddings </b></td>
    <td> implements the conversion of index sequences into vector sequences, including loading pretrained embeddings </td>
</tr>
<tr>
    <td><b> fastNLP.io </b></td>
    <td> implements reading and writing, including data loading and preprocessing, model saving and loading, and automatic download of data and models </td>
</tr>
</table>

<hr>

*In memory of @FengZiYjun. May his soul rest in peace. We will miss you very very much!*
dev0.8.0 is under development

codecov.yml (+0, -5)

@@ -1,5 +0,0 @@
ignore:
- "reproduction" # ignore folders and all its contents
- "setup.py"
- "docs"
- "tutorials"

docs/Makefile (+0, -33)

@@ -1,33 +0,0 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line.
SPHINXOPTS    =
SPHINXAPIDOC  = sphinx-apidoc
SPHINXBUILD   = sphinx-build
SPHINXPROJ    = fastNLP
SOURCEDIR     = source
BUILDDIR      = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

apidoc:
	$(SPHINXAPIDOC) -efM -o source ../$(SPHINXPROJ)

server:
	cd build/html && python -m http.server

dev:
	rm -f source/$(SPHINXPROJ).* source/modules.rst && rm -rf build && make apidoc && make html && make server

prod:
	make apidoc && make html

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

docs/README.md (+0, -40)

@@ -1,40 +0,0 @@
# Quick Start: Writing fastNLP Documentation

This guide is written for fastNLP documentation authors, which includes both collaborating developers and documentation maintainers. In most cases you belong to the former group and only need to understand part of the framework.

## Collaborating Developers

The fastNLP documentation is generated with the [Sphinx](http://sphinx.pocoo.org/) tool from sources written in the [reStructuredText markup language](http://docutils.sourceforge.net/rst.html), and is built and hosted automatically by [Read the Docs](https://readthedocs.org/).
A regular developer only needs to write documentation that follows the reStructuredText syntax and submit it via a [PR](https://help.github.com/en/articles/about-pull-requests)
to contribute to the fastNLP docs.

If you want to build the documentation locally and write larger pieces of documentation, you need to install the Sphinx tool and the sphinx-rtd-theme theme:
```bash
fastNLP/docs> pip install sphinx
fastNLP/docs> pip install sphinx-rtd-theme
```
Then run `make dev` in this directory. The command only supports Linux and macOS; you should see output like the following:
```bash
fastNLP/docs> make dev
rm -rf build/html && make html && make server
Running Sphinx v1.5.6
making output directory...
......
Build finished. The HTML pages are in build/html.
cd build/html && python -m http.server
Serving HTTP on 0.0.0.0 port 8000 (http://0.0.0.0:8000/) ...
```
You can now open http://localhost:8000/ in your browser to view the documentation. If you are working on a remote server, the address is http://{server ip}:8000/ instead,
and you must make sure port 8000 on the server is open. If port 8000 on your machine or remote server is already in use, the program falls back to ports 8001, 8002, and so on.
When you are done, press Control (Ctrl) + C to stop the process.

We list the reStructuredText syntax most frequently used in the fastNLP documentation [here](./source/user/example.rst) (view it in Raw mode on the web);
reading it is a quick way to get up to speed. Most of the fastNLP documentation is written inside the code and extracted with the Sphinx tool.

## Documentation Maintainers

Documentation maintainers need to understand every command in the Makefile, and to know that the current documentation structure
is the result of manual edits on top of what sphinx-apidoc extracts automatically.
Maintainers should further improve the automation of the whole framework and make sure collaborating developers do not break the overall structure of the documentation project.

docs/check_tools.py (+0, -191)

@@ -1,191 +0,0 @@
import inspect
import os
import sys


def _colored_string(string: str, color: str or int) -> str:
    """Display a colored string in the terminal.

    :param string: the text to show in the terminal
    :param color: the color of the text
    :return:
    """
    if isinstance(color, str):
        color = {
            "black": 30, "Black": 30, "BLACK": 30,
            "red": 31, "Red": 31, "RED": 31,
            "green": 32, "Green": 32, "GREEN": 32,
            "yellow": 33, "Yellow": 33, "YELLOW": 33,
            "blue": 34, "Blue": 34, "BLUE": 34,
            "purple": 35, "Purple": 35, "PURPLE": 35,
            "cyan": 36, "Cyan": 36, "CYAN": 36,
            "white": 37, "White": 37, "WHITE": 37
        }[color]
    return "\033[%dm%s\033[0m" % (color, string)


def gr(string, flag):
    if flag:
        return _colored_string(string, "green")
    else:
        return _colored_string(string, "red")


def find_all_modules():
    modules = {}
    children = {}
    to_doc = set()
    root = '../fastNLP'
    for path, dirs, files in os.walk(root):
        for file in files:
            if file.endswith('.py'):
                name = ".".join(path.split('/')[1:])
                if file.split('.')[0] != "__init__":
                    name = name + '.' + file.split('.')[0]
                __import__(name)
                m = sys.modules[name]
                modules[name] = m
                try:
                    m.__all__
                except:
                    print(name, "__all__ missing")
                    continue
                if m.__doc__ is None:
                    print(name, "__doc__ missing")
                    continue
                if "undocumented" not in m.__doc__:
                    to_doc.add(name)
    for module in to_doc:
        t = ".".join(module.split('.')[:-1])
        if t in to_doc:
            if t not in children:
                children[t] = set()
            children[t].add(module)
    for m in children:
        children[m] = sorted(children[m])
    return modules, to_doc, children


def create_rst_file(modules, name, children):
    m = modules[name]
    with open("./source/" + name + ".rst", "w") as fout:
        t = "=" * len(name)
        fout.write(name + "\n")
        fout.write(t + "\n")
        fout.write("\n")
        fout.write(".. automodule:: " + name + "\n")
        if name != "fastNLP.core" and len(m.__all__) > 0:
            fout.write(" :members: " + ", ".join(m.__all__) + "\n")
            short = name[len("fastNLP."):]
            if not (short.startswith('models') or short.startswith('modules') or short.startswith('embeddings')):
                fout.write(" :inherited-members:\n")
        fout.write("\n")
        if name in children:
            fout.write("子模块\n------\n\n.. toctree::\n :maxdepth: 1\n\n")
            for module in children[name]:
                fout.write(" " + module + "\n")


def check_file(m, name):
    names = name.split('.')
    test_name = "test." + ".".join(names[1:-1]) + ".test_" + names[-1]
    try:
        __import__(test_name)
        tm = sys.modules[test_name]
    except ModuleNotFoundError:
        tm = None
    tested = tm is not None
    funcs = {}
    classes = {}
    for item, obj in inspect.getmembers(m):
        if inspect.isclass(obj) and obj.__module__ == name and not obj.__name__.startswith('_'):
            this = (obj.__doc__ is not None, tested and obj.__name__ in dir(tm), {})
            for i in dir(obj):
                func = getattr(obj, i)
                if inspect.isfunction(func) and not i.startswith('_'):
                    this[2][i] = (func.__doc__ is not None, False)
            classes[obj.__name__] = this
        if inspect.isfunction(obj) and obj.__module__ == name and not obj.__name__.startswith('_'):
            this = (obj.__doc__ is not None, tested and obj.__name__ in dir(tm))  # docs
            funcs[obj.__name__] = this
    return funcs, classes


def check_files(modules, out=None):
    for name in sorted(modules.keys()):
        print(name, file=out)
        funcs, classes = check_file(modules[name], name)
        if out is None:
            for f in funcs:
                print("%-30s \t %s \t %s" % (f, gr("文档", funcs[f][0]), gr("测试", funcs[f][1])))
            for c in classes:
                print("%-30s \t %s \t %s" % (c, gr("文档", classes[c][0]), gr("测试", classes[c][1])))
                methods = classes[c][2]
                for f in methods:
                    print(" %-28s \t %s" % (f, gr("文档", methods[f][0])))
        else:
            for f in funcs:
                if not funcs[f][0]:
                    print("缺少文档 %s" % (f), file=out)
                if not funcs[f][1]:
                    print("缺少测试 %s" % (f), file=out)
            for c in classes:
                if not classes[c][0]:
                    print("缺少文档 %s" % (c), file=out)
                if not classes[c][1]:
                    print("缺少测试 %s" % (c), file=out)
                methods = classes[c][2]
                for f in methods:
                    if not methods[f][0]:
                        print("缺少文档 %s" % (c + "." + f), file=out)
        print(file=out)


def main_check():
    sys.path.append("..")
    print(_colored_string('Getting modules...', "Blue"))
    modules, to_doc, children = find_all_modules()
    print(_colored_string('Done!', "Green"))
    print(_colored_string('Creating rst files...', "Blue"))
    for name in to_doc:
        create_rst_file(modules, name, children)
    print(_colored_string('Done!', "Green"))
    print(_colored_string('Checking all files...', "Blue"))
    check_files(modules, out=open("results.txt", "w"))
    print(_colored_string('Done!', "Green"))


def check_file_r(file_path):
    with open(file_path) as fin:
        content = fin.read()
    index = -3
    cuts = []
    while index != -1:
        index = content.find('"""', index + 3)
        cuts.append(index)
    cuts = cuts[:-1]
    assert len(cuts) % 2 == 0
    write_content = ""
    last = 0
    for i in range(len(cuts) // 2):
        start, end = cuts[i + i], cuts[i + i + 1]
        if content[start - 1] == "r":
            write_content += content[last:end + 3]
        else:
            write_content += content[last:start] + "r"
            write_content += content[start:end + 3]
        last = end + 3
    write_content += content[last:]
    with open(file_path, "w") as fout:
        fout.write(write_content)


def add_r(base_path='../fastNLP'):
    for path, _, files in os.walk(base_path):
        for f in files:
            if f.endswith(".py"):
                check_file_r(os.path.abspath(os.path.join(path, f)))
                # sys.exit(0)


if __name__ == "__main__":
    add_r()

docs/requirements.txt (+0, -4)

@@ -1,4 +0,0 @@
sphinx==3.2.1
docutils==0.16
sphinx-rtd-theme==0.5.0
readthedocs-sphinx-search==0.1.0rc3

docs/source/_static/notebooks/extend_1_bert_embedding.ipynb (+0, -260)

@@ -1,260 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# BertEmbedding的各种用法\n",
"Bert自从在 BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding 中被提出后,因其性能卓越受到了极大的关注,在这里我们展示一下在fastNLP中如何使用Bert进行各类任务。其中中文Bert我们使用的模型的权重来自于 中文Bert预训练 。\n",
"\n",
"为了方便大家的使用,fastNLP提供了预训练的Embedding权重及数据集的自动下载,支持自动下载的Embedding和数据集见 数据集 。或您可从 使用Embedding模块将文本转成向量 与 使用Loader和Pipe加载并处理数据集 了解更多相关信息\n",
"\n",
"\n",
"下面我们将介绍通过使用Bert来进行文本分类, 中文命名实体识别, 文本匹配, 中文问答。\n",
"\n",
"## 1. 使用Bert进行文本分类\n",
"\n",
"文本分类是指给定一段文字,判定其所属的类别。例如下面的文本情感分类\n",
"\n",
" *1, 商务大床房,房间很大,床有2M宽,整体感觉经济实惠不错!*\n",
"\n",
"这里我们使用fastNLP提供自动下载的微博分类进行测试"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP.io import WeiboSenti100kPipe\n",
"from fastNLP.embeddings import BertEmbedding\n",
"from fastNLP.models import BertForSequenceClassification\n",
"from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam\n",
"import torch\n",
"\n",
"data_bundle =WeiboSenti100kPipe().process_from_file()\n",
"data_bundle.rename_field('chars', 'words')\n",
"\n",
"# 载入BertEmbedding\n",
"embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='cn-wwm', include_cls_sep=True)\n",
"\n",
"# 载入模型\n",
"model = BertForSequenceClassification(embed, len(data_bundle.get_vocab('target')))\n",
"\n",
"# 训练模型\n",
"device = 0 if torch.cuda.is_available() else 'cpu' \n",
"trainer = Trainer(data_bundle.get_dataset('train'), model,\n",
" optimizer=Adam(model_params=model.parameters(), lr=2e-5),\n",
" loss=CrossEntropyLoss(), device=device,\n",
" batch_size=8, dev_data=data_bundle.get_dataset('dev'),\n",
" metrics=AccuracyMetric(), n_epochs=2, print_every=1)\n",
"trainer.train()\n",
"\n",
"# 测试结果\n",
"from fastNLP import Tester\n",
"\n",
"tester = Tester(data_bundle.get_dataset('test'), model, batch_size=128, metrics=AccuracyMetric())\n",
"tester.test()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. 使用Bert进行命名实体识别\n",
"\n",
"命名实体识别是给定一句话,标记出其中的实体。一般序列标注的任务都使用conll格式,conll格式是至一行中通过制表符分隔不同的内容,使用空行分隔 两句话,例如下面的例子\n",
"\n",
"```\n",
" 中 B-ORG\n",
" 共 I-ORG\n",
" 中 I-ORG\n",
" 央 I-ORG\n",
" 致 O\n",
" 中 B-ORG\n",
" 国 I-ORG\n",
" 致 I-ORG\n",
" 公 I-ORG\n",
" 党 I-ORG\n",
" 十 I-ORG\n",
" 一 I-ORG\n",
" 大 I-ORG\n",
" 的 O\n",
" 贺 O\n",
" 词 O\n",
"```\n",
"\n",
"这部分内容请参考 快速实现序列标注模型\n",
"\n",
"## 3. 使用Bert进行文本匹配\n",
"\n",
"文本匹配任务是指给定两句话判断他们的关系。比如,给定两句话判断前一句是否和后一句具有因果关系或是否是矛盾关系;或者给定两句话判断两句话是否 具有相同的意思。这里我们使用"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP.io import CNXNLIBertPipe\n",
"from fastNLP.embeddings import BertEmbedding\n",
"from fastNLP.models import BertForSentenceMatching\n",
"from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam\n",
"from fastNLP.core.optimizer import AdamW\n",
"from fastNLP.core.callback import WarmupCallback\n",
"from fastNLP import Tester\n",
"import torch\n",
"\n",
"data_bundle = CNXNLIBertPipe().process_from_file()\n",
"data_bundle.rename_field('chars', 'words')\n",
"print(data_bundle)\n",
"\n",
"# 载入BertEmbedding\n",
"embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='cn-wwm', include_cls_sep=True)\n",
"\n",
"# 载入模型\n",
"model = BertForSentenceMatching(embed, len(data_bundle.get_vocab('target')))\n",
"\n",
"# 训练模型\n",
"callbacks = [WarmupCallback(warmup=0.1, schedule='linear'), ]\n",
"device = 0 if torch.cuda.is_available() else 'cpu' \n",
"trainer = Trainer(data_bundle.get_dataset('train'), model,\n",
" optimizer=AdamW(params=model.parameters(), lr=4e-5),\n",
" loss=CrossEntropyLoss(), device=device,\n",
" batch_size=8, dev_data=data_bundle.get_dataset('dev'),\n",
" metrics=AccuracyMetric(), n_epochs=5, print_every=1,\n",
" update_every=8, callbacks=callbacks)\n",
"trainer.train()\n",
"\n",
"tester = Tester(data_bundle.get_dataset('test'), model, batch_size=8, metrics=AccuracyMetric())\n",
"tester.test()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. 使用Bert进行中文问答\n",
"\n",
"问答任务是给定一段内容,以及一个问题,需要从这段内容中找到答案。 例如:\n",
"\n",
"```\n",
"\"context\": \"锣鼓经是大陆传统器乐及戏曲里面常用的打击乐记谱方法,以中文字的声音模拟敲击乐的声音,纪录打击乐的各种不同的演奏方法。常\n",
"用的节奏型称为「锣鼓点」。而锣鼓是戏曲节奏的支柱,除了加强演员身段动作的节奏感,也作为音乐的引子和尾声,提示音乐的板式和速度,以及\n",
"作为唱腔和念白的伴奏,令诗句的韵律更加抑扬顿锉,段落分明。锣鼓的运用有约定俗成的程式,依照角色行当的身份、性格、情绪以及环境,配合\n",
"相应的锣鼓点。锣鼓亦可以模仿大自然的音响效果,如雷电、波浪等等。戏曲锣鼓所运用的敲击乐器主要分为鼓、锣、钹和板四类型:鼓类包括有单\n",
"皮鼓(板鼓)、大鼓、大堂鼓(唐鼓)、小堂鼓、怀鼓、花盆鼓等;锣类有大锣、小锣(手锣)、钲锣、筛锣、马锣、镗锣、云锣;钹类有铙钹、大\n",
"钹、小钹、水钹、齐钹、镲钹、铰子、碰钟等;打拍子用的檀板、木鱼、梆子等。因为京剧的锣鼓通常由四位乐师负责,又称为四大件,领奏的师\n",
"傅称为:「鼓佬」,其职责有如西方乐队的指挥,负责控制速度以及利用各种手势提示乐师演奏不同的锣鼓点。粤剧吸收了部份京剧的锣鼓,但以木鱼\n",
"和沙的代替了京剧的板和鼓,作为打拍子的主要乐器。以下是京剧、昆剧和粤剧锣鼓中乐器对应的口诀用字:\",\n",
"\"question\": \"锣鼓经是什么?\",\n",
"\"answers\": [\n",
" {\n",
" \"text\": \"大陆传统器乐及戏曲里面常用的打击乐记谱方法\",\n",
" \"answer_start\": 4\n",
" },\n",
" {\n",
" \"text\": \"大陆传统器乐及戏曲里面常用的打击乐记谱方法\",\n",
" \"answer_start\": 4\n",
" },\n",
" {\n",
" \"text\": \"大陆传统器乐及戏曲里面常用的打击乐记谱方法\",\n",
" \"answer_start\": 4\n",
" }\n",
"]\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"您可以通过以下的代码训练 (原文代码:[CMRC2018](https://github.com/ymcui/cmrc2018) )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP.embeddings import BertEmbedding\n",
"from fastNLP.models import BertForQuestionAnswering\n",
"from fastNLP.core.losses import CMRC2018Loss\n",
"from fastNLP.core.metrics import CMRC2018Metric\n",
"from fastNLP.io.pipe.qa import CMRC2018BertPipe\n",
"from fastNLP import Trainer, BucketSampler\n",
"from fastNLP import WarmupCallback, GradientClipCallback\n",
"from fastNLP.core.optimizer import AdamW\n",
"import torch\n",
"\n",
"data_bundle = CMRC2018BertPipe().process_from_file()\n",
"data_bundle.rename_field('chars', 'words')\n",
"\n",
"print(data_bundle)\n",
"\n",
"embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='cn', requires_grad=True, include_cls_sep=False, auto_truncate=True,\n",
" dropout=0.5, word_dropout=0.01)\n",
"model = BertForQuestionAnswering(embed)\n",
"loss = CMRC2018Loss()\n",
"metric = CMRC2018Metric()\n",
"\n",
"wm_callback = WarmupCallback(schedule='linear')\n",
"gc_callback = GradientClipCallback(clip_value=1, clip_type='norm')\n",
"callbacks = [wm_callback, gc_callback]\n",
"\n",
"optimizer = AdamW(model.parameters(), lr=5e-5)\n",
"\n",
"device = 0 if torch.cuda.is_available() else 'cpu' \n",
"trainer = Trainer(data_bundle.get_dataset('train'), model, loss=loss, optimizer=optimizer,\n",
" sampler=BucketSampler(seq_len_field_name='context_len'),\n",
" dev_data=data_bundle.get_dataset('dev'), metrics=metric,\n",
" callbacks=callbacks, device=device, batch_size=6, num_workers=2, n_epochs=2, print_every=1,\n",
" test_use_tqdm=False, update_every=10)\n",
"trainer.train(load_best_model=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"训练结果(和论文中报道的基本一致):\n",
"\n",
"```\n",
" In Epoch:2/Step:1692, got best dev performance:\n",
" CMRC2018Metric: f1=85.61, em=66.08\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python Now",
"language": "python",
"name": "now"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

docs/source/_static/notebooks/tutorial_1_data_preprocess.ipynb (+0, -292)

@@ -1,292 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# fastNLP中的DataSet"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------------------------+---------------------------------------------+---------+\n",
"| raw_words | words | seq_len |\n",
"+------------------------------+---------------------------------------------+---------+\n",
"| This is the first instance . | ['this', 'is', 'the', 'first', 'instance... | 6 |\n",
"| Second instance . | ['Second', 'instance', '.'] | 3 |\n",
"| Third instance . | ['Third', 'instance', '.'] | 3 |\n",
"+------------------------------+---------------------------------------------+---------+\n"
]
}
],
"source": [
"from fastNLP import DataSet\n",
"data = {'raw_words':[\"This is the first instance .\", \"Second instance .\", \"Third instance .\"],\n",
" 'words': [['this', 'is', 'the', 'first', 'instance', '.'], ['Second', 'instance', '.'], ['Third', 'instance', '.']],\n",
" 'seq_len': [6, 3, 3]}\n",
"dataset = DataSet(data)\n",
"# 传入的dict的每个key的value应该为具有相同长度的list\n",
"print(dataset)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## DataSet的构建"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+----------------------------+---------------------------------------------+---------+\n",
"| raw_words | words | seq_len |\n",
"+----------------------------+---------------------------------------------+---------+\n",
"| This is the first instance | ['this', 'is', 'the', 'first', 'instance... | 6 |\n",
"+----------------------------+---------------------------------------------+---------+"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import DataSet\n",
"from fastNLP import Instance\n",
"dataset = DataSet()\n",
"instance = Instance(raw_words=\"This is the first instance\",\n",
" words=['this', 'is', 'the', 'first', 'instance', '.'],\n",
" seq_len=6)\n",
"dataset.append(instance)\n",
"dataset"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+----------------------------+---------------------------------------------+---------+\n",
"| raw_words | words | seq_len |\n",
"+----------------------------+---------------------------------------------+---------+\n",
"| This is the first instance | ['this', 'is', 'the', 'first', 'instance... | 6 |\n",
"| Second instance . | ['Second', 'instance', '.'] | 3 |\n",
"+----------------------------+---------------------------------------------+---------+"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import DataSet\n",
"from fastNLP import Instance\n",
"dataset = DataSet([\n",
" Instance(raw_words=\"This is the first instance\",\n",
" words=['this', 'is', 'the', 'first', 'instance', '.'],\n",
" seq_len=6),\n",
" Instance(raw_words=\"Second instance .\",\n",
" words=['Second', 'instance', '.'],\n",
" seq_len=3)\n",
" ])\n",
"dataset"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## DataSet的删除"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+----+---+\n",
"| a | c |\n",
"+----+---+\n",
"| -5 | 0 |\n",
"| -4 | 0 |\n",
"| -3 | 0 |\n",
"| -2 | 0 |\n",
"| -1 | 0 |\n",
"| 0 | 0 |\n",
"| 1 | 0 |\n",
"| 2 | 0 |\n",
"| 3 | 0 |\n",
"| 4 | 0 |\n",
"+----+---+"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import DataSet\n",
"dataset = DataSet({'a': range(-5, 5), 'c': [0]*10})\n",
"dataset"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+---+\n",
"| c |\n",
"+---+\n",
"| 0 |\n",
"| 0 |\n",
"| 0 |\n",
"| 0 |\n",
"+---+"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 不改变dataset,生成一个删除了满足条件的instance的新 DataSet\n",
"dropped_dataset = dataset.drop(lambda ins:ins['a']<0, inplace=False)\n",
"# 在dataset中删除满足条件的instance\n",
"dataset.drop(lambda ins:ins['a']<0)\n",
"# 删除第3个instance\n",
"dataset.delete_instance(2)\n",
"# 删除名为'a'的field\n",
"dataset.delete_field('a')\n",
"dataset"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 简单的数据预处理"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"False\n"
]
},
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 检查是否存在名为'a'的field\n",
"print(dataset.has_field('a')) # 或 ('a' in dataset)\n",
"# 将名为'a'的field改名为'b'\n",
"dataset.rename_field('c', 'b')\n",
"# DataSet的长度\n",
"len(dataset)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+------------------------------+-------------------------------------------------+\n",
"| raw_words | words |\n",
"+------------------------------+-------------------------------------------------+\n",
"| This is the first instance . | ['This', 'is', 'the', 'first', 'instance', '.'] |\n",
"| Second instance . | ['Second', 'instance', '.'] |\n",
"| Third instance . | ['Third', 'instance', '.'] |\n",
"+------------------------------+-------------------------------------------------+"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import DataSet\n",
"data = {'raw_words':[\"This is the first instance .\", \"Second instance .\", \"Third instance .\"]}\n",
"dataset = DataSet(data)\n",
"\n",
"# 将句子分成单词形式, 详见DataSet.apply()方法\n",
"dataset.apply(lambda ins: ins['raw_words'].split(), new_field_name='words')\n",
"\n",
"# 或使用DataSet.apply_field()\n",
"dataset.apply_field(lambda sent:sent.split(), field_name='raw_words', new_field_name='words')\n",
"\n",
"# 除了匿名函数,也可以定义函数传递进去\n",
"def get_words(instance):\n",
" sentence = instance['raw_words']\n",
" words = sentence.split()\n",
" return words\n",
"dataset.apply(get_words, new_field_name='words')\n",
"dataset"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python Now",
"language": "python",
"name": "now"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

docs/source/_static/notebooks/tutorial_2_vocabulary.ipynb (+0, -343)

@@ -1,343 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# fastNLP中的 Vocabulary\n",
"## 构建 Vocabulary"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(['复', '旦', '大', '学']) # 加入新的字\n",
"vocab.add_word('上海') # `上海`会作为一个整体\n",
"vocab.to_index('复') # 应该会为3\n",
"vocab.to_index('我') # 会输出1,Vocabulary中默认pad的index为0, unk(没有找到的词)的index为1\n",
"\n",
"# 在构建target的Vocabulary时,词表中应该用不上pad和unk,可以通过以下的初始化\n",
"vocab = Vocabulary(unknown=None, padding=None)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Vocabulary(['positive', 'negative']...)"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab.add_word_lst(['positive', 'negative'])"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab.to_index('positive')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 没有设置 unk 的情况"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [
{
"ename": "ValueError",
"evalue": "word `neutral` not in vocabulary",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-4-c6d424040b45>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mvocab\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'neutral'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# 会报错,因为没有unk这种情况\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/anaconda3/envs/now/lib/python3.8/site-packages/FastNLP-0.5.0-py3.8.egg/fastNLP/core/vocabulary.py\u001b[0m in \u001b[0;36mto_index\u001b[0;34m(self, w)\u001b[0m\n\u001b[1;32m 414\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;32mreturn\u001b[0m \u001b[0mint\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mnumber\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 415\u001b[0m \"\"\"\n\u001b[0;32m--> 416\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 417\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 418\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/envs/now/lib/python3.8/site-packages/FastNLP-0.5.0-py3.8.egg/fastNLP/core/vocabulary.py\u001b[0m in \u001b[0;36m_wrapper\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_word2idx\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrebuild\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 43\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuild_vocab\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 44\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 45\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 46\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0m_wrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/envs/now/lib/python3.8/site-packages/FastNLP-0.5.0-py3.8.egg/fastNLP/core/vocabulary.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, w)\u001b[0m\n\u001b[1;32m 272\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_word2idx\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munknown\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 273\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 274\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"word `{}` not in vocabulary\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 275\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 276\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0m_check_build_vocab\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: word `neutral` not in vocabulary"
]
}
],
"source": [
"vocab.to_index('neutral') # 会报错,因为没有unk这种情况"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 设置 unk 的情况"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(0, '<unk>')"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary(unknown='<unk>', padding=None)\n",
"vocab.add_word_lst(['positive', 'negative'])\n",
"vocab.to_index('neutral'), vocab.to_word(vocab.to_index('neutral'))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Vocabulary(['positive', 'negative']...)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---------------------------------------------------+--------+\n",
"| chars | target |\n",
"+---------------------------------------------------+--------+\n",
"| [4, 2, 2, 5, 6, 7, 3] | 0 |\n",
"| [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 3] | 1 |\n",
"+---------------------------------------------------+--------+\n"
]
}
],
"source": [
"from fastNLP import Vocabulary\n",
"from fastNLP import DataSet\n",
"\n",
"dataset = DataSet({'chars': [\n",
" ['今', '天', '天', '气', '很', '好', '。'],\n",
" ['被', '这', '部', '电', '影', '浪', '费', '了', '两', '个', '小', '时', '。']\n",
" ],\n",
" 'target': ['neutral', 'negative']\n",
"})\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.from_dataset(dataset, field_name='chars')\n",
"vocab.index_dataset(dataset, field_name='chars')\n",
"\n",
"target_vocab = Vocabulary(padding=None, unknown=None)\n",
"target_vocab.from_dataset(dataset, field_name='target')\n",
"target_vocab.index_dataset(dataset, field_name='target')\n",
"print(dataset)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Vocabulary(['今', '天', '心', '情', '很']...)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Vocabulary\n",
"from fastNLP import DataSet\n",
"\n",
"tr_data = DataSet({'chars': [\n",
" ['今', '天', '心', '情', '很', '好', '。'],\n",
" ['被', '这', '部', '电', '影', '浪', '费', '了', '两', '个', '小', '时', '。']\n",
" ],\n",
" 'target': ['positive', 'negative']\n",
"})\n",
"dev_data = DataSet({'chars': [\n",
" ['住', '宿', '条', '件', '还', '不', '错'],\n",
" ['糟', '糕', '的', '天', '气', ',', '无', '法', '出', '行', '。']\n",
" ],\n",
" 'target': ['positive', 'negative']\n",
"})\n",
"\n",
"vocab = Vocabulary()\n",
"# 将验证集或者测试集在建立词表是放入no_create_entry_dataset这个参数中。\n",
"vocab.from_dataset(tr_data, field_name='chars', no_create_entry_dataset=[dev_data])\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 4%|▎ | 2.31M/63.5M [00:00<00:02, 22.9MB/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"http://212.129.155.247/embedding/glove.6B.50d.zip not found in cache, downloading to /tmp/tmpvziobj_e\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 63.5M/63.5M [00:01<00:00, 41.3MB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Finish download from http://212.129.155.247/embedding/glove.6B.50d.zip\n",
"Copy file to /remote-home/ynzheng/.fastNLP/embedding/glove.6B.50d\n",
"Found 2 out of 6 words in the pre-training embedding.\n",
"tensor([[ 0.9497, 0.3433, 0.8450, -0.8852, -0.7208, -0.2931, -0.7468, 0.6512,\n",
" 0.4730, -0.7401, 0.1877, -0.3828, -0.5590, 0.4295, -0.2698, -0.4238,\n",
" -0.3124, 1.3423, -0.7857, -0.6302, 0.9182, 0.2113, -0.5744, 1.4549,\n",
" 0.7546, -1.6165, -0.0085, 0.0029, 0.5130, -0.4745, 2.5306, 0.8594,\n",
" -0.3067, 0.0578, 0.6623, 0.2080, 0.6424, -0.5246, -0.0534, 1.1404,\n",
" -0.1370, -0.1836, 0.4546, -0.5096, -0.0255, -0.0286, 0.1805, -0.4483,\n",
" 0.4053, -0.3682]], grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.1320, -0.2392, 0.1732, -0.2390, -0.0463, 0.0494, 0.0488, -0.0886,\n",
" 0.0224, -0.1300, 0.0369, 0.1800, 0.0750, -0.0183, 0.2264, 0.1628,\n",
" 0.1261, -0.1259, 0.1663, -0.1230, -0.1904, -0.0532, 0.1397, -0.0259,\n",
" -0.1799, 0.0226, 0.1858, 0.1981, 0.1338, 0.2394, 0.0248, 0.0203,\n",
" -0.1722, -0.1683, -0.1892, 0.0874, 0.0562, -0.0394, 0.0306, -0.1761,\n",
" 0.1015, -0.0171, 0.1172, 0.1357, 0.1519, -0.0011, 0.1572, 0.1265,\n",
" -0.2391, -0.0258]], grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.1318, -0.2552, -0.0679, 0.2619, -0.2616, 0.2357, 0.1308, -0.0118,\n",
" 1.7659, 0.2078, 0.2620, -0.1643, -0.8464, 0.0201, 0.0702, 0.3978,\n",
" 0.1528, -0.2021, -1.6184, -0.5433, -0.1786, 0.5389, 0.4987, -0.1017,\n",
" 0.6626, -1.7051, 0.0572, -0.3241, -0.6683, 0.2665, 2.8420, 0.2684,\n",
" -0.5954, -0.5004, 1.5199, 0.0396, 1.6659, 0.9976, -0.5597, -0.7049,\n",
" -0.0309, -0.2830, -0.1356, 0.6429, 0.4149, 1.2362, 0.7659, 0.9780,\n",
" 0.5851, -0.3018]], grad_fn=<EmbeddingBackward>)\n",
"tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0.]], grad_fn=<EmbeddingBackward>)\n",
"tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0.]], grad_fn=<EmbeddingBackward>)\n"
]
}
],
"source": [
"import torch\n",
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word('train')\n",
"vocab.add_word('only_in_train') # 仅在train出现,但肯定在预训练词表中不存在\n",
"vocab.add_word('test', no_create_entry=True) # 该词只在dev或test中出现\n",
"vocab.add_word('only_in_test', no_create_entry=True) # 这个词在预训练的词表中找不到\n",
"\n",
"embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')\n",
"print(embed(torch.LongTensor([vocab.to_index('train')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('only_in_train')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('test')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('only_in_test')])))\n",
"print(embed(torch.LongTensor([vocab.unknown_idx])))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python Now",
"language": "python",
"name": "now"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

docs/source/_static/notebooks/tutorial_3_embedding.ipynb (+0, -524)

@@ -1,524 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 5 out of 7 words in the pre-training embedding.\n",
"torch.Size([1, 5, 50])\n"
]
}
],
"source": [
"import torch\n",
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')\n",
"\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]]) # 将文本转为index\n",
"print(embed(words).size()) # StaticEmbedding的使用和pytorch的nn.Embedding是类似的"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([1, 5, 30])\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=30)\n",
"\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"22 out of 22 characters were found in pretrained elmo embedding.\n",
"torch.Size([1, 5, 256])\n"
]
}
],
"source": [
"from fastNLP.embeddings import ElmoEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=False)\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"22 out of 22 characters were found in pretrained elmo embedding.\n",
"torch.Size([1, 5, 512])\n"
]
}
],
"source": [
"embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=False, layers='1,2')\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"22 out of 22 characters were found in pretrained elmo embedding.\n",
"torch.Size([1, 5, 256])\n"
]
}
],
"source": [
"embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=True, layers='mix')\n",
"print(embed(words).size()) # 三层输出按照权重element-wise的加起来"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"torch.Size([1, 5, 768])\n"
]
}
],
"source": [
"from fastNLP.embeddings import BertEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased')\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"torch.Size([1, 5, 1536])\n"
]
}
],
"source": [
"# 使用后面两层的输出\n",
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='10,11')\n",
"print(embed(words).size()) # 结果将是在最后一维做拼接"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"torch.Size([1, 7, 768])\n"
]
}
],
"source": [
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', include_cls_sep=True)\n",
"print(embed(words).size()) # 结果将在序列维度上增加2\n",
"# 取出句子的cls表示\n",
"cls_reps = embed(words)[:, 0] # shape: [batch_size, 768]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"torch.Size([1, 5, 768])\n"
]
}
],
"source": [
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', pool_method='max')\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 10 words out of 10.\n",
"torch.Size([1, 9, 768])\n"
]
}
],
"source": [
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo . [SEP] another sentence .\".split())\n",
"\n",
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', pool_method='max')\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo . [SEP] another sentence .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Start constructing character vocabulary.\n",
"In total, there are 8 distinct characters.\n",
"torch.Size([1, 5, 64])\n"
]
}
],
"source": [
"from fastNLP.embeddings import CNNCharEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"# character的embedding维度大小为50,返回的embedding结果维度大小为64。\n",
"embed = CNNCharEmbedding(vocab, embed_size=64, char_emb_size=50)\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Start constructing character vocabulary.\n",
"In total, there are 8 distinct characters.\n",
"torch.Size([1, 5, 64])\n"
]
}
],
"source": [
"from fastNLP.embeddings import LSTMCharEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"# character的embedding维度大小为50,返回的embedding结果维度大小为64。\n",
"embed = LSTMCharEmbedding(vocab, embed_size=64, char_emb_size=50)\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 5 out of 7 words in the pre-training embedding.\n",
"50\n",
"Start constructing character vocabulary.\n",
"In total, there are 8 distinct characters.\n",
"30\n",
"22 out of 22 characters were found in pretrained elmo embedding.\n",
"256\n",
"22 out of 22 characters were found in pretrained elmo embedding.\n",
"512\n",
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"768\n",
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"1536\n",
"80\n"
]
}
],
"source": [
"from fastNLP.embeddings import *\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"static_embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')\n",
"print(static_embed.embedding_dim) # 50\n",
"char_embed = CNNCharEmbedding(vocab, embed_size=30)\n",
"print(char_embed.embedding_dim) # 30\n",
"elmo_embed_1 = ElmoEmbedding(vocab, model_dir_or_name='en-small', layers='2')\n",
"print(elmo_embed_1.embedding_dim) # 256\n",
"elmo_embed_2 = ElmoEmbedding(vocab, model_dir_or_name='en-small', layers='1,2')\n",
"print(elmo_embed_2.embedding_dim) # 512\n",
"bert_embed_1 = BertEmbedding(vocab, layers='-1', model_dir_or_name='en-base-cased')\n",
"print(bert_embed_1.embedding_dim) # 768\n",
"bert_embed_2 = BertEmbedding(vocab, layers='2,-1', model_dir_or_name='en-base-cased')\n",
"print(bert_embed_2.embedding_dim) # 1536\n",
"stack_embed = StackEmbedding([static_embed, char_embed])\n",
"print(stack_embed.embedding_dim) # 80"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n"
]
}
],
"source": [
"from fastNLP.embeddings import *\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', requires_grad=True) # 初始化时设定为需要更新\n",
"embed.requires_grad = False # 修改BertEmbedding的权重为不更新"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([[ 0.3633, -0.2091, -0.0353, -0.3771, -0.5193]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.0926, -0.4812, -0.7744, 0.4836, -0.5475]],\n",
" grad_fn=<EmbeddingBackward>)\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary().add_word_lst(\"The the a A\".split())\n",
"# 下面用随机的StaticEmbedding演示,但与使用预训练词向量时效果是一致的\n",
"embed = StaticEmbedding(vocab, model_name_or_dir=None, embedding_dim=5)\n",
"print(embed(torch.LongTensor([vocab.to_index('The')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('the')])))"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"All word in the vocab have been lowered. There are 6 words, 4 unique lowered words.\n",
"tensor([[ 0.4530, -0.1558, -0.1941, 0.3203, 0.0355]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.4530, -0.1558, -0.1941, 0.3203, 0.0355]],\n",
" grad_fn=<EmbeddingBackward>)\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary().add_word_lst(\"The the a A\".split())\n",
"# 下面用随机的StaticEmbedding演示,但与使用预训练时效果是一致的\n",
"embed = StaticEmbedding(vocab, model_name_or_dir=None, embedding_dim=5, lower=True)\n",
"print(embed(torch.LongTensor([vocab.to_index('The')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('the')])))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 out of 4 words have frequency less than 2.\n",
"tensor([[ 0.4724, -0.7277, -0.6350, -0.5258, -0.6063]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.7638, -0.0552, 0.1625, -0.2210, 0.4993]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.7638, -0.0552, 0.1625, -0.2210, 0.4993]],\n",
" grad_fn=<EmbeddingBackward>)\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary().add_word_lst(\"the the the a\".split())\n",
"# 下面用随机的StaticEmbedding演示,但与使用预训练时效果是一致的\n",
"embed = StaticEmbedding(vocab, model_name_or_dir=None, embedding_dim=5, min_freq=2)\n",
"print(embed(torch.LongTensor([vocab.to_index('the')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('a')])))\n",
"print(embed(torch.LongTensor([vocab.unknown_idx])))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 out of 5 words have frequency less than 2.\n",
"All word in the vocab have been lowered. There are 5 words, 4 unique lowered words.\n",
"tensor([[ 0.1943, 0.3739, 0.2769, -0.4746, -0.3181]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.5892, -0.6916, 0.7319, -0.3803, 0.4979]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.5892, -0.6916, 0.7319, -0.3803, 0.4979]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[-0.1348, -0.2172, -0.0071, 0.5704, -0.2607]],\n",
" grad_fn=<EmbeddingBackward>)\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary().add_word_lst(\"the the the a A\".split())\n",
"# 下面用随机的StaticEmbedding演示,但与使用预训练时效果是一致的\n",
"embed = StaticEmbedding(vocab, model_name_or_dir=None, embedding_dim=5, min_freq=2, lower=True)\n",
"print(embed(torch.LongTensor([vocab.to_index('the')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('a')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('A')])))\n",
"print(embed(torch.LongTensor([vocab.unknown_idx])))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python Now",
"language": "python",
"name": "now"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
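The StackEmbedding at the end of the notebook above reports embedding_dim 80 because a StackEmbedding simply concatenates its parts (50 + 30 here). Below is a minimal sketch, not part of the deleted notebook, that re-uses only the API already shown above to make this explicit; it assumes fastNLP and PyTorch are installed.

```python
import torch
from fastNLP import Vocabulary
from fastNLP.embeddings import StaticEmbedding, CNNCharEmbedding, StackEmbedding

vocab = Vocabulary()
vocab.add_word_lst("this is a demo .".split())

# a randomly initialized 50-d word embedding plus a 30-d character-CNN embedding
word_embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=50)
char_embed = CNNCharEmbedding(vocab, embed_size=30)
stack_embed = StackEmbedding([word_embed, char_embed])

words = torch.LongTensor([[vocab.to_index(w) for w in "this is a demo .".split()]])
print(stack_embed(words).size())  # expected: torch.Size([1, 5, 80]), i.e. 50 + 30
```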

+ 0
- 309
docs/source/_static/notebooks/tutorial_4_load_dataset.ipynb

@@ -1,309 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 使用Loader和Pipe加载并处理数据集\n",
"\n",
"这一部分是关于如何加载数据集的教程\n",
"\n",
"## Part I: 数据集容器DataBundle\n",
"\n",
"而由于对于同一个任务,训练集,验证集和测试集会共用同一个词表以及具有相同的目标值,所以在fastNLP中我们使用了 DataBundle 来承载同一个任务的多个数据集 DataSet 以及它们的词表 Vocabulary 。下面会有例子介绍 DataBundle 的相关使用。\n",
"\n",
"DataBundle 在fastNLP中主要在各个 Loader 和 Pipe 中被使用。 下面我们先介绍一下 Loader 和 Pipe 。\n",
"\n",
"## Part II: 加载的各种数据集的Loader\n",
"\n",
"在fastNLP中,所有的 Loader 都可以通过其文档判断其支持读取的数据格式,以及读取之后返回的 DataSet 的格式, 例如 ChnSentiCorpLoader \n",
"\n",
"- download() 函数:自动将该数据集下载到缓存地址,默认缓存地址为~/.fastNLP/datasets/。由于版权等原因,不是所有的Loader都实现了该方法。该方法会返回下载后文件所处的缓存地址。\n",
"\n",
"- _load() 函数:从一个数据文件中读取数据,返回一个 DataSet 。返回的DataSet的格式可从Loader文档判断。\n",
"\n",
"- load() 函数:从文件或者文件夹中读取数据为 DataSet 并将它们组装成 DataBundle。支持接受的参数类型有以下的几种\n",
"\n",
" - None, 将尝试读取自动缓存的数据,仅支持提供了自动下载数据的Loader\n",
" - 文件夹路径, 默认将尝试在该文件夹下匹配文件名中含有 train , test , dev 的文件,如果有多个文件含有相同的关键字,将无法通过该方式读取\n",
" - dict, 例如{'train':\"/path/to/tr.conll\", 'dev':\"/to/validate.conll\", \"test\":\"/to/te.conll\"}。"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"In total 3 datasets:\n",
"\ttest has 1944 instances.\n",
"\ttrain has 17196 instances.\n",
"\tdev has 1858 instances.\n",
"\n"
]
}
],
"source": [
"from fastNLP.io import CWSLoader\n",
"\n",
"loader = CWSLoader(dataset_name='pku')\n",
"data_bundle = loader.load()\n",
"print(data_bundle)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"这里表示一共有3个数据集。其中:\n",
"\n",
" 3个数据集的名称分别为train、dev、test,分别有17223、1831、1944个instance\n",
"\n",
"也可以取出DataSet,并打印DataSet中的具体内容"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+----------------------------------------------------------------+\n",
"| raw_words |\n",
"+----------------------------------------------------------------+\n",
"| 迈向 充满 希望 的 新 世纪 —— 一九九八年 新年 讲话 ... |\n",
"| 中共中央 总书记 、 国家 主席 江 泽民 |\n",
"+----------------------------------------------------------------+\n"
]
}
],
"source": [
"tr_data = data_bundle.get_dataset('train')\n",
"print(tr_data[:2])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Part III: 使用Pipe对数据集进行预处理\n",
"\n",
"通过 Loader 可以将文本数据读入,但并不能直接被神经网络使用,还需要进行一定的预处理。\n",
"\n",
"在fastNLP中,我们使用 Pipe 的子类作为数据预处理的类, Loader 和 Pipe 一般具备一一对应的关系,该关系可以从其名称判断, 例如 CWSLoader 与 CWSPipe 是一一对应的。一般情况下Pipe处理包含以下的几个过程,\n",
"1. 将raw_words或 raw_chars进行tokenize以切分成不同的词或字; \n",
"2. 再建立词或字的 Vocabulary , 并将词或字转换为index; \n",
"3. 将target 列建立词表并将target列转为index;\n",
"\n",
"所有的Pipe都可通过其文档查看该Pipe支持处理的 DataSet 以及返回的 DataBundle 中的Vocabulary的情况; 如 OntoNotesNERPipe\n",
"\n",
"各种数据集的Pipe当中,都包含了以下的两个函数:\n",
"\n",
"- process() 函数:对输入的 DataBundle 进行处理, 然后返回处理之后的 DataBundle 。process函数的文档中包含了该Pipe支持处理的DataSet的格式。\n",
"- process_from_file() 函数:输入数据集所在文件夹,使用对应的Loader读取数据(所以该函数支持的参数类型是由于其对应的Loader的load函数决定的),然后调用相对应的process函数对数据进行预处理。相当于是把Load和process放在一个函数中执行。\n",
"\n",
"接着上面 CWSLoader 的例子,我们展示一下 CWSPipe 的功能:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"In total 3 datasets:\n",
"\ttest has 1944 instances.\n",
"\ttrain has 17196 instances.\n",
"\tdev has 1858 instances.\n",
"In total 2 vocabs:\n",
"\tchars has 4777 entries.\n",
"\ttarget has 4 entries.\n",
"\n"
]
}
],
"source": [
"from fastNLP.io import CWSPipe\n",
"\n",
"data_bundle = CWSPipe().process(data_bundle)\n",
"print(data_bundle)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"表示一共有3个数据集和2个词表。其中:\n",
"\n",
"- 3个数据集的名称分别为train、dev、test,分别有17223、1831、1944个instance\n",
"- 2个词表分别为chars词表与target词表。其中chars词表为句子文本所构建的词表,一共有4777个不同的字;target词表为目标标签所构建的词表,一共有4种标签。\n",
"\n",
"相较于之前CWSLoader读取的DataBundle,新增了两个Vocabulary。 我们可以打印一下处理之后的DataSet"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---------------------+---------------------+---------------------+---------+\n",
"| raw_words | chars | target | seq_len |\n",
"+---------------------+---------------------+---------------------+---------+\n",
"| 迈向 充满 希望... | [1224, 178, 674,... | [0, 1, 0, 1, 0, ... | 29 |\n",
"| 中共中央 总书记... | [11, 212, 11, 33... | [0, 3, 3, 1, 0, ... | 15 |\n",
"+---------------------+---------------------+---------------------+---------+\n"
]
}
],
"source": [
"tr_data = data_bundle.get_dataset('train')\n",
"print(tr_data[:2])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"可以看到有两列为int的field: chars和target。这两列的名称同时也是DataBundle中的Vocabulary的名称。可以通过下列的代码获取并查看Vocabulary的 信息"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Vocabulary(['B', 'E', 'S', 'M']...)\n"
]
}
],
"source": [
"vocab = data_bundle.get_vocab('target')\n",
"print(vocab)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Part IV: fastNLP封装好的Loader和Pipe\n",
"\n",
"fastNLP封装了多种任务/数据集的 Loader 和 Pipe 并提供自动下载功能,具体参见文档 [数据集](https://docs.qq.com/sheet/DVnpkTnF6VW9UeXdh?c=A1A0A0)\n",
"\n",
"## Part V: 不同格式类型的基础Loader\n",
"\n",
"除了上面提到的针对具体任务的Loader,我们还提供了CSV格式和JSON格式的Loader\n",
"\n",
"**CSVLoader** 读取CSV类型的数据集文件。例子如下:\n",
"\n",
"```python\n",
"from fastNLP.io.loader import CSVLoader\n",
"data_set_loader = CSVLoader(\n",
" headers=('raw_words', 'target'), sep='\\t'\n",
")\n",
"```\n",
"\n",
"表示将CSV文件中每一行的第一项将填入'raw_words' field,第二项填入'target' field。其中项之间由'\\t'分割开来\n",
"\n",
"```python\n",
"data_set = data_set_loader._load('path/to/your/file')\n",
"```\n",
"\n",
"文件内容样例如下\n",
"\n",
"```csv\n",
"But it does not leave you with much . 1\n",
"You could hate it for the same reason . 1\n",
"The performances are an absolute joy . 4\n",
"```\n",
"\n",
"读取之后的DataSet具有以下的field\n",
"\n",
"| raw_words | target |\n",
"| --------------------------------------- | ------ |\n",
"| But it does not leave you with much . | 1 |\n",
"| You could hate it for the same reason . | 1 |\n",
"| The performances are an absolute joy . | 4 |\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**JsonLoader** 读取Json类型的数据集文件,数据必须按行存储,每行是一个包含各类属性的Json对象。例子如下\n",
"\n",
"```python\n",
"from fastNLP.io.loader import JsonLoader\n",
"loader = JsonLoader(\n",
" fields={'sentence1': 'raw_words1', 'sentence2': 'raw_words2', 'gold_label': 'target'}\n",
")\n",
"```\n",
"\n",
"表示将Json对象中'sentence1'、'sentence2'和'gold_label'对应的值赋给'raw_words1'、'raw_words2'、'target'这三个fields\n",
"\n",
"```python\n",
"data_set = loader._load('path/to/your/file')\n",
"```\n",
"\n",
"数据集内容样例如下\n",
"```\n",
"{\"annotator_labels\": [\"neutral\"], \"captionID\": \"3416050480.jpg#4\", \"gold_label\": \"neutral\", ... }\n",
"{\"annotator_labels\": [\"contradiction\"], \"captionID\": \"3416050480.jpg#4\", \"gold_label\": \"contradiction\", ... }\n",
"{\"annotator_labels\": [\"entailment\"], \"captionID\": \"3416050480.jpg#4\", \"gold_label\": \"entailment\", ... }\n",
"```\n",
"\n",
"读取之后的DataSet具有以下的field\n",
"\n",
"| raw_words0 | raw_words1 | target |\n",
"| ------------------------------------------------------ | ------------------------------------------------- | ------------- |\n",
"| A person on a horse jumps over a broken down airplane. | A person is training his horse for a competition. | neutral |\n",
"| A person on a horse jumps over a broken down airplane. | A person is at a diner, ordering an omelette. | contradiction |\n",
"| A person on a horse jumps over a broken down airplane. | A person is outdoors, on a horse. | entailment |"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python Now",
"language": "python",
"name": "now"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
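The notebook above only calls Loader.load() on the automatically downloaded data. The sketch below, not part of the deleted notebook, shows the dict-of-paths variant described in Part II; the file paths are placeholders, and it assumes CWSLoader can be constructed without a dataset_name when explicit paths are supplied.

```python
from fastNLP.io import CWSLoader

# placeholder paths; replace them with real CWS-formatted files
paths = {'train': '/path/to/train.txt',
         'dev': '/path/to/dev.txt',
         'test': '/path/to/test.txt'}

loader = CWSLoader()
data_bundle = loader.load(paths)  # one DataSet per key, assembled into a DataBundle
print(data_bundle)
```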

+ 0
- 603
docs/source/_static/notebooks/tutorial_5_loss_optimizer.ipynb

@@ -1,603 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 使用Trainer和Tester快速训练和测试"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 数据读入和处理"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/remote-home/ynzheng/anaconda3/envs/now/lib/python3.8/site-packages/FastNLP-0.5.0-py3.8.egg/fastNLP/io/loader/classification.py:340: UserWarning: SST2's test file has no target.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"In total 3 datasets:\n",
"\ttest has 1821 instances.\n",
"\ttrain has 67349 instances.\n",
"\tdev has 872 instances.\n",
"In total 2 vocabs:\n",
"\twords has 16292 entries.\n",
"\ttarget has 2 entries.\n",
"\n",
"+-----------------------------------+--------+-----------------------------------+---------+\n",
"| raw_words | target | words | seq_len |\n",
"+-----------------------------------+--------+-----------------------------------+---------+\n",
"| hide new secretions from the p... | 1 | [4110, 97, 12009, 39, 2, 6843,... | 7 |\n",
"+-----------------------------------+--------+-----------------------------------+---------+\n",
"Vocabulary(['hide', 'new', 'secretions', 'from', 'the']...)\n"
]
}
],
"source": [
"from fastNLP.io import SST2Pipe\n",
"\n",
"pipe = SST2Pipe()\n",
"databundle = pipe.process_from_file()\n",
"vocab = databundle.get_vocab('words')\n",
"print(databundle)\n",
"print(databundle.get_dataset('train')[0])\n",
"print(databundle.get_vocab('words'))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4925 872 75\n"
]
}
],
"source": [
"train_data = databundle.get_dataset('train')[:5000]\n",
"train_data, test_data = train_data.split(0.015)\n",
"dev_data = databundle.get_dataset('dev')\n",
"print(len(train_data),len(dev_data),len(test_data))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------------+-----------+--------+-------+---------+\n",
"| field_names | raw_words | target | words | seq_len |\n",
"+-------------+-----------+--------+-------+---------+\n",
"| is_input | False | False | True | True |\n",
"| is_target | False | True | False | False |\n",
"| ignore_type | | False | False | False |\n",
"| pad_value | | 0 | 0 | 0 |\n",
"+-------------+-----------+--------+-------+---------+\n"
]
},
{
"data": {
"text/plain": [
"<prettytable.PrettyTable at 0x7f49ec540160>"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_data.print_field_meta()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 使用内置模型训练"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP.models import CNNText\n",
"\n",
"#词嵌入的维度\n",
"EMBED_DIM = 100\n",
"\n",
"#使用CNNText的时候第一个参数输入一个tuple,作为模型定义embedding的参数\n",
"#还可以传入 kernel_nums, kernel_sizes, padding, dropout的自定义值\n",
"model_cnn = CNNText((len(vocab),EMBED_DIM), num_classes=2, dropout=0.1)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP import AccuracyMetric\n",
"from fastNLP import Const\n",
"\n",
"# metrics=AccuracyMetric() 在本例中与下面这行代码等价\n",
"metrics=AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP import CrossEntropyLoss\n",
"\n",
"# loss = CrossEntropyLoss() 在本例中与下面这行代码等价\n",
"loss = CrossEntropyLoss(pred=Const.OUTPUT, target=Const.TARGET)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# 这表示构建了一个损失函数类,由func计算损失函数,其中将从模型返回值或者DataSet的target=True的field\n",
"# 当中找到一个参数名为`pred`的参数传入func一个参数名为`input`的参数;找到一个参数名为`label`的参数\n",
"# 传入func作为一个名为`target`的参数\n",
"#下面自己构建了一个交叉熵函数,和之后直接使用fastNLP中的交叉熵函数是一个效果\n",
"import torch\n",
"from fastNLP import LossFunc\n",
"func = torch.nn.functional.cross_entropy\n",
"loss_func = LossFunc(func, input=Const.OUTPUT, target=Const.TARGET)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import torch.optim as optim\n",
"\n",
"#使用 torch.optim 定义优化器\n",
"optimizer=optim.RMSprop(model_cnn.parameters(), lr=0.01, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"input fields after batch(if batch size is 2):\n",
"\twords: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 4]) \n",
"\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"target fields after batch(if batch size is 2):\n",
"\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"\n",
"training epochs started 2020-02-27-11-31-25\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=3080.0), HTML(value='')), layout=Layout(d…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.75 seconds!\n",
"\r",
"Evaluation on dev at Epoch 1/10. Step:308/3080: \n",
"\r",
"AccuracyMetric: acc=0.751147\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.83 seconds!\n",
"\r",
"Evaluation on dev at Epoch 2/10. Step:616/3080: \n",
"\r",
"AccuracyMetric: acc=0.755734\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 1.32 seconds!\n",
"\r",
"Evaluation on dev at Epoch 3/10. Step:924/3080: \n",
"\r",
"AccuracyMetric: acc=0.758028\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.88 seconds!\n",
"\r",
"Evaluation on dev at Epoch 4/10. Step:1232/3080: \n",
"\r",
"AccuracyMetric: acc=0.741972\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.96 seconds!\n",
"\r",
"Evaluation on dev at Epoch 5/10. Step:1540/3080: \n",
"\r",
"AccuracyMetric: acc=0.728211\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.87 seconds!\n",
"\r",
"Evaluation on dev at Epoch 6/10. Step:1848/3080: \n",
"\r",
"AccuracyMetric: acc=0.755734\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 1.04 seconds!\n",
"\r",
"Evaluation on dev at Epoch 7/10. Step:2156/3080: \n",
"\r",
"AccuracyMetric: acc=0.732798\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.57 seconds!\n",
"\r",
"Evaluation on dev at Epoch 8/10. Step:2464/3080: \n",
"\r",
"AccuracyMetric: acc=0.747706\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.48 seconds!\n",
"\r",
"Evaluation on dev at Epoch 9/10. Step:2772/3080: \n",
"\r",
"AccuracyMetric: acc=0.732798\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.48 seconds!\n",
"\r",
"Evaluation on dev at Epoch 10/10. Step:3080/3080: \n",
"\r",
"AccuracyMetric: acc=0.740826\n",
"\n",
"\r\n",
"In Epoch:3/Step:924, got best dev performance:\n",
"AccuracyMetric: acc=0.758028\n",
"Reloaded the best model.\n"
]
},
{
"data": {
"text/plain": [
"{'best_eval': {'AccuracyMetric': {'acc': 0.758028}},\n",
" 'best_epoch': 3,\n",
" 'best_step': 924,\n",
" 'seconds': 160.58}"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Trainer\n",
"\n",
"#训练的轮数和batch size\n",
"N_EPOCHS = 10\n",
"BATCH_SIZE = 16\n",
"\n",
"#如果在定义trainer的时候没有传入optimizer参数,模型默认的优化器为torch.optim.Adam且learning rate为lr=4e-3\n",
"#这里只使用了loss作为损失函数输入,感兴趣可以尝试其他损失函数(如之前自定义的loss_func)作为输入\n",
"trainer = Trainer(model=model_cnn, train_data=train_data, dev_data=dev_data, loss=loss, metrics=metrics,\n",
"optimizer=optimizer,n_epochs=N_EPOCHS, batch_size=BATCH_SIZE)\n",
"trainer.train()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=5.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.43 seconds!\n",
"[tester] \n",
"AccuracyMetric: acc=0.773333\n"
]
},
{
"data": {
"text/plain": [
"{'AccuracyMetric': {'acc': 0.773333}}"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Tester\n",
"\n",
"tester = Tester(test_data, model_cnn, metrics=AccuracyMetric())\n",
"tester.test()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python Now",
"language": "python",
"name": "now"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
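The notebook above builds a loss_func with LossFunc but never passes it to the Trainer. The sketch below, not part of the deleted notebook, shows what that call would look like, re-using model_cnn, train_data, dev_data, metrics, optimizer and loss_func exactly as defined in the notebook.

```python
from fastNLP import Trainer

# identical to the Trainer call in the notebook, except that the loss is the
# LossFunc wrapper around torch.nn.functional.cross_entropy defined earlier
trainer = Trainer(model=model_cnn, train_data=train_data, dev_data=dev_data,
                  loss=loss_func, metrics=metrics, optimizer=optimizer,
                  n_epochs=10, batch_size=16)
trainer.train()
```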

+ 0
- 681
docs/source/_static/notebooks/tutorial_6_datasetiter.ipynb

@@ -1,681 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 使用Trainer和Tester快速训练和测试"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 数据读入和处理"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/remote-home/ynzheng/anaconda3/envs/now/lib/python3.8/site-packages/FastNLP-0.5.0-py3.8.egg/fastNLP/io/loader/classification.py:340: UserWarning: SST2's test file has no target.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"In total 3 datasets:\n",
"\ttest has 1821 instances.\n",
"\ttrain has 67349 instances.\n",
"\tdev has 872 instances.\n",
"In total 2 vocabs:\n",
"\twords has 16292 entries.\n",
"\ttarget has 2 entries.\n",
"\n",
"+-----------------------------------+--------+-----------------------------------+---------+\n",
"| raw_words | target | words | seq_len |\n",
"+-----------------------------------+--------+-----------------------------------+---------+\n",
"| hide new secretions from the p... | 1 | [4110, 97, 12009, 39, 2, 6843,... | 7 |\n",
"+-----------------------------------+--------+-----------------------------------+---------+\n",
"Vocabulary(['hide', 'new', 'secretions', 'from', 'the']...)\n"
]
}
],
"source": [
"from fastNLP.io import SST2Pipe\n",
"\n",
"pipe = SST2Pipe()\n",
"databundle = pipe.process_from_file()\n",
"vocab = databundle.get_vocab('words')\n",
"print(databundle)\n",
"print(databundle.get_dataset('train')[0])\n",
"print(databundle.get_vocab('words'))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4925 872 75\n"
]
}
],
"source": [
"train_data = databundle.get_dataset('train')[:5000]\n",
"train_data, test_data = train_data.split(0.015)\n",
"dev_data = databundle.get_dataset('dev')\n",
"print(len(train_data),len(dev_data),len(test_data))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------------+-----------+--------+-------+---------+\n",
"| field_names | raw_words | target | words | seq_len |\n",
"+-------------+-----------+--------+-------+---------+\n",
"| is_input | False | False | True | True |\n",
"| is_target | False | True | False | False |\n",
"| ignore_type | | False | False | False |\n",
"| pad_value | | 0 | 0 | 0 |\n",
"+-------------+-----------+--------+-------+---------+\n"
]
},
{
"data": {
"text/plain": [
"<prettytable.PrettyTable at 0x7f0db03d0640>"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_data.print_field_meta()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP import AccuracyMetric\n",
"from fastNLP import Const\n",
"\n",
"# metrics=AccuracyMetric() 在本例中与下面这行代码等价\n",
"metrics=AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## DataSetIter初探"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"batch_x: {'words': tensor([[ 13, 830, 7746, 174, 3, 47, 6, 83, 5752, 15,\n",
" 2177, 15, 63, 57, 406, 84, 1009, 4973, 27, 17,\n",
" 13785, 3, 533, 3687, 15623, 39, 375, 8, 15624, 8,\n",
" 1323, 4398, 7],\n",
" [ 1045, 11113, 16, 104, 5, 4, 176, 1824, 1704, 3,\n",
" 2, 18, 11, 4, 1018, 432, 143, 33, 245, 308,\n",
" 7, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0]]), 'seq_len': tensor([33, 21])}\n",
"batch_y: {'target': tensor([1, 0])}\n",
"batch_x: {'words': tensor([[ 14, 10, 4, 311, 5, 154, 1418, 609, 7],\n",
" [ 14, 10, 437, 32, 78, 3, 78, 437, 7]]), 'seq_len': tensor([9, 9])}\n",
"batch_y: {'target': tensor([0, 1])}\n",
"batch_x: {'words': tensor([[ 4, 277, 685, 18, 7],\n",
" [15618, 3204, 5, 1675, 0]]), 'seq_len': tensor([5, 4])}\n",
"batch_y: {'target': tensor([1, 1])}\n",
"batch_x: {'words': tensor([[ 2, 155, 3, 4426, 3, 239, 3, 739, 5, 1136,\n",
" 41, 43, 2427, 736, 2, 648, 10, 15620, 2285, 7],\n",
" [ 24, 95, 28, 46, 8, 336, 38, 239, 8, 2133,\n",
" 2, 18, 10, 15622, 1421, 6, 61, 5, 387, 7]]), 'seq_len': tensor([20, 20])}\n",
"batch_y: {'target': tensor([0, 0])}\n",
"batch_x: {'words': tensor([[ 879, 96, 8, 1026, 12, 8067, 11, 13623, 8, 15619,\n",
" 4, 673, 662, 15, 4, 1154, 240, 639, 417, 7],\n",
" [ 45, 752, 327, 180, 10, 15621, 16, 72, 8904, 9,\n",
" 1217, 7, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([20, 12])}\n",
"batch_y: {'target': tensor([0, 1])}\n"
]
}
],
"source": [
"from fastNLP import BucketSampler\n",
"from fastNLP import DataSetIter\n",
"\n",
"tmp_data = dev_data[:10]\n",
"# 定义一个Batch,传入DataSet,规定batch_size和去batch的规则。\n",
"# 顺序(Sequential),随机(Random),相似长度组成一个batch(Bucket)\n",
"sampler = BucketSampler(batch_size=2, seq_len_field_name='seq_len')\n",
"batch = DataSetIter(batch_size=2, dataset=tmp_data, sampler=sampler)\n",
"for batch_x, batch_y in batch:\n",
" print(\"batch_x: \",batch_x)\n",
" print(\"batch_y: \", batch_y)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"batch_x: {'words': tensor([[ 13, 830, 7746, 174, 3, 47, 6, 83, 5752, 15,\n",
" 2177, 15, 63, 57, 406, 84, 1009, 4973, 27, 17,\n",
" 13785, 3, 533, 3687, 15623, 39, 375, 8, 15624, 8,\n",
" 1323, 4398, 7],\n",
" [ 1045, 11113, 16, 104, 5, 4, 176, 1824, 1704, 3,\n",
" 2, 18, 11, 4, 1018, 432, 143, 33, 245, 308,\n",
" 7, -1, -1, -1, -1, -1, -1, -1, -1, -1,\n",
" -1, -1, -1]]), 'seq_len': tensor([33, 21])}\n",
"batch_y: {'target': tensor([1, 0])}\n",
"batch_x: {'words': tensor([[ 14, 10, 4, 311, 5, 154, 1418, 609, 7],\n",
" [ 14, 10, 437, 32, 78, 3, 78, 437, 7]]), 'seq_len': tensor([9, 9])}\n",
"batch_y: {'target': tensor([0, 1])}\n",
"batch_x: {'words': tensor([[ 2, 155, 3, 4426, 3, 239, 3, 739, 5, 1136,\n",
" 41, 43, 2427, 736, 2, 648, 10, 15620, 2285, 7],\n",
" [ 24, 95, 28, 46, 8, 336, 38, 239, 8, 2133,\n",
" 2, 18, 10, 15622, 1421, 6, 61, 5, 387, 7]]), 'seq_len': tensor([20, 20])}\n",
"batch_y: {'target': tensor([0, 0])}\n",
"batch_x: {'words': tensor([[ 4, 277, 685, 18, 7],\n",
" [15618, 3204, 5, 1675, -1]]), 'seq_len': tensor([5, 4])}\n",
"batch_y: {'target': tensor([1, 1])}\n",
"batch_x: {'words': tensor([[ 879, 96, 8, 1026, 12, 8067, 11, 13623, 8, 15619,\n",
" 4, 673, 662, 15, 4, 1154, 240, 639, 417, 7],\n",
" [ 45, 752, 327, 180, 10, 15621, 16, 72, 8904, 9,\n",
" 1217, 7, -1, -1, -1, -1, -1, -1, -1, -1]]), 'seq_len': tensor([20, 12])}\n",
"batch_y: {'target': tensor([0, 1])}\n"
]
}
],
"source": [
"tmp_data.set_pad_val('words',-1)\n",
"batch = DataSetIter(batch_size=2, dataset=tmp_data, sampler=sampler)\n",
"for batch_x, batch_y in batch:\n",
" print(\"batch_x: \",batch_x)\n",
" print(\"batch_y: \", batch_y)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"batch_x: {'words': tensor([[ 45, 752, 327, 180, 10, 15621, 16, 72, 8904, 9,\n",
" 1217, 7, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
" [ 879, 96, 8, 1026, 12, 8067, 11, 13623, 8, 15619,\n",
" 4, 673, 662, 15, 4, 1154, 240, 639, 417, 7,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([12, 20])}\n",
"batch_y: {'target': tensor([1, 0])}\n",
"batch_x: {'words': tensor([[ 13, 830, 7746, 174, 3, 47, 6, 83, 5752, 15,\n",
" 2177, 15, 63, 57, 406, 84, 1009, 4973, 27, 17,\n",
" 13785, 3, 533, 3687, 15623, 39, 375, 8, 15624, 8,\n",
" 1323, 4398, 7, 0, 0, 0, 0, 0, 0, 0],\n",
" [ 1045, 11113, 16, 104, 5, 4, 176, 1824, 1704, 3,\n",
" 2, 18, 11, 4, 1018, 432, 143, 33, 245, 308,\n",
" 7, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([33, 21])}\n",
"batch_y: {'target': tensor([1, 0])}\n",
"batch_x: {'words': tensor([[ 14, 10, 4, 311, 5, 154, 1418, 609, 7, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0],\n",
" [ 14, 10, 437, 32, 78, 3, 78, 437, 7, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0]]), 'seq_len': tensor([9, 9])}\n",
"batch_y: {'target': tensor([0, 1])}\n",
"batch_x: {'words': tensor([[ 2, 155, 3, 4426, 3, 239, 3, 739, 5, 1136,\n",
" 41, 43, 2427, 736, 2, 648, 10, 15620, 2285, 7,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
" [ 24, 95, 28, 46, 8, 336, 38, 239, 8, 2133,\n",
" 2, 18, 10, 15622, 1421, 6, 61, 5, 387, 7,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([20, 20])}\n",
"batch_y: {'target': tensor([0, 0])}\n",
"batch_x: {'words': tensor([[ 4, 277, 685, 18, 7, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
" [15618, 3204, 5, 1675, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([5, 4])}\n",
"batch_y: {'target': tensor([1, 1])}\n"
]
}
],
"source": [
"from fastNLP.core.field import Padder\n",
"import numpy as np\n",
"class FixLengthPadder(Padder):\n",
" def __init__(self, pad_val=0, length=None):\n",
" super().__init__(pad_val=pad_val)\n",
" self.length = length\n",
" assert self.length is not None, \"Creating FixLengthPadder with no specific length!\"\n",
"\n",
" def __call__(self, contents, field_name, field_ele_dtype, dim):\n",
" #计算当前contents中的最大长度\n",
" max_len = max(map(len, contents))\n",
" #如果当前contents中的最大长度大于指定的padder length的话就报错\n",
" assert max_len <= self.length, \"Fixed padder length smaller than actual length! with length {}\".format(max_len)\n",
" array = np.full((len(contents), self.length), self.pad_val, dtype=field_ele_dtype)\n",
" for i, content_i in enumerate(contents):\n",
" array[i, :len(content_i)] = content_i\n",
" return array\n",
"\n",
"#设定FixLengthPadder的固定长度为40\n",
"tmp_padder = FixLengthPadder(pad_val=0,length=40)\n",
"#利用dataset的set_padder函数设定words field的padder\n",
"tmp_data.set_padder('words',tmp_padder)\n",
"batch = DataSetIter(batch_size=2, dataset=tmp_data, sampler=sampler)\n",
"for batch_x, batch_y in batch:\n",
" print(\"batch_x: \",batch_x)\n",
" print(\"batch_y: \", batch_y)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 使用DataSetIter自己编写训练过程\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-----start training-----\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 2.68 seconds!\n",
"Epoch 0 Avg Loss: 0.66 AccuracyMetric: acc=0.708716 29307ms\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.38 seconds!\n",
"Epoch 1 Avg Loss: 0.41 AccuracyMetric: acc=0.770642 52200ms\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.51 seconds!\n",
"Epoch 2 Avg Loss: 0.16 AccuracyMetric: acc=0.747706 70268ms\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.96 seconds!\n",
"Epoch 3 Avg Loss: 0.06 AccuracyMetric: acc=0.741972 90349ms\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 1.04 seconds!\n",
"Epoch 4 Avg Loss: 0.03 AccuracyMetric: acc=0.740826 114250ms\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.8 seconds!\n",
"Epoch 5 Avg Loss: 0.02 AccuracyMetric: acc=0.738532 134742ms\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.65 seconds!\n",
"Epoch 6 Avg Loss: 0.01 AccuracyMetric: acc=0.731651 154503ms\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.8 seconds!\n",
"Epoch 7 Avg Loss: 0.01 AccuracyMetric: acc=0.738532 175397ms\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.36 seconds!\n",
"Epoch 8 Avg Loss: 0.01 AccuracyMetric: acc=0.733945 192384ms\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.84 seconds!\n",
"Epoch 9 Avg Loss: 0.01 AccuracyMetric: acc=0.744266 214417ms\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=5.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.04 seconds!\n",
"[tester] \n",
"AccuracyMetric: acc=0.786667\n"
]
},
{
"data": {
"text/plain": [
"{'AccuracyMetric': {'acc': 0.786667}}"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import BucketSampler\n",
"from fastNLP import DataSetIter\n",
"from fastNLP.models import CNNText\n",
"from fastNLP import Tester\n",
"import torch\n",
"import time\n",
"\n",
"embed_dim = 100\n",
"model = CNNText((len(vocab),embed_dim), num_classes=2, dropout=0.1)\n",
"\n",
"def train(epoch, data, devdata):\n",
" optimizer = torch.optim.Adam(model.parameters(), lr=0.001)\n",
" lossfunc = torch.nn.CrossEntropyLoss()\n",
" batch_size = 32\n",
"\n",
" # 定义一个Batch,传入DataSet,规定batch_size和去batch的规则。\n",
" # 顺序(Sequential),随机(Random),相似长度组成一个batch(Bucket)\n",
" train_sampler = BucketSampler(batch_size=batch_size, seq_len_field_name='seq_len')\n",
" train_batch = DataSetIter(batch_size=batch_size, dataset=data, sampler=train_sampler)\n",
"\n",
" start_time = time.time()\n",
" print(\"-\"*5+\"start training\"+\"-\"*5)\n",
" for i in range(epoch):\n",
" loss_list = []\n",
" for batch_x, batch_y in train_batch:\n",
" optimizer.zero_grad()\n",
" output = model(batch_x['words'])\n",
" loss = lossfunc(output['pred'], batch_y['target'])\n",
" loss.backward()\n",
" optimizer.step()\n",
" loss_list.append(loss.item())\n",
"\n",
" #这里verbose如果为0,在调用Tester对象的test()函数时不输出任何信息,返回评估信息; 如果为1,打印出验证结果,返回评估信息\n",
" #在调用过Tester对象的test()函数后,调用其_format_eval_results(res)函数,结构化输出验证结果\n",
" tester_tmp = Tester(devdata, model, metrics=AccuracyMetric(), verbose=0)\n",
" res=tester_tmp.test()\n",
"\n",
" print('Epoch {:d} Avg Loss: {:.2f}'.format(i, sum(loss_list) / len(loss_list)),end=\" \")\n",
" print(tester_tmp._format_eval_results(res),end=\" \")\n",
" print('{:d}ms'.format(round((time.time()-start_time)*1000)))\n",
" loss_list.clear()\n",
"\n",
"train(10, train_data, dev_data)\n",
"#使用tester进行快速测试\n",
"tester = Tester(test_data, model, metrics=AccuracyMetric())\n",
"tester.test()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python Now",
"language": "python",
"name": "now"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
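The notebook above evaluates with a Tester after the hand-written training loop. For symmetry, here is a minimal evaluation loop written directly with DataSetIter; it is not part of the deleted notebook, re-uses model and test_data from the notebook, and assumes the model returns a dict with a 'pred' entry, as CNNText does.

```python
import torch
from fastNLP import DataSetIter

def evaluate(model, data, batch_size=32):
    """Compute plain accuracy by iterating over the DataSet ourselves."""
    data_iter = DataSetIter(batch_size=batch_size, dataset=data)  # default sequential order
    correct, total = 0, 0
    model.eval()
    with torch.no_grad():
        for batch_x, batch_y in data_iter:
            pred = model(batch_x['words'])['pred'].argmax(dim=-1)
            correct += (pred == batch_y['target']).sum().item()
            total += batch_y['target'].size(0)
    return correct / total

print(evaluate(model, test_data))
```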

+ 0
- 1206
docs/source/_static/notebooks/tutorial_7_metrics.ipynb
File diff suppressed because it is too large


+ 0
- 1014
docs/source/_static/notebooks/tutorial_8_modules_models.ipynb
File diff suppressed because it is too large


+ 0
- 622
docs/source/_static/notebooks/tutorial_9_callback.ipynb

@@ -1,622 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 使用 Callback 自定义你的训练过程"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- 什么是 Callback\n",
"- 使用 Callback \n",
"- 一些常用的 Callback\n",
"- 自定义实现 Callback"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"什么是Callback\n",
"------\n",
"\n",
"Callback 是与 Trainer 紧密结合的模块,利用 Callback 可以在 Trainer 训练时,加入自定义的操作,比如梯度裁剪,学习率调节,测试模型的性能等。定义的 Callback 会在训练的特定阶段被调用。\n",
"\n",
"fastNLP 中提供了很多常用的 Callback ,开箱即用。"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"使用 Callback\n",
" ------\n",
"\n",
"使用 Callback 很简单,将需要的 callback 按 list 存储,以对应参数 ``callbacks`` 传入对应的 Trainer。Trainer 在训练时就会自动执行这些 Callback 指定的操作了。"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2019-09-17T07:34:46.465871Z",
"start_time": "2019-09-17T07:34:30.648758Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"In total 3 datasets:\n",
"\ttest has 1200 instances.\n",
"\ttrain has 9600 instances.\n",
"\tdev has 1200 instances.\n",
"In total 2 vocabs:\n",
"\tchars has 4409 entries.\n",
"\ttarget has 2 entries.\n",
"\n",
"training epochs started 2019-09-17-03-34-34\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=900), HTML(value='')), layout=Layout(display=…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 0.1 seconds!\n",
"Evaluation on dev at Epoch 1/3. Step:300/900: \n",
"AccuracyMetric: acc=0.863333\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 0.11 seconds!\n",
"Evaluation on dev at Epoch 2/3. Step:600/900: \n",
"AccuracyMetric: acc=0.886667\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 0.1 seconds!\n",
"Evaluation on dev at Epoch 3/3. Step:900/900: \n",
"AccuracyMetric: acc=0.890833\n",
"\n",
"\r\n",
"In Epoch:3/Step:900, got best dev performance:\n",
"AccuracyMetric: acc=0.890833\n",
"Reloaded the best model.\n"
]
}
],
"source": [
"from fastNLP import (Callback, EarlyStopCallback,\n",
" Trainer, CrossEntropyLoss, AccuracyMetric)\n",
"from fastNLP.models import CNNText\n",
"import torch.cuda\n",
"\n",
"# prepare data\n",
"def get_data():\n",
" from fastNLP.io import ChnSentiCorpPipe as pipe\n",
" data = pipe().process_from_file()\n",
" print(data)\n",
" data.rename_field('chars', 'words')\n",
" train_data = data.datasets['train']\n",
" dev_data = data.datasets['dev']\n",
" test_data = data.datasets['test']\n",
" vocab = data.vocabs['words']\n",
" tgt_vocab = data.vocabs['target']\n",
" return train_data, dev_data, test_data, vocab, tgt_vocab\n",
"\n",
"# prepare model\n",
"train_data, dev_data, _, vocab, tgt_vocab = get_data()\n",
"device = 'cuda:0' if torch.cuda.is_available() else 'cpu'\n",
"model = CNNText((len(vocab),50), num_classes=len(tgt_vocab))\n",
"\n",
"# define callback\n",
"callbacks=[EarlyStopCallback(5)]\n",
"\n",
"# pass callbacks to Trainer\n",
"def train_with_callback(cb_list):\n",
" trainer = Trainer(\n",
" device=device,\n",
" n_epochs=3,\n",
" model=model, \n",
" train_data=train_data, \n",
" dev_data=dev_data, \n",
" loss=CrossEntropyLoss(), \n",
" metrics=AccuracyMetric(), \n",
" callbacks=cb_list, \n",
" check_code_level=-1\n",
" )\n",
" trainer.train()\n",
"\n",
"train_with_callback(callbacks)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"fastNLP 中的 Callback\n",
"-------\n",
"fastNLP 中提供了很多常用的 Callback,如梯度裁剪,训练时早停和测试验证集,fitlog 等等。具体 Callback 请参考 fastNLP.core.callbacks"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2019-09-17T07:35:02.182727Z",
"start_time": "2019-09-17T07:34:49.443863Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"training epochs started 2019-09-17-03-34-49\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=900), HTML(value='')), layout=Layout(display=…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 0.13 seconds!\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 0.12 seconds!\n",
"Evaluation on data-test:\n",
"AccuracyMetric: acc=0.890833\n",
"Evaluation on dev at Epoch 1/3. Step:300/900: \n",
"AccuracyMetric: acc=0.890833\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 0.09 seconds!\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 0.09 seconds!\n",
"Evaluation on data-test:\n",
"AccuracyMetric: acc=0.8875\n",
"Evaluation on dev at Epoch 2/3. Step:600/900: \n",
"AccuracyMetric: acc=0.8875\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 0.11 seconds!\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 0.1 seconds!\n",
"Evaluation on data-test:\n",
"AccuracyMetric: acc=0.885\n",
"Evaluation on dev at Epoch 3/3. Step:900/900: \n",
"AccuracyMetric: acc=0.885\n",
"\n",
"\r\n",
"In Epoch:1/Step:300, got best dev performance:\n",
"AccuracyMetric: acc=0.890833\n",
"Reloaded the best model.\n"
]
}
],
"source": [
"from fastNLP import EarlyStopCallback, GradientClipCallback, EvaluateCallback\n",
"callbacks = [\n",
" EarlyStopCallback(5),\n",
" GradientClipCallback(clip_value=5, clip_type='value'),\n",
" EvaluateCallback(dev_data)\n",
"]\n",
"\n",
"train_with_callback(callbacks)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"自定义 Callback\n",
"------\n",
"\n",
"这里我们以一个简单的 Callback作为例子,它的作用是打印每一个 Epoch 平均训练 loss。\n",
"\n",
"#### 创建 Callback\n",
" \n",
"要自定义 Callback,我们要实现一个类,继承 fastNLP.Callback。\n",
"\n",
"这里我们定义 MyCallBack ,继承 fastNLP.Callback 。\n",
"\n",
"#### 指定 Callback 调用的阶段\n",
" \n",
"Callback 中所有以 on_ 开头的类方法会在 Trainer 的训练中在特定阶段调用。 如 on_train_begin() 会在训练开始时被调用,on_epoch_end() 会在每个 epoch 结束时调用。 具体有哪些类方法,参见 Callback 文档。\n",
"\n",
"这里, MyCallBack 在求得loss时调用 on_backward_begin() 记录当前 loss ,在每一个 epoch 结束时调用 on_epoch_end() ,求当前 epoch 平均loss并输出。\n",
"\n",
"#### 使用 Callback 的属性访问 Trainer 的内部信息\n",
" \n",
"为了方便使用,可以使用 Callback 的属性,访问 Trainer 中的对应信息,如 optimizer, epoch, n_epochs,分别对应训练时的优化器,当前 epoch 数,和总 epoch 数。 具体可访问的属性,参见文档 Callback 。\n",
"\n",
"这里, MyCallBack 为了求平均 loss ,需要知道当前 epoch 的总步数,可以通过 self.step 属性得到当前训练了多少步。\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2019-09-17T07:43:10.907139Z",
"start_time": "2019-09-17T07:42:58.488177Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"training epochs started 2019-09-17-03-42-58\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=900), HTML(value='')), layout=Layout(display=…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 0.11 seconds!\n",
"Evaluation on dev at Epoch 1/3. Step:300/900: \n",
"AccuracyMetric: acc=0.883333\n",
"\n",
"Avg loss at epoch 1, 0.100254\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 0.1 seconds!\n",
"Evaluation on dev at Epoch 2/3. Step:600/900: \n",
"AccuracyMetric: acc=0.8775\n",
"\n",
"Avg loss at epoch 2, 0.183511\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=38), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 0.13 seconds!\n",
"Evaluation on dev at Epoch 3/3. Step:900/900: \n",
"AccuracyMetric: acc=0.875833\n",
"\n",
"Avg loss at epoch 3, 0.257103\n",
"\r\n",
"In Epoch:1/Step:300, got best dev performance:\n",
"AccuracyMetric: acc=0.883333\n",
"Reloaded the best model.\n"
]
}
],
"source": [
"from fastNLP import Callback\n",
"from fastNLP import logger\n",
"\n",
"class MyCallBack(Callback):\n",
" \"\"\"Print average loss in each epoch\"\"\"\n",
" def __init__(self):\n",
" super().__init__()\n",
" self.total_loss = 0\n",
" self.start_step = 0\n",
" \n",
" def on_backward_begin(self, loss):\n",
" self.total_loss += loss.item()\n",
" \n",
" def on_epoch_end(self):\n",
" n_steps = self.step - self.start_step\n",
" avg_loss = self.total_loss / n_steps\n",
" logger.info('Avg loss at epoch %d, %.6f', self.epoch, avg_loss)\n",
" self.start_step = self.step\n",
"\n",
"callbacks = [MyCallBack()]\n",
"train_with_callback(callbacks)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 4
}

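A note on the MyCallBack class in the notebook above: total_loss is never reset, so the value logged as "Avg loss at epoch" is the cumulative loss since the start of training divided by the number of steps in a single epoch, which is why it climbs from 0.100 to 0.257 over the three epochs. A minimal variation that logs a true per-epoch average, using only the hooks the tutorial already demonstrates, could look like the sketch below (illustrative only, not part of the original notebook)::

    from fastNLP import Callback, logger

    class EpochAvgLossCallback(Callback):
        """Log the average training loss of each individual epoch."""
        def __init__(self):
            super().__init__()
            self.epoch_loss = 0.0
            self.epoch_steps = 0

        def on_backward_begin(self, loss):
            # accumulate the loss of the current epoch only
            self.epoch_loss += loss.item()
            self.epoch_steps += 1

        def on_epoch_end(self):
            avg_loss = self.epoch_loss / max(self.epoch_steps, 1)
            logger.info('Epoch %d finished, avg loss %.6f', self.epoch, avg_loss)
            # reset the accumulators so the next epoch starts from zero
            self.epoch_loss = 0.0
            self.epoch_steps = 0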
+ 0
- 912
docs/source/_static/notebooks/序列标注.ipynb View File

@@ -1,912 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 序列标注\n",
"\n",
"这一部分的内容主要展示如何使用fastNLP实现序列标注(Sequence labeling)任务。您可以使用fastNLP的各个组件快捷,方便地完成序列标注任务,达到出色的效果。 在阅读这篇教程前,希望您已经熟悉了fastNLP的基础使用,尤其是数据的载入以及模型的构建,通过这个小任务的能让您进一步熟悉fastNLP的使用。\n",
"\n",
"## 命名实体识别(name entity recognition, NER)\n",
"\n",
"命名实体识别任务是从文本中抽取出具有特殊意义或者指代性非常强的实体,通常包括人名、地名、机构名和时间等。 如下面的例子中\n",
"\n",
"*我来自复旦大学*\n",
"\n",
"其中“复旦大学”就是一个机构名,命名实体识别就是要从中识别出“复旦大学”这四个字是一个整体,且属于机构名这个类别。这个问题在实际做的时候会被 转换为序列标注问题\n",
"\n",
"针对\"我来自复旦大学\"这句话,我们的预测目标将是[O, O, O, B-ORG, I-ORG, I-ORG, I-ORG],其中O表示out,即不是一个实体,B-ORG是ORG( organization的缩写)这个类别的开头(Begin),I-ORG是ORG类别的中间(Inside)。\n",
"\n",
"在本tutorial中我们将通过fastNLP尝试写出一个能够执行以上任务的模型。\n",
"\n",
"## 载入数据\n",
"\n",
"fastNLP的数据载入主要是由Loader与Pipe两个基类衔接完成的,您可以通过《使用Loader和Pipe处理数据》了解如何使用fastNLP提供的数据加载函数。下面我们以微博命名实体任务来演示一下在fastNLP进行序列标注任务。"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----------------------------------+-----------------------------------+-----------------------------------+---------+\n",
"| raw_chars | target | chars | seq_len |\n",
"+-----------------------------------+-----------------------------------+-----------------------------------+---------+\n",
"| ['科', '技', '全', '方', '位',... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... | [792, 1015, 156, 198, 291, 714... | 26 |\n",
"| ['对', ',', '输', '给', '一',... | [0, 0, 0, 0, 0, 0, 3, 1, 0, 0,... | [123, 2, 1205, 115, 8, 24, 101... | 15 |\n",
"+-----------------------------------+-----------------------------------+-----------------------------------+---------+\n"
]
}
],
"source": [
"from fastNLP.io import WeiboNERPipe\n",
"data_bundle = WeiboNERPipe().process_from_file()\n",
"print(data_bundle.get_dataset('train')[:2])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 模型构建\n",
"\n",
"首先选择需要使用的Embedding类型。关于Embedding的相关说明可以参见《使用Embedding模块将文本转成向量》。 在这里我们使用通过word2vec预训练的中文汉字embedding。"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 3321 out of 3471 words in the pre-training embedding.\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"\n",
"embed = StaticEmbedding(vocab=data_bundle.get_vocab('chars'), model_dir_or_name='cn-char-fastnlp-100d')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"选择好Embedding之后,我们可以使用fastNLP中自带的 fastNLP.models.BiLSTMCRF 作为模型。"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP.models import BiLSTMCRF\n",
"\n",
"data_bundle.rename_field('chars', 'words') # 这是由于BiLSTMCRF模型的forward函数接受的words,而不是chars,所以需要把这一列重新命名\n",
"model = BiLSTMCRF(embed=embed, num_classes=len(data_bundle.get_vocab('target')), num_layers=1, hidden_size=200, dropout=0.5,\n",
" target_vocab=data_bundle.get_vocab('target'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 进行训练\n",
"下面我们选择用来评估模型的metric,以及优化用到的优化函数。"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP import SpanFPreRecMetric\n",
"from torch.optim import Adam\n",
"from fastNLP import LossInForward\n",
"\n",
"metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'))\n",
"optimizer = Adam(model.parameters(), lr=1e-2)\n",
"loss = LossInForward()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"使用Trainer进行训练, 您可以通过修改 device 的值来选择显卡。"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"input fields after batch(if batch size is 2):\n",
"\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26]) \n",
"\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"\twords: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26]) \n",
"target fields after batch(if batch size is 2):\n",
"\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26]) \n",
"\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"\n",
"training epochs started 2020-02-27-13-53-24\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=430.0), HTML(value='')), layout=Layout(di…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.89 seconds!\n",
"\r",
"Evaluation on dev at Epoch 1/10. Step:43/430: \n",
"\r",
"SpanFPreRecMetric: f=0.067797, pre=0.192771, rec=0.041131\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.9 seconds!\n",
"\r",
"Evaluation on dev at Epoch 2/10. Step:86/430: \n",
"\r",
"SpanFPreRecMetric: f=0.344086, pre=0.568047, rec=0.246787\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.88 seconds!\n",
"\r",
"Evaluation on dev at Epoch 3/10. Step:129/430: \n",
"\r",
"SpanFPreRecMetric: f=0.446701, pre=0.653465, rec=0.339332\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.81 seconds!\n",
"\r",
"Evaluation on dev at Epoch 4/10. Step:172/430: \n",
"\r",
"SpanFPreRecMetric: f=0.479871, pre=0.642241, rec=0.383033\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.91 seconds!\n",
"\r",
"Evaluation on dev at Epoch 5/10. Step:215/430: \n",
"\r",
"SpanFPreRecMetric: f=0.486312, pre=0.650862, rec=0.388175\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.87 seconds!\n",
"\r",
"Evaluation on dev at Epoch 6/10. Step:258/430: \n",
"\r",
"SpanFPreRecMetric: f=0.541401, pre=0.711297, rec=0.437018\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.86 seconds!\n",
"\r",
"Evaluation on dev at Epoch 7/10. Step:301/430: \n",
"\r",
"SpanFPreRecMetric: f=0.430335, pre=0.685393, rec=0.313625\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.82 seconds!\n",
"\r",
"Evaluation on dev at Epoch 8/10. Step:344/430: \n",
"\r",
"SpanFPreRecMetric: f=0.477759, pre=0.665138, rec=0.372751\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.81 seconds!\n",
"\r",
"Evaluation on dev at Epoch 9/10. Step:387/430: \n",
"\r",
"SpanFPreRecMetric: f=0.500759, pre=0.611111, rec=0.424165\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.8 seconds!\n",
"\r",
"Evaluation on dev at Epoch 10/10. Step:430/430: \n",
"\r",
"SpanFPreRecMetric: f=0.496025, pre=0.65, rec=0.401028\n",
"\n",
"\r\n",
"In Epoch:6/Step:258, got best dev performance:\n",
"SpanFPreRecMetric: f=0.541401, pre=0.711297, rec=0.437018\n",
"Reloaded the best model.\n"
]
},
{
"data": {
"text/plain": [
"{'best_eval': {'SpanFPreRecMetric': {'f': 0.541401,\n",
" 'pre': 0.711297,\n",
" 'rec': 0.437018}},\n",
" 'best_epoch': 6,\n",
" 'best_step': 258,\n",
" 'seconds': 121.39}"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Trainer\n",
"import torch\n",
"\n",
"device= 0 if torch.cuda.is_available() else 'cpu'\n",
"trainer = Trainer(data_bundle.get_dataset('train'), model, loss=loss, optimizer=optimizer,\n",
" dev_data=data_bundle.get_dataset('dev'), metrics=metric, device=device)\n",
"trainer.train()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 进行测试\n",
"训练结束之后过,可以通过 Tester 测试其在测试集上的性能"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=17.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 1.54 seconds!\n",
"[tester] \n",
"SpanFPreRecMetric: f=0.439024, pre=0.685279, rec=0.322967\n"
]
},
{
"data": {
"text/plain": [
"{'SpanFPreRecMetric': {'f': 0.439024, 'pre': 0.685279, 'rec': 0.322967}}"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Tester\n",
"tester = Tester(data_bundle.get_dataset('test'), model, metrics=metric)\n",
"tester.test()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 使用更强的Bert做序列标注\n",
"\n",
"在fastNLP使用Bert进行任务,您只需要把fastNLP.embeddings.StaticEmbedding 切换为 fastNLP.embeddings.BertEmbedding(可修改 device 选择显卡)。"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-chinese-wwm/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-chinese-wwm/chinese_wwm_pytorch.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 3384 words out of 3471.\n",
"input fields after batch(if batch size is 2):\n",
"\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26]) \n",
"\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"\twords: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26]) \n",
"target fields after batch(if batch size is 2):\n",
"\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26]) \n",
"\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"\n",
"training epochs started 2020-02-27-13-58-51\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=1130.0), HTML(value='')), layout=Layout(d…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=23.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 2.7 seconds!\n",
"Evaluation on dev at Epoch 1/10. Step:113/1130: \n",
"SpanFPreRecMetric: f=0.008114, pre=0.019231, rec=0.005141\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=23.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 2.49 seconds!\n",
"Evaluation on dev at Epoch 2/10. Step:226/1130: \n",
"SpanFPreRecMetric: f=0.467866, pre=0.467866, rec=0.467866\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=23.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 2.6 seconds!\n",
"Evaluation on dev at Epoch 3/10. Step:339/1130: \n",
"SpanFPreRecMetric: f=0.566879, pre=0.482821, rec=0.686375\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=23.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 2.56 seconds!\n",
"Evaluation on dev at Epoch 4/10. Step:452/1130: \n",
"SpanFPreRecMetric: f=0.651972, pre=0.59408, rec=0.722365\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=23.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 2.69 seconds!\n",
"\r",
"Evaluation on dev at Epoch 5/10. Step:565/1130: \n",
"\r",
"SpanFPreRecMetric: f=0.640909, pre=0.574338, rec=0.724936\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=23.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 2.52 seconds!\n",
"Evaluation on dev at Epoch 6/10. Step:678/1130: \n",
"SpanFPreRecMetric: f=0.661836, pre=0.624146, rec=0.70437\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=23.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 2.67 seconds!\n",
"Evaluation on dev at Epoch 7/10. Step:791/1130: \n",
"SpanFPreRecMetric: f=0.683429, pre=0.615226, rec=0.768638\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=23.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 2.37 seconds!\n",
"\r",
"Evaluation on dev at Epoch 8/10. Step:904/1130: \n",
"\r",
"SpanFPreRecMetric: f=0.674699, pre=0.634921, rec=0.719794\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=23.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 2.42 seconds!\n",
"Evaluation on dev at Epoch 9/10. Step:1017/1130: \n",
"SpanFPreRecMetric: f=0.693878, pre=0.650901, rec=0.742931\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=23.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 2.46 seconds!\n",
"\r",
"Evaluation on dev at Epoch 10/10. Step:1130/1130: \n",
"\r",
"SpanFPreRecMetric: f=0.686845, pre=0.62766, rec=0.758355\n",
"\n",
"\r\n",
"In Epoch:9/Step:1017, got best dev performance:\n",
"SpanFPreRecMetric: f=0.693878, pre=0.650901, rec=0.742931\n",
"Reloaded the best model.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=17.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 1.96 seconds!\n",
"[tester] \n",
"SpanFPreRecMetric: f=0.626561, pre=0.596112, rec=0.660287\n"
]
},
{
"data": {
"text/plain": [
"{'SpanFPreRecMetric': {'f': 0.626561, 'pre': 0.596112, 'rec': 0.660287}}"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"from fastNLP.io import WeiboNERPipe\n",
"data_bundle = WeiboNERPipe().process_from_file()\n",
"data_bundle.rename_field('chars', 'words')\n",
"\n",
"from fastNLP.embeddings import BertEmbedding\n",
"embed = BertEmbedding(vocab=data_bundle.get_vocab('words'), model_dir_or_name='cn')\n",
"model = BiLSTMCRF(embed=embed, num_classes=len(data_bundle.get_vocab('target')), num_layers=1, hidden_size=200, dropout=0.5,\n",
" target_vocab=data_bundle.get_vocab('target'))\n",
"\n",
"from fastNLP import SpanFPreRecMetric\n",
"from torch.optim import Adam\n",
"from fastNLP import LossInForward\n",
"metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'))\n",
"optimizer = Adam(model.parameters(), lr=2e-5)\n",
"loss = LossInForward()\n",
"\n",
"from fastNLP import Trainer\n",
"import torch\n",
"device= 5 if torch.cuda.is_available() else 'cpu'\n",
"trainer = Trainer(data_bundle.get_dataset('train'), model, loss=loss, optimizer=optimizer, batch_size=12,\n",
" dev_data=data_bundle.get_dataset('dev'), metrics=metric, device=device)\n",
"trainer.train()\n",
"\n",
"from fastNLP import Tester\n",
"tester = Tester(data_bundle.get_dataset('test'), model, metrics=metric)\n",
"tester.test()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python Now",
"language": "python",
"name": "now"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

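The notebook above explains the BIO tagging scheme (O / B-ORG / I-ORG) but leaves the conversion from predicted tags back to entity spans implicit. Below is a minimal, fastNLP-independent sketch of that decoding step; the function name and structure are this sketch's own and not part of the original tutorial::

    def bio_to_spans(chars, tags):
        """Convert parallel lists of characters and BIO tags into (entity, label) pairs."""
        spans = []
        start, label = None, None
        for i, tag in enumerate(tags + ['O']):  # trailing 'O' acts as a sentinel to flush the last entity
            if start is not None and (tag == 'O' or tag.startswith('B-')):
                spans.append((''.join(chars[start:i]), label))
                start, label = None, None
            if tag.startswith('B-'):
                start, label = i, tag[2:]
            elif tag.startswith('I-') and start is None:
                # tolerate an I- tag that appears without a preceding B- tag
                start, label = i, tag[2:]
        return spans

    chars = list('我来自复旦大学')
    tags = ['O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG']
    print(bio_to_spans(chars, tags))  # [('复旦大学', 'ORG')]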
+ 0
- 564
docs/source/_static/notebooks/文本分类.ipynb View File

@@ -1,564 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 文本分类(Text classification)\n",
"文本分类任务是将一句话或一段话划分到某个具体的类别。比如垃圾邮件识别,文本情绪分类等。\n",
"\n",
"Example:: \n",
"1,商务大床房,房间很大,床有2M宽,整体感觉经济实惠不错!\n",
"\n",
"\n",
"其中开头的1是只这条评论的标签,表示是正面的情绪。我们将使用到的数据可以通过http://dbcloud.irocn.cn:8989/api/public/dl/dataset/chn_senti_corp.zip 下载并解压,当然也可以通过fastNLP自动下载该数据。\n",
"\n",
"数据中的内容如下图所示。接下来,我们将用fastNLP在这个数据上训练一个分类网络。"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![jupyter](./cn_cls_example.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 步骤\n",
"一共有以下的几个步骤 \n",
"(1) 读取数据 \n",
"(2) 预处理数据 \n",
"(3) 选择预训练词向量 \n",
"(4) 创建模型 \n",
"(5) 训练模型 "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### (1) 读取数据\n",
"fastNLP提供多种数据的自动下载与自动加载功能,对于这里我们要用到的数据,我们可以用\\ref{Loader}自动下载并加载该数据。更多有关Loader的使用可以参考\\ref{Loader}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP.io import ChnSentiCorpLoader\n",
"\n",
"loader = ChnSentiCorpLoader() # 初始化一个中文情感分类的loader\n",
"data_dir = loader.download() # 这一行代码将自动下载数据到默认的缓存地址, 并将该地址返回\n",
"data_bundle = loader.load(data_dir) # 这一行代码将从{data_dir}处读取数据至DataBundle"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"DataBundle的相关介绍,可以参考\\ref{}。我们可以打印该data_bundle的基本信息。"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(data_bundle)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"可以看出,该data_bundle中一个含有三个\\ref{DataSet}。通过下面的代码,我们可以查看DataSet的基本情况"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(data_bundle.get_dataset('train')[:2]) # 查看Train集前两个sample"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### (2) 预处理数据\n",
"在NLP任务中,预处理一般包括: (a)将一整句话切分成汉字或者词; (b)将文本转换为index \n",
"\n",
"fastNLP中也提供了多种数据集的处理类,这里我们直接使用fastNLP的ChnSentiCorpPipe。更多关于Pipe的说明可以参考\\ref{Pipe}。"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP.io import ChnSentiCorpPipe\n",
"\n",
"pipe = ChnSentiCorpPipe()\n",
"data_bundle = pipe.process(data_bundle) # 所有的Pipe都实现了process()方法,且输入输出都为DataBundle类型"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(data_bundle) # 打印data_bundle,查看其变化"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"可以看到除了之前已经包含的3个\\ref{DataSet}, 还新增了两个\\ref{Vocabulary}。我们可以打印DataSet中的内容"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(data_bundle.get_dataset('train')[:2])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"新增了一列为数字列表的chars,以及变为数字的target列。可以看出这两列的名称和刚好与data_bundle中两个Vocabulary的名称是一致的,我们可以打印一下Vocabulary看一下里面的内容。"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"char_vocab = data_bundle.get_vocab('chars')\n",
"print(char_vocab)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Vocabulary是一个记录着词语与index之间映射关系的类,比如"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"index = char_vocab.to_index('选')\n",
"print(\"'选'的index是{}\".format(index)) # 这个值与上面打印出来的第一个instance的chars的第一个index是一致的\n",
"print(\"index:{}对应的汉字是{}\".format(index, char_vocab.to_word(index))) "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### (3) 选择预训练词向量 \n",
"由于Word2vec, Glove, Elmo, Bert等预训练模型可以增强模型的性能,所以在训练具体任务前,选择合适的预训练词向量非常重要。在fastNLP中我们提供了多种Embedding使得加载这些预训练模型的过程变得更加便捷。更多关于Embedding的说明可以参考\\ref{Embedding}。这里我们先给出一个使用word2vec的中文汉字预训练的示例,之后再给出一个使用Bert的文本分类。这里使用的预训练词向量为'cn-fastnlp-100d',fastNLP将自动下载该embedding至本地缓存,fastNLP支持使用名字指定的Embedding以及相关说明可以参见\\ref{Embedding}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"\n",
"word2vec_embed = StaticEmbedding(char_vocab, model_dir_or_name='cn-char-fastnlp-100d')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### (4) 创建模型\n",
"这里我们使用到的模型结构如下所示,补图"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from torch import nn\n",
"from fastNLP.modules import LSTM\n",
"import torch\n",
"\n",
"# 定义模型\n",
"class BiLSTMMaxPoolCls(nn.Module):\n",
" def __init__(self, embed, num_classes, hidden_size=400, num_layers=1, dropout=0.3):\n",
" super().__init__()\n",
" self.embed = embed\n",
" \n",
" self.lstm = LSTM(self.embed.embedding_dim, hidden_size=hidden_size//2, num_layers=num_layers, \n",
" batch_first=True, bidirectional=True)\n",
" self.dropout_layer = nn.Dropout(dropout)\n",
" self.fc = nn.Linear(hidden_size, num_classes)\n",
" \n",
" def forward(self, chars, seq_len): # 这里的名称必须和DataSet中相应的field对应,比如之前我们DataSet中有chars,这里就必须为chars\n",
" # chars:[batch_size, max_len]\n",
" # seq_len: [batch_size, ]\n",
" chars = self.embed(chars)\n",
" outputs, _ = self.lstm(chars, seq_len)\n",
" outputs = self.dropout_layer(outputs)\n",
" outputs, _ = torch.max(outputs, dim=1)\n",
" outputs = self.fc(outputs)\n",
" \n",
" return {'pred':outputs} # [batch_size,], 返回值必须是dict类型,且预测值的key建议设为pred\n",
"\n",
"# 初始化模型\n",
"model = BiLSTMMaxPoolCls(word2vec_embed, len(data_bundle.get_vocab('target')))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### (5) 训练模型\n",
"fastNLP提供了Trainer对象来组织训练过程,包括完成loss计算(所以在初始化Trainer的时候需要指定loss类型),梯度更新(所以在初始化Trainer的时候需要提供优化器optimizer)以及在验证集上的性能验证(所以在初始化时需要提供一个Metric)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP import Trainer\n",
"from fastNLP import CrossEntropyLoss\n",
"from torch.optim import Adam\n",
"from fastNLP import AccuracyMetric\n",
"\n",
"loss = CrossEntropyLoss()\n",
"optimizer = Adam(model.parameters(), lr=0.001)\n",
"metric = AccuracyMetric()\n",
"device = 0 if torch.cuda.is_available() else 'cpu' # 如果有gpu的话在gpu上运行,训练速度会更快\n",
"\n",
"trainer = Trainer(train_data=data_bundle.get_dataset('train'), model=model, loss=loss, \n",
" optimizer=optimizer, batch_size=32, dev_data=data_bundle.get_dataset('dev'),\n",
" metrics=metric, device=device)\n",
"trainer.train() # 开始训练,训练完成之后默认会加载在dev上表现最好的模型\n",
"\n",
"# 在测试集上测试一下模型的性能\n",
"from fastNLP import Tester\n",
"print(\"Performance on test is:\")\n",
"tester = Tester(data=data_bundle.get_dataset('test'), model=model, metrics=metric, batch_size=64, device=device)\n",
"tester.test()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 使用Bert进行文本分类"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 只需要切换一下Embedding即可\n",
"from fastNLP.embeddings import BertEmbedding\n",
"\n",
"# 这里为了演示一下效果,所以默认Bert不更新权重\n",
"bert_embed = BertEmbedding(char_vocab, model_dir_or_name='cn', auto_truncate=True, requires_grad=False)\n",
"model = BiLSTMMaxPoolCls(bert_embed, len(data_bundle.get_vocab('target')), )\n",
"\n",
"\n",
"import torch\n",
"from fastNLP import Trainer\n",
"from fastNLP import CrossEntropyLoss\n",
"from torch.optim import Adam\n",
"from fastNLP import AccuracyMetric\n",
"\n",
"loss = CrossEntropyLoss()\n",
"optimizer = Adam(model.parameters(), lr=2e-5)\n",
"metric = AccuracyMetric()\n",
"device = 0 if torch.cuda.is_available() else 'cpu' # 如果有gpu的话在gpu上运行,训练速度会更快\n",
"\n",
"trainer = Trainer(train_data=data_bundle.get_dataset('train'), model=model, loss=loss, \n",
" optimizer=optimizer, batch_size=16, dev_data=data_bundle.get_dataset('test'),\n",
" metrics=metric, device=device, n_epochs=3)\n",
"trainer.train() # 开始训练,训练完成之后默认会加载在dev上表现最好的模型\n",
"\n",
"# 在测试集上测试一下模型的性能\n",
"from fastNLP import Tester\n",
"print(\"Performance on test is:\")\n",
"tester = Tester(data=data_bundle.get_dataset('test'), model=model, metrics=metric, batch_size=64, device=device)\n",
"tester.test()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 基于词进行文本分类"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"由于汉字中没有显示的字与字的边界,一般需要通过分词器先将句子进行分词操作。\n",
"下面的例子演示了如何不基于fastNLP已有的数据读取、预处理代码进行文本分类。"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### (1) 读取数据"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"这里我们继续以之前的数据为例,但这次我们不使用fastNLP自带的数据读取代码 "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP.io import ChnSentiCorpLoader\n",
"\n",
"loader = ChnSentiCorpLoader() # 初始化一个中文情感分类的loader\n",
"data_dir = loader.download() # 这一行代码将自动下载数据到默认的缓存地址, 并将该地址返回"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"下面我们先定义一个read_file_to_dataset的函数, 即给定一个文件路径,读取其中的内容,并返回一个DataSet。然后我们将所有的DataSet放入到DataBundle对象中来方便接下来的预处理"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from fastNLP import DataSet, Instance\n",
"from fastNLP.io import DataBundle\n",
"\n",
"\n",
"def read_file_to_dataset(fp):\n",
" ds = DataSet()\n",
" with open(fp, 'r') as f:\n",
" f.readline() # 第一行是title名称,忽略掉\n",
" for line in f:\n",
" line = line.strip()\n",
" target, chars = line.split('\\t')\n",
" ins = Instance(target=target, raw_chars=chars)\n",
" ds.append(ins)\n",
" return ds\n",
"\n",
"data_bundle = DataBundle()\n",
"for name in ['train.tsv', 'dev.tsv', 'test.tsv']:\n",
" fp = os.path.join(data_dir, name)\n",
" ds = read_file_to_dataset(fp)\n",
" data_bundle.set_dataset(name=name.split('.')[0], dataset=ds)\n",
"\n",
"print(data_bundle) # 查看以下数据集的情况\n",
"# In total 3 datasets:\n",
"# train has 9600 instances.\n",
"# dev has 1200 instances.\n",
"# test has 1200 instances."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### (2) 数据预处理"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"在这里,我们首先把句子通过 [fastHan](http://gitee.com/fastnlp/fastHan) 进行分词操作,然后创建词表,并将词语转换为序号。"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from fastHan import FastHan\n",
"from fastNLP import Vocabulary\n",
"\n",
"model=FastHan()\n",
"# model.set_device('cuda')\n",
"\n",
"# 定义分词处理操作\n",
"def word_seg(ins):\n",
" raw_chars = ins['raw_chars']\n",
" # 由于有些句子比较长,我们只截取前128个汉字\n",
" raw_words = model(raw_chars[:128], target='CWS')[0]\n",
" return raw_words\n",
"\n",
"for name, ds in data_bundle.iter_datasets():\n",
" # apply函数将对内部的instance依次执行word_seg操作,并把其返回值放入到raw_words这个field\n",
" ds.apply(word_seg, new_field_name='raw_words')\n",
" # 除了apply函数,fastNLP还支持apply_field, apply_more(可同时创建多个field)等操作\n",
" # 同时我们增加一个seq_len的field\n",
" ds.add_seq_len('raw_words')\n",
"\n",
"vocab = Vocabulary()\n",
"\n",
"# 对raw_words列创建词表, 建议把非训练集的dataset放在no_create_entry_dataset参数中\n",
"# 也可以通过add_word(), add_word_lst()等建立词表,请参考http://www.fastnlp.top/docs/fastNLP/tutorials/tutorial_2_vocabulary.html\n",
"vocab.from_dataset(data_bundle.get_dataset('train'), field_name='raw_words', \n",
" no_create_entry_dataset=[data_bundle.get_dataset('dev'), \n",
" data_bundle.get_dataset('test')]) \n",
"\n",
"# 将建立好词表的Vocabulary用于对raw_words列建立词表,并把转为序号的列存入到words列\n",
"vocab.index_dataset(data_bundle.get_dataset('train'), data_bundle.get_dataset('dev'), \n",
" data_bundle.get_dataset('test'), field_name='raw_words', new_field_name='words')\n",
"\n",
"# 建立target的词表,target的词表一般不需要padding和unknown\n",
"target_vocab = Vocabulary(padding=None, unknown=None) \n",
"# 一般情况下我们可以只用训练集建立target的词表\n",
"target_vocab.from_dataset(data_bundle.get_dataset('train'), field_name='target') \n",
"# 如果没有传递new_field_name, 则默认覆盖原词表\n",
"target_vocab.index_dataset(data_bundle.get_dataset('train'), data_bundle.get_dataset('dev'), \n",
" data_bundle.get_dataset('test'), field_name='target')\n",
"\n",
"# 我们可以把词表保存到data_bundle中,方便之后使用\n",
"data_bundle.set_vocab(field_name='words', vocab=vocab)\n",
"data_bundle.set_vocab(field_name='target', vocab=target_vocab)\n",
"\n",
"# 我们把words和target分别设置为input和target,这样它们才会在训练循环中被取出并自动padding, 有关这部分更多的内容参考\n",
"# http://www.fastnlp.top/docs/fastNLP/tutorials/tutorial_6_datasetiter.html\n",
"data_bundle.set_target('target')\n",
"data_bundle.set_input('words', 'seq_len') # DataSet也有这两个接口\n",
"# 如果某些field,您希望它被设置为target或者input,但是不希望fastNLP自动padding或需要使用特定的padding方式,请参考\n",
"# http://www.fastnlp.top/docs/fastNLP/fastNLP.core.dataset.html\n",
"\n",
"print(data_bundle.get_dataset('train')[:2]) # 我们可以看一下当前dataset的内容\n",
"\n",
"# 由于之后需要使用之前定义的BiLSTMMaxPoolCls模型,所以需要将words这个field修改为chars(因为该模型的forward接受chars参数)\n",
"data_bundle.rename_field('words', 'chars')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### (3) 选择预训练词向量"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"这里我们选择腾讯的预训练中文词向量,可以在 [腾讯词向量](https://ai.tencent.com/ailab/nlp/en/embedding.html) 处下载并解压。这里我们不能直接使用BERT,因为BERT是基于中文字进行预训练的。"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"\n",
"word2vec_embed = StaticEmbedding(data_bundle.get_vocab('words'), \n",
" model_dir_or_name='/path/to/Tencent_AILab_ChineseEmbedding.txt')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP import Trainer\n",
"from fastNLP import CrossEntropyLoss\n",
"from torch.optim import Adam\n",
"from fastNLP import AccuracyMetric\n",
"\n",
"# 初始化模型\n",
"model = BiLSTMMaxPoolCls(word2vec_embed, len(data_bundle.get_vocab('target')))\n",
"\n",
"# 开始训练\n",
"loss = CrossEntropyLoss()\n",
"optimizer = Adam(model.parameters(), lr=0.001)\n",
"metric = AccuracyMetric()\n",
"device = 0 if torch.cuda.is_available() else 'cpu' # 如果有gpu的话在gpu上运行,训练速度会更快\n",
"\n",
"trainer = Trainer(train_data=data_bundle.get_dataset('train'), model=model, loss=loss, \n",
" optimizer=optimizer, batch_size=32, dev_data=data_bundle.get_dataset('dev'),\n",
" metrics=metric, device=device)\n",
"trainer.train() # 开始训练,训练完成之后默认会加载在dev上表现最好的模型\n",
"\n",
"# 在测试集上测试一下模型的性能\n",
"from fastNLP import Tester\n",
"print(\"Performance on test is:\")\n",
"tester = Tester(data=data_bundle.get_dataset('test'), model=model, metrics=metric, batch_size=64, device=device)\n",
"tester.test()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

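The notebook above describes preprocessing as (a) splitting text into characters or words and (b) converting text into indices, and delegates both steps to ChnSentiCorpPipe and Vocabulary. The sketch below shows, in plain Python, roughly what the character-to-index step amounts to; it is illustrative only, and the helper names are this sketch's own rather than fastNLP API::

    from collections import Counter

    def build_char_vocab(texts, min_freq=1, specials=('<pad>', '<unk>')):
        """Map characters to integer ids, reserving the first ids for padding and unknown."""
        counter = Counter(ch for text in texts for ch in text)
        vocab = {tok: i for i, tok in enumerate(specials)}
        for ch, freq in counter.most_common():
            if freq >= min_freq and ch not in vocab:
                vocab[ch] = len(vocab)
        return vocab

    def encode(text, vocab):
        """Replace each character with its id, falling back to the unknown id."""
        unk = vocab['<unk>']
        return [vocab.get(ch, unk) for ch in text]

    texts = ['房间很大', '很大很实惠']
    vocab = build_char_vocab(texts)
    print(encode('房间实惠', vocab))  # ids depend on character frequency, e.g. [4, 5, 6, 7]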
+ 0
- 181
docs/source/conf.py View File

@@ -1,181 +0,0 @@
# -*- coding: utf-8 -*-
#
# Configuration file for the Sphinx documentation builder.
#
# This file does only contain a selection of the most common options. For a
# full list see the documentation:
# http://www.sphinx-doc.org/en/master/config

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys

sys.path.insert(0, os.path.abspath('../../'))

# -- Project information -----------------------------------------------------

project = 'fastNLP'
copyright = '2020, xpqiu'
author = 'xpqiu'

# The short X.Y version
version = '0.6.0'
# The full version, including alpha/beta/rc tags
release = '0.6.0'

# -- General configuration ---------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.viewcode',
'sphinx.ext.autosummary',
'sphinx.ext.mathjax',
'sphinx.ext.todo'
]

autodoc_default_options = {
'member-order': 'bysource',
'special-members': '__init__',
'undoc-members': False,
}

autoclass_content = "class"

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# template_bridge
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'

# The master toctree document.
master_doc = 'index'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = "zh_CN"

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path .
exclude_patterns = ['modules.rst']

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'

# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'

# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
html_theme_options = {
'collapse_navigation': False,
'titles_only': True
}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']

# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# The default sidebars (for documents that don't match any pattern) are
# defined by theme itself. Builtin themes are using these templates by
# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
# 'searchbox.html']``.
#
# html_sidebars = {}


# -- Options for HTMLHelp output ---------------------------------------------

# Output file base name for HTML help builder.
htmlhelp_basename = 'fastNLP doc'

# -- Options for LaTeX output ------------------------------------------------

latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#
# 'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#
# 'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#
# 'preamble': '',
# Latex figure (float) alignment
#
# 'figure_align': 'htbp',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
(master_doc, 'fastNLP.tex', 'fastNLP Documentation',
'xpqiu', 'manual'),
]

# -- Options for manual page output ------------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
(master_doc, 'fastnlp', 'fastNLP Documentation',
[author], 1)
]

# -- Options for Texinfo output ----------------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
(master_doc, 'fastNLP', 'fastNLP Documentation',
author, 'fastNLP', 'One line description of project.',
'Miscellaneous'),
]


# -- Extension configuration -------------------------------------------------
def maybe_skip_member(app, what, name, obj, skip, options):
if obj.__doc__ is None:
return True
if name == "__init__":
return False
if name.startswith("_"):
return True
return False


def setup(app):
app.connect('autodoc-skip-member', maybe_skip_member)

+ 0
- 7
docs/source/fastNLP.core.batch.rst View File

@@ -1,7 +0,0 @@
fastNLP.core.batch
==================

.. automodule:: fastNLP.core.batch
:members: BatchIter, DataSetIter, TorchLoaderIter
:inherited-members:


+ 0
- 7
docs/source/fastNLP.core.callback.rst View File

@@ -1,7 +0,0 @@
fastNLP.core.callback
=====================

.. automodule:: fastNLP.core.callback
:members: Callback, GradientClipCallback, EarlyStopCallback, FitlogCallback, EvaluateCallback, LRScheduler, ControlC, LRFinder, TensorboardCallback, WarmupCallback, SaveModelCallback, CallbackException, EarlyStopError
:inherited-members:


+ 0
- 7
docs/source/fastNLP.core.const.rst View File

@@ -1,7 +0,0 @@
fastNLP.core.const
==================

.. automodule:: fastNLP.core.const
:members: Const
:inherited-members:


+ 0
- 7
docs/source/fastNLP.core.dataset.rst View File

@@ -1,7 +0,0 @@
fastNLP.core.dataset
====================

.. automodule:: fastNLP.core.dataset
:members: DataSet
:inherited-members:


+ 0
- 7
docs/source/fastNLP.core.field.rst View File

@@ -1,7 +0,0 @@
fastNLP.core.field
==================

.. automodule:: fastNLP.core.field
:members: Padder, AutoPadder, EngChar2DPadder
:inherited-members:


+ 0
- 7
docs/source/fastNLP.core.instance.rst View File

@@ -1,7 +0,0 @@
fastNLP.core.instance
=====================

.. automodule:: fastNLP.core.instance
:members: Instance
:inherited-members:


+ 0
- 7
docs/source/fastNLP.core.losses.rst View File

@@ -1,7 +0,0 @@
fastNLP.core.losses
===================

.. automodule:: fastNLP.core.losses
:members: LossBase, LossFunc, LossInForward, CrossEntropyLoss, BCELoss, L1Loss, NLLLoss
:inherited-members:


+ 0
- 7
docs/source/fastNLP.core.metrics.rst View File

@@ -1,7 +0,0 @@
fastNLP.core.metrics
====================

.. automodule:: fastNLP.core.metrics
:members: MetricBase, AccuracyMetric, SpanFPreRecMetric, CMRC2018Metric, ClassifyFPreRecMetric
:inherited-members:


+ 0
- 7
docs/source/fastNLP.core.optimizer.rst View File

@@ -1,7 +0,0 @@
fastNLP.core.optimizer
======================

.. automodule:: fastNLP.core.optimizer
:members: Optimizer, SGD, Adam, AdamW
:inherited-members:


+ 0
- 25
docs/source/fastNLP.core.rst View File

@@ -1,25 +0,0 @@
fastNLP.core
============

.. automodule:: fastNLP.core

Submodules
----------

.. toctree::
:maxdepth: 1

fastNLP.core.batch
fastNLP.core.callback
fastNLP.core.const
fastNLP.core.dataset
fastNLP.core.field
fastNLP.core.instance
fastNLP.core.losses
fastNLP.core.metrics
fastNLP.core.optimizer
fastNLP.core.sampler
fastNLP.core.tester
fastNLP.core.trainer
fastNLP.core.utils
fastNLP.core.vocabulary

+ 0
- 7
docs/source/fastNLP.core.sampler.rst View File

@@ -1,7 +0,0 @@
fastNLP.core.sampler
====================

.. automodule:: fastNLP.core.sampler
:members: Sampler, BucketSampler, SequentialSampler, RandomSampler
:inherited-members:


+ 0
- 7
docs/source/fastNLP.core.tester.rst View File

@@ -1,7 +0,0 @@
fastNLP.core.tester
===================

.. automodule:: fastNLP.core.tester
:members: Tester
:inherited-members:


+ 0
- 7
docs/source/fastNLP.core.trainer.rst View File

@@ -1,7 +0,0 @@
fastNLP.core.trainer
====================

.. automodule:: fastNLP.core.trainer
:members: Trainer
:inherited-members:


+ 0
- 7
docs/source/fastNLP.core.utils.rst View File

@@ -1,7 +0,0 @@
fastNLP.core.utils
==================

.. automodule:: fastNLP.core.utils
:members: cache_results, seq_len_to_mask, get_seq_len
:inherited-members:


+ 0
- 7
docs/source/fastNLP.core.vocabulary.rst View File

@@ -1,7 +0,0 @@
fastNLP.core.vocabulary
=======================

.. automodule:: fastNLP.core.vocabulary
:members: Vocabulary, VocabularyOption
:inherited-members:


+ 0
- 6
docs/source/fastNLP.embeddings.bert_embedding.rst View File

@@ -1,6 +0,0 @@
fastNLP.embeddings.bert_embedding
=================================

.. automodule:: fastNLP.embeddings.bert_embedding
:members: BertEmbedding, BertWordPieceEncoder


+ 0
- 6
docs/source/fastNLP.embeddings.char_embedding.rst View File

@@ -1,6 +0,0 @@
fastNLP.embeddings.char_embedding
=================================

.. automodule:: fastNLP.embeddings.char_embedding
:members: CNNCharEmbedding, LSTMCharEmbedding


+ 0
- 6
docs/source/fastNLP.embeddings.contextual_embedding.rst View File

@@ -1,6 +0,0 @@
fastNLP.embeddings.contextual_embedding
=======================================

.. automodule:: fastNLP.embeddings.contextual_embedding
:members: ContextualEmbedding


+ 0
- 6
docs/source/fastNLP.embeddings.elmo_embedding.rst View File

@@ -1,6 +0,0 @@
fastNLP.embeddings.elmo_embedding
=================================

.. automodule:: fastNLP.embeddings.elmo_embedding
:members: ElmoEmbedding


+ 0
- 6
docs/source/fastNLP.embeddings.embedding.rst View File

@@ -1,6 +0,0 @@
fastNLP.embeddings.embedding
============================

.. automodule:: fastNLP.embeddings.embedding
:members: Embedding, TokenEmbedding


+ 0
- 20
docs/source/fastNLP.embeddings.rst View File

@@ -1,20 +0,0 @@
fastNLP.embeddings
==================

.. automodule:: fastNLP.embeddings
:members: Embedding, TokenEmbedding, StaticEmbedding, ElmoEmbedding, BertEmbedding, BertWordPieceEncoder, StackEmbedding, LSTMCharEmbedding, CNNCharEmbedding, get_embeddings

Submodules
----------

.. toctree::
:maxdepth: 1

fastNLP.embeddings.bert_embedding
fastNLP.embeddings.char_embedding
fastNLP.embeddings.contextual_embedding
fastNLP.embeddings.elmo_embedding
fastNLP.embeddings.embedding
fastNLP.embeddings.stack_embedding
fastNLP.embeddings.static_embedding
fastNLP.embeddings.utils

+ 0
- 6
docs/source/fastNLP.embeddings.stack_embedding.rst View File

@@ -1,6 +0,0 @@
fastNLP.embeddings.stack_embedding
==================================

.. automodule:: fastNLP.embeddings.stack_embedding
:members: StackEmbedding


+ 0
- 6
docs/source/fastNLP.embeddings.static_embedding.rst View File

@@ -1,6 +0,0 @@
fastNLP.embeddings.static_embedding
===================================

.. automodule:: fastNLP.embeddings.static_embedding
:members: StaticEmbedding


+ 0
- 6
docs/source/fastNLP.embeddings.utils.rst View File

@@ -1,6 +0,0 @@
fastNLP.embeddings.utils
========================

.. automodule:: fastNLP.embeddings.utils
:members: get_embeddings


+ 0
- 7
docs/source/fastNLP.io.data_bundle.rst View File

@@ -1,7 +0,0 @@
fastNLP.io.data_bundle
======================

.. automodule:: fastNLP.io.data_bundle
:members: DataBundle
:inherited-members:


+ 0
- 7
docs/source/fastNLP.io.embed_loader.rst View File

@@ -1,7 +0,0 @@
fastNLP.io.embed_loader
=======================

.. automodule:: fastNLP.io.embed_loader
:members: EmbedLoader, EmbeddingOption
:inherited-members:


+ 0
- 7
docs/source/fastNLP.io.file_utils.rst View File

@@ -1,7 +0,0 @@
fastNLP.io.file_utils
=====================

.. automodule:: fastNLP.io.file_utils
:members: cached_path, get_filepath, get_cache_path, split_filename_suffix, get_from_cache
:inherited-members:


+ 0
- 7
docs/source/fastNLP.io.loader.rst View File

@@ -1,7 +0,0 @@
fastNLP.io.loader
=================

.. automodule:: fastNLP.io.loader
:members: Loader, YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, ChnSentiCorpLoader, THUCNewsLoader, WeiboSenti100kLoader, ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader, MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader, CSVLoader, JsonLoader, CWSLoader, MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader, CNXNLILoader, BQCorpusLoader, LCQMCLoader, CoReferenceLoader
:inherited-members:


+ 0
- 7
docs/source/fastNLP.io.model_io.rst View File

@@ -1,7 +0,0 @@
fastNLP.io.model_io
===================

.. automodule:: fastNLP.io.model_io
:members: ModelLoader, ModelSaver
:inherited-members:


+ 0
- 7
docs/source/fastNLP.io.pipe.rst View File

@@ -1,7 +0,0 @@
fastNLP.io.pipe
===============

.. automodule:: fastNLP.io.pipe
:members: Pipe, CWSPipe, YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, THUCNewsPipe, WeiboSenti100kPipe, Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, WeiboNERPipe, PeopleDailyPipe, Conll2003Pipe, MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, CNXNLIBertPipe, BQCorpusBertPipe, LCQMCBertPipe, MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe, LCQMCPipe, CNXNLIPipe, BQCorpusPipe, RenamePipe, GranularizePipe, MachingTruncatePipe, CoReferencePipe
:inherited-members:


+ 0
- 20
docs/source/fastNLP.io.rst View File

@@ -1,20 +0,0 @@
fastNLP.io
==========

.. automodule:: fastNLP.io
:members: DataBundle, EmbedLoader, Loader, YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, ChnSentiCorpLoader, THUCNewsLoader, WeiboSenti100kLoader, ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader, MsraNERLoader, WeiboNERLoader, PeopleDailyNERLoader, CSVLoader, JsonLoader, CWSLoader, MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader, CNXNLILoader, BQCorpusLoader, LCQMCLoader, Pipe, YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, THUCNewsPipe, WeiboSenti100kPipe, Conll2003Pipe, Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, PeopleDailyPipe, WeiboNERPipe, CWSPipe, MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe, ModelLoader, ModelSaver
:inherited-members:

Submodules
----------

.. toctree::
:maxdepth: 1

fastNLP.io.data_bundle
fastNLP.io.embed_loader
fastNLP.io.file_utils
fastNLP.io.loader
fastNLP.io.model_io
fastNLP.io.pipe
fastNLP.io.utils

+ 0
- 7
docs/source/fastNLP.io.utils.rst View File

@@ -1,7 +0,0 @@
fastNLP.io.utils
================

.. automodule:: fastNLP.io.utils
:members: check_loader_paths
:inherited-members:


+ 0
- 6
docs/source/fastNLP.models.bert.rst View File

@@ -1,6 +0,0 @@
fastNLP.models.bert
===================

.. automodule:: fastNLP.models.bert
:members: BertForSequenceClassification, BertForSentenceMatching, BertForMultipleChoice, BertForTokenClassification, BertForQuestionAnswering


+ 0
- 6
docs/source/fastNLP.models.biaffine_parser.rst View File

@@ -1,6 +0,0 @@
fastNLP.models.biaffine_parser
==============================

.. automodule:: fastNLP.models.biaffine_parser
:members: BiaffineParser, GraphParser


+ 0
- 6
docs/source/fastNLP.models.cnn_text_classification.rst View File

@@ -1,6 +0,0 @@
fastNLP.models.cnn_text_classification
======================================

.. automodule:: fastNLP.models.cnn_text_classification
:members: CNNText


+ 0
- 18
docs/source/fastNLP.models.rst View File

@@ -1,18 +0,0 @@
fastNLP.models
==============

.. automodule:: fastNLP.models
:members: CNNText, SeqLabeling, AdvSeqLabel, ESIM, StarTransEnc, STSeqLabel, STNLICls, STSeqCls, BiaffineParser, GraphParser, BertForSequenceClassification, BertForSentenceMatching, BertForMultipleChoice, BertForTokenClassification, BertForQuestionAnswering

Submodules
----------

.. toctree::
:maxdepth: 1

fastNLP.models.bert
fastNLP.models.biaffine_parser
fastNLP.models.cnn_text_classification
fastNLP.models.sequence_labeling
fastNLP.models.snli
fastNLP.models.star_transformer

+ 0
- 6
docs/source/fastNLP.models.sequence_labeling.rst View File

@@ -1,6 +0,0 @@
fastNLP.models.sequence_labeling
================================

.. automodule:: fastNLP.models.sequence_labeling
:members: SeqLabeling, AdvSeqLabel, BiLSTMCRF


+ 0
- 6
docs/source/fastNLP.models.snli.rst View File

@@ -1,6 +0,0 @@
fastNLP.models.snli
===================

.. automodule:: fastNLP.models.snli
:members: ESIM


+ 0
- 6
docs/source/fastNLP.models.star_transformer.rst View File

@@ -1,6 +0,0 @@
fastNLP.models.star_transformer
===============================

.. automodule:: fastNLP.models.star_transformer
:members: StarTransEnc, STNLICls, STSeqCls, STSeqLabel


+ 0
- 6
docs/source/fastNLP.modules.decoder.rst View File

@@ -1,6 +0,0 @@
fastNLP.modules.decoder
=======================

.. automodule:: fastNLP.modules.decoder
:members: MLP, ConditionalRandomField, viterbi_decode, allowed_transitions


+ 0
- 6
docs/source/fastNLP.modules.encoder.rst View File

@@ -1,6 +0,0 @@
fastNLP.modules.encoder
=======================

.. automodule:: fastNLP.modules.encoder
:members: ConvolutionCharEncoder, LSTMCharEncoder, ConvMaxpool, LSTM, StarTransformer, TransformerEncoder, VarRNN, VarLSTM, VarGRU, MaxPool, MaxPoolWithMask, KMaxPool, AvgPool, AvgPoolWithMask, MultiHeadAttention, BiAttention, SelfAttention


+ 0
- 15
docs/source/fastNLP.modules.rst View File

@@ -1,15 +0,0 @@
fastNLP.modules
===============

.. automodule:: fastNLP.modules
:members: ConvolutionCharEncoder, LSTMCharEncoder, ConvMaxpool, LSTM, StarTransformer, TransformerEncoder, VarRNN, VarLSTM, VarGRU, MaxPool, MaxPoolWithMask, KMaxPool, AvgPool, AvgPoolWithMask, MultiHeadAttention, MLP, ConditionalRandomField, viterbi_decode, allowed_transitions, TimestepDropout

Submodules
----------

.. toctree::
:maxdepth: 1

fastNLP.modules.decoder
fastNLP.modules.encoder
fastNLP.modules.utils

+ 0
- 6
docs/source/fastNLP.modules.utils.rst View File

@@ -1,6 +0,0 @@
fastNLP.modules.utils
=====================

.. automodule:: fastNLP.modules.utils
:members: initial_parameter, summary


+ 0
- 18
docs/source/fastNLP.rst View File

@@ -1,18 +0,0 @@
fastNLP
=======

.. automodule:: fastNLP
:members: Instance, FieldArray, DataSetIter, BatchIter, TorchLoaderIter, Vocabulary, DataSet, Const, Trainer, Tester, Callback, GradientClipCallback, EarlyStopCallback, FitlogCallback, EvaluateCallback, LRScheduler, ControlC, LRFinder, TensorboardCallback, WarmupCallback, SaveModelCallback, CallbackException, EarlyStopError, Padder, AutoPadder, EngChar2DPadder, AccuracyMetric, SpanFPreRecMetric, ExtractiveQAMetric, Optimizer, SGD, Adam, AdamW, Sampler, SequentialSampler, BucketSampler, RandomSampler, LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, LossInForward, cache_results, logger
:inherited-members:

Submodules
----------

.. toctree::
:maxdepth: 1

fastNLP.core
fastNLP.embeddings
fastNLP.io
fastNLP.models
fastNLP.modules

BIN
docs/source/figures/fitlogChart.png View File

Width: 2556  |  Height: 1450  |  Size: 272 kB

BIN
docs/source/figures/fitlogTable.png View File

Width: 2552  |  Height: 858  |  Size: 168 kB

BIN
docs/source/figures/procedures.PNG View File

Width: 1150  |  Height: 161  |  Size: 14 kB

BIN
docs/source/figures/sequence_labeling.PNG View File

Width: 762  |  Height: 541  |  Size: 30 kB

BIN
docs/source/figures/text_classification.png View File

Width: 3200  |  Height: 1438  |  Size: 322 kB

BIN
docs/source/figures/workflow.png View File

Width: 1014  |  Height: 709  |  Size: 89 kB

+ 0
- 56
docs/source/index.rst View File

@@ -1,56 +0,0 @@
fastNLP Documentation
=====================

`fastNLP <https://github.com/fastnlp/fastNLP/>`_ is a lightweight natural language processing (NLP) toolkit. You can use it to finish an NLP task quickly,
or to build more complex models for research.

.. hint::

If you are reading this documentation on readthedocs, please switch to our `latest site <http://www.fastnlp.top/docs/fastNLP/>`_

fastNLP has the following features:

- A unified tabular data container that simplifies data preprocessing;
- Built-in :class:`~fastNLP.io.Loader` and :class:`~fastNLP.io.Pipe` for many datasets, removing the need for preprocessing code;
- A variety of convenient NLP utilities, such as Embedding loading (including :class:`~fastNLP.embeddings.ElmoEmbedding` and :class:`~fastNLP.embeddings.BertEmbedding` ) and caching of intermediate data;
- Automatic download of some `datasets and pre-trained models <https://docs.qq.com/sheet/DVnpkTnF6VW9UeXdh?c=A1A0A0>`_ ;
- Many neural network components and reproduced models (covering Chinese word segmentation, named entity recognition, dependency parsing, text classification, text matching, coreference resolution, summarization, and more);
- :class:`~fastNLP.Trainer` comes with many built-in :mod:`~fastNLP.core.callback` functions for experiment logging, exception handling, and more.


User Guide
----------------

.. toctree::
:maxdepth: 2

Installation Guide </user/installation>
Quick Start </user/quickstart>
Detailed Tutorials </user/tutorials>

API Reference
-------------

Besides the user guide, you can also consult the API reference to find the tools you need.

.. toctree::
:titlesonly:
:maxdepth: 2
fastNLP


:doc:`List of API changes </user/api_update>`

fitlog Documentation
--------------------

You can view the fitlog documentation `here <http://www.fastnlp.top/docs/fitlog/>`_.
fitlog is a logging and code-management tool developed by our team.

索引与搜索
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`

+ 0
- 7
docs/source/modules.rst View File

@@ -1,7 +0,0 @@
fastNLP
=======

.. toctree::
:maxdepth: 4

fastNLP

BIN
docs/source/tutorials/cn_cls_example.png View File

Width: 722  |  Height: 317  |  Size: 162 kB

+ 0
- 231
docs/source/tutorials/extend_1_bert_embedding.rst View File

@@ -1,231 +0,0 @@
==============================
BertEmbedding的各种用法
==============================

Bert自从在 `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`_
中被提出后,因其性能卓越受到了极大的关注,在这里我们展示一下在fastNLP中如何使用Bert进行各类任务。其中中文Bert我们使用的模型的权重来自于
`中文Bert预训练 <https://github.com/ymcui/Chinese-BERT-wwm>`_ 。

为了方便大家的使用,fastNLP提供了预训练的Embedding权重及数据集的自动下载,支持自动下载的Embedding和数据集见
`数据集 <https://docs.qq.com/sheet/DVnpkTnF6VW9UeXdh?tab=fed5xh&c=D42A0AC0>`_ 。或您可从 :doc:`/tutorials/tutorial_3_embedding` 与
:doc:`/tutorials/tutorial_4_load_dataset` 了解更多相关信息。

----------------------------------
中文任务
----------------------------------
下面我们将介绍如何使用Bert进行文本分类、中文命名实体识别、文本匹配和中文问答。

.. note::

本教程必须使用 GPU 进行实验,并且会花费大量的时间

1. 使用Bert进行文本分类
----------------------------------
文本分类是指给定一段文字,判定其所属的类别。例如下面的文本情感分类

.. code-block:: text

1, 商务大床房,房间很大,床有2M宽,整体感觉经济实惠不错!

这里我们使用fastNLP提供的可自动下载的微博情感分类数据集进行测试

.. code-block:: python

from fastNLP.io import WeiboSenti100kPipe
from fastNLP.embeddings import BertEmbedding
from fastNLP.models import BertForSequenceClassification
from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam
import torch

data_bundle = WeiboSenti100kPipe().process_from_file()
data_bundle.rename_field('chars', 'words')

# 载入BertEmbedding
embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='cn-wwm', include_cls_sep=True)

# 载入模型
model = BertForSequenceClassification(embed, len(data_bundle.get_vocab('target')))

# 训练模型
device = 0 if torch.cuda.is_available() else 'cpu'
trainer = Trainer(data_bundle.get_dataset('train'), model,
optimizer=Adam(model_params=model.parameters(), lr=2e-5),
loss=CrossEntropyLoss(), device=device,
batch_size=8, dev_data=data_bundle.get_dataset('dev'),
metrics=AccuracyMetric(), n_epochs=2, print_every=1)
trainer.train()

# 测试结果
from fastNLP import Tester

tester = Tester(data_bundle.get_dataset('test'), model, batch_size=128, metrics=AccuracyMetric())
tester.test()

输出结果::

In Epoch:1/Step:12499, got best dev performance:
AccuracyMetric: acc=0.9838
Reloaded the best model.
Evaluate data in 63.84 seconds!
[tester]
AccuracyMetric: acc=0.9815


2. 使用Bert进行命名实体识别
----------------------------------
命名实体识别是给定一句话,标记出其中的实体。一般序列标注任务都使用conll格式。conll格式是指一行中通过制表符分隔不同的内容,使用空行分隔两句话,例如下面的例子

.. code-block:: text

中 B-ORG
共 I-ORG
中 I-ORG
央 I-ORG
致 O
中 B-ORG
国 I-ORG
致 I-ORG
公 I-ORG
党 I-ORG
十 I-ORG
一 I-ORG
大 I-ORG
的 O
贺 O
词 O

这部分内容请参考 :doc:`/tutorials/序列标注`
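
在阅读上述教程之前,也可以先通过下面的极简示意了解大致流程(其中数据集 Pipe、模型与评价指标的名称和参数请以 fastNLP 对应文档为准,这里仅作参考):

.. code-block:: python

from fastNLP.io import WeiboNERPipe
from fastNLP.embeddings import BertEmbedding
from fastNLP.models import BiLSTMCRF
from fastNLP import Trainer, SpanFPreRecMetric, LossInForward
from fastNLP.core.optimizer import AdamW
import torch

data_bundle = WeiboNERPipe().process_from_file()
data_bundle.rename_field('chars', 'words')

embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='cn-wwm')
model = BiLSTMCRF(embed, num_classes=len(data_bundle.get_vocab('target')),
                  target_vocab=data_bundle.get_vocab('target'))

metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'))
device = 0 if torch.cuda.is_available() else 'cpu'
trainer = Trainer(data_bundle.get_dataset('train'), model,
                  optimizer=AdamW(model.parameters(), lr=2e-5),
                  loss=LossInForward(),  # BiLSTMCRF在forward中直接返回loss
                  device=device, dev_data=data_bundle.get_dataset('dev'),
                  metrics=metric, n_epochs=2)
trainer.train()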


3. 使用Bert进行文本匹配
----------------------------------
文本匹配任务是指给定两句话判断它们的关系。比如,给定两句话判断前一句是否和后一句具有因果关系或是否是矛盾关系;或者给定两句话判断两句话是否具有相同的意思。这里我们以中文XNLI数据集为例(对应下面代码中的 :class:`~fastNLP.io.CNXNLIBertPipe` )。

.. code-block:: python

from fastNLP.io import CNXNLIBertPipe
from fastNLP.embeddings import BertEmbedding
from fastNLP.models import BertForSentenceMatching
from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam
from fastNLP.core.optimizer import AdamW
from fastNLP.core.callback import WarmupCallback
from fastNLP import Tester
import torch

data_bundle = CNXNLIBertPipe().process_from_file()
data_bundle.rename_field('chars', 'words')
print(data_bundle)

# 载入BertEmbedding
embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='cn-wwm', include_cls_sep=True)

# 载入模型
model = BertForSentenceMatching(embed, len(data_bundle.get_vocab('target')))

# 训练模型
callbacks = [WarmupCallback(warmup=0.1, schedule='linear'), ]
device = 0 if torch.cuda.is_available() else 'cpu'
trainer = Trainer(data_bundle.get_dataset('train'), model,
optimizer=AdamW(params=model.parameters(), lr=4e-5),
loss=CrossEntropyLoss(), device=device,
batch_size=8, dev_data=data_bundle.get_dataset('dev'),
metrics=AccuracyMetric(), n_epochs=5, print_every=1,
update_every=8, callbacks=callbacks)
trainer.train()

tester = Tester(data_bundle.get_dataset('test'), model, batch_size=8, metrics=AccuracyMetric())
tester.test()

运行结果::

In Epoch:3/Step:73632, got best dev performance:
AccuracyMetric: acc=0.781928
Reloaded the best model.
Evaluate data in 18.54 seconds!
[tester]
AccuracyMetric: acc=0.783633


4. 使用Bert进行中文问答
----------------------------------
问答任务是给定一段内容,以及一个问题,需要从这段内容中找到答案。
例如::

"context": "锣鼓经是大陆传统器乐及戏曲里面常用的打击乐记谱方法,以中文字的声音模拟敲击乐的声音,纪录打击乐的各种不同的演奏方法。常
用的节奏型称为「锣鼓点」。而锣鼓是戏曲节奏的支柱,除了加强演员身段动作的节奏感,也作为音乐的引子和尾声,提示音乐的板式和速度,以及
作为唱腔和念白的伴奏,令诗句的韵律更加抑扬顿锉,段落分明。锣鼓的运用有约定俗成的程式,依照角色行当的身份、性格、情绪以及环境,配合
相应的锣鼓点。锣鼓亦可以模仿大自然的音响效果,如雷电、波浪等等。戏曲锣鼓所运用的敲击乐器主要分为鼓、锣、钹和板四类型:鼓类包括有单
皮鼓(板鼓)、大鼓、大堂鼓(唐鼓)、小堂鼓、怀鼓、花盆鼓等;锣类有大锣、小锣(手锣)、钲锣、筛锣、马锣、镗锣、云锣;钹类有铙钹、大
钹、小钹、水钹、齐钹、镲钹、铰子、碰钟等;打拍子用的檀板、木鱼、梆子等。因为京剧的锣鼓通常由四位乐师负责,又称为四大件,领奏的师
傅称为:「鼓佬」,其职责有如西方乐队的指挥,负责控制速度以及利用各种手势提示乐师演奏不同的锣鼓点。粤剧吸收了部份京剧的锣鼓,但以木鱼
和沙的代替了京剧的板和鼓,作为打拍子的主要乐器。以下是京剧、昆剧和粤剧锣鼓中乐器对应的口诀用字:",
"question": "锣鼓经是什么?",
"answers": [
{
"text": "大陆传统器乐及戏曲里面常用的打击乐记谱方法",
"answer_start": 4
},
{
"text": "大陆传统器乐及戏曲里面常用的打击乐记谱方法",
"answer_start": 4
},
{
"text": "大陆传统器乐及戏曲里面常用的打击乐记谱方法",
"answer_start": 4
}
]

您可以通过以下的代码训练 (原文代码:`CMRC2018 <https://github.com/ymcui/cmrc2018>`_)

.. code-block:: python

from fastNLP.embeddings import BertEmbedding
from fastNLP.models import BertForQuestionAnswering
from fastNLP.core.losses import CMRC2018Loss
from fastNLP.core.metrics import CMRC2018Metric
from fastNLP.io.pipe.qa import CMRC2018BertPipe
from fastNLP import Trainer, BucketSampler
from fastNLP import WarmupCallback, GradientClipCallback
from fastNLP.core.optimizer import AdamW
import torch

data_bundle = CMRC2018BertPipe().process_from_file()
data_bundle.rename_field('chars', 'words')

print(data_bundle)

embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='cn', requires_grad=True, include_cls_sep=False, auto_truncate=True,
dropout=0.5, word_dropout=0.01)
model = BertForQuestionAnswering(embed)
loss = CMRC2018Loss()
metric = CMRC2018Metric()

wm_callback = WarmupCallback(schedule='linear')
gc_callback = GradientClipCallback(clip_value=1, clip_type='norm')
callbacks = [wm_callback, gc_callback]

optimizer = AdamW(model.parameters(), lr=5e-5)

device = 0 if torch.cuda.is_available() else 'cpu'
trainer = Trainer(data_bundle.get_dataset('train'), model, loss=loss, optimizer=optimizer,
sampler=BucketSampler(seq_len_field_name='context_len'),
dev_data=data_bundle.get_dataset('dev'), metrics=metric,
callbacks=callbacks, device=device, batch_size=6, num_workers=2, n_epochs=2, print_every=1,
test_use_tqdm=False, update_every=10)
trainer.train(load_best_model=False)

训练结果(和原论文中报道的基本一致)::

In Epoch:2/Step:1692, got best dev performance:
CMRC2018Metric: f1=85.61, em=66.08


----------------------------------
代码下载
----------------------------------

.. raw:: html

<a href="../_static/notebooks/extend_1_bert_embedding.ipynb" download="extend_1_bert_embedding.ipynb">点击下载 IPython Notebook 文件</a><hr>

+ 0
- 223
docs/source/tutorials/extend_2_dist.rst View File

@@ -1,223 +0,0 @@
Distributed Parallel Training
=============================

原理
----

随着深度学习模型越来越复杂,单个GPU可能已经无法满足正常的训练。比如BERT等预训练模型,更是在多个GPU上训练得到的。为了使用多GPU训练,Pytorch框架已经提供了
`nn.DataParallel <https://pytorch.org/docs/stable/nn.html#dataparallel>`_ 以及
`nn.DistributedDataParallel <https://pytorch.org/docs/stable/nn.html#distributeddataparallel>`_ 两种方式的支持。
`nn.DataParallel <https://pytorch.org/docs/stable/nn.html#dataparallel>`_
很容易使用,但是却有着GPU负载不均衡,单进程速度慢等缺点,无法发挥出多GPU的全部性能。因此,分布式的多GPU训练方式
`nn.DistributedDataParallel <https://pytorch.org/docs/stable/nn.html#distributeddataparallel>`_
是更好的选择。然而,因为分布式训练的特点,
`nn.DistributedDataParallel <https://pytorch.org/docs/stable/nn.html#distributeddataparallel>`_
常常难以理解和使用,也很难debug。所以,在使用分布式训练之前,需要理解它的原理。

在使用
`nn.DistributedDataParallel <https://pytorch.org/docs/stable/nn.html#distributeddataparallel>`_
时,模型会被复制到所有使用的GPU,通常每个GPU上存有一个模型,并被一个单独的进程控制。这样有N块GPU,就会产生N个进程。当训练一个batch时,这一batch会被分为N份,每个进程会使用batch的一部分进行训练,然后在必要时进行同步,并通过网络传输需要同步的数据。这时,只有模型的梯度会被同步,而模型的参数不会,所以能缓解大部分的网络传输压力,网络传输不再是训练速度的瓶颈之一。你可能会好奇,不同步模型的参数,怎么保证不同进程所训练的模型相同?只要每个进程初始的模型是同一个,具有相同的参数,而之后每次更新,都使用相同的梯度,就能保证梯度更新后的模型也具有相同的参数了。

为了让每个进程的模型初始化完全相同,通常这N个进程都是由单个进程复制而来的,这时需要对分布式的进程进行初始化,建立相互通信的机制。在
Pytorch 中,我们用
`distributed.init_process_group <https://pytorch.org/docs/stable/distributed.html#initialization>`_
函数来完成,需要在程序开头就加入这一步骤。初始化完成后,每一个进程用唯一的编号
``rank`` 进行区分,从 0 到 N-1递增,一般地,我们将 ``rank`` 为 0
的进程当作主进程,而其他 ``rank`` 的进程为子进程。每个进程还要知道
``world_size`` ,即分布式训练的总进程数
N。训练时,每个进程使用batch的一部分,互相不能重复,这里通过
`torch.utils.data.distributed.DistributedSampler <https://pytorch.org/docs/stable/_modules/torch/utils/data/distributed.html>`_
来实现。
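
下面用原生 PyTorch 的接口给出一个极简示意,展示进程组初始化、rank / world_size 的获取以及 DistributedSampler 的用法(省略了模型与训练细节,需配合 torch.distributed.launch 启动):

.. code-block:: python

import torch
import torch.distributed as dist
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler

dist.init_process_group('nccl')     # 其余参数(rank、world_size等)从环境变量中读取
rank = dist.get_rank()              # 当前进程编号,取值为 0 ~ N-1
world_size = dist.get_world_size()  # 总进程数 N

dataset = TensorDataset(torch.randn(100, 8))  # 这里用随机张量代替真实数据集
sampler = DistributedSampler(dataset)         # 保证各进程取到互不重复的数据子集
loader = DataLoader(dataset, batch_size=4, sampler=sampler)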

使用方式
--------

Pytorch的分布式训练使用起来非常麻烦,难以理解,可以从给出的\ `官方教程 <https://pytorch.org/tutorials/intermediate/ddp_tutorial.html>`_ \ 中看到。而\ ``fastNLP``
提供了
``DistTrainer``\ ,将大部分的分布式训练的细节进行了封装,只需简单的改动训练代码,就能直接用上分布式训练。那么,具体怎么将普通的训练代码改成支持分布式训练的代码呢?下面我们来讲一讲分布式训练的完整流程。通常,分布式程序的多个进程是单个进程的复制。假设我们用N个GPU进行分布式训练,我们需要启动N个进程,这时,在命令行使用:

.. code:: shell

python -m torch.distributed.launch --nproc_per_node=N train_script.py --args

其中\ ``N``\ 是需要启动的进程数,\ ``train_script.py``\ 为训练代码,\ ``--args``\ 是自定义的命令行参数。在启动了N个进程之后,如果我们在\ ``train_script.py``\ 的训练代码中正常配置,分布式训练就能正常进行。

此外,还可以使用环境变量\ ``CUDA_VISIBLE_DEVICES``\ 设置指定的GPU,比如在8卡机器上使用编号为4,5,6,7的4块GPU:

.. code:: shell

CUDA_VISIBLE_DEVICES=4,5,6,7 python -m torch.distributed.launch --nproc_per_node=N train_script.py --args

在 ``train_script.py``
训练代码中,有一些必须的配置。为了清晰的叙述,这里放一个简单的分布式训练代码,省去多余细节:

.. code:: python

import torch.distributed as dist
from fastNLP import DistTrainer, get_local_rank
import fastNLP as fnlp

def main(options):
# options为训练所需的参数,batch_size等
set_seed(options.seed)
# 初始化分布式进程
dist.init_process_group('nccl')

######## 读取数据
if get_local_rank() != 0:
dist.barrier() # 先让主进程(rank==0)先执行,进行数据处理,预训模型参数下载等操作,然后保存cache
data = get_processed_data()
model = get_model(data.get_vocab("words"), data.get_vocab("target"))
if get_local_rank() == 0:
dist.barrier() # 主进程执行完后,其余进程开始读取cache
########

# 初始化Trainer,训练等,与普通训练差别不大
def get_trainer(model, data):
# 注意设置的callback有两种,一种只在主进程执行,一种在所有进程都执行
callbacks_master = [fnlp.FitlogCallback()]
callbacks_all = [fnlp.WarmupCallback(warmup=options.warmup)]
trainer = DistTrainer(
save_path='save',
train_data=data.get_dataset("train"),
dev_data=data.get_dataset("dev"),
model=model,
loss=fnlp.CrossEntropyLoss(),
metrics=fnlp.AccuracyMetric(),
metric_key="acc",
optimizer=fnlp.AdamW(model.parameters(), lr=options.lr),
callbacks_master=callbacks_master, # 仅在主进程执行(如模型保存,日志记录)
callbacks_all=callbacks_all, # 在所有进程都执行(如梯度裁剪,学习率衰减)
batch_size_per_gpu=options.batch_size, # 指定每个GPU的batch大小
update_every=options.update,
n_epochs=options.epochs,
use_tqdm=True,
)
return trainer
trainer = get_trainer(model, data)
trainer.train()

指定进程编号
^^^^^^^^^^^^

首先,为了区分不同的进程,初始时需要对每个进程传入\ ``rank``\ 。这里一般分为\ ``node_rank``\ 和\ ``local_rank``\ ,分别表示进程处于哪一机器以及同机器上处于第几进程。如果在单一机器上,\ ``node_rank``\ 可以省略。\ ``local_rank``\ 一般通过命令行参数\ ``--local_rank``\ 传入,为\ ``int``\ 类型。也可以通过环境变量传入\ ``local_rank``\ ,只需在\ ``torch.distributed.launch``\ 时,使用\ ``--use_env``\ 参数。无论哪种方式,在训练脚本中,都要获取到\ ``local_rank``\ ,用于初始化分布式通信,以及区分进程。如果你使用\ ``fastNLP``\ ,可以通过\ ``fastNLP.get_local_rank``\ 来得到\ ``local_rank``\ 。
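
下面是一个通过命令行参数接收 ``local_rank`` 的极简示意(与 ``fastNLP.get_local_rank()`` 得到的编号含义相同,仅作参考):

.. code-block:: python

import argparse
from fastNLP import get_local_rank

parser = argparse.ArgumentParser()
# torch.distributed.launch 启动时会自动为每个进程传入 --local_rank
parser.add_argument('--local_rank', type=int, default=0)
args = parser.parse_args()

print('local_rank (argparse):', args.local_rank)
print('local_rank (fastNLP):', get_local_rank())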

初始化进程
^^^^^^^^^^

在获取了\ ``local_rank``\ 等重要参数后,在开始训练前,我们需要建立不同进程的通信和同步机制。这时我们使用\ `torch.distributed.init_process_group <https://pytorch.org/docs/stable/distributed.html#initialization>`_
来完成。通常,我们只需要 ``torch.distributed.init_process_group('nccl')``
来指定使用\ ``nccl``\ 后端来进行同步即可。其他参数程序将读取环境变量自动设置。如果想手动设置这些参数,比如,使用TCP进行通信,可以设置:

.. code:: python

init_process_group('nccl', init_method='tcp://localhost:55678',
rank=args.rank, world_size=N)

或者使用文件进行通信:

.. code:: python

init_process_group('nccl', init_method='file:///mnt/nfs/sharedfile',
world_size=N, rank=args.rank)

注意,此时必须显式指定\ ``world_size``\ 和\ ``rank``\ ,具体可以参考
`torch.distributed.init_process_group <https://pytorch.org/docs/stable/distributed.html#initialization>`_
的使用文档。

在初始化分布式通信后,再初始化\ ``DistTrainer``\ ,传入数据和模型,就完成了分布式训练的代码。代码修改完成后,使用上面给出的命令行启动脚本,就能成功运行分布式训练。但是,如果数据处理,训练中的自定义操作比较复杂,则可能需要额外的代码修改。下面列出一些需要特别注意的地方,在使用分布式训练前,请仔细检查这些事项。

注意事项
--------

在执行完
`torch.distributed.init_process_group <https://pytorch.org/docs/stable/distributed.html#initialization>`_
后,我们就可以在不同进程间完成传输数据,进行同步等操作。这些操作都可以在\ `torch.distributed <https://pytorch.org/docs/stable/distributed.html#>`_
中找到。其中,最重要的是
`barrier <https://pytorch.org/docs/stable/distributed.html#torch.distributed.barrier>`_
以及
`get_rank <https://pytorch.org/docs/stable/distributed.html#torch.distributed.get_rank>`_
操作。对于训练而言,我们关心的是读入数据,记录日志,模型初始化,模型参数更新,模型保存等操作。这些操作大多是读写操作,在多进程状态下,这些操作都必须小心进行,否则可能出现难以预料的bug。而在\ ``fastNLP``\ 中,大部分操作都封装在
``DistTrainer`` 中,只需保证数据读入和模型初始化正确即可完成训练。

写操作
^^^^^^

一般而言,读入操作需要在每一个进程都执行,因为每个进程都要使用读入的数据和模型参数进行训练。而写出操作只需在其中一个进程(通常为主进程)执行,因为每一个进程保存的模型都相同,都处于同一训练状态。所以,通常单进程的训练脚本中,只需要修改写出操作的部分,通过加入对进程\ ``rank``\ 的判断,仅让其中一个进程执行写操作:

.. code:: python

import torch.distributed as dist

# 仅在主进程才执行
if dist.get_rank() == 0:
do_write_op() # 一些写操作
dist.barrier() # 确保写完成后,所有进程再执行(若进程无需读入写出的数据,可以省去)

若使用\ ``fastNLP``\ 中的\ ``DistTrainer``\ ,也可以这样写:

.. code:: python

# 判断是否是主进程的trainer
if trainer.is_master:
do_write_op()
dist.barrier()

读操作
^^^^^^

然而有些时候,我们需要其中一个进程先执行某些操作,等这一进程执行完后,其它进程再执行这一操作。比如,在读入数据时,我们有时需要从网上下载,再处理,将处理好的数据保存,供反复使用。这时,我们不需要所有进程都去下载和处理数据,只需要主进程进行这些操作,其它进程等待。直到处理好的数据被保存后,其他进程再从保存位置直接读入数据。这里可以参考范例代码中的读取数据:

.. code:: python

if dist.get_rank() != 0:
dist.barrier() # 先让主进程(rank==0)先执行,进行数据处理,预训模型参数下载等操作,然后保存cache

# 这里会自动处理数据,或直接读取保存的cache
data = get_processed_data()
model = get_model(data.get_vocab("words"), data.get_vocab("target"))

if dist.get_rank() == 0:
dist.barrier() # 主进程执行完后,其余进程开始读取cache

也可以显式的将主进程和其它进程的操作分开:

.. code:: python

if dist.get_rank() == 0:
data = do_data_processing() # 数据处理
dist.barrier()
else:
dist.barrier()
data = load_processed_data() # 读取cache

日志操作
^^^^^^^^

通常,我们需要知道训练的状态,如当前在第几个epoch,模型当前的loss等等。单进程训练时,我们可以直接使用\ ``print``\ 将这些信息输出到命令行或日志文件。然而,在多进程时,\ ``print``\ 会导致同样的信息在每一进程都输出,造成问题。这一问题和写操作类似,也可以通过判断进程的编号之后再输出。问题是,日志通常在训练的很多地方都有输出,逐一加上判断代码是非常繁琐的。这里,建议统一修改为:

.. code:: python

from fastNLP import logger
logger.info('....') # 替换print

在\ ``DistTrainer``\ 中,主进程的\ ``logger``\ 级别为\ ``INFO``\ ,而其它进程为\ ``WARNING``\ 。这样级别为\ ``INFO``\ 的信息只会在主进程输出,不会造成日志重复问题。若需要其它进程中的信息,可以使用\ ``logger.warning``\ 。

注意,\ ``logger``\ 的级别设置只有初始化了\ ``DistTrainer``\ 后才能生效。如果想要在初始化进程后就生效,需要在分布式通信初始化后,执行\ ``init_logger_dist``\ 。

Callback
^^^^^^^^

``fastNLP``\ 的一个特色是可以使用\ ``Callback``\ 在训练时完成各种自定义操作。而这一特色在\ ``DistTrainer``\ 中得以保留。但是,这时需要特别注意\ ``Callback``\ 是否只需要在主进程执行。一些\ ``Callback``\ ,比如调整学习率,梯度裁剪等,会改变模型的状态,因此需要在所有进程上都执行,将它们通过\ ``callback_all``\ 参数传入\ ``DistTrainer``\ 。而另一些\ ``Callback``\ ,比如\ ``fitlog``\ ,保存模型,不会改变模型的状态,而是进行数据写操作,因此仅在主进程上执行,将它们通过\ ``callback_master``\ 传入。

在自定义\ ``Callback``\ 时,请遵循一个原则,改变训练或模型状态的操作在所有进程中执行,而数据写到硬盘请在主进程单独进行。这样就能避免进程间失去同步,或者磁盘写操作的冲突。
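
例如,下面是一个只负责把模型参数写到磁盘的自定义 ``Callback`` 的简单示意(类名与保存路径均为假设,仅作参考)。按照上面的原则,它应当通过 ``callbacks_master`` 传入 ``DistTrainer`` :

.. code-block:: python

import torch
from fastNLP import Callback

class SaveEveryEpochCallback(Callback):
    """每个epoch结束时保存一次模型参数。只进行磁盘写操作,不改变训练状态,因此仅需在主进程执行"""
    def on_epoch_end(self):
        # self.model 与 self.epoch 是 Callback 基类提供的属性;这里假设 save/ 目录已存在
        torch.save(self.model.state_dict(), 'save/epoch_{}.pt'.format(self.epoch))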

Debug
^^^^^

多进程的程序很难进行debug,如果出现问题,可以先参考报错信息进行处理。也可以在程序中多输出日志,定位问题。具体情况,具体分析。在debug时,要多考虑进程同步和异步的操作,判断问题是程序本身导致的,还是由进程间没有同步而产生。

其中,有一个常见问题是程序卡住不动。具体表现为训练暂停,程序没有输出,但是GPU利用率保持100%。这一问题是由进程失去同步导致的。这时只能手动\ ``kill``\ GPU上残留的进程,再检查代码。需要检查进程同步的位置,比如模型\ ``backward()``\ 时,\ ``barrier()``\ 时等。同时,也要检查主进程与其它进程操作不同的位置,比如存储模型,evaluate模型时等。注意,失去同步的位置可能并不是程序卡住的位置,所以需要细致的检查。

+ 0
- 122
docs/source/tutorials/extend_3_fitlog.rst View File

@@ -1,122 +0,0 @@
============================================
使用fitlog 辅助 fastNLP 进行科研
============================================

本文介绍结合使用 fastNLP 和 fitlog 进行科研的方法。

首先,我们需要安装 `fitlog <http://www.fastnlp.top/docs/fitlog/>`_ 。你需要确认你的电脑中没有其它名为 `fitlog` 的命令。

我们从命令行中进入到一个文件夹,现在我们要在文件夹中创建我们的 fastNLP 项目。你可以在命令行输入 `fitlog init test1` ,
然后你会看到如下提示::

Initialized empty Git repository in /Users/fdujyn/workspaces/test1/.git/
Auto commit by fitlog
Initialized empty Git repository in /Users/fdujyn/workspaces/test1/.git/
Fitlog project test1 is initialized.

这表明你已经创建成功了项目文件夹,并且在项目文件夹中已经初始化了 Git。如果你不想初始化 Git,
可以参考文档 `命令行工具 <http://www.fastnlp.top/docs/fitlog/user/command_line.html>`_

现在我们进入你创建的项目文件夹 test1 中,可以看到有一个名为 logs 的文件夹,后面我们将会在里面存放你的实验记录。
同时也有一个名为 main.py 的文件,这是我们推荐你使用的训练入口文件。文件的内容如下::

import fitlog

fitlog.commit(__file__) # auto commit your codes
fitlog.add_hyper_in_file (__file__) # record your hyperparameters

"""
Your training code here, you may use these functions to log your result:
fitlog.add_hyper()
fitlog.add_loss()
fitlog.add_metric()
fitlog.add_best_metric()
......
"""

fitlog.finish() # finish the logging

我们推荐你保留除注释外的四行代码,它们有助于你的实验。
它们的具体用处参见文档 `用户 API <http://www.fastnlp.top/docs/fitlog/>`_ 。

我们假定你要进行前两个教程中的实验,并已经把数据复制到了项目根目录下的 tutorial_sample_dataset.csv 文件中。
现在我们编写如下的训练代码,使用 :class:`~fastNLP.core.callback.FitlogCallback` 进行实验记录保存::

import fitlog
from fastNLP import Vocabulary, Trainer, CrossEntropyLoss, AccuracyMetric
from fastNLP.io import CSVLoader
from fastNLP.models import CNNText
from fastNLP.core.callback import FitlogCallback

fitlog.commit(__file__) # auto commit your codes
fitlog.add_hyper_in_file (__file__) # record your hyperparameters

############hyper
word_embed = 50
dropout = 0.1
############hyper

loader = CSVLoader(headers=('raw_sentence', 'label'), sep='\t')
dataset = loader.load("tutorial_sample_dataset.csv")

dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')
dataset.apply(lambda x: x['sentence'].split(), new_field_name='words', is_input=True)
dataset.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)
vocab = Vocabulary(min_freq=2).from_dataset(dataset, field_name='words')
vocab.index_dataset(dataset, field_name='words',new_field_name='words')

model = CNNText((len(vocab),word_embed), num_classes=5, padding=2, dropout=dropout)

train_dev_data, test_data = dataset.split(0.1)
train_data, dev_data = train_dev_data.split(0.1)

trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data,
loss=CrossEntropyLoss(), metrics=AccuracyMetric(),
callbacks=[FitlogCallback(test_data)])
trainer.train()

fitlog.finish() # finish the logging

用命令行在项目目录下执行 `python main.py` 之后,输出结果如下::

Auto commit by fitlog
input fields after batch(if batch size is 2):
words: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 11])
target fields after batch(if batch size is 2):
target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2])

training epochs started 2019-05-23-21-11-51
Evaluation at Epoch 1/10. Step:2/20. AccuracyMetric: acc=0.285714

Evaluation at Epoch 2/10. Step:4/20. AccuracyMetric: acc=0.285714

Evaluation at Epoch 3/10. Step:6/20. AccuracyMetric: acc=0.285714

Evaluation at Epoch 4/10. Step:8/20. AccuracyMetric: acc=0.428571

Evaluation at Epoch 5/10. Step:10/20. AccuracyMetric: acc=0.571429

Evaluation at Epoch 6/10. Step:12/20. AccuracyMetric: acc=0.571429

Evaluation at Epoch 7/10. Step:14/20. AccuracyMetric: acc=0.285714

Evaluation at Epoch 8/10. Step:16/20. AccuracyMetric: acc=0.142857

Evaluation at Epoch 9/10. Step:18/20. AccuracyMetric: acc=0.285714

Evaluation at Epoch 10/10. Step:20/20. AccuracyMetric: acc=0.571429


In Epoch:5/Step:10, got best dev performance:AccuracyMetric: acc=0.571429
Reloaded the best model.

现在,我们在项目目录下输入 `fitlog log logs` ,命令行会启动一个网页,默认 url 为 ``0.0.0.0:5000`` 。
我们在浏览器中打开网页,可以看到如下的统计表格:

.. image:: ../figures/fitlogTable.png

如果我们点击action中的最后一个按钮,可以看到详细的 loss 图:

.. image:: ../figures/fitlogChart.png

更多的教程还在编写中,敬请期待~

+ 0
- 172
docs/source/tutorials/tutorial_1_data_preprocess.rst View File

@@ -1,172 +0,0 @@
==============================
fastNLP中的DataSet
==============================

:class:`~fastNLP.DataSet` 是fastNLP用于承载数据的类,一般训练集、验证集和测试集会被加载为三个单独的 :class:`~fastNLP.DataSet` 对象。

:class:`~fastNLP.DataSet` 中的数据组织形式类似一个表格,比如下面 :class:`~fastNLP.DataSet` 一共有3列,列在fastNLP中被称为field。

.. csv-table::
:header: "raw_chars", "chars", "seq_len"

"历任公司副总经理、总工程师,", "[历 任 公 司 副 总 经 理 、 总 工 程 师 ,]", 6
"Third instance .", "[Third, instance, .]", 3
"...", "[...]", "..."

每一行是一个instance (在fastNLP中被称为 :mod:`~fastNLP.core.Instance` ),
每一列是一个field (在fastNLP中称为 :mod:`~fastNLP.core.FieldArray` )。

DataSet的构建
-----------------------------

我们使用传入字典的方式初始化一个DataSet,这是 :class:`~fastNLP.DataSet` 初始化的最基础的方式

.. code-block:: python

from fastNLP import DataSet
data = {'raw_words':["This is the first instance .", "Second instance .", "Third instance ."],
'words': [['this', 'is', 'the', 'first', 'instance', '.'], ['Second', 'instance', '.'], ['Third', 'instance', '.']],
'seq_len': [6, 3, 3]}
dataset = DataSet(data)
# 传入的dict的每个key的value应该为具有相同长度的list
print(dataset)

输出为::

+------------------------------+------------------------------------------------+---------+
| raw_words | words | seq_len |
+------------------------------+------------------------------------------------+---------+
| This is the first instance . | ['this', 'is', 'the', 'first', 'instance', ... | 6 |
| Second instance . | ['Second', 'instance', '.'] | 3 |
| Third instance . | ['Third', 'instance', '.'] | 3 |
+------------------------------+------------------------------------------------+---------+


我们还可以使用 :func:`~fastNLP.DataSet.append` 方法向DataSet增加数据

.. code-block:: python

from fastNLP import DataSet
from fastNLP import Instance
dataset = DataSet()
instance = Instance(raw_words="This is the first instance",
words=['this', 'is', 'the', 'first', 'instance', '.'],
seq_len=6)
dataset.append(instance)
# 可以继续append更多内容,但是append的instance应该和前面的instance拥有完全相同的field

另外,我们还可以用 :class:`~fastNLP.Instance` 数组的方式构建DataSet

.. code-block:: python

from fastNLP import DataSet
from fastNLP import Instance
dataset = DataSet([
Instance(raw_words="This is the first instance",
words=['this', 'is', 'the', 'first', 'instance', '.'],
seq_len=6),
Instance(raw_words="Second instance .",
words=['Second', 'instance', '.'],
seq_len=3)
])

在初步构建完DataSet之后,我们可以通过 `for` 循环遍历 :class:`~fastNLP.DataSet` 中的内容。

.. code-block:: python

for instance in dataset:
    # 对每个instance做一些处理,这里仅打印raw_words字段
    print(instance['raw_words'])

DataSet的删除
-----------------------------

fastNLP 同样提供了多种删除数据的方法: :func:`~fastNLP.DataSet.drop` 、 :func:`~fastNLP.DataSet.delete_instance` 和 :func:`~fastNLP.DataSet.delete_field` 。
我们先用下面的代码生成一个只有两列的样例DataSet,第一列的值分别为 -5 ~ 4,第二列的值均为 0.

.. code-block:: python

from fastNLP import DataSet
dataset = DataSet({'a': range(-5, 5), 'c': [0]*10})

然后我们使用三种方法进行删除,删除后的DataSet仅包含名为 c 的一列,包含4个值为0 的数据。

.. code-block:: python

# 不改变dataset,生成一个删除了满足条件的instance的新 DataSet
dropped_dataset = dataset.drop(lambda ins:ins['a']<0, inplace=False)
# 在dataset中删除满足条件的instance
dataset.drop(lambda ins:ins['a']<0)
# 删除第3个instance
dataset.delete_instance(2)
# 删除名为'a'的field
dataset.delete_field('a')


简单的数据预处理
-----------------------------

因为 fastNLP 中的数据是按列存储的,所以大部分的数据预处理操作是以列( :mod:`~fastNLP.core.field` )为操作对象的。
首先,我们可以检查特定名称的 :mod:`~fastNLP.core.field` 是否存在,并对其进行改名。

.. code-block:: python

# 检查是否存在名为'a'的field
dataset.has_field('a') # 或 ('a' in dataset)
# 将名为'c'的field改名为'b'
dataset.rename_field('c', 'b')
# DataSet的长度
len(dataset)

其次,我们可以使用 :func:`~fastNLP.DataSet.apply` 或 :func:`~fastNLP.DataSet.apply_field` 进行数据预处理操作。
使用以上的两个方法需要传入一个函数,函数可以是 lambda 匿名函数,也可以是完整定义的函数,fastNLP将对DataSet遍历地应用该函数。
同时,你还可以用 ``new_field_name`` 参数指定函数返回值组成的新 :mod:`~fastNLP.core.field` 的名称。

.. code-block:: python

from fastNLP import DataSet
data = {'raw_words':["This is the first instance .", "Second instance .", "Third instance ."]}
dataset = DataSet(data)

# 将句子分成单词形式, 详见DataSet.apply()方法
dataset.apply(lambda ins: ins['raw_words'].split(), new_field_name='words')

# 或使用DataSet.apply_field()
dataset.apply_field(lambda sent:sent.split(), field_name='raw_words', new_field_name='words')

# 除了匿名函数,也可以定义函数传递进去
def get_words(instance):
sentence = instance['raw_words']
words = sentence.split()
return words
dataset.apply(get_words, new_field_name='words')

除了手动处理数据集之外,你还可以使用 fastNLP 提供的各种 :class:`~fastNLP.io.Loader` 和 :class:`~fastNLP.io.Pipe` 来进行数据处理。
详细请参考这篇教程 :doc:`使用Loader和Pipe处理数据 </tutorials/tutorial_4_load_dataset>` 。


fastNLP中field的命名习惯
-----------------------------

在英文任务中,fastNLP常用的field名称有:

- **raw_words**: 表示的是原始的str。例如"This is a demo sentence ."。存在多个raw_words的情况,例如matching任务,它们会被定义为raw_words0, raw_words1。但在conll格式下,raw_words列也可能为["This", "is", "a", "demo", "sentence", "."]的形式。
- **words**: 表示的是已经tokenize后的词语。例如["This", "is", "a", "demo", "sentence"], 但由于str并不能直接被神经网络所使用,所以words中的内容往往被转换为int,如[3, 10, 4, 2, 7, ...]等。多列words的情况,会被命名为words0, words1
- **target**: 表示目标值。分类场景下,只有一个值;序列标注场景下是一个序列。
- **seq_len**: 一般用于表示words列的长度

在中文任务中,fastNLP常用的field名称有:

- **raw_words**: 如果原始汉字序列中已经包含了词语的边界,则该列称为raw_words。如"上海 浦东 开发 与 法制 建设 同步"。
- **words**: 表示单独的汉字词语序列。例如["上海", "浦东", "开发", "与", "法制", "建设", ...]或[2, 3, 4, ...]
- **raw_chars**: 表示的是原始的连续汉字序列。例如"这是一个示例。"
- **chars**: 表示已经切分为单独的汉字的序列。例如["这", "是", "一", "个", "示", "例", "。"]。但由于神经网络不能识别汉字,所以一般该列会被转为int形式,如[3, 4, 5, 6, ...]。
- **target**: 表示目标值。分类场景下,只有一个值;序列标注场景下是一个序列
- **seq_len**: 表示输入序列的长度
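
按照上面的命名习惯,一个中文样例 DataSet 可以这样构造(数据为虚构的示意):

.. code-block:: python

from fastNLP import DataSet

data = {'raw_chars': ["今天天气很好。", "这部电影很无聊。"],
        'chars': [['今', '天', '天', '气', '很', '好', '。'],
                  ['这', '部', '电', '影', '很', '无', '聊', '。']],
        'target': ['positive', 'negative'],
        'seq_len': [7, 8]}
dataset = DataSet(data)
print(dataset)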

----------------------------------
代码下载
----------------------------------

.. raw:: html

<a href="../_static/notebooks/tutorial_1_data_preprocess.ipynb" download="tutorial_1_data_preprocess.ipynb">点击下载 IPython Notebook 文件</a><hr>

+ 0
- 140
docs/source/tutorials/tutorial_2_vocabulary.rst View File

@@ -1,140 +0,0 @@
==============================
fastNLP中的Vocabulary
==============================

:class:`~fastNLP.Vocabulary` 是包含字或词与index关系的类,用于将文本转换为index。


构建Vocabulary
-----------------------------

.. code-block:: python

from fastNLP import Vocabulary

vocab = Vocabulary()
vocab.add_word_lst(['复', '旦', '大', '学']) # 加入新的字
vocab.add_word('上海') # `上海`会作为一个整体
vocab.to_index('复') # 应该会为3
vocab.to_index('我') # 会输出1,Vocabulary中默认pad的index为0, unk(没有找到的词)的index为1

# 在构建target的Vocabulary时,词表中应该用不上pad和unk,可以通过以下的初始化
vocab = Vocabulary(unknown=None, padding=None)
vocab.add_word_lst(['positive', 'negative'])
vocab.to_index('positive') # 输出0
vocab.to_index('neutral') # 会报错,因为没有unk这种情况

除了通过以上的方式建立词表,Vocabulary还可以通过使用下面的函数直接从 :class:`~fastNLP.DataSet` 中的某一列建立词表以及将该列转换为index

.. code-block:: python

from fastNLP import Vocabulary
from fastNLP import DataSet

dataset = DataSet({'chars': [
['今', '天', '天', '气', '很', '好', '。'],
['被', '这', '部', '电', '影', '浪', '费', '了', '两', '个', '小', '时', '。']
],
'target': ['neutral', 'negative']
})

vocab = Vocabulary()
# 从该dataset中的chars列建立词表
vocab.from_dataset(dataset, field_name='chars')
# 使用vocabulary将chars列转换为index
vocab.index_dataset(dataset, field_name='chars')

target_vocab = Vocabulary(padding=None, unknown=None)
target_vocab.from_dataset(dataset, field_name='target')
target_vocab.index_dataset(dataset, field_name='target')
print(dataset)

输出内容为::

+---------------------------------------------------+--------+
| chars | target |
+---------------------------------------------------+--------+
| [4, 2, 2, 5, 6, 7, 3] | 0 |
| [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 3] | 1 |
+---------------------------------------------------+--------+


一些使用tips
-----------------------------

在使用from_dataset()函数建立词表时,将测试集和验证集放入参数no_create_entry_dataset中,如下所示

.. code-block:: python

from fastNLP import Vocabulary
from fastNLP import DataSet

tr_data = DataSet({'chars': [
['今', '天', '心', '情', '很', '好', '。'],
['被', '这', '部', '电', '影', '浪', '费', '了', '两', '个', '小', '时', '。']
],
'target': ['positive', 'negative']
})
dev_data = DataSet({'chars': [
['住', '宿', '条', '件', '还', '不', '错'],
['糟', '糕', '的', '天', '气', ',', '无', '法', '出', '行', '。']
],
'target': ['positive', 'negative']
})

vocab = Vocabulary()
# 将验证集或者测试集在建立词表时放入no_create_entry_dataset这个参数中。
vocab.from_dataset(tr_data, field_name='chars', no_create_entry_dataset=[dev_data])

关于 :class:`~fastNLP.Vocabulary` 中的 `no_create_entry` ,如果您并不关心具体的原理,可以直接采取以下建议:在添加来自于非训练集的词时将该参数置为True,或将非训练集数据
传入 `no_create_entry_dataset` 参数。其意义在于:当接下来的模型会使用预训练的embedding(包括glove, word2vec, elmo与bert)且会finetune时,如果仅使用来自于train的数据建立vocabulary,
只出现在test与dev中的词语会被当作unk,无法充分利用来自于预训练embedding的信息;因此在建立词表的时候将test与dev考虑进来,会使得最终的结果更好。

通过与fastNLP中的各种Embedding配合使用,会有如下的效果:如果一个词出现在了train中,但没有出现在预训练模型中,它的embedding会被随机初始化,且拥有单独的一个vector;如果finetune embedding的话,
这个词在更新之后可能会有更好的表示。而如果这个词仅出现在了dev或test中,那么就不应为它单独建立vector,而应该让它指向unk这个vector的
值(当unk的值更新时,这个词使用的也是更新之后的vector)。所以被认为是no_create_entry的token,将首先从预训练的词表中寻找它的表示,如
果找到了,就使用该表示;如果没有找到,则认为该词的表示应该为unk的表示。

下面我们结合部分 :class:`~fastNLP.embeddings.StaticEmbedding` 的例子来说明下该值造成的影响,如果您对 :class:`~fastNLP.embeddings.StaticEmbedding` 不太了解,您可以先参考 :doc:`使用Embedding模块将文本转成向量 </tutorials/tutorial_3_embedding>` 部分再来阅读该部分

.. code-block:: python

import torch
from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary

vocab = Vocabulary()
vocab.add_word('train')
vocab.add_word('only_in_train') # 仅在train出现,但肯定在预训练词表中不存在
vocab.add_word('test', no_create_entry=True) # 该词只在dev或test中出现
vocab.add_word('only_in_test', no_create_entry=True) # 这个词在预训练的词表中找不到

embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')
print(embed(torch.LongTensor([vocab.to_index('train')])))
print(embed(torch.LongTensor([vocab.to_index('only_in_train')])))
print(embed(torch.LongTensor([vocab.to_index('test')])))
print(embed(torch.LongTensor([vocab.to_index('only_in_test')])))
print(embed(torch.LongTensor([vocab.unknown_idx])))

输出结果(只截取了部分vector)::

tensor([[ 0.9497, 0.3433, 0.8450, -0.8852, ...]], grad_fn=<EmbeddingBackward>) # train,en-glove-6b-50d,找到了该词
tensor([[ 0.0540, -0.0557, -0.0514, -0.1688, ...]], grad_fn=<EmbeddingBackward>) # only_in_train,en-glove-6b-50d,使用了随机初始化
tensor([[ 0.1318, -0.2552, -0.0679, 0.2619, ...]], grad_fn=<EmbeddingBackward>) # test,在en-glove-6b-50d中找到了这个词
tensor([[0., 0., 0., 0., 0., ...]], grad_fn=<EmbeddingBackward>) # only_in_test, en-glove-6b-50d中找不到这个词,使用unk的vector
tensor([[0., 0., 0., 0., 0., ...]], grad_fn=<EmbeddingBackward>) # unk,使用zero初始化

首先train和test都能够从预训练中找到对应的vector,所以它们是各自的vector表示; only_in_train在预训练中找不到,StaticEmbedding为它
新建了一个entry,所以它有一个单独的vector; 而only_in_test在预训练中找不到该词,因此被指向了unk的值(fastNLP用零向量初始化unk),与最后一行unk的
表示相同。


----------------------------------
代码下载
----------------------------------

.. raw:: html

<a href="../_static/notebooks/tutorial_2_vocabulary.ipynb" download="tutorial_2_vocabulary.ipynb">点击下载 IPython Notebook 文件</a><hr>

+ 0
- 462
docs/source/tutorials/tutorial_3_embedding.rst View File

@@ -1,462 +0,0 @@
=========================================
使用Embedding模块将文本转成向量
=========================================

这一部分是一个关于在fastNLP当中使用embedding的教程。

教程目录:

- `Part I: embedding介绍`_
- `Part II: 使用预训练的静态embedding`_
- `Part III: 使用随机初始化的embedding`_
- `Part IV: ELMo Embedding`_
- `Part V: Bert Embedding`_
- `Part VI: 使用character-level的embedding`_
- `Part VII: 叠加使用多个embedding`_
- `Part VIII: Embedding的其它说明`_
- `Part IX: StaticEmbedding的使用建议`_



Part I: embedding介绍
---------------------------------------

Embedding是一种词嵌入技术,可以将字或者词转换为实向量。目前使用较多的预训练词嵌入有word2vec, fasttext, glove, character embedding,
elmo以及bert。
但使用这些词嵌入方式的时候都需要做一些加载上的处理,比如预训练的word2vec, fasttext以及glove都有着超过几十万个词语的表示,但一般任务大概
只会用到其中的几万个词,如果直接加载所有的词汇,会导致内存占用变大以及训练速度变慢,因此需要从预训练文件中抽取本次实验用到的词汇;而对于英文的
elmo和character embedding, 需要将word拆分成character才能使用;Bert的使用更是涉及到了Byte pair encoding(BPE)相关的内容。为了方便
大家的使用,fastNLP通过 :class:`~fastNLP.Vocabulary` 统一了不同embedding的使用。下面我们通过一些例子来进行说明。



Part II: 使用预训练的静态embedding
---------------------------------------

在fastNLP中,加载预训练的word2vec, glove以及fasttext都使用的是 :class:`~fastNLP.embeddings.StaticEmbedding` 。另外,为了方便大家的
使用,fastNLP提供了多种静态词向量的自动下载并缓存(默认缓存到~/.fastNLP/embeddings文件夹下)的功能,支持自动下载的预训练向量可以在
`下载文档 <https://docs.qq.com/sheet/DVnpkTnF6VW9UeXdh?c=A1A0A0>`_ 查看。

.. code-block:: python

import torch
from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary

vocab = Vocabulary()
vocab.add_word_lst("this is a demo .".split())

embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')

words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo .".split()]]) # 将文本转为index
print(embed(words).size()) # StaticEmbedding的使用和pytorch的nn.Embedding是类似的

输出为::

torch.Size([1, 5, 50])

fastNLP的StaticEmbedding在初始化之后,就和pytorch中的Embedding是类似的了。 :class:`~fastNLP.embeddings.StaticEmbedding` 的初始化
主要是从model_dir_or_name提供的词向量中抽取出 :class:`~fastNLP.Vocabulary` 中词语的vector。

除了可以使用fastNLP预先提供的Embedding, :class:`~fastNLP.embeddings.StaticEmbedding` 也支持加载本地的glove, word2vec以及
fasttext格式的预训练词向量。通过将model_dir_or_name修改为本地的embedding文件路径,即可使用本地的embedding。


Part III: 使用随机初始化的embedding
---------------------------------------

有时候需要使用随机初始化的Embedding,也可以通过使用 :class:`~fastNLP.embeddings.StaticEmbedding` 获得。只需要将model_dir_or_name
置为None,且传入embedding_dim,如下例所示

.. code-block:: python

from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary

vocab = Vocabulary()
vocab.add_word_lst("this is a demo .".split())

embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=30)

words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo .".split()]])
print(embed(words).size())

输出为::

torch.Size([1, 5, 30])



Part IV: ELMo Embedding
-----------------------------------------------------------

在fastNLP中,我们提供了ELMo和BERT的embedding: :class:`~fastNLP.embeddings.ElmoEmbedding`
和 :class:`~fastNLP.embeddings.BertEmbedding` 。可自动下载的ElmoEmbedding可以
从 `下载文档 <https://docs.qq.com/sheet/DVnpkTnF6VW9UeXdh?c=A1A0A0>`_ 找到。

与静态embedding类似,ELMo的使用方法如下:

.. code-block:: python

from fastNLP.embeddings import ElmoEmbedding
from fastNLP import Vocabulary

vocab = Vocabulary()
vocab.add_word_lst("this is a demo .".split())

embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=False)
words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo .".split()]])
print(embed(words).size())

输出为::

torch.Size([1, 5, 256])

也可以输出多层的ELMo结果,fastNLP会将不同层的结果在最后一维上拼接。下面的代码需要在上面的代码执行结束之后执行

.. code-block:: python

embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=False, layers='1,2')
print(embed(words).size())

输出为::

torch.Size([1, 5, 512])

另外,根据 `Deep contextualized word representations <https://arxiv.org/abs/1802.05365>`_ ,不同层之间使用可学习的权重可以使得ELMo的效果更好,在fastNLP中可以通过以下的初始化
实现3层输出的结果通过可学习的权重进行加法融合。

.. code-block:: python

embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=True, layers='mix')
print(embed(words).size()) # 三层输出按照权重element-wise的加起来

输出为::

torch.Size([1, 5, 256])



Part V: Bert Embedding
-----------------------------------------------------------

虽然Bert并不算严格意义上的Embedding,但通过将Bert封装成Embedding的形式将极大减轻使用的复杂程度。可自动下载的Bert Embedding可以
从 `下载文档 <https://docs.qq.com/sheet/DVnpkTnF6VW9UeXdh?c=A1A0A0>`_ 找到。我们将使用下面的例子讲述一下
BertEmbedding的使用

.. code-block:: python

from fastNLP.embeddings import BertEmbedding
from fastNLP import Vocabulary

vocab = Vocabulary()
vocab.add_word_lst("this is a demo .".split())

embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased')
words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo .".split()]])
print(embed(words).size())

输出为::

torch.Size([1, 5, 768])

可以通过声明使用指定层数的output,也可以使用多层的output。下面的代码需要在上面的代码执行结束之后执行

.. code-block:: python

# 使用后面两层的输出
embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='10,11')
print(embed(words).size()) # 结果将是在最后一维做拼接

输出为::

torch.Size([1, 5, 1536])

在Bert中还存在两个特殊的字符[CLS]和[SEP]。默认情况下这两个字符会被自动加入,并且在计算结束之后会被自动删除,以使得输入序列的长度和输出序列的
长度一致。但在某些分类任务中必须使用[CLS]的表示,这种情况可以在初始化时声明需要保留[CLS]的表示,如下例所示

.. code-block:: python

embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', include_cls_sep=True)
print(embed(words).size()) # 结果将在序列维度上增加2
# 取出句子的cls表示
cls_reps = embed(words)[:, 0] # shape: [batch_size, 768]

输出为::

torch.Size([1, 7, 768])

在英文Bert模型中,一个英文单词可能会被切分为多个subword,例如"fairness"会被拆分为 ``["fair", "##ness"]`` ,这样一个word对应的将有两个输出,
:class:`~fastNLP.embeddings.BertEmbedding` 会使用pooling方法将一个word的subword的表示合并成一个vector,通过pool_method可以控制
该pooling方法,支持的有"first"(即使用fair的表示作为fairness的表示), "last"(使用##ness的表示作为fairness的表示), "max"(对fair和
##ness在每一维上做max),"avg"(对fair和##ness每一维做average)。

.. code-block:: python

embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', pool_method='max')
print(embed(words).size())

输出为::

torch.Size([1, 5, 768])

另外,根据 `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`_ ,
Bert在针对具有两句话的任务时(如matching,Q&A任务),句子之间通过[SEP]拼接起来,前一句话的token_type_id为0,
后一句话的token_type_id为1。BertEmbedding能够自动识别句子中间的[SEP],并正确设置对应的token_type_id。

.. code-block:: python

vocab = Vocabulary()
vocab.add_word_lst("this is a demo . [SEP] another sentence .".split())

embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', pool_method='max')
words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo . [SEP] another sentence .".split()]])
print(embed(words).size())

输出为::

torch.Size([1, 9, 768])

在多个[SEP]的情况下,token_type_id会在0和1之间不断循环。比如"first sentence [SEP] second sentence [SEP] third sentence", 它们的
token_type_id将是[0, 0, 0, 1, 1, 1, 0, 0]。但请注意[SEP]一定要是大写的,不能是[sep],否则无法识别。

更多 :class:`~fastNLP.embeddings.BertEmbedding` 的使用,请参考 :doc:`/tutorials/extend_1_bert_embedding`


Part VI: 使用character-level的embedding
-----------------------------------------------------

除了预训练的embedding以外,fastNLP还提供了两种Character Embedding: :class:`~fastNLP.embeddings.CNNCharEmbedding` 和
:class:`~fastNLP.embeddings.LSTMCharEmbedding` 。一般在使用character embedding时,需要在预处理的时候将word拆分成character,这
会使得预处理过程变得非常繁琐。在fastNLP中,使用character embedding也只需要传入 :class:`~fastNLP.Vocabulary` 即可,而且该
Vocabulary与其它Embedding使用的Vocabulary是一致的,下面我们看两个例子。

CNNCharEmbedding的使用例子如下:

.. code-block:: python

from fastNLP.embeddings import CNNCharEmbedding
from fastNLP import Vocabulary

vocab = Vocabulary()
vocab.add_word_lst("this is a demo .".split())

# character的embedding维度大小为50,返回的embedding结果维度大小为64。
embed = CNNCharEmbedding(vocab, embed_size=64, char_emb_size=50)
words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo .".split()]])
print(embed(words).size())

输出为::

torch.Size([1, 5, 64])

与CNNCharEmbedding类似,LSTMCharEmbedding的使用例子如下:

.. code-block:: python

from fastNLP.embeddings import LSTMCharEmbedding
from fastNLP import Vocabulary

vocab = Vocabulary()
vocab.add_word_lst("this is a demo .".split())

# character的embedding维度大小为50,返回的embedding结果维度大小为64。
embed = LSTMCharEmbedding(vocab, embed_size=64, char_emb_size=50)
words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo .".split()]])
print(embed(words).size())

输出为::

torch.Size([1, 5, 64])


Part VII: 叠加使用多个embedding
-----------------------------------------------------

单独使用Character Embedding往往效果并不是很好,需要同时结合word embedding。在fastNLP中可以通过 :class:`~fastNLP.embeddings.StackEmbedding`
来叠加embedding,具体的例子如下所示

.. code-block:: python

from fastNLP.embeddings import StaticEmbedding, StackEmbedding, CNNCharEmbedding
from fastNLP import Vocabulary

vocab = Vocabulary()
vocab.add_word_lst("this is a demo .".split())

word_embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')
char_embed = CNNCharEmbedding(vocab, embed_size=64, char_emb_size=50)
embed = StackEmbedding([word_embed, char_embed])

words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo .".split()]])
print(embed(words).size()) # 输出embedding的维度为50+64=114

输出为::

torch.Size([1, 5, 114])

:class:`~fastNLP.embeddings.StaticEmbedding` , :class:`~fastNLP.embeddings.ElmoEmbedding` ,
:class:`~fastNLP.embeddings.CNNCharEmbedding` , :class:`~fastNLP.embeddings.BertEmbedding` 等都可以互相拼接。
:class:`~fastNLP.embeddings.StackEmbedding` 的使用和其它Embedding是一致的,即输入index,返回对应的表示。但能够拼接起来的Embedding
必须使用同样的 :class:`~fastNLP.Vocabulary` ,因为只有使用同样的 :class:`~fastNLP.Vocabulary` 才能保证同一个index指向的是同一个词或字。



Part VIII: Embedding的其它说明
-----------------------------------------------------------

(1) 获取各种Embedding的dimension

.. code-block:: python

from fastNLP.embeddings import *

vocab = Vocabulary()
vocab.add_word_lst("this is a demo .".split())

static_embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')
print(static_embed.embedding_dim) # 50
char_embed = CNNCharEmbedding(vocab, embed_size=30)
print(char_embed.embedding_dim) # 30
elmo_embed_1 = ElmoEmbedding(vocab, model_dir_or_name='en-small', layers='2')
print(elmo_embed_1.embedding_dim) # 256
elmo_embed_2 = ElmoEmbedding(vocab, model_dir_or_name='en-small', layers='1,2')
print(elmo_embed_2.embedding_dim) # 512
bert_embed_1 = BertEmbedding(vocab, layers='-1', model_dir_or_name='en-base-cased')
print(bert_embed_1.embedding_dim) # 768
bert_embed_2 = BertEmbedding(vocab, layers='2,-1', model_dir_or_name='en-base-cased')
print(bert_embed_2.embedding_dim) # 1536
stack_embed = StackEmbedding([static_embed, char_embed])
print(stack_embed.embedding_dim) # 80

(2) 设置Embedding的权重是否更新

.. code-block:: python

from fastNLP.embeddings import *

vocab = Vocabulary()
vocab.add_word_lst("this is a demo .".split())

embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', requires_grad=True) # 初始化时设定为需要更新
embed.requires_grad = False # 修改BertEmbedding的权重为不更新

(3) 各种Embedding中word_dropout与dropout的说明

fastNLP中所有的Embedding都支持传入word_dropout和dropout参数,word_dropout指示的是以多大概率将输入的word置为unk的index,这样既可以
使得unk得到训练,也可以有一定的regularize效果; dropout参数是在获取到word的表示之后,以多大概率将一些维度的表示置为0。

如果使用 :class:`~fastNLP.embeddings.StackEmbedding` 且需要用到word_dropout,建议将word_dropout设置在 :class:`~fastNLP.embeddings.StackEmbedding` 上。
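
下面的代码演示了这两个参数的传入方式(取值仅为示意):

.. code-block:: python

from fastNLP.embeddings import StaticEmbedding, CNNCharEmbedding, StackEmbedding
from fastNLP import Vocabulary

vocab = Vocabulary()
vocab.add_word_lst("this is a demo .".split())

word_embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d', dropout=0.5)
char_embed = CNNCharEmbedding(vocab, embed_size=64, char_emb_size=50)
# 按照上面的建议,word_dropout设置在StackEmbedding上
embed = StackEmbedding([word_embed, char_embed], word_dropout=0.01)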



Part IX: StaticEmbedding的使用建议
-----------------------------------------------------------

在英文的命名实体识别(NER)任务中,由 `Named Entity Recognition with Bidirectional LSTM-CNNs <http://xxx.itp.ac.cn/pdf/1511.08308.pdf>`_ 指出,同时使用cnn character embedding和word embedding
会使得NER的效果有比较大的提升。正如你在上节中看到的那样,fastNLP支持将 :class:`~fastNLP.embeddings.CNNCharEmbedding`
与 :class:`~fastNLP.embeddings.StaticEmbedding` 拼成一个 :class:`~fastNLP.embeddings.StackEmbedding` 。如果通过这种方式使用,需要
在预处理文本时不要将词汇小写化(因为Character Embedding需要利用词语中的大小写信息),且不要将出现频次低于某个阈值的word设置为unk(因为
Character Embedding需要利用字形信息);但 :class:`~fastNLP.embeddings.StaticEmbedding` 使用的某些预训练词嵌入的词汇表中只有小写的词
语,且某些低频词并未在预训练中出现,需要被剔除。也就是说:(1) character embedding需要保留大小写,而预训练词向量不需要;
(2) character embedding需要保留所有的字形,而static embedding需要设置一个最低频次阈值以学到更好的表示。

(1) fastNLP如何解决关于大小写的问题

fastNLP通过在 :class:`~fastNLP.embeddings.StaticEmbedding` 增加了一个lower参数解决该问题。如下面的例子所示

.. code-block:: python

from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary

vocab = Vocabulary().add_word_lst("The the a A".split())
# 下面用随机的StaticEmbedding演示,但与使用预训练词向量时效果是一致的
embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5)
print(embed(torch.LongTensor([vocab.to_index('The')])))
print(embed(torch.LongTensor([vocab.to_index('the')])))

输出为::

tensor([[-0.4685, 0.4572, 0.5159, -0.2618, -0.6871]], grad_fn=<EmbeddingBackward>)
tensor([[ 0.2615, 0.1490, -0.2491, 0.4009, -0.3842]], grad_fn=<EmbeddingBackward>)

可以看到"The"与"the"的vector是不一致的。但如果我们在初始化 :class:`~fastNLP.embeddings.StaticEmbedding` 将lower设置为True,效果将
如下所示

.. code-block:: python

from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary

vocab = Vocabulary().add_word_lst("The the a A".split())
# 下面用随机的StaticEmbedding演示,但与使用预训练时效果是一致的
embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5, lower=True)
print(embed(torch.LongTensor([vocab.to_index('The')])))
print(embed(torch.LongTensor([vocab.to_index('the')])))

输出为::

tensor([[-0.2237, 0.6825, -0.3459, -0.1795, 0.7516]], grad_fn=<EmbeddingBackward>)
tensor([[-0.2237, 0.6825, -0.3459, -0.1795, 0.7516]], grad_fn=<EmbeddingBackward>)

可以看到"The"与"the"的vector是一致的。他们实际上也是引用的同一个vector。通过将lower设置为True,可以在 :class:`~fastNLP.embeddings.StaticEmbedding`
实现类似具备相同小写结果的词语引用同一个vector。

(2) fastNLP如何解决min_freq的问题

fastNLP通过在 :class:`~fastNLP.embeddings.StaticEmbedding` 增加了一个min_freq参数解决该问题。如下面的例子所示

.. code-block:: python

from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary

vocab = Vocabulary().add_word_lst("the the the a".split())
# 下面用随机的StaticEmbedding演示,但与使用预训练时效果是一致的
embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5, min_freq=2)
print(embed(torch.LongTensor([vocab.to_index('the')])))
print(embed(torch.LongTensor([vocab.to_index('a')])))
print(embed(torch.LongTensor([vocab.unknown_idx])))

输出为::

tensor([[ 0.0454, 0.3375, 0.6758, -0.2026, -0.4715]], grad_fn=<EmbeddingBackward>)
tensor([[-0.7602, 0.0149, 0.2733, 0.3974, 0.7371]], grad_fn=<EmbeddingBackward>)
tensor([[-0.7602, 0.0149, 0.2733, 0.3974, 0.7371]], grad_fn=<EmbeddingBackward>)

其中最后一行为unknown值的vector,可以看到a的vector表示与unknown是一样的,这是由于a的频次低于了2,所以被指向了unknown的表示;而the由于
词频超过了2次,所以它是单独的表示。

在计算min_freq时,也会考虑到lower的作用,比如

.. code-block:: python

from fastNLP.embeddings import StaticEmbedding
from fastNLP import Vocabulary

vocab = Vocabulary().add_word_lst("the the the a A".split())
# 下面用随机的StaticEmbedding演示,但与使用预训练时效果是一致的
embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5, min_freq=2, lower=True)
print(embed(torch.LongTensor([vocab.to_index('the')])))
print(embed(torch.LongTensor([vocab.to_index('a')])))
print(embed(torch.LongTensor([vocab.to_index('A')])))
print(embed(torch.LongTensor([vocab.unknown_idx])))

输出为::

tensor([[-0.7453, -0.5542, 0.5039, 0.6195, -0.4723]], grad_fn=<EmbeddingBackward>) # the
tensor([[ 0.0170, -0.0995, -0.5743, -0.2469, -0.2095]], grad_fn=<EmbeddingBackward>) # a
tensor([[ 0.0170, -0.0995, -0.5743, -0.2469, -0.2095]], grad_fn=<EmbeddingBackward>) # A
tensor([[ 0.6707, -0.5786, -0.6967, 0.0111, 0.1209]], grad_fn=<EmbeddingBackward>) # unk

可以看到a不再和最后一行的unknown共享一个表示了,这是由于a与A都算入了a的词频,且A的表示也是a的表示。


----------------------------------
代码下载
----------------------------------

.. raw:: html

<a href="../_static/notebooks/tutorial_3_embedding.ipynb" download="tutorial_3_embedding.ipynb">点击下载 IPython Notebook 文件</a><hr>

+ 0
- 219
docs/source/tutorials/tutorial_4_load_dataset.rst View File

@@ -1,219 +0,0 @@
=======================================
使用Loader和Pipe加载并处理数据集
=======================================

这一部分是关于如何加载数据集的教程

教程目录:

- `Part I: 数据集容器DataBundle`_
- `Part II: 加载的各种数据集的Loader`_
- `Part III: 使用Pipe对数据集进行预处理`_
- `Part IV: fastNLP封装好的Loader和Pipe`_
- `Part V: 不同格式类型的基础Loader`_


Part I: 数据集容器DataBundle
------------------------------------

由于对于同一个任务,训练集、验证集和测试集会共用同一个词表以及具有相同的目标值,所以在fastNLP中我们使用了 :class:`~fastNLP.io.DataBundle`
来承载同一个任务的多个数据集 :class:`~fastNLP.DataSet` 以及它们的词表 :class:`~fastNLP.Vocabulary` 。下面会有例子介绍 :class:`~fastNLP.io.DataBundle`
的相关使用。

:class:`~fastNLP.io.DataBundle` 在fastNLP中主要在各个 :class:`~fastNLP.io.Loader` 和 :class:`~fastNLP.io.Pipe` 中被使用。
下面我们先介绍一下 :class:`~fastNLP.io.Loader` 和 :class:`~fastNLP.io.Pipe` 。

Part II: 加载的各种数据集的Loader
-------------------------------------

在fastNLP中,所有的 :class:`~fastNLP.io.Loader` 都可以通过其文档判断其支持读取的数据格式,以及读取之后返回的 :class:`~fastNLP.DataSet` 的格式,
例如 :class:`~fastNLP.io.ChnSentiCorpLoader` 。

- **download()** 函数:自动将该数据集下载到缓存地址,默认缓存地址为~/.fastNLP/datasets/。由于版权等原因,不是所有的Loader都实现了该方法。该方法会返回下载后文件所处的缓存地址。
- **_load()** 函数:从一个数据文件中读取数据,返回一个 :class:`~fastNLP.DataSet` 。返回的DataSet的格式可从Loader文档判断。
- **load()** 函数:从文件或者文件夹中读取数据为 :class:`~fastNLP.DataSet` 并将它们组装成 :class:`~fastNLP.io.DataBundle`。支持接受的参数类型有以下的几种

- None, 将尝试读取自动缓存的数据,仅支持提供了自动下载数据的Loader
- 文件夹路径, 默认将尝试在该文件夹下匹配文件名中含有 `train` , `test` , `dev` 的文件,如果有多个文件含有相同的关键字,将无法通过该方式读取
- dict, 例如{'train':"/path/to/tr.conll", 'dev':"/to/validate.conll", "test":"/to/te.conll"}(用法见下面的示意)。
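
下面给出一个通过 dict 显式指定文件路径的示意(路径均为假设值):

.. code-block:: python

from fastNLP.io import CWSLoader

paths = {'train': '/path/to/train.txt',  # 假设的本地文件路径
         'dev': '/path/to/dev.txt',
         'test': '/path/to/test.txt'}
data_bundle = CWSLoader().load(paths)

若 Loader 支持自动下载(如下例中的 :class:`~fastNLP.io.CWSLoader` ),也可以不传参数直接调用 load():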

.. code-block:: python

from fastNLP.io import CWSLoader

loader = CWSLoader(dataset_name='pku')
data_bundle = loader.load()
print(data_bundle)

输出内容为::

In total 3 datasets:
dev has 1831 instances.
train has 17223 instances.
test has 1944 instances.

这里表示一共有3个数据集。其中:

- 3个数据集的名称分别为train、dev、test,分别有17223、1831、1944个instance

也可以取出DataSet,并打印DataSet中的具体内容

.. code-block:: python

tr_data = data_bundle.get_dataset('train')
print(tr_data[:2])

输出为::

+--------------------------------------------------------------------------------------+
| raw_words |
+--------------------------------------------------------------------------------------+
| 迈向 充满 希望 的 新 世纪 —— 一九九八年 新年 讲话 ( 附 图片 1 张 ) |
| 中共中央 总书记 、 国家 主席 江 泽民 |
+--------------------------------------------------------------------------------------+

Part III: 使用Pipe对数据集进行预处理
------------------------------------------
通过 :class:`~fastNLP.io.Loader` 可以将文本数据读入,但并不能直接被神经网络使用,还需要进行一定的预处理。

在fastNLP中,我们使用 :class:`~fastNLP.io.Pipe` 的子类作为数据预处理的类, :class:`~fastNLP.io.Loader` 和 :class:`~fastNLP.io.Pipe` 一般具备一一对应的关系,该关系可以从其名称判断,
例如 :class:`~fastNLP.io.CWSLoader` 与 :class:`~fastNLP.io.CWSPipe` 是一一对应的。一般情况下Pipe的处理包含以下几个过程:(1)将raw_words或
raw_chars进行tokenize以切分成不同的词或字;(2)建立词或字的 :class:`~fastNLP.Vocabulary` ,并将词或字转换为index;(3)将target
列建立词表并将target列转为index。

所有的Pipe都可通过其文档查看该Pipe支持处理的 :class:`~fastNLP.DataSet` 以及返回的 :class:`~fastNLP.io.DataBundle` 中的Vocabulary的情况;
如 :class:`~fastNLP.io.OntoNotesNERPipe` 。

各种数据集的Pipe当中,都包含了以下的两个函数:

- process() 函数:对输入的 :class:`~fastNLP.io.DataBundle` 进行处理, 然后返回处理之后的 :class:`~fastNLP.io.DataBundle` 。process函数的文档中包含了该Pipe支持处理的DataSet的格式。
- process_from_file() 函数:输入数据集所在文件夹,使用对应的Loader读取数据(所以该函数支持的参数类型是由其对应的Loader的load函数决定的),然后调用相对应的process函数对数据进行预处理。相当于是把load和process放在一个函数中执行。

接着上面 :class:`~fastNLP.io.CWSLoader` 的例子,我们展示一下 :class:`~fastNLP.io.CWSPipe` 的功能:

.. code-block:: python

from fastNLP.io import CWSPipe

data_bundle = CWSPipe().process(data_bundle)
print(data_bundle)

输出内容为::

In total 3 datasets:
dev has 1831 instances.
train has 17223 instances.
test has 1944 instances.
In total 2 vocabs:
chars has 4777 entries.
target has 4 entries.

表示一共有3个数据集和2个词表。其中:

- 3个数据集的名称分别为train、dev、test,分别有17223、1831、1944个instance
- 2个词表分别为chars词表与target词表。其中chars词表为句子文本所构建的词表,一共有4777个不同的字;target词表为目标标签所构建的词表,一共有4种标签。

相较于之前CWSLoader读取的DataBundle,新增了两个Vocabulary。 我们可以打印一下处理之后的DataSet

.. code-block:: python

tr_data = data_bundle.get_dataset('train')
print(tr_data[:2])

输出为::

+---------------------------------------------------+------------------------------------+------------------------------------+---------+
| raw_words | chars | target | seq_len |
+---------------------------------------------------+------------------------------------+------------------------------------+---------+
| 迈向 充满 希望 的 新 世纪 —— 一九九八年... | [1224, 178, 674, 544, 573, 435,... | [0, 1, 0, 1, 0, 1, 2, 2, 0, 1, ... | 29 |
| 中共中央 总书记 、 国家 主席 江 泽民 | [11, 212, 11, 335, 124, 256, 10... | [0, 3, 3, 1, 0, 3, 1, 2, 0, 1, ... | 15 |
+---------------------------------------------------+------------------------------------+------------------------------------+---------+

可以看到有两列为int的field: chars和target。这两列的名称同时也是DataBundle中的Vocabulary的名称。可以通过下列的代码获取并查看Vocabulary的
信息

.. code-block:: python

vocab = data_bundle.get_vocab('target')
print(vocab)

输出为::

Vocabulary(['B', 'E', 'S', 'M']...)


Part IV: fastNLP封装好的Loader和Pipe
------------------------------------------

fastNLP封装了多种任务/数据集的 :class:`~fastNLP.io.Loader` 和 :class:`~fastNLP.io.Pipe` 并提供自动下载功能,具体参见文档
`数据集 <https://docs.qq.com/sheet/DVnpkTnF6VW9UeXdh?c=A1A0A0>`_


Part V: 不同格式类型的基础Loader
--------------------------------------------------------

除了上面提到的针对具体任务的Loader,我们还提供了CSV格式和JSON格式的Loader

:class:`~fastNLP.io.loader.CSVLoader` 读取CSV类型的数据集文件。例子如下:

.. code-block:: python

from fastNLP.io.loader import CSVLoader
data_set_loader = CSVLoader(
headers=('raw_words', 'target'), sep='\t'
)
# 表示将CSV文件中每一行的第一项填入'raw_words' field,第二项填入'target' field。
# 其中项之间由'\t'分割开来

data_set = data_set_loader._load('path/to/your/file')

文件内容样例如下 ::

But it does not leave you with much . 1
You could hate it for the same reason . 1
The performances are an absolute joy . 4

读取之后的DataSet具有以下的field

.. csv-table::
:header: raw_words, target

"But it does not leave you with much .", "1"
"You could hate it for the same reason .", "1"
"The performances are an absolute joy .", "4"

:class:`~fastNLP.io.JsonLoader` 读取Json类型的数据集文件,数据必须按行存储,每行是一个包含各类属性的Json对象。例子如下:

.. code-block:: python

from fastNLP.io.loader import JsonLoader
loader = JsonLoader(
fields={'sentence1': 'raw_words1', 'sentence2': 'raw_words2', 'gold_label': 'target'}
)
# 表示将Json对象中'sentence1'、'sentence2'和'gold_label'对应的值赋给'raw_words1'、'raw_words2'、'target'这三个fields

data_set = loader._load('path/to/your/file')

数据集内容样例如下 ::

{"annotator_labels": ["neutral"], "captionID": "3416050480.jpg#4", "gold_label": "neutral", "pairID": "3416050480.jpg#4r1n", "sentence1": "A person on a horse jumps over a broken down airplane.", "sentence1_binary_parse": "( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))", "sentence2": "A person is training his horse for a competition.", "sentence2_binary_parse": "( ( A person ) ( ( is ( ( training ( his horse ) ) ( for ( a competition ) ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (VP (VBG training) (NP (PRP$ his) (NN horse)) (PP (IN for) (NP (DT a) (NN competition))))) (. .)))"}
{"annotator_labels": ["contradiction"], "captionID": "3416050480.jpg#4", "gold_label": "contradiction", "pairID": "3416050480.jpg#4r1c", "sentence1": "A person on a horse jumps over a broken down airplane.", "sentence1_binary_parse": "( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))", "sentence2": "A person is at a diner, ordering an omelette.", "sentence2_binary_parse": "( ( A person ) ( ( ( ( is ( at ( a diner ) ) ) , ) ( ordering ( an omelette ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (PP (IN at) (NP (DT a) (NN diner))) (, ,) (S (VP (VBG ordering) (NP (DT an) (NN omelette))))) (. .)))"}
{"annotator_labels": ["entailment"], "captionID": "3416050480.jpg#4", "gold_label": "entailment", "pairID": "3416050480.jpg#4r1e", "sentence1": "A person on a horse jumps over a broken down airplane.", "sentence1_binary_parse": "( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))", "sentence2": "A person is outdoors, on a horse.", "sentence2_binary_parse": "( ( A person ) ( ( ( ( is outdoors ) , ) ( on ( a horse ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (ADVP (RB outdoors)) (, ,) (PP (IN on) (NP (DT a) (NN horse)))) (. .)))"}

读取之后的DataSet具有以下的field

.. csv-table::
:header: raw_words1, raw_words2, target

"A person on a horse jumps over a broken down airplane.", "A person is training his horse for a competition.", "neutral"
"A person on a horse jumps over a broken down airplane.", "A person is at a diner, ordering an omelette.", "contradiction"
"A person on a horse jumps over a broken down airplane.", "A person is outdoors, on a horse.", "entailment"


----------------------------------
代码下载
----------------------------------

.. raw:: html

<a href="../_static/notebooks/tutorial_4_load_dataset.ipynb" download="tutorial_4_load_dataset.ipynb">点击下载 IPython Notebook 文件</a><hr>

+ 0
- 248
docs/source/tutorials/tutorial_5_loss_optimizer.rst View File

@@ -1,248 +0,0 @@
==============================================================================
使用Trainer和Tester快速训练和测试
==============================================================================

我们使用前面介绍过的 :doc:`/tutorials/文本分类` 任务来进行详细的介绍。这里我们把数据集换成了SST2,使用 :class:`~fastNLP.Trainer` 和 :class:`~fastNLP.Tester` 来进行快速训练和测试。

.. note::

本教程中的代码没有使用 GPU 。读者可以自行修改代码,扩大数据量并使用 GPU 进行训练。

数据读入和处理
-----------------

数据读入
我们可以使用 fastNLP :mod:`fastNLP.io` 模块中的 :class:`~fastNLP.io.SST2Pipe` 类,轻松地读取以及预处理SST2数据集。:class:`~fastNLP.io.SST2Pipe` 对象的
:meth:`~fastNLP.io.SST2Pipe.process_from_file` 方法能够对读入的SST2数据集进行数据的预处理,方法的参数为paths, 指要处理的文件所在目录,如果paths为None,则会自动下载数据集,函数默认paths值为None。
此函数返回一个 :class:`~fastNLP.io.DataBundle`,包含SST2数据集的训练集、测试集、验证集以及source端和target端的字典。其训练、测试、验证数据集含有四个 :mod:`~fastNLP.core.field` :

* raw_words: 原source句子
* target: 标签值
* words: index之后的raw_words
* seq_len: 句子长度

读入数据代码如下:

.. code-block:: python

from fastNLP.io import SST2Pipe
pipe = SST2Pipe()
databundle = pipe.process_from_file()
vocab = databundle.get_vocab('words')
print(databundle)
print(databundle.get_dataset('train')[0])
print(databundle.get_vocab('words'))


输出数据如下::

In total 3 datasets:
test has 1821 instances.
train has 67349 instances.
dev has 872 instances.
In total 2 vocabs:
words has 16293 entries.
target has 2 entries.

+-------------------------------------------+--------+--------------------------------------+---------+
| raw_words | target | words | seq_len |
+-------------------------------------------+--------+--------------------------------------+---------+
| hide new secretions from the parental ... | 1 | [4111, 98, 12010, 38, 2, 6844, 9042] | 7 |
+-------------------------------------------+--------+--------------------------------------+---------+
Vocabulary(['hide', 'new', 'secretions', 'from', 'the']...)

除了可以对数据进行读入的Pipe类,fastNLP还提供了读入和下载数据的Loader类,不同数据集的Pipe和Loader及其用法详见 :doc:`/tutorials/tutorial_4_load_dataset` 。
数据集分割
由于SST2数据集的测试集并不带有标签数值,故我们分割出一部分训练集作为测试集。下面这段代码展示了 :meth:`~fastNLP.DataSet.split` 的使用方法,
为了能让读者快速运行完整个教程,我们只取了训练集的前5000个数据。

.. code-block:: python

train_data = databundle.get_dataset('train')[:5000]
train_data, test_data = train_data.split(0.015)
dev_data = databundle.get_dataset('dev')
print(len(train_data),len(dev_data),len(test_data))

输出结果为::

4925 872 75

数据集 :meth:`~fastNLP.DataSet.set_input` 和 :meth:`~fastNLP.DataSet.set_target` 函数
:class:`~fastNLP.io.SST2Pipe` 类的 :meth:`~fastNLP.io.SST2Pipe.process_from_file` 方法在预处理过程中还将训练、测试、验证
集的 `words` 、`seq_len` :mod:`~fastNLP.core.field` 设定为input,同时将 `target` :mod:`~fastNLP.core.field` 设定
为target。我们可以通过 :class:`~fastNLP.core.Dataset` 类的 :meth:`~fastNLP.core.Dataset.print_field_meta` 方法查看各个 :mod:`~fastNLP.core.field` 的设定情况,代码如下:

.. code-block:: python

train_data.print_field_meta()

输出结果为::

+-------------+-----------+--------+-------+---------+
| field_names | raw_words | target | words | seq_len |
+-------------+-----------+--------+-------+---------+
| is_input | False | False | True | True |
| is_target | False | True | False | False |
| ignore_type | | False | False | False |
| pad_value | | 0 | 0 | 0 |
+-------------+-----------+--------+-------+---------+

其中is_input和is_target分别表示是否为input和target。ignore_type为true时指使用 :class:`~fastNLP.DataSetIter` 取出batch数
据时fastNLP不会进行自动padding,pad_value指对应 :mod:`~fastNLP.core.field` padding所用的值,这两者只有
当 :mod:`~fastNLP.core.field` 设定为input或者target的时候才有存在的意义。

is_input为true的 :mod:`~fastNLP.core.field` 在 :class:`~fastNLP.DataSetIter` 迭代取出的batch_x 中,而is_target为true
的 :mod:`~fastNLP.core.field` 在 :class:`~fastNLP.DataSetIter` 迭代取出的 batch_y 中。
具体分析见 :doc:`使用DataSetIter实现自定义训练过程 </tutorials/tutorial_6_datasetiter>` 。
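
顺带一提,如果您的数据没有经过 :class:`~fastNLP.io.SST2Pipe` 这类Pipe的处理,各个field默认不会被设置为input或target。下面是一段示意代码,展示如何手动调用这两个函数(仅作示例,本例中SST2Pipe其实已经完成了这些设置):

.. code-block:: python

    # 将words和seq_len设置为input,将target设置为target
    # set_input/set_target均可一次接收多个field名
    train_data.set_input('words', 'seq_len')
    train_data.set_target('target')
    train_data.print_field_meta()  # 再次查看各field的设定情况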

使用内置模型训练
---------------------
模型定义和初始化
我们可以导入 fastNLP 内置的文本分类模型 :class:`~fastNLP.models.CNNText` 来对模型进行定义,代码如下:

.. code-block:: python

from fastNLP.models import CNNText

#词嵌入的维度
EMBED_DIM = 100

#使用CNNText的时候第一个参数输入一个tuple,作为模型定义embedding的参数
#还可以传入 kernel_nums, kernel_sizes, padding, dropout的自定义值
model_cnn = CNNText((len(vocab),EMBED_DIM), num_classes=2, dropout=0.1)

使用fastNLP快速搭建自己的模型详见 :doc:`/tutorials/tutorial_8_modules_models` 。

评价指标
训练模型需要提供一个评价指标。这里使用准确率做为评价指标。

* ``pred`` 参数对应的是模型的 forward 方法返回的 dict 中的一个 key 的名字。
* ``target`` 参数对应的是 :class:`~fastNLP.DataSet` 中作为标签的 :mod:`~fastNLP.core.field` 的名字。

这里我们用 :class:`~fastNLP.Const` 来辅助命名,如果你自己编写模型中 forward 方法的返回值或
数据集中 :mod:`~fastNLP.core.field` 的名字与本例不同, 你可以把 ``pred`` 参数和 ``target`` 参数设定符合自己代码的值。代码如下:

.. code-block:: python

from fastNLP import AccuracyMetric
from fastNLP import Const
# metrics=AccuracyMetric() 在本例中与下面这行代码等价
metrics=AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET)

损失函数
训练模型还需要提供一个损失函数。fastNLP中提供了四种可以直接导入使用的loss:

* :class:`~fastNLP.CrossEntropyLoss`:包装了torch.nn.functional.cross_entropy()函数,返回交叉熵损失(可以运用于多分类场景)
* :class:`~fastNLP.BCELoss`:包装了torch.nn.functional.binary_cross_entropy()函数,返回二分类的交叉熵
* :class:`~fastNLP.L1Loss`:包装了torch.nn.functional.l1_loss()函数,返回L1损失
* :class:`~fastNLP.NLLLoss`:包装了torch.nn.functional.nll_loss()函数,返回负对数似然损失

下面提供了一个在分类问题中常用的交叉熵损失。注意它的 **初始化参数** 。

* ``pred`` 参数对应的是模型的 forward 方法返回的 dict 中的一个 key 的名字。
* ``target`` 参数对应的是 :class:`~fastNLP.DataSet` 中作为标签的 :mod:`~fastNLP.core.field` 的名字。

这里我们用 :class:`~fastNLP.Const` 来辅助命名,如果你自己编写模型中 forward 方法的返回值或
数据集中 :mod:`~fastNLP.core.field` 的名字与本例不同, 你可以把 ``pred`` 参数和 ``target`` 参数设定符合自己代码的值。

.. code-block:: python

from fastNLP import CrossEntropyLoss
# loss = CrossEntropyLoss() 在本例中与下面这行代码等价
loss = CrossEntropyLoss(pred=Const.OUTPUT, target=Const.TARGET)

除了使用fastNLP已经包装好的损失函数,也可以通过fastNLP中的LossFunc类来构建自己的损失函数,方法如下:

.. code-block:: python

# 这表示构建了一个损失函数类,由func计算损失函数,其中将从模型返回值或者DataSet的target=True的field
# 当中找到一个参数名为`pred`的参数传入func一个参数名为`input`的参数;找到一个参数名为`label`的参数
# 传入func作为一个名为`target`的参数
#下面自己构建了一个交叉熵函数,和之后直接使用fastNLP中的交叉熵函数是一个效果
import torch
from fastNLP import LossFunc
func = torch.nn.functional.cross_entropy
loss_func = LossFunc(func, input=Const.OUTPUT, target=Const.TARGET)

优化器
定义模型运行时使用的优化器,可以直接使用 torch.optim 中的优化器,并在实例化 :class:`~fastNLP.Trainer` 类的时候传入优化器实参。

.. code-block:: python

import torch.optim as optim

#使用 torch.optim 定义优化器
optimizer=optim.RMSprop(model_cnn.parameters(), lr=0.01, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False)

快速训练
现在我们对上面定义的模型使用 :class:`~fastNLP.Trainer` 进行训练。
除了使用 :class:`~fastNLP.Trainer` 进行训练,我们也可以通过使用 :class:`~fastNLP.DataSetIter` 来编写自己的训练过程,具体见 :doc:`/tutorials/tutorial_6_datasetiter` 。

.. code-block:: python

from fastNLP import Trainer
#训练的轮数和batch size
N_EPOCHS = 10
BATCH_SIZE = 16

#如果在定义trainer的时候没有传入optimizer参数,模型默认的优化器为torch.optim.Adam且learning rate为lr=4e-3
#这里只使用了loss作为损失函数输入,感兴趣可以尝试其他损失函数(如之前自定义的loss_func)作为输入
trainer = Trainer(model=model_cnn, train_data=train_data, dev_data=dev_data, loss=loss, metrics=metrics,
optimizer=optimizer,n_epochs=N_EPOCHS, batch_size=BATCH_SIZE)
trainer.train()

训练过程的输出如下::

input fields after batch(if batch size is 2):
words: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 13])
seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2])
target fields after batch(if batch size is 2):
target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2])

training epochs started 2020-02-26-16-45-40
Evaluate data in 0.5 seconds!
Evaluation on dev at Epoch 1/10. Step:308/3080:
AccuracyMetric: acc=0.677752

......

Evaluate data in 0.44 seconds!
Evaluation on dev at Epoch 10/10. Step:3080/3080:
AccuracyMetric: acc=0.725917


In Epoch:5/Step:1540, got best dev performance:
AccuracyMetric: acc=0.740826
Reloaded the best model.

快速测试
与 :class:`~fastNLP.Trainer` 对应,fastNLP 也提供了 :class:`~fastNLP.Tester` 用于快速测试,用法如下

.. code-block:: python

from fastNLP import Tester

tester = Tester(test_data, model_cnn, metrics=AccuracyMetric())
tester.test()

测试过程输出如下::

Evaluate data in 0.43 seconds!
[tester]
AccuracyMetric: acc=0.773333

----------------------------------
代码下载
----------------------------------

.. raw:: html

<a href="../_static/notebooks/tutorial_5_loss_optimizer.ipynb" download="tutorial_5_loss_optimizer.ipynb">点击下载 IPython Notebook 文件</a><hr>

+ 0
- 423
docs/source/tutorials/tutorial_6_datasetiter.rst View File

@@ -1,423 +0,0 @@
==============================================================================
使用DataSetIter实现自定义训练过程
==============================================================================

我们使用前面介绍过的 :doc:`/tutorials/文本分类` 任务来进行详细的介绍。这里我们把数据集换成了SST2,使用 :class:`~fastNLP.DataSetIter` 类来编写自己的训练过程。
DataSetIter初探之前的内容与 :doc:`/tutorials/tutorial_5_loss_optimizer` 中的完全一样,如已经阅读过可以跳过。

.. note::

本教程中的代码没有使用 GPU 。读者可以自行修改代码,扩大数据量并使用 GPU 进行训练。

数据读入和预处理
--------------------

数据读入
我们可以使用 fastNLP :mod:`fastNLP.io` 模块中的 :class:`~fastNLP.io.SST2Pipe` 类,轻松地读取以及预处理SST2数据集。:class:`~fastNLP.io.SST2Pipe` 对象的
:meth:`~fastNLP.io.SST2Pipe.process_from_file` 方法能够对读入的SST2数据集进行数据的预处理,方法的参数为paths, 指要处理的文件所在目录,如果paths为None,则会自动下载数据集,函数默认paths值为None。
此函数返回一个 :class:`~fastNLP.io.DataBundle`,包含SST2数据集的训练集、测试集、验证集以及source端和target端的字典。其训练、测试、验证数据集含有四个 :mod:`~fastNLP.core.field` :

* raw_words: 原source句子
* target: 标签值
* words: index之后的raw_words
* seq_len: 句子长度

读入数据代码如下:

.. code-block:: python

from fastNLP.io import SST2Pipe
pipe = SST2Pipe()
databundle = pipe.process_from_file()
vocab = databundle.get_vocab('words')
print(databundle)
print(databundle.get_dataset('train')[0])
print(databundle.get_vocab('words'))


输出数据如下::

In total 3 datasets:
test has 1821 instances.
train has 67349 instances.
dev has 872 instances.
In total 2 vocabs:
words has 16293 entries.
target has 2 entries.

+-------------------------------------------+--------+--------------------------------------+---------+
| raw_words | target | words | seq_len |
+-------------------------------------------+--------+--------------------------------------+---------+
| hide new secretions from the parental ... | 1 | [4111, 98, 12010, 38, 2, 6844, 9042] | 7 |
+-------------------------------------------+--------+--------------------------------------+---------+
Vocabulary(['hide', 'new', 'secretions', 'from', 'the']...)

除了可以对数据进行读入的Pipe类,fastNLP还提供了读入和下载数据的Loader类,不同数据集的Pipe和Loader及其用法详见 :doc:`/tutorials/tutorial_4_load_dataset` 。
数据集分割
由于SST2数据集的测试集并不带有标签数值,故我们分割出一部分训练集作为测试集。下面这段代码展示了 :meth:`~fastNLP.DataSet.split` 的使用方法,
为了能让读者快速运行完整个教程,我们只取了训练集的前5000个数据。

.. code-block:: python

train_data = databundle.get_dataset('train')[:5000]
train_data, test_data = train_data.split(0.015)
dev_data = databundle.get_dataset('dev')
print(len(train_data),len(dev_data),len(test_data))

输出结果为::

4925 872 75

数据集 :meth:`~fastNLP.DataSet.set_input` 和 :meth:`~fastNLP.DataSet.set_target` 函数
:class:`~fastNLP.io.SST2Pipe` 类的 :meth:`~fastNLP.io.SST2Pipe.process_from_file` 方法在预处理过程中还将训练、测试、验证集
的 `words` 、`seq_len` :mod:`~fastNLP.core.field` 设定为input,同时将 `target` :mod:`~fastNLP.core.field` 设定为target。
我们可以通过 :class:`~fastNLP.core.Dataset` 类的 :meth:`~fastNLP.core.Dataset.print_field_meta` 方法查看各个
:mod:`~fastNLP.core.field` 的设定情况,代码如下:

.. code-block:: python

train_data.print_field_meta()

输出结果为::

+-------------+-----------+--------+-------+---------+
| field_names | raw_words | target | words | seq_len |
+-------------+-----------+--------+-------+---------+
| is_input | False | False | True | True |
| is_target | False | True | False | False |
| ignore_type | | False | False | False |
| pad_value | | 0 | 0 | 0 |
+-------------+-----------+--------+-------+---------+

其中is_input和is_target分别表示是否为input和target。ignore_type为true时指使用 :class:`~fastNLP.DataSetIter` 取出batch数
据时fastNLP不会进行自动padding,pad_value指对应 :mod:`~fastNLP.core.field` padding所用的值,这两者只有当
:mod:`~fastNLP.core.field` 设定为input或者target的时候才有存在的意义。

is_input为true的 :mod:`~fastNLP.core.field` 在 :class:`~fastNLP.DataSetIter` 迭代取出的 batch_x 中,
而 is_target为true的 :mod:`~fastNLP.core.field` 在 :class:`~fastNLP.DataSetIter` 迭代取出的 batch_y 中。
具体分析见下面DataSetIter的介绍过程。


评价指标
训练模型需要提供一个评价指标。这里使用准确率做为评价指标。

* ``pred`` 参数对应的是模型的 forward 方法返回的 dict 中的一个 key 的名字。
* ``target`` 参数对应的是 :class:`~fastNLP.DataSet` 中作为标签的 :mod:`~fastNLP.core.field` 的名字。

这里我们用 :class:`~fastNLP.Const` 来辅助命名,如果你自己编写模型中 forward 方法的返回值或
数据集中 :mod:`~fastNLP.core.field` 的名字与本例不同, 你可以把 ``pred`` 参数和 ``target`` 参数设定符合自己代码的值。代码如下:

.. code-block:: python

from fastNLP import AccuracyMetric
from fastNLP import Const
# metrics=AccuracyMetric() 在本例中与下面这行代码等价
metrics=AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET)


DataSetIter初探
--------------------------

DataSetIter
fastNLP定义的 :class:`~fastNLP.DataSetIter` 类用于定义一个batch,并实现batch的多种功能,在初始化时传入的参数有:

* dataset: :class:`~fastNLP.DataSet` 对象, 数据集
* batch_size: 取出的batch大小
* sampler: 规定使用的 :class:`~fastNLP.Sampler` ,若为 None, 则使用 :class:`~fastNLP.RandomSampler` (Default: None)
* as_numpy: 若为 True, 输出batch为 `numpy.array`, 否则为 `torch.Tensor` (Default: False)
* prefetch: 若为 True, 使用多进程预先取出下一batch (Default: False)

sampler
fastNLP 实现的采样器有(构造示例见列表之后的代码片段):

* :class:`~fastNLP.BucketSampler` :可以随机地取出长度相似的元素【初始化参数: num_buckets:bucket的数量; batch_size:batch大小; seq_len_field_name:dataset中对应序列长度的 :mod:`~fastNLP.core.field` 的名字】
* :class:`~fastNLP.SequentialSampler` :顺序取出元素的采样器【无初始化参数】
* :class:`~fastNLP.RandomSampler` :随机化取元素的采样器【无初始化参数】
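
下面是一段示意代码,展示如何构造上述采样器并将其传入 :class:`~fastNLP.DataSetIter` (仅作示例,batch_size等取值为假设值):

.. code-block:: python

    from fastNLP import DataSetIter, BucketSampler, SequentialSampler, RandomSampler

    # BucketSampler:同一个batch内的句子长度相近,可以减少padding量
    bucket_sampler = BucketSampler(batch_size=16, seq_len_field_name='seq_len')
    # SequentialSampler与RandomSampler没有初始化参数
    seq_sampler = SequentialSampler()
    rand_sampler = RandomSampler()

    # 将采样器作为sampler参数传入DataSetIter即可生效
    data_iter = DataSetIter(dataset=train_data, batch_size=16, sampler=bucket_sampler)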

Padder
在fastNLP里,pad是与一个 :mod:`~fastNLP.core.field` 绑定的。即不同的 :mod:`~fastNLP.core.field` 可以使用不同的pad方式,比如在英文任务中word需要的pad和
character的pad方式往往是不同的。fastNLP中具体的pad操作是通过 :class:`~fastNLP.Padder` 及其子类来完成的。
默认情况下,所有field使用 :class:`~fastNLP.AutoPadder` ,大多数情况下直接使用 :class:`~fastNLP.AutoPadder` 就可以了。
如果 :class:`~fastNLP.AutoPadder` 或 :class:`~fastNLP.EngChar2DPadder` 无法满足需求,
也可以自己写一个 :class:`~fastNLP.Padder` 。
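
下面是一段示意代码,展示如何用 :meth:`~fastNLP.core.Dataset.set_padder` 为某个field更换Padder。其中 ``dataset`` 为任意 :class:`~fastNLP.DataSet` 对象, ``chars`` 是一个假设的、元素为字符列表的field名,请替换为自己数据中对应的field:

.. code-block:: python

    from fastNLP.core.field import EngChar2DPadder

    # EngChar2DPadder用于对二维的字符列表(如每个词对应一个字符list)进行padding
    # 'chars' 与 dataset 均为示意名称
    dataset.set_padder('chars', EngChar2DPadder(pad_val=0))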

DataSetIter自动padding
以下代码展示了DataSetIter的简单使用:

.. code-block:: python

from fastNLP import BucketSampler
from fastNLP import DataSetIter

tmp_data = dev_data[:10]
# 定义一个Batch,传入DataSet,规定batch_size和取batch的规则。
# 顺序(Sequential),随机(Random),相似长度组成一个batch(Bucket)
sampler = BucketSampler(batch_size=2, seq_len_field_name='seq_len')
batch = DataSetIter(batch_size=2, dataset=tmp_data, sampler=sampler)
for batch_x, batch_y in batch:
print("batch_x: ",batch_x)
print("batch_y: ", batch_y)

输出结果如下::

batch_x: {'words': tensor([[ 13, 830, 7746, 174, 3, 47, 6, 83, 5752, 15,
2177, 15, 63, 57, 406, 84, 1009, 4973, 27, 17,
13785, 3, 533, 3687, 15623, 39, 375, 8, 15624, 8,
1323, 4398, 7],
[ 1045, 11113, 16, 104, 5, 4, 176, 1824, 1704, 3,
2, 18, 11, 4, 1018, 432, 143, 33, 245, 308,
7, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0]]), 'seq_len': tensor([33, 21])}
batch_y: {'target': tensor([1, 0])}
batch_x: {'words': tensor([[ 14, 10, 4, 311, 5, 154, 1418, 609, 7],
[ 14, 10, 437, 32, 78, 3, 78, 437, 7]]), 'seq_len': tensor([9, 9])}
batch_y: {'target': tensor([0, 1])}
batch_x: {'words': tensor([[ 4, 277, 685, 18, 7],
[15618, 3204, 5, 1675, 0]]), 'seq_len': tensor([5, 4])}
batch_y: {'target': tensor([1, 1])}
batch_x: {'words': tensor([[ 2, 155, 3, 4426, 3, 239, 3, 739, 5, 1136,
41, 43, 2427, 736, 2, 648, 10, 15620, 2285, 7],
[ 24, 95, 28, 46, 8, 336, 38, 239, 8, 2133,
2, 18, 10, 15622, 1421, 6, 61, 5, 387, 7]]), 'seq_len': tensor([20, 20])}
batch_y: {'target': tensor([0, 0])}
batch_x: {'words': tensor([[ 879, 96, 8, 1026, 12, 8067, 11, 13623, 8, 15619,
4, 673, 662, 15, 4, 1154, 240, 639, 417, 7],
[ 45, 752, 327, 180, 10, 15621, 16, 72, 8904, 9,
1217, 7, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([20, 12])}
batch_y: {'target': tensor([0, 1])}


可以看到那些设定为input的 :mod:`~fastNLP.core.field` 都出现在batch_x中,而设定为target的 :mod:`~fastNLP.core.field` 则出现在batch_y中。同时对于同一个batch_x中的两个数据,长度偏短的那个会被自动padding到和长度偏长的句子长度一致,默认的padding值为0。

Dataset改变padding值
可以通过 :meth:`~fastNLP.core.Dataset.set_pad_val` 方法修改默认的pad值,代码如下:

.. code-block:: python

tmp_data.set_pad_val('words',-1)
batch = DataSetIter(batch_size=2, dataset=tmp_data, sampler=sampler)
for batch_x, batch_y in batch:
print("batch_x: ",batch_x)
print("batch_y: ", batch_y)

输出结果如下::

batch_x: {'words': tensor([[ 13, 830, 7746, 174, 3, 47, 6, 83, 5752, 15,
2177, 15, 63, 57, 406, 84, 1009, 4973, 27, 17,
13785, 3, 533, 3687, 15623, 39, 375, 8, 15624, 8,
1323, 4398, 7],
[ 1045, 11113, 16, 104, 5, 4, 176, 1824, 1704, 3,
2, 18, 11, 4, 1018, 432, 143, 33, 245, 308,
7, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1]]), 'seq_len': tensor([33, 21])}
batch_y: {'target': tensor([1, 0])}
batch_x: {'words': tensor([[ 14, 10, 4, 311, 5, 154, 1418, 609, 7],
[ 14, 10, 437, 32, 78, 3, 78, 437, 7]]), 'seq_len': tensor([9, 9])}
batch_y: {'target': tensor([0, 1])}
batch_x: {'words': tensor([[ 2, 155, 3, 4426, 3, 239, 3, 739, 5, 1136,
41, 43, 2427, 736, 2, 648, 10, 15620, 2285, 7],
[ 24, 95, 28, 46, 8, 336, 38, 239, 8, 2133,
2, 18, 10, 15622, 1421, 6, 61, 5, 387, 7]]), 'seq_len': tensor([20, 20])}
batch_y: {'target': tensor([0, 0])}
batch_x: {'words': tensor([[ 4, 277, 685, 18, 7],
[15618, 3204, 5, 1675, -1]]), 'seq_len': tensor([5, 4])}
batch_y: {'target': tensor([1, 1])}
batch_x: {'words': tensor([[ 879, 96, 8, 1026, 12, 8067, 11, 13623, 8, 15619,
4, 673, 662, 15, 4, 1154, 240, 639, 417, 7],
[ 45, 752, 327, 180, 10, 15621, 16, 72, 8904, 9,
1217, 7, -1, -1, -1, -1, -1, -1, -1, -1]]), 'seq_len': tensor([20, 12])}
batch_y: {'target': tensor([0, 1])}

可以看到使用了-1进行padding。

Dataset个性化padding
如果我们希望对某一些 :mod:`~fastNLP.core.field` 进行个性化padding,可以自己构造Padder类,并使用 :meth:`~fastNLP.core.Dataset.set_padder` 函数修改padder来实现。下面通过构造一个将数据padding到固定长度的padder进行展示:

.. code-block:: python

from fastNLP.core.field import Padder
import numpy as np
class FixLengthPadder(Padder):
def __init__(self, pad_val=0, length=None):
super().__init__(pad_val=pad_val)
self.length = length
assert self.length is not None, "Creating FixLengthPadder with no specific length!"
def __call__(self, contents, field_name, field_ele_dtype, dim):
#计算当前contents中的最大长度
max_len = max(map(len, contents))
#如果当前contents中的最大长度大于指定的padder length的话就报错
assert max_len <= self.length, "Fixed padder length smaller than actual length! with length {}".format(max_len)
array = np.full((len(contents), self.length), self.pad_val, dtype=field_ele_dtype)
for i, content_i in enumerate(contents):
array[i, :len(content_i)] = content_i
return array

#设定FixLengthPadder的固定长度为40
tmp_padder = FixLengthPadder(pad_val=0,length=40)
#利用dataset的set_padder函数设定words field的padder
tmp_data.set_padder('words',tmp_padder)
batch = DataSetIter(batch_size=2, dataset=tmp_data, sampler=sampler)
for batch_x, batch_y in batch:
print("batch_x: ",batch_x)
print("batch_y: ", batch_y)

输出结果如下::

batch_x: {'words': tensor([[ 45, 752, 327, 180, 10, 15621, 16, 72, 8904, 9,
1217, 7, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[ 879, 96, 8, 1026, 12, 8067, 11, 13623, 8, 15619,
4, 673, 662, 15, 4, 1154, 240, 639, 417, 7,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([12, 20])}
batch_y: {'target': tensor([1, 0])}
batch_x: {'words': tensor([[ 13, 830, 7746, 174, 3, 47, 6, 83, 5752, 15,
2177, 15, 63, 57, 406, 84, 1009, 4973, 27, 17,
13785, 3, 533, 3687, 15623, 39, 375, 8, 15624, 8,
1323, 4398, 7, 0, 0, 0, 0, 0, 0, 0],
[ 1045, 11113, 16, 104, 5, 4, 176, 1824, 1704, 3,
2, 18, 11, 4, 1018, 432, 143, 33, 245, 308,
7, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([33, 21])}
batch_y: {'target': tensor([1, 0])}
batch_x: {'words': tensor([[ 14, 10, 4, 311, 5, 154, 1418, 609, 7, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0],
[ 14, 10, 437, 32, 78, 3, 78, 437, 7, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0]]), 'seq_len': tensor([9, 9])}
batch_y: {'target': tensor([0, 1])}
batch_x: {'words': tensor([[ 2, 155, 3, 4426, 3, 239, 3, 739, 5, 1136,
41, 43, 2427, 736, 2, 648, 10, 15620, 2285, 7,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[ 24, 95, 28, 46, 8, 336, 38, 239, 8, 2133,
2, 18, 10, 15622, 1421, 6, 61, 5, 387, 7,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([20, 20])}
batch_y: {'target': tensor([0, 0])}
batch_x: {'words': tensor([[ 4, 277, 685, 18, 7, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[15618, 3204, 5, 1675, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([5, 4])}
batch_y: {'target': tensor([1, 1])}

在这里所有的 `words` 都被pad成了长度为40的list。


使用DataSetIter自己编写训练过程
------------------------------------
如果你想用类似 PyTorch 的使用方法,自己编写训练过程,可以参考下面这段代码。
其中使用了 fastNLP 提供的 :class:`~fastNLP.DataSetIter` 来获得小批量训练的小批量数据,
使用 :class:`~fastNLP.BucketSampler` 做为 :class:`~fastNLP.DataSetIter` 的参数来选择采样的方式。

以下代码使用BucketSampler作为 :class:`~fastNLP.DataSetIter` 初始化的输入,运用 :class:`~fastNLP.DataSetIter` 自己写训练程序

.. code-block:: python

from fastNLP import BucketSampler
from fastNLP import DataSetIter
from fastNLP.models import CNNText
from fastNLP import Tester
import torch
import time

embed_dim = 100
model = CNNText((len(vocab),embed_dim), num_classes=2, dropout=0.1)

def train(epoch, data, devdata):
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
lossfunc = torch.nn.CrossEntropyLoss()
batch_size = 32

# 定义一个Batch,传入DataSet,规定batch_size和取batch的规则。
# 顺序(Sequential),随机(Random),相似长度组成一个batch(Bucket)
train_sampler = BucketSampler(batch_size=batch_size, seq_len_field_name='seq_len')
train_batch = DataSetIter(batch_size=batch_size, dataset=data, sampler=train_sampler)

start_time = time.time()
print("-"*5+"start training"+"-"*5)
for i in range(epoch):
loss_list = []
for batch_x, batch_y in train_batch:
optimizer.zero_grad()
output = model(batch_x['words'])
loss = lossfunc(output['pred'], batch_y['target'])
loss.backward()
optimizer.step()
loss_list.append(loss.item())

#这里verbose如果为0,在调用Tester对象的test()函数时不输出任何信息,返回评估信息; 如果为1,打印出验证结果,返回评估信息
#在调用过Tester对象的test()函数后,调用其_format_eval_results(res)函数,结构化输出验证结果
tester_tmp = Tester(devdata, model, metrics=AccuracyMetric(), verbose=0)
res=tester_tmp.test()

print('Epoch {:d} Avg Loss: {:.2f}'.format(i, sum(loss_list) / len(loss_list)),end=" ")
print(tester_tmp._format_eval_results(res),end=" ")
print('{:d}ms'.format(round((time.time()-start_time)*1000)))
loss_list.clear()

train(10, train_data, dev_data)
#使用tester进行快速测试
tester = Tester(test_data, model, metrics=AccuracyMetric())
tester.test()

这段代码的输出如下::

-----start training-----

Evaluate data in 2.68 seconds!
Epoch 0 Avg Loss: 0.66 AccuracyMetric: acc=0.708716 29307ms

Evaluate data in 0.38 seconds!
Epoch 1 Avg Loss: 0.41 AccuracyMetric: acc=0.770642 52200ms

Evaluate data in 0.51 seconds!
Epoch 2 Avg Loss: 0.16 AccuracyMetric: acc=0.747706 70268ms

Evaluate data in 0.96 seconds!
Epoch 3 Avg Loss: 0.06 AccuracyMetric: acc=0.741972 90349ms

Evaluate data in 1.04 seconds!
Epoch 4 Avg Loss: 0.03 AccuracyMetric: acc=0.740826 114250ms

Evaluate data in 0.8 seconds!
Epoch 5 Avg Loss: 0.02 AccuracyMetric: acc=0.738532 134742ms

Evaluate data in 0.65 seconds!
Epoch 6 Avg Loss: 0.01 AccuracyMetric: acc=0.731651 154503ms

Evaluate data in 0.8 seconds!
Epoch 7 Avg Loss: 0.01 AccuracyMetric: acc=0.738532 175397ms

Evaluate data in 0.36 seconds!
Epoch 8 Avg Loss: 0.01 AccuracyMetric: acc=0.733945 192384ms

Evaluate data in 0.84 seconds!
Epoch 9 Avg Loss: 0.01 AccuracyMetric: acc=0.744266 214417ms

Evaluate data in 0.04 seconds!
[tester]
AccuracyMetric: acc=0.786667



----------------------------------
代码下载
----------------------------------

.. raw:: html

<a href="../_static/notebooks/tutorial_6_datasetiter.ipynb" download="tutorial_6_datasetiter.ipynb">点击下载 IPython Notebook 文件</a><hr>

+ 0
- 135
docs/source/tutorials/tutorial_7_metrics.rst View File

@@ -1,135 +0,0 @@
===============================
使用Metric快速评测你的模型
===============================

在进行训练时,fastNLP提供了各种各样的 :mod:`~fastNLP.core.metrics` 。
如前面的教程中所介绍,:class:`~fastNLP.AccuracyMetric` 类的对象被直接传到 :class:`~fastNLP.Trainer` 中用于训练

.. code-block:: python

trainer = Trainer(train_data=train_data, dev_data=dev_data, model=model,
loss=loss, device=device, metrics=metric)
trainer.train()

除了 :class:`~fastNLP.AccuracyMetric` 之外,:class:`~fastNLP.SpanFPreRecMetric` 也是一种非常常见的评价指标,
例如在序列标注问题中,常以span的方式计算 F-measure, precision, recall。

另外,fastNLP 还实现了用于抽取式QA(如SQuAD)的metric :class:`~fastNLP.ExtractiveQAMetric`。
用户可以参考下面这个表格,点击第一列查看各个 :mod:`~fastNLP.core.metrics` 的详细文档。

.. csv-table::
:header: 名称, 介绍

:class:`~fastNLP.core.metrics.MetricBase` , 自定义metrics需继承的基类
:class:`~fastNLP.core.metrics.AccuracyMetric` , 简单的正确率metric
:class:`~fastNLP.core.metrics.SpanFPreRecMetric` , "同时计算 F-measure, precision, recall 值的 metric"
:class:`~fastNLP.core.metrics.ExtractiveQAMetric` , 用于抽取式QA任务 的metric

更多的 :mod:`~fastNLP.core.metrics` 正在被添加到 fastNLP 当中,敬请期待。

------------------------------
定义自己的metrics
------------------------------

在定义自己的metrics类时需继承 fastNLP 的 :class:`~fastNLP.core.metrics.MetricBase`,
并重写 ``evaluate`` 和 ``get_metric`` 方法。

evaluate(xxx) 中传入一个批次的数据,将针对一个批次的预测结果做评价指标的累计

get_metric(xxx) 当所有数据处理完毕时调用该方法,它将根据 evaluate函数累计的评价指标统计量来计算最终的评价结果

以分类问题中的 accuracy 计算为例,假设 model 的 `forward` 返回的 dict 中包含 `pred` 这个 key , 并且该 key 需要用于计算 accuracy::

class Model(nn.Module):
def __init__(xxx):
# do something
def forward(self, xxx):
# do something
return {'pred': pred, 'other_keys':xxx} # pred's shape: batch_size x num_classes

假设dataset中 `target` 这个 field 是需要预测的值,并且该 field 被设置为了 target 。对应的 `AccMetric` 可以按如下方式定义( Version 1, 只使用这一次)::

from fastNLP import MetricBase

class AccMetric(MetricBase):

def __init__(self):
super().__init__()
# 根据你的情况自定义指标
self.total = 0
self.acc_count = 0

# evaluate的参数需要和DataSet 中 field 名以及模型输出的结果 field 名一致,不然找不到对应的value
# pred, target 的参数是 fastNLP 的默认配置
def evaluate(self, pred, target):
# dev或test时,每个batch结束会调用一次该方法,需要实现如何根据每个batch累加metric
self.total += target.size(0)
self.acc_count += target.eq(pred).sum().item()

def get_metric(self, reset=True): # 在这里定义如何计算metric
acc = self.acc_count/self.total
if reset: # 是否清零以便重新计算
self.acc_count = 0
self.total = 0
return {'acc': acc}
# 需要返回一个dict,key为该metric的名称,该名称会显示到Trainer的progress bar中


如果需要复用 metric,比如下一次使用 `AccMetric` 时,dataset中目标field不叫 `target` 而叫 `y` ,或者model的输出不是 `pred` (Version 2)::

class AccMetric(MetricBase):
def __init__(self, pred=None, target=None):
"""
假设在另一场景使用时,目标field叫y,model给出的key为pred_y。则只需要在初始化AccMetric时,
acc_metric = AccMetric(pred='pred_y', target='y')即可。
当初始化为acc_metric = AccMetric() 时,fastNLP会直接使用 'pred', 'target' 作为key去索取对应的的值
"""

super().__init__()

# 如果没有注册该参数,则效果与 Version 1 是一样的
self._init_param_map(pred=pred, target=target) # 该方法会注册 pred 和 target . 仅需要注册evaluate()方法会用到的参数名即可

# 根据你的情况自定义指标
self.total = 0
self.acc_count = 0

# evaluate的参数需要和DataSet 中 field 名以及模型输出的结果 field 名一致,不然找不到对应的value
# pred, target 的参数是 fastNLP 的默认配置
def evaluate(self, pred, target):
# dev或test时,每个batch结束会调用一次该方法,需要实现如何根据每个batch累加metric
self.total += target.size(0)
self.acc_count += target.eq(pred).sum().item()

def get_metric(self, reset=True): # 在这里定义如何计算metric
acc = self.acc_count/self.total
if reset: # 是否清零以便重新计算
self.acc_count = 0
self.total = 0
return {'acc': acc}
# 需要返回一个dict,key为该metric的名称,该名称会显示到Trainer的progress bar中

``MetricBase`` 将会在输入的字典 ``pred_dict`` 和 ``target_dict`` 中进行检查.
``pred_dict`` 是模型当中 ``forward()`` 函数或者 ``predict()`` 函数的返回值.
``target_dict`` 是DataSet当中的ground truth, 判定ground truth的条件是field的 ``is_target`` 被设置为True.

``MetricBase`` 会进行以下的类型检测:

1. self.evaluate当中是否有 varargs, 这是不支持的.
2. self.evaluate当中所需要的参数是否既不在 ``pred_dict`` 也不在 ``target_dict`` .
3. self.evaluate当中所需要的参数是否既在 ``pred_dict`` 也在 ``target_dict`` .

除此以外,在参数被传入self.evaluate以前,这个函数会检测 ``pred_dict`` 和 ``target_dict`` 当中没有被用到的参数;
如果kwargs是self.evaluate的参数,则不会进行此项检测。

self.evaluate将计算一个批次(batch)的评价指标并累计,没有返回值。
self.get_metric将统计当前的评价指标并返回评价结果,返回值需要是一个dict,key是指标名称,value是指标的值。


----------------------------------
代码下载
----------------------------------

.. raw:: html

<a href="../_static/notebooks/tutorial_7_metrics.ipynb" download="tutorial_7_metrics.ipynb">点击下载 IPython Notebook 文件</a><hr>

+ 0
- 193
docs/source/tutorials/tutorial_8_modules_models.rst View File

@@ -1,193 +0,0 @@
======================================
使用Modules和Models快速搭建自定义模型
======================================

:mod:`~fastNLP.modules` 和 :mod:`~fastNLP.models` 用于构建 fastNLP 所需的神经网络模型,它可以和 torch.nn 中的模型一起使用。
下面我们会分三节介绍编写构建模型的具体方法。


使用 models 中的模型
----------------------

fastNLP 在 :mod:`~fastNLP.models` 模块中内置了如 :class:`~fastNLP.models.CNNText` 、
:class:`~fastNLP.models.SeqLabeling` 等完整的模型,以供用户直接使用。
以文本分类的任务为例,我们从 models 中导入 :class:`~fastNLP.models.CNNText` 模型,用它进行训练。

.. code-block:: python

from fastNLP.models import CNNText

model_cnn = CNNText((len(vocab),100), num_classes=2, dropout=0.1)

trainer = Trainer(train_data=train_data, dev_data=dev_data, metrics=metric,
loss=loss, device=device, model=model_cnn)
trainer.train()

在 iPython 环境输入 `model_cnn` ,我们可以看到 ``model_cnn`` 的网络结构

.. parsed-literal::

CNNText(
(embed): Embedding(
(embed): Embedding(16292, 100)
(dropout): Dropout(p=0.0, inplace=False)
)
(conv_pool): ConvMaxpool(
(convs): ModuleList(
(0): Conv1d(100, 30, kernel_size=(1,), stride=(1,), bias=False)
(1): Conv1d(100, 40, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
(2): Conv1d(100, 50, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
)
)
(dropout): Dropout(p=0.1, inplace=False)
(fc): Linear(in_features=120, out_features=2, bias=True)
)

FastNLP 中内置的 models 如下表所示,您可以点击具体的名称查看详细的 API:

.. csv-table::
:header: 名称, 介绍

:class:`~fastNLP.models.CNNText` , 使用 CNN 进行文本分类的模型
:class:`~fastNLP.models.SeqLabeling` , 简单的序列标注模型
:class:`~fastNLP.models.AdvSeqLabel` , 更大网络结构的序列标注模型
:class:`~fastNLP.models.ESIM` , ESIM 模型的实现
:class:`~fastNLP.models.StarTransEnc` , 带 word-embedding 的 Star-Transformer 模型
:class:`~fastNLP.models.STSeqLabel` , 用于序列标注的 Star-Transformer 模型
:class:`~fastNLP.models.STNLICls` ,用于自然语言推断 (NLI) 的 Star-Transformer 模型
:class:`~fastNLP.models.STSeqCls` , 用于分类任务的 Star-Transformer 模型
:class:`~fastNLP.models.BiaffineParser` , Biaffine 依存句法分析网络的实现
:class:`~fastNLP.models.BiLSTMCRF`, 使用BiLSTM与CRF进行序列标注


使用 torch.nn 编写模型
----------------------------

FastNLP 完全支持使用 PyTorch 编写的模型,但与 PyTorch 中编写模型的常见方法不同,
用于 fastNLP 的模型中 forward 函数需要返回一个字典,字典中至少需要包含 ``pred`` 这个字段。

下面是使用 PyTorch 中的 torch.nn 模块编写的文本分类模型,注意观察代码中标注的向量维度。
由于 PyTorch 使用了约定俗成的维度设置,使得 forward 中需要多次处理维度顺序。

.. code-block:: python

import torch
import torch.nn as nn

class LSTMText(nn.Module):
def __init__(self, vocab_size, embedding_dim, output_dim, hidden_dim=64, num_layers=2, dropout=0.5):
super().__init__()

self.embedding = nn.Embedding(vocab_size, embedding_dim)
self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=True, dropout=dropout)
self.fc = nn.Linear(hidden_dim * 2, output_dim)
self.dropout = nn.Dropout(dropout)

def forward(self, words):
# (input) words : (batch_size, seq_len)
words = words.permute(1,0)
# words : (seq_len, batch_size)

embedded = self.dropout(self.embedding(words))
# embedded : (seq_len, batch_size, embedding_dim)
output, (hidden, cell) = self.lstm(embedded)
# output: (seq_len, batch_size, hidden_dim * 2)
# hidden: (num_layers * 2, batch_size, hidden_dim)
# cell: (num_layers * 2, batch_size, hidden_dim)

hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
hidden = self.dropout(hidden)
# hidden: (batch_size, hidden_dim * 2)

pred = self.fc(hidden.squeeze(0))
# result: (batch_size, output_dim)
return {"pred":pred}

我们同样可以在 iPython 环境中查看这个模型的网络结构

.. parsed-literal::

LSTMText(
(embedding): Embedding(16292, 100)
(lstm): LSTM(100, 64, num_layers=2, dropout=0.5, bidirectional=True)
(fc): Linear(in_features=128, out_features=2, bias=True)
(dropout): Dropout(p=0.5, inplace=False)
)


使用 modules 编写模型
----------------------------

下面我们使用 :mod:`fastNLP.modules` 中的组件来构建同样的网络。由于 fastNLP 统一把 ``batch_size`` 放在第一维,
在编写代码的过程中会有一定的便利。

.. code-block:: python

from fastNLP.modules import Embedding, LSTM, MLP

class MyText(nn.Module):
def __init__(self, vocab_size, embedding_dim, output_dim, hidden_dim=64, num_layers=2, dropout=0.5):
super().__init__()

self.embedding = Embedding((vocab_size, embedding_dim))
self.lstm = LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=True)
self.mlp = MLP([hidden_dim*2,output_dim], dropout=dropout)

def forward(self, words):
embedded = self.embedding(words)
_,(hidden,_) = self.lstm(embedded)
pred = self.mlp(torch.cat((hidden[-1],hidden[-2]),dim=1))
return {"pred":pred}

我们自己编写模型的网络结构如下

.. parsed-literal::

MyText(
(embedding): Embedding(
(embed): Embedding(16292, 100)
(dropout): Dropout(p=0.0, inplace=False)
)
(lstm): LSTM(
(lstm): LSTM(100, 64, num_layers=2, batch_first=True, bidirectional=True)
)
(mlp): MLP(
(hiddens): ModuleList()
(output): Linear(in_features=128, out_features=2, bias=True)
(dropout): Dropout(p=0.5, inplace=False)
)
)

FastNLP 中包含的各种模块如下表,您可以点击具体的名称查看详细的 API,也可以通过 :doc:`/fastNLP.modules` 进行了解。

.. csv-table::
:header: 名称, 介绍

:class:`~fastNLP.modules.ConvolutionCharEncoder` , char级别的卷积 encoder
:class:`~fastNLP.modules.LSTMCharEncoder` , char级别基于LSTM的 encoder
:class:`~fastNLP.modules.ConvMaxpool` , 结合了Convolution和Max-Pooling于一体的模块
:class:`~fastNLP.modules.LSTM` , LSTM模块, 轻量封装了PyTorch的LSTM
:class:`~fastNLP.modules.StarTransformer` , Star-Transformer 的encoder部分
:class:`~fastNLP.modules.TransformerEncoder` , Transformer的encoder模块,不包含embedding层
:class:`~fastNLP.modules.VarRNN` , Variational Dropout RNN 模块
:class:`~fastNLP.modules.VarLSTM` , Variational Dropout LSTM 模块
:class:`~fastNLP.modules.VarGRU` , Variational Dropout GRU 模块
:class:`~fastNLP.modules.MaxPool` , Max-pooling模块
:class:`~fastNLP.modules.MaxPoolWithMask` , 带mask矩阵的max pooling。在做 max-pooling的时候不会考虑mask值为0的位置。
:class:`~fastNLP.modules.AvgPool` , Average-pooling模块
:class:`~fastNLP.modules.AvgPoolWithMask` , 带mask矩阵的average pooling。在做 average-pooling的时候不会考虑mask值为0的位置。
:class:`~fastNLP.modules.MultiHeadAttention` , MultiHead Attention 模块
:class:`~fastNLP.modules.MLP` , 简单的多层感知器模块
:class:`~fastNLP.modules.ConditionalRandomField` , 条件随机场模块
:class:`~fastNLP.modules.viterbi_decode` , 给定一个特征矩阵以及转移分数矩阵,计算出最佳的路径以及对应的分数 (与 :class:`~fastNLP.modules.ConditionalRandomField` 配合使用)
:class:`~fastNLP.modules.allowed_transitions` , 给定一个id到label的映射表,返回所有可以跳转的列表(与 :class:`~fastNLP.modules.ConditionalRandomField` 配合使用)
:class:`~fastNLP.modules.TimestepDropout` , 简单包装过的Dropout 组件


----------------------------------
代码下载
----------------------------------

.. raw:: html

<a href="../_static/notebooks/tutorial_8_modules_models.ipynb" download="tutorial_8_modules_models.ipynb">点击下载 IPython Notebook 文件</a><hr>

+ 0
- 140
docs/source/tutorials/tutorial_9_callback.rst View File

@@ -1,140 +0,0 @@
===================================================
使用 Callback 自定义你的训练过程
===================================================

- `什么是Callback`_
- `使用 Callback`_
- `fastNLP 中的 Callback`_
- `自定义 Callback`_


什么是Callback
---------------------

:class:`~fastNLP.core.callback.Callback` 是与 :class:`~fastNLP.core.trainer.Trainer` 紧密结合的模块,利用 Callback 可以在 :class:`~fastNLP.core.trainer.Trainer` 训练时,加入自定义的操作,比如梯度裁剪,学习率调节,测试模型的性能等。定义的 Callback 会在训练的特定阶段被调用。

fastNLP 中提供了很多常用的 :class:`~fastNLP.core.callback.Callback` ,开箱即用。


使用 Callback
---------------------

使用 Callback 很简单,将需要的 callback 按 list 存储,以对应参数 ``callbacks`` 传入对应的 Trainer。Trainer 在训练时就会自动执行这些 Callback 指定的操作了。


.. code-block:: python

from fastNLP import (Callback, EarlyStopCallback,
Trainer, CrossEntropyLoss, AccuracyMetric)
from fastNLP.models import CNNText
import torch.cuda

# prepare data
def get_data():
from fastNLP.io import ChnSentiCorpPipe as pipe
data = pipe().process_from_file()
print(data)
data.rename_field('chars', 'words')
train_data = data.get_dataset('train')
dev_data = data.get_dataset('dev')
test_data = data.get_dataset('test')
vocab = data.get_vocab('words')
tgt_vocab = data.get_vocab('target')
return train_data, dev_data, test_data, vocab, tgt_vocab

# prepare model
train_data, dev_data, _, vocab, tgt_vocab = get_data()
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model = CNNText((len(vocab),50), num_classes=len(tgt_vocab))

# define callback
callbacks=[EarlyStopCallback(5)]

# pass callbacks to Trainer
def train_with_callback(cb_list):
trainer = Trainer(
device=device,
n_epochs=3,
model=model,
train_data=train_data,
dev_data=dev_data,
loss=CrossEntropyLoss(),
metrics=AccuracyMetric(),
callbacks=cb_list,
check_code_level=-1
)
trainer.train()

train_with_callback(callbacks)



fastNLP 中的 Callback
---------------------

fastNLP 中提供了很多常用的 Callback,如梯度裁剪,训练时早停和测试验证集,fitlog 等等。具体 Callback 请参考 :mod:`fastNLP.core.callback`

.. code-block:: python

from fastNLP import EarlyStopCallback, GradientClipCallback, EvaluateCallback
callbacks = [
EarlyStopCallback(5),
GradientClipCallback(clip_value=5, clip_type='value'),
EvaluateCallback(dev_data)
]

train_with_callback(callbacks)

自定义 Callback
---------------------

这里我们以一个简单的 Callback作为例子,它的作用是打印每一个 Epoch 平均训练 loss。

1. 创建 Callback
要自定义 Callback,我们要实现一个类,继承 :class:`~fastNLP.core.callback.Callback` 。这里我们定义 ``MyCallBack`` ,继承 fastNLP.Callback 。

2. 指定 Callback 调用的阶段
Callback 中所有以 `on_` 开头的类方法会在 Trainer 的训练中在特定阶段调用。 如 on_train_begin() 会在训练开始时被调用,on_epoch_end()
会在每个 epoch 结束时调用。 具体有哪些类方法,参见 :class:`~fastNLP.core.callback.Callback` 文档。这里, MyCallBack 在求得loss时调用 on_backward_begin() 记录
当前 loss,在每一个 epoch 结束时调用 on_epoch_end() ,求当前 epoch 平均loss并输出。

3. 使用 Callback 的属性访问 Trainer 的内部信息
为了方便使用,可以使用 :class:`~fastNLP.core.callback.Callback` 的属性,访问 :class:`~fastNLP.core.trainer.Trainer` 中的对应信息,如 optimizer, epoch, n_epochs,分别对应训练时的优化器,
当前 epoch 数,和总 epoch 数。 具体可访问的属性,参见 :class:`~fastNLP.core.callback.Callback` 。这里, MyCallBack 为了求平均 loss ,需要知道当前 epoch 的总步
数,可以通过 self.step 属性得到当前训练了多少步。

.. code-block:: python

from fastNLP import Callback
from fastNLP import logger

class MyCallBack(Callback):
"""Print average loss in each epoch"""
def __init__(self):
super().__init__()
self.total_loss = 0
self.start_step = 0

def on_backward_begin(self, loss):
self.total_loss += loss.item()

def on_epoch_end(self):
n_steps = self.step - self.start_step
avg_loss = self.total_loss / n_steps
logger.info('Avg loss at epoch %d, %.6f', self.epoch, avg_loss)
self.start_step = self.step

callbacks = [MyCallBack()]
train_with_callback(callbacks)


----------------------------------
代码下载
----------------------------------

.. raw:: html

<a href="../_static/notebooks/tutorial_9_callback.ipynb" download="tutorial_9_callback.ipynb">点击下载 IPython Notebook 文件</a><hr>

+ 0
- 208
docs/source/tutorials/序列标注.rst View File

@@ -1,208 +0,0 @@
=====================
序列标注
=====================

这一部分的内容主要展示如何使用fastNLP实现序列标注(Sequence labeling)任务。您可以使用fastNLP的各个组件快捷,方便地完成序列标注任务,达到出色的效果。
在阅读这篇教程前,希望您已经熟悉了fastNLP的基础使用,尤其是数据的载入以及模型的构建。通过这个小任务,能让您进一步熟悉fastNLP的使用。

.. note::

本教程推荐使用 GPU 进行实验

命名实体识别(named entity recognition, NER)
------------------------------------------

命名实体识别任务是从文本中抽取出具有特殊意义或者指代性非常强的实体,通常包括人名、地名、机构名和时间等。
如下面的例子中

我来自复旦大学。

其中“复旦大学”就是一个机构名,命名实体识别就是要从中识别出“复旦大学”这四个字是一个整体,且属于机构名这个类别。这个问题在实际做的时候会被
转换为序列标注问题

针对"我来自复旦大学"这句话,我们的预测目标将是[O, O, O, B-ORG, I-ORG, I-ORG, I-ORG],其中O表示out,即不是一个实体,B-ORG是ORG(
organization的缩写)这个类别的开头(Begin),I-ORG是ORG类别的中间(Inside)。
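
下面用一小段示意代码把这个例子具体化,直观展示字符序列与BIO标签序列的对应关系(仅用于说明标注格式,与模型无关):

.. code-block:: python

    chars = ['我', '来', '自', '复', '旦', '大', '学']
    tags = ['O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG']

    # 逐字符打印其对应的标签
    for char, tag in zip(chars, tags):
        print(char, tag)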

在本tutorial中我们将通过fastNLP尝试写出一个能够执行以上任务的模型。

载入数据
------------------------------------------
fastNLP的数据载入主要是由Loader与Pipe两个基类衔接完成的,您可以通过 :doc:`使用Loader和Pipe处理数据 </tutorials/tutorial_4_load_dataset>`
了解如何使用fastNLP提供的数据加载函数。下面我们以微博命名实体任务来演示一下在fastNLP进行序列标注任务。

.. code-block:: python

from fastNLP.io import WeiboNERPipe
data_bundle = WeiboNERPipe().process_from_file()
print(data_bundle.get_dataset('train')[:2])

打印的数据如下 ::

+-------------------------------------------------+------------------------------------------+------------------------------------------+---------+
| raw_chars | target | chars | seq_len |
+-------------------------------------------------+------------------------------------------+------------------------------------------+---------+
| ['一', '节', '课', '的', '时', '间', '真', '... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, ... | [8, 211, 775, 3, 49, 245, 89, 26, 101... | 16 |
| ['回', '复', '支', '持', ',', '赞', '成', '... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ... | [116, 480, 127, 109, 2, 446, 134, 2, ... | 59 |
+-------------------------------------------------+------------------------------------------+------------------------------------------+---------+


模型构建
--------------------------------

首先选择需要使用的Embedding类型。关于Embedding的相关说明可以参见 :doc:`使用Embedding模块将文本转成向量 </tutorials/tutorial_3_embedding>` 。
在这里我们使用通过word2vec预训练的中文汉字embedding。

.. code-block:: python

from fastNLP.embeddings import StaticEmbedding

embed = StaticEmbedding(vocab=data_bundle.get_vocab('chars'), model_dir_or_name='cn-char-fastnlp-100d')

选择好Embedding之后,我们可以使用fastNLP中自带的 :class:`fastNLP.models.BiLSTMCRF` 作为模型。

.. code-block:: python

from fastNLP.models import BiLSTMCRF

data_bundle.rename_field('chars', 'words') # 这是由于BiLSTMCRF模型的forward函数接受的是words,而不是chars,所以需要把这一列重新命名
model = BiLSTMCRF(embed=embed, num_classes=len(data_bundle.get_vocab('target')), num_layers=1, hidden_size=200, dropout=0.5,
target_vocab=data_bundle.get_vocab('target'))

进行训练
--------------------------------

下面我们选择用来评估模型的metric,以及优化用到的优化函数。

.. code-block:: python

from fastNLP import SpanFPreRecMetric
from torch.optim import Adam
from fastNLP import LossInForward

metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'))
optimizer = Adam(model.parameters(), lr=1e-2)
loss = LossInForward()

使用Trainer进行训练, 您可以通过修改 device 的值来选择显卡。

.. code-block:: python

from fastNLP import Trainer
import torch

device= 0 if torch.cuda.is_available() else 'cpu'
trainer = Trainer(data_bundle.get_dataset('train'), model, loss=loss, optimizer=optimizer,
dev_data=data_bundle.get_dataset('dev'), metrics=metric, device=device)
trainer.train()

训练过程输出为::

input fields after batch(if batch size is 2):
target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26])
seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2])
words: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26])
target fields after batch(if batch size is 2):
target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26])
seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2])

training epochs started 2019-09-25-10-43-09
Evaluate data in 0.62 seconds!
Evaluation on dev at Epoch 1/10. Step:43/430:
SpanFPreRecMetric: f=0.070352, pre=0.100962, rec=0.053985

...

Evaluate data in 0.61 seconds!
Evaluation on dev at Epoch 10/10. Step:430/430:
SpanFPreRecMetric: f=0.51223, pre=0.581699, rec=0.457584


In Epoch:7/Step:301, got best dev performance:
SpanFPreRecMetric: f=0.515528, pre=0.65098, rec=0.426735
Reloaded the best model.

进行测试
--------------------------------

训练结束之后,可以通过 :class:`~fastNLP.Tester` 测试其在测试集上的性能。

.. code-block:: python

from fastNLP import Tester

tester = Tester(data_bundle.get_dataset('test'), model, metrics=metric)
tester.test()

输出为::

[tester]
SpanFPreRecMetric: f=0.482399, pre=0.530086, rec=0.442584


使用更强的Bert做序列标注
--------------------------------

在fastNLP中使用Bert进行任务时,您只需要把 :class:`fastNLP.embeddings.StaticEmbedding` 切换为 :class:`fastNLP.embeddings.BertEmbedding` 即可(可修改 device 选择显卡)。

.. code-block:: python

from fastNLP.io import WeiboNERPipe
from fastNLP.models import BiLSTMCRF

data_bundle = WeiboNERPipe().process_from_file()
data_bundle.rename_field('chars', 'words')

from fastNLP.embeddings import BertEmbedding
embed = BertEmbedding(vocab=data_bundle.get_vocab('words'), model_dir_or_name='cn')
model = BiLSTMCRF(embed=embed, num_classes=len(data_bundle.get_vocab('target')), num_layers=1, hidden_size=200, dropout=0.5,
target_vocab=data_bundle.get_vocab('target'))

from fastNLP import SpanFPreRecMetric
from torch.optim import Adam
from fastNLP import LossInForward
metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'))
optimizer = Adam(model.parameters(), lr=2e-5)
loss = LossInForward()

from fastNLP import Trainer
import torch
device= 0 if torch.cuda.is_available() else 'cpu'
trainer = Trainer(data_bundle.get_dataset('train'), model, loss=loss, optimizer=optimizer, batch_size=12,
dev_data=data_bundle.get_dataset('dev'), metrics=metric, device=device)
trainer.train()

from fastNLP import Tester
tester = Tester(data_bundle.get_dataset('test'), model, metrics=metric)
tester.test()

输出为::

training epochs started 2019-09-25-07-15-43
Evaluate data in 2.02 seconds!
Evaluation on dev at Epoch 1/10. Step:113/1130:
SpanFPreRecMetric: f=0.0, pre=0.0, rec=0.0

...

Evaluate data in 2.17 seconds!
Evaluation on dev at Epoch 10/10. Step:1130/1130:
SpanFPreRecMetric: f=0.647332, pre=0.589852, rec=0.717224

In Epoch:6/Step:678, got best dev performance:
SpanFPreRecMetric: f=0.669963, pre=0.645238, rec=0.696658
Reloaded the best model.

Evaluate data in 1.82 seconds!
[tester]
SpanFPreRecMetric: f=0.641774, pre=0.626424, rec=0.657895

可以看出通过使用Bert,效果有明显的提升,从48.2提升到了64.1。


----------------------------------
代码下载
----------------------------------

.. raw:: html

<a href="../_static/notebooks/%E5%BA%8F%E5%88%97%E6%A0%87%E6%B3%A8.ipynb" download="序列标注.ipynb">点击下载 IPython Notebook 文件</a><hr>

+ 0
- 542
docs/source/tutorials/文本分类.rst View File

@@ -1,542 +0,0 @@
文本分类
=============================

文本分类(Text classification)任务是将一句话或一段话划分到某个具体的类别。比如垃圾邮件识别,文本情绪分类等。这篇教程可以带你从零开始了解 fastNLP 的使用

.. note::

本教程推荐使用 GPU 进行实验

.. code-block:: text

1, 商务大床房,房间很大,床有2M宽,整体感觉经济实惠不错!

其中开头的1是指这条评论的标签,表示是正面的情绪。我们将使用到的数据可以通过 `此链接 <http://download.fastnlp.top/dataset/chn_senti_corp.zip>`_
下载并解压,当然也可以通过fastNLP自动下载该数据。

数据中的内容如下图所示。接下来,我们将用fastNLP在这个数据上训练一个分类网络。

.. figure:: ./cn_cls_example.png
:alt: jupyter

步骤
----

一共有以下的几个步骤:

1. `读取数据 <#id4>`_

2. `预处理数据 <#id5>`_

3. `选择预训练词向量 <#id6>`_

4. `创建模型 <#id7>`_

5. `训练模型 <#id8>`_

(1) 读取数据
~~~~~~~~~~~~~~~~~~~~

fastNLP提供多种数据的自动下载与自动加载功能,对于这里我们要用到的数据,我们可以用 :class:`~fastNLP.io.Loader` 自动下载并加载该数据。
更多有关Loader的使用可以参考 :mod:`~fastNLP.io.loader`

.. code-block:: python

from fastNLP.io import ChnSentiCorpLoader
loader = ChnSentiCorpLoader() # 初始化一个中文情感分类的loader
data_dir = loader.download() # 这一行代码将自动下载数据到默认的缓存地址, 并将该地址返回
data_bundle = loader.load(data_dir) # 这一行代码将从{data_dir}处读取数据至DataBundle


DataBundle的相关介绍,可以参考 :class:`~fastNLP.io.DataBundle` 。我们可以打印该data\_bundle的基本信息。

.. code-block:: python

print(data_bundle)


.. code-block:: text

In total 3 datasets:
dev has 1200 instances.
train has 9600 instances.
test has 1200 instances.
In total 0 vocabs:


可以看出,该data\_bundle中一个含有三个 :class:`~fastNLP.DataSet` 。通过下面的代码,我们可以查看DataSet的基本情况

.. code-block:: python

print(data_bundle.get_dataset('train')[:2]) # 查看Train集前两个sample


.. code-block:: text

+-----------------------------+--------+
| raw_chars | target |
+-----------------------------+--------+
| 选择珠江花园的原因就是方... | 1 |
| 15.4寸笔记本的键盘确实爽... | 1 |
+-----------------------------+--------+

(2) 预处理数据
~~~~~~~~~~~~~~~~~~~~

在NLP任务中,预处理一般包括:

(a) 将一整句话切分成汉字或者词;

(b) 将文本转换为index

fastNLP中也提供了多种数据集的处理类,这里我们直接使用fastNLP的ChnSentiCorpPipe。更多关于Pipe的说明可以参考 :mod:`~fastNLP.io.pipe` 。

.. code-block:: python

from fastNLP.io import ChnSentiCorpPipe

pipe = ChnSentiCorpPipe()
data_bundle = pipe.process(data_bundle) # 所有的Pipe都实现了process()方法,且输入输出都为DataBundle类型

print(data_bundle) # 打印data_bundle,查看其变化


.. code-block:: text

In total 3 datasets:
dev has 1200 instances.
train has 9600 instances.
test has 1200 instances.
In total 2 vocabs:
chars has 4409 entries.
target has 2 entries.



可以看到除了之前已经包含的3个 :class:`~fastNLP.DataSet` ,还新增了两个 :class:`~fastNLP.Vocabulary` 。我们可以打印DataSet中的内容

.. code-block:: python

print(data_bundle.get_dataset('train')[:2])


.. code-block:: text

+-----------------+--------+-----------------+---------+
| raw_chars | target | chars | seq_len |
+-----------------+--------+-----------------+---------+
| 选择珠江花园... | 0 | [338, 464, 1... | 106 |
| 15.4寸笔记本... | 0 | [50, 133, 20... | 56 |
+-----------------+--------+-----------------+---------+


新增了一列为数字列表的chars,以及变为数字的target列。可以看出这两列的名称和刚好与data\_bundle中两个Vocabulary的名称是一致的,我们可以打印一下Vocabulary看一下里面的内容。

.. code-block:: python

char_vocab = data_bundle.get_vocab('chars')
print(char_vocab)


.. code-block:: text

Vocabulary(['选', '择', '珠', '江', '花']...)


Vocabulary是一个记录着词语与index之间映射关系的类,比如

.. code-block:: python

index = char_vocab.to_index('选')
print("'选'的index是{}".format(index)) # 这个值与上面打印出来的第一个instance的chars的第一个index是一致的
print("index:{}对应的汉字是{}".format(index, char_vocab.to_word(index)))


.. code-block:: text

'选'的index是338
index:338对应的汉字是选


(3) 选择预训练词向量
~~~~~~~~~~~~~~~~~~~~

由于Word2vec, Glove, Elmo, Bert等预训练模型可以增强模型的性能,所以在训练具体任务前,选择合适的预训练词向量非常重要。
在fastNLP中我们提供了多种Embedding使得加载这些预训练模型的过程变得更加便捷。
这里我们先给出一个使用word2vec的中文汉字预训练的示例,之后再给出一个使用Bert的文本分类。
这里使用的预训练词向量为'cn-fastnlp-100d',fastNLP将自动下载该embedding至本地缓存,
fastNLP支持使用名字指定的Embedding以及相关说明可以参见 :mod:`fastNLP.embeddings`

.. code-block:: python

from fastNLP.embeddings import StaticEmbedding

word2vec_embed = StaticEmbedding(char_vocab, model_dir_or_name='cn-char-fastnlp-100d')


.. code-block:: text

Found 4321 out of 4409 compound in the pre-training embedding.

(4) 创建模型
~~~~~~~~~~~~

.. code-block:: python

from torch import nn
from fastNLP.modules import LSTM
import torch
# 定义模型
class BiLSTMMaxPoolCls(nn.Module):
def __init__(self, embed, num_classes, hidden_size=400, num_layers=1, dropout=0.3):
super().__init__()
self.embed = embed
self.lstm = LSTM(self.embed.embedding_dim, hidden_size=hidden_size//2, num_layers=num_layers,
batch_first=True, bidirectional=True)
self.dropout_layer = nn.Dropout(dropout)
self.fc = nn.Linear(hidden_size, num_classes)
def forward(self, chars, seq_len): # 这里的名称必须和DataSet中相应的field对应,比如之前我们DataSet中有chars,这里就必须为chars
# chars:[batch_size, max_len]
# seq_len: [batch_size, ]
chars = self.embed(chars)
outputs, _ = self.lstm(chars, seq_len)
outputs = self.dropout_layer(outputs)
outputs, _ = torch.max(outputs, dim=1)
outputs = self.fc(outputs)
return {'pred':outputs} # [batch_size,], 返回值必须是dict类型,且预测值的key建议设为pred
# 初始化模型
model = BiLSTMMaxPoolCls(word2vec_embed, len(data_bundle.get_vocab('target')))

(5) 训练模型
~~~~~~~~~~~~

fastNLP提供了Trainer对象来组织训练过程,包括完成loss计算(所以在初始化Trainer的时候需要指定loss类型),梯度更新(所以在初始化Trainer的时候需要提供优化器optimizer)以及在验证集上的性能验证(所以在初始化时需要提供一个Metric)

.. code-block:: python

from fastNLP import Trainer
from fastNLP import CrossEntropyLoss
from torch.optim import Adam
from fastNLP import AccuracyMetric
loss = CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)
metric = AccuracyMetric()
device = 0 if torch.cuda.is_available() else 'cpu' # 如果有gpu的话在gpu上运行,训练速度会更快
trainer = Trainer(train_data=data_bundle.get_dataset('train'), model=model, loss=loss,
optimizer=optimizer, batch_size=32, dev_data=data_bundle.get_dataset('dev'),
metrics=metric, device=device)
trainer.train() # 开始训练,训练完成之后默认会加载在dev上表现最好的模型
# 在测试集上测试一下模型的性能
from fastNLP import Tester
print("Performance on test is:")
tester = Tester(data=data_bundle.get_dataset('test'), model=model, metrics=metric, batch_size=64, device=device)
tester.test()


.. code-block:: text

input fields after batch(if batch size is 2):
target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2])
chars: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 106])
seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2])
target fields after batch(if batch size is 2):
target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2])
seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2])
Evaluate data in 0.01 seconds!
training epochs started 2019-09-03-23-57-10

Evaluate data in 0.43 seconds!
Evaluation on dev at Epoch 1/10. Step:300/3000:
AccuracyMetric: acc=0.81

Evaluate data in 0.44 seconds!
Evaluation on dev at Epoch 2/10. Step:600/3000:
AccuracyMetric: acc=0.8675

Evaluate data in 0.44 seconds!
Evaluation on dev at Epoch 3/10. Step:900/3000:
AccuracyMetric: acc=0.878333

....

Evaluate data in 0.48 seconds!
Evaluation on dev at Epoch 9/10. Step:2700/3000:
AccuracyMetric: acc=0.8875

Evaluate data in 0.43 seconds!
Evaluation on dev at Epoch 10/10. Step:3000/3000:
AccuracyMetric: acc=0.895833
In Epoch:7/Step:2100, got best dev performance:
AccuracyMetric: acc=0.8975
Reloaded the best model.

Evaluate data in 0.34 seconds!
[tester]
AccuracyMetric: acc=0.8975

{'AccuracyMetric': {'acc': 0.8975}}



PS: 使用Bert进行文本分类
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: python

# 只需要切换一下Embedding即可
from fastNLP.embeddings import BertEmbedding
# 这里为了演示一下效果,所以默认Bert不更新权重
bert_embed = BertEmbedding(char_vocab, model_dir_or_name='cn', auto_truncate=True, requires_grad=False)
model = BiLSTMMaxPoolCls(bert_embed, len(data_bundle.get_vocab('target')))
import torch
from fastNLP import Trainer
from fastNLP import CrossEntropyLoss
from torch.optim import Adam
from fastNLP import AccuracyMetric
loss = CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=2e-5)
metric = AccuracyMetric()
device = 0 if torch.cuda.is_available() else 'cpu' # 如果有gpu的话在gpu上运行,训练速度会更快
trainer = Trainer(train_data=data_bundle.get_dataset('train'), model=model, loss=loss,
optimizer=optimizer, batch_size=16, dev_data=data_bundle.get_dataset('test'),
metrics=metric, device=device, n_epochs=3)
trainer.train() # 开始训练,训练完成之后默认会加载在dev上表现最好的模型
# 在测试集上测试一下模型的性能
from fastNLP import Tester
print("Performance on test is:")
tester = Tester(data=data_bundle.get_dataset('test'), model=model, metrics=metric, batch_size=64, device=device)
tester.test()


.. code-block:: text

loading vocabulary file ~/.fastNLP/embedding/bert-chinese-wwm/vocab.txt
Load pre-trained BERT parameters from file ~/.fastNLP/embedding/bert-chinese-wwm/chinese_wwm_pytorch.bin.
Start to generating word pieces for word.
Found(Or segment into word pieces) 4286 words out of 4409.
input fields after batch(if batch size is 2):
target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2])
chars: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 106])
seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2])
target fields after batch(if batch size is 2):
target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2])
seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2])
Evaluate data in 0.05 seconds!
training epochs started 2019-09-04-00-02-37

Evaluate data in 15.89 seconds!
Evaluation on dev at Epoch 1/3. Step:1200/3600:
AccuracyMetric: acc=0.9

Evaluate data in 15.92 seconds!
Evaluation on dev at Epoch 2/3. Step:2400/3600:
AccuracyMetric: acc=0.904167

Evaluate data in 15.91 seconds!
Evaluation on dev at Epoch 3/3. Step:3600/3600:
AccuracyMetric: acc=0.918333

In Epoch:3/Step:3600, got best dev performance:
AccuracyMetric: acc=0.918333
Reloaded the best model.
Performance on test is:

Evaluate data in 29.24 seconds!
[tester]
AccuracyMetric: acc=0.919167

{'AccuracyMetric': {'acc': 0.919167}}


PS: 基于词进行文本分类
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

由于中文文本中没有显式的词与词的边界,一般需要通过分词器先将句子进行分词操作。
下面的例子演示了如何不基于fastNLP已有的数据读取、预处理代码进行文本分类。

(1) 读取数据
~~~~~~~~~~~~~~~~~~~~

这里我们继续以之前的数据为例,但这次我们不使用fastNLP自带的数据读取代码

.. code-block:: python

from fastNLP.io import ChnSentiCorpLoader
loader = ChnSentiCorpLoader() # 初始化一个中文情感分类的loader
data_dir = loader.download() # 这一行代码将自动下载数据到默认的缓存地址, 并将该地址返回

获取到的data_dir下应该有类似以下的文件

.. code-block:: text

- chn_senti_corp
- train.tsv
- dev.tsv
- test.tsv

如果打开任何一个文件查看,会发现里面的格式均为

.. code-block:: text

target raw_chars
1 这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般
0 怀着十分激动的心情放映...

下面我们先定义一个read_file_to_dataset的函数, 即给定一个文件路径,读取其中的内容,并返回一个DataSet。然后我们将所有的DataSet放入到DataBundle对象中来方便接下来的预处理

.. code-block:: python

import os
from fastNLP import DataSet, Instance
from fastNLP.io import DataBundle


def read_file_to_dataset(fp):
ds = DataSet()
with open(fp, 'r') as f:
f.readline() # 第一行是title名称,忽略掉
for line in f:
line = line.strip()
target, chars = line.split('\t')
ins = Instance(target=target, raw_chars=chars)
ds.append(ins)
return ds

data_bundle = DataBundle()
for name in ['train.tsv', 'dev.tsv', 'test.tsv']:
fp = os.path.join(data_dir, name)
ds = read_file_to_dataset(fp)
data_bundle.set_dataset(name=name.split('.')[0], dataset=ds)

print(data_bundle) # 查看以下数据集的情况
# In total 3 datasets:
# train has 9600 instances.
# dev has 1200 instances.
# test has 1200 instances.

(2) 数据预处理
~~~~~~~~~~~~~~~~~~~~

在这里,我们首先把句子通过 fastHan_ 进行分词操作,然后创建词表,并将词语转换为序号。

.. _fastHan: https://gitee.com/fastnlp/fastHan

.. code-block:: python

from fastHan import FastHan
from fastNLP import Vocabulary

model=FastHan()
# model.set_device('cuda') # 可以取消注释这一行以提升速度

# 定义分词处理操作
def word_seg(ins):
raw_chars = ins['raw_chars']
# 由于有些句子比较长,我们只截取前128个汉字
raw_words = model(raw_chars[:128], target='CWS')[0]
return raw_words

for name, ds in data_bundle.iter_datasets():
# apply函数将对内部的instance依次执行word_seg操作,并把其返回值放入到raw_words这个field
ds.apply(word_seg, new_field_name='raw_words')
# 除了apply函数,fastNLP还支持apply_field, apply_more(可同时创建多个field)等操作
# 同时我们增加一个seq_len的field
ds.add_seq_len('raw_words')

vocab = Vocabulary()

# 对raw_words列创建词表, 建议把非训练集的dataset放在no_create_entry_dataset参数中
# 也可以通过add_word(), add_word_lst()等建立词表,请参考http://www.fastnlp.top/docs/fastNLP/tutorials/tutorial_2_vocabulary.html
vocab.from_dataset(data_bundle.get_dataset('train'), field_name='raw_words',
no_create_entry_dataset=[data_bundle.get_dataset('dev'),
data_bundle.get_dataset('test')])

# 将建立好词表的Vocabulary用于对raw_words列建立词表,并把转为序号的列存入到words列
vocab.index_dataset(data_bundle.get_dataset('train'), data_bundle.get_dataset('dev'),
data_bundle.get_dataset('test'), field_name='raw_words', new_field_name='words')

# 建立target的词表,target的词表一般不需要padding和unknown
target_vocab = Vocabulary(padding=None, unknown=None)
# 一般情况下我们可以只用训练集建立target的词表
target_vocab.from_dataset(data_bundle.get_dataset('train'), field_name='target')
# 如果没有传递new_field_name, 则默认覆盖原来的field
target_vocab.index_dataset(data_bundle.get_dataset('train'), data_bundle.get_dataset('dev'),
data_bundle.get_dataset('test'), field_name='target')

# 我们可以把词表保存到data_bundle中,方便之后使用
data_bundle.set_vocab(field_name='words', vocab=vocab)
data_bundle.set_vocab(field_name='target', vocab=target_vocab)

# 我们把words和target分别设置为input和target,这样它们才会在训练循环中被取出并自动padding, 有关这部分更多的内容参考
# http://www.fastnlp.top/docs/fastNLP/tutorials/tutorial_6_datasetiter.html
data_bundle.set_target('target')
data_bundle.set_input('words') # DataSet也有这两个接口
# 如果某些field,您希望它被设置为target或者input,但是不希望fastNLP自动padding或需要使用特定的padding方式,请参考
# http://www.fastnlp.top/docs/fastNLP/fastNLP.core.dataset.html

print(data_bundle.get_dataset('train')[:2]) # 我们可以看一下当前dataset的内容

# +--------+-----------------------+-----------------------+----------------------+
# | target | raw_chars | raw_words | words |
# +--------+-----------------------+-----------------------+----------------------+
# | 0 | 选择珠江花园的原因... | ['选择', '珠江', ... | [2, 3, 4, 5, 6, 7... |
# | 0 | 15.4寸笔记本的键盘... | ['15.4', '寸', '笔... | [71, 72, 73, 74, ... |
# +--------+-----------------------+-----------------------+----------------------+

# 由于之后需要使用之前定义的BiLSTMMaxPoolCls模型,所以需要将words这个field修改为chars
data_bundle.rename_field('words', 'chars')
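
To make the effect of ``set_input`` / ``set_target`` concrete, the sketch below iterates over the training set with ``DataSetIter``; the batch size and the printed shapes are only illustrative. Input fields appear in ``batch_x``, target fields in ``batch_y``, and the variable-length ``chars`` sequences are padded automatically:

.. code-block:: python

    from fastNLP import DataSetIter

    # each batch is a pair of dicts: batch_x holds the input fields (here 'chars'),
    # batch_y holds the target fields (here 'target'); 'chars' is padded to the
    # longest sequence in the batch
    for batch_x, batch_y in DataSetIter(data_bundle.get_dataset('train'), batch_size=4):
        print(batch_x['chars'].shape, batch_y['target'].shape)
        break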

We can print the vocabulary to see its current content:

.. code-block:: python

    print(data_bundle.get_vocab('chars'))
    # Vocabulary([选择, 珠江, 花园, 的, 原因]...)
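
If you need to convert between words and indices yourself, ``Vocabulary`` also provides ``to_index`` and ``to_word``. The snippet below is a small sketch; the exact index values depend on your corpus, and the out-of-vocabulary word is only an example:

.. code-block:: python

    char_vocab = data_bundle.get_vocab('chars')
    idx = char_vocab.to_index('选择')           # index of a word seen during training
    print(idx, char_vocab.to_word(idx))         # map the index back to the word
    print(char_vocab.to_index('没见过的词'))     # out-of-vocabulary words map to the unknown index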

(3) Choosing pre-trained word vectors
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Here we use Tencent's pre-trained Chinese word vectors, which can be downloaded from 腾讯词向量_ and unpacked. We cannot use BERT directly in this case, because BERT is pre-trained on Chinese characters rather than words.

.. _腾讯词向量: https://ai.tencent.com/ailab/nlp/en/embedding.html

Below we load the word vectors with :mod:`fastNLP.embeddings`; fastNLP extracts the vectors of the words contained in the vocabulary and randomly initialises the vectors of words that do not appear in the file.

.. code-block:: python

    from fastNLP.embeddings import StaticEmbedding

    word2vec_embed = StaticEmbedding(data_bundle.get_vocab('chars'), model_dir_or_name='/path/to/Tencent_AILab_ChineseEmbedding.txt')

The model definition and training procedure that follow are the same as above, so we do not repeat them here.
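
For completeness, here is a minimal sketch of that step, assuming the ``BiLSTMMaxPoolCls`` model and the training setup from the character-level example earlier in this tutorial; the constructor arguments, learning rate and device are illustrative assumptions:

.. code-block:: python

    from torch.optim import Adam
    from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric

    # BiLSTMMaxPoolCls is the model defined earlier in this tutorial; here it simply
    # receives the word-level embedding instead of the character-level one
    model = BiLSTMMaxPoolCls(word2vec_embed, len(data_bundle.get_vocab('target')))

    trainer = Trainer(data_bundle.get_dataset('train'), model,
                      optimizer=Adam(model.parameters(), lr=1e-3),
                      loss=CrossEntropyLoss(),
                      metrics=AccuracyMetric(),
                      dev_data=data_bundle.get_dataset('dev'),
                      device=0)
    trainer.train()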



----------------------------------
Download the Code
----------------------------------

.. raw:: html

    <a href="../_static/notebooks/%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB.ipynb" download="文本分类.ipynb">Click here to download the IPython Notebook file</a><hr>


+ 0
- 15
docs/source/user/api_update.rst

@@ -1,15 +0,0 @@
===========================
API Change Log
===========================

2020.4.14
========================

The API of :class:`fastNLP.core.callback.ControlC` has been changed.

The original parameter ``quit_all`` has been renamed to ``quit_and_do`` and still takes a bool. A new optional parameter ``action``
accepts a function to execute: when ``quit_and_do`` is ``True``, that function is run after the training process has been quit.
The default value of ``action`` exits the whole program, consistent with the original behaviour.

.. note::
    The old usages `ControlC(True)` and `ControlC(False)` continue to run correctly, but `ControlC(quit_all=True/False)` must be changed to
    `ControlC(quit_and_do=True/False)`.
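
A short illustration of the new keyword usage described above (attaching the callback to a Trainer is unchanged; the ``action`` shown here is only an example):

.. code-block:: python

    from fastNLP import ControlC

    # the old positional usage still works: ControlC(True) / ControlC(False)
    cb = ControlC(quit_and_do=True)    # on Ctrl-C: quit training, then exit the program (the default action)
    cb = ControlC(quit_and_do=True, action=lambda: print("training interrupted"))  # quit training, then run action
    cb = ControlC(quit_and_do=False)   # on Ctrl-C: only stop the current training run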

+ 0
- 162
docs/source/user/example.rst

@@ -1,162 +0,0 @@
===========
Main Title
===========

.. note::
    For Chinese headings, the number of underline symbols must be at least twice the number of Chinese characters.

.. warning::
    The number of symbols may only be more than required, never fewer.

Subtitle 1
###########

Subtitle 2
***********

Subtitle 3 (the style normally used)
====================================

Subtitle 4
-------------------

The main title together with subtitle styles 3 and 4 are the recommended ones.

Official documentation: http://docutils.sourceforge.net/docs/user/rst/quickref.html

`Readers familiar with Markdown may find this article a helpful reference <https://macplay.github.io/posts/cong-markdown-dao-restructuredtext/#id30>`_

The text inside \<\> is the link target; the text outside \<\> is what gets displayed

Common Syntax
=============

*emphasis*

**strong**

`text`

``inline literal``

http://docutils.sf.net/ A standalone URL is turned into a link automatically

A link displayed as custom text: `sohu <http://www.sohu.com>`_

An emphasised term
    the text above it

Normal indentation

    forms an indented block



Special Constructs
==================

Option lists are recognised automatically

-v           An option
-o file      Same with value
--delta      A long option
--delta=len  Same with value


Images

.. image:: ../figures/procedures.PNG
    :height: 200
    :width: 560
    :scale: 50
    :alt: alternate text
    :align: center

A literal block that displays one colon::

    a blank line is required in between

::

    a literal block without a visible colon

.. code-block:: python
    :linenos:
    :emphasize-lines: 1,3

    print("a proper code block")
    print("")
    print("with line numbers and highlighting")

Math Blocks
===========

.. math::

    2Na + 2H_2O = 2NaOH + H_2 \uparrow

Complex Tables
==============

+------------------------+------------+----------+----------+
| Header row, column 1   | Header 2   | Header 3 | Header 4 |
| (header rows optional) |            |          |          |
+========================+============+==========+==========+
| body row 1, column 1   | column 2   | column 3 | column 4 |
+------------------------+------------+----------+----------+
| body row 2             | Cells may span columns.          |
+------------------------+------------+---------------------+
| body row 3             | Cells may  | - Table cells       |
+------------------------+ span rows. | - contain           |
| body row 4             |            | - body elements.    |
+------------------------+------------+---------------------+

Simple Tables
=============

=====  =====  ======
   Inputs     Output
------------  ------
  A      B    A or B
=====  =====  ======
False  False  False
True   True   True
=====  =====  ======

CSV Tables
============

.. csv-table::
    :header: sentence, target

    This is the first instance ., 0
    Second instance ., 1
    Third instance ., 1
    ..., ...



[Important] Cross-references
============================

These cross-references let us link to various places in the fastNLP documentation

The text inside \<\> is the link target; the text outside \<\> is what gets displayed

:doc:`Link by file name </user/quickstart>`

:mod:`~fastNLP.core.batch`

:class:`~fastNLP.Batch`

The ~ means that only the last component is displayed

:meth:`fastNLP.DataSet.apply`

The code below does not work as written; the docstring must be written with r""" instead:

.. code::

    :param float beta: the f_beta score, :math:`f_{beta} = \frac{(1 + {beta}^{2})*(pre*rec)}{({beta}^{2}*pre + rec)}` . Common values are `beta=0.5, 1, 2`: with 0.5 precision is weighted higher than recall, with 1 they are weighted equally, and with 2 recall is weighted higher than precision.
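
A minimal illustration of the fix (the function name is only a placeholder): prefixing the docstring with ``r`` keeps the backslashes in the :math: expression from being interpreted as escape sequences.

.. code-block:: python

    def f_beta_doc_example():
        r"""
        :param float beta: the f_beta score,
            :math:`f_{beta} = \frac{(1 + {beta}^{2})*(pre*rec)}{({beta}^{2}*pre + rec)}` .
            Common values are `beta=0.5, 1, 2`.
        """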


+ 0
- 24
docs/source/user/installation.rst

@@ -1,24 +0,0 @@
===================
Installation Guide
===================

.. contents::
    :local:

fastNLP depends on the following packages::

    numpy>=1.14.2
    torch>=1.0.0
    tqdm>=4.28.1
    nltk>=3.4.1
    requests
    spacy
    prettytable>=0.7.2

The installation of torch may depend on your operating system and CUDA version; please refer to the `PyTorch website <https://pytorch.org/>`_ .
Once the dependencies are installed, you can complete the installation by running the following commands on the command line:

.. code:: shell

    >>> pip install fastNLP
    >>> python -m spacy download en
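
To verify that the installation succeeded, you can import the package and print its version (the exact number depends on the release you installed):

.. code:: python

    import fastNLP
    print(fastNLP.__version__)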

+ 0
- 14
docs/source/user/quickstart.rst

@@ -1,14 +0,0 @@
===============
Quick Start
===============

If you want to use fastNLP to solve a certain class of NLP problems quickly, you can refer to one of the following tutorials:

.. toctree::
    :maxdepth: 1

    /tutorials/文本分类
    /tutorials/序列标注

These tutorials give a brief introduction to the fastNLP workflow; the text classification tutorial (文本分类) is relatively simple, while the sequence labelling one (序列标注) is more involved. For more tutorials, see :doc:`/user/tutorials`


+ 0
- 25
docs/source/user/tutorials.rst

@@ -1,25 +0,0 @@
============================
Detailed fastNLP Tutorials
============================

These are the more detailed tutorials. For most users we recommend reading them in order, starting from the first one; if you only want to learn about part of the material, you can also read selectively.

.. toctree::
    :maxdepth: 1

    Preprocessing text with DataSet </tutorials/tutorial_1_data_preprocess>
    Converting between text and indices with Vocabulary </tutorials/tutorial_2_vocabulary>
    Turning text into vectors with the Embedding module </tutorials/tutorial_3_embedding>
    Loading and processing datasets with Loader and Pipe </tutorials/tutorial_4_load_dataset>
    Quick training and testing with Trainer and Tester </tutorials/tutorial_5_loss_optimizer>
    Implementing a custom training loop with DataSetIter </tutorials/tutorial_6_datasetiter>
    Evaluating your model quickly with Metric </tutorials/tutorial_7_metrics>
    Building custom models quickly with Modules and Models </tutorials/tutorial_8_modules_models>
    Customising your training procedure with Callback </tutorials/tutorial_9_callback>

.. toctree::
    :maxdepth: 1

    Further reading 1: the various uses of BertEmbedding </tutorials/extend_1_bert_embedding>
    Further reading 2: an introduction to distributed training </tutorials/extend_2_dist>
    Further reading 3: using fitlog to assist research with fastNLP </tutorials/extend_3_fitlog>

+ 0
- 98
fastNLP/__init__.py

@@ -1,98 +0,0 @@
r"""
fastNLP 由 :mod:`~fastNLP.core` 、 :mod:`~fastNLP.io` 、:mod:`~fastNLP.embeddings` 、 :mod:`~fastNLP.modules`、
:mod:`~fastNLP.models` 等子模块组成,你可以查看每个模块的文档。

- :mod:`~fastNLP.core` 是fastNLP 的核心模块,包括 DataSet、 Trainer、 Tester 等组件。详见文档 :mod:`fastNLP.core`
- :mod:`~fastNLP.io` 是实现输入输出的模块,包括了数据集的读取,模型的存取等功能。详见文档 :mod:`fastNLP.io`
- :mod:`~fastNLP.embeddings` 提供用于构建复杂网络模型所需的各种embedding。详见文档 :mod:`fastNLP.embeddings`
- :mod:`~fastNLP.modules` 包含了用于搭建神经网络模型的诸多组件,可以帮助用户快速搭建自己所需的网络。详见文档 :mod:`fastNLP.modules`
- :mod:`~fastNLP.models` 包含了一些使用 fastNLP 实现的完整网络模型,包括 :class:`~fastNLP.models.CNNText` 、 :class:`~fastNLP.models.SeqLabeling` 等常见模型。详见文档 :mod:`fastNLP.models`

fastNLP 中最常用的组件可以直接从 fastNLP 包中 import ,他们的文档如下:
"""
__all__ = [
"Instance",
"FieldArray",
"DataSetIter",
"BatchIter",
"TorchLoaderIter",
"Vocabulary",
"DataSet",
"Const",
"Trainer",
"Tester",

"DistTrainer",
"get_local_rank",
"Callback",
"GradientClipCallback",
"EarlyStopCallback",
"FitlogCallback",
"EvaluateCallback",
"LRScheduler",
"ControlC",
"LRFinder",
"TensorboardCallback",
"WarmupCallback",
'SaveModelCallback',
"CallbackException",
"EarlyStopError",
"CheckPointCallback",

"Padder",
"AutoPadder",
"EngChar2DPadder",

# "CollateFn",
"ConcatCollateFn",

"MetricBase",
"AccuracyMetric",
"SpanFPreRecMetric",
"CMRC2018Metric",
"ClassifyFPreRecMetric",
"ConfusionMatrixMetric",
"Optimizer",
"SGD",
"Adam",
"AdamW",
"Sampler",
"SequentialSampler",
"BucketSampler",
"RandomSampler",
"SortedSampler",
"ConstantTokenNumSampler",
"LossFunc",
"CrossEntropyLoss",
"MSELoss",
"L1Loss",
"BCELoss",
"NLLLoss",
"LossInForward",
"LossBase",
"CMRC2018Loss",
"cache_results",
'logger',
"init_logger_dist",
]
__version__ = '0.6.0'

import sys

from . import embeddings
from . import models
from . import modules
from .core import *
from .doc_utils import doc_process
from .io import loader, pipe

doc_process(sys.modules[__name__])

+ 0
- 112
fastNLP/core/__init__.py

@@ -1,112 +0,0 @@
r"""
core 模块里实现了 fastNLP 的核心框架,常用的功能都可以从 fastNLP 包中直接 import。当然你也同样可以从 core 模块的子模块中 import,
例如 :class:`~fastNLP.DataSetIter` 组件有两种 import 的方式::
# 直接从 fastNLP 中 import
from fastNLP import DataSetIter
# 从 core 模块的子模块 batch 中 import DataSetIter
from fastNLP.core.batch import DataSetIter

对于常用的功能,你只需要在 :mod:`fastNLP` 中查看即可。如果想了解各个子模块的具体作用,您可以在下面找到每个子模块的具体文档。

"""
__all__ = [
"DataSet",
"Instance",
"FieldArray",
"Padder",
"AutoPadder",
"EngChar2DPadder",

"ConcatCollateFn",
"Vocabulary",
"DataSetIter",
"BatchIter",
"TorchLoaderIter",
"Const",
"Tester",
"Trainer",

"DistTrainer",
"get_local_rank",

"cache_results",
"seq_len_to_mask",
"get_seq_len",
"logger",
"init_logger_dist",

"Callback",
"GradientClipCallback",
"EarlyStopCallback",
"FitlogCallback",
"EvaluateCallback",
"LRScheduler",
"ControlC",
"LRFinder",
"TensorboardCallback",
"WarmupCallback",
'SaveModelCallback',
"CallbackException",
"EarlyStopError",
"CheckPointCallback",

"LossFunc",
"CrossEntropyLoss",
"L1Loss",
"BCELoss",
"BCEWithLogits",
"NLLLoss",
"LossInForward",
"CMRC2018Loss",
"MSELoss",
"LossBase",

"MetricBase",
"AccuracyMetric",
"SpanFPreRecMetric",
"CMRC2018Metric",
"ClassifyFPreRecMetric",
"ConfusionMatrixMetric",

"Optimizer",
"SGD",
"Adam",
"AdamW",
"SequentialSampler",
"BucketSampler",
"RandomSampler",
"Sampler",
"SortedSampler",
"ConstantTokenNumSampler"
]

from ._logger import logger, init_logger_dist
from .batch import DataSetIter, BatchIter, TorchLoaderIter
from .callback import Callback, GradientClipCallback, EarlyStopCallback, FitlogCallback, EvaluateCallback, \
LRScheduler, ControlC, LRFinder, TensorboardCallback, WarmupCallback, SaveModelCallback, CallbackException, \
EarlyStopError, CheckPointCallback
from .const import Const
from .dataset import DataSet
from .field import FieldArray, Padder, AutoPadder, EngChar2DPadder
from .instance import Instance
from .losses import LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, \
LossInForward, CMRC2018Loss, LossBase, MSELoss, BCEWithLogits
from .metrics import AccuracyMetric, SpanFPreRecMetric, CMRC2018Metric, ClassifyFPreRecMetric, MetricBase,\
ConfusionMatrixMetric
from .optimizer import Optimizer, SGD, Adam, AdamW
from .sampler import SequentialSampler, BucketSampler, RandomSampler, Sampler, SortedSampler, ConstantTokenNumSampler
from .tester import Tester
from .trainer import Trainer
from .utils import cache_results, seq_len_to_mask, get_seq_len
from .vocabulary import Vocabulary
from .collate_fn import ConcatCollateFn
from .dist_trainer import DistTrainer, get_local_rank

+ 0
- 179
fastNLP/core/_logger.py

@@ -1,179 +0,0 @@
r"""
Logger 是fastNLP中记录日志的模块,logger封装了logging模块的Logger,
具体使用方式与直接使用logging.Logger相同,同时也新增一些简单好用的API
使用方式:
from fastNLP import logger
#
# logger 可以和 logging.Logger 一样使用
logger.info('your msg')
logger.error('your msg')

# logger 新增的API
# 将日志输出到文件,以及输出的日志等级
logger.add_file('/path/to/log', level='INFO')
# 定义在命令行中的显示格式和日志等级
logger.set_stdout('tqdm', level='WARN')

"""

__all__ = [
'logger',
'init_logger_dist'
]

import logging
import logging.config
import os
import sys
import warnings
from torch import distributed as dist

ROOT_NAME = 'fastNLP'

try:
import fitlog
except ImportError:
fitlog = None
try:
from tqdm.auto import tqdm
except ImportError:
tqdm = None

if tqdm is not None:
class TqdmLoggingHandler(logging.Handler):
def __init__(self, level=logging.INFO):
super().__init__(level)
def emit(self, record):
try:
msg = self.format(record)
tqdm.write(msg)
self.flush()
except (KeyboardInterrupt, SystemExit):
raise
except:
self.handleError(record)
else:
class TqdmLoggingHandler(logging.StreamHandler):
def __init__(self, level=logging.INFO):
super().__init__(sys.stdout)
self.setLevel(level)


def _get_level(level):
if isinstance(level, int):
pass
else:
level = level.lower()
level = {'info': logging.INFO, 'debug': logging.DEBUG,
'warn': logging.WARN, 'warning': logging.WARN,
'error': logging.ERROR}[level]
return level


def _add_file_handler(logger, path, level='INFO'):
for h in logger.handlers:
if isinstance(h, logging.FileHandler):
if os.path.abspath(path) == h.baseFilename:
# file path already added
return
# File Handler
if os.path.exists(path):
assert os.path.isfile(path)
warnings.warn('log already exists in {}'.format(path))
dirname = os.path.abspath(os.path.dirname(path))
os.makedirs(dirname, exist_ok=True)
file_handler = logging.FileHandler(path, mode='a')
file_handler.setLevel(_get_level(level))
file_formatter = logging.Formatter(fmt='%(asctime)s - %(module)s - [%(levelname)s] - %(message)s',
datefmt='%Y/%m/%d %H:%M:%S')
file_handler.setFormatter(file_formatter)
logger.addHandler(file_handler)


def _set_stdout_handler(logger, stdout='tqdm', level='INFO'):
level = _get_level(level)
if stdout not in ['none', 'plain', 'tqdm']:
raise ValueError('stdout must in one of {}'.format(['none', 'plain', 'tqdm']))
# make sure to initialize logger only once
stream_handler = None
for i, h in enumerate(logger.handlers):
if isinstance(h, (logging.StreamHandler, TqdmLoggingHandler)):
stream_handler = h
break
if stream_handler is not None:
logger.removeHandler(stream_handler)
# Stream Handler
if stdout == 'plain':
stream_handler = logging.StreamHandler(sys.stdout)
elif stdout == 'tqdm':
stream_handler = TqdmLoggingHandler(level)
else:
stream_handler = None
if stream_handler is not None:
stream_formatter = logging.Formatter('%(message)s')
stream_handler.setLevel(level)
stream_handler.setFormatter(stream_formatter)
logger.addHandler(stream_handler)


class FastNLPLogger(logging.getLoggerClass()):
def __init__(self, name):
super().__init__(name)
def add_file(self, path='./log.txt', level='INFO'):
r"""add log output file and the output level"""
_add_file_handler(self, path, level)
def set_stdout(self, stdout='tqdm', level='INFO'):
r"""set stdout format and the output level"""
_set_stdout_handler(self, stdout, level)


logging.setLoggerClass(FastNLPLogger)


# print(logging.getLoggerClass())
# print(logging.getLogger())

def _init_logger(path=None, stdout='tqdm', level='INFO'):
r"""initialize logger"""
level = _get_level(level)
# logger = logging.getLogger()
logger = logging.getLogger(ROOT_NAME)
logger.propagate = False
logger.setLevel(1) # make the logger the lowest level
_set_stdout_handler(logger, stdout, level)
# File Handler
if path is not None:
_add_file_handler(logger, path, level)
return logger


def _get_logger(name=None, level='INFO'):
level = _get_level(level)
if name is None:
name = ROOT_NAME
assert isinstance(name, str)
if not name.startswith(ROOT_NAME):
name = '{}.{}'.format(ROOT_NAME, name)
logger = logging.getLogger(name)
logger.setLevel(level)
return logger


logger = _init_logger(path=None, level='INFO')


def init_logger_dist():
global logger
rank = dist.get_rank()
logger.setLevel(logging.INFO if rank == 0 else logging.WARNING)

+ 0
- 107
fastNLP/core/_parallel_utils.py

@@ -1,107 +0,0 @@
r"""undocumented"""

__all__ = []

import threading

import torch
from torch import nn
from torch.nn.parallel.parallel_apply import get_a_var
from torch.nn.parallel.replicate import replicate
from torch.nn.parallel.scatter_gather import scatter_kwargs, gather


def parallel_apply(modules, func_name, inputs, kwargs_tup=None, devices=None):
r"""Applies each `module` in :attr:`modules` in parallel on arguments
contained in :attr:`inputs` (positional) and :attr:`kwargs_tup` (keyword)
on each of :attr:`devices`.

:attr:`modules`, :attr:`inputs`, :attr:`kwargs_tup` (if given), and
:attr:`devices` (if given) should all have same length. Moreover, each
element of :attr:`inputs` can either be a single object as the only argument
to a module, or a collection of positional arguments.
"""
assert len(modules) == len(inputs)
if kwargs_tup is not None:
assert len(modules) == len(kwargs_tup)
else:
kwargs_tup = ({},) * len(modules)
if devices is not None:
assert len(modules) == len(devices)
else:
devices = [None] * len(modules)
lock = threading.Lock()
results = {}
grad_enabled = torch.is_grad_enabled()
def _worker(i, module, input, kwargs, device=None):
torch.set_grad_enabled(grad_enabled)
if device is None:
device = get_a_var(input).get_device()
try:
with torch.cuda.device(device):
# this also avoids accidental slicing of `input` if it is a Tensor
if not isinstance(input, (list, tuple)):
input = (input,)
output = getattr(module, func_name)(*input, **kwargs)
with lock:
results[i] = output
except Exception as e:
with lock:
results[i] = e
if len(modules) > 1:
threads = [threading.Thread(target=_worker,
args=(i, module, input, kwargs, device))
for i, (module, input, kwargs, device) in
enumerate(zip(modules, inputs, kwargs_tup, devices))]
for thread in threads:
thread.start()
for thread in threads:
thread.join()
else:
_worker(0, modules[0], inputs[0], kwargs_tup[0], devices[0])
outputs = []
for i in range(len(inputs)):
output = results[i]
if isinstance(output, Exception):
raise output
outputs.append(output)
return outputs


def _data_parallel_wrapper(func_name, device_ids, output_device):
r"""
A wrapper for functions that need to be executed on multiple GPUs; modelled on the forward function of nn.DataParallel.

:param str func_name: the function of the network that should be run on multiple GPUs
:param device_ids: the device_ids passed to nn.DataParallel
:param output_device: the output_device passed to nn.DataParallel
:return:
"""
def wrapper(network, *inputs, **kwargs):
inputs, kwargs = scatter_kwargs(inputs, kwargs, device_ids, dim=0)
if len(device_ids) == 1:
return getattr(network, func_name)(*inputs[0], **kwargs[0])
replicas = replicate(network, device_ids[:len(inputs)])
outputs = parallel_apply(replicas, func_name, inputs, kwargs, device_ids[:len(replicas)])
return gather(outputs, output_device)
return wrapper


def _model_contains_inner_module(model):
r"""

:param nn.Module model: the model; checks whether it contains model.module internally, which is mostly used to check whether the
    model is nn.DataParallel or nn.parallel.DistributedDataParallel. This matters because argument matching has to use the functions of the innermost model.
:return: bool
"""
if isinstance(model, nn.Module):
if isinstance(model, (nn.DataParallel, nn.parallel.DistributedDataParallel)):
return True
return False

Some files were not shown because too many files changed in this diff
