# Conflicts:
#   fastNLP/modules/encoder/embedding.py
#   reproduction/seqence_labelling/ner/train_ontonote.py
#   reproduction/text_classification/model/lstm.py
tags/v0.4.10
@@ -55,12 +55,17 @@ python -m spacy download en | |||||
## 内置组件 | ## 内置组件 | ||||
大部分用于 NLP 任务的神经网络都可以看做由编码器(encoder)、解码器(decoder)两种模块组成。 | |||||
大部分用于 NLP 任务的神经网络都可以看做由词嵌入(embeddings)与编码器(encoder)、解码器(decoder)两种模块组成。 | |||||
以文本分类任务为例,下图展示了一个BiLSTM+Attention实现文本分类器的模型流程图: | |||||
 |  | ||||
fastNLP 在 modules 模块中内置了两种模块的诸多组件,可以帮助用户快速搭建自己所需的网络。 两种模块的功能和常见组件如下: | |||||
fastNLP 在 embeddings 模块中内置了几种不同的embedding:静态embedding(GloVe、word2vec)、上下文相关embedding | |||||
(ELMo、BERT)、字符embedding(基于CNN或者LSTM的CharEmbedding)。 | |||||
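下面是一个使用静态 embedding 的简单示意(假设 embeddings 模块提供 `StaticEmbedding` 这样的接口,模型名称仅为示例,具体以 embeddings 模块的文档为准):

```python
from fastNLP import Vocabulary
from fastNLP.embeddings import StaticEmbedding

# 构建词表,并用预训练的 GloVe 词向量初始化 embedding(模型名称仅为示例)
vocab = Vocabulary()
vocab.add_word_lst("this is an example .".split())
embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-100d')
```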
与此同时,fastNLP 在 modules 模块中内置了两种模块的诸多组件,可以帮助用户快速搭建自己所需的网络。 两种模块的功能和常见组件如下: | |||||
<table> | <table> | ||||
<tr> | <tr> | ||||
@@ -104,6 +109,10 @@ fastNLP的大致工作流程如上图所示,而项目结构如下: | |||||
<td><b> fastNLP.modules </b></td> | <td><b> fastNLP.modules </b></td> | ||||
<td> 实现了用于搭建神经网络模型的诸多组件 </td> | <td> 实现了用于搭建神经网络模型的诸多组件 </td> | ||||
</tr> | </tr> | ||||
<tr> | |||||
<td><b> fastNLP.embeddings </b></td> | |||||
<td> 实现了将序列index转为向量序列的功能,包括读取预训练embedding等 </td> | |||||
</tr> | |||||
<tr> | <tr> | ||||
<td><b> fastNLP.io </b></td> | <td><b> fastNLP.io </b></td> | ||||
<td> 实现了读写功能,包括数据读入,模型读写等 </td> | <td> 实现了读写功能,包括数据读入,模型读写等 </td> | ||||
@@ -0,0 +1,7 @@ | |||||
fastNLP.io.data\_loader | |||||
========================== | |||||
.. automodule:: fastNLP.io.data_loader | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -12,6 +12,7 @@ fastNLP.io | |||||
.. toctree:: | .. toctree:: | ||||
:titlesonly: | :titlesonly: | ||||
fastNLP.io.data_loader | |||||
fastNLP.io.base_loader | fastNLP.io.base_loader | ||||
fastNLP.io.dataset_loader | fastNLP.io.dataset_loader | ||||
fastNLP.io.embed_loader | fastNLP.io.embed_loader | ||||
@@ -6,7 +6,7 @@ | |||||
教程目录: | 教程目录: | ||||
- `Part I: 数据集信息`_ | |||||
- `Part I: 数据集容器`_ | |||||
- `Part II: 数据集的使用方式`_ | - `Part II: 数据集的使用方式`_ | ||||
- `Part III: 不同数据类型的DataSetLoader`_ | - `Part III: 不同数据类型的DataSetLoader`_ | ||||
- `Part IV: DataSetLoader举例`_ | - `Part IV: DataSetLoader举例`_ | ||||
@@ -14,11 +14,11 @@ | |||||
---------------------------- | ---------------------------- | ||||
Part I: 数据集信息 | |||||
Part I: 数据集容器 | |||||
---------------------------- | ---------------------------- | ||||
在fastNLP中,我们使用 :class:`~fastNLP.io.base_loader.DataInfo` 来存储数据集信息。 :class:`~fastNLP.io.base_loader.DataInfo` | |||||
类包含了两个重要内容: `datasets` 和 `vocabs` 。 | |||||
在fastNLP中,我们使用 :class:`~fastNLP.io.base_loader.DataBundle` 来存储数据集信息。 | |||||
:class:`~fastNLP.io.base_loader.DataBundle` 类包含了两个重要内容: `datasets` 和 `vocabs` 。 | |||||
`datasets` 是一个 `key` 为数据集名称(如 `train` , `dev` ,和 `test` 等), `value` 为 :class:`~fastNLP.DataSet` 的字典。 | `datasets` 是一个 `key` 为数据集名称(如 `train` , `dev` ,和 `test` 等), `value` 为 :class:`~fastNLP.DataSet` 的字典。 | ||||
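例如,拿到一个 :class:`~fastNLP.io.base_loader.DataBundle` 之后,可以像下面这样取出其中的 DataSet 和 Vocabulary(这里的 data_bundle 仅为示意,假设已由某个 DataSetLoader 的 process() 得到):

.. code-block:: python

    train_data = data_bundle.datasets['train']   # 一个 fastNLP.DataSet
    dev_data = data_bundle.datasets['dev']
    word_vocab = data_bundle.vocabs['words']     # 一个 fastNLP.Vocabulary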
@@ -91,11 +91,11 @@ Part IV: DataSetLoader举例 | |||||
以Matching任务为例子: | 以Matching任务为例子: | ||||
:class:`~fastNLP.io.data_loader.matching.MatchingLoader` | |||||
我们在fastNLP当中封装了一个Matching任务数据集的数据加载类: :class:`~fastNLP.io.data_loader.matching.MatchingLoader` . | |||||
:class:`~fastNLP.io.data_loader.MatchingLoader` | |||||
我们在fastNLP当中封装了一个Matching任务数据集的数据加载类: :class:`~fastNLP.io.data_loader.MatchingLoader` . | |||||
在MatchingLoader类当中我们封装了一个对数据集中的文本内容进行进一步的预处理的函数: | 在MatchingLoader类当中我们封装了一个对数据集中的文本内容进行进一步的预处理的函数: | ||||
:meth:`~fastNLP.io.data_loader.matching.MatchingLoader.process` | |||||
:meth:`~fastNLP.io.data_loader.MatchingLoader.process` | |||||
这个函数具有各种预处理option,如: | 这个函数具有各种预处理option,如: | ||||
- 是否将文本转成全小写 | - 是否将文本转成全小写 | ||||
- 是否需要序列长度信息,需要什么类型的序列长度信息 | - 是否需要序列长度信息,需要什么类型的序列长度信息 | ||||
@@ -104,90 +104,121 @@ Part IV: DataSetLoader举例 | |||||
具体内容参见 :meth:`fastNLP.io.MatchingLoader.process` 。 | 具体内容参见 :meth:`fastNLP.io.MatchingLoader.process` 。 | ||||
:class:`~fastNLP.io.data_loader.matching.SNLILoader` | |||||
:class:`~fastNLP.io.data_loader.SNLILoader` | |||||
一个关于SNLI数据集的DataSetLoader。SNLI数据集来自 | 一个关于SNLI数据集的DataSetLoader。SNLI数据集来自 | ||||
`SNLI Data Set <https://nlp.stanford.edu/projects/snli/snli_1.0.zip>`_ . | `SNLI Data Set <https://nlp.stanford.edu/projects/snli/snli_1.0.zip>`_ . | ||||
在 :class:`~fastNLP.io.data_loader.matching.SNLILoader` 的 :meth:`~fastNLP.io.data_loader.matching.SNLILoader._load` | |||||
函数中,我们用以下代码将数据集内容从文本文件读入内存 | |||||
在 :class:`~fastNLP.io.data_loader.SNLILoader` 的 :meth:`~fastNLP.io.data_loader.SNLILoader._load` | |||||
函数中,我们用以下代码将数据集内容从文本文件读入内存: | |||||
.. code-block:: python | .. code-block:: python | ||||
def _load(self, path): | |||||
ds = JsonLoader._load(self, path) # SNLI数据集原始文件为Json格式,可以采用JsonLoader来读取数据集文件 | |||||
data = SNLILoader().process( | |||||
paths='path/to/snli/data', to_lower=False, seq_len_type='seq_len', | |||||
get_index=True, concat=False, | |||||
) | |||||
print(data) | |||||
parentheses_table = str.maketrans({'(': None, ')': None}) | |||||
# 字符串匹配格式:SNLI数据集的文本中由括号分割开的,组成树结构,因此 | |||||
# 我们将这些括号去除。 | |||||
输出的内容是:: | |||||
ds.apply(lambda ins: ins[Const.INPUTS(0)].translate(parentheses_table).strip().split(), | |||||
new_field_name=Const.INPUTS(0)) | |||||
# 把第一句话的内容用上面的字符串匹配格式进行替换,并将句子分割为一个由单词组成的list | |||||
ds.apply(lambda ins: ins[Const.INPUTS(1)].translate(parentheses_table).strip().split(), | |||||
new_field_name=Const.INPUTS(1)) | |||||
# 对第二句话的内容进行同样的预处理 | |||||
ds.drop(lambda x: x[Const.TARGET] == '-') # 将标签为'-'的样本丢掉 | |||||
return ds | |||||
In total 3 datasets: | |||||
train has 549367 instances. | |||||
dev has 9842 instances. | |||||
test has 9824 instances. | |||||
In total 2 vocabs: | |||||
words has 43154 entries. | |||||
target has 3 entries. | |||||
------------------------------------------ | |||||
Part V: fastNLP封装好的数据集加载器 | |||||
------------------------------------------ | |||||
fastNLP封装好的数据集加载器可以适用于多种类型的任务: | |||||
- `文本分类任务`_ | |||||
- `序列标注任务`_ | |||||
- `Matching任务`_ | |||||
- `指代消解任务`_ | |||||
- `摘要任务`_ | |||||
这里的data是一个 :class:`~fastNLP.io.base_loader.DataBundle` ,取 ``datasets`` 字典里的内容即可直接传入 | |||||
:class:`~fastNLP.Trainer` 或者 :class:`~fastNLP.Tester` 进行训练或者测试。 | |||||
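一个简单的示意如下(其中 model、loss、metrics 均假设已经定义好):

.. code-block:: python

    from fastNLP import Trainer

    trainer = Trainer(train_data=data.datasets['train'], model=model,
                      loss=loss, metrics=metrics,
                      dev_data=data.datasets['dev'])
    trainer.train()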
:class:`~fastNLP.io.data_loader.IMDBLoader` | |||||
以IMDB数据集为例,在 :class:`~fastNLP.io.data_loader.IMDBLoader` 的 :meth:`~fastNLP.io.data_loader.IMDBLoader._load` | |||||
函数中,我们用以下代码将数据集内容从文本文件读入内存: | |||||
文本分类任务 | |||||
------------------- | |||||
文本分类任务 | |||||
.. code-block:: python | |||||
data = IMDBLoader().process( | |||||
paths={'train': 'path/to/train/file', 'test': 'path/to/test/file'} | |||||
) | |||||
print(data) | |||||
输出的内容是:: | |||||
序列标注任务 | |||||
------------------- | |||||
In total 3 datasets: | |||||
train has 22500 instances. | |||||
test has 25000 instances. | |||||
dev has 2500 instances. | |||||
In total 2 vocabs: | |||||
words has 82846 entries. | |||||
target has 2 entries. | |||||
序列标注任务 | |||||
这里将原来的train集按9:1的比例分成了训练集和验证集。 | |||||
Matching任务 | |||||
------------------- | |||||
:class:`~fastNLP.io.data_loader.matching.SNLILoader` | |||||
一个关于SNLI数据集的DataSetLoader。SNLI数据集来自 | |||||
`SNLI Data Set <https://nlp.stanford.edu/projects/snli/snli_1.0.zip>`_ . | |||||
------------------------------------------ | |||||
Part V: fastNLP封装好的数据集加载器 | |||||
------------------------------------------ | |||||
:class:`~fastNLP.io.data_loader.matching.MNLILoader` | |||||
一个关于MultiNLI数据集的DataSetLoader。MultiNLI数据集来自 `GLUE benchmark <https://gluebenchmark.com/tasks>`_ | |||||
fastNLP封装好的数据集加载器可以适用于多种类型的任务: | |||||
:class:`~fastNLP.io.data_loader.matching.QNLILoader` | |||||
一个关于QNLI数据集的DataSetLoader。QNLI数据集来自 `GLUE benchmark <https://gluebenchmark.com/tasks>`_ | |||||
- `文本分类任务`_ | |||||
- `序列标注任务`_ | |||||
- `Matching任务`_ | |||||
:class:`~fastNLP.io.data_loader.matching.RTELoader` | |||||
一个关于Recognizing Textual Entailment数据集(RTE)的DataSetLoader。RTE数据集来自 | |||||
`GLUE benchmark <https://gluebenchmark.com/tasks>`_ | |||||
:class:`~fastNLP.io.data_loader.matching.QuoraLoader` | |||||
一个关于Quora数据集的DataSetLoader。 | |||||
文本分类任务 | |||||
------------------- | |||||
========================== ================================================================== | |||||
数据集名称 数据集加载器 | |||||
-------------------------- ------------------------------------------------------------------ | |||||
IMDb :class:`~fastNLP.io.data_loader.IMDBLoader` | |||||
-------------------------- ------------------------------------------------------------------ | |||||
SST :class:`~fastNLP.io.data_loader.SSTLoader` | |||||
-------------------------- ------------------------------------------------------------------ | |||||
SST-2 :class:`~fastNLP.io.data_loader.SST2Loader` | |||||
-------------------------- ------------------------------------------------------------------ | |||||
Yelp Polarity :class:`~fastNLP.io.data_loader.YelpLoader` | |||||
-------------------------- ------------------------------------------------------------------ | |||||
Yelp Full :class:`~fastNLP.io.data_loader.YelpLoader` | |||||
-------------------------- ------------------------------------------------------------------ | |||||
MTL16 :class:`~fastNLP.io.data_loader.MTL16Loader` | |||||
========================== ================================================================== | |||||
指代消解任务 | |||||
序列标注任务 | |||||
------------------- | ------------------- | ||||
指代消解任务 | |||||
========================== ================================================================== | |||||
数据集名称 数据集加载器 | |||||
-------------------------- ------------------------------------------------------------------ | |||||
Conll :class:`~fastNLP.io.data_loader.ConllLoader` | |||||
-------------------------- ------------------------------------------------------------------ | |||||
Conll2003 :class:`~fastNLP.io.data_loader.Conll2003Loader` | |||||
-------------------------- ------------------------------------------------------------------ | |||||
人民日报数据集 :class:`~fastNLP.io.data_loader.PeopleDailyCorpusLoader` | |||||
========================== ================================================================== | |||||
摘要任务 | |||||
Matching任务 | |||||
------------------- | ------------------- | ||||
摘要任务 | |||||
========================== ================================================================== | |||||
数据集名称 数据集加载器 | |||||
-------------------------- ------------------------------------------------------------------ | |||||
SNLI :class:`~fastNLP.io.data_loader.SNLILoader` | |||||
-------------------------- ------------------------------------------------------------------ | |||||
MultiNLI :class:`~fastNLP.io.data_loader.MNLILoader` | |||||
-------------------------- ------------------------------------------------------------------ | |||||
QNLI :class:`~fastNLP.io.data_loader.QNLILoader` | |||||
-------------------------- ------------------------------------------------------------------ | |||||
RTE :class:`~fastNLP.io.data_loader.RTELoader` | |||||
-------------------------- ------------------------------------------------------------------ | |||||
Quora Pair Dataset :class:`~fastNLP.io.data_loader.QuoraLoader` | |||||
========================== ================================================================== | |||||
@@ -5,7 +5,7 @@ fastNLP 由 :mod:`~fastNLP.core` 、 :mod:`~fastNLP.io` 、:mod:`~fastNLP.module | |||||
- :mod:`~fastNLP.core` 是fastNLP 的核心模块,包括 DataSet、 Trainer、 Tester 等组件。详见文档 :doc:`/fastNLP.core` | - :mod:`~fastNLP.core` 是fastNLP 的核心模块,包括 DataSet、 Trainer、 Tester 等组件。详见文档 :doc:`/fastNLP.core` | ||||
- :mod:`~fastNLP.io` 是实现输入输出的模块,包括了数据集的读取,模型的存取等功能。详见文档 :doc:`/fastNLP.io` | - :mod:`~fastNLP.io` 是实现输入输出的模块,包括了数据集的读取,模型的存取等功能。详见文档 :doc:`/fastNLP.io` | ||||
- :mod:`~fastNLP.modules` 包含了用于搭建神经网络模型的诸多组件,可以帮助用户快速搭建自己所需的网络。详见文档 :doc:`/fastNLP.modules` | - :mod:`~fastNLP.modules` 包含了用于搭建神经网络模型的诸多组件,可以帮助用户快速搭建自己所需的网络。详见文档 :doc:`/fastNLP.modules` | ||||
- :mod:`~fastNLP.models` 包含了一些使用 fastNLP 实现的完整网络模型,包括CNNText、SeqLabeling等常见模型。详见文档 :doc:`/fastNLP.models` | |||||
- :mod:`~fastNLP.models` 包含了一些使用 fastNLP 实现的完整网络模型,包括 :class:`~fastNLP.models.CNNText` 、 :class:`~fastNLP.models.SeqLabeling` 等常见模型。详见文档 :doc:`/fastNLP.models` | |||||
fastNLP 中最常用的组件可以直接从 fastNLP 包中 import ,他们的文档如下: | fastNLP 中最常用的组件可以直接从 fastNLP 包中 import ,他们的文档如下: | ||||
""" | """ | ||||
@@ -1,12 +1,12 @@ | |||||
""" | """ | ||||
core 模块里实现了 fastNLP 的核心框架,常用的功能都可以从 fastNLP 包中直接 import。当然你也同样可以从 core 模块的子模块中 import, | core 模块里实现了 fastNLP 的核心框架,常用的功能都可以从 fastNLP 包中直接 import。当然你也同样可以从 core 模块的子模块中 import, | ||||
例如 Batch 组件有两种 import 的方式:: | |||||
例如 :class:`~fastNLP.DataSetIter` 组件有两种 import 的方式:: | |||||
# 直接从 fastNLP 中 import | # 直接从 fastNLP 中 import | ||||
from fastNLP import Batch | |||||
from fastNLP import DataSetIter | |||||
# 从 core 模块的子模块 batch 中 import | |||||
from fastNLP.core.batch import Batch | |||||
# 从 core 模块的子模块 batch 中 import DataSetIter | |||||
from fastNLP.core.batch import DataSetIter | |||||
对于常用的功能,你只需要在 :doc:`fastNLP` 中查看即可。如果想了解各个子模块的具体作用,您可以在下面找到每个子模块的具体文档。 | 对于常用的功能,你只需要在 :doc:`fastNLP` 中查看即可。如果想了解各个子模块的具体作用,您可以在下面找到每个子模块的具体文档。 | ||||
@@ -1,18 +1,17 @@ | |||||
""" | """ | ||||
batch 模块实现了 fastNLP 所需的 Batch 类。 | |||||
batch 模块实现了 fastNLP 所需的 :class:`~fastNLP.core.batch.DataSetIter` 类。 | |||||
""" | """ | ||||
__all__ = [ | __all__ = [ | ||||
"BatchIter", | |||||
"DataSetIter", | "DataSetIter", | ||||
"TorchLoaderIter", | "TorchLoaderIter", | ||||
] | ] | ||||
import atexit | import atexit | ||||
from queue import Empty, Full | |||||
import numpy as np | import numpy as np | ||||
import torch | import torch | ||||
import torch.multiprocessing as mp | |||||
import torch.utils.data | import torch.utils.data | ||||
from numbers import Number | from numbers import Number | ||||
@@ -2,11 +2,11 @@ r""" | |||||
callback模块实现了 fastNLP 中的许多 callback 类,用于增强 :class:`~fastNLP.Trainer` 类。 | callback模块实现了 fastNLP 中的许多 callback 类,用于增强 :class:`~fastNLP.Trainer` 类。 | ||||
虽然Trainer本身已经集成了一些功能,但仍然不足以囊括训练过程中可能需要到的功能, | 虽然Trainer本身已经集成了一些功能,但仍然不足以囊括训练过程中可能需要到的功能, | ||||
比如负采样,learning rate decay, Early Stop等。 | |||||
为了解决这个问题fastNLP引入了callback的机制,Callback 是一种在Trainer训练过程中特定阶段会运行的函数集合。 | |||||
关于Trainer的详细文档,请参见 :doc:`trainer 模块<fastNLP.core.trainer>` | |||||
比如负采样,learning rate decay 和 early stop等。 | |||||
为了解决这个问题,fastNLP引入了callback的机制,:class:`~fastNLP.Callback` 是一种在Trainer训练过程中特定阶段会运行的函数集合。 | |||||
关于 :class:`~fastNLP.Trainer` 的详细文档,请参见 :doc:`trainer 模块<fastNLP.core.trainer>` | |||||
我们将 :meth:`~fastNLP.Train.train` 这个函数内部分为以下的阶段,在对应阶段会触发相应的调用:: | |||||
我们将 :meth:`~fastNLP.Trainer.train` 这个函数内部分为以下的阶段,在对应阶段会触发相应的调用:: | |||||
callback.on_train_begin() # 开始进行训练 | callback.on_train_begin() # 开始进行训练 | ||||
for i in range(1, n_epochs+1): | for i in range(1, n_epochs+1): | ||||
@@ -31,8 +31,8 @@ callback模块实现了 fastNLP 中的许多 callback 类,用于增强 :class: | |||||
callback.on_train_end() # 训练结束 | callback.on_train_end() # 训练结束 | ||||
callback.on_exception() # 这是一个特殊的步骤,在训练过程中遭遇exception会跳转到这里。 | callback.on_exception() # 这是一个特殊的步骤,在训练过程中遭遇exception会跳转到这里。 | ||||
如下面的例子所示,我们可以使用内置的 callback 类,或者继承 :class:`~fastNLP.core.callback.Callback` | |||||
定义自己的 callback 类:: | |||||
如下面的例子所示,我们可以使用内置的 callback 组件,或者继承 :class:`~fastNLP.core.callback.Callback` | |||||
定义自己的 callback 组件:: | |||||
from fastNLP import Callback, EarlyStopCallback, Trainer, CrossEntropyLoss, AccuracyMetric | from fastNLP import Callback, EarlyStopCallback, Trainer, CrossEntropyLoss, AccuracyMetric | ||||
from fastNLP.models import CNNText | from fastNLP.models import CNNText | ||||
@@ -448,7 +448,7 @@ class FitlogCallback(Callback): | |||||
并将验证结果写入到fitlog中。这些数据集的结果是根据dev上最好的结果报道的,即如果dev在第3个epoch取得了最佳,则 | 并将验证结果写入到fitlog中。这些数据集的结果是根据dev上最好的结果报道的,即如果dev在第3个epoch取得了最佳,则 | ||||
fitlog中记录的关于这些数据集的结果就是来自第三个epoch的结果。 | fitlog中记录的关于这些数据集的结果就是来自第三个epoch的结果。 | ||||
:param ~fastNLP.DataSet,dict(~fastNLP.DataSet) data: 传入DataSet对象,会使用多个Trainer中的metric对数据进行验证。如果需要传入多个 | |||||
:param ~fastNLP.DataSet,Dict[~fastNLP.DataSet] data: 传入DataSet对象,会使用多个Trainer中的metric对数据进行验证。如果需要传入多个 | |||||
DataSet请通过dict的方式传入,dict的key将作为对应dataset的name传递给fitlog。若tester不为None时,data需要通过 | DataSet请通过dict的方式传入,dict的key将作为对应dataset的name传递给fitlog。若tester不为None时,data需要通过 | ||||
dict的方式传入。如果仅传入DataSet, 则被命名为test | dict的方式传入。如果仅传入DataSet, 则被命名为test | ||||
:param ~fastNLP.Tester tester: Tester对象,将在on_valid_end时调用。tester中的DataSet会被称为为`test` | :param ~fastNLP.Tester tester: Tester对象,将在on_valid_end时调用。tester中的DataSet会被称为为`test` | ||||
@@ -1,7 +1,7 @@ | |||||
""" | """ | ||||
:class:`~fastNLP.core.dataset.DataSet` 是fastNLP中用于承载数据的容器。可以将DataSet看做是一个表格, | :class:`~fastNLP.core.dataset.DataSet` 是fastNLP中用于承载数据的容器。可以将DataSet看做是一个表格, | ||||
每一行是一个sample (在fastNLP中被称为 :mod:`~.instance` ), | |||||
每一列是一个feature (在fastNLP中称为 :mod:`.field` )。 | |||||
每一行是一个sample (在fastNLP中被称为 :mod:`~fastNLP.core.instance` ), | |||||
每一列是一个feature (在fastNLP中称为 :mod:`~fastNLP.core.field` )。 | |||||
.. csv-table:: Following is a demo layout of DataSet | .. csv-table:: Following is a demo layout of DataSet | ||||
:header: "sentence", "words", "seq_len" | :header: "sentence", "words", "seq_len" | ||||
@@ -13,57 +13,64 @@ | |||||
在fastNLP内部每一行是一个 :class:`~fastNLP.Instance` 对象; 每一列是一个 :class:`~fastNLP.FieldArray` 对象。 | 在fastNLP内部每一行是一个 :class:`~fastNLP.Instance` 对象; 每一列是一个 :class:`~fastNLP.FieldArray` 对象。 | ||||
1 DataSet的创建 | |||||
创建DataSet主要有以下的3种方式 | |||||
---------------------------- | |||||
1.DataSet的创建 | |||||
---------------------------- | |||||
1.1 传入dict | |||||
创建DataSet主要有以下的3种方式 | |||||
Example:: | |||||
1.1 传入dict | |||||
---------------------------- | |||||
from fastNLP import DataSet | |||||
data = {'sentence':["This is the first instance .", "Second instance .", "Third instance ."], | |||||
'words': [['this', 'is', 'the', 'first', 'instance', '.'], ['Second', 'instance', '.'], ['Third', 'instance', '.']], | |||||
'seq_len': [6, 3, 3]} | |||||
dataset = DataSet(data) | |||||
# 传入的dict的每个key的value应该为具有相同长度的list | |||||
.. code-block:: | |||||
1.2 通过构建Instance | |||||
from fastNLP import DataSet | |||||
data = {'sentence':["This is the first instance .", "Second instance .", "Third instance ."], | |||||
'words': [['this', 'is', 'the', 'first', 'instance', '.'], ['Second', 'instance', '.'], ['Third', 'instance', '.']], | |||||
'seq_len': [6, 3, 3]} | |||||
dataset = DataSet(data) | |||||
# 传入的dict的每个key的value应该为具有相同长度的list | |||||
Example:: | |||||
1.2 通过 Instance 构建 | |||||
---------------------------- | |||||
from fastNLP import DataSet | |||||
from fastNLP import Instance | |||||
dataset = DataSet() | |||||
instance = Instance(sentence="This is the first instance", | |||||
words=['this', 'is', 'the', 'first', 'instance', '.'], | |||||
seq_len=6) | |||||
dataset.append(instance) | |||||
# 可以继续append更多内容,但是append的instance应该和第一个instance拥有完全相同的field | |||||
.. code-block:: | |||||
1.3 通过list(Instance) | |||||
from fastNLP import DataSet | |||||
from fastNLP import Instance | |||||
dataset = DataSet() | |||||
instance = Instance(sentence="This is the first instance", | |||||
words=['this', 'is', 'the', 'first', 'instance', '.'], | |||||
seq_len=6) | |||||
dataset.append(instance) | |||||
# 可以继续append更多内容,但是append的instance应该和第一个instance拥有完全相同的field | |||||
Example:: | |||||
1.3 通过 List[Instance] 构建 | |||||
-------------------------------------- | |||||
from fastNLP import DataSet | |||||
from fastNLP import Instance | |||||
instances = [] | |||||
instances.append(Instance(sentence="This is the first instance", | |||||
words=['this', 'is', 'the', 'first', 'instance', '.'], | |||||
seq_len=6)) | |||||
instances.append(Instance(sentence="Second instance .", | |||||
words=['Second', 'instance', '.'], | |||||
seq_len=3)) | |||||
dataset = DataSet(instances) | |||||
.. code-block:: | |||||
2 DataSet与预处理 | |||||
常见的预处理有如下几种 | |||||
from fastNLP import DataSet | |||||
from fastNLP import Instance | |||||
instances = [] | |||||
instances.append(Instance(sentence="This is the first instance", | |||||
words=['this', 'is', 'the', 'first', 'instance', '.'], | |||||
seq_len=6)) | |||||
instances.append(Instance(sentence="Second instance .", | |||||
words=['Second', 'instance', '.'], | |||||
seq_len=3)) | |||||
dataset = DataSet(instances) | |||||
-------------------------------------- | |||||
2.DataSet与预处理 | |||||
-------------------------------------- | |||||
2.1 从某个文本文件读取内容 | |||||
常见的预处理有如下几种 | |||||
.. todo:: | |||||
引用DataLoader | |||||
2.1 从某个文本文件读取内容 | |||||
-------------------------------------- | |||||
Example:: | |||||
.. code-block:: | |||||
from fastNLP import DataSet | from fastNLP import DataSet | ||||
from fastNLP import Instance | from fastNLP import Instance | ||||
@@ -78,9 +85,13 @@ | |||||
sent, label = line.strip().split('\t') | sent, label = line.strip().split('\t') | ||||
dataset.append(Instance(sentence=sent, label=label)) | dataset.append(Instance(sentence=sent, label=label)) | ||||
.. note:: | |||||
直接读取特定数据集的数据请参考 :doc:`/tutorials/tutorial_2_load_dataset` | |||||
2.2 对DataSet中的内容处理 | 2.2 对DataSet中的内容处理 | ||||
-------------------------------------- | |||||
Example:: | |||||
.. code-block:: | |||||
from fastNLP import DataSet | from fastNLP import DataSet | ||||
data = {'sentence':["This is the first instance .", "Second instance .", "Third instance ."]} | data = {'sentence':["This is the first instance .", "Second instance .", "Third instance ."]} | ||||
@@ -97,8 +108,9 @@ | |||||
dataset.apply(get_words, new_field_name='words') | dataset.apply(get_words, new_field_name='words') | ||||
2.3 删除DataSet的内容 | 2.3 删除DataSet的内容 | ||||
-------------------------------------- | |||||
Example:: | |||||
.. code-block:: | |||||
from fastNLP import DataSet | from fastNLP import DataSet | ||||
dataset = DataSet({'a': list(range(-5, 5))}) | dataset = DataSet({'a': list(range(-5, 5))}) | ||||
@@ -113,15 +125,17 @@ | |||||
2.4 遍历DataSet的内容 | 2.4 遍历DataSet的内容 | ||||
-------------------------------------- | |||||
Example:: | |||||
.. code-block:: | |||||
for instance in dataset: | for instance in dataset: | ||||
# do something | # do something | ||||
2.5 一些其它操作 | 2.5 一些其它操作 | ||||
-------------------------------------- | |||||
Example:: | |||||
.. code-block:: | |||||
# 检查是否存在名为'a'的field | # 检查是否存在名为'a'的field | ||||
dataset.has_field('a') # 或 ('a' in dataset) | dataset.has_field('a') # 或 ('a' in dataset) | ||||
@@ -129,21 +143,25 @@ | |||||
dataset.rename_field('a', 'b') | dataset.rename_field('a', 'b') | ||||
# DataSet的长度 | # DataSet的长度 | ||||
len(dataset) | len(dataset) | ||||
-------------------------------------- | |||||
3.DataSet与自然语言处理(NLP) | |||||
-------------------------------------- | |||||
3 DataSet与自然语言处理(NLP) | |||||
在目前深度学习的模型中,大都依赖于随机梯度下降法(SGD)进行模型的优化。随机梯度下降需要将数据切分成一个一个的Batch, | |||||
一个Batch进行一次前向计算(forward)与梯度后向传播(backward)。在自然语言处理的场景下,往往还需要对数据进行pad。这是 | |||||
由于句子的长度一般是不同的,但是一次Batch中的每个field都必须是一个tensor,所以需要将所有句子都补齐到相同的长度。 | |||||
在目前深度学习的模型中,大都依赖于随机梯度下降法(SGD)进行模型的优化。随机梯度下降需要将数据切分成一个个的 batch, | |||||
一个batch进行一次前向计算(forward)与梯度后向传播(backward)。在自然语言处理的场景下,往往还需要对数据进行pad。这是 | |||||
由于句子的长度一般是不同的,但是一次batch中的每个field都必须是一个tensor,所以需要将所有句子都补齐到相同的长度。 | |||||
3.1 DataSet与Batch | |||||
3.1 DataSet与DataSetIter | |||||
-------------------------------------- | |||||
我们先看fastNLP中如何将数据分成一个一个的Batch的例子, 这里我们使用随机生成的数据来模拟一个二分类文本分类任务, | |||||
我们先看fastNLP中如何将数据分成一个一个的batch的例子, 这里我们使用随机生成的数据来模拟一个二分类文本分类任务, | |||||
words和characters是输入,labels是文本类别 | words和characters是输入,labels是文本类别 | ||||
Example:: | |||||
.. code-block:: | |||||
from fastNLP import DataSet | from fastNLP import DataSet | ||||
from fastNLP import Batch | |||||
from fastNLP import DataSetIter | |||||
from fastNLP import SequentialSampler | from fastNLP import SequentialSampler | ||||
from fastNLP import EngChar2DPadder | from fastNLP import EngChar2DPadder | ||||
@@ -163,7 +181,7 @@ | |||||
d.set_target('label') | d.set_target('label') | ||||
d.set_input('words', 'chars') | d.set_input('words', 'chars') | ||||
for batch_x, batch_y in Batch(d, sampler=SequentialSampler(), batch_size=2): | |||||
for batch_x, batch_y in DataSetIter(d, sampler=SequentialSampler(), batch_size=2): | |||||
print("batch_x:", batch_x) | print("batch_x:", batch_x) | ||||
print("batch_y:", batch_y) | print("batch_y:", batch_y) | ||||
break | break | ||||
@@ -182,23 +200,26 @@ | |||||
# [ 0, 0, 0, 0, 0]]])} | # [ 0, 0, 0, 0, 0]]])} | ||||
# {'label': tensor([0, 0])} | # {'label': tensor([0, 0])} | ||||
其中 :class:`~fastNLP.Batch` 是用于从DataSet中按照batch_size为大小取出batch的迭代器, | |||||
:class:`~fastNLP.SequentialSampler` 用于指示 Batch 以怎样的 | |||||
其中 :class:`~fastNLP.DataSetIter` 是用于从DataSet中按照batch_size为大小取出batch的迭代器, | |||||
:class:`~fastNLP.SequentialSampler` 用于指示 :class:`~fastNLP.DataSetIter` 以怎样的 | |||||
顺序从DataSet中取出instance以组成一个batch, | 顺序从DataSet中取出instance以组成一个batch, | ||||
更详细的说明请参照 :class:`~fastNLP.Batch` 和 :class:`~fastNLP.SequentialSampler` 文档。 | |||||
更详细的说明请参照 :class:`~fastNLP.DataSetIter` 和 :class:`~fastNLP.SequentialSampler` 文档。 | |||||
通过DataSet.set_input('words', 'chars'), fastNLP将认为'words'和'chars'这两个field都是input,并将它们都放入迭代器 | |||||
生成的第一个dict中; DataSet.set_target('labels'), fastNLP将认为'labels'这个field是target,并将其放入到迭代器的第 | |||||
通过 ``DataSet.set_input('words', 'chars')`` , fastNLP将认为 `words` 和 `chars` 这两个field都是input,并将它们都放入迭代器 | |||||
生成的第一个dict中; ``DataSet.set_target('label')`` , fastNLP将认为 `label` 这个field是target,并将其放入到迭代器的第 | |||||
二个dict中。如上例中所打印结果。分为input和target的原因是由于它们在被 :class:`~fastNLP.Trainer` 所使用时会有所差异, | 二个dict中。如上例中所打印结果。分为input和target的原因是由于它们在被 :class:`~fastNLP.Trainer` 所使用时会有所差异, | ||||
详见 :class:`~fastNLP.Trainer` | 详见 :class:`~fastNLP.Trainer` | ||||
当把某个field设置为'target'或者'input'的时候(两者不是互斥的,可以同时设为input和target),fastNLP不仅仅只是将其放 | |||||
置到不同的dict中,而还会对被设置为input或target的field进行类型检查。类型检查的目的是为了看能否把该field转为 | |||||
pytorch的torch.LongTensor或torch.FloatTensor类型(也可以在Batch中设置输出numpy类型,参考 :class:`~fastNLP.Batch` ),如上例所示, | |||||
fastNLP已将words,chars和label转为了Tensor类型。如果field在每个instance都拥有相同的维度(不能超过两维),且最内层 | |||||
的元素都为相同的type(int, float, np.int*, np.float*),则fastNLP默认将对该field进行pad。也支持全为str的field作为 | |||||
target和input,这种情况下,fastNLP默认不进行pad。另外,当某个field已经被设置为了target或者input后,之后append的 | |||||
instance对应的field必须要和前面已有的内容一致,否则会报错。 | |||||
当把某个field设置为 `target` 或者 `input` 的时候(两者不是互斥的,可以同时设为两种),fastNLP不仅仅只是将其放 | |||||
置到不同的dict中,而还会对被设置为 `input` 或 `target` 的 field 进行类型检查。类型检查的目的是为了看能否把该 field 转为 | |||||
pytorch的 :class:`torch.LongTensor` 或 :class:`torch.FloatTensor` 类型 | |||||
(也可以在 :class:`~fastNLP.DataSetIter` 中设置输出numpy类型,参考 :class:`~fastNLP.DataSetIter` )。 | |||||
如上例所示,fastNLP已将 `words` ,`chars` 和 `label` 转为了 :class:`Tensor` 类型。 | |||||
如果 field 在每个 `instance` 都拥有相同的维度(不能超过两维),且最内层的元素都为相同的 type(int, float, np.int*, np.float*), | |||||
则fastNLP默认将对该 field 进行pad。也支持全为str的field作为target和input,这种情况下,fastNLP默认不进行pad。 | |||||
另外,当某个 field 已经被设置为了 target 或者 input 后,之后 `append` 的 | |||||
`instance` 对应的 field 必须要和前面已有的内容一致,否则会报错。 | |||||
可以查看field的dtype:: | 可以查看field的dtype:: | ||||
@@ -217,6 +238,7 @@ | |||||
错误:: | 错误:: | ||||
from fastNLP import DataSet | from fastNLP import DataSet | ||||
d = DataSet({'data': [1, 'a']}) | d = DataSet({'data': [1, 'a']}) | ||||
d.set_input('data') | d.set_input('data') | ||||
>> RuntimeError: Mixed data types in Field data: [<class 'str'>, <class 'int'>] | >> RuntimeError: Mixed data types in Field data: [<class 'str'>, <class 'int'>] | ||||
@@ -231,6 +253,7 @@ | |||||
当某个field被设置为忽略type之后,fastNLP将不对其进行pad。 | 当某个field被设置为忽略type之后,fastNLP将不对其进行pad。 | ||||
3.2 DataSet与pad | 3.2 DataSet与pad | ||||
-------------------------------------- | |||||
在fastNLP里,pad是与一个field绑定的。即不同的field可以使用不同的pad方式,比如在英文任务中word需要的pad和 | 在fastNLP里,pad是与一个field绑定的。即不同的field可以使用不同的pad方式,比如在英文任务中word需要的pad和 | ||||
character的pad方式往往是不同的。fastNLP是通过一个叫做 :class:`~fastNLP.Padder` 的子类来完成的。 | character的pad方式往往是不同的。fastNLP是通过一个叫做 :class:`~fastNLP.Padder` 的子类来完成的。 | ||||
@@ -240,7 +263,7 @@ | |||||
如果 :class:`~fastNLP.AutoPadder` 或 :class:`~fastNLP.EngChar2DPadder` 无法满足需求, | 如果 :class:`~fastNLP.AutoPadder` 或 :class:`~fastNLP.EngChar2DPadder` 无法满足需求, | ||||
也可以自己写一个 :class:`~fastNLP.Padder` 。 | 也可以自己写一个 :class:`~fastNLP.Padder` 。 | ||||
Example:: | |||||
.. code-block:: | |||||
from fastNLP import DataSet | from fastNLP import DataSet | ||||
from fastNLP import EngChar2DPadder | from fastNLP import EngChar2DPadder | ||||
@@ -405,7 +428,7 @@ class DataSet(object): | |||||
""" | """ | ||||
将一个instance对象append到DataSet后面。 | 将一个instance对象append到DataSet后面。 | ||||
:param instance: :class:`~fastNLP.Instance` 类型。若DataSet不为空,则instance应该拥有和DataSet完全一样的field。 | |||||
:param ~fastNLP.Instance instance: 若DataSet不为空,则instance应该拥有和DataSet完全一样的field。 | |||||
""" | """ | ||||
if len(self.field_arrays) == 0: | if len(self.field_arrays) == 0: | ||||
@@ -431,7 +454,7 @@ class DataSet(object): | |||||
将fieldarray添加到DataSet中. | 将fieldarray添加到DataSet中. | ||||
:param str field_name: 新加入的field的名称 | :param str field_name: 新加入的field的名称 | ||||
:param fieldarray: :class:`~fastNLP.FieldArray` 类型。需要加入DataSet的field的内容 | |||||
:param ~fastNLP.core.FieldArray fieldarray: 需要加入DataSet的field的内容 | |||||
:return: | :return: | ||||
""" | """ | ||||
if not isinstance(fieldarray, FieldArray): | if not isinstance(fieldarray, FieldArray): | ||||
@@ -447,8 +470,7 @@ class DataSet(object): | |||||
:param str field_name: 新增的field的名称 | :param str field_name: 新增的field的名称 | ||||
:param list fields: 需要新增的field的内容 | :param list fields: 需要新增的field的内容 | ||||
:param None, padder: :class:`~fastNLP.Padder` 类型, | |||||
如果为None,则不进行pad,默认使用 :class:`~fastNLP.AutoPadder` 自动判断是否需要做pad。 | |||||
:param None,~fastNLP.Padder padder: 如果为None,则不进行pad,默认使用 :class:`~fastNLP.AutoPadder` 自动判断是否需要做pad。 | |||||
:param bool is_input: 新加入的field是否是input | :param bool is_input: 新加入的field是否是input | ||||
:param bool is_target: 新加入的field是否是target | :param bool is_target: 新加入的field是否是target | ||||
:param bool ignore_type: 是否忽略对新加入的field的类型检查 | :param bool ignore_type: 是否忽略对新加入的field的类型检查 | ||||
@@ -510,7 +532,7 @@ class DataSet(object): | |||||
""" | """ | ||||
返回一个dict,key为field_name, value为对应的 :class:`~fastNLP.FieldArray` | 返回一个dict,key为field_name, value为对应的 :class:`~fastNLP.FieldArray` | ||||
:return: dict: 返回如上所述的字典 | |||||
:return dict: 返回如上所述的字典 | |||||
""" | """ | ||||
return self.field_arrays | return self.field_arrays | ||||
@@ -518,7 +540,7 @@ class DataSet(object): | |||||
""" | """ | ||||
返回一个list,包含所有 field 的名字 | 返回一个list,包含所有 field 的名字 | ||||
:return: list: 返回如上所述的列表 | |||||
:return list: 返回如上所述的列表 | |||||
""" | """ | ||||
return sorted(self.field_arrays.keys()) | return sorted(self.field_arrays.keys()) | ||||
@@ -612,7 +634,7 @@ class DataSet(object): | |||||
dataset.set_padder('chars', padder) # 则chars这个field会使用EngChar2DPadder进行pad操作 | dataset.set_padder('chars', padder) # 则chars这个field会使用EngChar2DPadder进行pad操作 | ||||
:param str field_name: 设置field的padding方式为padder | :param str field_name: 设置field的padding方式为padder | ||||
:param None, Padder padder: 设置为None即删除padder, 即对该field不进行pad操作。 | |||||
:param None,~fastNLP.Padder padder: 设置为None即删除padder, 即对该field不进行pad操作。 | |||||
""" | """ | ||||
if field_name not in self.field_arrays: | if field_name not in self.field_arrays: | ||||
raise KeyError("There is no field named {}.".format(field_name)) | raise KeyError("There is no field named {}.".format(field_name)) | ||||
@@ -660,7 +682,7 @@ class DataSet(object): | |||||
2. is_target: bool, 如果为True则将名为 `new_field_name` 的field设置为target | 2. is_target: bool, 如果为True则将名为 `new_field_name` 的field设置为target | ||||
3. ignore_type: bool, 如果为True则将名为 `new_field_name` 的field的ignore_type设置为true, 忽略其类型 | 3. ignore_type: bool, 如果为True则将名为 `new_field_name` 的field的ignore_type设置为true, 忽略其类型 | ||||
:return: list(Any), 里面的元素为func的返回值,所以list长度为DataSet的长度 | |||||
:return List[Any]: 里面的元素为func的返回值,所以list长度为DataSet的长度 | |||||
""" | """ | ||||
assert len(self) != 0, "Null DataSet cannot use apply_field()." | assert len(self) != 0, "Null DataSet cannot use apply_field()." | ||||
@@ -687,7 +709,7 @@ class DataSet(object): | |||||
""" | """ | ||||
将results作为加入到新的field中,field名称为new_field_name | 将results作为加入到新的field中,field名称为new_field_name | ||||
:param list(str) results: 一般是apply*()之后的结果 | |||||
:param List[str] results: 一般是apply*()之后的结果 | |||||
:param str new_field_name: 新加入的field的名称 | :param str new_field_name: 新加入的field的名称 | ||||
:param dict kwargs: 用户apply*()时传入的自定义参数 | :param dict kwargs: 用户apply*()时传入的自定义参数 | ||||
:return: | :return: | ||||
@@ -730,7 +752,7 @@ class DataSet(object): | |||||
3. ignore_type: bool, 如果为True则将 `new_field_name` 的field的ignore_type设置为true, 忽略其类型 | 3. ignore_type: bool, 如果为True则将 `new_field_name` 的field的ignore_type设置为true, 忽略其类型 | ||||
:return: list(Any), 里面的元素为func的返回值,所以list长度为DataSet的长度 | |||||
:return List[Any]: 里面的元素为func的返回值,所以list长度为DataSet的长度 | |||||
""" | """ | ||||
assert len(self) != 0, "Null DataSet cannot use apply()." | assert len(self) != 0, "Null DataSet cannot use apply()." | ||||
idx = -1 | idx = -1 | ||||
@@ -795,7 +817,7 @@ class DataSet(object): | |||||
:param float ratio: 0<ratio<1, 返回的第一个DataSet拥有 `(1-ratio)` 这么多数据,第二个DataSet拥有`ratio`这么多数据 | :param float ratio: 0<ratio<1, 返回的第一个DataSet拥有 `(1-ratio)` 这么多数据,第二个DataSet拥有`ratio`这么多数据 | ||||
:param bool shuffle: 在split前是否shuffle一下 | :param bool shuffle: 在split前是否shuffle一下 | ||||
:return: [DataSet, DataSet] | |||||
:return: [ :class:`~fastNLP.DataSet` , :class:`~fastNLP.DataSet` ] | |||||
""" | """ | ||||
assert isinstance(ratio, float) | assert isinstance(ratio, float) | ||||
assert 0 < ratio < 1 | assert 0 < ratio < 1 | ||||
@@ -819,7 +841,7 @@ class DataSet(object): | |||||
@classmethod | @classmethod | ||||
def read_csv(cls, csv_path, headers=None, sep=",", dropna=True): | def read_csv(cls, csv_path, headers=None, sep=",", dropna=True): | ||||
""" | |||||
r""" | |||||
.. warning:: | .. warning:: | ||||
此方法会在下个版本移除,请使用 :class:`fastNLP.io.CSVLoader` | 此方法会在下个版本移除,请使用 :class:`fastNLP.io.CSVLoader` | ||||
@@ -830,7 +852,7 @@ class DataSet(object): | |||||
与csv文件中每行的元素个数相同。 | 与csv文件中每行的元素个数相同。 | ||||
:param str sep: 分割符 | :param str sep: 分割符 | ||||
:param bool dropna: 是否忽略与header数量不一致行。 | :param bool dropna: 是否忽略与header数量不一致行。 | ||||
:return: 一个 :class:`~fastNLP.DataSet` 类型的对象 | |||||
:return: 读取后的 :class:`~fastNLP.DataSet` 。 | |||||
""" | """ | ||||
warnings.warn('DataSet.read_csv is deprecated, use CSVLoader instead', | warnings.warn('DataSet.read_csv is deprecated, use CSVLoader instead', | ||||
category=DeprecationWarning) | category=DeprecationWarning) | ||||
@@ -870,11 +892,11 @@ class DataSet(object): | |||||
@staticmethod | @staticmethod | ||||
def load(path): | def load(path): | ||||
""" | |||||
r""" | |||||
从保存的DataSet pickle文件的路径中读取DataSet | 从保存的DataSet pickle文件的路径中读取DataSet | ||||
:param str path: 从哪里读取DataSet | :param str path: 从哪里读取DataSet | ||||
:return: 一个 :class:`~fastNLP.DataSet` 类型的对象 | |||||
:return: 读取后的 :class:`~fastNLP.DataSet` 。 | |||||
""" | """ | ||||
with open(path, 'rb') as f: | with open(path, 'rb') as f: | ||||
d = pickle.load(f) | d = pickle.load(f) | ||||
@@ -448,9 +448,10 @@ class Padder: | |||||
用于对batch进行padding操作。传入的element是inplace的,即直接修改element可能导致数据变化,建议inplace修改之前deepcopy一份。 | 用于对batch进行padding操作。传入的element是inplace的,即直接修改element可能导致数据变化,建议inplace修改之前deepcopy一份。 | ||||
.. py:function:: __call__(self, contents, field_name, field_ele_dtype): | .. py:function:: __call__(self, contents, field_name, field_ele_dtype): | ||||
传入的是List内容。假设有以下的DataSet。 | 传入的是List内容。假设有以下的DataSet。 | ||||
:param list(Any) contents: 传入的element是inplace的,即直接修改element可能导致数据变化,建议inplace修改之前 | |||||
:param List[Any] contents: 传入的element是inplace的,即直接修改element可能导致数据变化,建议inplace修改之前 | |||||
deepcopy一份。 | deepcopy一份。 | ||||
:param str field_name: field的名称。 | :param str field_name: field的名称。 | ||||
:param np.int64,np.float64,np.str,None field_ele_dtype: 该field的内层元素的类型。如果该field的ignore_type为True,则这个值为None。 | :param np.int64,np.float64,np.str,None field_ele_dtype: 该field的内层元素的类型。如果该field的ignore_type为True,则这个值为None。 | ||||
@@ -469,7 +470,7 @@ class Padder: | |||||
""" | """ | ||||
传入的是List内容。假设有以下的DataSet。 | 传入的是List内容。假设有以下的DataSet。 | ||||
:param list(Any) contents: 传入的element是inplace的,即直接修改element可能导致数据变化,建议inplace修改之前 | |||||
:param List[Any] contents: 传入的element是inplace的,即直接修改element可能导致数据变化,建议inplace修改之前 | |||||
deepcopy一份。 | deepcopy一份。 | ||||
:param str field_name: field的名称。 | :param str field_name: field的名称。 | ||||
:param np.int64,np.float64,np.str,None field_ele_dtype: 该field的内层元素的类型。如果该field的ignore_type为True, | :param np.int64,np.float64,np.str,None field_ele_dtype: 该field的内层元素的类型。如果该field的ignore_type为True, | ||||
@@ -208,7 +208,7 @@ class CrossEntropyLoss(LossBase): | |||||
:param seq_len: 句子的长度, 长度之外的token不会计算loss。 | :param seq_len: 句子的长度, 长度之外的token不会计算loss。 | ||||
:param padding_idx: padding的index,在计算loss时将忽略target中标号为padding_idx的内容, 可以通过该值代替 | :param padding_idx: padding的index,在计算loss时将忽略target中标号为padding_idx的内容, 可以通过该值代替 | ||||
传入seq_len. | 传入seq_len. | ||||
:param str reduction: 支持'mean','sum'和'none'. | |||||
:param str reduction: 支持 `mean` ,`sum` 和 `none` . | |||||
Example:: | Example:: | ||||
@@ -265,9 +265,9 @@ class BCELoss(LossBase): | |||||
二分类交叉熵损失函数 | 二分类交叉熵损失函数 | ||||
:param pred: 参数映射表中`pred`的映射关系,None表示映射关系为`pred`->`pred` | |||||
:param target: 参数映射表中`target`的映射关系,None表示映射关系为`target`->`target` | |||||
:param str reduction: 支持'mean','sum'和'none'. | |||||
:param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred` | |||||
:param target: 参数映射表中 `target` 的映射关系,None表示映射关系为 `target` -> `target` | |||||
:param str reduction: 支持 `mean` ,`sum` 和 `none` . | |||||
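下面是一个简单的构造示意(这里的 'label' 仅为假设的 target field 名称)::

    loss = BCELoss(pred='pred', target='label')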
""" | """ | ||||
def __init__(self, pred=None, target=None, reduction='mean'): | def __init__(self, pred=None, target=None, reduction='mean'): | ||||
@@ -286,11 +286,11 @@ class NLLLoss(LossBase): | |||||
负对数似然损失函数 | 负对数似然损失函数 | ||||
:param pred: 参数映射表中`pred`的映射关系,None表示映射关系为`pred`->`pred` | |||||
:param target: 参数映射表中`target`的映射关系,None表示映射关系为`target`->`target` | |||||
:param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred` | |||||
:param target: 参数映射表中 `target` 的映射关系,None表示映射关系为 `target` -> `target` | |||||
:param ignore_idx: ignore的index,在计算loss时将忽略target中标号为ignore_idx的内容, 可以通过该值代替 | :param ignore_idx: ignore的index,在计算loss时将忽略target中标号为ignore_idx的内容, 可以通过该值代替 | ||||
传入seq_len. | 传入seq_len. | ||||
:param str reduction: 支持'mean','sum'和'none'. | |||||
:param str reduction: 支持 `mean` ,`sum` 和 `none` . | |||||
""" | """ | ||||
def __init__(self, pred=None, target=None, ignore_idx=-100, reduction='mean'): | def __init__(self, pred=None, target=None, ignore_idx=-100, reduction='mean'): | ||||
@@ -27,14 +27,14 @@ from abc import abstractmethod | |||||
class MetricBase(object): | class MetricBase(object): | ||||
""" | """ | ||||
所有metrics的基类,,所有的传入到Trainer, Tester的Metric需要继承自该对象,需要覆盖写入evaluate(), get_metric()方法。 | |||||
所有metrics的基类,所有的传入到Trainer, Tester的Metric需要继承自该对象,需要覆盖写入evaluate(), get_metric()方法。 | |||||
evaluate(xxx)中传入的是一个batch的数据。 | evaluate(xxx)中传入的是一个batch的数据。 | ||||
get_metric(xxx)当所有数据处理完毕,调用该方法得到最终的metric值 | get_metric(xxx)当所有数据处理完毕,调用该方法得到最终的metric值 | ||||
以分类问题中,Accuracy计算为例 | 以分类问题中,Accuracy计算为例 | ||||
假设model的forward返回dict中包含'pred'这个key, 并且该key需要用于Accuracy:: | |||||
假设model的forward返回dict中包含 `pred` 这个key, 并且该key需要用于Accuracy:: | |||||
class Model(nn.Module): | class Model(nn.Module): | ||||
def __init__(xxx): | def __init__(xxx): | ||||
@@ -43,7 +43,7 @@ class MetricBase(object): | |||||
# do something | # do something | ||||
return {'pred': pred, 'other_keys':xxx} # pred's shape: batch_size x num_classes | return {'pred': pred, 'other_keys':xxx} # pred's shape: batch_size x num_classes | ||||
假设dataset中'label'这个field是需要预测的值,并且该field被设置为了target | |||||
假设dataset中 `label` 这个field是需要预测的值,并且该field被设置为了target | |||||
对应的AccMetric可以按如下的定义, version1, 只使用这一次:: | 对应的AccMetric可以按如下的定义, version1, 只使用这一次:: | ||||
class AccMetric(MetricBase): | class AccMetric(MetricBase): | ||||
@@ -478,7 +478,7 @@ class SpanFPreRecMetric(MetricBase): | |||||
别名::class:`fastNLP.SpanFPreRecMetric` :class:`fastNLP.core.metrics.SpanFPreRecMetric` | 别名::class:`fastNLP.SpanFPreRecMetric` :class:`fastNLP.core.metrics.SpanFPreRecMetric` | ||||
在序列标注问题中,以span的方式计算F, pre, rec. | 在序列标注问题中,以span的方式计算F, pre, rec. | ||||
比如中文Part of speech中,会以character的方式进行标注,句子'中国在亚洲'对应的POS可能为(以BMES为例) | |||||
比如中文Part of speech中,会以character的方式进行标注,句子 `中国在亚洲` 对应的POS可能为(以BMES为例) | |||||
['B-NN', 'E-NN', 'S-DET', 'B-NN', 'E-NN']。该metric就是为类似情况下的F1计算。 | ['B-NN', 'E-NN', 'S-DET', 'B-NN', 'E-NN']。该metric就是为类似情况下的F1计算。 | ||||
最后得到的metric结果为:: | 最后得到的metric结果为:: | ||||
@@ -502,15 +502,15 @@ class SpanFPreRecMetric(MetricBase): | |||||
:param tag_vocab: 标签的 :class:`~fastNLP.Vocabulary` 。支持的标签为"B"(没有label);或"B-xxx"(xxx为某种label,比如POS中的NN), | :param tag_vocab: 标签的 :class:`~fastNLP.Vocabulary` 。支持的标签为"B"(没有label);或"B-xxx"(xxx为某种label,比如POS中的NN), | ||||
在解码时,会将相同xxx的认为是同一个label,比如['B-NN', 'E-NN']会被合并为一个'NN'. | 在解码时,会将相同xxx的认为是同一个label,比如['B-NN', 'E-NN']会被合并为一个'NN'. | ||||
:param str pred: 用该key在evaluate()时从传入dict中取出prediction数据。 为None,则使用'pred'取数据 | |||||
:param str target: 用该key在evaluate()时从传入dict中取出target数据。 为None,则使用'target'取数据 | |||||
:param str seq_len: 用该key在evaluate()时从传入dict中取出sequence length数据。为None,则使用'seq_len'取数据。 | |||||
:param str pred: 用该key在evaluate()时从传入dict中取出prediction数据。 为None,则使用 `pred` 取数据 | |||||
:param str target: 用该key在evaluate()时从传入dict中取出target数据。 为None,则使用 `target` 取数据 | |||||
:param str seq_len: 用该key在evaluate()时从传入dict中取出sequence length数据。为None,则使用 `seq_len` 取数据。 | |||||
:param str encoding_type: 目前支持bio, bmes, bmeso, bioes | :param str encoding_type: 目前支持bio, bmes, bmeso, bioes | ||||
:param list ignore_labels: str 组成的list. 这个list中的class不会被用于计算。例如在POS tagging时传入['NN'],则不会计算'NN'这 | :param list ignore_labels: str 组成的list. 这个list中的class不会被用于计算。例如在POS tagging时传入['NN'],则不会计算'NN'这 | ||||
个label | 个label | ||||
:param bool only_gross: 是否只计算总的f1, precision, recall的值;如果为False,不仅返回总的f1, pre, rec, 还会返回每个 | :param bool only_gross: 是否只计算总的f1, precision, recall的值;如果为False,不仅返回总的f1, pre, rec, 还会返回每个 | ||||
label的f1, pre, rec | label的f1, pre, rec | ||||
:param str f_type: 'micro'或'macro'. 'micro':通过先计算总体的TP,FN和FP的数量,再计算f, precision, recall; 'macro': | |||||
:param str f_type: `micro` 或 `macro` . `micro` :通过先计算总体的TP,FN和FP的数量,再计算f, precision, recall; `macro` : | |||||
分布计算每个类别的f, precision, recall,然后做平均(各类别f的权重相同) | 分布计算每个类别的f, precision, recall,然后做平均(各类别f的权重相同) | ||||
:param float beta: f_beta分数, :math:`f_{beta} = \frac{(1 + {beta}^{2})*(pre*rec)}{({beta}^{2}*pre + rec)}` . | :param float beta: f_beta分数, :math:`f_{beta} = \frac{(1 + {beta}^{2})*(pre*rec)}{({beta}^{2}*pre + rec)}` . | ||||
常用为beta=0.5, 1, 2. 若为0.5则精确率的权重高于召回率;若为1,则两者平等;若为2,则召回率权重高于精确率。 | 常用为beta=0.5, 1, 2. 若为0.5则精确率的权重高于召回率;若为1,则两者平等;若为2,则召回率权重高于精确率。 | ||||
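下面是一个简单的构造示意(其中 tag_vocab 假设为已经构建好的标签 :class:`~fastNLP.Vocabulary` )::

    metric = SpanFPreRecMetric(tag_vocab=tag_vocab, encoding_type='bmes')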
@@ -5,7 +5,8 @@ optimizer 模块定义了 fastNLP 中所需的各种优化器,一般做为 :cl | |||||
__all__ = [ | __all__ = [ | ||||
"Optimizer", | "Optimizer", | ||||
"SGD", | "SGD", | ||||
"Adam" | |||||
"Adam", | |||||
"AdamW" | |||||
] | ] | ||||
import torch | import torch | ||||
@@ -103,21 +104,28 @@ class Adam(Optimizer): | |||||
class AdamW(TorchOptimizer): | class AdamW(TorchOptimizer): | ||||
r"""对AdamW的实现,该实现应该会在pytorch更高版本中出现,https://github.com/pytorch/pytorch/pull/21250。这里提前加入 | |||||
r""" | |||||
别名::class:`fastNLP.AdamW` :class:`fastNLP.core.optimizer.AdamW` | |||||
对AdamW的实现,该实现应该会在pytorch更高版本中出现,https://github.com/pytorch/pytorch/pull/21250。这里提前加入 | |||||
.. todo:: | |||||
翻译成中文 | |||||
The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_. | The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_. | ||||
The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_. | The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_. | ||||
Arguments: | |||||
params (iterable): iterable of parameters to optimize or dicts defining | |||||
parameter groups | |||||
lr (float, optional): learning rate (default: 1e-3) | |||||
betas (Tuple[float, float], optional): coefficients used for computing | |||||
running averages of gradient and its square (default: (0.9, 0.99)) | |||||
eps (float, optional): term added to the denominator to improve | |||||
numerical stability (default: 1e-8) | |||||
weight_decay (float, optional): weight decay coefficient (default: 1e-2) | |||||
amsgrad (boolean, optional): whether to use the AMSGrad variant of this | |||||
algorithm from the paper `On the Convergence of Adam and Beyond`_ | |||||
(default: False) | |||||
:param params (iterable): iterable of parameters to optimize or dicts defining | |||||
parameter groups | |||||
:param lr (float, optional): learning rate (default: 1e-3) | |||||
:param betas (Tuple[float, float], optional): coefficients used for computing | |||||
running averages of gradient and its square (default: (0.9, 0.99)) | |||||
:param eps (float, optional): term added to the denominator to improve | |||||
numerical stability (default: 1e-8) | |||||
:param weight_decay (float, optional): weight decay coefficient (default: 1e-2) | |||||
:param amsgrad (boolean, optional): whether to use the AMSGrad variant of this | |||||
algorithm from the paper `On the Convergence of Adam and Beyond`_ | |||||
(default: False) | |||||
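一个简单的使用示意(仅为示例,model 假设为任意 torch.nn.Module)::

    from fastNLP import AdamW

    optimizer = AdamW(model.parameters(), lr=1e-3, weight_decay=1e-2)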
.. _Adam\: A Method for Stochastic Optimization: | .. _Adam\: A Method for Stochastic Optimization: | ||||
https://arxiv.org/abs/1412.6980 | https://arxiv.org/abs/1412.6980 | ||||
.. _Decoupled Weight Decay Regularization: | .. _Decoupled Weight Decay Regularization: | ||||
@@ -147,9 +155,9 @@ class AdamW(TorchOptimizer): | |||||
def step(self, closure=None): | def step(self, closure=None): | ||||
"""Performs a single optimization step. | """Performs a single optimization step. | ||||
Arguments: | |||||
closure (callable, optional): A closure that reevaluates the model | |||||
and returns the loss. | |||||
:param closure: (callable, optional) A closure that reevaluates the model | |||||
and returns the loss. | |||||
""" | """ | ||||
loss = None | loss = None | ||||
if closure is not None: | if closure is not None: | ||||
@@ -1,7 +1,7 @@ | |||||
""" | """ | ||||
tester模块实现了 fastNLP 所需的Tester类,能在提供数据、模型以及metric的情况下进行性能测试。 | tester模块实现了 fastNLP 所需的Tester类,能在提供数据、模型以及metric的情况下进行性能测试。 | ||||
Example:: | |||||
.. code-block:: | |||||
import numpy as np | import numpy as np | ||||
import torch | import torch | ||||
@@ -60,15 +60,14 @@ class Tester(object): | |||||
Tester是在提供数据,模型以及metric的情况下进行性能测试的类。需要传入模型,数据以及metric进行验证。 | Tester是在提供数据,模型以及metric的情况下进行性能测试的类。需要传入模型,数据以及metric进行验证。 | ||||
:param data: 需要测试的数据集, :class:`~fastNLP.DataSet` 类型 | |||||
:param ~fastNLP.DataSet data: 需要测试的数据集 | |||||
:param torch.nn.module model: 使用的模型 | :param torch.nn.module model: 使用的模型 | ||||
:param metrics: :class:`~fastNLP.core.metrics.MetricBase` 或者一个列表的 :class:`~fastNLP.core.metrics.MetricBase` | |||||
:param ~fastNLP.core.metrics.MetricBase,List[~fastNLP.core.metrics.MetricBase] metrics: 测试时使用的metrics | |||||
:param int batch_size: evaluation时使用的batch_size有多大。 | :param int batch_size: evaluation时使用的batch_size有多大。 | ||||
:param str,int,torch.device,list(int) device: 将模型load到哪个设备。默认为None,即Trainer不对模型 | :param str,int,torch.device,list(int) device: 将模型load到哪个设备。默认为None,即Trainer不对模型 | ||||
的计算位置进行管理。支持以下的输入: | 的计算位置进行管理。支持以下的输入: | ||||
1. str: ['cpu', 'cuda', 'cuda:0', 'cuda:1', ...] 依次为'cpu'中, 可见的第一个GPU中, 可见的第一个GPU中, | |||||
可见的第二个GPU中; | |||||
1. str: ['cpu', 'cuda', 'cuda:0', 'cuda:1', ...] 依次为'cpu'中, 可见的第一个GPU中,可见的第一个GPU中,可见的第二个GPU中; | |||||
2. torch.device:将模型装载到torch.device上。 | 2. torch.device:将模型装载到torch.device上。 | ||||
@@ -11,288 +11,310 @@ Trainer在fastNLP中用于组织单任务的训练过程,可以避免用户在 | |||||
(5) 保存获得更好验证性能的模型。 | (5) 保存获得更好验证性能的模型。 | ||||
1 Trainer的基本使用 | |||||
下面的例子是使用神经网络来进行预测一个序列中是否有偶数个1。 | |||||
Example:: | |||||
import numpy as np | |||||
from torch import nn | |||||
import torch | |||||
import torch.nn.functional as F | |||||
from torch.optim import SGD | |||||
from fastNLP import DataSet | |||||
from fastNLP import Trainer | |||||
from fastNLP import CrossEntropyLoss | |||||
from fastNLP import AccuracyMetric | |||||
from fastNLP.modules.decoder import MLP | |||||
# 模型 | |||||
class Model(nn.Module): | |||||
def __init__(self, input_num): | |||||
super().__init__() | |||||
self.fcs = MLP([input_num, 40, 40, 2], 'relu') | |||||
def forward(self, x): | |||||
x = self.fcs(x) | |||||
return {'pred': x} | |||||
model = Model(10) | |||||
# 生成数据 | |||||
def generate_psedo_dataset(num_samples): | |||||
dataset = DataSet() | |||||
data = np.random.randint(2, size=(num_samples, 10)) | |||||
label = np.sum(data, axis=1)%2 | |||||
dataset = DataSet({'x':data.astype(float), 'label': label}) | |||||
dataset.set_input('x') | |||||
dataset.set_target('label') | |||||
return dataset | |||||
tr_dataset = generate_psedo_dataset(1000) | |||||
dev_data = generate_psedo_dataset(100) | |||||
# 训练 | |||||
trainer = Trainer(tr_dataset, model, loss=CrossEntropyLoss(target='label'), | |||||
optimizer=SGD(model.parameters(), lr=0.1),n_epochs=1000, | |||||
dev_data = dev_data, metrics=AccuracyMetric(target='label')) | |||||
trainer.train() | |||||
由上面的例子可以看出通过使用Trainer,可以使得训练部分的代码大幅减少。 | |||||
使用Trainer需要满足以下几个条件: | |||||
---------------------------- | |||||
1. Trainer的基本使用 | |||||
---------------------------- | |||||
下面的例子是使用神经网络来进行预测一个序列中是否有偶数个1。 | |||||
.. code-block:: python | |||||
import numpy as np | |||||
from torch import nn | |||||
import torch | |||||
import torch.nn.functional as F | |||||
from torch.optim import SGD | |||||
from fastNLP import DataSet | |||||
from fastNLP import Trainer | |||||
from fastNLP import CrossEntropyLoss | |||||
from fastNLP import AccuracyMetric | |||||
from fastNLP.modules.decoder import MLP | |||||
# 模型 | |||||
class Model(nn.Module): | |||||
def __init__(self, input_num): | |||||
super().__init__() | |||||
self.fcs = MLP([input_num, 40, 40, 2], 'relu') | |||||
def forward(self, x): | |||||
x = self.fcs(x) | |||||
return {'pred': x} | |||||
model = Model(10) | |||||
# 生成数据 | |||||
def generate_psedo_dataset(num_samples): | |||||
dataset = DataSet() | |||||
data = np.random.randint(2, size=(num_samples, 10)) | |||||
label = np.sum(data, axis=1)%2 | |||||
dataset = DataSet({'x':data.astype(float), 'label': label}) | |||||
dataset.set_input('x') | |||||
dataset.set_target('label') | |||||
return dataset | |||||
tr_dataset = generate_psedo_dataset(1000) | |||||
dev_data = generate_psedo_dataset(100) | |||||
# 训练 | |||||
trainer = Trainer(tr_dataset, model, loss=CrossEntropyLoss(target='label'), | |||||
optimizer=SGD(model.parameters(), lr=0.1),n_epochs=1000, | |||||
dev_data = dev_data, metrics=AccuracyMetric(target='label')) | |||||
trainer.train() | |||||
由上面的例子可以看出通过使用Trainer,可以使得训练部分的代码大幅减少。 | |||||
使用Trainer需要满足以下几个条件: | |||||
1.1 模型 | 1.1 模型 | ||||
1 模型的forward()的参数名需要与DataSet中的名字对应。实际上fastNLP在将DataSet中的数据传递给模型forward()时,是 | |||||
通过匹配名称实现的。所以上例中,如果Model的forward函数修改为forward(self, data), 则DataSet中的'x'这个field就应该 | |||||
改名为'data'。 | |||||
---------------------------- | |||||
1 模型的forward()的参数名需要与DataSet中的名字对应。实际上fastNLP在将DataSet中的数据传递给模型forward()时,是 | |||||
通过匹配名称实现的。所以上例中,如果Model的forward函数修改为forward(self, data), 则DataSet中的'x'这个field就应该 | |||||
改名为'data'。 | |||||
2 传递给forward()的参数是DataSet中被设置为input的那些field。但如果forward()中没有对应的参数,则不会将数据传递 | |||||
给forward()。例如,DataSet中'x1', 'x2'都是input,但是模型的函数为forward(self, x1), 那么'x2'不会传递给forward()。 | |||||
2 传递给forward()的参数是DataSet中被设置为input的那些field。但如果forward()中没有对应的参数,则不会将数据传递 | |||||
给forward()。例如,DataSet中'x1', 'x2'都是input,但是模型的函数为forward(self, x1), 那么'x2'不会传递给forward()。 | |||||
3 模型的forward()返回值需要为一个dict。 | |||||
3 模型的forward()返回值需要为一个dict。 | |||||
1.2 Loss | 1.2 Loss | ||||
fastNLP中的为了不限制forward函数的返回内容数量(比如一些复杂任务需要返回多个内容,如Dependency Parsing, | |||||
:mod:`Loss<fastNLP.core.losses>` 与 :mod:`Metric<fastNLP.core.metrics>` 都使用了通过名称来匹配相应内容的策略。如上面的例子中 | |||||
---------------------------- | |||||
Example:: | |||||
为了不限制forward函数的返回内容数量(比如一些复杂任务需要返回多个内容,如Dependency Parsing),fastNLP中的 | |||||
:mod:`Loss<fastNLP.core.losses>` 与 :mod:`Metric<fastNLP.core.metrics>` 都使用了通过名称来匹配相应内容的策略。如上面的例子中 | |||||
trainer = Trainer(tr_dataset, model, loss=CrossEntropyLoss(target='label'), | |||||
optimizer=SGD(model.parameters(), lr=0.1),n_epochs=1000, | |||||
dev_data = dev_data, metrics=AccuracyMetric(target='label')) | |||||
.. code-block:: python | |||||
loss被设置为了 :class:`~fastNLP.CrossEntropyLoss` , 但在初始化的时候传入了target='label'这个参数, | |||||
:class:`~fastNLP.CrossEntropyLoss` 的初始化参数为(pred=None, target=None, padding_idx=-100)。 | |||||
这里的两个参数分别为计算CrossEntropy时需要使用到的模型的预测值与真实值。 | |||||
其中 `pred` 一般来自于模型forward()的返回结果,`target` 一般是来自于DataSet中被设置为target的field。 | |||||
由于每个人对真实值或者model的返回值取名并不一样,所以fastNLP的 :mod:`Loss<fastNLP.core.losses>` 提供一种类似于映射的机制来匹配对应的值, | |||||
比如这里 :class:`~fastNLP.CrossEntropyLoss` 将尝试找到名为'label'的内容来作为真实值得到loss; | |||||
而pred=None, 则 :class:`~fastNLP.CrossEntropyLoss` 使用'pred'作为名称匹配预测值, | |||||
正好forward的返回值也叫pred,所以这里不需要申明pred。 | |||||
尽管fastNLP使用了映射机制来使得loss的计算变得比较灵活,但有些情况下loss必须在模型中进行计算,比如使用了CRF的模型。 | |||||
fastNLP中提供了 :class:`~fastNLP.LossInForward` 这个loss。 | |||||
这个loss的原理是直接在forward()的返回结果中找到loss_key(默认寻找'loss')指定的那个tensor,并使用它作为loss。 | |||||
如果Trainer初始化没有提供loss则默认使用 :class:`~fastNLP.LossInForward` 。 | |||||
.. todo:: | |||||
补充一个例子 详细例子可以参照 | |||||
trainer = Trainer(tr_dataset, model, loss=CrossEntropyLoss(target='label'), | |||||
optimizer=SGD(model.parameters(), lr=0.1),n_epochs=1000, | |||||
dev_data = dev_data, metrics=AccuracyMetric(target='label')) | |||||
loss被设置为了 :class:`~fastNLP.CrossEntropyLoss` , 但在初始化的时候传入了target='label'这个参数, | |||||
:class:`~fastNLP.CrossEntropyLoss` 的初始化参数为(pred=None, target=None, padding_idx=-100)。 | |||||
这里的两个参数分别为计算CrossEntropy时需要使用到的模型的预测值与真实值。 | |||||
其中 `pred` 一般来自于模型forward()的返回结果,`target` 一般是来自于DataSet中被设置为target的field。 | |||||
由于每个人对真实值或者model的返回值取名并不一样,所以fastNLP的 :mod:`Loss<fastNLP.core.losses>` 提供一种类似于映射的机制来匹配对应的值, | |||||
比如这里 :class:`~fastNLP.CrossEntropyLoss` 将尝试找到名为'label'的内容来作为真实值得到loss; | |||||
而pred=None, 则 :class:`~fastNLP.CrossEntropyLoss` 使用'pred'作为名称匹配预测值, | |||||
正好forward的返回值也叫pred,所以这里不需要申明pred。 | |||||
尽管fastNLP使用了映射机制来使得loss的计算变得比较灵活,但有些情况下loss必须在模型中进行计算,比如使用了CRF的模型。 | |||||
fastNLP中提供了 :class:`~fastNLP.LossInForward` 这个loss。 | |||||
这个loss的原理是直接在forward()的返回结果中找到loss_key(默认寻找'loss')指定的那个tensor,并使用它作为loss。 | |||||
如果Trainer初始化没有提供loss则默认使用 :class:`~fastNLP.LossInForward` 。 | |||||
.. todo:: | |||||
补充一个例子 详细例子可以参照 | |||||
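一个使用 :class:`~fastNLP.LossInForward` 的最小示意(沿用上面例子中的数据与优化器,假设此时 model.forward() 返回的 dict 中含有 'loss' 这个 key):

.. code-block:: python

    from fastNLP import Trainer, LossInForward

    trainer = Trainer(tr_dataset, model, loss=LossInForward(),
                      optimizer=SGD(model.parameters(), lr=0.1),
                      dev_data=dev_data, metrics=AccuracyMetric(target='label'))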
1.3 Metric | 1.3 Metric | ||||
:mod:`Metric<fastNLP.core.metrics>` 使用了与上述Loss一样的策略,即使用名称进行匹配。 | |||||
AccuracyMetric(target='label')的情况与CrossEntropyLoss 是同理的。 | |||||
在进行验证时,可能用到的计算与forward()中不太一致,没有办法直接从forward()的结果中得到预测值,这时模型可以提供一个predict()方法, | |||||
如果提供的模型具有predict方法,则在模型验证时将调用predict()方法获取预测结果, | |||||
传入到predict()的参数也是从DataSet中被设置为input的field中选择出来的; | |||||
与forward()一样,返回值需要为一个dict。 | |||||
---------------------------- | |||||
:mod:`Metric<fastNLP.core.metrics>` 使用了与上述Loss一样的策略,即使用名称进行匹配。 | |||||
AccuracyMetric(target='label')的情况与CrossEntropyLoss 是同理的。 | |||||
在进行验证时,可能用到的计算与forward()中不太一致,没有办法直接从forward()的结果中得到预测值,这时模型可以提供一个predict()方法, | |||||
如果提供的模型具有predict方法,则在模型验证时将调用predict()方法获取预测结果, | |||||
传入到predict()的参数也是从DataSet中被设置为input的field中选择出来的; | |||||
与forward()一样,返回值需要为一个dict。 | |||||
.. todo:: | |||||
补充一个例子 具体例子可以参考 | |||||
.. todo:: | |||||
补充一个例子 具体例子可以参考 | |||||
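一个简单的示意(沿用上面例子中的 import 与 MLP,仅为说明 predict() 的约定:参数来自被设置为 input 的 field,返回值同样需要是一个 dict):

.. code-block:: python

    class Model(nn.Module):
        def __init__(self, input_num):
            super().__init__()
            self.fcs = MLP([input_num, 40, 40, 2], 'relu')

        def forward(self, x):
            return {'pred': self.fcs(x)}

        def predict(self, x):
            # 验证时若定义了 predict(),fastNLP 会调用它来获取预测结果
            return {'pred': self.fcs(x).argmax(dim=-1)}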
---------------------------- | |||||
2. Trainer的代码检查 | |||||
---------------------------- | |||||
2 Trainer的代码检查 | |||||
由于在fastNLP中采取了映射的机制,所以难免可能存在对应出错的情况。Trainer提供一种映射检查机制,可以通过check_code_level来进行控制 | |||||
比如下面的例子中,由于各种原因产生的报错 | |||||
由于在fastNLP中采取了映射的机制,所以难免可能存在对应出错的情况。Trainer提供一种映射检查机制,可以通过check_code_level来进行控制。 | |||||
比如下面的例子中展示了由于各种原因产生的报错: | |||||
Example2.1 | Example2.1 | ||||
:: | |||||
import numpy as np | |||||
from torch import nn | |||||
import torch | |||||
from torch.optim import SGD | |||||
from fastNLP import Trainer | |||||
from fastNLP import DataSet | |||||
class Model(nn.Module): | |||||
def __init__(self): | |||||
super().__init__() | |||||
self.fc = nn.Linear(1, 1) | |||||
def forward(self, x, b): | |||||
loss = torch.mean((self.fc(x)-b)**2) | |||||
return {'loss': loss} | |||||
model = Model() | |||||
dataset = DataSet({'a': np.arange(10), 'b':np.arange(10)*2}) | |||||
dataset.set_input('a', 'b') | |||||
trainer = Trainer(dataset, model, loss=None, optimizer=SGD(model.parameters(), lr=0.001)) | |||||
trainer = Trainer(dataset, model, SGD(model.parameters())) | |||||
# 会报以下的错误 | |||||
# input fields after batch(if batch size is 2): | |||||
# a: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) | |||||
# b: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) | |||||
# There is no target field. | |||||
# .... | |||||
# NameError: | |||||
# Problems occurred when calling Model.forward(self, x, b) | |||||
# missing param: ['x'] | |||||
# unused field: ['a'] | |||||
# Suggestion: You need to provide ['x'] in DataSet and set it as input. | |||||
This happens because, when the Trainer is initialized, fastNLP tries to run one forward() and one backward() pass with a batch of size 2. Two kinds of information in the output can guide you:

1 The 'input fields after batch...' part shows the type and shape of each field of the train dataset after batching (no target section appears here because this dataset has no target field). Use it to check whether the right fields were set as input or target.
2 The NameError is raised when the name mapping fails. Here the error occurs while calling forward (you can tell from 'Model.forward(self, x, b)'): the parameter 'x' required by forward() could not be found. The message also reports that 'x' is missing while the field 'a' is unused, which points to a field-name mismatch. Renaming the DataSet field 'a' to 'x', or renaming the model parameter 'x' to 'a', both solve the problem.
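For instance, a minimal sketch of the renaming fix (this only addresses the name mismatch that the check complains about)::

    dataset = DataSet({'x': np.arange(10), 'b': np.arange(10)*2})  # field renamed from 'a' to 'x'
    dataset.set_input('x', 'b')                                    # now forward(self, x, b) finds both of its inputs
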
The next example shows an error caused by the loss computation failing to find the values it needs.
Example2.2

.. code-block:: python

    import numpy as np
    from torch import nn
    from torch.optim import SGD
    from fastNLP import Trainer
    from fastNLP import DataSet
    from fastNLP import L1Loss
    import torch

    class Model(nn.Module):
        def __init__(self):
            super().__init__()
            self.fc = nn.Linear(1, 1)

        def forward(self, a):
            return {'pred_b': self.fc(a.unsqueeze(1)).squeeze(1), 'No use': 1}

    model = Model()

    dataset = DataSet({'a': np.arange(10, dtype=float), 'b':np.arange(10, dtype=float)*2})
    dataset.set_input('a')
    dataset.set_target('b')

    trainer = Trainer(dataset, model, loss=L1Loss(target='label'), optimizer=SGD(model.parameters(), lr=0.001))

    # the error message is as follows
    # input fields after batch(if batch size is 2):
    # a: (1)type:torch.Tensor (2)dtype:torch.float32, (3)shape:torch.Size([2])
    # target fields after batch(if batch size is 2):
    # b: (1)type:torch.Tensor (2)dtype:torch.float32, (3)shape:torch.Size([2])
    # ....
    # NameError:
    # Problems occurred when calling L1Loss.get_loss(self, pred, target)
    # missing param: ['pred(assign to `pred` in `L1Loss`)', 'label(assign to `target` in `L1Loss`)']
    # unused field: ['b']
    # unused param: ['pred_b', 'No use']
    # target field: ['b']
    # param from Model.forward(self, a): ['pred_b', 'No use']
    # Suggestion: (1). Check key assignment for `target` when initialize L1Loss. Or provide `label` in DataSet or output of Model.forward(self, a).
    # (2). Check key assignment for `pred` when initialize L1Loss. Or provide `pred` in DataSet or output of Model.forward(self, a).

The error message again has two parts:

1 The first part is the same as above.
2 This time the error is raised because the loss computation cannot find the values it needs (you can tell from 'L1Loss.get_loss(self, pred, target)'): neither `pred` nor `label` (we set target to 'label' when initializing L1Loss) could be found. In the message, 'unused field' lists fields that exist in the DataSet but were not set as input or target; 'unused param' lists keys returned by forward() that were never used; 'target field' lists the fields set as target; 'param from Model.forward(self, a)' lists all keys returned by forward(); and "Suggestion" gives advice on how to resolve the error. Note that in some cases, e.g. when forward() returns exactly one value and there is exactly one target, fastNLP skips the matching and directly uses the forward() result as pred and the DataSet target as target; the extra 'No use' key in the example above exists only to force the Loss to perform the matching.
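One possible fix, sketched against Example2.2's code, is to tell L1Loss where to find each value::

    from fastNLP import L1Loss

    # forward() returns the prediction under 'pred_b', and the target field is 'b'
    trainer = Trainer(dataset, model,
                      loss=L1Loss(pred='pred_b', target='b'),
                      optimizer=SGD(model.parameters(), lr=0.001))
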
The next example shows the error reported when something goes wrong while a dev dataset is provided.
Example2.3

.. code-block:: python

    import numpy as np
    from torch import nn
    from torch.optim import SGD
    from fastNLP import Trainer
    from fastNLP import DataSet
    from fastNLP import AccuracyMetric
    import torch

    class Model(nn.Module):
        def __init__(self):
            super().__init__()
            self.fc = nn.Linear(1, 1)

        def forward(self, a, b):
            loss = torch.mean((self.fc(a.float().unsqueeze(1))-b.float())**2)
            return {'loss': loss}

        def predict(self, a):  # predict() is used for validation
            return {'output': self.fc(a.float().unsqueeze(1))}  # the returned dict does not contain the key 'pred'

    model = Model()

    dataset = DataSet({'a': np.arange(10), 'b':np.arange(10)*2})
    dev_data = DataSet({'a': np.arange(10, 20), 'b':np.arange(10, 20)*2})
    dataset.set_input('a', 'b')
    dev_data.set_input('a')  # note that no target is set here

    trainer = Trainer(dataset, model, loss=None, optimizer=SGD(model.parameters(), lr=0.001),
                      dev_data=dev_data, metrics=AccuracyMetric())

    # error message
    # ...
    # NameError:
    # Problems occurred when calling AccuracyMetric.evaluate(self, pred, target, seq_len=None)
    # missing param: ['pred(assign to `pred` in `AccuracyMetric`)', 'target(assign to `target` in `AccuracyMetric`)']
    # unused param: ['output']
    # target field: []
    # param from Model.predict(self, a): ['output']
    # Suggestion: (1). Check key assignment for `pred` when initialize AccuracyMetric. Or provide `pred` in DataSet or output of Model.predict(self, a).
    # (2). Check key assignment for `target` when initialize AccuracyMetric. Or provide `target` in DataSet or output of Model.predict(self, a).

The error message is similar to the previous ones, but 'AccuracyMetric.evaluate(self, pred, target, seq_len=None)' tells you that the error happened during evaluation,
so you do not have to finish a whole epoch of training before discovering that the evaluation is mis-configured.
The fix here is to tell the metric, at initialization time, to take `pred` from the key 'output', i.e. AccuracyMetric(pred='output').
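A sketch of that fix against Example2.3's code (additionally marking 'b' as the dev set's target and spelling out both keys to keep the mapping unambiguous)::

    dev_data.set_target('b')
    trainer = Trainer(dataset, model, loss=None, optimizer=SGD(model.parameters(), lr=0.001),
                      dev_data=dev_data, metrics=AccuracyMetric(pred='output', target='b'))
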
The strictness of this check can be adjusted with check_code_level; the default value 0 means the check is performed.
----------------------------
3. Trainer and Callback
----------------------------

Although the Trainer already integrates a number of features, it cannot cover everything one may need during training, such as negative sampling, learning rate decay or early stopping.
To address this, fastNLP introduces the callback mechanism: a :class:`~fastNLP.Callback` is a collection of functions that are run at specific stages of the Trainer's training loop,
and every :class:`~fastNLP.Callback` has on_* hooks (e.g. on_train_start, on_backward_begin).
If a Callback implements such a hook, the Trainer calls it when it reaches the corresponding stage, for example::

    import time

    from fastNLP import Callback, EarlyStopCallback, Trainer, CrossEntropyLoss, AccuracyMetric
    from fastNLP.models import CNNText

    start_time = time.time()

    class MyCallback(Callback):
        def on_epoch_end(self):
            print('{:d}ms\n\n'.format(round((time.time()-start_time)*1000)))

    model = CNNText((len(vocab),50), num_classes=5, padding=2, dropout=0.1)
    trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data, loss=CrossEntropyLoss(),
                      metrics=AccuracyMetric(), callbacks=[MyCallback(),EarlyStopCallback(10)])
    trainer.train()

Here we defined our own callback by subclassing :class:`~fastNLP.Callback` and passed it, together with the built-in :class:`~fastNLP.EarlyStopCallback`,
to the :class:`~fastNLP.Trainer`, extending the Trainer's functionality.
fastNLP already ships with many callbacks ready to use; see :doc:`fastNLP.core.callback` for the full list.
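For instance, assuming GradientClipCallback is among the built-in callbacks exported by this version, several callbacks can be combined in one list (a sketch)::

    from fastNLP import GradientClipCallback, EarlyStopCallback

    callbacks = [GradientClipCallback(clip_value=5, clip_type='value'), EarlyStopCallback(10)]
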
""" | """ | ||||
__all__ = [ | __all__ = [ | ||||
@@ -4,7 +4,6 @@ utils模块实现了 fastNLP 内部和外部所需的很多工具。其中用户 | |||||
__all__ = [ | __all__ = [ | ||||
"cache_results", | "cache_results", | ||||
"seq_len_to_mask", | "seq_len_to_mask", | ||||
"Option", | |||||
] | ] | ||||
import _pickle | import _pickle | ||||
@@ -24,26 +23,27 @@ _CheckRes = namedtuple('_CheckRes', ['missing', 'unused', 'duplicated', 'require | |||||
class Option(dict): | class Option(dict): | ||||
"""a dict can treat keys as attributes""" | """a dict can treat keys as attributes""" | ||||
def __getattr__(self, item): | def __getattr__(self, item): | ||||
try: | try: | ||||
return self.__getitem__(item) | return self.__getitem__(item) | ||||
except KeyError: | except KeyError: | ||||
raise AttributeError(item) | raise AttributeError(item) | ||||
def __setattr__(self, key, value): | def __setattr__(self, key, value): | ||||
if key.startswith('__') and key.endswith('__'): | if key.startswith('__') and key.endswith('__'): | ||||
raise AttributeError(key) | raise AttributeError(key) | ||||
self.__setitem__(key, value) | self.__setitem__(key, value) | ||||
def __delattr__(self, item): | def __delattr__(self, item): | ||||
try: | try: | ||||
self.pop(item) | self.pop(item) | ||||
except KeyError: | except KeyError: | ||||
raise AttributeError(item) | raise AttributeError(item) | ||||
def __getstate__(self): | def __getstate__(self): | ||||
return self | return self | ||||
def __setstate__(self, state): | def __setstate__(self, state): | ||||
self.update(state) | self.update(state) | ||||
@@ -163,6 +163,7 @@ def cache_results(_cache_fp, _refresh=False, _verbose=1): | |||||
return wrapper_ | return wrapper_ | ||||
def _save_model(model, model_name, save_dir, only_param=False): | def _save_model(model, model_name, save_dir, only_param=False): | ||||
""" 存储不含有显卡信息的state_dict或model | """ 存储不含有显卡信息的state_dict或model | ||||
:param model: | :param model: | ||||
@@ -673,7 +674,7 @@ def seq_len_to_mask(seq_len, max_len=None): | |||||
将一个表示sequence length的一维数组转换为二维的mask,不包含的位置为0。 | 将一个表示sequence length的一维数组转换为二维的mask,不包含的位置为0。 | ||||
转变 1-d seq_len到2-d mask. | 转变 1-d seq_len到2-d mask. | ||||
Example:: | |||||
.. code-block:: | |||||
>>> seq_len = torch.arange(2, 16) | >>> seq_len = torch.arange(2, 16) | ||||
>>> mask = seq_len_to_mask(seq_len) | >>> mask = seq_len_to_mask(seq_len) | ||||
@@ -691,7 +692,7 @@ def seq_len_to_mask(seq_len, max_len=None): | |||||
:param np.ndarray,torch.LongTensor seq_len: shape将是(B,) | :param np.ndarray,torch.LongTensor seq_len: shape将是(B,) | ||||
:param int max_len: 将长度pad到这个长度。默认(None)使用的是seq_len中最长的长度。但在nn.DataParallel的场景下可能不同卡的seq_len会有 | :param int max_len: 将长度pad到这个长度。默认(None)使用的是seq_len中最长的长度。但在nn.DataParallel的场景下可能不同卡的seq_len会有 | ||||
区别,所以需要传入一个max_len使得mask的长度是pad到该长度。 | 区别,所以需要传入一个max_len使得mask的长度是pad到该长度。 | ||||
:return: np.ndarray or torch.Tensor, shape将是(B, max_length)。 元素类似为bool或torch.uint8 | |||||
:return: np.ndarray, torch.Tensor 。shape将是(B, max_length), 元素类似为bool或torch.uint8 | |||||
""" | """ | ||||
if isinstance(seq_len, np.ndarray): | if isinstance(seq_len, np.ndarray): | ||||
assert len(np.shape(seq_len)) == 1, f"seq_len can only have one dimension, got {len(np.shape(seq_len))}." | assert len(np.shape(seq_len)) == 1, f"seq_len can only have one dimension, got {len(np.shape(seq_len))}." | ||||
@@ -737,7 +738,8 @@ class _pseudo_tqdm: | |||||
def __exit__(self, exc_type, exc_val, exc_tb): | def __exit__(self, exc_type, exc_val, exc_tb): | ||||
del self | del self | ||||
def iob2(tags:List[str])->List[str]: | |||||
def iob2(tags: List[str]) -> List[str]: | |||||
""" | """ | ||||
检查数据是否是合法的IOB数据,如果是IOB1会被自动转换为IOB2。两者的差异见 | 检查数据是否是合法的IOB数据,如果是IOB1会被自动转换为IOB2。两者的差异见 | ||||
https://datascience.stackexchange.com/questions/37824/difference-between-iob-and-iob2-format | https://datascience.stackexchange.com/questions/37824/difference-between-iob-and-iob2-format | ||||
@@ -760,7 +762,8 @@ def iob2(tags:List[str])->List[str]: | |||||
tags[i] = "B" + tag[1:] | tags[i] = "B" + tag[1:] | ||||
return tags | return tags | ||||
def iob2bioes(tags:List[str])->List[str]: | |||||
def iob2bioes(tags: List[str]) -> List[str]: | |||||
""" | """ | ||||
将iob的tag转换为bioes编码 | 将iob的tag转换为bioes编码 | ||||
:param tags: List[str]. 编码需要是大写的。 | :param tags: List[str]. 编码需要是大写的。 | ||||
@@ -773,15 +776,15 @@ def iob2bioes(tags:List[str])->List[str]: | |||||
else: | else: | ||||
split = tag.split('-')[0] | split = tag.split('-')[0] | ||||
if split == 'B': | if split == 'B': | ||||
if i+1!=len(tags) and tags[i+1].split('-')[0] == 'I': | |||||
if i + 1 != len(tags) and tags[i + 1].split('-')[0] == 'I': | |||||
new_tags.append(tag) | new_tags.append(tag) | ||||
else: | else: | ||||
new_tags.append(tag.replace('B-', 'S-')) | new_tags.append(tag.replace('B-', 'S-')) | ||||
elif split == 'I': | elif split == 'I': | ||||
if i + 1<len(tags) and tags[i+1].split('-')[0] == 'I': | |||||
if i + 1 < len(tags) and tags[i + 1].split('-')[0] == 'I': | |||||
new_tags.append(tag) | new_tags.append(tag) | ||||
else: | else: | ||||
new_tags.append(tag.replace('I-', 'E-')) | new_tags.append(tag.replace('I-', 'E-')) | ||||
else: | else: | ||||
raise TypeError("Invalid IOB format.") | raise TypeError("Invalid IOB format.") | ||||
return new_tags | |||||
return new_tags |
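# Illustrative sanity check for the conversion above (hypothetical tags, not from the library's tests):
#   iob2bioes(['B-PER', 'I-PER', 'O', 'B-LOC']) -> ['B-PER', 'E-PER', 'O', 'S-LOC']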
@@ -10,6 +10,7 @@ from .utils import Option | |||||
from functools import partial | from functools import partial | ||||
import numpy as np | import numpy as np | ||||
class VocabularyOption(Option): | class VocabularyOption(Option): | ||||
def __init__(self, | def __init__(self, | ||||
max_size=None, | max_size=None, | ||||
@@ -92,7 +93,7 @@ class Vocabulary(object): | |||||
self.rebuild = True | self.rebuild = True | ||||
# 用于承载不需要单独创建entry的词语,具体见from_dataset()方法 | # 用于承载不需要单独创建entry的词语,具体见from_dataset()方法 | ||||
self._no_create_word = Counter() | self._no_create_word = Counter() | ||||
@_check_build_status | @_check_build_status | ||||
def update(self, word_lst, no_create_entry=False): | def update(self, word_lst, no_create_entry=False): | ||||
"""依次增加序列中词在词典中的出现频率 | """依次增加序列中词在词典中的出现频率 | ||||
@@ -123,7 +124,7 @@ class Vocabulary(object): | |||||
""" | """ | ||||
self._add_no_create_entry(word, no_create_entry) | self._add_no_create_entry(word, no_create_entry) | ||||
self.word_count[word] += 1 | self.word_count[word] += 1 | ||||
def _add_no_create_entry(self, word, no_create_entry): | def _add_no_create_entry(self, word, no_create_entry): | ||||
""" | """ | ||||
在新加入word时,检查_no_create_word的设置。 | 在新加入word时,检查_no_create_word的设置。 | ||||
@@ -139,7 +140,7 @@ class Vocabulary(object): | |||||
self._no_create_word[w] += 1 | self._no_create_word[w] += 1 | ||||
elif not no_create_entry and w in self._no_create_word: | elif not no_create_entry and w in self._no_create_word: | ||||
self._no_create_word.pop(w) | self._no_create_word.pop(w) | ||||
@_check_build_status | @_check_build_status | ||||
def add_word(self, word, no_create_entry=False): | def add_word(self, word, no_create_entry=False): | ||||
""" | """ | ||||
@@ -193,10 +194,10 @@ class Vocabulary(object): | |||||
self.word2idx.update({w: i + start_idx for i, (w, _) in enumerate(words)}) | self.word2idx.update({w: i + start_idx for i, (w, _) in enumerate(words)}) | ||||
self.build_reverse_vocab() | self.build_reverse_vocab() | ||||
self.rebuild = False | self.rebuild = False | ||||
def build_reverse_vocab(self): | def build_reverse_vocab(self): | ||||
""" | """ | ||||
基于 "word to index" dict, 构建 "index to word" dict. | |||||
基于 `word to index` dict, 构建 `index to word` dict. | |||||
""" | """ | ||||
self.idx2word = {i: w for w, i in self.word2idx.items()} | self.idx2word = {i: w for w, i in self.word2idx.items()} | ||||
@@ -250,9 +251,9 @@ class Vocabulary(object): | |||||
# remember to use `field_name` | # remember to use `field_name` | ||||
vocab.index_dataset(train_data, dev_data, test_data, field_name='words') | vocab.index_dataset(train_data, dev_data, test_data, field_name='words') | ||||
:param datasets: 需要转index的 class:`~fastNLP.DataSet` , 支持一个或多个(list) | |||||
:param ~fastNLP.DataSet,List[~fastNLP.DataSet] datasets: 需要转index的一个或多个数据集 | |||||
:param str field_name: 需要转index的field, 若有多个 DataSet, 每个DataSet都必须有此 field. | :param str field_name: 需要转index的field, 若有多个 DataSet, 每个DataSet都必须有此 field. | ||||
目前仅支持 ``str`` , ``list(str)`` , ``list(list(str))`` | |||||
目前仅支持 ``str`` , ``List[str]`` , ``List[List[str]]`` | |||||
:param str new_field_name: 保存结果的field_name. 若为 ``None`` , 将覆盖原field. | :param str new_field_name: 保存结果的field_name. 若为 ``None`` , 将覆盖原field. | ||||
Default: ``None`` | Default: ``None`` | ||||
""" | """ | ||||
@@ -285,11 +286,11 @@ class Vocabulary(object): | |||||
raise e | raise e | ||||
else: | else: | ||||
raise RuntimeError("Only DataSet type is allowed.") | raise RuntimeError("Only DataSet type is allowed.") | ||||
@property | @property | ||||
def _no_create_word_length(self): | def _no_create_word_length(self): | ||||
return len(self._no_create_word) | return len(self._no_create_word) | ||||
def from_dataset(self, *datasets, field_name, no_create_entry_dataset=None): | def from_dataset(self, *datasets, field_name, no_create_entry_dataset=None): | ||||
""" | """ | ||||
使用dataset的对应field中词构建词典:: | 使用dataset的对应field中词构建词典:: | ||||
@@ -297,11 +298,11 @@ class Vocabulary(object): | |||||
# remember to use `field_name` | # remember to use `field_name` | ||||
vocab.from_dataset(train_data1, train_data2, field_name='words') | vocab.from_dataset(train_data1, train_data2, field_name='words') | ||||
:param datasets: 需要转index的 class:`~fastNLP.DataSet` , 支持一个或多个(list) | |||||
:param field_name: 可为 ``str`` 或 ``list(str)`` . | |||||
:param ~fastNLP.DataSet,List[~fastNLP.DataSet] datasets: 需要转index的一个或多个数据集 | |||||
:param str,List[str] field_name: 可为 ``str`` 或 ``List[str]`` . | |||||
构建词典所使用的 field(s), 支持一个或多个field | 构建词典所使用的 field(s), 支持一个或多个field | ||||
若有多个 DataSet, 每个DataSet都必须有这些field. | 若有多个 DataSet, 每个DataSet都必须有这些field. | ||||
目前仅支持的field结构: ``str`` , ``list(str)`` , ``list(list(str))`` | |||||
目前仅支持的field结构: ``str`` , ``List[str]`` , ``list[List[str]]`` | |||||
:param no_create_entry_dataset: 可以传入DataSet, List[DataSet]或者None(默认),该选项用在接下来的模型会使用pretrain | :param no_create_entry_dataset: 可以传入DataSet, List[DataSet]或者None(默认),该选项用在接下来的模型会使用pretrain | ||||
的embedding(包括glove, word2vec, elmo与bert)且会finetune的情况。如果仅使用来自于train的数据建立vocabulary,会导致test与dev | 的embedding(包括glove, word2vec, elmo与bert)且会finetune的情况。如果仅使用来自于train的数据建立vocabulary,会导致test与dev | ||||
中的数据无法充分利用到来自于预训练embedding的信息,所以在建立词表的时候将test与dev考虑进来会使得最终的结果更好。 | 中的数据无法充分利用到来自于预训练embedding的信息,所以在建立词表的时候将test与dev考虑进来会使得最终的结果更好。 | ||||
@@ -331,7 +332,7 @@ class Vocabulary(object): | |||||
for words in field: | for words in field: | ||||
for word in words: | for word in words: | ||||
self.add_word(word, no_create_entry=no_create_entry) | self.add_word(word, no_create_entry=no_create_entry) | ||||
for idx, dataset in enumerate(datasets): | for idx, dataset in enumerate(datasets): | ||||
if isinstance(dataset, DataSet): | if isinstance(dataset, DataSet): | ||||
try: | try: | ||||
@@ -341,7 +342,7 @@ class Vocabulary(object): | |||||
raise e | raise e | ||||
else: | else: | ||||
raise TypeError("Only DataSet type is allowed.") | raise TypeError("Only DataSet type is allowed.") | ||||
if no_create_entry_dataset is not None: | if no_create_entry_dataset is not None: | ||||
partial_construct_vocab = partial(construct_vocab, no_create_entry=True) | partial_construct_vocab = partial(construct_vocab, no_create_entry=True) | ||||
if isinstance(no_create_entry_dataset, DataSet): | if isinstance(no_create_entry_dataset, DataSet): | ||||
@@ -352,7 +353,7 @@ class Vocabulary(object): | |||||
raise TypeError("Only DataSet type is allowed.") | raise TypeError("Only DataSet type is allowed.") | ||||
dataset.apply(partial_construct_vocab) | dataset.apply(partial_construct_vocab) | ||||
return self | return self | ||||
def _is_word_no_create_entry(self, word): | def _is_word_no_create_entry(self, word): | ||||
""" | """ | ||||
判断当前的word是否是不需要创建entry的,具体参见from_dataset的说明 | 判断当前的word是否是不需要创建entry的,具体参见from_dataset的说明 | ||||
@@ -360,11 +361,10 @@ class Vocabulary(object): | |||||
:return: bool | :return: bool | ||||
""" | """ | ||||
return word in self._no_create_word | return word in self._no_create_word | ||||
def to_index(self, w): | def to_index(self, w): | ||||
""" | """ | ||||
将词转为数字. 若词不再词典中被记录, 将视为 unknown, 若 ``unknown=None`` , 将抛出 | |||||
``ValueError``:: | |||||
将词转为数字. 若词不在词典中被记录, 将视为 unknown, 若 ``unknown=None`` , 将抛出 ``ValueError`` ::
index = vocab.to_index('abc') | index = vocab.to_index('abc') | ||||
# equals to | # equals to | ||||
@@ -0,0 +1,23 @@ | |||||
""" | |||||
embeddings 模块实现了 fastNLP 所需的各类 embedding,包括静态的 StaticEmbedding、上下文相关的 ElmoEmbedding 与 BertEmbedding、基于字符的 CNNCharEmbedding 与 LSTMCharEmbedding,以及用于拼接多个 embedding 的 StackEmbedding。
""" | |||||
__all__ = [ | |||||
"Embedding", | |||||
"StaticEmbedding", | |||||
"ElmoEmbedding", | |||||
"BertEmbedding", | |||||
"StackEmbedding", | |||||
"LSTMCharEmbedding", | |||||
"CNNCharEmbedding", | |||||
"get_embeddings" | |||||
] | |||||
from .embedding import Embedding | |||||
from .static_embedding import StaticEmbedding | |||||
from .elmo_embedding import ElmoEmbedding | |||||
from .bert_embedding import BertEmbedding | |||||
from .char_embedding import CNNCharEmbedding, LSTMCharEmbedding | |||||
from .stack_embedding import StackEmbedding | |||||
from .utils import get_embeddings |
@@ -0,0 +1,321 @@ | |||||
import os | |||||
import collections | |||||
from torch import nn | |||||
import torch | |||||
import numpy as np | |||||
from itertools import chain | |||||
from ..core.vocabulary import Vocabulary | |||||
from ..io.file_utils import _get_base_url, cached_path, PRETRAINED_BERT_MODEL_DIR | |||||
from ..modules.encoder.bert import _WordPieceBertModel, BertModel, BertTokenizer | |||||
from .contextual_embedding import ContextualEmbedding | |||||
class BertEmbedding(ContextualEmbedding): | |||||
""" | |||||
别名::class:`fastNLP.embeddings.BertEmbedding` :class:`fastNLP.embeddings.bert_embedding.BertEmbedding` | |||||
使用BERT对words进行encode的Embedding。建议将输入的words长度限制在450以内,而不要使用512。这是由于预训练的bert模型长 | |||||
度限制为512个token,而因为输入的word是未进行word piece分割的,在分割之后长度可能会超过最大长度限制。 | |||||
Example:: | |||||
>>> embedding = BertEmbedding(vocab, model_dir_or_name='en-base-uncased', requires_grad=False, layers='4,-2,-1') | |||||
:param fastNLP.Vocabulary vocab: 词表 | |||||
:param str model_dir_or_name: 模型所在目录或者模型的名称。默认值为 ``en-base-uncased``. | |||||
:param str layers:最终结果中的表示。以','隔开层数,可以以负数去索引倒数几层 | |||||
:param str pool_method: 因为在bert中,每个word会被表示为多个word pieces, 当获取一个word的表示的时候,怎样从它的word pieces | |||||
中计算得到它对应的表示。支持``last``, ``first``, ``avg``, ``max``。 | |||||
:param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 | |||||
:param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 | |||||
:param bool include_cls_sep: bool,在bert计算句子的表示的时候,需要在前面加上[CLS]和[SEP], 是否在结果中保留这两个内容。 这样 | |||||
会使得word embedding的结果比输入的结果长两个token。在使用 :class:`StackEmbedding` 时可能会遇到问题。
:param bool requires_grad: 是否需要gradient。 | |||||
""" | |||||
def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en-base-uncased', layers: str='-1', | |||||
pool_method: str='first', word_dropout=0, dropout=0, requires_grad: bool=False, | |||||
include_cls_sep: bool=False): | |||||
super(BertEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) | |||||
# 根据model_dir_or_name检查是否存在并下载 | |||||
if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR: | |||||
PRETRAIN_URL = _get_base_url('bert') | |||||
model_name = PRETRAINED_BERT_MODEL_DIR[model_dir_or_name] | |||||
model_url = PRETRAIN_URL + model_name | |||||
model_dir = cached_path(model_url) | |||||
# 检查是否存在 | |||||
elif os.path.isdir(os.path.expanduser(os.path.abspath(model_dir_or_name))): | |||||
model_dir = model_dir_or_name | |||||
else: | |||||
raise ValueError(f"Cannot recognize {model_dir_or_name}.") | |||||
self.model = _WordBertModel(model_dir=model_dir, vocab=vocab, layers=layers, | |||||
pool_method=pool_method, include_cls_sep=include_cls_sep) | |||||
self.requires_grad = requires_grad | |||||
self._embed_size = len(self.model.layers)*self.model.encoder.hidden_size | |||||
def _delete_model_weights(self): | |||||
del self.model | |||||
def forward(self, words): | |||||
""" | |||||
计算words的bert embedding表示。计算之前会在每句话的开始增加[CLS]在结束增加[SEP], 并根据include_cls_sep判断要不要 | |||||
删除这两个token的表示。 | |||||
:param torch.LongTensor words: [batch_size, max_len] | |||||
:return: torch.FloatTensor. batch_size x max_len x (768*len(self.layers)) | |||||
""" | |||||
words = self.drop_word(words) | |||||
outputs = self._get_sent_reprs(words) | |||||
if outputs is not None: | |||||
return self.dropout(outputs)
outputs = self.model(words) | |||||
outputs = torch.cat([*outputs], dim=-1) | |||||
return self.dropout(outputs) | |||||
@property | |||||
def requires_grad(self): | |||||
""" | |||||
Embedding的参数是否允许优化。True: 所有参数运行优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许 | |||||
:return: | |||||
""" | |||||
requires_grads = set([param.requires_grad for name, param in self.named_parameters() | |||||
if 'word_pieces_lengths' not in name]) | |||||
if len(requires_grads) == 1: | |||||
return requires_grads.pop() | |||||
else: | |||||
return None | |||||
@requires_grad.setter | |||||
def requires_grad(self, value): | |||||
for name, param in self.named_parameters(): | |||||
if 'word_pieces_lengths' in name: # 这个不能加入到requires_grad中 | |||||
continue | |||||
param.requires_grad = value | |||||
class BertWordPieceEncoder(nn.Module): | |||||
""" | |||||
读取bert模型,读取之后调用 index_datasets 方法在dataset中生成word_pieces这一列。
:param str model_dir_or_name: 模型所在目录或者模型的名称。默认值为``en-base-uncased`` | |||||
:param str layers:最终结果中的表示。以','隔开层数,可以以负数去索引倒数几层 | |||||
:param bool requires_grad: 是否需要gradient。 | |||||
""" | |||||
def __init__(self, model_dir_or_name: str='en-base-uncased', layers: str='-1', | |||||
requires_grad: bool=False): | |||||
super().__init__() | |||||
PRETRAIN_URL = _get_base_url('bert') | |||||
if model_dir_or_name in PRETRAINED_BERT_MODEL_DIR: | |||||
model_name = PRETRAINED_BERT_MODEL_DIR[model_dir_or_name] | |||||
model_url = PRETRAIN_URL + model_name | |||||
model_dir = cached_path(model_url) | |||||
# 检查是否存在 | |||||
elif os.path.isdir(model_dir_or_name): | |||||
model_dir = model_dir_or_name | |||||
else: | |||||
raise ValueError(f"Cannot recognize {model_dir_or_name}.") | |||||
self.model = _WordPieceBertModel(model_dir=model_dir, layers=layers) | |||||
self._embed_size = len(self.model.layers) * self.model.encoder.hidden_size | |||||
self.requires_grad = requires_grad | |||||
@property | |||||
def requires_grad(self): | |||||
""" | |||||
Embedding的参数是否允许优化。True: 所有参数运行优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许 | |||||
:return: | |||||
""" | |||||
requires_grads = set([param.requires_grad for name, param in self.named_parameters()]) | |||||
if len(requires_grads) == 1: | |||||
return requires_grads.pop() | |||||
else: | |||||
return None | |||||
@requires_grad.setter | |||||
def requires_grad(self, value): | |||||
for name, param in self.named_parameters(): | |||||
param.requires_grad = value | |||||
@property | |||||
def embed_size(self): | |||||
return self._embed_size | |||||
def index_datasets(self, *datasets, field_name): | |||||
""" | |||||
使用bert的tokenizer新生成word_pieces列加入到datasets中,并将他们设置为input。如果首尾不是 | |||||
[CLS]与[SEP]会在首尾额外加入[CLS]与[SEP], 且将word_pieces这一列的pad value设置为了bert的pad value。 | |||||
:param datasets: DataSet对象 | |||||
:param field_name: 基于哪一列的内容生成word_pieces列。这一列中每个数据应该是List[str]的形式。 | |||||
:return: | |||||
""" | |||||
self.model.index_dataset(*datasets, field_name=field_name) | |||||
def forward(self, word_pieces, token_type_ids=None): | |||||
""" | |||||
计算words的bert embedding表示。传入的words中应该自行包含[CLS]与[SEP]的tag。 | |||||
:param word_pieces: batch_size x max_len
:param token_type_ids: batch_size x max_len, 用于区分前一句和后一句话 | |||||
:return: torch.FloatTensor. batch_size x max_len x (768*len(self.layers)) | |||||
""" | |||||
outputs = self.model(word_pieces, token_type_ids) | |||||
outputs = torch.cat([*outputs], dim=-1) | |||||
return outputs | |||||
class _WordBertModel(nn.Module): | |||||
def __init__(self, model_dir:str, vocab:Vocabulary, layers:str='-1', pool_method:str='first', include_cls_sep:bool=False): | |||||
super().__init__() | |||||
self.tokenzier = BertTokenizer.from_pretrained(model_dir) | |||||
self.encoder = BertModel.from_pretrained(model_dir) | |||||
# 检查encoder_layer_number是否合理 | |||||
encoder_layer_number = len(self.encoder.encoder.layer) | |||||
self.layers = list(map(int, layers.split(','))) | |||||
for layer in self.layers: | |||||
if layer<0: | |||||
assert -layer<=encoder_layer_number, f"The layer index:{layer} is out of scope for " \ | |||||
f"a bert model with {encoder_layer_number} layers." | |||||
else: | |||||
assert layer<encoder_layer_number, f"The layer index:{layer} is out of scope for " \ | |||||
f"a bert model with {encoder_layer_number} layers." | |||||
assert pool_method in ('avg', 'max', 'first', 'last') | |||||
self.pool_method = pool_method | |||||
self.include_cls_sep = include_cls_sep | |||||
# 将所有vocab中word的wordpiece计算出来, 需要额外考虑[CLS]和[SEP] | |||||
print("Start to generating word pieces for word.") | |||||
# 第一步统计出需要的word_piece, 然后创建新的embed和word_piece_vocab, 然后填入值 | |||||
word_piece_dict = {'[CLS]':1, '[SEP]':1} # 用到的word_piece以及新增的 | |||||
found_count = 0 | |||||
for word, index in vocab: | |||||
if index == vocab.padding_idx: # pad是个特殊的符号 | |||||
word = '[PAD]' | |||||
elif index == vocab.unknown_idx: | |||||
word = '[UNK]' | |||||
word_pieces = self.tokenzier.wordpiece_tokenizer.tokenize(word) | |||||
if len(word_pieces)==1: | |||||
if not vocab._is_word_no_create_entry(word): # 如果是train中的值, 但是却没有找到 | |||||
if index!=vocab.unknown_idx and word_pieces[0]=='[UNK]': # 说明这个词不在原始的word里面 | |||||
word_piece_dict[word] = 1 # 新增一个值 | |||||
continue | |||||
for word_piece in word_pieces: | |||||
word_piece_dict[word_piece] = 1 | |||||
found_count += 1 | |||||
original_embed = self.encoder.embeddings.word_embeddings.weight.data | |||||
# 特殊词汇要特殊处理 | |||||
embed = nn.Embedding(len(word_piece_dict), original_embed.size(1)) # 新的embed | |||||
new_word_piece_vocab = collections.OrderedDict() | |||||
for index, token in enumerate(['[PAD]', '[UNK]']): | |||||
word_piece_dict.pop(token, None) | |||||
embed.weight.data[index] = original_embed[self.tokenzier.vocab[token]] | |||||
new_word_piece_vocab[token] = index | |||||
for token in word_piece_dict.keys(): | |||||
if token in self.tokenzier.vocab: | |||||
embed.weight.data[len(new_word_piece_vocab)] = original_embed[self.tokenzier.vocab[token]] | |||||
else: | |||||
embed.weight.data[len(new_word_piece_vocab)] = original_embed[self.tokenzier.vocab['[UNK]']] | |||||
new_word_piece_vocab[token] = len(new_word_piece_vocab) | |||||
self.tokenzier._reinit_on_new_vocab(new_word_piece_vocab) | |||||
self.encoder.embeddings.word_embeddings = embed | |||||
word_to_wordpieces = [] | |||||
word_pieces_lengths = [] | |||||
for word, index in vocab: | |||||
if index == vocab.padding_idx: # pad是个特殊的符号 | |||||
word = '[PAD]' | |||||
elif index == vocab.unknown_idx: | |||||
word = '[UNK]' | |||||
word_pieces = self.tokenzier.wordpiece_tokenizer.tokenize(word) | |||||
word_pieces = self.tokenzier.convert_tokens_to_ids(word_pieces) | |||||
word_to_wordpieces.append(word_pieces) | |||||
word_pieces_lengths.append(len(word_pieces)) | |||||
print("Found(Or seg into word pieces) {} words out of {}.".format(found_count, len(vocab))) | |||||
self._cls_index = self.tokenzier.vocab['[CLS]'] | |||||
self._sep_index = self.tokenzier.vocab['[SEP]'] | |||||
self._pad_index = vocab.padding_idx | |||||
self._wordpiece_pad_index = self.tokenzier.vocab['[PAD]'] # 需要用于生成word_piece | |||||
self.word_to_wordpieces = np.array(word_to_wordpieces) | |||||
self.word_pieces_lengths = nn.Parameter(torch.LongTensor(word_pieces_lengths), requires_grad=False) | |||||
print("Successfully generate word pieces.") | |||||
def forward(self, words): | |||||
""" | |||||
:param words: torch.LongTensor, batch_size x max_len | |||||
:return: num_layers x batch_size x max_len x hidden_size或者num_layers x batch_size x (max_len+2) x hidden_size | |||||
""" | |||||
batch_size, max_word_len = words.size() | |||||
seq_len = words.ne(self._pad_index).sum(dim=-1) | |||||
batch_word_pieces_length = self.word_pieces_lengths[words] # batch_size x max_len | |||||
word_pieces_lengths = batch_word_pieces_length.sum(dim=-1) | |||||
max_word_piece_length = word_pieces_lengths.max().item() | |||||
# +2是由于需要加入[CLS]与[SEP] | |||||
word_pieces = words.new_full((batch_size, max_word_piece_length+2), fill_value=self._wordpiece_pad_index) | |||||
word_pieces[:, 0].fill_(self._cls_index) | |||||
batch_indexes = torch.arange(batch_size).to(words) | |||||
word_pieces[batch_indexes, word_pieces_lengths+1] = self._sep_index | |||||
attn_masks = torch.zeros_like(word_pieces) | |||||
# 1. 获取words的word_pieces的id,以及对应的span范围 | |||||
word_indexes = words.tolist() | |||||
for i in range(batch_size): | |||||
word_pieces_i = list(chain(*self.word_to_wordpieces[word_indexes[i]])) | |||||
word_pieces[i, 1:len(word_pieces_i)+1] = torch.LongTensor(word_pieces_i) | |||||
attn_masks[i, :len(word_pieces_i)+2].fill_(1) | |||||
# TODO 截掉长度超过的部分。 | |||||
# 2. 获取hidden的结果,根据word_pieces进行对应的pool计算 | |||||
# all_outputs: [batch_size x max_len x hidden_size, batch_size x max_len x hidden_size, ...] | |||||
bert_outputs, _ = self.encoder(word_pieces, token_type_ids=None, attention_mask=attn_masks, | |||||
output_all_encoded_layers=True) | |||||
# output_layers = [self.layers] # len(self.layers) x batch_size x max_word_piece_length x hidden_size | |||||
if self.include_cls_sep: | |||||
outputs = bert_outputs[-1].new_zeros(len(self.layers), batch_size, max_word_len + 2, | |||||
bert_outputs[-1].size(-1)) | |||||
s_shift = 1 | |||||
else: | |||||
outputs = bert_outputs[-1].new_zeros(len(self.layers), batch_size, max_word_len, | |||||
bert_outputs[-1].size(-1)) | |||||
s_shift = 0 | |||||
batch_word_pieces_cum_length = batch_word_pieces_length.new_zeros(batch_size, max_word_len + 1) | |||||
batch_word_pieces_cum_length[:, 1:] = batch_word_pieces_length.cumsum(dim=-1) # batch_size x max_len | |||||
for l_index, l in enumerate(self.layers): | |||||
output_layer = bert_outputs[l] | |||||
# 从word_piece collapse到word的表示 | |||||
truncate_output_layer = output_layer[:, 1:-1] # 删除[CLS]与[SEP] batch_size x len x hidden_size | |||||
outputs_seq_len = seq_len + s_shift | |||||
if self.pool_method == 'first': | |||||
for i in range(batch_size): | |||||
i_word_pieces_cum_length = batch_word_pieces_cum_length[i, :seq_len[i]] # 每个word的start位置 | |||||
outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[i, i_word_pieces_cum_length] # num_layer x batch_size x len x hidden_size | |||||
elif self.pool_method == 'last': | |||||
for i in range(batch_size): | |||||
i_word_pieces_cum_length = batch_word_pieces_cum_length[i, 1:seq_len[i]+1] - 1 # 每个word的end | |||||
outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[i, i_word_pieces_cum_length] | |||||
elif self.pool_method == 'max': | |||||
for i in range(batch_size): | |||||
for j in range(seq_len[i]): | |||||
start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j+1] | |||||
outputs[l_index, i, j+s_shift], _ = torch.max(truncate_output_layer[i, start:end], dim=-2) | |||||
else: | |||||
for i in range(batch_size): | |||||
for j in range(seq_len[i]): | |||||
start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j+1] | |||||
outputs[l_index, i, j+s_shift] = torch.mean(truncate_output_layer[i, start:end], dim=-2) | |||||
if self.include_cls_sep: | |||||
outputs[l_index, :, 0] = output_layer[:, 0] | |||||
outputs[l_index, batch_indexes, seq_len+s_shift] = output_layer[batch_indexes, seq_len+s_shift] | |||||
# 3. 最终的embedding结果 | |||||
return outputs | |||||
@@ -0,0 +1,280 @@ | |||||
import torch | |||||
import torch.nn as nn | |||||
import torch.nn.functional as F | |||||
from typing import List | |||||
from ..modules.encoder.lstm import LSTM | |||||
from ..core.vocabulary import Vocabulary | |||||
from .embedding import TokenEmbedding | |||||
from .utils import _construct_char_vocab_from_vocab | |||||
class CNNCharEmbedding(TokenEmbedding): | |||||
""" | |||||
别名::class:`fastNLP.embeddings.CNNCharEmbedding` :class:`fastNLP.embeddings.char_embedding.CNNCharEmbedding` | |||||
使用CNN生成character embedding。CNN的结果为, embed(x) -> Dropout(x) -> CNN(x) -> activation(x) -> pool -> fc -> Dropout. | |||||
不同kernel大小的filter结果是concat起来的。
Example:: | |||||
>>> cnn_char_embed = CNNCharEmbedding(vocab) | |||||
:param vocab: 词表 | |||||
:param embed_size: 该word embedding的大小,默认值为50. | |||||
:param char_emb_size: character的embed的大小。character是从vocab中生成的。默认值为50. | |||||
:param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 | |||||
:param float dropout: 以多大的概率drop | |||||
:param filter_nums: filter的数量. 长度需要和kernels一致。默认值为[40, 30, 20]. | |||||
:param kernel_sizes: kernel的大小. 默认值为[5, 3, 1]. | |||||
:param pool_method: character的表示在合成一个表示时所使用的pool方法,支持'avg', 'max'. | |||||
:param activation: CNN之后使用的激活方法,支持'relu', 'sigmoid', 'tanh' 或者自定义函数. | |||||
:param min_char_freq: character的最少出现次数。默认值为2. | |||||
""" | |||||
def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, word_dropout:float=0, | |||||
dropout:float=0.5, filter_nums: List[int]=(40, 30, 20), kernel_sizes: List[int]=(5, 3, 1), | |||||
pool_method: str='max', activation='relu', min_char_freq: int=2): | |||||
super(CNNCharEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) | |||||
for kernel in kernel_sizes: | |||||
assert kernel % 2 == 1, "Only odd kernel is allowed." | |||||
assert pool_method in ('max', 'avg') | |||||
self.dropout = nn.Dropout(dropout) | |||||
self.pool_method = pool_method | |||||
# activation function | |||||
if isinstance(activation, str): | |||||
if activation.lower() == 'relu': | |||||
self.activation = F.relu | |||||
elif activation.lower() == 'sigmoid': | |||||
self.activation = F.sigmoid | |||||
elif activation.lower() == 'tanh': | |||||
self.activation = F.tanh | |||||
elif activation is None: | |||||
self.activation = lambda x: x | |||||
elif callable(activation): | |||||
self.activation = activation | |||||
else: | |||||
raise Exception( | |||||
"Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]") | |||||
print("Start constructing character vocabulary.") | |||||
# 建立char的词表 | |||||
self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq) | |||||
self.char_pad_index = self.char_vocab.padding_idx | |||||
print(f"In total, there are {len(self.char_vocab)} distinct characters.") | |||||
# 对vocab进行index | |||||
max_word_len = max(map(lambda x: len(x[0]), vocab)) | |||||
self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab), max_word_len), | |||||
fill_value=self.char_pad_index, dtype=torch.long), | |||||
requires_grad=False) | |||||
self.word_lengths = nn.Parameter(torch.zeros(len(vocab)).long(), requires_grad=False) | |||||
for word, index in vocab: | |||||
# if index!=vocab.padding_idx: # 如果是pad的话,直接就为pad_value了。修改为不区分pad, 这样所有的<pad>也是同一个embed | |||||
self.words_to_chars_embedding[index, :len(word)] = \ | |||||
torch.LongTensor([self.char_vocab.to_index(c) for c in word]) | |||||
self.word_lengths[index] = len(word) | |||||
self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size) | |||||
self.convs = nn.ModuleList([nn.Conv1d( | |||||
char_emb_size, filter_nums[i], kernel_size=kernel_sizes[i], bias=True, padding=kernel_sizes[i] // 2) | |||||
for i in range(len(kernel_sizes))]) | |||||
self._embed_size = embed_size | |||||
self.fc = nn.Linear(sum(filter_nums), embed_size) | |||||
self.init_param() | |||||
def forward(self, words): | |||||
""" | |||||
输入words的index后,生成对应的words的表示。 | |||||
:param words: [batch_size, max_len] | |||||
:return: [batch_size, max_len, embed_size] | |||||
""" | |||||
words = self.drop_word(words) | |||||
batch_size, max_len = words.size() | |||||
chars = self.words_to_chars_embedding[words] # batch_size x max_len x max_word_len | |||||
word_lengths = self.word_lengths[words] # batch_size x max_len | |||||
max_word_len = word_lengths.max() | |||||
chars = chars[:, :, :max_word_len] | |||||
# 为1的地方为mask | |||||
chars_masks = chars.eq(self.char_pad_index) # batch_size x max_len x max_word_len 如果为0, 说明是padding的位置了 | |||||
chars = self.char_embedding(chars) # batch_size x max_len x max_word_len x embed_size | |||||
chars = self.dropout(chars) | |||||
reshaped_chars = chars.reshape(batch_size*max_len, max_word_len, -1) | |||||
reshaped_chars = reshaped_chars.transpose(1, 2) # B' x E x M | |||||
conv_chars = [conv(reshaped_chars).transpose(1, 2).reshape(batch_size, max_len, max_word_len, -1) | |||||
for conv in self.convs] | |||||
conv_chars = torch.cat(conv_chars, dim=-1).contiguous() # B x max_len x max_word_len x sum(filters) | |||||
conv_chars = self.activation(conv_chars) | |||||
if self.pool_method == 'max': | |||||
conv_chars = conv_chars.masked_fill(chars_masks.unsqueeze(-1), float('-inf')) | |||||
chars, _ = torch.max(conv_chars, dim=-2) # batch_size x max_len x sum(filters) | |||||
else: | |||||
conv_chars = conv_chars.masked_fill(chars_masks.unsqueeze(-1), 0) | |||||
chars = torch.sum(conv_chars, dim=-2)/chars_masks.eq(0).sum(dim=-1, keepdim=True).float() | |||||
chars = self.fc(chars) | |||||
return self.dropout(chars) | |||||
@property | |||||
def requires_grad(self): | |||||
""" | |||||
Embedding的参数是否允许优化。True: 所有参数运行优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许 | |||||
:return: | |||||
""" | |||||
params = [] | |||||
for name, param in self.named_parameters(): | |||||
if 'words_to_chars_embedding' not in name and 'word_lengths' not in name: | |||||
params.append(param.requires_grad) | |||||
requires_grads = set(params) | |||||
if len(requires_grads) == 1: | |||||
return requires_grads.pop() | |||||
else: | |||||
return None | |||||
@requires_grad.setter | |||||
def requires_grad(self, value): | |||||
for name, param in self.named_parameters(): | |||||
if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能加入到requires_grad中 | |||||
continue | |||||
param.requires_grad = value | |||||
def init_param(self): | |||||
for name, param in self.named_parameters(): | |||||
if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能reset | |||||
continue | |||||
if param.data.dim()>1: | |||||
nn.init.xavier_uniform_(param, 1) | |||||
else: | |||||
nn.init.uniform_(param, -1, 1) | |||||
class LSTMCharEmbedding(TokenEmbedding): | |||||
""" | |||||
别名::class:`fastNLP.embeddings.LSTMCharEmbedding` :class:`fastNLP.embeddings.char_embedding.LSTMCharEmbedding` | |||||
使用LSTM的方式对character进行encode. embed(x) -> Dropout(x) -> LSTM(x) -> activation(x) -> pool | |||||
Example:: | |||||
>>> lstm_char_embed = LSTMCharEmbedding(vocab) | |||||
:param vocab: 词表 | |||||
:param embed_size: embedding的大小。默认值为50. | |||||
:param char_emb_size: character的embedding的大小。默认值为50. | |||||
:param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 | |||||
:param dropout: 以多大概率drop | |||||
:param hidden_size: LSTM的中间hidden的大小,如果为bidirectional的,hidden会除二,默认为50. | |||||
:param pool_method: 支持'max', 'avg' | |||||
:param activation: 激活函数,支持'relu', 'sigmoid', 'tanh', 或者自定义函数. | |||||
:param min_char_freq: character的最小出现次数。默认值为2. | |||||
:param bidirectional: 是否使用双向的LSTM进行encode。默认值为True。 | |||||
""" | |||||
def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, word_dropout:float=0, | |||||
dropout:float=0.5, hidden_size=50,pool_method: str='max', activation='relu', min_char_freq: int=2, | |||||
bidirectional=True): | |||||
super(LSTMCharEmbedding, self).__init__(vocab) | |||||
assert hidden_size % 2 == 0, "Only an even hidden_size is allowed."
assert pool_method in ('max', 'avg') | |||||
self.pool_method = pool_method | |||||
self.dropout = nn.Dropout(dropout) | |||||
# activation function | |||||
if isinstance(activation, str): | |||||
if activation.lower() == 'relu': | |||||
self.activation = F.relu | |||||
elif activation.lower() == 'sigmoid': | |||||
self.activation = F.sigmoid | |||||
elif activation.lower() == 'tanh': | |||||
self.activation = F.tanh | |||||
elif activation is None: | |||||
self.activation = lambda x: x | |||||
elif callable(activation): | |||||
self.activation = activation | |||||
else: | |||||
raise Exception( | |||||
"Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]") | |||||
print("Start constructing character vocabulary.") | |||||
# 建立char的词表 | |||||
self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq) | |||||
self.char_pad_index = self.char_vocab.padding_idx | |||||
print(f"In total, there are {len(self.char_vocab)} distinct characters.") | |||||
# 对vocab进行index | |||||
self.max_word_len = max(map(lambda x: len(x[0]), vocab)) | |||||
self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab), self.max_word_len), | |||||
fill_value=self.char_pad_index, dtype=torch.long), | |||||
requires_grad=False) | |||||
self.word_lengths = nn.Parameter(torch.zeros(len(vocab)).long(), requires_grad=False) | |||||
for word, index in vocab: | |||||
# if index!=vocab.padding_idx: # 如果是pad的话,直接就为pad_value了. 修改为不区分pad与否 | |||||
self.words_to_chars_embedding[index, :len(word)] = \ | |||||
torch.LongTensor([self.char_vocab.to_index(c) for c in word]) | |||||
self.word_lengths[index] = len(word) | |||||
self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size) | |||||
self.fc = nn.Linear(hidden_size, embed_size) | |||||
hidden_size = hidden_size // 2 if bidirectional else hidden_size | |||||
self.lstm = LSTM(char_emb_size, hidden_size, bidirectional=bidirectional, batch_first=True) | |||||
self._embed_size = embed_size | |||||
self.bidirectional = bidirectional | |||||
def forward(self, words): | |||||
""" | |||||
输入words的index后,生成对应的words的表示。 | |||||
:param words: [batch_size, max_len] | |||||
:return: [batch_size, max_len, embed_size] | |||||
""" | |||||
words = self.drop_word(words) | |||||
batch_size, max_len = words.size() | |||||
chars = self.words_to_chars_embedding[words] # batch_size x max_len x max_word_len | |||||
word_lengths = self.word_lengths[words] # batch_size x max_len | |||||
max_word_len = word_lengths.max() | |||||
chars = chars[:, :, :max_word_len] | |||||
# 为mask的地方为1 | |||||
chars_masks = chars.eq(self.char_pad_index) # batch_size x max_len x max_word_len 如果为0, 说明是padding的位置了 | |||||
chars = self.char_embedding(chars) # batch_size x max_len x max_word_len x embed_size | |||||
chars = self.dropout(chars) | |||||
reshaped_chars = chars.reshape(batch_size * max_len, max_word_len, -1) | |||||
char_seq_len = chars_masks.eq(0).sum(dim=-1).reshape(batch_size * max_len) | |||||
lstm_chars = self.lstm(reshaped_chars, char_seq_len)[0].reshape(batch_size, max_len, max_word_len, -1) | |||||
# B x M x M x H | |||||
lstm_chars = self.activation(lstm_chars) | |||||
if self.pool_method == 'max': | |||||
lstm_chars = lstm_chars.masked_fill(chars_masks.unsqueeze(-1), float('-inf')) | |||||
chars, _ = torch.max(lstm_chars, dim=-2) # batch_size x max_len x H | |||||
else: | |||||
lstm_chars = lstm_chars.masked_fill(chars_masks.unsqueeze(-1), 0) | |||||
chars = torch.sum(lstm_chars, dim=-2) / chars_masks.eq(0).sum(dim=-1, keepdim=True).float() | |||||
chars = self.fc(chars) | |||||
return self.dropout(chars) | |||||
@property | |||||
def requires_grad(self): | |||||
""" | |||||
Embedding的参数是否允许优化。True: 所有参数运行优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许 | |||||
:return: | |||||
""" | |||||
params = [] | |||||
for name, param in self.named_parameters(): | |||||
if 'words_to_chars_embedding' not in name and 'word_lengths' not in name: | |||||
params.append(param.requires_grad)
requires_grads = set(params) | |||||
if len(requires_grads) == 1: | |||||
return requires_grads.pop() | |||||
else: | |||||
return None | |||||
@requires_grad.setter | |||||
def requires_grad(self, value): | |||||
for name, param in self.named_parameters(): | |||||
if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能加入到requires_grad中 | |||||
continue | |||||
param.requires_grad = value |
@@ -0,0 +1,100 @@ | |||||
from abc import abstractmethod | |||||
import torch | |||||
from ..core.vocabulary import Vocabulary | |||||
from ..core.dataset import DataSet | |||||
from ..core.batch import DataSetIter | |||||
from ..core.sampler import SequentialSampler | |||||
from ..core.utils import _move_model_to_device, _get_model_device | |||||
from .embedding import TokenEmbedding | |||||
class ContextualEmbedding(TokenEmbedding): | |||||
def __init__(self, vocab: Vocabulary, word_dropout:float=0.0, dropout:float=0.0): | |||||
super(ContextualEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) | |||||
def add_sentence_cache(self, *datasets, batch_size=32, device='cpu', delete_weights: bool=True): | |||||
""" | |||||
由于动态embedding生成比较耗时,所以可以把每句话embedding缓存下来,这样就不需要每次都运行生成过程。 | |||||
:param datasets: DataSet对象 | |||||
:param batch_size: int, 生成cache的sentence表示时使用的batch的大小 | |||||
:param device: 参考 :class:`fastNLP.Trainer` 的device
:param delete_weights: 是否在生成了cache之后删除权重。在不需要finetune动态模型的情况下,删除权重会大量减少内存占用。
:return: | |||||
""" | |||||
for index, dataset in enumerate(datasets): | |||||
try: | |||||
assert isinstance(dataset, DataSet), "Only fastNLP.DataSet object is allowed." | |||||
assert 'words' in dataset.get_input_name(), "`words` field has to be set as input." | |||||
except Exception as e: | |||||
print(f"Exception happens at {index} dataset.") | |||||
raise e | |||||
sent_embeds = {} | |||||
_move_model_to_device(self, device=device) | |||||
device = _get_model_device(self) | |||||
pad_index = self._word_vocab.padding_idx | |||||
print("Start to calculate sentence representations.") | |||||
with torch.no_grad(): | |||||
for index, dataset in enumerate(datasets): | |||||
try: | |||||
batch = DataSetIter(dataset, batch_size=batch_size, sampler=SequentialSampler()) | |||||
for batch_x, batch_y in batch: | |||||
words = batch_x['words'].to(device) | |||||
words_list = words.tolist() | |||||
seq_len = words.ne(pad_index).sum(dim=-1) | |||||
max_len = words.size(1) | |||||
# 因为有些情况可能包含CLS, SEP, 从后面往前计算比较安全。 | |||||
seq_len_from_behind = (max_len - seq_len).tolist() | |||||
word_embeds = self(words).detach().cpu().numpy() | |||||
for b in range(words.size(0)): | |||||
length = seq_len_from_behind[b] | |||||
if length==0: | |||||
sent_embeds[tuple(words_list[b][:seq_len[b]])] = word_embeds[b] | |||||
else: | |||||
sent_embeds[tuple(words_list[b][:seq_len[b]])] = word_embeds[b, :-length] | |||||
except Exception as e: | |||||
print(f"Exception happens at {index} dataset.") | |||||
raise e | |||||
print("Finish calculating sentence representations.") | |||||
self.sent_embeds = sent_embeds | |||||
if delete_weights: | |||||
self._delete_model_weights() | |||||
def _get_sent_reprs(self, words): | |||||
""" | |||||
获取sentence的表示,如果有缓存,则返回缓存的值; 没有缓存则返回None | |||||
:param words: torch.LongTensor | |||||
:return: | |||||
""" | |||||
if hasattr(self, 'sent_embeds'): | |||||
words_list = words.tolist() | |||||
seq_len = words.ne(self._word_pad_index).sum(dim=-1) | |||||
_embeds = [] | |||||
for b in range(len(words)): | |||||
words_i = tuple(words_list[b][:seq_len[b]]) | |||||
embed = self.sent_embeds[words_i] | |||||
_embeds.append(embed) | |||||
max_sent_len = max(map(len, _embeds)) | |||||
embeds = words.new_zeros(len(_embeds), max_sent_len, self.embed_size, dtype=torch.float, | |||||
device=words.device) | |||||
for i, embed in enumerate(_embeds): | |||||
embeds[i, :len(embed)] = torch.FloatTensor(embed).to(words.device) | |||||
return embeds | |||||
return None | |||||
@abstractmethod | |||||
def _delete_model_weights(self): | |||||
"""删除计算表示的模型以节省资源""" | |||||
raise NotImplementedError | |||||
def remove_sentence_cache(self): | |||||
""" | |||||
删除缓存的句子表示. 删除之后如果模型权重没有被删除,将开始使用动态计算权重。 | |||||
:return: | |||||
""" | |||||
del self.sent_embeds |
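# Illustrative usage sketch, not part of the diff above: caching sentence representations with a concrete
# ContextualEmbedding subclass. `vocab`, `train_data`, `test_data` and `words` are assumptions (an existing
# Vocabulary, two DataSets whose 'words' field is set as input, and a LongTensor of word indices); the
# model name 'en' triggers a download, and the import path follows this PR's new fastNLP.embeddings package.
from fastNLP.embeddings import ElmoEmbedding

embed = ElmoEmbedding(vocab, model_dir_or_name='en', layers='2')
embed.add_sentence_cache(train_data, test_data, batch_size=32, device='cpu', delete_weights=True)
reprs = embed(words)   # looked up from the cache: batch_size x max_len x embed_size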
@@ -0,0 +1,326 @@ | |||||
import os | |||||
import torch | |||||
import torch.nn as nn | |||||
import torch.nn.functional as F | |||||
import json | |||||
import codecs | |||||
from ..core.vocabulary import Vocabulary | |||||
from ..io.file_utils import cached_path, _get_base_url, PRETRAINED_ELMO_MODEL_DIR | |||||
from ..modules.encoder._elmo import ElmobiLm, ConvTokenEmbedder | |||||
from .contextual_embedding import ContextualEmbedding | |||||
class ElmoEmbedding(ContextualEmbedding): | |||||
""" | |||||
    别名::class:`fastNLP.embeddings.ElmoEmbedding`   :class:`fastNLP.embeddings.elmo_embedding.ElmoEmbedding` | |||||
使用ELMo的embedding。初始化之后,只需要传入words就可以得到对应的embedding。 | |||||
我们提供的ELMo预训练模型来自 https://github.com/HIT-SCIR/ELMoForManyLangs | |||||
Example:: | |||||
>>> embedding = ElmoEmbedding(vocab, model_dir_or_name='en', layers='2', requires_grad=True) | |||||
:param vocab: 词表 | |||||
:param model_dir_or_name: 可以有两种方式调用预训练好的ELMo embedding:第一种是传入ELMo权重的文件名,第二种是传入ELMo版本的名称, | |||||
目前支持的ELMo包括{`en` : 英文版本的ELMo, `cn` : 中文版本的ELMo,}。第二种情况将自动查看缓存中是否存在该模型,没有的话将自动下载 | |||||
:param layers: str, 指定返回的层数, 以,隔开不同的层。如果要返回第二层的结果'2', 返回后两层的结果'1,2'。不同的层的结果 | |||||
按照这个顺序concat起来。默认为'2'。'mix'会使用可学习的权重结合不同层的表示(权重是否可训练与requires_grad保持一致, | |||||
初始化权重对三层结果进行mean-pooling, 可以通过ElmoEmbedding.set_mix_weights_requires_grad()方法只将mix weights设置为可学习。) | |||||
:param requires_grad: bool, 该层是否需要gradient, 默认为False. | |||||
:param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 | |||||
:param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 | |||||
:param cache_word_reprs: 可以选择对word的表示进行cache; 设置为True的话,将在初始化的时候为每个word生成对应的embedding, | |||||
并删除character encoder,之后将直接使用cache的embedding。默认为False。 | |||||
""" | |||||
def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en', layers: str = '2', requires_grad: bool = False, | |||||
word_dropout=0.0, dropout=0.0, cache_word_reprs: bool = False): | |||||
super(ElmoEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) | |||||
# 根据model_dir_or_name检查是否存在并下载 | |||||
if model_dir_or_name.lower() in PRETRAINED_ELMO_MODEL_DIR: | |||||
PRETRAIN_URL = _get_base_url('elmo') | |||||
model_name = PRETRAINED_ELMO_MODEL_DIR[model_dir_or_name] | |||||
model_url = PRETRAIN_URL + model_name | |||||
model_dir = cached_path(model_url) | |||||
# 检查是否存在 | |||||
elif os.path.isdir(os.path.expanduser(os.path.abspath(model_dir_or_name))): | |||||
model_dir = model_dir_or_name | |||||
else: | |||||
raise ValueError(f"Cannot recognize {model_dir_or_name}.") | |||||
self.model = _ElmoModel(model_dir, vocab, cache_word_reprs=cache_word_reprs) | |||||
if layers == 'mix': | |||||
self.layer_weights = nn.Parameter(torch.zeros(self.model.config['lstm']['n_layers'] + 1), | |||||
requires_grad=requires_grad) | |||||
self.gamma = nn.Parameter(torch.ones(1), requires_grad=requires_grad) | |||||
self._get_outputs = self._get_mixed_outputs | |||||
self._embed_size = self.model.config['lstm']['projection_dim'] * 2 | |||||
else: | |||||
layers = list(map(int, layers.split(','))) | |||||
assert len(layers) > 0, "Must choose one output" | |||||
for layer in layers: | |||||
assert 0 <= layer <= 2, "Layer index should be in range [0, 2]." | |||||
self.layers = layers | |||||
self._get_outputs = self._get_layer_outputs | |||||
self._embed_size = len(self.layers) * self.model.config['lstm']['projection_dim'] * 2 | |||||
self.requires_grad = requires_grad | |||||
def _get_mixed_outputs(self, outputs): | |||||
# outputs: num_layers x batch_size x max_len x hidden_size | |||||
# return: batch_size x max_len x hidden_size | |||||
weights = F.softmax(self.layer_weights + 1 / len(outputs), dim=0).to(outputs) | |||||
outputs = torch.einsum('l,lbij->bij', weights, outputs) | |||||
return self.gamma.to(outputs) * outputs | |||||
def set_mix_weights_requires_grad(self, flag=True): | |||||
""" | |||||
当初始化ElmoEmbedding时layers被设置为mix时,可以通过调用该方法设置mix weights是否可训练。如果layers不是mix,调用 | |||||
该方法没有用。 | |||||
:param bool flag: 混合不同层表示的结果是否可以训练。 | |||||
:return: | |||||
""" | |||||
if hasattr(self, 'layer_weights'): | |||||
self.layer_weights.requires_grad = flag | |||||
self.gamma.requires_grad = flag | |||||
def _get_layer_outputs(self, outputs): | |||||
if len(self.layers) == 1: | |||||
outputs = outputs[self.layers[0]] | |||||
else: | |||||
outputs = torch.cat(tuple([*outputs[self.layers]]), dim=-1) | |||||
return outputs | |||||
def forward(self, words: torch.LongTensor): | |||||
""" | |||||
        Compute the ELMo representation of ``words``. As described in the ELMo paper there are actually 2L+1 layers of output, but to make | |||||
        the result easy to split, the token embedding is duplicated, so that the layer-0 result is [token_embedding; token_embedding] while | |||||
        the layer-1 result is [forward_hiddens; backward_hiddens]. | |||||
        :param words: batch_size x max_len | |||||
        :return: torch.FloatTensor. batch_size x max_len x (512*len(self.layers)) | |||||
""" | |||||
words = self.drop_word(words) | |||||
outputs = self._get_sent_reprs(words) | |||||
if outputs is not None: | |||||
return self.dropout(outputs) | |||||
outputs = self.model(words) | |||||
outputs = self._get_outputs(outputs) | |||||
return self.dropout(outputs) | |||||
def _delete_model_weights(self): | |||||
for name in ['layers', 'model', 'layer_weights', 'gamma']: | |||||
if hasattr(self, name): | |||||
delattr(self, name) | |||||
@property | |||||
def requires_grad(self): | |||||
""" | |||||
        Whether the parameters of this Embedding are trainable. True: all parameters are trainable; False: none are trainable; None: some are trainable and some are not. | |||||
:return: | |||||
""" | |||||
requires_grads = set([param.requires_grad for name, param in self.named_parameters() | |||||
if 'words_to_chars_embedding' not in name and 'words_to_words' not in name]) | |||||
if len(requires_grads) == 1: | |||||
return requires_grads.pop() | |||||
else: | |||||
return None | |||||
@requires_grad.setter | |||||
def requires_grad(self, value): | |||||
for name, param in self.named_parameters(): | |||||
if 'words_to_chars_embedding' in name or 'words_to_words' in name: # 这个不能加入到requires_grad中 | |||||
continue | |||||
param.requires_grad = value | |||||
class _ElmoModel(nn.Module): | |||||
""" | |||||
该Module是ElmoEmbedding中进行所有的heavy lifting的地方。做的工作,包括 | |||||
(1) 根据配置,加载模型; | |||||
(2) 根据vocab,对模型中的embedding进行调整. 并将其正确初始化 | |||||
(3) 保存一个words与chars的对应转换,获取时自动进行相应的转换 | |||||
(4) 设计一个保存token的embedding,允许缓存word的表示。 | |||||
""" | |||||
def __init__(self, model_dir: str, vocab: Vocabulary = None, cache_word_reprs: bool = False): | |||||
super(_ElmoModel, self).__init__() | |||||
self.model_dir = model_dir | |||||
dir = os.walk(self.model_dir) | |||||
config_file = None | |||||
weight_file = None | |||||
config_count = 0 | |||||
weight_count = 0 | |||||
for path, dir_list, file_list in dir: | |||||
for file_name in file_list: | |||||
if file_name.__contains__(".json"): | |||||
config_file = file_name | |||||
config_count += 1 | |||||
elif file_name.__contains__(".pkl"): | |||||
weight_file = file_name | |||||
weight_count += 1 | |||||
if config_count > 1 or weight_count > 1: | |||||
raise Exception(f"Multiple config files(*.json) or weight files(*.hdf5) detected in {model_dir}.") | |||||
elif config_count == 0 or weight_count == 0: | |||||
raise Exception(f"No config file or weight file found in {model_dir}") | |||||
config = json.load(open(os.path.join(model_dir, config_file), 'r')) | |||||
self.weight_file = os.path.join(model_dir, weight_file) | |||||
self.config = config | |||||
OOV_TAG = '<oov>' | |||||
PAD_TAG = '<pad>' | |||||
BOS_TAG = '<bos>' | |||||
EOS_TAG = '<eos>' | |||||
BOW_TAG = '<bow>' | |||||
EOW_TAG = '<eow>' | |||||
# For the model trained with character-based word encoder. | |||||
char_lexicon = {} | |||||
with codecs.open(os.path.join(model_dir, 'char.dic'), 'r', encoding='utf-8') as fpi: | |||||
for line in fpi: | |||||
tokens = line.strip().split('\t') | |||||
if len(tokens) == 1: | |||||
tokens.insert(0, '\u3000') | |||||
token, i = tokens | |||||
char_lexicon[token] = int(i) | |||||
# 做一些sanity check | |||||
for special_word in [PAD_TAG, OOV_TAG, BOW_TAG, EOW_TAG]: | |||||
assert special_word in char_lexicon, f"{special_word} not found in char.dic." | |||||
# 从vocab中构建char_vocab | |||||
char_vocab = Vocabulary(unknown=OOV_TAG, padding=PAD_TAG) | |||||
# 需要保证<bow>与<eow>在里面 | |||||
char_vocab.add_word_lst([BOW_TAG, EOW_TAG, BOS_TAG, EOS_TAG]) | |||||
for word, index in vocab: | |||||
char_vocab.add_word_lst(list(word)) | |||||
self.bos_index, self.eos_index, self._pad_index = len(vocab), len(vocab) + 1, vocab.padding_idx | |||||
# 根据char_lexicon调整, 多设置一位,是预留给word padding的(该位置的char表示为全0表示) | |||||
char_emb_layer = nn.Embedding(len(char_vocab) + 1, int(config['char_cnn']['embedding']['dim']), | |||||
padding_idx=len(char_vocab)) | |||||
# 读入预训练权重 这里的elmo_model 包含char_cnn和 lstm 的 state_dict | |||||
elmo_model = torch.load(os.path.join(self.model_dir, weight_file), map_location='cpu') | |||||
char_embed_weights = elmo_model["char_cnn"]['char_emb_layer.weight'] | |||||
found_char_count = 0 | |||||
for char, index in char_vocab: # 调整character embedding | |||||
if char in char_lexicon: | |||||
index_in_pre = char_lexicon.get(char) | |||||
found_char_count += 1 | |||||
else: | |||||
index_in_pre = char_lexicon[OOV_TAG] | |||||
char_emb_layer.weight.data[index] = char_embed_weights[index_in_pre] | |||||
print(f"{found_char_count} out of {len(char_vocab)} characters were found in pretrained elmo embedding.") | |||||
# 生成words到chars的映射 | |||||
max_chars = config['char_cnn']['max_characters_per_token'] | |||||
self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab) + 2, max_chars), | |||||
fill_value=len(char_vocab), | |||||
dtype=torch.long), | |||||
requires_grad=False) | |||||
for word, index in list(iter(vocab)) + [(BOS_TAG, len(vocab)), (EOS_TAG, len(vocab) + 1)]: | |||||
if len(word) + 2 > max_chars: | |||||
word = word[:max_chars - 2] | |||||
if index == self._pad_index: | |||||
continue | |||||
elif word == BOS_TAG or word == EOS_TAG: | |||||
char_ids = [char_vocab.to_index(BOW_TAG)] + [char_vocab.to_index(word)] + [ | |||||
char_vocab.to_index(EOW_TAG)] | |||||
char_ids += [char_vocab.to_index(PAD_TAG)] * (max_chars - len(char_ids)) | |||||
else: | |||||
char_ids = [char_vocab.to_index(BOW_TAG)] + [char_vocab.to_index(c) for c in word] + [ | |||||
char_vocab.to_index(EOW_TAG)] | |||||
char_ids += [char_vocab.to_index(PAD_TAG)] * (max_chars - len(char_ids)) | |||||
self.words_to_chars_embedding[index] = torch.LongTensor(char_ids) | |||||
self.char_vocab = char_vocab | |||||
self.token_embedder = ConvTokenEmbedder( | |||||
config, self.weight_file, None, char_emb_layer) | |||||
elmo_model["char_cnn"]['char_emb_layer.weight'] = char_emb_layer.weight | |||||
self.token_embedder.load_state_dict(elmo_model["char_cnn"]) | |||||
self.output_dim = config['lstm']['projection_dim'] | |||||
# lstm encoder | |||||
self.encoder = ElmobiLm(config) | |||||
self.encoder.load_state_dict(elmo_model["lstm"]) | |||||
if cache_word_reprs: | |||||
if config['char_cnn']['embedding']['dim'] > 0: # 只有在使用了chars的情况下有用 | |||||
print("Start to generate cache word representations.") | |||||
batch_size = 320 | |||||
# bos eos | |||||
word_size = self.words_to_chars_embedding.size(0) | |||||
num_batches = word_size // batch_size + \ | |||||
int(word_size % batch_size != 0) | |||||
self.cached_word_embedding = nn.Embedding(word_size, | |||||
config['lstm']['projection_dim']) | |||||
with torch.no_grad(): | |||||
for i in range(num_batches): | |||||
words = torch.arange(i * batch_size, | |||||
min((i + 1) * batch_size, word_size)).long() | |||||
chars = self.words_to_chars_embedding[words].unsqueeze(1) # batch_size x 1 x max_chars | |||||
word_reprs = self.token_embedder(words.unsqueeze(1), | |||||
chars).detach() # batch_size x 1 x config['encoder']['projection_dim'] | |||||
self.cached_word_embedding.weight.data[words] = word_reprs.squeeze(1) | |||||
print("Finish generating cached word representations. Going to delete the character encoder.") | |||||
del self.token_embedder, self.words_to_chars_embedding | |||||
else: | |||||
print("There is no need to cache word representations, since no character information is used.") | |||||
def forward(self, words): | |||||
""" | |||||
:param words: batch_size x max_len | |||||
:return: num_layers x batch_size x max_len x hidden_size | |||||
""" | |||||
# 扩展<bos>, <eos> | |||||
batch_size, max_len = words.size() | |||||
expanded_words = words.new_zeros(batch_size, max_len + 2) # 因为pad一定为0, | |||||
seq_len = words.ne(self._pad_index).sum(dim=-1) | |||||
expanded_words[:, 1:-1] = words | |||||
expanded_words[:, 0].fill_(self.bos_index) | |||||
expanded_words[torch.arange(batch_size).to(words), seq_len + 1] = self.eos_index | |||||
seq_len = seq_len + 2 | |||||
zero_tensor = expanded_words.new_zeros(expanded_words.shape) | |||||
mask = (expanded_words == zero_tensor).unsqueeze(-1) | |||||
if hasattr(self, 'cached_word_embedding'): | |||||
token_embedding = self.cached_word_embedding(expanded_words) | |||||
else: | |||||
if hasattr(self, 'words_to_chars_embedding'): | |||||
chars = self.words_to_chars_embedding[expanded_words] | |||||
else: | |||||
chars = None | |||||
token_embedding = self.token_embedder(expanded_words, chars) # batch_size x max_len x embed_dim | |||||
encoder_output = self.encoder(token_embedding, seq_len) | |||||
if encoder_output.size(2) < max_len + 2: | |||||
num_layers, _, output_len, hidden_size = encoder_output.size() | |||||
dummy_tensor = encoder_output.new_zeros(num_layers, batch_size, | |||||
max_len + 2 - output_len, hidden_size) | |||||
encoder_output = torch.cat((encoder_output, dummy_tensor), 2) | |||||
sz = encoder_output.size() # 2, batch_size, max_len, hidden_size | |||||
token_embedding = token_embedding.masked_fill(mask, 0) | |||||
token_embedding = torch.cat((token_embedding, token_embedding), dim=2).view(1, sz[1], sz[2], sz[3]) | |||||
encoder_output = torch.cat((token_embedding, encoder_output), dim=0) | |||||
# 删除<eos>, <bos>. 这里没有精确地删除,但应该也不会影响最后的结果了。 | |||||
encoder_output = encoder_output[:, :, 1:-1] | |||||
return encoder_output |
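# Illustrative usage sketch, not part of the diff above: ElmoEmbedding in 'mix' mode, where the three
# ELMo layers are combined with learned scalar weights. `vocab` and `words` are assumptions, and the
# import path follows this PR's new fastNLP.embeddings package; 'en' triggers a model download.
from fastNLP.embeddings import ElmoEmbedding

embed = ElmoEmbedding(vocab, model_dir_or_name='en', layers='mix', requires_grad=False)
embed.set_mix_weights_requires_grad(True)   # train only the scalar mixing weights and gamma
out = embed(words)                          # batch_size x max_len x (2 * projection_dim)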
@@ -0,0 +1,180 @@ | |||||
import torch.nn as nn | |||||
from abc import abstractmethod | |||||
import torch | |||||
from .utils import get_embeddings | |||||
class Embedding(nn.Module): | |||||
""" | |||||
别名::class:`fastNLP.embeddings.Embedding` :class:`fastNLP.embeddings.embedding.Embedding` | |||||
    Embedding组件. 可以通过self.num_embedding获取词表大小; self.embedding_dim获取embedding的维度""" | |||||
def __init__(self, init_embed, word_dropout=0, dropout=0.0, unk_index=None): | |||||
""" | |||||
        :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray init_embed: size of the Embedding (pass a tuple(int, int) | |||||
            whose first int is the vocab_size and whose second int is the embed_dim); if a Tensor, nn.Embedding or ndarray is passed, that value is used directly to initialise the Embedding; | |||||
        :param float word_dropout: probability of randomly replacing a word with unk_index, so that the unk token gets enough training and | |||||
            the network is regularised to some extent. | |||||
        :param float dropout: dropout applied to the output of the Embedding. | |||||
        :param int unk_index: index that dropped words are replaced with. The unk_index of fastNLP's Vocabulary defaults to 1. | |||||
""" | |||||
super(Embedding, self).__init__() | |||||
self.embed = get_embeddings(init_embed) | |||||
self.dropout = nn.Dropout(dropout) | |||||
if not isinstance(self.embed, TokenEmbedding): | |||||
self._embed_size = self.embed.weight.size(1) | |||||
if word_dropout>0 and not isinstance(unk_index, int): | |||||
raise ValueError("When drop word is set, you need to pass in the unk_index.") | |||||
else: | |||||
self._embed_size = self.embed.embed_size | |||||
unk_index = self.embed.get_word_vocab().unknown_idx | |||||
self.unk_index = unk_index | |||||
self.word_dropout = word_dropout | |||||
def forward(self, x): | |||||
""" | |||||
:param torch.LongTensor x: [batch, seq_len] | |||||
:return: torch.Tensor : [batch, seq_len, embed_dim] | |||||
""" | |||||
if self.word_dropout>0 and self.training: | |||||
mask = torch.ones_like(x).float() * self.word_dropout | |||||
mask = torch.bernoulli(mask).byte() # dropout_word越大,越多位置为1 | |||||
x = x.masked_fill(mask, self.unk_index) | |||||
x = self.embed(x) | |||||
return self.dropout(x) | |||||
@property | |||||
def num_embedding(self)->int: | |||||
if isinstance(self.embed, nn.Embedding): | |||||
return self.embed.weight.size(0) | |||||
else: | |||||
return self.embed.num_embedding | |||||
def __len__(self): | |||||
return len(self.embed) | |||||
@property | |||||
def embed_size(self) -> int: | |||||
return self._embed_size | |||||
@property | |||||
def embedding_dim(self) -> int: | |||||
return self._embed_size | |||||
@property | |||||
def requires_grad(self): | |||||
""" | |||||
        Whether the parameters of this Embedding are trainable. True: all parameters are trainable; False: none are trainable; None: some are trainable and some are not. | |||||
:return: | |||||
""" | |||||
if not isinstance(self.embed, TokenEmbedding): | |||||
return self.embed.weight.requires_grad | |||||
else: | |||||
return self.embed.requires_grad | |||||
@requires_grad.setter | |||||
def requires_grad(self, value): | |||||
if not isinstance(self.embed, TokenEmbedding): | |||||
self.embed.weight.requires_grad = value | |||||
else: | |||||
self.embed.requires_grad = value | |||||
@property | |||||
def size(self): | |||||
if isinstance(self.embed, TokenEmbedding): | |||||
return self.embed.size | |||||
else: | |||||
return self.embed.weight.size() | |||||
class TokenEmbedding(nn.Module): | |||||
def __init__(self, vocab, word_dropout=0.0, dropout=0.0): | |||||
super(TokenEmbedding, self).__init__() | |||||
assert vocab.padding is not None, "Vocabulary must have a padding entry." | |||||
self._word_vocab = vocab | |||||
self._word_pad_index = vocab.padding_idx | |||||
if word_dropout>0: | |||||
assert vocab.unknown is not None, "Vocabulary must have unknown entry when you want to drop a word." | |||||
self.word_dropout = word_dropout | |||||
self._word_unk_index = vocab.unknown_idx | |||||
self.dropout_layer = nn.Dropout(dropout) | |||||
def drop_word(self, words): | |||||
""" | |||||
按照设定随机将words设置为unknown_index。 | |||||
:param torch.LongTensor words: batch_size x max_len | |||||
:return: | |||||
""" | |||||
if self.word_dropout > 0 and self.training: | |||||
mask = torch.ones_like(words).float() * self.word_dropout | |||||
mask = torch.bernoulli(mask).byte() # dropout_word越大,越多位置为1 | |||||
words = words.masked_fill(mask, self._word_unk_index) | |||||
return words | |||||
def dropout(self, words): | |||||
""" | |||||
对embedding后的word表示进行drop。 | |||||
:param torch.FloatTensor words: batch_size x max_len x embed_size | |||||
:return: | |||||
""" | |||||
return self.dropout_layer(words) | |||||
@property | |||||
def requires_grad(self): | |||||
""" | |||||
        Whether the parameters of this Embedding are trainable. True: all parameters are trainable; False: none are trainable; None: some are trainable and some are not. | |||||
:return: | |||||
""" | |||||
requires_grads = set([param.requires_grad for param in self.parameters()]) | |||||
if len(requires_grads) == 1: | |||||
return requires_grads.pop() | |||||
else: | |||||
return None | |||||
@requires_grad.setter | |||||
def requires_grad(self, value): | |||||
for param in self.parameters(): | |||||
param.requires_grad = value | |||||
def __len__(self): | |||||
return len(self._word_vocab) | |||||
@property | |||||
def embed_size(self) -> int: | |||||
return self._embed_size | |||||
@property | |||||
def embedding_dim(self) -> int: | |||||
return self._embed_size | |||||
@property | |||||
def num_embedding(self) -> int: | |||||
""" | |||||
这个值可能会大于实际的embedding矩阵的大小。 | |||||
:return: | |||||
""" | |||||
return len(self._word_vocab) | |||||
def get_word_vocab(self): | |||||
""" | |||||
返回embedding的词典。 | |||||
:return: Vocabulary | |||||
""" | |||||
return self._word_vocab | |||||
@property | |||||
def size(self): | |||||
        return torch.Size([self.num_embedding, self._embed_size])  # torch.Size expects a single sequence | |||||
@abstractmethod | |||||
def forward(self, *input): | |||||
raise NotImplementedError |
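# Illustrative usage sketch, not part of the diff above: the Embedding wrapper accepts either a plain
# (vocab_size, embed_dim) tuple or an already-built TokenEmbedding. `vocab` is an assumed Vocabulary,
# the pretrained name triggers a download, and the import path follows this PR's fastNLP.embeddings package.
import torch
from fastNLP.embeddings import Embedding, StaticEmbedding

embed = Embedding((len(vocab), 100), word_dropout=0.1, dropout=0.5, unk_index=vocab.unknown_idx)
x = torch.randint(0, len(vocab), (4, 20))   # batch_size x max_len word indices
y = embed(x)                                # 4 x 20 x 100

embed2 = Embedding(StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50'), dropout=0.5)
# when a TokenEmbedding is wrapped, unk_index is taken from its vocabulary automatically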
@@ -0,0 +1,92 @@ | |||||
from typing import List | |||||
import torch | |||||
from torch import nn as nn | |||||
from .embedding import TokenEmbedding | |||||
class StackEmbedding(TokenEmbedding): | |||||
""" | |||||
别名::class:`fastNLP.embeddings.StackEmbedding` :class:`fastNLP.embeddings.stack_embedding.StackEmbedding` | |||||
支持将多个embedding集合成一个embedding。 | |||||
Example:: | |||||
>>> embed_1 = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50', requires_grad=True) | |||||
>>> embed_2 = StaticEmbedding(vocab, model_dir_or_name='en-word2vec-300', requires_grad=True) | |||||
:param embeds: 一个由若干个TokenEmbedding组成的list,要求每一个TokenEmbedding的词表都保持一致 | |||||
    :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。不同embedding会在相同的位置 | |||||
被设置为unknown。如果这里设置了dropout,则组成的embedding就不要再设置dropout了。 | |||||
:param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 | |||||
""" | |||||
def __init__(self, embeds: List[TokenEmbedding], word_dropout=0, dropout=0): | |||||
vocabs = [] | |||||
for embed in embeds: | |||||
if hasattr(embed, 'get_word_vocab'): | |||||
vocabs.append(embed.get_word_vocab()) | |||||
_vocab = vocabs[0] | |||||
for vocab in vocabs[1:]: | |||||
assert vocab == _vocab, "All embeddings in StackEmbedding should use the same word vocabulary." | |||||
super(StackEmbedding, self).__init__(_vocab, word_dropout=word_dropout, dropout=dropout) | |||||
assert isinstance(embeds, list) | |||||
for embed in embeds: | |||||
assert isinstance(embed, TokenEmbedding), "Only TokenEmbedding type is supported." | |||||
self.embeds = nn.ModuleList(embeds) | |||||
self._embed_size = sum([embed.embed_size for embed in self.embeds]) | |||||
def append(self, embed: TokenEmbedding): | |||||
""" | |||||
添加一个embedding到结尾。 | |||||
:param embed: | |||||
:return: | |||||
""" | |||||
assert isinstance(embed, TokenEmbedding) | |||||
self.embeds.append(embed) | |||||
def pop(self): | |||||
""" | |||||
弹出最后一个embed | |||||
:return: | |||||
""" | |||||
return self.embeds.pop() | |||||
@property | |||||
def embed_size(self): | |||||
return self._embed_size | |||||
@property | |||||
def requires_grad(self): | |||||
""" | |||||
        Whether the parameters of this Embedding are trainable. True: all parameters are trainable; False: none are trainable; None: some are trainable and some are not. | |||||
:return: | |||||
""" | |||||
        requires_grads = set([embed.requires_grad for embed in self.embeds])  # self.embeds is an nn.ModuleList, not callable | |||||
if len(requires_grads) == 1: | |||||
return requires_grads.pop() | |||||
else: | |||||
return None | |||||
@requires_grad.setter | |||||
def requires_grad(self, value): | |||||
        for embed in self.embeds: | |||||
embed.requires_grad = value | |||||
def forward(self, words): | |||||
""" | |||||
得到多个embedding的结果,并把结果按照顺序concat起来。 | |||||
:param words: batch_size x max_len | |||||
:return: 返回的shape和当前这个stack embedding中embedding的组成有关 | |||||
""" | |||||
outputs = [] | |||||
words = self.drop_word(words) | |||||
for embed in self.embeds: | |||||
outputs.append(embed(words)) | |||||
outputs = self.dropout(torch.cat(outputs, dim=-1)) | |||||
return outputs |
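# Illustrative usage sketch, not part of the diff above: stacking two embeddings built over the same
# vocabulary. `vocab` and `words` are assumptions, both pretrained names trigger downloads on first use,
# and the import path follows this PR's fastNLP.embeddings package.
from fastNLP.embeddings import StaticEmbedding, StackEmbedding

embed_1 = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50', requires_grad=True)
embed_2 = StaticEmbedding(vocab, model_dir_or_name='en-word2vec-300', requires_grad=True)
stack = StackEmbedding([embed_1, embed_2], dropout=0.1)
print(stack.embed_size)   # 350 = 50 + 300
out = stack(words)        # batch_size x max_len x 350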
@@ -0,0 +1,217 @@ | |||||
import os | |||||
import torch | |||||
import torch.nn as nn | |||||
import numpy as np | |||||
import warnings | |||||
from ..core.vocabulary import Vocabulary | |||||
from ..io.file_utils import PRETRAIN_STATIC_FILES, _get_base_url, cached_path | |||||
from .embedding import TokenEmbedding | |||||
class StaticEmbedding(TokenEmbedding): | |||||
""" | |||||
别名::class:`fastNLP.embeddings.StaticEmbedding` :class:`fastNLP.embeddings.static_embedding.StaticEmbedding` | |||||
    StaticEmbedding组件. 给定embedding的名称,根据vocab从embedding中抽取相应的数据。该Embedding之后就可以像正常的embedding一样使用。 | |||||
Example:: | |||||
>>> embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50') | |||||
:param vocab: Vocabulary. 若该项为None则会读取所有的embedding。 | |||||
:param model_dir_or_name: 可以有两种方式调用预训练好的static embedding:第一种是传入embedding的文件名,第二种是传入embedding | |||||
的名称。目前支持的embedding包括{`en` 或者 `en-glove-840b-300` : glove.840B.300d, `en-glove-6b-50` : glove.6B.50d, | |||||
`en-word2vec-300` : GoogleNews-vectors-negative300}。第二种情况将自动查看缓存中是否存在该模型,没有的话将自动下载。 | |||||
:param bool requires_grad: 是否需要gradient. 默认为True | |||||
:param callable init_method: 如何初始化没有找到的值。可以使用torch.nn.init.*中各种方法。调用该方法时传入一个tensor对象。 | |||||
:param bool lower: 是否将vocab中的词语小写后再和预训练的词表进行匹配。如果你的词表中包含大写的词语,或者就是需要单独 | |||||
为大写的词语开辟一个vector表示,则将lower设置为False。 | |||||
:param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 | |||||
:param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 | |||||
    :param bool normalize: 是否对vector进行normalize,使得每个vector的norm为1。 | |||||
""" | |||||
def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', requires_grad: bool=True, init_method=None, | |||||
lower=False, dropout=0, word_dropout=0, normalize=False): | |||||
super(StaticEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) | |||||
# 得到cache_path | |||||
if model_dir_or_name.lower() in PRETRAIN_STATIC_FILES: | |||||
PRETRAIN_URL = _get_base_url('static') | |||||
model_name = PRETRAIN_STATIC_FILES[model_dir_or_name] | |||||
model_url = PRETRAIN_URL + model_name | |||||
model_path = cached_path(model_url) | |||||
# 检查是否存在 | |||||
elif os.path.isfile(os.path.expanduser(os.path.abspath(model_dir_or_name))): | |||||
model_path = model_dir_or_name | |||||
else: | |||||
raise ValueError(f"Cannot recognize {model_dir_or_name}.") | |||||
# 读取embedding | |||||
if lower: | |||||
lowered_vocab = Vocabulary(padding=vocab.padding, unknown=vocab.unknown) | |||||
for word, index in vocab: | |||||
if not vocab._is_word_no_create_entry(word): | |||||
lowered_vocab.add_word(word.lower()) # 先加入需要创建entry的 | |||||
for word in vocab._no_create_word.keys(): # 不需要创建entry的 | |||||
if word in vocab: | |||||
lowered_word = word.lower() | |||||
if lowered_word not in lowered_vocab.word_count: | |||||
lowered_vocab.add_word(lowered_word) | |||||
lowered_vocab._no_create_word[lowered_word] += 1 | |||||
print(f"All word in vocab have been lowered. There are {len(vocab)} words, {len(lowered_vocab)} unique lowered " | |||||
f"words.") | |||||
embedding = self._load_with_vocab(model_path, vocab=lowered_vocab, init_method=init_method, | |||||
normalize=normalize) | |||||
# 需要适配一下 | |||||
if not hasattr(self, 'words_to_words'): | |||||
                self.words_to_words = torch.arange(len(lowered_vocab)).long() | |||||
if lowered_vocab.unknown: | |||||
unknown_idx = lowered_vocab.unknown_idx | |||||
else: | |||||
                unknown_idx = embedding.size(0) - 1  # 否则最后一个位置作为unknown | |||||
words_to_words = nn.Parameter(torch.full((len(vocab),), fill_value=unknown_idx).long(), | |||||
requires_grad=False) | |||||
for word, index in vocab: | |||||
if word not in lowered_vocab: | |||||
word = word.lower() | |||||
if lowered_vocab._is_word_no_create_entry(word): # 如果不需要创建entry,已经默认unknown了 | |||||
continue | |||||
words_to_words[index] = self.words_to_words[lowered_vocab.to_index(word)] | |||||
self.words_to_words = words_to_words | |||||
else: | |||||
embedding = self._load_with_vocab(model_path, vocab=vocab, init_method=init_method, | |||||
normalize=normalize) | |||||
self.embedding = nn.Embedding(num_embeddings=embedding.shape[0], embedding_dim=embedding.shape[1], | |||||
padding_idx=vocab.padding_idx, | |||||
max_norm=None, norm_type=2, scale_grad_by_freq=False, | |||||
sparse=False, _weight=embedding) | |||||
self._embed_size = self.embedding.weight.size(1) | |||||
self.requires_grad = requires_grad | |||||
@property | |||||
def requires_grad(self): | |||||
""" | |||||
        Whether the parameters of this Embedding are trainable. True: all parameters are trainable; False: none are trainable; None: some are trainable and some are not. | |||||
:return: | |||||
""" | |||||
requires_grads = set([param.requires_grad for name, param in self.named_parameters() | |||||
if 'words_to_words' not in name]) | |||||
if len(requires_grads) == 1: | |||||
return requires_grads.pop() | |||||
else: | |||||
return None | |||||
@requires_grad.setter | |||||
def requires_grad(self, value): | |||||
for name, param in self.named_parameters(): | |||||
if 'words_to_words' in name: | |||||
continue | |||||
param.requires_grad = value | |||||
def _load_with_vocab(self, embed_filepath, vocab, dtype=np.float32, padding='<pad>', unknown='<unk>', | |||||
normalize=True, error='ignore', init_method=None): | |||||
""" | |||||
从embed_filepath这个预训练的词向量中抽取出vocab这个词表的词的embedding。EmbedLoader将自动判断embed_filepath是 | |||||
word2vec(第一行只有两个元素)还是glove格式的数据。 | |||||
:param str embed_filepath: 预训练的embedding的路径。 | |||||
:param vocab: 词表 :class:`~fastNLP.Vocabulary` 类型,读取出现在vocab中的词的embedding。 | |||||
            在预训练embedding中没有找到的词,若预训练中存在unknown的向量则使用该向量初始化,否则使用init_method(默认均匀分布)随机初始化。 | |||||
:param dtype: 读出的embedding的类型 | |||||
:param str padding: 词表中padding的token | |||||
:param str unknown: 词表中unknown的token | |||||
:param bool normalize: 是否将每个vector归一化到norm为1 | |||||
:param str error: `ignore` , `strict` ; 如果 `ignore` ,错误将自动跳过; 如果 `strict` , 错误将抛出。 | |||||
这里主要可能出错的地方在于词表有空行或者词表出现了维度不一致。 | |||||
        :param init_method: 如何初始化没有找到的值。可以使用torch.nn.init.*中各种方法。默认使用均匀分布 uniform_(-sqrt(3/dim), sqrt(3/dim)) 初始化。 | |||||
:return torch.tensor: shape为 [len(vocab), dimension], dimension由pretrain的embedding决定。 | |||||
""" | |||||
assert isinstance(vocab, Vocabulary), "Only fastNLP.Vocabulary is supported." | |||||
if not os.path.exists(embed_filepath): | |||||
raise FileNotFoundError("`{}` does not exist.".format(embed_filepath)) | |||||
with open(embed_filepath, 'r', encoding='utf-8') as f: | |||||
line = f.readline().strip() | |||||
parts = line.split() | |||||
start_idx = 0 | |||||
if len(parts) == 2: | |||||
dim = int(parts[1]) | |||||
start_idx += 1 | |||||
else: | |||||
dim = len(parts) - 1 | |||||
f.seek(0) | |||||
matrix = {} | |||||
found_count = 0 | |||||
for idx, line in enumerate(f, start_idx): | |||||
try: | |||||
parts = line.strip().split() | |||||
word = ''.join(parts[:-dim]) | |||||
nums = parts[-dim:] | |||||
# 对齐unk与pad | |||||
if word == padding and vocab.padding is not None: | |||||
word = vocab.padding | |||||
elif word == unknown and vocab.unknown is not None: | |||||
word = vocab.unknown | |||||
if word in vocab: | |||||
index = vocab.to_index(word) | |||||
matrix[index] = torch.from_numpy(np.fromstring(' '.join(nums), sep=' ', dtype=dtype, count=dim)) | |||||
found_count += 1 | |||||
except Exception as e: | |||||
if error == 'ignore': | |||||
warnings.warn("Error occurred at the {} line.".format(idx)) | |||||
else: | |||||
print("Error occurred at the {} line.".format(idx)) | |||||
raise e | |||||
print("Found {} out of {} words in the pre-training embedding.".format(found_count, len(vocab))) | |||||
for word, index in vocab: | |||||
if index not in matrix and not vocab._is_word_no_create_entry(word): | |||||
                if vocab.unknown_idx in matrix:  # 如果有unknown,用unknown的向量初始化 | |||||
matrix[index] = matrix[vocab.unknown_idx] | |||||
else: | |||||
matrix[index] = None | |||||
vectors = torch.zeros(len(matrix), dim) | |||||
if init_method: | |||||
init_method(vectors) | |||||
else: | |||||
nn.init.uniform_(vectors, -np.sqrt(3/dim), np.sqrt(3/dim)) | |||||
if vocab._no_create_word_length>0: | |||||
if vocab.unknown is None: # 创建一个专门的unknown | |||||
unknown_idx = len(matrix) | |||||
vectors = torch.cat((vectors, torch.zeros(1, dim)), dim=0).contiguous() | |||||
else: | |||||
unknown_idx = vocab.unknown_idx | |||||
words_to_words = nn.Parameter(torch.full((len(vocab),), fill_value=unknown_idx).long(), | |||||
requires_grad=False) | |||||
for order, (index, vec) in enumerate(matrix.items()): | |||||
if vec is not None: | |||||
vectors[order] = vec | |||||
words_to_words[index] = order | |||||
self.words_to_words = words_to_words | |||||
else: | |||||
for index, vec in matrix.items(): | |||||
if vec is not None: | |||||
vectors[index] = vec | |||||
if normalize: | |||||
vectors /= (torch.norm(vectors, dim=1, keepdim=True) + 1e-12) | |||||
return vectors | |||||
def forward(self, words): | |||||
""" | |||||
传入words的index | |||||
:param words: torch.LongTensor, [batch_size, max_len] | |||||
:return: torch.FloatTensor, [batch_size, max_len, embed_size] | |||||
""" | |||||
if hasattr(self, 'words_to_words'): | |||||
words = self.words_to_words[words] | |||||
words = self.drop_word(words) | |||||
words = self.embedding(words) | |||||
words = self.dropout(words) | |||||
return words |
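# Illustrative usage sketch, not part of the diff above: loading GloVe vectors restricted to `vocab`
# (an assumed Vocabulary). With lower=True, words that only differ in case share one vector. `words`
# is an assumed LongTensor of word indices; the pretrained name triggers a download on first use.
from fastNLP.embeddings import StaticEmbedding

embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50',
                        requires_grad=False, lower=True, normalize=False)
out = embed(words)   # batch_size x max_len x 50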
@@ -0,0 +1,49 @@ | |||||
import numpy as np | |||||
import torch | |||||
from torch import nn as nn | |||||
from ..core.vocabulary import Vocabulary | |||||
__all__ = ['get_embeddings'] | |||||
def _construct_char_vocab_from_vocab(vocab:Vocabulary, min_freq:int=1): | |||||
""" | |||||
    Build a character-level Vocabulary from a word-level Vocabulary. | |||||
    :param vocab: the word Vocabulary to build the character vocabulary from | |||||
    :param min_freq: minimum frequency for a character to be kept | |||||
    :return: | |||||
""" | |||||
char_vocab = Vocabulary(min_freq=min_freq) | |||||
for word, index in vocab: | |||||
if not vocab._is_word_no_create_entry(word): | |||||
char_vocab.add_word_lst(list(word)) | |||||
return char_vocab | |||||
def get_embeddings(init_embed): | |||||
""" | |||||
    Build an nn.Embedding object from init_embed. | |||||
    :param init_embed: can be a tuple (num_embeddings, embedding_dim), i.e. the size of the embedding and the dimension of each word; | |||||
        an nn.Embedding object, which is then used as the embedding directly; an np.ndarray, whose values are used to initialise the | |||||
        Embedding; or a torch.Tensor, whose values are likewise used to initialise the Embedding. | |||||
    :return nn.Embedding embeddings: | |||||
""" | |||||
if isinstance(init_embed, tuple): | |||||
res = nn.Embedding( | |||||
num_embeddings=init_embed[0], embedding_dim=init_embed[1]) | |||||
nn.init.uniform_(res.weight.data, a=-np.sqrt(3/res.weight.data.size(1)), | |||||
b=np.sqrt(3/res.weight.data.size(1))) | |||||
elif isinstance(init_embed, nn.Module): | |||||
res = init_embed | |||||
elif isinstance(init_embed, torch.Tensor): | |||||
res = nn.Embedding.from_pretrained(init_embed, freeze=False) | |||||
elif isinstance(init_embed, np.ndarray): | |||||
init_embed = torch.tensor(init_embed, dtype=torch.float32) | |||||
res = nn.Embedding.from_pretrained(init_embed, freeze=False) | |||||
else: | |||||
raise TypeError( | |||||
'invalid init_embed type: {}'.format((type(init_embed)))) | |||||
return res |
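# Illustrative usage sketch, not part of the diff above: some of the input types accepted by
# get_embeddings. The sizes (1000 words, 50 dims) are arbitrary example values.
import numpy as np
import torch
from fastNLP.embeddings.utils import get_embeddings

e1 = get_embeddings((1000, 50))                 # randomly initialised: 1000 words x 50 dims
e2 = get_embeddings(torch.randn(1000, 50))      # initialised from a FloatTensor
e3 = get_embeddings(np.random.rand(1000, 50))   # initialised from an ndarray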
@@ -23,6 +23,7 @@ __all__ = [ | |||||
'ConllLoader', | 'ConllLoader', | ||||
'Conll2003Loader', | 'Conll2003Loader', | ||||
'IMDBLoader', | |||||
'MatchingLoader', | 'MatchingLoader', | ||||
'PeopleDailyCorpusLoader', | 'PeopleDailyCorpusLoader', | ||||
'SNLILoader', | 'SNLILoader', | ||||
@@ -1,5 +1,5 @@ | |||||
""" | """ | ||||
用于读数据集的模块, 具体包括: | |||||
用于读数据集的模块, 可以读取文本分类、序列标注、Matching任务的数据集 | |||||
这些模块的使用方法如下: | 这些模块的使用方法如下: | ||||
""" | """ | ||||
@@ -10,7 +10,7 @@ class ConllLoader(DataSetLoader): | |||||
别名::class:`fastNLP.io.ConllLoader` :class:`fastNLP.io.data_loader.ConllLoader` | 别名::class:`fastNLP.io.ConllLoader` :class:`fastNLP.io.data_loader.ConllLoader` | ||||
读取Conll格式的数据. 数据格式详见 http://conll.cemantix.org/2012/data.html. 数据中以"-DOCSTART-"开头的行将被忽略,因为 | 读取Conll格式的数据. 数据格式详见 http://conll.cemantix.org/2012/data.html. 数据中以"-DOCSTART-"开头的行将被忽略,因为 | ||||
该符号在conll 2003中被用为文档分割符。 | |||||
该符号在conll 2003中被用为文档分割符。 | |||||
列号从0开始, 每列对应内容为:: | 列号从0开始, 每列对应内容为:: | ||||
@@ -13,9 +13,12 @@ from ..utils import get_tokenizer | |||||
class IMDBLoader(DataSetLoader): | class IMDBLoader(DataSetLoader): | ||||
""" | """ | ||||
别名::class:`fastNLP.io.IMDBLoader` :class:`fastNLP.io.data_loader.IMDBLoader` | |||||
读取IMDB数据集,DataSet包含以下fields: | 读取IMDB数据集,DataSet包含以下fields: | ||||
words: list(str), 需要分类的文本 | words: list(str), 需要分类的文本 | ||||
target: str, 文本的标签 | target: str, 文本的标签 | ||||
""" | """ | ||||
@@ -6,7 +6,7 @@ from ...core.const import Const | |||||
from ...core.vocabulary import Vocabulary | from ...core.vocabulary import Vocabulary | ||||
from ..base_loader import DataBundle, DataSetLoader | from ..base_loader import DataBundle, DataSetLoader | ||||
from ..file_utils import _get_base_url, cached_path, PRETRAINED_BERT_MODEL_DIR | from ..file_utils import _get_base_url, cached_path, PRETRAINED_BERT_MODEL_DIR | ||||
from ...modules.encoder._bert import BertTokenizer | |||||
from ...modules.encoder.bert import BertTokenizer | |||||
class MatchingLoader(DataSetLoader): | class MatchingLoader(DataSetLoader): | ||||
@@ -12,7 +12,9 @@ class MNLILoader(MatchingLoader, CSVLoader): | |||||
读取MNLI数据集,读取的DataSet包含fields:: | 读取MNLI数据集,读取的DataSet包含fields:: | ||||
words1: list(str),第一句文本, premise | words1: list(str),第一句文本, premise | ||||
words2: list(str), 第二句文本, hypothesis | words2: list(str), 第二句文本, hypothesis | ||||
target: str, 真实标签 | target: str, 真实标签 | ||||
数据来源: | 数据来源: | ||||
@@ -10,9 +10,12 @@ from ..utils import check_dataloader_paths | |||||
class MTL16Loader(CSVLoader): | class MTL16Loader(CSVLoader): | ||||
""" | """ | ||||
别名::class:`fastNLP.io.MTL16Loader` :class:`fastNLP.io.data_loader.MTL16Loader` | |||||
读取MTL16数据集,DataSet包含以下fields: | 读取MTL16数据集,DataSet包含以下fields: | ||||
words: list(str), 需要分类的文本 | words: list(str), 需要分类的文本 | ||||
target: str, 文本的标签 | target: str, 文本的标签 | ||||
数据来源:https://pan.baidu.com/s/1c2L6vdA | 数据来源:https://pan.baidu.com/s/1c2L6vdA | ||||
@@ -12,7 +12,9 @@ class QNLILoader(MatchingLoader, CSVLoader): | |||||
读取QNLI数据集,读取的DataSet包含fields:: | 读取QNLI数据集,读取的DataSet包含fields:: | ||||
words1: list(str),第一句文本, premise | words1: list(str),第一句文本, premise | ||||
words2: list(str), 第二句文本, hypothesis | words2: list(str), 第二句文本, hypothesis | ||||
target: str, 真实标签 | target: str, 真实标签 | ||||
数据来源: | 数据来源: | ||||
@@ -12,7 +12,9 @@ class QuoraLoader(MatchingLoader, CSVLoader): | |||||
读取MNLI数据集,读取的DataSet包含fields:: | 读取MNLI数据集,读取的DataSet包含fields:: | ||||
words1: list(str),第一句文本, premise | words1: list(str),第一句文本, premise | ||||
words2: list(str), 第二句文本, hypothesis | words2: list(str), 第二句文本, hypothesis | ||||
target: str, 真实标签 | target: str, 真实标签 | ||||
数据来源: | 数据来源: | ||||
@@ -12,7 +12,9 @@ class RTELoader(MatchingLoader, CSVLoader): | |||||
读取RTE数据集,读取的DataSet包含fields:: | 读取RTE数据集,读取的DataSet包含fields:: | ||||
words1: list(str),第一句文本, premise | words1: list(str),第一句文本, premise | ||||
words2: list(str), 第二句文本, hypothesis | words2: list(str), 第二句文本, hypothesis | ||||
target: str, 真实标签 | target: str, 真实标签 | ||||
数据来源: | 数据来源: | ||||
@@ -12,7 +12,9 @@ class SNLILoader(MatchingLoader, JsonLoader): | |||||
读取SNLI数据集,读取的DataSet包含fields:: | 读取SNLI数据集,读取的DataSet包含fields:: | ||||
words1: list(str),第一句文本, premise | words1: list(str),第一句文本, premise | ||||
words2: list(str), 第二句文本, hypothesis | words2: list(str), 第二句文本, hypothesis | ||||
target: str, 真实标签 | target: str, 真实标签 | ||||
数据来源: https://nlp.stanford.edu/projects/snli/snli_1.0.zip | 数据来源: https://nlp.stanford.edu/projects/snli/snli_1.0.zip | ||||
@@ -104,7 +104,9 @@ class SSTLoader(DataSetLoader): | |||||
class SST2Loader(CSVLoader): | class SST2Loader(CSVLoader): | ||||
""" | """ | ||||
数据来源"SST":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8', | |||||
别名::class:`fastNLP.io.SST2Loader` :class:`fastNLP.io.data_loader.SST2Loader` | |||||
数据来源 SST: https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8 | |||||
""" | """ | ||||
def __init__(self): | def __init__(self): | ||||
@@ -13,12 +13,17 @@ from ..utils import check_dataloader_paths, get_tokenizer | |||||
class YelpLoader(DataSetLoader): | class YelpLoader(DataSetLoader): | ||||
""" | """ | ||||
别名::class:`fastNLP.io.YelpLoader` :class:`fastNLP.io.data_loader.YelpLoader` | |||||
读取Yelp_full/Yelp_polarity数据集, DataSet包含fields: | 读取Yelp_full/Yelp_polarity数据集, DataSet包含fields: | ||||
words: list(str), 需要分类的文本 | words: list(str), 需要分类的文本 | ||||
target: str, 文本的标签 | target: str, 文本的标签 | ||||
chars:list(str),未index的字符列表 | chars:list(str),未index的字符列表 | ||||
数据集:yelp_full/yelp_polarity | 数据集:yelp_full/yelp_polarity | ||||
:param fine_grained: 是否使用SST-5标准,若 ``False`` , 使用SST-2。Default: ``False`` | :param fine_grained: 是否使用SST-5标准,若 ``False`` , 使用SST-2。Default: ``False`` | ||||
:param lower: 是否需要自动转小写,默认为False。 | :param lower: 是否需要自动转小写,默认为False。 | ||||
""" | """ | ||||
@@ -8,7 +8,7 @@ from torch import nn | |||||
from .base_model import BaseModel | from .base_model import BaseModel | ||||
from ..core.const import Const | from ..core.const import Const | ||||
from ..modules.encoder import BertModel | from ..modules.encoder import BertModel | ||||
from ..modules.encoder._bert import BertConfig | |||||
from ..modules.encoder.bert import BertConfig | |||||
class BertForSequenceClassification(BaseModel): | class BertForSequenceClassification(BaseModel): | ||||
@@ -20,7 +20,7 @@ from ..modules.dropout import TimestepDropout | |||||
from ..modules.encoder.transformer import TransformerEncoder | from ..modules.encoder.transformer import TransformerEncoder | ||||
from ..modules.encoder.variational_rnn import VarLSTM | from ..modules.encoder.variational_rnn import VarLSTM | ||||
from ..modules.utils import initial_parameter | from ..modules.utils import initial_parameter | ||||
from ..modules.utils import get_embeddings | |||||
from ..embeddings.utils import get_embeddings | |||||
from .base_model import BaseModel | from .base_model import BaseModel | ||||
from ..core.utils import seq_len_to_mask | from ..core.utils import seq_len_to_mask | ||||
@@ -6,8 +6,9 @@ import torch | |||||
import torch.nn as nn | import torch.nn as nn | ||||
from ..core.const import Const as C | from ..core.const import Const as C | ||||
from ..core.utils import seq_len_to_mask | |||||
from ..modules import encoder | from ..modules import encoder | ||||
from fastNLP import seq_len_to_mask | |||||
from ..embeddings import embedding | |||||
class CNNText(torch.nn.Module): | class CNNText(torch.nn.Module): | ||||
@@ -33,7 +34,7 @@ class CNNText(torch.nn.Module): | |||||
super(CNNText, self).__init__() | super(CNNText, self).__init__() | ||||
# no support for pre-trained embedding currently | # no support for pre-trained embedding currently | ||||
self.embed = encoder.Embedding(init_embed) | |||||
self.embed = embedding.Embedding(init_embed) | |||||
self.conv_pool = encoder.ConvMaxpool( | self.conv_pool = encoder.ConvMaxpool( | ||||
in_channels=self.embed.embedding_dim, | in_channels=self.embed.embedding_dim, | ||||
out_channels=kernel_nums, | out_channels=kernel_nums, | ||||
@@ -14,7 +14,7 @@ except: | |||||
from ..core.utils import _pseudo_tqdm as tqdm | from ..core.utils import _pseudo_tqdm as tqdm | ||||
from ..core.trainer import Trainer | from ..core.trainer import Trainer | ||||
from ..core.batch import Batch | |||||
from ..core.batch import DataSetIter | |||||
from ..core.callback import CallbackManager, CallbackException | from ..core.callback import CallbackManager, CallbackException | ||||
from ..core.dataset import DataSet | from ..core.dataset import DataSet | ||||
from ..core.utils import _move_dict_value_to_device | from ..core.utils import _move_dict_value_to_device | ||||
@@ -124,8 +124,8 @@ class ENASTrainer(Trainer): | |||||
len(self.train_data) % self.batch_size != 0)) * self.n_epochs | len(self.train_data) % self.batch_size != 0)) * self.n_epochs | ||||
with inner_tqdm(total=total_steps, postfix='loss:{0:<6.5f}', leave=False, dynamic_ncols=True) as pbar: | with inner_tqdm(total=total_steps, postfix='loss:{0:<6.5f}', leave=False, dynamic_ncols=True) as pbar: | ||||
avg_loss = 0 | avg_loss = 0 | ||||
data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, | |||||
prefetch=self.prefetch) | |||||
data_iterator = DataSetIter(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, | |||||
prefetch=self.prefetch) | |||||
for epoch in range(1, self.n_epochs + 1): | for epoch in range(1, self.n_epochs + 1): | ||||
pbar.set_description_str(desc="Epoch {}/{}".format(epoch, self.n_epochs)) | pbar.set_description_str(desc="Epoch {}/{}".format(epoch, self.n_epochs)) | ||||
last_stage = (epoch > self.n_epochs + 1 - self.final_epochs) | last_stage = (epoch > self.n_epochs + 1 - self.final_epochs) | ||||
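# Illustrative sketch, not part of the diff above: constructing the DataSetIter that replaces the old
# Batch class in these hunks. `train_data` is an assumed fastNLP DataSet whose input and target fields
# have already been set; the import paths mirror those used earlier in this PR.
from fastNLP.core.batch import DataSetIter
from fastNLP.core.sampler import SequentialSampler

data_iterator = DataSetIter(train_data, batch_size=32, sampler=SequentialSampler(), as_numpy=False)
for batch_x, batch_y in data_iterator:
    pass   # batch_x / batch_y are dicts mapping field names to padded batch tensors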
@@ -209,8 +209,8 @@ class ENASTrainer(Trainer): | |||||
total_loss = 0 | total_loss = 0 | ||||
train_idx = 0 | train_idx = 0 | ||||
avg_loss = 0 | avg_loss = 0 | ||||
data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, | |||||
prefetch=self.prefetch) | |||||
data_iterator = DataSetIter(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, | |||||
prefetch=self.prefetch) | |||||
for batch_x, batch_y in data_iterator: | for batch_x, batch_y in data_iterator: | ||||
_move_dict_value_to_device(batch_x, batch_y, device=self._model_device) | _move_dict_value_to_device(batch_x, batch_y, device=self._model_device) | ||||
@@ -262,8 +262,8 @@ class ENASTrainer(Trainer): | |||||
if not isinstance(entropies, np.ndarray): | if not isinstance(entropies, np.ndarray): | ||||
entropies = entropies.data.cpu().numpy() | entropies = entropies.data.cpu().numpy() | ||||
data_iterator = Batch(self.dev_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, | |||||
prefetch=self.prefetch) | |||||
data_iterator = DataSetIter(self.dev_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, | |||||
prefetch=self.prefetch) | |||||
for inputs, targets in data_iterator: | for inputs, targets in data_iterator: | ||||
valid_loss, hidden, _ = self.get_loss(inputs, targets, hidden, dag) | valid_loss, hidden, _ = self.get_loss(inputs, targets, hidden, dag) | ||||
@@ -12,12 +12,13 @@ import torch.nn as nn | |||||
import torch.nn.functional as F | import torch.nn.functional as F | ||||
from .base_model import BaseModel | from .base_model import BaseModel | ||||
from ..embeddings import embedding | |||||
from ..modules import decoder, encoder | from ..modules import decoder, encoder | ||||
from ..modules.decoder.crf import allowed_transitions | from ..modules.decoder.crf import allowed_transitions | ||||
from ..core.utils import seq_len_to_mask | from ..core.utils import seq_len_to_mask | ||||
from ..core.const import Const as C | from ..core.const import Const as C | ||||
from ..modules import LSTM | from ..modules import LSTM | ||||
from ..modules import get_embeddings | |||||
from ..embeddings import get_embeddings | |||||
from ..modules import ConditionalRandomField | from ..modules import ConditionalRandomField | ||||
@@ -91,10 +92,10 @@ class SeqLabeling(BaseModel): | |||||
def __init__(self, init_embed, hidden_size, num_classes): | def __init__(self, init_embed, hidden_size, num_classes): | ||||
super(SeqLabeling, self).__init__() | super(SeqLabeling, self).__init__() | ||||
self.Embedding = encoder.embedding.Embedding(init_embed) | |||||
self.Rnn = encoder.lstm.LSTM(self.Embedding.embedding_dim, hidden_size) | |||||
self.Embedding = embedding.Embedding(init_embed) | |||||
self.Rnn = encoder.LSTM(self.Embedding.embedding_dim, hidden_size) | |||||
self.Linear = nn.Linear(hidden_size, num_classes) | self.Linear = nn.Linear(hidden_size, num_classes) | ||||
self.Crf = decoder.crf.ConditionalRandomField(num_classes) | |||||
self.Crf = decoder.ConditionalRandomField(num_classes) | |||||
self.mask = None | self.mask = None | ||||
def forward(self, words, seq_len, target): | def forward(self, words, seq_len, target): | ||||
@@ -188,7 +189,7 @@ class AdvSeqLabel(nn.Module): | |||||
super().__init__() | super().__init__() | ||||
self.Embedding = encoder.embedding.Embedding(init_embed) | |||||
self.Embedding = embedding.Embedding(init_embed) | |||||
self.norm1 = torch.nn.LayerNorm(self.Embedding.embedding_dim) | self.norm1 = torch.nn.LayerNorm(self.Embedding.embedding_dim) | ||||
self.Rnn = encoder.LSTM(input_size=self.Embedding.embedding_dim, hidden_size=hidden_size, num_layers=2, | self.Rnn = encoder.LSTM(input_size=self.Embedding.embedding_dim, hidden_size=hidden_size, num_layers=2, | ||||
dropout=dropout, | dropout=dropout, | ||||
@@ -8,11 +8,10 @@ import torch.nn.functional as F | |||||
from torch.nn import CrossEntropyLoss | from torch.nn import CrossEntropyLoss | ||||
from fastNLP.models import BaseModel | |||||
from fastNLP.modules.encoder.embedding import TokenEmbedding | |||||
from fastNLP.modules.encoder.lstm import LSTM | |||||
from fastNLP.core.const import Const | |||||
from fastNLP.core.utils import seq_len_to_mask | |||||
from .base_model import BaseModel | |||||
from ..embeddings.embedding import TokenEmbedding | |||||
from ..core.const import Const | |||||
from ..core.utils import seq_len_to_mask | |||||
class ESIM(BaseModel): | class ESIM(BaseModel): | ||||
@@ -13,7 +13,7 @@ from torch import nn | |||||
from ..modules.encoder.star_transformer import StarTransformer | from ..modules.encoder.star_transformer import StarTransformer | ||||
from ..core.utils import seq_len_to_mask | from ..core.utils import seq_len_to_mask | ||||
from ..modules.utils import get_embeddings | |||||
from ..embeddings.utils import get_embeddings | |||||
from ..core.const import Const | from ..core.const import Const | ||||
@@ -24,7 +24,6 @@ __all__ = [ | |||||
"ConvolutionCharEncoder", | "ConvolutionCharEncoder", | ||||
"LSTMCharEncoder", | "LSTMCharEncoder", | ||||
"ConvMaxpool", | "ConvMaxpool", | ||||
"Embedding", | |||||
"LSTM", | "LSTM", | ||||
"StarTransformer", | "StarTransformer", | ||||
"TransformerEncoder", | "TransformerEncoder", | ||||
@@ -48,4 +47,3 @@ from . import encoder | |||||
from .decoder import * | from .decoder import * | ||||
from .dropout import TimestepDropout | from .dropout import TimestepDropout | ||||
from .encoder import * | from .encoder import * | ||||
from .utils import get_embeddings |
@@ -1,19 +1,11 @@ | |||||
__all__ = [ | __all__ = [ | ||||
# "BertModel", | |||||
"BertModel", | |||||
"ConvolutionCharEncoder", | "ConvolutionCharEncoder", | ||||
"LSTMCharEncoder", | "LSTMCharEncoder", | ||||
"ConvMaxpool", | "ConvMaxpool", | ||||
"Embedding", | |||||
"StaticEmbedding", | |||||
"ElmoEmbedding", | |||||
"BertEmbedding", | |||||
"StackEmbedding", | |||||
"LSTMCharEmbedding", | |||||
"CNNCharEmbedding", | |||||
"LSTM", | "LSTM", | ||||
"StarTransformer", | "StarTransformer", | ||||
@@ -31,12 +23,10 @@ __all__ = [ | |||||
"MultiHeadAttention", | "MultiHeadAttention", | ||||
] | ] | ||||
from ._bert import BertModel | |||||
from .bert import BertWordPieceEncoder | |||||
from .bert import BertModel | |||||
from .char_encoder import ConvolutionCharEncoder, LSTMCharEncoder | from .char_encoder import ConvolutionCharEncoder, LSTMCharEncoder | ||||
from .conv_maxpool import ConvMaxpool | from .conv_maxpool import ConvMaxpool | ||||
from .embedding import Embedding, StaticEmbedding, ElmoEmbedding, BertEmbedding, \ | |||||
StackEmbedding, LSTMCharEmbedding, CNNCharEmbedding | |||||
from .lstm import LSTM | from .lstm import LSTM | ||||
from .star_transformer import StarTransformer | from .star_transformer import StarTransformer | ||||
from .transformer import TransformerEncoder | from .transformer import TransformerEncoder | ||||
@@ -4,18 +4,13 @@ | |||||
from typing import Optional, Tuple, List, Callable | from typing import Optional, Tuple, List, Callable | ||||
import os | |||||
import torch | import torch | ||||
import torch.nn as nn | import torch.nn as nn | ||||
import torch.nn.functional as F | import torch.nn.functional as F | ||||
from torch.nn.utils.rnn import PackedSequence, pad_packed_sequence | from torch.nn.utils.rnn import PackedSequence, pad_packed_sequence | ||||
from ...core.vocabulary import Vocabulary | |||||
import json | |||||
import pickle | |||||
from ..utils import get_dropout_mask | from ..utils import get_dropout_mask | ||||
import codecs | |||||
class LstmCellWithProjection(torch.nn.Module): | class LstmCellWithProjection(torch.nn.Module): | ||||
""" | """ | ||||
@@ -541,188 +536,3 @@ class Highway(torch.nn.Module): | |||||
gate = torch.sigmoid(gate) | gate = torch.sigmoid(gate) | ||||
current_input = gate * linear_part + (1 - gate) * nonlinear_part | current_input = gate * linear_part + (1 - gate) * nonlinear_part | ||||
return current_input | return current_input | ||||
class _ElmoModel(nn.Module): | |||||
""" | |||||
该Module是ElmoEmbedding中进行所有的heavy lifting的地方。做的工作,包括 | |||||
(1) 根据配置,加载模型; | |||||
(2) 根据vocab,对模型中的embedding进行调整. 并将其正确初始化 | |||||
(3) 保存一个words与chars的对应转换,获取时自动进行相应的转换 | |||||
(4) 设计一个保存token的embedding,允许缓存word的表示。 | |||||
""" | |||||
def __init__(self, model_dir: str, vocab: Vocabulary = None, cache_word_reprs: bool = False): | |||||
super(_ElmoModel, self).__init__() | |||||
self.model_dir = model_dir | |||||
dir = os.walk(self.model_dir) | |||||
config_file = None | |||||
weight_file = None | |||||
config_count = 0 | |||||
weight_count = 0 | |||||
for path, dir_list, file_list in dir: | |||||
for file_name in file_list: | |||||
if file_name.__contains__(".json"): | |||||
config_file = file_name | |||||
config_count += 1 | |||||
elif file_name.__contains__(".pkl"): | |||||
weight_file = file_name | |||||
weight_count += 1 | |||||
if config_count > 1 or weight_count > 1: | |||||
raise Exception(f"Multiple config files(*.json) or weight files(*.hdf5) detected in {model_dir}.") | |||||
elif config_count == 0 or weight_count == 0: | |||||
raise Exception(f"No config file or weight file found in {model_dir}") | |||||
config = json.load(open(os.path.join(model_dir, config_file), 'r')) | |||||
self.weight_file = os.path.join(model_dir, weight_file) | |||||
self.config = config | |||||
OOV_TAG = '<oov>' | |||||
PAD_TAG = '<pad>' | |||||
BOS_TAG = '<bos>' | |||||
EOS_TAG = '<eos>' | |||||
BOW_TAG = '<bow>' | |||||
EOW_TAG = '<eow>' | |||||
# For the model trained with character-based word encoder. | |||||
char_lexicon = {} | |||||
with codecs.open(os.path.join(model_dir, 'char.dic'), 'r', encoding='utf-8') as fpi: | |||||
for line in fpi: | |||||
tokens = line.strip().split('\t') | |||||
if len(tokens) == 1: | |||||
tokens.insert(0, '\u3000') | |||||
token, i = tokens | |||||
char_lexicon[token] = int(i) | |||||
# 做一些sanity check | |||||
for special_word in [PAD_TAG, OOV_TAG, BOW_TAG, EOW_TAG]: | |||||
assert special_word in char_lexicon, f"{special_word} not found in char.dic." | |||||
# 从vocab中构建char_vocab | |||||
char_vocab = Vocabulary(unknown=OOV_TAG, padding=PAD_TAG) | |||||
# 需要保证<bow>与<eow>在里面 | |||||
char_vocab.add_word_lst([BOW_TAG, EOW_TAG, BOS_TAG, EOS_TAG]) | |||||
for word, index in vocab: | |||||
char_vocab.add_word_lst(list(word)) | |||||
self.bos_index, self.eos_index, self._pad_index = len(vocab), len(vocab) + 1, vocab.padding_idx | |||||
# 根据char_lexicon调整, 多设置一位,是预留给word padding的(该位置的char表示为全0表示) | |||||
char_emb_layer = nn.Embedding(len(char_vocab) + 1, int(config['char_cnn']['embedding']['dim']), | |||||
padding_idx=len(char_vocab)) | |||||
# 读入预训练权重 这里的elmo_model 包含char_cnn和 lstm 的 state_dict | |||||
elmo_model = torch.load(os.path.join(self.model_dir, weight_file), map_location='cpu') | |||||
char_embed_weights = elmo_model["char_cnn"]['char_emb_layer.weight'] | |||||
found_char_count = 0 | |||||
for char, index in char_vocab: # 调整character embedding | |||||
if char in char_lexicon: | |||||
index_in_pre = char_lexicon.get(char) | |||||
found_char_count += 1 | |||||
else: | |||||
index_in_pre = char_lexicon[OOV_TAG] | |||||
char_emb_layer.weight.data[index] = char_embed_weights[index_in_pre] | |||||
print(f"{found_char_count} out of {len(char_vocab)} characters were found in pretrained elmo embedding.") | |||||
# 生成words到chars的映射 | |||||
max_chars = config['char_cnn']['max_characters_per_token'] | |||||
self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab) + 2, max_chars), | |||||
fill_value=len(char_vocab), | |||||
dtype=torch.long), | |||||
requires_grad=False) | |||||
for word, index in list(iter(vocab)) + [(BOS_TAG, len(vocab)), (EOS_TAG, len(vocab) + 1)]: | |||||
if len(word) + 2 > max_chars: | |||||
word = word[:max_chars - 2] | |||||
if index == self._pad_index: | |||||
continue | |||||
elif word == BOS_TAG or word == EOS_TAG: | |||||
char_ids = [char_vocab.to_index(BOW_TAG)] + [char_vocab.to_index(word)] + [ | |||||
char_vocab.to_index(EOW_TAG)] | |||||
char_ids += [char_vocab.to_index(PAD_TAG)] * (max_chars - len(char_ids)) | |||||
else: | |||||
char_ids = [char_vocab.to_index(BOW_TAG)] + [char_vocab.to_index(c) for c in word] + [ | |||||
char_vocab.to_index(EOW_TAG)] | |||||
char_ids += [char_vocab.to_index(PAD_TAG)] * (max_chars - len(char_ids)) | |||||
self.words_to_chars_embedding[index] = torch.LongTensor(char_ids) | |||||
self.char_vocab = char_vocab | |||||
self.token_embedder = ConvTokenEmbedder( | |||||
config, self.weight_file, None, char_emb_layer) | |||||
elmo_model["char_cnn"]['char_emb_layer.weight'] = char_emb_layer.weight | |||||
self.token_embedder.load_state_dict(elmo_model["char_cnn"]) | |||||
self.output_dim = config['lstm']['projection_dim'] | |||||
# lstm encoder | |||||
self.encoder = ElmobiLm(config) | |||||
self.encoder.load_state_dict(elmo_model["lstm"]) | |||||
if cache_word_reprs: | |||||
if config['char_cnn']['embedding']['dim'] > 0: # only useful when character information is used | |||||
print("Start to generate cache word representations.") | |||||
batch_size = 320 | |||||
# bos eos | |||||
word_size = self.words_to_chars_embedding.size(0) | |||||
num_batches = word_size // batch_size + \ | |||||
int(word_size % batch_size != 0) | |||||
self.cached_word_embedding = nn.Embedding(word_size, | |||||
config['lstm']['projection_dim']) | |||||
with torch.no_grad(): | |||||
for i in range(num_batches): | |||||
words = torch.arange(i * batch_size, | |||||
min((i + 1) * batch_size, word_size)).long() | |||||
chars = self.words_to_chars_embedding[words].unsqueeze(1) # batch_size x 1 x max_chars | |||||
word_reprs = self.token_embedder(words.unsqueeze(1), | |||||
chars).detach() # batch_size x 1 x config['encoder']['projection_dim'] | |||||
self.cached_word_embedding.weight.data[words] = word_reprs.squeeze(1) | |||||
print("Finish generating cached word representations. Going to delete the character encoder.") | |||||
del self.token_embedder, self.words_to_chars_embedding | |||||
else: | |||||
print("There is no need to cache word representations, since no character information is used.") | |||||
def forward(self, words): | |||||
""" | |||||
:param words: batch_size x max_len | |||||
:return: num_layers x batch_size x max_len x hidden_size | |||||
""" | |||||
# add <bos> and <eos> | |||||
batch_size, max_len = words.size() | |||||
expanded_words = words.new_zeros(batch_size, max_len + 2) # pad is guaranteed to be 0 | |||||
seq_len = words.ne(self._pad_index).sum(dim=-1) | |||||
expanded_words[:, 1:-1] = words | |||||
expanded_words[:, 0].fill_(self.bos_index) | |||||
expanded_words[torch.arange(batch_size).to(words), seq_len + 1] = self.eos_index | |||||
seq_len = seq_len + 2 | |||||
zero_tensor = expanded_words.new_zeros(expanded_words.shape) | |||||
mask = (expanded_words == zero_tensor).unsqueeze(-1) | |||||
if hasattr(self, 'cached_word_embedding'): | |||||
token_embedding = self.cached_word_embedding(expanded_words) | |||||
else: | |||||
if hasattr(self, 'words_to_chars_embedding'): | |||||
chars = self.words_to_chars_embedding[expanded_words] | |||||
else: | |||||
chars = None | |||||
token_embedding = self.token_embedder(expanded_words, chars) # batch_size x max_len x embed_dim | |||||
encoder_output = self.encoder(token_embedding, seq_len) | |||||
if encoder_output.size(2) < max_len + 2: | |||||
num_layers, _, output_len, hidden_size = encoder_output.size() | |||||
dummy_tensor = encoder_output.new_zeros(num_layers, batch_size, | |||||
max_len + 2 - output_len, hidden_size) | |||||
encoder_output = torch.cat((encoder_output, dummy_tensor), 2) | |||||
sz = encoder_output.size() # 2, batch_size, max_len, hidden_size | |||||
token_embedding = token_embedding.masked_fill(mask, 0) | |||||
token_embedding = torch.cat((token_embedding, token_embedding), dim=2).view(1, sz[1], sz[2], sz[3]) | |||||
encoder_output = torch.cat((token_embedding, encoder_output), dim=0) | |||||
# strip <bos> and <eos>. The removal here is not exact, but it should not affect the final result. | |||||
encoder_output = encoder_output[:, :, 1:-1] | |||||
return encoder_output |
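# Shape sketch (illustrative note, not part of the original file): for an input `words`
# of shape (batch_size, max_len), forward() returns a tensor of shape
# (num_layers, batch_size, max_len, hidden_size); layer 0 is the duplicated token
# embedding and the remaining layers are the outputs of the ELMo biLSTM encoder.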
@@ -1,79 +1,919 @@ | |||||
""" | |||||
The code in this file is largely based on (copied from) https://github.com/huggingface/pytorch-pretrained-BERT; | |||||
if you find it useful, please cite them as well. | |||||
""" | |||||
import collections | |||||
import unicodedata | |||||
import copy | |||||
import json | |||||
import math | |||||
import os | import os | ||||
from torch import nn | |||||
import torch | import torch | ||||
from ...io.file_utils import _get_base_url, cached_path, PRETRAINED_BERT_MODEL_DIR | |||||
from ._bert import _WordPieceBertModel, BertModel | |||||
from torch import nn | |||||
import glob | |||||
import sys | |||||
CONFIG_FILE = 'bert_config.json' | |||||
class BertWordPieceEncoder(nn.Module): | |||||
class BertConfig(object): | |||||
"""Configuration class to store the configuration of a `BertModel`. | |||||
""" | """ | ||||
Loads a BERT model; after loading, call the index_dataset method to generate the word_pieces column in the dataset. | |||||
def __init__(self, | |||||
vocab_size_or_config_json_file, | |||||
hidden_size=768, | |||||
num_hidden_layers=12, | |||||
num_attention_heads=12, | |||||
intermediate_size=3072, | |||||
hidden_act="gelu", | |||||
hidden_dropout_prob=0.1, | |||||
attention_probs_dropout_prob=0.1, | |||||
max_position_embeddings=512, | |||||
type_vocab_size=2, | |||||
initializer_range=0.02, | |||||
layer_norm_eps=1e-12): | |||||
"""Constructs BertConfig. | |||||
Args: | |||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. | |||||
hidden_size: Size of the encoder layers and the pooler layer. | |||||
num_hidden_layers: Number of hidden layers in the Transformer encoder. | |||||
num_attention_heads: Number of attention heads for each attention layer in | |||||
the Transformer encoder. | |||||
intermediate_size: The size of the "intermediate" (i.e., feed-forward) | |||||
layer in the Transformer encoder. | |||||
hidden_act: The non-linear activation function (function or string) in the | |||||
encoder and pooler. If string, "gelu", "relu" and "swish" are supported. | |||||
hidden_dropout_prob: The dropout probability for all fully connected | |||||
layers in the embeddings, encoder, and pooler. | |||||
attention_probs_dropout_prob: The dropout ratio for the attention | |||||
probabilities. | |||||
max_position_embeddings: The maximum sequence length that this model might | |||||
ever be used with. Typically set this to something large just in case | |||||
(e.g., 512 or 1024 or 2048). | |||||
type_vocab_size: The vocabulary size of the `token_type_ids` passed into | |||||
`BertModel`. | |||||
initializer_range: The stddev of the truncated_normal_initializer for | |||||
initializing all weight matrices. | |||||
layer_norm_eps: The epsilon used by LayerNorm. | |||||
""" | |||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 | |||||
and isinstance(vocab_size_or_config_json_file, unicode)): | |||||
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: | |||||
json_config = json.loads(reader.read()) | |||||
for key, value in json_config.items(): | |||||
self.__dict__[key] = value | |||||
elif isinstance(vocab_size_or_config_json_file, int): | |||||
self.vocab_size = vocab_size_or_config_json_file | |||||
self.hidden_size = hidden_size | |||||
self.num_hidden_layers = num_hidden_layers | |||||
self.num_attention_heads = num_attention_heads | |||||
self.hidden_act = hidden_act | |||||
self.intermediate_size = intermediate_size | |||||
self.hidden_dropout_prob = hidden_dropout_prob | |||||
self.attention_probs_dropout_prob = attention_probs_dropout_prob | |||||
self.max_position_embeddings = max_position_embeddings | |||||
self.type_vocab_size = type_vocab_size | |||||
self.initializer_range = initializer_range | |||||
self.layer_norm_eps = layer_norm_eps | |||||
else: | |||||
raise ValueError("First argument must be either a vocabulary size (int)" | |||||
"or the path to a pretrained model config file (str)") | |||||
@classmethod | |||||
def from_dict(cls, json_object): | |||||
"""Constructs a `BertConfig` from a Python dictionary of parameters.""" | |||||
config = BertConfig(vocab_size_or_config_json_file=-1) | |||||
for key, value in json_object.items(): | |||||
config.__dict__[key] = value | |||||
return config | |||||
@classmethod | |||||
def from_json_file(cls, json_file): | |||||
"""Constructs a `BertConfig` from a json file of parameters.""" | |||||
with open(json_file, "r", encoding='utf-8') as reader: | |||||
text = reader.read() | |||||
return cls.from_dict(json.loads(text)) | |||||
def __repr__(self): | |||||
return str(self.to_json_string()) | |||||
def to_dict(self): | |||||
"""Serializes this instance to a Python dictionary.""" | |||||
output = copy.deepcopy(self.__dict__) | |||||
return output | |||||
def to_json_string(self): | |||||
"""Serializes this instance to a JSON string.""" | |||||
return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" | |||||
def to_json_file(self, json_file_path): | |||||
""" Save this instance to a json file.""" | |||||
with open(json_file_path, "w", encoding='utf-8') as writer: | |||||
writer.write(self.to_json_string()) | |||||
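# Usage sketch (the file path is hypothetical): round-tripping a config through JSON.
#   config = BertConfig(vocab_size_or_config_json_file=30522)   # BERT-base defaults
#   config.to_json_file('/tmp/bert_config.json')
#   config = BertConfig.from_json_file('/tmp/bert_config.json')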
:param str model_dir_or_name: directory of the model, or the model name. Defaults to ``en-base-uncased`` | |||||
:param str layers: which layers make up the final representation. Layer indices are separated by ',' and negative numbers index from the last layer | |||||
:param bool requires_grad: whether gradients are required. | |||||
def gelu(x): | |||||
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) | |||||
def swish(x): | |||||
return x * torch.sigmoid(x) | |||||
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} | |||||
class BertLayerNorm(nn.Module): | |||||
def __init__(self, hidden_size, eps=1e-12): | |||||
"""Construct a layernorm module in the TF style (epsilon inside the square root). | |||||
""" | |||||
super(BertLayerNorm, self).__init__() | |||||
self.weight = nn.Parameter(torch.ones(hidden_size)) | |||||
self.bias = nn.Parameter(torch.zeros(hidden_size)) | |||||
self.variance_epsilon = eps | |||||
def forward(self, x): | |||||
u = x.mean(-1, keepdim=True) | |||||
s = (x - u).pow(2).mean(-1, keepdim=True) | |||||
x = (x - u) / torch.sqrt(s + self.variance_epsilon) | |||||
return self.weight * x + self.bias | |||||
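# In equation form, the forward pass above computes, over the last (hidden) dimension,
#   y = weight * (x - mean(x)) / sqrt(var(x) + eps) + bias
# i.e. a TF-style LayerNorm with the epsilon inside the square root.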
class BertEmbeddings(nn.Module): | |||||
"""Construct the embeddings from word, position and token_type embeddings. | |||||
""" | """ | ||||
def __init__(self, model_dir_or_name: str='en-base-uncased', layers: str='-1', | |||||
requires_grad: bool=False): | |||||
super().__init__() | |||||
PRETRAIN_URL = _get_base_url('bert') | |||||
if model_dir_or_name in PRETRAINED_BERT_MODEL_DIR: | |||||
model_name = PRETRAINED_BERT_MODEL_DIR[model_dir_or_name] | |||||
model_url = PRETRAIN_URL + model_name | |||||
model_dir = cached_path(model_url) | |||||
# check whether the directory exists | |||||
elif os.path.isdir(model_dir_or_name): | |||||
model_dir = model_dir_or_name | |||||
def __init__(self, config): | |||||
super(BertEmbeddings, self).__init__() | |||||
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0) | |||||
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) | |||||
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) | |||||
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load | |||||
# any TensorFlow checkpoint file | |||||
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) | |||||
self.dropout = nn.Dropout(config.hidden_dropout_prob) | |||||
def forward(self, input_ids, token_type_ids=None): | |||||
seq_length = input_ids.size(1) | |||||
position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) | |||||
position_ids = position_ids.unsqueeze(0).expand_as(input_ids) | |||||
if token_type_ids is None: | |||||
token_type_ids = torch.zeros_like(input_ids) | |||||
words_embeddings = self.word_embeddings(input_ids) | |||||
position_embeddings = self.position_embeddings(position_ids) | |||||
token_type_embeddings = self.token_type_embeddings(token_type_ids) | |||||
embeddings = words_embeddings + position_embeddings + token_type_embeddings | |||||
embeddings = self.LayerNorm(embeddings) | |||||
embeddings = self.dropout(embeddings) | |||||
return embeddings | |||||
class BertSelfAttention(nn.Module): | |||||
def __init__(self, config): | |||||
super(BertSelfAttention, self).__init__() | |||||
if config.hidden_size % config.num_attention_heads != 0: | |||||
raise ValueError( | |||||
"The hidden size (%d) is not a multiple of the number of attention " | |||||
"heads (%d)" % (config.hidden_size, config.num_attention_heads)) | |||||
self.num_attention_heads = config.num_attention_heads | |||||
self.attention_head_size = int(config.hidden_size / config.num_attention_heads) | |||||
self.all_head_size = self.num_attention_heads * self.attention_head_size | |||||
self.query = nn.Linear(config.hidden_size, self.all_head_size) | |||||
self.key = nn.Linear(config.hidden_size, self.all_head_size) | |||||
self.value = nn.Linear(config.hidden_size, self.all_head_size) | |||||
self.dropout = nn.Dropout(config.attention_probs_dropout_prob) | |||||
def transpose_for_scores(self, x): | |||||
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) | |||||
x = x.view(*new_x_shape) | |||||
return x.permute(0, 2, 1, 3) | |||||
def forward(self, hidden_states, attention_mask): | |||||
mixed_query_layer = self.query(hidden_states) | |||||
mixed_key_layer = self.key(hidden_states) | |||||
mixed_value_layer = self.value(hidden_states) | |||||
query_layer = self.transpose_for_scores(mixed_query_layer) | |||||
key_layer = self.transpose_for_scores(mixed_key_layer) | |||||
value_layer = self.transpose_for_scores(mixed_value_layer) | |||||
# Take the dot product between "query" and "key" to get the raw attention scores. | |||||
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) | |||||
attention_scores = attention_scores / math.sqrt(self.attention_head_size) | |||||
# Apply the attention mask (it is precomputed for all layers in the BertModel forward() function) | |||||
attention_scores = attention_scores + attention_mask | |||||
# Normalize the attention scores to probabilities. | |||||
attention_probs = nn.Softmax(dim=-1)(attention_scores) | |||||
# This is actually dropping out entire tokens to attend to, which might | |||||
# seem a bit unusual, but is taken from the original Transformer paper. | |||||
attention_probs = self.dropout(attention_probs) | |||||
context_layer = torch.matmul(attention_probs, value_layer) | |||||
context_layer = context_layer.permute(0, 2, 1, 3).contiguous() | |||||
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) | |||||
context_layer = context_layer.view(*new_context_layer_shape) | |||||
return context_layer | |||||
class BertSelfOutput(nn.Module): | |||||
def __init__(self, config): | |||||
super(BertSelfOutput, self).__init__() | |||||
self.dense = nn.Linear(config.hidden_size, config.hidden_size) | |||||
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) | |||||
self.dropout = nn.Dropout(config.hidden_dropout_prob) | |||||
def forward(self, hidden_states, input_tensor): | |||||
hidden_states = self.dense(hidden_states) | |||||
hidden_states = self.dropout(hidden_states) | |||||
hidden_states = self.LayerNorm(hidden_states + input_tensor) | |||||
return hidden_states | |||||
class BertAttention(nn.Module): | |||||
def __init__(self, config): | |||||
super(BertAttention, self).__init__() | |||||
self.self = BertSelfAttention(config) | |||||
self.output = BertSelfOutput(config) | |||||
def forward(self, input_tensor, attention_mask): | |||||
self_output = self.self(input_tensor, attention_mask) | |||||
attention_output = self.output(self_output, input_tensor) | |||||
return attention_output | |||||
class BertIntermediate(nn.Module): | |||||
def __init__(self, config): | |||||
super(BertIntermediate, self).__init__() | |||||
self.dense = nn.Linear(config.hidden_size, config.intermediate_size) | |||||
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)): | |||||
self.intermediate_act_fn = ACT2FN[config.hidden_act] | |||||
else: | else: | ||||
raise ValueError(f"Cannot recognize {model_dir_or_name}.") | |||||
self.intermediate_act_fn = config.hidden_act | |||||
def forward(self, hidden_states): | |||||
hidden_states = self.dense(hidden_states) | |||||
hidden_states = self.intermediate_act_fn(hidden_states) | |||||
return hidden_states | |||||
class BertOutput(nn.Module): | |||||
def __init__(self, config): | |||||
super(BertOutput, self).__init__() | |||||
self.dense = nn.Linear(config.intermediate_size, config.hidden_size) | |||||
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) | |||||
self.dropout = nn.Dropout(config.hidden_dropout_prob) | |||||
def forward(self, hidden_states, input_tensor): | |||||
hidden_states = self.dense(hidden_states) | |||||
hidden_states = self.dropout(hidden_states) | |||||
hidden_states = self.LayerNorm(hidden_states + input_tensor) | |||||
return hidden_states | |||||
class BertLayer(nn.Module): | |||||
def __init__(self, config): | |||||
super(BertLayer, self).__init__() | |||||
self.attention = BertAttention(config) | |||||
self.intermediate = BertIntermediate(config) | |||||
self.output = BertOutput(config) | |||||
def forward(self, hidden_states, attention_mask): | |||||
attention_output = self.attention(hidden_states, attention_mask) | |||||
intermediate_output = self.intermediate(attention_output) | |||||
layer_output = self.output(intermediate_output, attention_output) | |||||
return layer_output | |||||
class BertEncoder(nn.Module): | |||||
def __init__(self, config): | |||||
super(BertEncoder, self).__init__() | |||||
layer = BertLayer(config) | |||||
self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) | |||||
def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True): | |||||
all_encoder_layers = [] | |||||
for layer_module in self.layer: | |||||
hidden_states = layer_module(hidden_states, attention_mask) | |||||
if output_all_encoded_layers: | |||||
all_encoder_layers.append(hidden_states) | |||||
if not output_all_encoded_layers: | |||||
all_encoder_layers.append(hidden_states) | |||||
return all_encoder_layers | |||||
class BertPooler(nn.Module): | |||||
def __init__(self, config): | |||||
super(BertPooler, self).__init__() | |||||
self.dense = nn.Linear(config.hidden_size, config.hidden_size) | |||||
self.activation = nn.Tanh() | |||||
def forward(self, hidden_states): | |||||
# We "pool" the model by simply taking the hidden state corresponding | |||||
# to the first token. | |||||
first_token_tensor = hidden_states[:, 0] | |||||
pooled_output = self.dense(first_token_tensor) | |||||
pooled_output = self.activation(pooled_output) | |||||
return pooled_output | |||||
class BertModel(nn.Module): | |||||
"""BERT(Bidirectional Embedding Representations from Transformers). | |||||
If you want to use pretrained weights, please download them from the URLs below. | |||||
sources:: | |||||
'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin", | |||||
'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin", | |||||
'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin", | |||||
'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin", | |||||
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin", | |||||
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin", | |||||
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin", | |||||
'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-pytorch_model.bin", | |||||
'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin", | |||||
'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin", | |||||
'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin", | |||||
'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin", | |||||
'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin" | |||||
Build a BERT model from pretrained weights:: | |||||
model = BertModel.from_pretrained("path/to/weights/directory") | |||||
Build a BERT model with randomly initialized weights:: | |||||
self.model = _WordPieceBertModel(model_dir=model_dir, layers=layers) | |||||
self._embed_size = len(self.model.layers) * self.model.encoder.hidden_size | |||||
self.requires_grad = requires_grad | |||||
model = BertModel() | |||||
@property | |||||
def requires_grad(self): | |||||
:param int vocab_size: vocabulary size; defaults to 30522, the size of the English uncased BERT vocabulary | |||||
:param int hidden_size: hidden size; defaults to 768, the BERT-base setting | |||||
:param int num_hidden_layers: number of hidden layers; defaults to 12, the BERT-base setting | |||||
:param int num_attention_heads: number of attention heads; defaults to 12, the BERT-base setting | |||||
:param int intermediate_size: hidden size of the FFN layer; defaults to 3072, the BERT-base setting | |||||
:param str hidden_act: activation function of the FFN layer; defaults to ``gelu`` | |||||
:param float hidden_dropout_prob: dropout of the FFN layer; defaults to 0.1 | |||||
:param float attention_probs_dropout_prob: dropout of the attention layer; defaults to 0.1 | |||||
:param int max_position_embeddings: maximum sequence length; defaults to 512 | |||||
:param int type_vocab_size: maximum number of segments; defaults to 2 | |||||
:param float initializer_range: range for weight initialization; defaults to 0.02 | |||||
""" | |||||
def __init__(self, config, *inputs, **kwargs): | |||||
super(BertModel, self).__init__() | |||||
if not isinstance(config, BertConfig): | |||||
raise ValueError( | |||||
"Parameter config in `{}(config)` should be an instance of class `BertConfig`. " | |||||
"To create a model from a Google pretrained model use " | |||||
"`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( | |||||
self.__class__.__name__, self.__class__.__name__ | |||||
)) | |||||
super(BertModel, self).__init__() | |||||
self.config = config | |||||
self.hidden_size = self.config.hidden_size | |||||
self.embeddings = BertEmbeddings(config) | |||||
self.encoder = BertEncoder(config) | |||||
self.pooler = BertPooler(config) | |||||
self.apply(self.init_bert_weights) | |||||
def init_bert_weights(self, module): | |||||
""" Initialize the weights. | |||||
""" | """ | ||||
Whether the embedding parameters may be optimized. True: all parameters are optimized; False: no parameter is optimized; None: some parameters are optimized and some are not | |||||
if isinstance(module, (nn.Linear, nn.Embedding)): | |||||
# Slightly different from the TF version which uses truncated_normal for initialization | |||||
# cf https://github.com/pytorch/pytorch/pull/5617 | |||||
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) | |||||
elif isinstance(module, BertLayerNorm): | |||||
module.bias.data.zero_() | |||||
module.weight.data.fill_(1.0) | |||||
if isinstance(module, nn.Linear) and module.bias is not None: | |||||
module.bias.data.zero_() | |||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True): | |||||
if attention_mask is None: | |||||
attention_mask = torch.ones_like(input_ids) | |||||
if token_type_ids is None: | |||||
token_type_ids = torch.zeros_like(input_ids) | |||||
# We create a 3D attention mask from a 2D tensor mask. | |||||
# Sizes are [batch_size, 1, 1, to_seq_length] | |||||
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] | |||||
# this attention mask is simpler than the triangular masking of causal attention | |||||
# used in OpenAI GPT, we just need to prepare the broadcast dimension here. | |||||
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) | |||||
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for | |||||
# masked positions, this operation will create a tensor which is 0.0 for | |||||
# positions we want to attend and -10000.0 for masked positions. | |||||
# Since we are adding it to the raw scores before the softmax, this is | |||||
# effectively the same as removing these entirely. | |||||
extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility | |||||
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 | |||||
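# Toy example of the two steps above: attention_mask = [[1, 1, 0]] is expanded to shape
# (1, 1, 1, 3) and mapped to [[[[0., 0., -10000.]]]], so padded positions receive a large
# negative bias before the softmax and end up with near-zero attention weight.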
embedding_output = self.embeddings(input_ids, token_type_ids) | |||||
encoded_layers = self.encoder(embedding_output, | |||||
extended_attention_mask, | |||||
output_all_encoded_layers=output_all_encoded_layers) | |||||
sequence_output = encoded_layers[-1] | |||||
pooled_output = self.pooler(sequence_output) | |||||
if not output_all_encoded_layers: | |||||
encoded_layers = encoded_layers[-1] | |||||
return encoded_layers, pooled_output | |||||
@classmethod | |||||
def from_pretrained(cls, pretrained_model_dir, *inputs, **kwargs): | |||||
state_dict = kwargs.get('state_dict', None) | |||||
kwargs.pop('state_dict', None) | |||||
cache_dir = kwargs.get('cache_dir', None) | |||||
kwargs.pop('cache_dir', None) | |||||
from_tf = kwargs.get('from_tf', False) | |||||
kwargs.pop('from_tf', None) | |||||
# Load config | |||||
config_file = os.path.join(pretrained_model_dir, CONFIG_FILE) | |||||
config = BertConfig.from_json_file(config_file) | |||||
# logger.info("Model config {}".format(config)) | |||||
# Instantiate model. | |||||
model = cls(config, *inputs, **kwargs) | |||||
if state_dict is None: | |||||
files = glob.glob(os.path.join(pretrained_model_dir, '*.bin')) | |||||
if len(files)==0: | |||||
raise FileNotFoundError(f"There is no *.bin file in {pretrained_model_dir}") | |||||
elif len(files)>1: | |||||
raise FileExistsError(f"There are multiple *.bin files in {pretrained_model_dir}") | |||||
weights_path = files[0] | |||||
state_dict = torch.load(weights_path, map_location='cpu') | |||||
old_keys = [] | |||||
new_keys = [] | |||||
for key in state_dict.keys(): | |||||
new_key = None | |||||
if 'gamma' in key: | |||||
new_key = key.replace('gamma', 'weight') | |||||
if 'beta' in key: | |||||
new_key = key.replace('beta', 'bias') | |||||
if new_key: | |||||
old_keys.append(key) | |||||
new_keys.append(new_key) | |||||
for old_key, new_key in zip(old_keys, new_keys): | |||||
state_dict[new_key] = state_dict.pop(old_key) | |||||
missing_keys = [] | |||||
unexpected_keys = [] | |||||
error_msgs = [] | |||||
# copy state_dict so _load_from_state_dict can modify it | |||||
metadata = getattr(state_dict, '_metadata', None) | |||||
state_dict = state_dict.copy() | |||||
if metadata is not None: | |||||
state_dict._metadata = metadata | |||||
def load(module, prefix=''): | |||||
local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) | |||||
module._load_from_state_dict( | |||||
state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) | |||||
for name, child in module._modules.items(): | |||||
if child is not None: | |||||
load(child, prefix + name + '.') | |||||
load(model, prefix='' if hasattr(model, 'bert') else 'bert.') | |||||
if len(missing_keys) > 0: | |||||
print("Weights of {} not initialized from pretrained model: {}".format( | |||||
model.__class__.__name__, missing_keys)) | |||||
if len(unexpected_keys) > 0: | |||||
print("Weights from pretrained model not used in {}: {}".format( | |||||
model.__class__.__name__, unexpected_keys)) | |||||
return model | |||||
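# Usage sketch (the directory is hypothetical; it must contain bert_config.json and a
# single *.bin weight file; input_ids and mask are hypothetical LongTensors of shape
# batch_size x seq_len):
#   model = BertModel.from_pretrained('/path/to/bert-base-chinese')
#   encoded_layers, pooled = model(input_ids, attention_mask=mask,
#                                  output_all_encoded_layers=True)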
def whitespace_tokenize(text): | |||||
"""Runs basic whitespace cleaning and splitting on a piece of text.""" | |||||
text = text.strip() | |||||
if not text: | |||||
return [] | |||||
tokens = text.split() | |||||
return tokens | |||||
class WordpieceTokenizer(object): | |||||
"""Runs WordPiece tokenization.""" | |||||
def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): | |||||
self.vocab = vocab | |||||
self.unk_token = unk_token | |||||
self.max_input_chars_per_word = max_input_chars_per_word | |||||
def tokenize(self, text): | |||||
"""Tokenizes a piece of text into its word pieces. | |||||
This uses a greedy longest-match-first algorithm to perform tokenization | |||||
using the given vocabulary. | |||||
For example: | |||||
input = "unaffable" | |||||
output = ["un", "##aff", "##able"] | |||||
Args: | |||||
text: A single token or whitespace separated tokens. This should have | |||||
already been passed through `BasicTokenizer`. | |||||
Returns: | |||||
A list of wordpiece tokens. | |||||
""" | |||||
output_tokens = [] | |||||
for token in whitespace_tokenize(text): | |||||
chars = list(token) | |||||
if len(chars) > self.max_input_chars_per_word: | |||||
output_tokens.append(self.unk_token) | |||||
continue | |||||
is_bad = False | |||||
start = 0 | |||||
sub_tokens = [] | |||||
while start < len(chars): | |||||
end = len(chars) | |||||
cur_substr = None | |||||
while start < end: | |||||
substr = "".join(chars[start:end]) | |||||
if start > 0: | |||||
substr = "##" + substr | |||||
if substr in self.vocab: | |||||
cur_substr = substr | |||||
break | |||||
end -= 1 | |||||
if cur_substr is None: | |||||
is_bad = True | |||||
break | |||||
sub_tokens.append(cur_substr) | |||||
start = end | |||||
if is_bad: | |||||
output_tokens.append(self.unk_token) | |||||
else: | |||||
output_tokens.extend(sub_tokens) | |||||
return output_tokens | |||||
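# Usage sketch with a toy vocabulary (purely illustrative):
#   wp = WordpieceTokenizer(vocab={'un': 0, '##aff': 1, '##able': 2, '[UNK]': 3})
#   wp.tokenize('unaffable')   # -> ['un', '##aff', '##able']
#   wp.tokenize('xyz')         # -> ['[UNK]'], since no greedy match succeeds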
def load_vocab(vocab_file): | |||||
"""Loads a vocabulary file into a dictionary.""" | |||||
vocab = collections.OrderedDict() | |||||
index = 0 | |||||
with open(vocab_file, "r", encoding="utf-8") as reader: | |||||
while True: | |||||
token = reader.readline() | |||||
if not token: | |||||
break | |||||
token = token.strip() | |||||
vocab[token] = index | |||||
index += 1 | |||||
return vocab | |||||
class BasicTokenizer(object): | |||||
"""Runs basic tokenization (punctuation splitting, lower casing, etc.).""" | |||||
def __init__(self, | |||||
do_lower_case=True, | |||||
never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): | |||||
"""Constructs a BasicTokenizer. | |||||
Args: | |||||
do_lower_case: Whether to lower case the input. | |||||
""" | |||||
self.do_lower_case = do_lower_case | |||||
self.never_split = never_split | |||||
def tokenize(self, text): | |||||
"""Tokenizes a piece of text.""" | |||||
text = self._clean_text(text) | |||||
# This was added on November 1st, 2018 for the multilingual and Chinese | |||||
# models. This is also applied to the English models now, but it doesn't | |||||
# matter since the English models were not trained on any Chinese data | |||||
# and generally don't have any Chinese data in them (there are Chinese | |||||
# characters in the vocabulary because Wikipedia does have some Chinese | |||||
# words in the English Wikipedia.). | |||||
text = self._tokenize_chinese_chars(text) | |||||
orig_tokens = whitespace_tokenize(text) | |||||
split_tokens = [] | |||||
for token in orig_tokens: | |||||
if self.do_lower_case and token not in self.never_split: | |||||
token = token.lower() | |||||
token = self._run_strip_accents(token) | |||||
split_tokens.extend(self._run_split_on_punc(token)) | |||||
output_tokens = whitespace_tokenize(" ".join(split_tokens)) | |||||
return output_tokens | |||||
def _run_strip_accents(self, text): | |||||
"""Strips accents from a piece of text.""" | |||||
text = unicodedata.normalize("NFD", text) | |||||
output = [] | |||||
for char in text: | |||||
cat = unicodedata.category(char) | |||||
if cat == "Mn": | |||||
continue | |||||
output.append(char) | |||||
return "".join(output) | |||||
def _run_split_on_punc(self, text): | |||||
"""Splits punctuation on a piece of text.""" | |||||
if text in self.never_split: | |||||
return [text] | |||||
chars = list(text) | |||||
i = 0 | |||||
start_new_word = True | |||||
output = [] | |||||
while i < len(chars): | |||||
char = chars[i] | |||||
if _is_punctuation(char): | |||||
output.append([char]) | |||||
start_new_word = True | |||||
else: | |||||
if start_new_word: | |||||
output.append([]) | |||||
start_new_word = False | |||||
output[-1].append(char) | |||||
i += 1 | |||||
return ["".join(x) for x in output] | |||||
def _tokenize_chinese_chars(self, text): | |||||
"""Adds whitespace around any CJK character.""" | |||||
output = [] | |||||
for char in text: | |||||
cp = ord(char) | |||||
if self._is_chinese_char(cp): | |||||
output.append(" ") | |||||
output.append(char) | |||||
output.append(" ") | |||||
else: | |||||
output.append(char) | |||||
return "".join(output) | |||||
def _is_chinese_char(self, cp): | |||||
"""Checks whether CP is the codepoint of a CJK character.""" | |||||
# This defines a "chinese character" as anything in the CJK Unicode block: | |||||
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) | |||||
# | |||||
# Note that the CJK Unicode block is NOT all Japanese and Korean characters, | |||||
# despite its name. The modern Korean Hangul alphabet is a different block, | |||||
# as is Japanese Hiragana and Katakana. Those alphabets are used to write | |||||
# space-separated words, so they are not treated specially and handled | |||||
# like all of the other languages. | |||||
if ((cp >= 0x4E00 and cp <= 0x9FFF) or # | |||||
(cp >= 0x3400 and cp <= 0x4DBF) or # | |||||
(cp >= 0x20000 and cp <= 0x2A6DF) or # | |||||
(cp >= 0x2A700 and cp <= 0x2B73F) or # | |||||
(cp >= 0x2B740 and cp <= 0x2B81F) or # | |||||
(cp >= 0x2B820 and cp <= 0x2CEAF) or | |||||
(cp >= 0xF900 and cp <= 0xFAFF) or # | |||||
(cp >= 0x2F800 and cp <= 0x2FA1F)): # | |||||
return True | |||||
return False | |||||
def _clean_text(self, text): | |||||
"""Performs invalid character removal and whitespace cleanup on text.""" | |||||
output = [] | |||||
for char in text: | |||||
cp = ord(char) | |||||
if cp == 0 or cp == 0xfffd or _is_control(char): | |||||
continue | |||||
if _is_whitespace(char): | |||||
output.append(" ") | |||||
else: | |||||
output.append(char) | |||||
return "".join(output) | |||||
def _is_whitespace(char): | |||||
"""Checks whether `chars` is a whitespace character.""" | |||||
# \t, \n, and \r are technically control characters but we treat them | |||||
# as whitespace since they are generally considered as such. | |||||
if char == " " or char == "\t" or char == "\n" or char == "\r": | |||||
return True | |||||
cat = unicodedata.category(char) | |||||
if cat == "Zs": | |||||
return True | |||||
return False | |||||
def _is_control(char): | |||||
"""Checks whether `chars` is a control character.""" | |||||
# These are technically control characters but we count them as whitespace | |||||
# characters. | |||||
if char == "\t" or char == "\n" or char == "\r": | |||||
return False | |||||
cat = unicodedata.category(char) | |||||
if cat.startswith("C"): | |||||
return True | |||||
return False | |||||
def _is_punctuation(char): | |||||
"""Checks whether `chars` is a punctuation character.""" | |||||
cp = ord(char) | |||||
# We treat all non-letter/number ASCII as punctuation. | |||||
# Characters such as "^", "$", and "`" are not in the Unicode | |||||
# Punctuation class but we treat them as punctuation anyways, for | |||||
# consistency. | |||||
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or | |||||
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): | |||||
return True | |||||
cat = unicodedata.category(char) | |||||
if cat.startswith("P"): | |||||
return True | |||||
return False | |||||
class BertTokenizer(object): | |||||
"""Runs end-to-end tokenization: punctuation splitting + wordpiece""" | |||||
def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokenize=True, | |||||
never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): | |||||
"""Constructs a BertTokenizer. | |||||
Args: | |||||
vocab_file: Path to a one-wordpiece-per-line vocabulary file | |||||
do_lower_case: Whether to lower case the input | |||||
Only has an effect when do_wordpiece_only=False | |||||
do_basic_tokenize: Whether to do basic tokenization before wordpiece. | |||||
max_len: An artificial maximum length to truncate tokenized sequences to; | |||||
Effective maximum length is always the minimum of this | |||||
value (if specified) and the underlying BERT model's | |||||
sequence length. | |||||
never_split: List of tokens which will never be split during tokenization. | |||||
Only has an effect when do_wordpiece_only=False | |||||
""" | |||||
if not os.path.isfile(vocab_file): | |||||
raise ValueError( | |||||
"Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " | |||||
"model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) | |||||
self.vocab = load_vocab(vocab_file) | |||||
self.ids_to_tokens = collections.OrderedDict( | |||||
[(ids, tok) for tok, ids in self.vocab.items()]) | |||||
self.do_basic_tokenize = do_basic_tokenize | |||||
if do_basic_tokenize: | |||||
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, | |||||
never_split=never_split) | |||||
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) | |||||
self.max_len = max_len if max_len is not None else int(1e12) | |||||
def _reinit_on_new_vocab(self, vocab): | |||||
""" | |||||
After loading BERT, the vocab may be rearranged. Call this function after rearranging to re-initialize the vocab-related attributes. | |||||
:param vocab: | |||||
:return: | :return: | ||||
""" | """ | ||||
requires_grads = set([param.requires_grad for name, param in self.named_parameters()]) | |||||
if len(requires_grads)==1: | |||||
return requires_grads.pop() | |||||
self.vocab = vocab | |||||
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) | |||||
def tokenize(self, text): | |||||
split_tokens = [] | |||||
if self.do_basic_tokenize: | |||||
for token in self.basic_tokenizer.tokenize(text): | |||||
for sub_token in self.wordpiece_tokenizer.tokenize(token): | |||||
split_tokens.append(sub_token) | |||||
else: | |||||
split_tokens = self.wordpiece_tokenizer.tokenize(text) | |||||
return split_tokens | |||||
def convert_tokens_to_ids(self, tokens): | |||||
"""Converts a sequence of tokens into ids using the vocab.""" | |||||
ids = [] | |||||
for token in tokens: | |||||
ids.append(self.vocab[token]) | |||||
if len(ids) > self.max_len: | |||||
print( | |||||
"Token indices sequence length is longer than the specified maximum " | |||||
" sequence length for this BERT model ({} > {}). Running this" | |||||
" sequence through BERT will result in indexing errors".format(len(ids), self.max_len) | |||||
) | |||||
return ids | |||||
def convert_ids_to_tokens(self, ids): | |||||
"""Converts a sequence of ids in wordpiece tokens using the vocab.""" | |||||
tokens = [] | |||||
for i in ids: | |||||
tokens.append(self.ids_to_tokens[i]) | |||||
return tokens | |||||
def save_vocabulary(self, vocab_path): | |||||
"""Save the tokenizer vocabulary to a directory or file.""" | |||||
index = 0 | |||||
if os.path.isdir(vocab_path): | |||||
vocab_file = os.path.join(vocab_path, VOCAB_NAME) | |||||
else: | else: | ||||
return None | |||||
vocab_file = vocab_path | |||||
with open(vocab_file, "w", encoding="utf-8") as writer: | |||||
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): | |||||
if index != token_index: | |||||
print("Saving vocabulary to {}: vocabulary indices are not consecutive." | |||||
" Please check that the vocabulary is not corrupted!".format(vocab_file)) | |||||
index = token_index | |||||
writer.write(token + u'\n') | |||||
index += 1 | |||||
return vocab_file | |||||
@classmethod | |||||
def from_pretrained(cls, model_dir, *inputs, **kwargs): | |||||
""" | |||||
Given a path, read the vocab directly. | |||||
""" | |||||
pretrained_model_name_or_path = os.path.join(model_dir, VOCAB_NAME) | |||||
print("loading vocabulary file {}".format(pretrained_model_name_or_path)) | |||||
max_len = 512 | |||||
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) | |||||
# Instantiate tokenizer. | |||||
tokenizer = cls(pretrained_model_name_or_path, *inputs, **kwargs) | |||||
return tokenizer | |||||
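# Usage sketch (the directory is hypothetical; it must contain vocab.txt):
#   tokenizer = BertTokenizer.from_pretrained('/path/to/bert-base-uncased')
#   tokens = tokenizer.tokenize('A test sentence.')
#   ids = tokenizer.convert_tokens_to_ids(tokens)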
@requires_grad.setter | |||||
def requires_grad(self, value): | |||||
for name, param in self.named_parameters(): | |||||
param.requires_grad = value | |||||
VOCAB_NAME = 'vocab.txt' | |||||
@property | |||||
def embed_size(self): | |||||
return self._embed_size | |||||
def index_datasets(self, *datasets, field_name): | |||||
class _WordPieceBertModel(nn.Module): | |||||
""" | |||||
This module directly computes word_piece-level results. | |||||
""" | |||||
def __init__(self, model_dir:str, layers:str='-1'): | |||||
super().__init__() | |||||
self.tokenzier = BertTokenizer.from_pretrained(model_dir) | |||||
self.encoder = BertModel.from_pretrained(model_dir) | |||||
# check that encoder_layer_number is valid | |||||
encoder_layer_number = len(self.encoder.encoder.layer) | |||||
self.layers = list(map(int, layers.split(','))) | |||||
for layer in self.layers: | |||||
if layer<0: | |||||
assert -layer<=encoder_layer_number, f"The layer index:{layer} is out of scope for " \ | |||||
f"a bert model with {encoder_layer_number} layers." | |||||
else: | |||||
assert layer<encoder_layer_number, f"The layer index:{layer} is out of scope for " \ | |||||
f"a bert model with {encoder_layer_number} layers." | |||||
self._cls_index = self.tokenzier.vocab['[CLS]'] | |||||
self._sep_index = self.tokenzier.vocab['[SEP]'] | |||||
self._wordpiece_pad_index = self.tokenzier.vocab['[PAD]'] # needed when generating word_pieces | |||||
def index_dataset(self, *datasets, field_name): | |||||
""" | """ | ||||
Use BERT's tokenizer to generate a new word_pieces column, add it to the datasets, and set it as input. If the sequence does not | Use BERT's tokenizer to generate a new word_pieces column, add it to the datasets, and set it as input. If the sequence does not | ||||
already start with [CLS] and end with [SEP], [CLS] and [SEP] are added at the two ends, and the pad value of the word_pieces column is set to BERT's pad value. | already start with [CLS] and end with [SEP], [CLS] and [SEP] are added at the two ends, and the pad value of the word_pieces column is set to BERT's pad value. | ||||
:param datasets: DataSet objects | :param datasets: DataSet objects | ||||
:param field_name: the column whose content is used to generate the word_pieces column. Each entry in this column should be a List[str]. | |||||
:param field_name: which column to index on | |||||
:return: | :return: | ||||
""" | """ | ||||
self.model.index_dataset(*datasets, field_name=field_name) | |||||
def convert_words_to_word_pieces(words): | |||||
word_pieces = [] | |||||
for word in words: | |||||
tokens = self.tokenzier.wordpiece_tokenizer.tokenize(word) | |||||
word_piece_ids = self.tokenzier.convert_tokens_to_ids(tokens) | |||||
word_pieces.extend(word_piece_ids) | |||||
if word_pieces[0]!=self._cls_index: | |||||
word_pieces.insert(0, self._cls_index) | |||||
if word_pieces[-1]!=self._sep_index: | |||||
word_pieces.append(self._sep_index) | |||||
return word_pieces | |||||
for index, dataset in enumerate(datasets): | |||||
try: | |||||
dataset.apply_field(convert_words_to_word_pieces, field_name=field_name, new_field_name='word_pieces', | |||||
is_input=True) | |||||
dataset.set_pad_val('word_pieces', self._wordpiece_pad_index) | |||||
except Exception as e: | |||||
print(f"Exception happens when processing the {index} dataset.") | |||||
raise e | |||||
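# Usage sketch (dataset and field names are hypothetical):
#   wp_model = _WordPieceBertModel(model_dir='/path/to/bert', layers='-1,-2')
#   wp_model.index_dataset(train_data, dev_data, field_name='words')
#   # each DataSet then carries an input column 'word_pieces' padded with BERT's [PAD] id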
def forward(self, word_pieces, token_type_ids=None): | def forward(self, word_pieces, token_type_ids=None): | ||||
""" | """ | ||||
Compute the BERT embedding representation of words. The words passed in should already contain the [CLS] and [SEP] tags. | |||||
:param words: batch_size x max_len | |||||
:param token_type_ids: batch_size x max_len, used to distinguish the first sentence from the second | |||||
:return: torch.FloatTensor. batch_size x max_len x (768*len(self.layers)) | |||||
:param word_pieces: torch.LongTensor, batch_size x max_len | |||||
:param token_type_ids: torch.LongTensor, batch_size x max_len | |||||
:return: num_layers x batch_size x max_len x hidden_size, or num_layers x batch_size x (max_len+2) x hidden_size | |||||
""" | """ | ||||
outputs = self.model(word_pieces, token_type_ids) | |||||
outputs = torch.cat([*outputs], dim=-1) | |||||
batch_size, max_len = word_pieces.size() | |||||
attn_masks = word_pieces.ne(self._wordpiece_pad_index) | |||||
bert_outputs, _ = self.encoder(word_pieces, token_type_ids=token_type_ids, attention_mask=attn_masks, | |||||
output_all_encoded_layers=True) | |||||
# output_layers = [self.layers] # len(self.layers) x batch_size x max_word_piece_length x hidden_size | |||||
outputs = bert_outputs[0].new_zeros((len(self.layers), batch_size, max_len, bert_outputs[0].size(-1))) | |||||
for l_index, l in enumerate(self.layers): | |||||
outputs[l_index] = bert_outputs[l] | |||||
return outputs | return outputs | ||||
@@ -1,6 +1,5 @@ | |||||
from functools import reduce | from functools import reduce | ||||
import numpy as np | |||||
import torch | import torch | ||||
import torch.nn as nn | import torch.nn as nn | ||||
import torch.nn.init as init | import torch.nn.init as init | ||||
@@ -70,33 +69,6 @@ def initial_parameter(net, initial_method=None): | |||||
net.apply(weights_init) | net.apply(weights_init) | ||||
def get_embeddings(init_embed): | |||||
""" | |||||
Build an nn.Embedding object from the given init_embed. | |||||
:param init_embed: can be a tuple (num_embeddings, embedding_dim), i.e. the embedding size and the dimension of each word; an | |||||
nn.Embedding object, which is then used as the embedding directly; an np.ndarray, which is used to initialize the Embedding; | |||||
or a torch.Tensor, whose values are used to initialize the Embedding. | |||||
:return nn.Embedding embeddings: | |||||
""" | |||||
if isinstance(init_embed, tuple): | |||||
res = nn.Embedding( | |||||
num_embeddings=init_embed[0], embedding_dim=init_embed[1]) | |||||
nn.init.uniform_(res.weight.data, a=-np.sqrt(3/res.weight.data.size(1)), | |||||
b=np.sqrt(3/res.weight.data.size(1))) | |||||
elif isinstance(init_embed, nn.Module): | |||||
res = init_embed | |||||
elif isinstance(init_embed, torch.Tensor): | |||||
res = nn.Embedding.from_pretrained(init_embed, freeze=False) | |||||
elif isinstance(init_embed, np.ndarray): | |||||
init_embed = torch.tensor(init_embed, dtype=torch.float32) | |||||
res = nn.Embedding.from_pretrained(init_embed, freeze=False) | |||||
else: | |||||
raise TypeError( | |||||
'invalid init_embed type: {}'.format((type(init_embed)))) | |||||
return res | |||||
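# Usage sketch for the function above (illustrative; the shapes are arbitrary):
#   emb = get_embeddings((1000, 50))                  # randomly initialized, 1000 x 50
#   emb = get_embeddings(np.random.rand(1000, 50))    # initialized from an ndarray
#   emb = get_embeddings(torch.randn(1000, 50))       # initialized from a Tensor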
def summary(model: nn.Module): | def summary(model: nn.Module): | ||||
""" | """ | ||||
得到模型的总参数量 | 得到模型的总参数量 | ||||
@@ -1,7 +1,7 @@ | |||||
from util import get_argparser, set_gpu, set_rng_seeds, add_model_args | |||||
from reproduction.Star_transformer.util import get_argparser, set_gpu, set_rng_seeds, add_model_args | |||||
seed = set_rng_seeds(15360) | seed = set_rng_seeds(15360) | ||||
print('RNG SEED {}'.format(seed)) | print('RNG SEED {}'.format(seed)) | ||||
from datasets import load_seqtag, load_sst, load_snli, EmbedLoader, MAX_LEN | |||||
from reproduction.Star_transformer.datasets import load_seqtag, load_sst, load_snli, EmbedLoader, MAX_LEN | |||||
import torch.nn as nn | import torch.nn as nn | ||||
import torch | import torch | ||||
import numpy as np | import numpy as np | ||||
@@ -2,7 +2,7 @@ import torch | |||||
from torch import nn | from torch import nn | ||||
from torch.nn import init | from torch.nn import init | ||||
from fastNLP.modules.encoder._bert import BertModel | |||||
from fastNLP.modules.encoder.bert import BertModel | |||||
class Classifier(nn.Module): | class Classifier(nn.Module): | ||||
@@ -12,7 +12,7 @@ from torch.nn import functional as F | |||||
from fastNLP.modules.dropout import TimestepDropout | from fastNLP.modules.dropout import TimestepDropout | ||||
from fastNLP.modules.encoder.variational_rnn import VarLSTM | from fastNLP.modules.encoder.variational_rnn import VarLSTM | ||||
from fastNLP import seq_len_to_mask | from fastNLP import seq_len_to_mask | ||||
from fastNLP.modules import Embedding | |||||
from fastNLP.embeddings import Embedding | |||||
def drop_input_independent(word_embeddings, dropout_emb): | def drop_input_independent(word_embeddings, dropout_emb): | ||||
@@ -2,15 +2,15 @@ import sys | |||||
sys.path.append('../..') | sys.path.append('../..') | ||||
from reproduction.joint_cws_parse.data.data_loader import CTBxJointLoader | from reproduction.joint_cws_parse.data.data_loader import CTBxJointLoader | ||||
from fastNLP.modules.encoder.embedding import StaticEmbedding | |||||
from fastNLP.embeddings.static_embedding import StaticEmbedding | |||||
from torch import nn | from torch import nn | ||||
from functools import partial | from functools import partial | ||||
from reproduction.joint_cws_parse.models.CharParser import CharParser | from reproduction.joint_cws_parse.models.CharParser import CharParser | ||||
from reproduction.joint_cws_parse.models.metrics import SegAppCharParseF1Metric, CWSMetric | from reproduction.joint_cws_parse.models.metrics import SegAppCharParseF1Metric, CWSMetric | ||||
from fastNLP import cache_results, BucketSampler, Trainer | |||||
from fastNLP import BucketSampler, Trainer | |||||
from torch import optim | from torch import optim | ||||
from reproduction.joint_cws_parse.models.callbacks import DevCallback, OptimizerCallback | |||||
from torch.optim.lr_scheduler import LambdaLR, StepLR | |||||
from reproduction.joint_cws_parse.models.callbacks import DevCallback | |||||
from torch.optim.lr_scheduler import StepLR | |||||
from fastNLP import Tester | from fastNLP import Tester | ||||
from fastNLP import GradientClipCallback, LRScheduler | from fastNLP import GradientClipCallback, LRScheduler | ||||
import os | import os | ||||
@@ -1,5 +1,7 @@ | |||||
# Prototype | # Prototype | ||||
This reproduction is based on a very old version and still needs to be revised | |||||
## Word2Idx.py | ## Word2Idx.py | ||||
A mapping model between words and indexes | A mapping model between words and indexes | ||||
@@ -1,6 +1,9 @@ | |||||
# This code is based on a very old version | |||||
""" | |||||
import torch.nn.functional as F | import torch.nn.functional as F | ||||
from fastNLP.core.trainer import ClassificationTrainer | |||||
from fastNLP.core.trainer import Trainer | |||||
from fastNLP.core.utils import ClassPreprocess as Preprocess | from fastNLP.core.utils import ClassPreprocess as Preprocess | ||||
from fastNLP.io.config_io import ConfigLoader | from fastNLP.io.config_io import ConfigLoader | ||||
from fastNLP.io.config_io import ConfigSection | from fastNLP.io.config_io import ConfigSection | ||||
@@ -8,7 +11,7 @@ from fastNLP.io.dataset_loader import DummyClassificationReader as Dataset_loade | |||||
from fastNLP.models.base_model import BaseModel | from fastNLP.models.base_model import BaseModel | ||||
from fastNLP.modules.aggregator.self_attention import SelfAttention | from fastNLP.modules.aggregator.self_attention import SelfAttention | ||||
from fastNLP.modules.decoder.mlp import MLP | from fastNLP.modules.decoder.mlp import MLP | ||||
from fastNLP.modules.encoder.embedding import Embedding as Embedding | |||||
from fastNLP.embeddings.embedding import Embedding as Embedding | |||||
from fastNLP.modules.encoder.lstm import LSTM | from fastNLP.modules.encoder.lstm import LSTM | ||||
train_data_path = 'small_train_data.txt' | train_data_path = 'small_train_data.txt' | ||||
@@ -61,12 +64,13 @@ class SELF_ATTENTION_YELP_CLASSIFICATION(BaseModel): | |||||
train_args = ConfigSection() | train_args = ConfigSection() | ||||
ConfigLoader("good path").load_config('config.cfg',{"train": train_args}) | ConfigLoader("good path").load_config('config.cfg',{"train": train_args}) | ||||
train_args['vocab'] = len(word2index) | |||||
# train_args['vocab'] = len(word2index) | |||||
trainer = ClassificationTrainer(**train_args.data) | |||||
trainer = Trainer(**train_args.data) | |||||
# for k in train_args.__dict__.keys(): | # for k in train_args.__dict__.keys(): | ||||
# print(k, train_args[k]) | # print(k, train_args[k]) | ||||
model = SELF_ATTENTION_YELP_CLASSIFICATION(train_args) | model = SELF_ATTENTION_YELP_CLASSIFICATION(train_args) | ||||
trainer.train(model,train_data , dev_data) | |||||
trainer.train() | |||||
""" |
@@ -1,3 +1,7 @@ | |||||
""" | |||||
The contents of this file have been merged into fastNLP.io.data_loader; this file is no longer updated | |||||
""" | |||||
import os | import os | ||||
@@ -3,9 +3,8 @@ import numpy as np | |||||
import torch | import torch | ||||
from fastNLP.core import Trainer, Tester, AccuracyMetric, Const, Adam | from fastNLP.core import Trainer, Tester, AccuracyMetric, Const, Adam | ||||
from fastNLP.io.data_loader import SNLILoader, RTELoader, MNLILoader, QNLILoader, QuoraLoader | |||||
from reproduction.matching.data.MatchingDataLoader import SNLILoader, RTELoader, \ | |||||
MNLILoader, QNLILoader, QuoraLoader | |||||
from reproduction.matching.model.bert import BertForNLI | from reproduction.matching.model.bert import BertForNLI | ||||
@@ -1,11 +1,10 @@ | |||||
import argparse | import argparse | ||||
import torch | import torch | ||||
import os | |||||
from fastNLP.core import Trainer, Tester, Adam, AccuracyMetric, Const | from fastNLP.core import Trainer, Tester, Adam, AccuracyMetric, Const | ||||
from fastNLP.modules.encoder.embedding import StaticEmbedding | |||||
from fastNLP.embeddings import StaticEmbedding | |||||
from fastNLP.io.data_loader import QNLILoader, RTELoader, SNLILoader, MNLILoader | |||||
from reproduction.matching.data.MatchingDataLoader import QNLILoader, RTELoader, SNLILoader, MNLILoader | |||||
from reproduction.matching.model.cntn import CNTNModel | from reproduction.matching.model.cntn import CNTNModel | ||||
# define hyper-parameters | # define hyper-parameters | ||||
@@ -7,11 +7,10 @@ from torch.optim.lr_scheduler import StepLR | |||||
from fastNLP.core import Trainer, Tester, AccuracyMetric, Const | from fastNLP.core import Trainer, Tester, AccuracyMetric, Const | ||||
from fastNLP.core.callback import GradientClipCallback, LRScheduler | from fastNLP.core.callback import GradientClipCallback, LRScheduler | ||||
from fastNLP.modules.encoder.embedding import ElmoEmbedding, StaticEmbedding | |||||
from reproduction.matching.data.MatchingDataLoader import SNLILoader, RTELoader, \ | |||||
MNLILoader, QNLILoader, QuoraLoader | |||||
from reproduction.matching.model.esim import ESIMModel | |||||
from fastNLP.embeddings.static_embedding import StaticEmbedding | |||||
from fastNLP.embeddings.elmo_embedding import ElmoEmbedding | |||||
from fastNLP.io.data_loader import SNLILoader, RTELoader, MNLILoader, QNLILoader, QuoraLoader | |||||
from fastNLP.models.snli import ESIM | |||||
# define hyper-parameters | # define hyper-parameters | ||||
@@ -81,7 +80,7 @@ else: | |||||
raise RuntimeError(f'NOT support {arg.embedding} embedding yet!') | raise RuntimeError(f'NOT support {arg.embedding} embedding yet!') | ||||
# define model | # define model | ||||
model = ESIMModel(embedding, num_labels=len(data_info.vocabs[Const.TARGET])) | |||||
model = ESIM(embedding, num_labels=len(data_info.vocabs[Const.TARGET])) | |||||
# define optimizer and callback | # define optimizer and callback | ||||
optimizer = Adamax(lr=arg.lr, params=model.parameters()) | optimizer = Adamax(lr=arg.lr, params=model.parameters()) | ||||
@@ -1,23 +1,17 @@ | |||||
import sys | |||||
import os | |||||
import random | import random | ||||
import numpy as np | import numpy as np | ||||
import torch | import torch | ||||
from torch.optim import Adadelta, SGD | |||||
from torch.optim import Adadelta | |||||
from torch.optim.lr_scheduler import StepLR | from torch.optim.lr_scheduler import StepLR | ||||
from tqdm import tqdm | |||||
from fastNLP import CrossEntropyLoss | from fastNLP import CrossEntropyLoss | ||||
from fastNLP import cache_results | from fastNLP import cache_results | ||||
from fastNLP.core import Trainer, Tester, Adam, AccuracyMetric, Const | |||||
from fastNLP.core.predictor import Predictor | |||||
from fastNLP.core.callback import GradientClipCallback, LRScheduler, FitlogCallback | |||||
from fastNLP.modules.encoder.embedding import ElmoEmbedding, StaticEmbedding | |||||
from fastNLP.core import Trainer, Tester, AccuracyMetric, Const | |||||
from fastNLP.core.callback import LRScheduler, FitlogCallback | |||||
from fastNLP.embeddings import StaticEmbedding | |||||
from fastNLP.io.data_loader import MNLILoader, QNLILoader, QuoraLoader, SNLILoader, RTELoader | |||||
from fastNLP.io.data_loader import MNLILoader, QNLILoader, SNLILoader, RTELoader | |||||
from reproduction.matching.model.mwan import MwanModel | from reproduction.matching.model.mwan import MwanModel | ||||
import fitlog | import fitlog | ||||
@@ -4,7 +4,7 @@ import torch.nn as nn
from fastNLP.core.const import Const
from fastNLP.models import BaseModel
-from fastNLP.modules.encoder.bert import BertModel
+from fastNLP.embeddings.bert import BertModel
class BertForNLI(BaseModel):

@@ -6,7 +6,7 @@ import numpy as np
from torch.nn import CrossEntropyLoss
from fastNLP.models import BaseModel
-from fastNLP.modules.encoder.embedding import TokenEmbedding
+from fastNLP.embeddings.embedding import TokenEmbedding
from fastNLP.core.const import Const

@@ -5,8 +5,7 @@ import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
from fastNLP.models import BaseModel
-from fastNLP.modules.encoder.embedding import TokenEmbedding
-from fastNLP.modules.encoder.lstm import LSTM
+from fastNLP.embeddings.embedding import TokenEmbedding
from fastNLP.core.const import Const
from fastNLP.core.utils import seq_len_to_mask
@@ -0,0 +1,78 @@
"""
Chinese named entity recognition with BERT.
"""
import sys
sys.path.append('../../../')

from torch import nn

from fastNLP.embeddings import BertEmbedding, Embedding
from reproduction.seqence_labelling.chinese_ner.data.ChineseNER import ChineseNERLoader
from fastNLP import Trainer, Const
from fastNLP import BucketSampler, SpanFPreRecMetric, GradientClipCallback
from fastNLP.modules import MLP
from fastNLP.core.callback import WarmupCallback
from fastNLP import CrossEntropyLoss
from fastNLP.core.optimizer import AdamW
import os
from fastNLP import cache_results

encoding_type = 'bio'

@cache_results('caches/msra.pkl')
def get_data():
    data = ChineseNERLoader(encoding_type=encoding_type).process("MSRA/")
    return data

data = get_data()
print(data)

class BertCNNER(nn.Module):
    def __init__(self, embed, tag_size):
        super().__init__()
        self.embedding = Embedding(embed, dropout=0.1)
        self.tag_size = tag_size
        self.mlp = MLP(size_layer=[self.embedding.embedding_dim, tag_size])

    def forward(self, chars):
        # batch_size, max_len = words.size()
        chars = self.embedding(chars)
        outputs = self.mlp(chars)
        return {Const.OUTPUT: outputs}

embed = BertEmbedding(data.vocabs[Const.CHAR_INPUT], model_dir_or_name='en-base',
                      pool_method='max', requires_grad=True, layers='11')

for name, dataset in data.datasets.items():
    dataset.set_pad_val(Const.TARGET, -100)

callbacks = [
    GradientClipCallback(clip_type='norm', clip_value=1),
    WarmupCallback(warmup=0.1, schedule='linear')
]

model = BertCNNER(embed, len(data.vocabs[Const.TARGET]))
optimizer = AdamW(model.parameters(), lr=1e-4)

for name, dataset in data.datasets.items():
    original_len = len(dataset)
    dataset.drop(lambda x:x['seq_len']>256, inplace=True)
    clipped_len = len(dataset)
    print("Delete {} instances in {}.".format(original_len-clipped_len, name))

os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(),
                  device=[0, 1], dev_data=data.datasets['test'], batch_size=20,
                  metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type),
                  loss=CrossEntropyLoss(reduction='sum'),
                  callbacks=callbacks, num_workers=2, n_epochs=5,
                  check_code_level=-1, update_every=3)
trainer.train()
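A note on the `@cache_results` decorator used in this script (and again in the hunks below): on the first call it runs the decorated function and pickles the return value to the given path; later calls load the pickle instead, and passing `_refresh=True` forces the data to be rebuilt. A minimal sketch of the pattern, assuming only the usage visible in this diff; the cache filename and the dummy return value are placeholders:

```python
# Sketch only: the caching pattern the training scripts rely on.
# Delete 'demo_cache.pkl' or decorate with _refresh=True to force reprocessing.
from fastNLP import cache_results

@cache_results('demo_cache.pkl', _refresh=False)
def load_data():
    # expensive preprocessing would go here
    return {'train': list(range(10))}

data = load_data()  # first call: computes the result and writes demo_cache.pkl
data = load_data()  # later calls: reload the pickled result instead
```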
@@ -2,11 +2,11 @@
from reproduction.seqence_labelling.chinese_ner.data.ChineseNER import ChineseNERLoader
-from fastNLP.modules.encoder.embedding import StaticEmbedding
+from fastNLP.embeddings import StaticEmbedding
from torch import nn
import torch
-from fastNLP.modules import get_embeddings
+from fastNLP.embeddings.utils import get_embeddings
from fastNLP.modules import LSTM
from fastNLP.modules import ConditionalRandomField
from fastNLP.modules import allowed_transitions

@@ -73,13 +73,13 @@ class CNBiLSTMCRFNER(nn.Module):
        return self._forward(chars, bigrams, trigrams, seq_len)
# data_bundle = pickle.load(open('caches/msra.pkl', 'rb'))
-@cache_results('caches/msra.pkl', _refresh=False)
+@cache_results('caches/msra.pkl', _refresh=True)
def get_data():
-    data_bundle = ChineseNERLoader().process('/remote-home/hyan01/exps/fastNLP/others/data/MSRA-NER', bigrams=True)
+    data_bundle = ChineseNERLoader().process('MSRA-NER/', bigrams=True)
    char_embed = StaticEmbedding(data_bundle.vocabs['chars'],
-                                model_dir_or_name='/remote-home/hyan01/exps/CWS/pretrain/vectors/1grams_t3_m50_corpus.txt')
+                                model_dir_or_name='cn-char')
    bigram_embed = StaticEmbedding(data_bundle.vocabs['bigrams'],
-                                  model_dir_or_name='/remote-home/hyan01/exps/CWS/pretrain/vectors/2gram_t3_m50_merge.txt')
+                                  model_dir_or_name='cn-bigram')
    return data_bundle, char_embed, bigram_embed
data_bundle, char_embed, bigram_embed = get_data()
print(data_bundle)
@@ -1,6 +1,6 @@
from torch import nn
import torch
-from fastNLP.modules import Embedding
+from fastNLP.embeddings import Embedding
import numpy as np
from reproduction.seqence_labelling.cws.model.module import FeatureFunMax, SemiCRFShiftRelay
from fastNLP.modules import LSTM
@@ -1,7 +1,7 @@
import sys
sys.path.append('../../..')
-from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding, BertEmbedding, ElmoEmbedding, StackEmbedding
+from fastNLP.embeddings.embedding import CNNCharEmbedding, StaticEmbedding
from fastNLP.core.vocabulary import VocabularyOption
from reproduction.seqence_labelling.ner.model.lstm_cnn_crf import CNNBiLSTMCRF

@@ -9,13 +9,11 @@ from fastNLP import Trainer
from fastNLP import SpanFPreRecMetric
from fastNLP import BucketSampler
from fastNLP import Const
-from torch.optim import SGD, Adam
+from torch.optim import SGD
from fastNLP import GradientClipCallback
from fastNLP.core.callback import FitlogCallback, LRScheduler
from torch.optim.lr_scheduler import LambdaLR
-from fastNLP.core.optimizer import AdamW
# from reproduction.seqence_labelling.ner.model.swats import SWATS
-from reproduction.seqence_labelling.chinese_ner.callbacks import SaveModelCallback
from fastNLP import cache_results
import fitlog
@@ -1,21 +1,18 @@
from reproduction.seqence_labelling.ner.data.OntoNoteLoader import OntoNoteNERDataLoader
-from reproduction.seqence_labelling.ner.data.Conll2003Loader import Conll2003DataLoader
-from fastNLP.core.callback import FitlogCallback, LRScheduler
+from fastNLP.core.callback import LRScheduler
from fastNLP import GradientClipCallback
-from torch.optim.lr_scheduler import LambdaLR, CosineAnnealingLR
-from torch.optim import SGD, Adam
+from torch.optim.lr_scheduler import LambdaLR
+from torch.optim import Adam
from fastNLP import Const
-from fastNLP import RandomSampler, BucketSampler
+from fastNLP import BucketSampler
from fastNLP import SpanFPreRecMetric
from fastNLP import Trainer, Tester
from fastNLP.core.metrics import MetricBase
from reproduction.seqence_labelling.ner.model.dilated_cnn import IDCNN
from fastNLP.core.utils import Option
-from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding
+from fastNLP.embeddings.embedding import StaticEmbedding
from fastNLP.core.utils import cache_results
from fastNLP.core.vocabulary import VocabularyOption
-import fitlog
-import sys
import torch.cuda
import os
os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/'
@@ -2,14 +2,13 @@ import sys
sys.path.append('../../..')
-from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding
+from fastNLP.embeddings import CNNCharEmbedding, StaticEmbedding
from reproduction.seqence_labelling.ner.model.lstm_cnn_crf import CNNBiLSTMCRF
from fastNLP import Trainer
from fastNLP import SpanFPreRecMetric
-from fastNLP import BucketSampler
from fastNLP import Const
-from torch.optim import SGD, Adam
+from torch.optim import SGD
from torch.optim.lr_scheduler import LambdaLR
from fastNLP import GradientClipCallback
from fastNLP.core.vocabulary import VocabularyOption
@@ -1,7 +1,7 @@
import torch
import torch.nn as nn
from torch.autograd import Variable
-from fastNLP.modules.utils import get_embeddings
+from fastNLP.embeddings.utils import get_embeddings
from fastNLP.core import Const as C

@@ -2,7 +2,7 @@ import torch
import torch.nn as nn
from fastNLP.core.const import Const as C
from .awdlstm_module import LSTM
-from fastNLP.modules import encoder
+from fastNLP.embeddings.utils import get_embeddings
from fastNLP.modules.decoder.mlp import MLP

@@ -14,7 +14,7 @@ class AWDLSTMSentiment(nn.Module):
                 nfc=128,
                 wdrop=0.5):
        super(AWDLSTMSentiment,self).__init__()
-        self.embed = encoder.Embedding(init_embed)
+        self.embed = get_embeddings(init_embed)
        self.lstm = LSTM(input_size=self.embed.embedding_dim, hidden_size=hidden_dim, num_layers=num_layers, bidirectional=True, wdrop=wdrop)
        self.mlp = MLP(size_layer=[hidden_dim* 2, nfc, num_classes])
@@ -1,6 +1,6 @@
import torch
import torch.nn as nn
-from fastNLP.modules.utils import get_embeddings
+from fastNLP.embeddings.utils import get_embeddings
from fastNLP.core import Const as C

@@ -2,7 +2,7 @@ import torch
import torch.nn as nn
from fastNLP.core.const import Const as C
from fastNLP.modules.encoder.lstm import LSTM
-from fastNLP.modules import get_embeddings
+from fastNLP.embeddings.utils import get_embeddings
from fastNLP.modules.decoder.mlp import MLP

@@ -2,8 +2,8 @@ import torch
import torch.nn as nn
from fastNLP.core.const import Const as C
from fastNLP.modules.encoder.lstm import LSTM
-from fastNLP.modules import encoder
-from fastNLP.modules.aggregator.attention import SelfAttention
+from fastNLP.embeddings.utils import get_embeddings
+from fastNLP.modules.encoder.attention import SelfAttention
from fastNLP.modules.decoder.mlp import MLP

@@ -16,7 +16,7 @@ class BiLSTM_SELF_ATTENTION(nn.Module):
                 attention_hops=1,
                 nfc=128):
        super(BiLSTM_SELF_ATTENTION,self).__init__()
-        self.embed = encoder.Embedding(init_embed)
+        self.embed = get_embeddings(init_embed)
        self.lstm = LSTM(input_size=self.embed.embedding_dim, hidden_size=hidden_dim, num_layers=num_layers, bidirectional=True)
        self.attention = SelfAttention(input_size=hidden_dim * 2 , attention_unit=attention_unit, attention_hops=attention_hops)
        self.mlp = MLP(size_layer=[hidden_dim* 2*attention_hops, nfc, num_classes])
@@ -9,11 +9,9 @@ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
from fastNLP.core.const import Const as C
from fastNLP.core import LRScheduler
-import torch.nn as nn
-from fastNLP.io.dataset_loader import SSTLoader
-from reproduction.text_classification.data.yelpLoader import yelpLoader
+from fastNLP.io.data_loader import YelpLoader
from reproduction.text_classification.model.HAN import HANCLS
-from fastNLP.modules.encoder.embedding import StaticEmbedding, CNNCharEmbedding, StackEmbedding
+from fastNLP.embeddings import StaticEmbedding
from fastNLP import CrossEntropyLoss, AccuracyMetric
from fastNLP.core.trainer import Trainer
from torch.optim import SGD

@@ -44,7 +42,7 @@ ops = Config()
## 1. Task-related info: load dataInfo with the dataloader
-datainfo = yelpLoader(fine_grained=True).process(paths=ops.datapath, train_ds=['train'])
+datainfo = YelpLoader(fine_grained=True).process(paths=ops.datapath, train_ds=['train'])
print(len(datainfo.datasets['train']))
print(len(datainfo.datasets['test']))
@@ -5,20 +5,13 @@ import os
os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/'
os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches'
-import torch.nn as nn
-from data.IMDBLoader import IMDBLoader
-from fastNLP.modules.encoder.embedding import StaticEmbedding
+from fastNLP.io.data_loader import IMDBLoader
+from fastNLP.embeddings import StaticEmbedding
from model.awd_lstm import AWDLSTMSentiment
-from fastNLP.core.const import Const as C
from fastNLP import CrossEntropyLoss, AccuracyMetric
-from fastNLP import Trainer, Tester
+from fastNLP import Trainer
from torch.optim import Adam
-from fastNLP.io.model_io import ModelLoader, ModelSaver
-import argparse
class Config():
@@ -2,7 +2,7 @@ import sys
sys.path.append('../../')
from reproduction.text_classification.data.IMDBLoader import IMDBLoader
-from fastNLP.modules.encoder.embedding import BertEmbedding
+from fastNLP.embeddings import BertEmbedding
from reproduction.text_classification.model.lstm import BiLSTMSentiment
from fastNLP import Trainer
from fastNLP import CrossEntropyLoss, AccuracyMetric

@@ -23,7 +23,7 @@ data_bundle.datasets['train'].drop(lambda x:len(x['words'])>400)
data_bundle.datasets['dev'].drop(lambda x:len(x['words'])>400)
data_bundle.datasets['test'].drop(lambda x:len(x['words'])>400)
bert_embed = BertEmbedding(data_bundle.vocabs['words'], requires_grad=False,
-                           model_dir_or_name="en-base")
+                           model_dir_or_name="en-base-uncased")
model = BiLSTMSentiment(bert_embed, len(data_bundle.vocabs['target']))
Trainer(data_bundle.datasets['train'], model, optimizer=None, loss=CrossEntropyLoss(), device=0,
@@ -7,23 +7,17 @@ import sys
sys.path.append('../..')
from fastNLP.core.const import Const as C
import torch.nn as nn
-from data.yelpLoader import yelpLoader
+from fastNLP.io.data_loader import YelpLoader
#from data.sstLoader import sst2Loader
-from fastNLP.io.data_loader.sst import SST2Loader
-from data.IMDBLoader import IMDBLoader
from model.char_cnn import CharacterLevelCNN
-from fastNLP.core.vocabulary import Vocabulary
-from fastNLP.models.cnn_text_classification import CNNText
-from fastNLP.modules.encoder.embedding import CNNCharEmbedding,StaticEmbedding,StackEmbedding,LSTMCharEmbedding
from fastNLP import CrossEntropyLoss, AccuracyMetric
from fastNLP.core.trainer import Trainer
from torch.optim import SGD
from torch.autograd import Variable
import torch
-from fastNLP import BucketSampler
-from torch.optim.lr_scheduler import CosineAnnealingLR, LambdaLR
+from torch.optim.lr_scheduler import LambdaLR
from fastNLP.core import LRScheduler
-from utils.util_init import set_rng_seeds
##hyper
#todo: add fastNLP logging here

@@ -117,7 +111,7 @@ ops=Config
## 1. Task-related info: load dataInfo with the dataloader
#dataloader=SST2Loader()
#dataloader=IMDBLoader()
-dataloader=yelpLoader(fine_grained=True)
+dataloader=YelpLoader(fine_grained=True)
datainfo=dataloader.process(ops.datapath,char_level_op=True,split_dev_op=False)
char_vocab=ops.char_cnn_config["alphabet"]["en"]["lower"]["alphabet"]
ops.number_of_characters=len(char_vocab)
@@ -3,15 +3,14 @@
import torch.cuda
from fastNLP.core.utils import cache_results
from torch.optim import SGD
-from torch.optim.lr_scheduler import CosineAnnealingLR, LambdaLR
+from torch.optim.lr_scheduler import CosineAnnealingLR
from fastNLP.core.trainer import Trainer
from fastNLP import CrossEntropyLoss, AccuracyMetric
-from fastNLP.modules.encoder.embedding import StaticEmbedding, CNNCharEmbedding, StackEmbedding
+from fastNLP.embeddings import StaticEmbedding
from reproduction.text_classification.model.dpcnn import DPCNN
-from data.yelpLoader import yelpLoader
+from fastNLP.io.data_loader import YelpLoader
from fastNLP.core.sampler import BucketSampler
-import torch.nn as nn
-from fastNLP.core import LRScheduler, Callback
+from fastNLP.core import LRScheduler
from fastNLP.core.const import Const as C
from fastNLP.core.vocabulary import VocabularyOption
from utils.util_init import set_rng_seeds

@@ -59,7 +58,7 @@ print('RNG SEED: {}'.format(ops.seed))
@cache_results(ops.model_dir_or_name+'-data-cache')
def load_data():
-    datainfo = yelpLoader(fine_grained=True, lower=True).process(
+    datainfo = YelpLoader(fine_grained=True, lower=True).process(
        paths=ops.datapath, train_ds=['train'], src_vocab_op=ops.src_vocab_op)
    for ds in datainfo.datasets.values():
        ds.apply_field(len, C.INPUT, C.INPUT_LEN)
@@ -3,20 +3,13 @@ import os
os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/'
os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches'
-import torch.nn as nn
-from data.IMDBLoader import IMDBLoader
-from fastNLP.modules.encoder.embedding import StaticEmbedding
+from fastNLP.io.data_loader import IMDBLoader
+from fastNLP.embeddings import StaticEmbedding
from model.lstm import BiLSTMSentiment
-from fastNLP.core.const import Const as C
from fastNLP import CrossEntropyLoss, AccuracyMetric
-from fastNLP import Trainer, Tester
+from fastNLP import Trainer
from torch.optim import Adam
-from fastNLP.io.model_io import ModelLoader, ModelSaver
-import argparse
class Config():
@@ -3,20 +3,13 @@ import os
os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/'
os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches'
-import torch.nn as nn
-from data.IMDBLoader import IMDBLoader
-from fastNLP.modules.encoder.embedding import StaticEmbedding
+from fastNLP.io.data_loader import IMDBLoader
+from fastNLP.embeddings import StaticEmbedding
from model.lstm_self_attention import BiLSTM_SELF_ATTENTION
-from fastNLP.core.const import Const as C
from fastNLP import CrossEntropyLoss, AccuracyMetric
-from fastNLP import Trainer, Tester
+from fastNLP import Trainer
from torch.optim import Adam
-from fastNLP.io.model_io import ModelLoader, ModelSaver
-import argparse
class Config():
@@ -0,0 +1,26 @@
import unittest

import torch

from fastNLP import Vocabulary, DataSet, Instance
from fastNLP.embeddings.char_embedding import LSTMCharEmbedding, CNNCharEmbedding


class TestCharEmbed(unittest.TestCase):
    def test_case_1(self):
        ds = DataSet([Instance(words=['hello', 'world']), Instance(words=['Jack'])])
        vocab = Vocabulary().from_dataset(ds, field_name='words')
        self.assertEqual(len(vocab), 5)
        embed = LSTMCharEmbedding(vocab, embed_size=60)
        x = torch.LongTensor([[2, 1, 0], [4, 3, 4]])
        y = embed(x)
        self.assertEqual(tuple(y.size()), (2, 3, 60))

    def test_case_2(self):
        ds = DataSet([Instance(words=['hello', 'world']), Instance(words=['Jack'])])
        vocab = Vocabulary().from_dataset(ds, field_name='words')
        self.assertEqual(len(vocab), 5)
        embed = CNNCharEmbedding(vocab, embed_size=60)
        x = torch.LongTensor([[2, 1, 0], [4, 3, 4]])
        y = embed(x)
        self.assertEqual(tuple(y.size()), (2, 3, 60))
@@ -0,0 +1,20 @@
import unittest

import torch

from fastNLP import Vocabulary, DataSet, Instance
from fastNLP.embeddings import LSTMCharEmbedding, CNNCharEmbedding, StackEmbedding


class TestCharEmbed(unittest.TestCase):
    def test_case_1(self):
        ds = DataSet([Instance(words=['hello', 'world']), Instance(words=['hello', 'Jack'])])
        vocab = Vocabulary().from_dataset(ds, field_name='words')
        self.assertEqual(len(vocab), 5)
        cnn_embed = CNNCharEmbedding(vocab, embed_size=60)
        lstm_embed = LSTMCharEmbedding(vocab, embed_size=70)
        embed = StackEmbedding([cnn_embed, lstm_embed])
        x = torch.LongTensor([[2, 1, 0], [4, 3, 4]])
        y = embed(x)
        self.assertEqual(tuple(y.size()), (2, 3, 130))
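As the test above shows, StackEmbedding concatenates the outputs of its component embeddings along the last dimension, so the resulting size is the sum of the component embed_size values (60 + 70 = 130 here).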
@@ -16,7 +16,7 @@ class TestEmbedLoader(unittest.TestCase):
        self.assertEqual(g_m.shape, (4, 50))
        w_m = EmbedLoader.load_with_vocab(word2vec, vocab, normalize=True)
        self.assertEqual(w_m.shape, (4, 50))
-        self.assertAlmostEqual(np.linalg.norm(w_m, axis=1).sum(), 4)
+        self.assertAlmostEqual(np.linalg.norm(w_m, axis=1).sum(), 4, delta=1e-4)

    def test_load_without_vocab(self):
        words = ['the', 'of', 'in', 'a', 'to', 'and']

@@ -28,13 +28,13 @@ class TestEmbedLoader(unittest.TestCase):
            self.assertIn(word, vocab)
        w_m, vocab = EmbedLoader.load_without_vocab(word2vec, normalize=True)
        self.assertEqual(w_m.shape, (8, 50))
-        self.assertAlmostEqual(np.linalg.norm(w_m, axis=1).sum(), 8)
+        self.assertAlmostEqual(np.linalg.norm(w_m, axis=1).sum(), 8, delta=1e-4)
        for word in words:
            self.assertIn(word, vocab)
        # no unk
        w_m, vocab = EmbedLoader.load_without_vocab(word2vec, normalize=True, unknown=None)
        self.assertEqual(w_m.shape, (7, 50))
-        self.assertAlmostEqual(np.linalg.norm(w_m, axis=1).sum(), 7)
+        self.assertAlmostEqual(np.linalg.norm(w_m, axis=1).sum(), 7, delta=1e-4)
        for word in words:
            self.assertIn(word, vocab)
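The `delta=1e-4` added above is a tolerance for floating-point rounding: row-normalized float32 vectors have norms that are only approximately 1, so summing them over N rows does not land exactly on N. A standalone sketch of the effect (plain NumPy, not fastNLP code; the shape and seed are arbitrary):

```python
import numpy as np

rng = np.random.default_rng(0)
w = rng.standard_normal((8, 50)).astype(np.float32)
w /= np.linalg.norm(w, axis=1, keepdims=True)   # normalize each row to unit length

total = np.linalg.norm(w, axis=1).sum()
print(total)                   # e.g. 7.9999995 rather than exactly 8.0
assert abs(total - 8) < 1e-4   # the tolerance the test now allows
```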
@@ -8,7 +8,7 @@ from fastNLP.models.bert import *
class TestBert(unittest.TestCase):
    def test_bert_1(self):
        from fastNLP.core.const import Const
-        from fastNLP.modules.encoder._bert import BertConfig
+        from fastNLP.modules.encoder.bert import BertConfig
        model = BertForSequenceClassification(2, BertConfig(32000))

@@ -23,7 +23,7 @@ class TestBert(unittest.TestCase):
    def test_bert_2(self):
        from fastNLP.core.const import Const
-        from fastNLP.modules.encoder._bert import BertConfig
+        from fastNLP.modules.encoder.bert import BertConfig
        model = BertForMultipleChoice(2, BertConfig(32000))

@@ -38,7 +38,7 @@ class TestBert(unittest.TestCase):
    def test_bert_3(self):
        from fastNLP.core.const import Const
-        from fastNLP.modules.encoder._bert import BertConfig
+        from fastNLP.modules.encoder.bert import BertConfig
        model = BertForTokenClassification(7, BertConfig(32000))

@@ -53,7 +53,7 @@ class TestBert(unittest.TestCase):
    def test_bert_4(self):
        from fastNLP.core.const import Const
-        from fastNLP.modules.encoder._bert import BertConfig
+        from fastNLP.modules.encoder.bert import BertConfig
        model = BertForQuestionAnswering(BertConfig(32000))

@@ -8,7 +8,7 @@ from fastNLP.models.bert import BertModel
class TestBert(unittest.TestCase):
    def test_bert_1(self):
-        from fastNLP.modules.encoder._bert import BertConfig
+        from fastNLP.modules.encoder.bert import BertConfig
        config = BertConfig(32000)
        model = BertModel(config)