Browse Source

Merge branch 'dev0.5.0' of https://github.com/fastnlp/fastNLP into dev0.5.0

tags/v0.4.10
yh 5 years ago
parent
commit
685e9900e5
22 changed files with 261 additions and 1356 deletions
  1. +1
    -1
      docs/source/fastNLP.core.callback.rst
  2. +1
    -1
      docs/source/fastNLP.io.loader.rst
  3. +1
    -1
      docs/source/fastNLP.io.pipe.rst
  4. +1
    -1
      docs/source/fastNLP.io.rst
  5. +1
    -1
      docs/source/fastNLP.rst
  6. +111
    -46
      docs/source/tutorials/tutorial_10_callback.rst
  7. +1
    -1
      docs/source/tutorials/tutorial_2_vocabulary.rst
  8. +8
    -7
      docs/source/tutorials/tutorial_3_embedding.rst
  9. +1
    -1
      docs/source/tutorials/tutorial_4_load_dataset.rst
  10. +36
    -7
      fastNLP/io/__init__.py
  11. +16
    -9
      fastNLP/io/loader/__init__.py
  12. +9
    -3
      fastNLP/io/loader/classification.py
  13. +19
    -11
      fastNLP/io/loader/matching.py
  14. +21
    -10
      fastNLP/io/pipe/__init__.py
  15. +3
    -1
      fastNLP/io/pipe/classification.py
  16. +30
    -24
      fastNLP/io/pipe/matching.py
  17. +1
    -2
      fastNLP/modules/encoder/attention.py
  18. +0
    -280
      tutorials/quickstart.ipynb
  19. +0
    -77
      tutorials/sample_data/tutorial_sample_dataset.csv
  20. +0
    -831
      tutorials/tutorial_1.ipynb
  21. +0
    -0
      tutorials/tutorial_10_callback.ipynb
  22. +0
    -41
      tutorials/命名实体识别.ipynb

+ 1
- 1
docs/source/fastNLP.core.callback.rst View File

@@ -2,6 +2,6 @@ fastNLP.core.callback
===================== =====================


.. automodule:: fastNLP.core.callback .. automodule:: fastNLP.core.callback
:members: Callback, GradientClipCallback, EarlyStopCallback, FitlogCallback, EvaluateCallback, LRScheduler, ControlC, LRFinder, TensorboardCallback, WarmupCallback, SaveModelCallback, EchoCallback, CallbackException, EarlyStopError
:members: Callback, GradientClipCallback, EarlyStopCallback, FitlogCallback, EvaluateCallback, LRScheduler, ControlC, LRFinder, TensorboardCallback, WarmupCallback, SaveModelCallback, CallbackException, EarlyStopError
:inherited-members: :inherited-members:



+ 1
- 1
docs/source/fastNLP.io.loader.rst View File

@@ -2,6 +2,6 @@ fastNLP.io.loader
================= =================


.. automodule:: fastNLP.io.loader .. automodule:: fastNLP.io.loader
:members: Loader, YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, ChnSentiCorpLoader, ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader, MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader, CSVLoader, JsonLoader, CWSLoader, MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader, CoReferenceLoader
:members: Loader, YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, ChnSentiCorpLoader, THUCNewsLoader, WeiboSenti100kLoader, ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader, MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader, CSVLoader, JsonLoader, CWSLoader, MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader, CNXNLILoader, BQCorpusLoader, LCQMCLoader, CoReferenceLoader
:inherited-members: :inherited-members:



+ 1
- 1
docs/source/fastNLP.io.pipe.rst View File

@@ -2,6 +2,6 @@ fastNLP.io.pipe
=============== ===============


.. automodule:: fastNLP.io.pipe .. automodule:: fastNLP.io.pipe
:members: Pipe, CWSPipe, YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, WeiboNERPipe, PeopleDailyPipe, Conll2003Pipe, MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe, CoReferencePipe
:members: Pipe, CWSPipe, YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, THUCNewsPipe, WeiboSenti100kPipe, Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, WeiboNERPipe, PeopleDailyPipe, Conll2003Pipe, MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, CNXNLIBertPipe, BQCorpusBertPipe, LCQMCBertPipe, MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe, LCQMCPipe, CNXNLIPipe, BQCorpusPipe, RenamePipe, GranularizePipe, MachingTruncatePipe, CoReferencePipe
:inherited-members: :inherited-members:



+ 1
- 1
docs/source/fastNLP.io.rst View File

@@ -2,7 +2,7 @@ fastNLP.io
========== ==========


.. automodule:: fastNLP.io .. automodule:: fastNLP.io
:members: DataBundle, EmbedLoader, Loader, YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, ChnSentiCorpLoader, ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader, MsraNERLoader, WeiboNERLoader, PeopleDailyNERLoader, CSVLoader, JsonLoader, CWSLoader, MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader, Pipe, YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, Conll2003Pipe, Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, PeopleDailyPipe, WeiboNERPipe, CWSPipe, MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe, ModelLoader, ModelSaver
:members: DataBundle, EmbedLoader, Loader, YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, ChnSentiCorpLoader, THUCNewsLoader, WeiboSenti100kLoader, ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader, MsraNERLoader, WeiboNERLoader, PeopleDailyNERLoader, CSVLoader, JsonLoader, CWSLoader, MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader, CNXNLILoader, BQCorpusLoader, LCQMCLoader, Pipe, YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, THUCNewsPipe, WeiboSenti100kPipe, Conll2003Pipe, Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, PeopleDailyPipe, WeiboNERPipe, CWSPipe, MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe, ModelLoader, ModelSaver
:inherited-members: :inherited-members:


子模块 子模块


+ 1
- 1
docs/source/fastNLP.rst View File

@@ -2,7 +2,7 @@ fastNLP
======= =======


.. automodule:: fastNLP .. automodule:: fastNLP
:members: Instance, FieldArray, DataSetIter, BatchIter, TorchLoaderIter, Vocabulary, DataSet, Const, Trainer, Tester, Callback, GradientClipCallback, EarlyStopCallback, FitlogCallback, EvaluateCallback, LRScheduler, ControlC, LRFinder, TensorboardCallback, WarmupCallback, SaveModelCallback, EchoCallback, CallbackException, EarlyStopError, Padder, AutoPadder, EngChar2DPadder, AccuracyMetric, SpanFPreRecMetric, ExtractiveQAMetric, Optimizer, SGD, Adam, AdamW, Sampler, SequentialSampler, BucketSampler, RandomSampler, LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, LossInForward, cache_results, logger
:members: Instance, FieldArray, DataSetIter, BatchIter, TorchLoaderIter, Vocabulary, DataSet, Const, Trainer, Tester, Callback, GradientClipCallback, EarlyStopCallback, FitlogCallback, EvaluateCallback, LRScheduler, ControlC, LRFinder, TensorboardCallback, WarmupCallback, SaveModelCallback, CallbackException, EarlyStopError, Padder, AutoPadder, EngChar2DPadder, AccuracyMetric, SpanFPreRecMetric, ExtractiveQAMetric, Optimizer, SGD, Adam, AdamW, Sampler, SequentialSampler, BucketSampler, RandomSampler, LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, LossInForward, cache_results, logger
:inherited-members: :inherited-members:


子模块 子模块


+ 111
- 46
docs/source/tutorials/tutorial_10_callback.rst View File

@@ -1,67 +1,132 @@
=================================================== ===================================================
使用Callback自定义你的训练过程
使用 Callback 自定义你的训练过程
=================================================== ===================================================


在训练时,我们常常要使用trick来提高模型的性能(如调节学习率),或者要打印训练中的信息。
这里我们提供Callback类,在Trainer中插入代码,完成一些自定义的操作。
- 什么是 Callback
- 使用 Callback
- 一些常用的 Callback
- 自定义实现 Callback


我们使用和 :doc:`/user/quickstart` 中一样的任务来进行详细的介绍。
给出一段评价性文字,预测其情感倾向是积极(label=1)、消极(label=0)还是中性(label=2),使用 :class:`~fastNLP.Trainer` 和 :class:`~fastNLP.Tester` 来进行快速训练和测试。
关于数据处理,Loss和Optimizer的选择可以看其他教程,这里仅在训练时加入学习率衰减。


什么是Callback
--------------------- ---------------------
Callback的构建和使用

Callback 是与 Trainer 紧密结合的模块,利用 Callback 可以在 Trainer 训练时,加入自定义的操作,比如梯度裁剪,学习率调节,测试模型的性能等。定义的 Callback 会在训练的特定阶段被调用。

fastNLP 中提供了很多常用的 Callback ,开箱即用。


使用 Callback
--------------------- ---------------------


创建Callback
我们可以继承fastNLP :class:`~fastNLP.Callback` 类来定义自己的Callback。
这里我们实现一个让学习率线性衰减的Callback。
使用 Callback 很简单,将需要的 callback 按 list 存储,以对应参数 ``callbacks`` 传入对应的 Trainer。Trainer 在训练时就会自动执行这些 Callback 指定的操作了。


.. code-block:: python

from fastNLP import (Callback, EarlyStopCallback,
Trainer, CrossEntropyLoss, AccuracyMetric)
from fastNLP.models import CNNText
import torch.cuda

# prepare data
def get_data():
    """Load the ChnSentiCorp data and return its splits and vocabularies.

    The data is processed by ``ChnSentiCorpPipe.process_from_file()`` called
    without a path, the resulting bundle is printed, and its ``chars`` field
    is renamed to ``words``.

    :return: a tuple ``(train_data, dev_data, test_data, vocab, tgt_vocab)``
        taken from the bundle's ``datasets`` (keys ``train``/``dev``/``test``)
        and ``vocabs`` (keys ``words``/``target``).
    """
    from fastNLP.io import ChnSentiCorpPipe

    bundle = ChnSentiCorpPipe().process_from_file()
    print(bundle)
    # Rename the field from 'chars' to 'words' before the splits are handed out.
    bundle.rename_field('chars', 'words')

    splits = bundle.datasets
    vocabs = bundle.vocabs
    return (splits['train'], splits['dev'], splits['test'],
            vocabs['words'], vocabs['target'])

# prepare model
train_data, dev_data, _, vocab, tgt_vocab = get_data()
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model = CNNText((len(vocab),50), num_classes=len(tgt_vocab))

# define callback
callbacks=[EarlyStopCallback(5)]

# pass callbacks to Trainer
def train_with_callback(cb_list):
    """Build a Trainer that uses the given callbacks and run training.

    The model, data sets and device are the module-level objects prepared
    earlier in this example script.

    :param cb_list: list of callback instances passed to the Trainer's
        ``callbacks`` argument.
    """
    trainer_kwargs = dict(
        device=device,
        n_epochs=3,
        model=model,
        train_data=train_data,
        dev_data=dev_data,
        loss=CrossEntropyLoss(),
        metrics=AccuracyMetric(),
        callbacks=cb_list,
        check_code_level=-1,
    )
    Trainer(**trainer_kwargs).train()


.. code-block:: python
train_with_callback(callbacks)


import fastNLP


class LRDecay(fastNLP.Callback):
def __init__(self):
super(LRDecay, self).__init__()
self.base_lrs = []
self.delta = []


def on_train_begin(self):
# 初始化,仅训练开始时调用
self.base_lrs = [pg['lr'] for pg in self.optimizer.param_groups]
self.delta = [float(lr) / self.n_epochs for lr in self.base_lrs]
fastNLP 中的 Callback
---------------------


def on_epoch_end(self):
# 每个epoch结束时,更新学习率
ep = self.epoch
lrs = [lr - d * ep for lr, d in zip(self.base_lrs, self.delta)]
self.change_lr(lrs)
fastNLP 中提供了很多常用的 Callback,如梯度裁剪,训练时早停和测试验证集,fitlog 等等。具体 Callback 请参考 fastNLP.core.callback


def change_lr(self, lrs):
for pg, lr in zip(self.optimizer.param_groups, lrs):
pg['lr'] = lr
.. code-block:: python


这里,:class:`~fastNLP.Callback` 中所有以 ``on_`` 开头的类方法会在 :class:`~fastNLP.Trainer` 的训练中在特定时间调用。
如 on_train_begin() 会在训练开始时被调用,on_epoch_end() 会在每个 epoch 结束时调用。
具体有哪些类方法,参见文档 :class:`~fastNLP.Callback` 。
from fastNLP import EarlyStopCallback, GradientClipCallback, EvaluateCallback
callbacks = [
EarlyStopCallback(5),
GradientClipCallback(clip_value=5, clip_type='value'),
EvaluateCallback(dev_data)
]


另外,为了使用方便,可以在 :class:`~fastNLP.Callback` 内部访问 :class:`~fastNLP.Trainer` 中的属性,如 optimizer, epoch, step,分别对应训练时的优化器,当前epoch数,和当前的总step数。
具体可访问的属性,参见文档 :class:`~fastNLP.Callback` 。
train_with_callback(callbacks)


使用Callback
在定义好 :class:`~fastNLP.Callback` 之后,就能将它传入Trainer的 ``callbacks`` 参数,在实际训练时使用。
自定义 Callback
---------------------


.. code-block:: python
这里我们以一个简单的 Callback作为例子,它的作用是打印每一个 Epoch 平均训练 loss。


"""
数据预处理,模型定义等等
"""
1. 创建 Callback
要自定义 Callback,我们要实现一个类,继承 fastNLP.Callback。这里我们定义 MyCallBack ,继承 fastNLP.Callback 。


trainer = fastNLP.Trainer(
model=model, train_data=train_data, dev_data=dev_data,
optimizer=optimizer, metrics=metrics,
batch_size=10, n_epochs=100,
callbacks=[LRDecay()])
2. 指定 Callback 调用的阶段
Callback 中所有以 ``on_`` 开头的类方法会在 Trainer 的训练中在特定阶段调用。 如 on_train_begin() 会在训练开始时被调用,on_epoch_end()
会在每个 epoch 结束时调用。 具体有哪些类方法,参见 Callback 文档。这里, MyCallBack 在求得loss时调用 on_backward_begin() 记录
当前 loss,在每一个 epoch 结束时调用 on_epoch_end() ,求当前 epoch 平均loss并输出。

3. 使用 Callback 的属性访问 Trainer 的内部信息
为了方便使用,可以使用 Callback 的属性,访问 Trainer 中的对应信息,如 optimizer, epoch, n_epochs,分别对应训练时的优化器,
当前 epoch 数,和总 epoch 数。 具体可访问的属性,参见文档 Callback 。这里, MyCallBack 为了求平均 loss ,需要知道当前 epoch 的总步
数,可以通过 self.step 属性得到当前训练了多少步。

.. code-block:: python

from fastNLP import Callback
from fastNLP import logger

class MyCallBack(Callback):
    """Print average loss in each epoch.

    The loss handed to ``on_backward_begin`` is summed up; at
    ``on_epoch_end`` the sum is divided by the number of steps taken since
    the previous epoch boundary (derived from ``self.step``) and logged.
    """

    def __init__(self):
        super().__init__()
        self.total_loss = 0  # sum of the losses seen in the current epoch
        self.start_step = 0  # value of self.step when the current epoch began

    def on_backward_begin(self, loss):
        # ``loss.item()`` is added to the running sum for this epoch.
        self.total_loss += loss.item()

    def on_epoch_end(self):
        n_steps = self.step - self.start_step
        # Skip the report when no step was counted, to avoid dividing by zero.
        if n_steps > 0:
            avg_loss = self.total_loss / n_steps
            logger.info('Avg loss at epoch %d, %.6f', self.epoch, avg_loss)
        # Start the next epoch from a clean accumulator; otherwise losses of
        # earlier epochs would be divided by only the latest epoch's steps.
        self.total_loss = 0
        self.start_step = self.step

callbacks = [MyCallBack()]
train_with_callback(callbacks)


trainer.train()

+ 1
- 1
docs/source/tutorials/tutorial_2_vocabulary.rst View File

@@ -86,7 +86,7 @@ Vocabulary
vocab.from_dataset(tr_data, field_name='chars', no_create_entry_dataset=[dev_data]) vocab.from_dataset(tr_data, field_name='chars', no_create_entry_dataset=[dev_data])




:class:`~fastNLP.Vocabulary` 中的 `no_create_entry` , 建议在添加来自于测试集和验证集的词的时候将该参数置为True, 或将验证集和测试集
:class:`~fastNLP.Vocabulary` 中的 `no_create_entry` , 建议在添加来自于测试集和验证集的词的时候将该参数置为True, 或将验证集和测试集
传入 `no_create_entry_dataset` 参数。它们的意义是在接下来的模型会使用pretrain的embedding(包括glove, word2vec, elmo与bert)且会finetune的 传入 `no_create_entry_dataset` 参数。它们的意义是在接下来的模型会使用pretrain的embedding(包括glove, word2vec, elmo与bert)且会finetune的
情况下,如果仅使用来自于train的数据建立vocabulary,会导致只出现在test与dev中的词语无法充分利用到来自于预训练embedding的信息(因为他们 情况下,如果仅使用来自于train的数据建立vocabulary,会导致只出现在test与dev中的词语无法充分利用到来自于预训练embedding的信息(因为他们
会被认为是unk),所以在建立词表的时候将test与dev考虑进来会使得最终的结果更好。通过与fastNLP中的各种Embedding配合使用,会有如下的效果, 会被认为是unk),所以在建立词表的时候将test与dev考虑进来会使得最终的结果更好。通过与fastNLP中的各种Embedding配合使用,会有如下的效果,


+ 8
- 7
docs/source/tutorials/tutorial_3_embedding.rst View File

@@ -187,7 +187,7 @@ BertEmbedding的使用
torch.Size([1, 7, 768]) torch.Size([1, 7, 768])


在英文Bert模型中,一个英文单词可能会被切分为多个subword,例如"fairness"会被拆分为 ``["fair", "##ness"]`` ,这样一个word对应的将有两个输出, 在英文Bert模型中,一个英文单词可能会被切分为多个subword,例如"fairness"会被拆分为 ``["fair", "##ness"]`` ,这样一个word对应的将有两个输出,
:class:`~fastNLP.embeddings.BertEmbedding` 会使用pooling方法将一个word的subword的表示合并成一个vector,通过pool_method可以控制
:class:`~fastNLP.embeddings.BertEmbedding` 会使用pooling方法将一个word的subword的表示合并成一个vector,通过pool_method可以控制
该pooling方法,支持的有"first"(即使用fair的表示作为fairness的表示), "last"(使用##ness的表示作为fairness的表示), "max"(对fair和 该pooling方法,支持的有"first"(即使用fair的表示作为fairness的表示), "last"(使用##ness的表示作为fairness的表示), "max"(对fair和
##ness在每一维上做max),"avg"(对fair和##ness每一维做average)。 ##ness在每一维上做max),"avg"(对fair和##ness每一维做average)。


@@ -200,8 +200,8 @@ BertEmbedding的使用


torch.Size([1, 5, 768]) torch.Size([1, 5, 768])


另外,根据 `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding
<https://arxiv.org/abs/1810.04805>`_ ,Bert在针对具有两句话的任务时(如matching,Q&A任务),句子之间通过[SEP]拼接起来,前一句话的token embedding为0,
另外,根据 `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`_ ,
Bert在针对具有两句话的任务时(如matching,Q&A任务),句子之间通过[SEP]拼接起来,前一句话的token embedding为0,
后一句话的token embedding为1。BertEmbedding能够自动识别句子中间的[SEP]来正确设置对应的token_type_id的。 后一句话的token embedding为1。BertEmbedding能够自动识别句子中间的[SEP]来正确设置对应的token_type_id的。


.. code-block:: python .. code-block:: python
@@ -230,7 +230,7 @@ Part VI: 使用character-level的embedding
----------------------------------------------------- -----------------------------------------------------


除了预训练的embedding以外,fastNLP还提供了两种Character Embedding: :class:`~fastNLP.embeddings.CNNCharEmbedding` 和 除了预训练的embedding以外,fastNLP还提供了两种Character Embedding: :class:`~fastNLP.embeddings.CNNCharEmbedding` 和
:class:`~fastNLP.embeddings.LSTMCharEmbedding` 。一般在使用character embedding时,需要在预处理的时候将word拆分成character,这
:class:`~fastNLP.embeddings.LSTMCharEmbedding` 。一般在使用character embedding时,需要在预处理的时候将word拆分成character,这
会使得预处理过程变得非常繁琐。在fastNLP中,使用character embedding也只需要传入 :class:`~fastNLP.Vocabulary` 即可,而且该 会使得预处理过程变得非常繁琐。在fastNLP中,使用character embedding也只需要传入 :class:`~fastNLP.Vocabulary` 即可,而且该
Vocabulary与其它Embedding使用的Vocabulary是一致的,下面我们看两个例子。 Vocabulary与其它Embedding使用的Vocabulary是一致的,下面我们看两个例子。


@@ -298,11 +298,12 @@ Part VII: 叠加使用多个embedding


torch.Size([1, 5, 114]) torch.Size([1, 5, 114])


:class:`~fastNLP.embeddings.StaticEmbedding` , :class:`~fastNLP.embeddings.ElmoEmbedding` ,
:class:`~fastNLP.embeddings.CNNCharEmbedding` , :class:`~fastNLP.embeddings.BertEmbedding` 等都可以互相拼接。
:class:`~fastNLP.embeddings.StackEmbedding` 的使用也是和其它Embedding是一致的,即输出index返回对应的表示。但能够拼接起来的Embedding
:class:`~fastNLP.embeddings.StaticEmbedding` , :class:`~fastNLP.embeddings.ElmoEmbedding` ,
:class:`~fastNLP.embeddings.CNNCharEmbedding` , :class:`~fastNLP.embeddings.BertEmbedding` 等都可以互相拼接。
:class:`~fastNLP.embeddings.StackEmbedding` 的使用也是和其它Embedding是一致的,即输入index返回对应的表示。但能够拼接起来的Embedding
必须使用同样的 :class:`~fastNLP.Vocabulary` ,因为只有使用同样的 :class:`~fastNLP.Vocabulary` 才能保证同一个index指向的是同一个词或字 必须使用同样的 :class:`~fastNLP.Vocabulary` ,因为只有使用同样的 :class:`~fastNLP.Vocabulary` 才能保证同一个index指向的是同一个词或字



----------------------------------------------------------- -----------------------------------------------------------
Part VIII: Embedding的其它说明 Part VIII: Embedding的其它说明
----------------------------------------------------------- -----------------------------------------------------------


+ 1
- 1
docs/source/tutorials/tutorial_4_load_dataset.rst View File

@@ -20,7 +20,7 @@ Part I: 数据集容器DataBundle
来承载同一个任务的多个数据集 :class:`~fastNLP.DataSet` 以及它们的词表 :class:`~fastNLP.Vocabulary` 。下面会有例子介绍 :class:`~fastNLP.io.DataBundle` 来承载同一个任务的多个数据集 :class:`~fastNLP.DataSet` 以及它们的词表 :class:`~fastNLP.Vocabulary` 。下面会有例子介绍 :class:`~fastNLP.io.DataBundle`
的相关使用。 的相关使用。


:class:`~fastNLP.io.DataBundle` 在fastNLP中主要在各个 :class:`~fastNLP.io.Loader` 和 :class:`~fastNLP.io.Pipe` 中被使用。
:class:`~fastNLP.io.DataBundle` 在fastNLP中主要在各个 :class:`~fastNLP.io.Loader` 和 :class:`~fastNLP.io.Pipe` 中被使用。
下面我们先介绍一下 :class:`~fastNLP.io.Loader` 和 :class:`~fastNLP.io.Pipe` 。 下面我们先介绍一下 :class:`~fastNLP.io.Loader` 和 :class:`~fastNLP.io.Pipe` 。


Part II: 加载的各种数据集的Loader Part II: 加载的各种数据集的Loader


+ 36
- 7
fastNLP/io/__init__.py View File

@@ -47,7 +47,7 @@ __all__ = [
"SNLILoader", "SNLILoader",
"QNLILoader", "QNLILoader",
"RTELoader", "RTELoader",
"XNLILoader",
"CNXNLILoader",
"BQCorpusLoader", "BQCorpusLoader",
"LCQMCLoader", "LCQMCLoader",


@@ -70,32 +70,61 @@ __all__ = [
"WeiboNERPipe", "WeiboNERPipe",


"CWSPipe", "CWSPipe",

"Pipe",
"CWSPipe",
"YelpFullPipe",
"YelpPolarityPipe",
"SSTPipe",
"SST2Pipe",
"IMDBPipe",
"ChnSentiCorpPipe",
"THUCNewsPipe",
"WeiboSenti100kPipe",
"Conll2003NERPipe",
"OntoNotesNERPipe",
"MsraNERPipe",
"WeiboNERPipe",
"PeopleDailyPipe",
"Conll2003Pipe",
"MatchingBertPipe", "MatchingBertPipe",
"RTEBertPipe", "RTEBertPipe",
"SNLIBertPipe", "SNLIBertPipe",
"QuoraBertPipe", "QuoraBertPipe",
"QNLIBertPipe", "QNLIBertPipe",
"MNLIBertPipe", "MNLIBertPipe",
"CNXNLIBertPipe",
"BQCorpusBertPipe",
"LCQMCBertPipe",
"MatchingPipe", "MatchingPipe",
"RTEPipe", "RTEPipe",
"SNLIPipe", "SNLIPipe",
"QuoraPipe", "QuoraPipe",
"QNLIPipe", "QNLIPipe",
"MNLIPipe", "MNLIPipe",
"LCQMCPipe",
"CNXNLIPipe",
"BQCorpusPipe",
"RenamePipe",
"GranularizePipe",
"MachingTruncatePipe",


'ModelLoader', 'ModelLoader',
'ModelSaver', 'ModelSaver',


] ]


from .embed_loader import EmbedLoader
from .data_bundle import DataBundle
from .model_io import ModelLoader, ModelSaver
import sys


from .data_bundle import DataBundle
from .embed_loader import EmbedLoader
from .loader import * from .loader import *
from .model_io import ModelLoader, ModelSaver
from .pipe import * from .pipe import *

import sys
from ..doc_utils import doc_process from ..doc_utils import doc_process

doc_process(sys.modules[__name__]) doc_process(sys.modules[__name__])

+ 16
- 9
fastNLP/io/loader/__init__.py View File

@@ -54,7 +54,9 @@ __all__ = [
'SSTLoader', 'SSTLoader',
'SST2Loader', 'SST2Loader',
"ChnSentiCorpLoader", "ChnSentiCorpLoader",

"THUCNewsLoader",
"WeiboSenti100kLoader",
'ConllLoader', 'ConllLoader',
'Conll2003Loader', 'Conll2003Loader',
'Conll2003NERLoader', 'Conll2003NERLoader',
@@ -63,26 +65,31 @@ __all__ = [
"MsraNERLoader", "MsraNERLoader",
"PeopleDailyNERLoader", "PeopleDailyNERLoader",
"WeiboNERLoader", "WeiboNERLoader",
'CSVLoader', 'CSVLoader',
'JsonLoader', 'JsonLoader',
'CWSLoader', 'CWSLoader',
'MNLILoader', 'MNLILoader',
"QuoraLoader", "QuoraLoader",
"SNLILoader", "SNLILoader",
"QNLILoader", "QNLILoader",
"RTELoader", "RTELoader",

"CNXNLILoader",
"BQCorpusLoader",
"LCQMCLoader",
"CoReferenceLoader" "CoReferenceLoader"
] ]
from .classification import YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, ChnSentiCorpLoader
from .classification import YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, \
ChnSentiCorpLoader, THUCNewsLoader, WeiboSenti100kLoader
from .conll import ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader from .conll import ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader
from .conll import MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader
from .coreference import CoReferenceLoader
from .csv import CSVLoader from .csv import CSVLoader
from .cws import CWSLoader from .cws import CWSLoader
from .json import JsonLoader from .json import JsonLoader
from .loader import Loader from .loader import Loader
from .matching import MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader
from .conll import MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader
from .coreference import CoReferenceLoader
from .matching import MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader, CNXNLILoader, BQCorpusLoader, \
LCQMCLoader

+ 9
- 3
fastNLP/io/loader/classification.py View File

@@ -409,6 +409,7 @@ class THUCNewsLoader(Loader):


.. csv-table:: .. csv-table::
:header: "raw_words", "target" :header: "raw_words", "target"
"马晓旭意外受伤让国奥警惕 无奈大雨格外青睐殷家军记者傅亚雨沈阳报道 ... ", "体育" "马晓旭意外受伤让国奥警惕 无奈大雨格外青睐殷家军记者傅亚雨沈阳报道 ... ", "体育"
"...", "..." "...", "..."


@@ -446,13 +447,18 @@ class WeiboSenti100kLoader(Loader):
别名: 别名:
数据集简介:微博sentiment classification,二分类 数据集简介:微博sentiment classification,二分类
原始数据内容为: 原始数据内容为:
label text
0 六一出生的?好讽刺…… //@祭春姬:他爸爸是外星人吧 //@面孔小高:现在的孩子都怎么了 [怒][怒][怒]
1 听过一场!笑死了昂,一听茄子脱口秀,从此节操是路人![嘻嘻] //@中国梦网官微:@Pencil彭赛 @茄子脱口秀 [圣诞帽][圣诞树][平安果]
.. .. code-block:: text
label text
0 六一出生的?好讽刺…… //@祭春姬:他爸爸是外星人吧 //@面孔小高:现在的孩子都怎么了 [怒][怒][怒]
1 听过一场!笑死了昂,一听茄子脱口秀,从此节操是路人![嘻嘻] //@中国梦网官微:@Pencil彭赛 @茄子脱口秀 [圣诞帽][圣诞树][平安果]
读取后的Dataset将具有以下数据结构: 读取后的Dataset将具有以下数据结构:


.. csv-table:: .. csv-table::
:header: "raw_chars", "target" :header: "raw_chars", "target"
"六一出生的?好讽刺…… //@祭春姬:他爸爸是外星人吧 //@面孔小高:现在的孩子都怎么了 [怒][怒][怒]", "0" "六一出生的?好讽刺…… //@祭春姬:他爸爸是外星人吧 //@面孔小高:现在的孩子都怎么了 [怒][怒][怒]", "0"
"...", "..." "...", "..."




+ 19
- 11
fastNLP/io/loader/matching.py View File

@@ -15,14 +15,14 @@ import os
import warnings import warnings
from typing import Union, Dict from typing import Union, Dict


from .csv import CSVLoader
from .json import JsonLoader from .json import JsonLoader
from .loader import Loader from .loader import Loader
from .. import DataBundle from .. import DataBundle
from ..utils import check_loader_paths
from ...core.const import Const from ...core.const import Const
from ...core.dataset import DataSet from ...core.dataset import DataSet
from ...core.instance import Instance from ...core.instance import Instance
from .csv import CSVLoader
from ..utils import check_loader_paths




class MNLILoader(Loader): class MNLILoader(Loader):
@@ -348,8 +348,9 @@ class CNXNLILoader(Loader):


.. csv-table:: .. csv-table::
:header: "raw_chars1", "raw_chars2", "target" :header: "raw_chars1", "raw_chars2", "target"
"从概念上看,奶油收入有两个基本方面产品和地理.", "产品和地理是什么使奶油抹霜工作.", "1" "从概念上看,奶油收入有两个基本方面产品和地理.", "产品和地理是什么使奶油抹霜工作.", "1"
""...", "...", "..."
"...", "...", "..."


""" """


@@ -412,6 +413,7 @@ class BQCorpusLoader(Loader):


.. csv-table:: .. csv-table::
:header: "raw_chars1", "raw_chars2", "target" :header: "raw_chars1", "raw_chars2", "target"
"不是邀请的如何贷款?", "我不是你们邀请的客人可以贷款吗?", "1" "不是邀请的如何贷款?", "我不是你们邀请的客人可以贷款吗?", "1"
"如何满足微粒银行的审核", "建设银行有微粒贷的资格吗", "0" "如何满足微粒银行的审核", "建设银行有微粒贷的资格吗", "0"
"...", "...", "..." "...", "...", "..."
@@ -448,20 +450,26 @@ class BQCorpusLoader(Loader):




class LCQMCLoader(Loader): class LCQMCLoader(Loader):
"""
别名:
r"""
数据集简介:句对匹配(question matching) 数据集简介:句对匹配(question matching)
原始数据为: 原始数据为:
'喜欢打篮球的男生喜欢什么样的女生\t爱打篮球的男生喜欢什么样的女生\t1\n'
'晚上睡觉带着耳机听音乐有什么害处吗?\t孕妇可以戴耳机听音乐吗?\t0\n'
读取后的Dataset将具有以下的数据结构:

.. code-block:: text
'喜欢打篮球的男生喜欢什么样的女生\t爱打篮球的男生喜欢什么样的女生\t1\n'
'晚上睡觉带着耳机听音乐有什么害处吗?\t孕妇可以戴耳机听音乐吗?\t0\n'
读取后的Dataset将具有以下的数据结构
.. csv-table:: .. csv-table::
:header: "raw_chars1", "raw_chars2", "target" :header: "raw_chars1", "raw_chars2", "target"
"喜欢打篮球的男生喜欢什么样的女生?", "爱打篮球的男生喜欢什么样的女生?", "1" "喜欢打篮球的男生喜欢什么样的女生?", "爱打篮球的男生喜欢什么样的女生?", "1"
"晚上睡觉带着耳机听音乐有什么害处吗?", "妇可以戴耳机听音乐吗?", "0" "晚上睡觉带着耳机听音乐有什么害处吗?", "妇可以戴耳机听音乐吗?", "0"
""...", "...", "..."

"...", "...", "..."
""" """


def __init__(self): def __init__(self):


+ 21
- 10
fastNLP/io/pipe/__init__.py View File

@@ -9,9 +9,9 @@ Pipe用于处理通过 Loader 读取的数据,所有的 Pipe 都包含 ``proce
""" """
__all__ = [ __all__ = [
"Pipe", "Pipe",
"CWSPipe", "CWSPipe",
"YelpFullPipe", "YelpFullPipe",
"YelpPolarityPipe", "YelpPolarityPipe",
"SSTPipe", "SSTPipe",
@@ -20,35 +20,46 @@ __all__ = [
"ChnSentiCorpPipe", "ChnSentiCorpPipe",
"THUCNewsPipe", "THUCNewsPipe",
"WeiboSenti100kPipe", "WeiboSenti100kPipe",
"Conll2003NERPipe", "Conll2003NERPipe",
"OntoNotesNERPipe", "OntoNotesNERPipe",
"MsraNERPipe", "MsraNERPipe",
"WeiboNERPipe", "WeiboNERPipe",
"PeopleDailyPipe", "PeopleDailyPipe",
"Conll2003Pipe", "Conll2003Pipe",
"MatchingBertPipe", "MatchingBertPipe",
"RTEBertPipe", "RTEBertPipe",
"SNLIBertPipe", "SNLIBertPipe",
"QuoraBertPipe", "QuoraBertPipe",
"QNLIBertPipe", "QNLIBertPipe",
"MNLIBertPipe", "MNLIBertPipe",
"CNXNLIBertPipe",
"BQCorpusBertPipe",
"LCQMCBertPipe",
"MatchingPipe", "MatchingPipe",
"RTEPipe", "RTEPipe",
"SNLIPipe", "SNLIPipe",
"QuoraPipe", "QuoraPipe",
"QNLIPipe", "QNLIPipe",
"MNLIPipe", "MNLIPipe",

"LCQMCPipe",
"CNXNLIPipe",
"BQCorpusPipe",
"RenamePipe",
"GranularizePipe",
"MachingTruncatePipe",
"CoReferencePipe" "CoReferencePipe"
] ]


from .classification import YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, THUCNewsPipe, WeiboSenti100kPipe
from .classification import YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, THUCNewsPipe, \
WeiboSenti100kPipe
from .conll import Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, WeiboNERPipe, PeopleDailyPipe from .conll import Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, WeiboNERPipe, PeopleDailyPipe
from .matching import MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, \
MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe
from .pipe import Pipe
from .conll import Conll2003Pipe from .conll import Conll2003Pipe
from .cws import CWSPipe
from .coreference import CoReferencePipe from .coreference import CoReferencePipe
from .cws import CWSPipe
from .matching import MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, \
MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe, CNXNLIBertPipe, CNXNLIPipe, BQCorpusBertPipe, \
LCQMCPipe, BQCorpusPipe, LCQMCBertPipe, RenamePipe, GranularizePipe, MachingTruncatePipe
from .pipe import Pipe

+ 3
- 1
fastNLP/io/pipe/classification.py View File

@@ -21,11 +21,11 @@ from .utils import get_tokenizer, _indexize, _add_words_field, _drop_empty_insta
from ..data_bundle import DataBundle from ..data_bundle import DataBundle
from ..loader.classification import ChnSentiCorpLoader, THUCNewsLoader, WeiboSenti100kLoader from ..loader.classification import ChnSentiCorpLoader, THUCNewsLoader, WeiboSenti100kLoader
from ..loader.classification import IMDBLoader, YelpFullLoader, SSTLoader, SST2Loader, YelpPolarityLoader from ..loader.classification import IMDBLoader, YelpFullLoader, SSTLoader, SST2Loader, YelpPolarityLoader
from ...core._logger import logger
from ...core.const import Const from ...core.const import Const
from ...core.dataset import DataSet from ...core.dataset import DataSet
from ...core.instance import Instance from ...core.instance import Instance
from ...core.vocabulary import Vocabulary from ...core.vocabulary import Vocabulary
from ...core._logger import logger


nonalpnum = re.compile('[^0-9a-zA-Z?!\']+') nonalpnum = re.compile('[^0-9a-zA-Z?!\']+')


@@ -718,6 +718,7 @@ class THUCNewsPipe(_CLSPipe):


.. csv-table:: .. csv-table::
:header: "raw_words", "target" :header: "raw_words", "target"
"马晓旭意外受伤让国奥警惕 无奈大雨格外青睐殷家军记者傅亚雨沈阳报道 ... ", "体育" "马晓旭意外受伤让国奥警惕 无奈大雨格外青睐殷家军记者傅亚雨沈阳报道 ... ", "体育"
"...", "..." "...", "..."


@@ -826,6 +827,7 @@ class WeiboSenti100kPipe(_CLSPipe):


.. csv-table:: .. csv-table::
:header: "raw_chars", "target" :header: "raw_chars", "target"
"六一出生的?好讽刺…… //@祭春姬:他爸爸是外星人吧 //@面孔小高:现在的孩子都怎么了 [怒][怒][怒]", "0" "六一出生的?好讽刺…… //@祭春姬:他爸爸是外星人吧 //@面孔小高:现在的孩子都怎么了 [怒][怒][怒]", "0"
"...", "..." "...", "..."




+ 30
- 24
fastNLP/io/pipe/matching.py View File

@@ -16,20 +16,24 @@ __all__ = [
"QuoraPipe", "QuoraPipe",
"QNLIPipe", "QNLIPipe",
"MNLIPipe", "MNLIPipe",
"LCQMCPipe",
"CNXNLIPipe", "CNXNLIPipe",
"BQCorpusPipe", "BQCorpusPipe",
"LCQMCPipe",
"RenamePipe",
"GranularizePipe",
"MachingTruncatePipe",
] ]


import warnings import warnings


from .pipe import Pipe from .pipe import Pipe
from .utils import get_tokenizer from .utils import get_tokenizer
from ..loader.matching import SNLILoader, MNLILoader, QNLILoader, RTELoader, QuoraLoader, BQCorpusLoader, CNXNLILoader, LCQMCLoader
from ..data_bundle import DataBundle
from ..loader.matching import SNLILoader, MNLILoader, QNLILoader, RTELoader, QuoraLoader, BQCorpusLoader, CNXNLILoader, \
LCQMCLoader
from ...core._logger import logger
from ...core.const import Const from ...core.const import Const
from ...core.vocabulary import Vocabulary from ...core.vocabulary import Vocabulary
from ...core._logger import logger
from ..data_bundle import DataBundle




class MatchingBertPipe(Pipe): class MatchingBertPipe(Pipe):
@@ -145,7 +149,7 @@ class MatchingBertPipe(Pipe):
f"data set but not in train data set!." f"data set but not in train data set!."
warnings.warn(warn_msg) warnings.warn(warn_msg)
logger.warning(warn_msg) logger.warning(warn_msg)
has_target_datasets = [dataset for name, dataset in data_bundle.datasets.items() if has_target_datasets = [dataset for name, dataset in data_bundle.datasets.items() if
dataset.has_field(Const.TARGET)] dataset.has_field(Const.TARGET)]
target_vocab.index_dataset(*has_target_datasets, field_name=Const.TARGET) target_vocab.index_dataset(*has_target_datasets, field_name=Const.TARGET)
@@ -294,7 +298,7 @@ class MatchingPipe(Pipe):
f"data set but not in train data set!." f"data set but not in train data set!."
warnings.warn(warn_msg) warnings.warn(warn_msg)
logger.warning(warn_msg) logger.warning(warn_msg)
has_target_datasets = [dataset for name, dataset in data_bundle.datasets.items() if has_target_datasets = [dataset for name, dataset in data_bundle.datasets.items() if
dataset.has_field(Const.TARGET)] dataset.has_field(Const.TARGET)]
target_vocab.index_dataset(*has_target_datasets, field_name=Const.TARGET) target_vocab.index_dataset(*has_target_datasets, field_name=Const.TARGET)
@@ -345,8 +349,9 @@ class MNLIPipe(MatchingPipe):
data_bundle = MNLILoader().load(paths) data_bundle = MNLILoader().load(paths)
return self.process(data_bundle) return self.process(data_bundle)



class LCQMCPipe(MatchingPipe): class LCQMCPipe(MatchingPipe):
def process_from_file(self, paths = None):
def process_from_file(self, paths=None):
data_bundle = LCQMCLoader().load(paths) data_bundle = LCQMCLoader().load(paths)
data_bundle = RenamePipe().process(data_bundle) data_bundle = RenamePipe().process(data_bundle)
data_bundle = self.process(data_bundle) data_bundle = self.process(data_bundle)
@@ -358,14 +363,14 @@ class CNXNLIPipe(MatchingPipe):
def process_from_file(self, paths=None): def process_from_file(self, paths=None):
data_bundle = CNXNLILoader().load(paths) data_bundle = CNXNLILoader().load(paths)
data_bundle = GranularizePipe(task='XNLI').process(data_bundle) data_bundle = GranularizePipe(task='XNLI').process(data_bundle)
data_bundle = RenamePipe().process(data_bundle) #使中文数据的field
data_bundle = RenamePipe().process(data_bundle) # 使中文数据的field
data_bundle = self.process(data_bundle) data_bundle = self.process(data_bundle)
data_bundle = RenamePipe().process(data_bundle) data_bundle = RenamePipe().process(data_bundle)
return data_bundle return data_bundle




class BQCorpusPipe(MatchingPipe): class BQCorpusPipe(MatchingPipe):
def process_from_file(self, paths = None):
def process_from_file(self, paths=None):
data_bundle = BQCorpusLoader().load(paths) data_bundle = BQCorpusLoader().load(paths)
data_bundle = RenamePipe().process(data_bundle) data_bundle = RenamePipe().process(data_bundle)
data_bundle = self.process(data_bundle) data_bundle = self.process(data_bundle)
@@ -374,12 +379,12 @@ class BQCorpusPipe(MatchingPipe):




class RenamePipe(Pipe): class RenamePipe(Pipe):
def __init__(self, task = 'cn-nli'):
def __init__(self, task='cn-nli'):
super().__init__() super().__init__()
self.task = task self.task = task
def process(self, data_bundle: DataBundle): # rename field name for Chinese Matching dataset def process(self, data_bundle: DataBundle): # rename field name for Chinese Matching dataset
if(self.task == 'cn-nli'):
if (self.task == 'cn-nli'):
for name, dataset in data_bundle.datasets.items(): for name, dataset in data_bundle.datasets.items():
if (dataset.has_field(Const.RAW_CHARS(0))): if (dataset.has_field(Const.RAW_CHARS(0))):
dataset.rename_field(Const.RAW_CHARS(0), Const.RAW_WORDS(0)) # RAW_CHARS->RAW_WORDS dataset.rename_field(Const.RAW_CHARS(0), Const.RAW_WORDS(0)) # RAW_CHARS->RAW_WORDS
@@ -392,12 +397,12 @@ class RenamePipe(Pipe):
else: else:
raise RuntimeError( raise RuntimeError(
"field name of dataset is not qualified. It should have ether RAW_CHARS or WORDS") "field name of dataset is not qualified. It should have ether RAW_CHARS or WORDS")
elif(self.task == 'cn-nli-bert'):
elif (self.task == 'cn-nli-bert'):
for name, dataset in data_bundle.datasets.items(): for name, dataset in data_bundle.datasets.items():
if (dataset.has_field(Const.RAW_CHARS(0))): if (dataset.has_field(Const.RAW_CHARS(0))):
dataset.rename_field(Const.RAW_CHARS(0), Const.RAW_WORDS(0)) # RAW_CHARS->RAW_WORDS dataset.rename_field(Const.RAW_CHARS(0), Const.RAW_WORDS(0)) # RAW_CHARS->RAW_WORDS
dataset.rename_field(Const.RAW_CHARS(1), Const.RAW_WORDS(1)) dataset.rename_field(Const.RAW_CHARS(1), Const.RAW_WORDS(1))
elif(dataset.has_field(Const.RAW_WORDS(0))):
elif (dataset.has_field(Const.RAW_WORDS(0))):
dataset.rename_field(Const.RAW_WORDS(0), Const.RAW_CHARS(0)) dataset.rename_field(Const.RAW_WORDS(0), Const.RAW_CHARS(0))
dataset.rename_field(Const.RAW_WORDS(1), Const.RAW_CHARS(1)) dataset.rename_field(Const.RAW_WORDS(1), Const.RAW_CHARS(1))
dataset.rename_field(Const.INPUT, Const.CHAR_INPUT) dataset.rename_field(Const.INPUT, Const.CHAR_INPUT)
@@ -409,15 +414,15 @@ class RenamePipe(Pipe):
raise RuntimeError( raise RuntimeError(
"Only support task='cn-nli' or 'cn-nli-bert'" "Only support task='cn-nli' or 'cn-nli-bert'"
) )
return data_bundle return data_bundle




class GranularizePipe(Pipe): class GranularizePipe(Pipe):
def __init__(self, task = None):
def __init__(self, task=None):
super().__init__() super().__init__()
self.task = task self.task = task
def _granularize(self, data_bundle, tag_map): def _granularize(self, data_bundle, tag_map):
""" """
该函数对data_bundle中'target'列中的内容进行转换。 该函数对data_bundle中'target'列中的内容进行转换。
@@ -434,21 +439,22 @@ class GranularizePipe(Pipe):
dataset.drop(lambda ins: ins[Const.TARGET] == -100) dataset.drop(lambda ins: ins[Const.TARGET] == -100)
data_bundle.set_dataset(dataset, name) data_bundle.set_dataset(dataset, name)
return data_bundle return data_bundle
def process(self, data_bundle: DataBundle): def process(self, data_bundle: DataBundle):
task_tag_dict = { task_tag_dict = {
'XNLI':{'neutral': 0, 'entailment': 1, 'contradictory': 2, 'contradiction': 2}
'XNLI': {'neutral': 0, 'entailment': 1, 'contradictory': 2, 'contradiction': 2}
} }
if self.task in task_tag_dict: if self.task in task_tag_dict:
data_bundle = self._granularize(data_bundle=data_bundle, tag_map= task_tag_dict[self.task])
data_bundle = self._granularize(data_bundle=data_bundle, tag_map=task_tag_dict[self.task])
else: else:
raise RuntimeError(f"Only support {task_tag_dict.keys()} task_tag_map.") raise RuntimeError(f"Only support {task_tag_dict.keys()} task_tag_map.")
return data_bundle return data_bundle




class MachingTruncatePipe(Pipe): #truncate sentence for bert, modify seq_len
class MachingTruncatePipe(Pipe): # truncate sentence for bert, modify seq_len
def __init__(self): def __init__(self):
super().__init__() super().__init__()
def process(self, data_bundle: DataBundle): def process(self, data_bundle: DataBundle):
for name, dataset in data_bundle.datasets.items(): for name, dataset in data_bundle.datasets.items():
pass pass
@@ -456,7 +462,7 @@ class MachingTruncatePipe(Pipe): #truncate sentence for bert, modify seq_len




class LCQMCBertPipe(MatchingBertPipe): class LCQMCBertPipe(MatchingBertPipe):
def process_from_file(self, paths = None):
def process_from_file(self, paths=None):
data_bundle = LCQMCLoader().load(paths) data_bundle = LCQMCLoader().load(paths)
data_bundle = RenamePipe(task='cn-nli-bert').process(data_bundle) data_bundle = RenamePipe(task='cn-nli-bert').process(data_bundle)
data_bundle = self.process(data_bundle) data_bundle = self.process(data_bundle)
@@ -465,7 +471,7 @@ class LCQMCBertPipe(MatchingBertPipe):




class BQCorpusBertPipe(MatchingBertPipe): class BQCorpusBertPipe(MatchingBertPipe):
def process_from_file(self, paths = None):
def process_from_file(self, paths=None):
data_bundle = BQCorpusLoader().load(paths) data_bundle = BQCorpusLoader().load(paths)
data_bundle = RenamePipe(task='cn-nli-bert').process(data_bundle) data_bundle = RenamePipe(task='cn-nli-bert').process(data_bundle)
data_bundle = self.process(data_bundle) data_bundle = self.process(data_bundle)
@@ -474,7 +480,7 @@ class BQCorpusBertPipe(MatchingBertPipe):




class CNXNLIBertPipe(MatchingBertPipe): class CNXNLIBertPipe(MatchingBertPipe):
def process_from_file(self, paths = None):
def process_from_file(self, paths=None):
data_bundle = CNXNLILoader().load(paths) data_bundle = CNXNLILoader().load(paths)
data_bundle = GranularizePipe(task='XNLI').process(data_bundle) data_bundle = GranularizePipe(task='XNLI').process(data_bundle)
data_bundle = RenamePipe(task='cn-nli-bert').process(data_bundle) data_bundle = RenamePipe(task='cn-nli-bert').process(data_bundle)


+ 1
- 2
fastNLP/modules/encoder/attention.py View File

@@ -152,8 +152,7 @@ class BiAttention(nn.Module):
:param torch.Tensor premise_mask: [batch_size, a_seq_len] :param torch.Tensor premise_mask: [batch_size, a_seq_len]
:param torch.Tensor hypothesis_batch: [batch_size, b_seq_len, hidden_size] :param torch.Tensor hypothesis_batch: [batch_size, b_seq_len, hidden_size]
:param torch.Tensor hypothesis_mask: [batch_size, b_seq_len] :param torch.Tensor hypothesis_mask: [batch_size, b_seq_len]
:return: torch.Tensor attended_premises: [batch_size, a_seq_len, hidden_size]
torch.Tensor attended_hypotheses: [batch_size, b_seq_len, hidden_size]
:return: torch.Tensor attended_premises: [batch_size, a_seq_len, hidden_size] torch.Tensor attended_hypotheses: [batch_size, b_seq_len, hidden_size]
""" """
similarity_matrix = premise_batch.bmm(hypothesis_batch.transpose(2, 1) similarity_matrix = premise_batch.bmm(hypothesis_batch.transpose(2, 1)
.contiguous()) .contiguous())


+ 0
- 280
tutorials/quickstart.ipynb View File

@@ -1,280 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"# 快速入门"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n",
"'label': 1 type=str}"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP.io import CSVLoader\n",
"\n",
"loader = CSVLoader(headers=('raw_sentence', 'label'), sep='\\t')\n",
"dataset = loader.load(\"./sample_data/tutorial_sample_dataset.csv\")\n",
"dataset[0]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n",
"'label': 1 type=str,\n",
"'sentence': a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n",
"'words': ['a', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose', 'is', 'also', 'good', 'for', 'the', 'gander', ',', 'some', 'of', 'which', 'occasionally', 'amuses', 'but', 'none', 'of', 'which', 'amounts', 'to', 'much', 'of', 'a', 'story', '.'] type=list}"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 将所有字母转为小写, 并将所有句子变成单词序列\n",
"dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')\n",
"dataset.apply(lambda x: x['sentence'].split(), new_field_name='words', is_input=True)\n",
"dataset[0]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n",
"'label': 1 type=str,\n",
"'sentence': a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n",
"'words': [4, 1, 6, 1, 1, 2, 1, 11, 153, 10, 28, 17, 2, 1, 10, 1, 28, 17, 2, 1, 5, 154, 6, 149, 1, 1, 23, 1, 6, 149, 1, 8, 30, 6, 4, 35, 3] type=list}"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Vocabulary\n",
"\n",
"# 使用Vocabulary类统计单词,并将单词序列转化为数字序列\n",
"vocab = Vocabulary(min_freq=2).from_dataset(dataset, field_name='words')\n",
"vocab.index_dataset(dataset, field_name='words',new_field_name='words')\n",
"dataset[0]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n",
"'label': 1 type=str,\n",
"'sentence': a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n",
"'words': [4, 1, 6, 1, 1, 2, 1, 11, 153, 10, 28, 17, 2, 1, 10, 1, 28, 17, 2, 1, 5, 154, 6, 149, 1, 1, 23, 1, 6, 149, 1, 8, 30, 6, 4, 35, 3] type=list,\n",
"'target': 1 type=int}"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 将label转为整数,并设置为 target\n",
"dataset.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)\n",
"dataset[0]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"CNNText(\n",
" (embed): Embedding(\n",
" 177, 50\n",
" (dropout): Dropout(p=0.0)\n",
" )\n",
" (conv_pool): ConvMaxpool(\n",
" (convs): ModuleList(\n",
" (0): Conv1d(50, 3, kernel_size=(3,), stride=(1,), padding=(2,))\n",
" (1): Conv1d(50, 4, kernel_size=(4,), stride=(1,), padding=(2,))\n",
" (2): Conv1d(50, 5, kernel_size=(5,), stride=(1,), padding=(2,))\n",
" )\n",
" )\n",
" (dropout): Dropout(p=0.1)\n",
" (fc): Linear(in_features=12, out_features=5, bias=True)\n",
")"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP.models import CNNText\n",
"model = CNNText((len(vocab),50), num_classes=5, padding=2, dropout=0.1)\n",
"model"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(62, 15)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 分割训练集/验证集\n",
"train_data, dev_data = dataset.split(0.2)\n",
"len(train_data), len(dev_data)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"input fields after batch(if batch size is 2):\n",
"\twords: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26]) \n",
"target fields after batch(if batch size is 2):\n",
"\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"\n",
"training epochs started 2019-05-09-10-59-39\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=20), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluation at Epoch 1/10. Step:2/20. AccuracyMetric: acc=0.333333\n",
"\n",
"Evaluation at Epoch 2/10. Step:4/20. AccuracyMetric: acc=0.533333\n",
"\n",
"Evaluation at Epoch 3/10. Step:6/20. AccuracyMetric: acc=0.533333\n",
"\n",
"Evaluation at Epoch 4/10. Step:8/20. AccuracyMetric: acc=0.533333\n",
"\n",
"Evaluation at Epoch 5/10. Step:10/20. AccuracyMetric: acc=0.6\n",
"\n",
"Evaluation at Epoch 6/10. Step:12/20. AccuracyMetric: acc=0.8\n",
"\n",
"Evaluation at Epoch 7/10. Step:14/20. AccuracyMetric: acc=0.8\n",
"\n",
"Evaluation at Epoch 8/10. Step:16/20. AccuracyMetric: acc=0.733333\n",
"\n",
"Evaluation at Epoch 9/10. Step:18/20. AccuracyMetric: acc=0.733333\n",
"\n",
"Evaluation at Epoch 10/10. Step:20/20. AccuracyMetric: acc=0.733333\n",
"\n",
"\n",
"In Epoch:6/Step:12, got best dev performance:AccuracyMetric: acc=0.8\n",
"Reloaded the best model.\n"
]
},
{
"data": {
"text/plain": [
"{'best_eval': {'AccuracyMetric': {'acc': 0.8}},\n",
" 'best_epoch': 6,\n",
" 'best_step': 12,\n",
" 'seconds': 0.22}"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric\n",
"\n",
"# 定义trainer并进行训练\n",
"trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data,\n",
" loss=CrossEntropyLoss(), metrics=AccuracyMetric())\n",
"trainer.train()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 1
}

+ 0
- 77
tutorials/sample_data/tutorial_sample_dataset.csv View File

@@ -1,77 +0,0 @@
A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . 1
This quiet , introspective and entertaining independent is worth seeking . 4
Even fans of Ismail Merchant 's work , I suspect , would have a hard time sitting through this one . 1
A positively thrilling combination of ethnography and all the intrigue , betrayal , deceit and murder of a Shakespearean tragedy or a juicy soap opera . 3
Aggressive self-glorification and a manipulative whitewash . 1
A comedy-drama of nearly epic proportions rooted in a sincere performance by the title character undergoing midlife crisis . 4
Narratively , Trouble Every Day is a plodding mess . 1
The Importance of Being Earnest , so thick with wit it plays like a reading from Bartlett 's Familiar Quotations 3
But it does n't leave you with much . 1
You could hate it for the same reason . 1
There 's little to recommend Snow Dogs , unless one considers cliched dialogue and perverse escapism a source of high hilarity . 1
Kung Pow is Oedekerk 's realization of his childhood dream to be in a martial-arts flick , and proves that sometimes the dreams of youth should remain just that . 1
The performances are an absolute joy . 4
Fresnadillo has something serious to say about the ways in which extravagant chance can distort our perspective and throw us off the path of good sense . 3
I still like Moonlight Mile , better judgment be damned . 3
A welcome relief from baseball movies that try too hard to be mythic , this one is a sweet and modest and ultimately winning story . 3
a bilingual charmer , just like the woman who inspired it 3
Like a less dizzily gorgeous companion to Mr. Wong 's In the Mood for Love -- very much a Hong Kong movie despite its mainland setting . 2
As inept as big-screen remakes of The Avengers and The Wild Wild West . 1
It 's everything you 'd expect -- but nothing more . 2
Best indie of the year , so far . 4
Hatfield and Hicks make the oddest of couples , and in this sense the movie becomes a study of the gambles of the publishing world , offering a case study that exists apart from all the movie 's political ramifications . 3
It 's like going to a house party and watching the host defend himself against a frothing ex-girlfriend . 1
That the Chuck Norris `` grenade gag '' occurs about 7 times during Windtalkers is a good indication of how serious-minded the film is . 2
The plot is romantic comedy boilerplate from start to finish . 2
It arrives with an impeccable pedigree , mongrel pep , and almost indecipherable plot complications . 2
A film that clearly means to preach exclusively to the converted . 2
While The Importance of Being Earnest offers opportunities for occasional smiles and chuckles , it does n't give us a reason to be in the theater beyond Wilde 's wit and the actors ' performances . 1
The latest vapid actor 's exercise to appropriate the structure of Arthur Schnitzler 's Reigen . 1
More vaudeville show than well-constructed narrative , but on those terms it 's inoffensive and actually rather sweet . 2
Nothing more than a run-of-the-mill action flick . 2
Hampered -- no , paralyzed -- by a self-indulgent script ... that aims for poetry and ends up sounding like satire . 0
Ice Age is the first computer-generated feature cartoon to feel like other movies , and that makes for some glacial pacing early on . 2
There 's very little sense to what 's going on here , but the makers serve up the cliches with considerable dash . 2
Cattaneo should have followed the runaway success of his first film , The Full Monty , with something different . 2
They 're the unnamed , easily substitutable forces that serve as whatever terror the heroes of horror movies try to avoid . 1
It almost feels as if the movie is more interested in entertaining itself than in amusing us . 1
The movie 's progression into rambling incoherence gives new meaning to the phrase ` fatal script error . ' 0
I still like Moonlight Mile , better judgment be damned . 3
A welcome relief from baseball movies that try too hard to be mythic , this one is a sweet and modest and ultimately winning story . 3
a bilingual charmer , just like the woman who inspired it 3
Like a less dizzily gorgeous companion to Mr. Wong 's In the Mood for Love -- very much a Hong Kong movie despite its mainland setting . 2
As inept as big-screen remakes of The Avengers and The Wild Wild West . 1
It 's everything you 'd expect -- but nothing more . 2
Best indie of the year , so far . 4
Hatfield and Hicks make the oddest of couples , and in this sense the movie becomes a study of the gambles of the publishing world , offering a case study that exists apart from all the movie 's political ramifications . 3
It 's like going to a house party and watching the host defend himself against a frothing ex-girlfriend . 1
That the Chuck Norris `` grenade gag '' occurs about 7 times during Windtalkers is a good indication of how serious-minded the film is . 2
The plot is romantic comedy boilerplate from start to finish . 2
It arrives with an impeccable pedigree , mongrel pep , and almost indecipherable plot complications . 2
A film that clearly means to preach exclusively to the converted . 2
I still like Moonlight Mile , better judgment be damned . 3
A welcome relief from baseball movies that try too hard to be mythic , this one is a sweet and modest and ultimately winning story . 3
a bilingual charmer , just like the woman who inspired it 3
Like a less dizzily gorgeous companion to Mr. Wong 's In the Mood for Love -- very much a Hong Kong movie despite its mainland setting . 2
As inept as big-screen remakes of The Avengers and The Wild Wild West . 1
It 's everything you 'd expect -- but nothing more . 2
Best indie of the year , so far . 4
Hatfield and Hicks make the oddest of couples , and in this sense the movie becomes a study of the gambles of the publishing world , offering a case study that exists apart from all the movie 's political ramifications . 3
It 's like going to a house party and watching the host defend himself against a frothing ex-girlfriend . 1
That the Chuck Norris `` grenade gag '' occurs about 7 times during Windtalkers is a good indication of how serious-minded the film is . 2
The plot is romantic comedy boilerplate from start to finish . 2
It arrives with an impeccable pedigree , mongrel pep , and almost indecipherable plot complications . 2
A film that clearly means to preach exclusively to the converted . 2
I still like Moonlight Mile , better judgment be damned . 3
A welcome relief from baseball movies that try too hard to be mythic , this one is a sweet and modest and ultimately winning story . 3
a bilingual charmer , just like the woman who inspired it 3
Like a less dizzily gorgeous companion to Mr. Wong 's In the Mood for Love -- very much a Hong Kong movie despite its mainland setting . 2
As inept as big-screen remakes of The Avengers and The Wild Wild West . 1
It 's everything you 'd expect -- but nothing more . 2
Best indie of the year , so far . 4
Hatfield and Hicks make the oddest of couples , and in this sense the movie becomes a study of the gambles of the publishing world , offering a case study that exists apart from all the movie 's political ramifications . 3
It 's like going to a house party and watching the host defend himself against a frothing ex-girlfriend . 1
That the Chuck Norris `` grenade gag '' occurs about 7 times during Windtalkers is a good indication of how serious-minded the film is . 2
The plot is romantic comedy boilerplate from start to finish . 2
It arrives with an impeccable pedigree , mongrel pep , and almost indecipherable plot complications . 2
A film that clearly means to preach exclusively to the converted . 2

+ 0
- 831
tutorials/tutorial_1.ipynb View File

@@ -1,831 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"# 详细指南"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 数据读入"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n",
"'label': 1 type=str}"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP.io import CSVLoader\n",
"\n",
"loader = CSVLoader(headers=('raw_sentence', 'label'), sep='\\t')\n",
"dataset = loader.load(\"./sample_data/tutorial_sample_dataset.csv\")\n",
"dataset[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Instance表示一个样本,由一个或多个field(域,属性,特征)组成,每个field有名字和值。\n",
"\n",
"在初始化Instance时即可定义它包含的域,使用 \"field_name=field_value\"的写法。"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'raw_sentence': fake data type=str,\n",
"'label': 0 type=str}"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Instance\n",
"\n",
"dataset.append(Instance(raw_sentence='fake data', label='0'))\n",
"dataset[-1]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 数据处理"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n",
"'label': 1 type=str,\n",
"'sentence': a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n",
"'words': [4, 1, 6, 1, 1, 2, 1, 11, 153, 10, 28, 17, 2, 1, 10, 1, 28, 17, 2, 1, 5, 154, 6, 149, 1, 1, 23, 1, 6, 149, 1, 8, 30, 6, 4, 35, 3] type=list,\n",
"'target': 1 type=int}"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Vocabulary\n",
"\n",
"# 将所有字母转为小写, 并将所有句子变成单词序列\n",
"dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')\n",
"dataset.apply_field(lambda x: x.split(), field_name='sentence', new_field_name='words')\n",
"\n",
"# 使用Vocabulary类统计单词,并将单词序列转化为数字序列\n",
"vocab = Vocabulary(min_freq=2).from_dataset(dataset, field_name='words')\n",
"vocab.index_dataset(dataset, field_name='words',new_field_name='words')\n",
"\n",
"# 将label转为整数\n",
"dataset.apply(lambda x: int(x['label']), new_field_name='target')\n",
"dataset[0]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'raw_sentence': A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n",
"'label': 1 type=str,\n",
"'sentence': a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story . type=str,\n",
"'words': [4, 1, 6, 1, 1, 2, 1, 11, 153, 10, 28, 17, 2, 1, 10, 1, 28, 17, 2, 1, 5, 154, 6, 149, 1, 1, 23, 1, 6, 149, 1, 8, 30, 6, 4, 35, 3] type=list,\n",
"'target': 1 type=int,\n",
"'seq_len': 37 type=int}\n"
]
}
],
"source": [
"# 增加长度信息\n",
"dataset.apply_field(lambda x: len(x), field_name='words', new_field_name='seq_len')\n",
"print(dataset[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 使用内置模块CNNText\n",
"设置为符合内置模块的名称"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"CNNText(\n",
" (embed): Embedding(\n",
" 177, 50\n",
" (dropout): Dropout(p=0.0)\n",
" )\n",
" (conv_pool): ConvMaxpool(\n",
" (convs): ModuleList(\n",
" (0): Conv1d(50, 3, kernel_size=(3,), stride=(1,), padding=(2,))\n",
" (1): Conv1d(50, 4, kernel_size=(4,), stride=(1,), padding=(2,))\n",
" (2): Conv1d(50, 5, kernel_size=(5,), stride=(1,), padding=(2,))\n",
" )\n",
" )\n",
" (dropout): Dropout(p=0.1)\n",
" (fc): Linear(in_features=12, out_features=5, bias=True)\n",
")"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP.models import CNNText\n",
"\n",
"model_cnn = CNNText((len(vocab),50), num_classes=5, padding=2, dropout=0.1)\n",
"model_cnn"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"我们在使用内置模块的时候,还应该注意把 field 设定成符合内置模型输入输出的名字。"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"words\n",
"seq_len\n",
"target\n"
]
}
],
"source": [
"from fastNLP import Const\n",
"\n",
"dataset.rename_field('words', Const.INPUT)\n",
"dataset.rename_field('seq_len', Const.INPUT_LEN)\n",
"dataset.rename_field('target', Const.TARGET)\n",
"\n",
"dataset.set_input(Const.INPUT, Const.INPUT_LEN)\n",
"dataset.set_target(Const.TARGET)\n",
"\n",
"print(Const.INPUT)\n",
"print(Const.INPUT_LEN)\n",
"print(Const.TARGET)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 分割训练集/验证集/测试集"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"(64, 7, 7)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_dev_data, test_data = dataset.split(0.1)\n",
"train_data, dev_data = train_dev_data.split(0.1)\n",
"len(train_data), len(dev_data), len(test_data)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 训练(model_cnn)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### loss\n",
"训练模型需要提供一个损失函数\n",
"\n",
"下面提供了一个在分类问题中常用的交叉熵损失。注意它的**初始化参数**。\n",
"\n",
"pred参数对应的是模型的forward返回的dict的一个key的名字,这里是\"output\"。\n",
"\n",
"target参数对应的是dataset作为标签的field的名字,这里是\"target\"。"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP import CrossEntropyLoss\n",
"\n",
"# loss = CrossEntropyLoss()\n",
"# 等价于\n",
"loss = CrossEntropyLoss(pred=Const.OUTPUT, target=Const.TARGET)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Metric\n",
"定义评价指标\n",
"\n",
"这里使用准确率。参数的“命名规则”跟上面类似。\n",
"\n",
"pred参数对应的是模型的predict方法返回的dict的一个key的名字,这里是\"predict\"。\n",
"\n",
"target参数对应的是dataset作为标签的field的名字,这里是\"target\"。"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP import AccuracyMetric\n",
"\n",
"# metrics=AccuracyMetric()\n",
"# 等价于\n",
"metrics=AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"input fields after batch(if batch size is 2):\n",
"\twords: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 16]) \n",
"\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"target fields after batch(if batch size is 2):\n",
"\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"\n",
"training epochs started 2019-05-12-21-38-34\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=20), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluation at Epoch 1/10. Step:2/20. AccuracyMetric: acc=0.285714\n",
"\n",
"Evaluation at Epoch 2/10. Step:4/20. AccuracyMetric: acc=0.428571\n",
"\n",
"Evaluation at Epoch 3/10. Step:6/20. AccuracyMetric: acc=0.428571\n",
"\n",
"Evaluation at Epoch 4/10. Step:8/20. AccuracyMetric: acc=0.428571\n",
"\n",
"Evaluation at Epoch 5/10. Step:10/20. AccuracyMetric: acc=0.428571\n",
"\n",
"Evaluation at Epoch 6/10. Step:12/20. AccuracyMetric: acc=0.428571\n",
"\n",
"Evaluation at Epoch 7/10. Step:14/20. AccuracyMetric: acc=0.428571\n",
"\n",
"Evaluation at Epoch 8/10. Step:16/20. AccuracyMetric: acc=0.857143\n",
"\n",
"Evaluation at Epoch 9/10. Step:18/20. AccuracyMetric: acc=0.857143\n",
"\n",
"Evaluation at Epoch 10/10. Step:20/20. AccuracyMetric: acc=0.857143\n",
"\n",
"\n",
"In Epoch:8/Step:16, got best dev performance:AccuracyMetric: acc=0.857143\n",
"Reloaded the best model.\n"
]
},
{
"data": {
"text/plain": [
"{'best_eval': {'AccuracyMetric': {'acc': 0.857143}},\n",
" 'best_epoch': 8,\n",
" 'best_step': 16,\n",
" 'seconds': 0.21}"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Trainer\n",
"\n",
"trainer = Trainer(model=model_cnn, train_data=train_data, dev_data=dev_data, loss=loss, metrics=metrics)\n",
"trainer.train()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 测试(model_cnn)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[tester] \n",
"AccuracyMetric: acc=0.857143\n"
]
},
{
"data": {
"text/plain": [
"{'AccuracyMetric': {'acc': 0.857143}}"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Tester\n",
"\n",
"tester = Tester(test_data, model_cnn, metrics=AccuracyMetric())\n",
"tester.test()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 编写自己的模型\n",
"\n",
"完全支持 pytorch 的模型,与 pytorch 唯一不同的是返回结果是一个字典,字典中至少需要包含 \"pred\" 这个字段"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import torch.nn as nn\n",
"\n",
"class LSTMText(nn.Module):\n",
" def __init__(self, vocab_size, embedding_dim, output_dim, hidden_dim=64, num_layers=2, dropout=0.5):\n",
" super().__init__()\n",
"\n",
" self.embedding = nn.Embedding(vocab_size, embedding_dim)\n",
" self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=True, dropout=dropout)\n",
" self.fc = nn.Linear(hidden_dim * 2, output_dim)\n",
" self.dropout = nn.Dropout(dropout)\n",
"\n",
" def forward(self, words):\n",
" # (input) words : (batch_size, seq_len)\n",
" words = words.permute(1,0)\n",
" # words : (seq_len, batch_size)\n",
"\n",
" embedded = self.dropout(self.embedding(words))\n",
" # embedded : (seq_len, batch_size, embedding_dim)\n",
" output, (hidden, cell) = self.lstm(embedded)\n",
" # output: (seq_len, batch_size, hidden_dim * 2)\n",
" # hidden: (num_layers * 2, batch_size, hidden_dim)\n",
" # cell: (num_layers * 2, batch_size, hidden_dim)\n",
"\n",
" hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)\n",
" hidden = self.dropout(hidden)\n",
" # hidden: (batch_size, hidden_dim * 2)\n",
"\n",
" pred = self.fc(hidden.squeeze(0))\n",
" # result: (batch_size, output_dim)\n",
" return {\"pred\":pred}"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"input fields after batch(if batch size is 2):\n",
"\twords: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 16]) \n",
"\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"target fields after batch(if batch size is 2):\n",
"\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"\n",
"training epochs started 2019-05-12-21-38-36\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=20), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluation at Epoch 1/10. Step:2/20. AccuracyMetric: acc=0.571429\n",
"\n",
"Evaluation at Epoch 2/10. Step:4/20. AccuracyMetric: acc=0.571429\n",
"\n",
"Evaluation at Epoch 3/10. Step:6/20. AccuracyMetric: acc=0.571429\n",
"\n",
"Evaluation at Epoch 4/10. Step:8/20. AccuracyMetric: acc=0.571429\n",
"\n",
"Evaluation at Epoch 5/10. Step:10/20. AccuracyMetric: acc=0.714286\n",
"\n",
"Evaluation at Epoch 6/10. Step:12/20. AccuracyMetric: acc=0.857143\n",
"\n",
"Evaluation at Epoch 7/10. Step:14/20. AccuracyMetric: acc=0.857143\n",
"\n",
"Evaluation at Epoch 8/10. Step:16/20. AccuracyMetric: acc=0.857143\n",
"\n",
"Evaluation at Epoch 9/10. Step:18/20. AccuracyMetric: acc=0.857143\n",
"\n",
"Evaluation at Epoch 10/10. Step:20/20. AccuracyMetric: acc=0.857143\n",
"\n",
"\n",
"In Epoch:6/Step:12, got best dev performance:AccuracyMetric: acc=0.857143\n",
"Reloaded the best model.\n"
]
},
{
"data": {
"text/plain": [
"{'best_eval': {'AccuracyMetric': {'acc': 0.857143}},\n",
" 'best_epoch': 6,\n",
" 'best_step': 12,\n",
" 'seconds': 2.15}"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model_lstm = LSTMText(len(vocab),50,5)\n",
"trainer = Trainer(model=model_lstm, train_data=train_data, dev_data=dev_data, loss=loss, metrics=metrics)\n",
"trainer.train()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[tester] \n",
"AccuracyMetric: acc=0.857143\n"
]
},
{
"data": {
"text/plain": [
"{'AccuracyMetric': {'acc': 0.857143}}"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tester = Tester(test_data, model_lstm, metrics=AccuracyMetric())\n",
"tester.test()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 使用 Batch 编写自己的训练过程"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 0 Avg Loss: 3.11 18ms\n",
"Epoch 1 Avg Loss: 2.88 30ms\n",
"Epoch 2 Avg Loss: 2.69 42ms\n",
"Epoch 3 Avg Loss: 2.47 54ms\n",
"Epoch 4 Avg Loss: 2.38 67ms\n",
"Epoch 5 Avg Loss: 2.10 78ms\n",
"Epoch 6 Avg Loss: 2.06 91ms\n",
"Epoch 7 Avg Loss: 1.92 103ms\n",
"Epoch 8 Avg Loss: 1.91 114ms\n",
"Epoch 9 Avg Loss: 1.76 126ms\n",
"[tester] \n",
"AccuracyMetric: acc=0.571429\n"
]
},
{
"data": {
"text/plain": [
"{'AccuracyMetric': {'acc': 0.571429}}"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import BucketSampler\n",
"from fastNLP import Batch\n",
"import torch\n",
"import time\n",
"\n",
"model = CNNText((len(vocab),50), num_classes=5, padding=2, dropout=0.1)\n",
"\n",
"def train(epoch, data):\n",
" optim = torch.optim.Adam(model.parameters(), lr=0.001)\n",
" lossfunc = torch.nn.CrossEntropyLoss()\n",
" batch_size = 32\n",
"\n",
" # 定义一个Batch,传入DataSet,规定batch_size和取batch的规则。\n",
" # 顺序(Sequential),随机(Random),相似长度组成一个batch(Bucket)\n",
" train_sampler = BucketSampler(batch_size=batch_size, seq_len_field_name='seq_len')\n",
" train_batch = Batch(batch_size=batch_size, dataset=data, sampler=train_sampler)\n",
" \n",
" start_time = time.time()\n",
" for i in range(epoch):\n",
" loss_list = []\n",
" for batch_x, batch_y in train_batch:\n",
" optim.zero_grad()\n",
" output = model(batch_x['words'])\n",
" loss = lossfunc(output['pred'], batch_y['target'])\n",
" loss.backward()\n",
" optim.step()\n",
" loss_list.append(loss.item())\n",
" print('Epoch {:d} Avg Loss: {:.2f}'.format(i, sum(loss_list) / len(loss_list)),end=\" \")\n",
" print('{:d}ms'.format(round((time.time()-start_time)*1000)))\n",
" loss_list.clear()\n",
" \n",
"train(10, train_data)\n",
"tester = Tester(test_data, model, metrics=AccuracyMetric())\n",
"tester.test()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 使用 Callback 实现自己想要的效果"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"input fields after batch(if batch size is 2):\n",
"\twords: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 16]) \n",
"\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"target fields after batch(if batch size is 2):\n",
"\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"\n",
"training epochs started 2019-05-12-21-38-40\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=20), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluation at Epoch 1/10. Step:2/20. AccuracyMetric: acc=0.285714\n",
"\n",
"Sum Time: 51ms\n",
"\n",
"\n",
"Evaluation at Epoch 2/10. Step:4/20. AccuracyMetric: acc=0.285714\n",
"\n",
"Sum Time: 69ms\n",
"\n",
"\n",
"Evaluation at Epoch 3/10. Step:6/20. AccuracyMetric: acc=0.285714\n",
"\n",
"Sum Time: 91ms\n",
"\n",
"\n",
"Evaluation at Epoch 4/10. Step:8/20. AccuracyMetric: acc=0.571429\n",
"\n",
"Sum Time: 107ms\n",
"\n",
"\n",
"Evaluation at Epoch 5/10. Step:10/20. AccuracyMetric: acc=0.571429\n",
"\n",
"Sum Time: 125ms\n",
"\n",
"\n",
"Evaluation at Epoch 6/10. Step:12/20. AccuracyMetric: acc=0.571429\n",
"\n",
"Sum Time: 142ms\n",
"\n",
"\n",
"Evaluation at Epoch 7/10. Step:14/20. AccuracyMetric: acc=0.571429\n",
"\n",
"Sum Time: 158ms\n",
"\n",
"\n",
"Evaluation at Epoch 8/10. Step:16/20. AccuracyMetric: acc=0.571429\n",
"\n",
"Sum Time: 176ms\n",
"\n",
"\n",
"Evaluation at Epoch 9/10. Step:18/20. AccuracyMetric: acc=0.714286\n",
"\n",
"Sum Time: 193ms\n",
"\n",
"\n",
"Evaluation at Epoch 10/10. Step:20/20. AccuracyMetric: acc=0.857143\n",
"\n",
"Sum Time: 212ms\n",
"\n",
"\n",
"\n",
"In Epoch:10/Step:20, got best dev performance:AccuracyMetric: acc=0.857143\n",
"Reloaded the best model.\n"
]
},
{
"data": {
"text/plain": [
"{'best_eval': {'AccuracyMetric': {'acc': 0.857143}},\n",
" 'best_epoch': 10,\n",
" 'best_step': 20,\n",
" 'seconds': 0.2}"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Callback\n",
"\n",
"start_time = time.time()\n",
"\n",
"class MyCallback(Callback):\n",
" def on_epoch_end(self):\n",
" print('Sum Time: {:d}ms\\n\\n'.format(round((time.time()-start_time)*1000)))\n",
" \n",
"\n",
"model = CNNText((len(vocab),50), num_classes=5, padding=2, dropout=0.1)\n",
"trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data,\n",
" loss=CrossEntropyLoss(), metrics=AccuracyMetric(), callbacks=[MyCallback()])\n",
"trainer.train()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 1
}

tutorials/tutorial_callback.ipynb → tutorials/tutorial_10_callback.ipynb View File


+ 0
- 41
tutorials/命名实体识别.ipynb View File

@@ -1,41 +0,0 @@
{
"cells": [
{
"cell_type": "raw",
"metadata": {},
"source": [
"##1. 命名实体识别(named entity recognition, NER)\n",
"命名实体识别任务是从文本中抽取出具有特殊意义或者指代性非常强的实体,通常包括人名、地名、机构名和时间等。\n",
"如下面的例子中\n",
"\n",
"我来自复旦大学。\n",
"\n",
"其中“复旦大学”就是一个机构名,命名实体识别就是要从中识别出“复旦大学”这四个字是一个整体,且属于机构名这个类别。这个问题现在一般被转换为了\n",
"在本tutorial中我们将通过fastNLP尝试写出一个\n",
"\n",
"##2. 数据\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

Loading…
Cancel
Save