
Merge pull request #273 from fastnlp/tutorial-fix

Tutorial fix
tags/v0.5.5
ChenXin committed 4 years ago · commit 10204ef0f2
29 changed files with 6513 additions and 751 deletions
  1. docs/source/tutorials/cn_cls_example.png (+0 / -0)
  2. docs/source/tutorials/extend_1_bert_embedding.rst (+28 / -24)
  3. docs/source/tutorials/tutorial_1_data_preprocess.rst (+21 / -13)
  4. docs/source/tutorials/tutorial_2_vocabulary.rst (+9 / -7)
  5. docs/source/tutorials/tutorial_3_embedding.rst (+2 / -2)
  6. docs/source/tutorials/tutorial_4_load_dataset.rst (+1 / -1)
  7. docs/source/tutorials/tutorial_5_loss_optimizer.rst (+30 / -27)
  8. docs/source/tutorials/tutorial_6_datasetiter.rst (+109 / -107)
  9. docs/source/tutorials/tutorial_7_metrics.rst (+40 / -34)
  10. docs/source/tutorials/tutorial_8_modules_models.rst (+22 / -45)
  11. docs/source/tutorials/tutorial_9_callback.rst (+0 / -0)
  12. docs/source/tutorials/序列标注.rst (+18 / -8)
  13. docs/source/tutorials/文本分类.rst (+6 / -4)
  14. docs/source/user/quickstart.rst (+4 / -4)
  15. docs/source/user/tutorials.rst (+3 / -4)
  16. test/test_tutorials.py (+75 / -0)
  17. tutorials/README.md (+1 / -1)
  18. tutorials/bert_embedding_tutorial.ipynb (+0 / -470)
  19. tutorials/extend_1_bert_embedding.ipynb (+260 / -0)
  20. tutorials/tutorial_1_data_preprocess.ipynb (+292 / -0)
  21. tutorials/tutorial_2_vocabulary.ipynb (+343 / -0)
  22. tutorials/tutorial_3_embedding.ipynb (+524 / -0)
  23. tutorials/tutorial_4_load_dataset.ipynb (+309 / -0)
  24. tutorials/tutorial_5_loss_optimizer.ipynb (+603 / -0)
  25. tutorials/tutorial_6_datasetiter.ipynb (+681 / -0)
  26. tutorials/tutorial_7_metrics.ipynb (+1206 / -0)
  27. tutorials/tutorial_8_modules_models.ipynb (+1014 / -0)
  28. tutorials/tutorial_9_callback.ipynb (+0 / -0)
  29. tutorials/序列标注.ipynb (+912 / -0)

docs/source/quickstart/cn_cls_example.png → docs/source/tutorials/cn_cls_example.png


docs/source/tutorials/extend_1_bert_embedding.rst (+28 / -24)

@@ -15,6 +15,10 @@ Bert自从在 `BERT: Pre-training of Deep Bidirectional Transformers for Languag
---------------------------------- ----------------------------------
下面我们将介绍通过使用Bert来进行文本分类, 中文命名实体识别, 文本匹配, 中文问答。 下面我们将介绍通过使用Bert来进行文本分类, 中文命名实体识别, 文本匹配, 中文问答。


.. note::

本教程必须使用 GPU 进行实验,并且会花费大量的时间

1. 使用Bert进行文本分类 1. 使用Bert进行文本分类
---------------------------------- ----------------------------------
文本分类是指给定一段文字,判定其所属的类别。例如下面的文本情感分类 文本分类是指给定一段文字,判定其所属的类别。例如下面的文本情感分类
@@ -28,26 +32,25 @@ Bert自从在 `BERT: Pre-training of Deep Bidirectional Transformers for Languag
.. code-block:: python .. code-block:: python


from fastNLP.io import WeiboSenti100kPipe from fastNLP.io import WeiboSenti100kPipe
from fastNLP.embeddings import BertEmbedding
from fastNLP.models import BertForSequenceClassification
from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam
import torch


data_bundle =WeiboSenti100kPipe().process_from_file() data_bundle =WeiboSenti100kPipe().process_from_file()
data_bundle.rename_field('chars', 'words') data_bundle.rename_field('chars', 'words')


# 载入BertEmbedding # 载入BertEmbedding
from fastNLP.embeddings import BertEmbedding

embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='cn-wwm', include_cls_sep=True) embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='cn-wwm', include_cls_sep=True)


# 载入模型 # 载入模型
from fastNLP.models import BertForSequenceClassification

model = BertForSequenceClassification(embed, len(data_bundle.get_vocab('target'))) model = BertForSequenceClassification(embed, len(data_bundle.get_vocab('target')))


# 训练模型 # 训练模型
from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam

device = 0 if torch.cuda.is_available() else 'cpu'
trainer = Trainer(data_bundle.get_dataset('train'), model, trainer = Trainer(data_bundle.get_dataset('train'), model,
optimizer=Adam(model_params=model.parameters(), lr=2e-5), optimizer=Adam(model_params=model.parameters(), lr=2e-5),
loss=CrossEntropyLoss(), device=0,
loss=CrossEntropyLoss(), device=device,
batch_size=8, dev_data=data_bundle.get_dataset('dev'), batch_size=8, dev_data=data_bundle.get_dataset('dev'),
metrics=AccuracyMetric(), n_epochs=2, print_every=1) metrics=AccuracyMetric(), n_epochs=2, print_every=1)
trainer.train() trainer.train()
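As a quick check after training, the Tester pattern used later in this file also works here; a hedged sketch (it assumes the ``model`` and ``data_bundle`` from the block above and is illustrative rather than part of the patch):

.. code-block:: python

    from fastNLP import Tester, AccuracyMetric

    # evaluate the fine-tuned classifier on the test split
    tester = Tester(data_bundle.get_dataset('test'), model, batch_size=8, metrics=AccuracyMetric())
    tester.test()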
@@ -92,7 +95,7 @@ Bert自从在 `BERT: Pre-training of Deep Bidirectional Transformers for Languag
贺 O 贺 O
词 O 词 O


这部分内容请参考 :doc:`快速实现序列标注模型 </tutorials/tutorial_9_seq_labeling>`
这部分内容请参考 :doc:`/tutorials/序列标注`




3. 使用Bert进行文本匹配 3. 使用Bert进行文本匹配
@@ -102,36 +105,36 @@ Bert自从在 `BERT: Pre-training of Deep Bidirectional Transformers for Languag


.. code-block:: python .. code-block:: python


data_bundle = CNXNLIBertPipe().process_from_file(paths)
from fastNLP.io import CNXNLIBertPipe
from fastNLP.embeddings import BertEmbedding
from fastNLP.models import BertForSentenceMatching
from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam
from fastNLP.core.optimizer import AdamW
from fastNLP.core.callback import WarmupCallback
from fastNLP import Tester
import torch

data_bundle = CNXNLIBertPipe().process_from_file()
data_bundle.rename_field('chars', 'words') data_bundle.rename_field('chars', 'words')
print(data_bundle) print(data_bundle)


# 载入BertEmbedding # 载入BertEmbedding
from fastNLP.embeddings import BertEmbedding

embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='cn-wwm', include_cls_sep=True) embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='cn-wwm', include_cls_sep=True)


# 载入模型 # 载入模型
from fastNLP.models import BertForSentenceMatching

model = BertForSentenceMatching(embed, len(data_bundle.get_vocab('target'))) model = BertForSentenceMatching(embed, len(data_bundle.get_vocab('target')))


# 训练模型 # 训练模型
from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam
from fastNLP.core.optimizer import AdamW
from fastNLP.core.callback import WarmupCallback

callbacks = [WarmupCallback(warmup=0.1, schedule='linear'), ] callbacks = [WarmupCallback(warmup=0.1, schedule='linear'), ]
device = 0 if torch.cuda.is_available() else 'cpu'
trainer = Trainer(data_bundle.get_dataset('train'), model, trainer = Trainer(data_bundle.get_dataset('train'), model,
optimizer=AdamW(params=model.parameters(), lr=4e-5), optimizer=AdamW(params=model.parameters(), lr=4e-5),
loss=CrossEntropyLoss(), device=0,
loss=CrossEntropyLoss(), device=device,
batch_size=8, dev_data=data_bundle.get_dataset('dev'), batch_size=8, dev_data=data_bundle.get_dataset('dev'),
metrics=AccuracyMetric(), n_epochs=5, print_every=1, metrics=AccuracyMetric(), n_epochs=5, print_every=1,
update_every=8, callbacks=callbacks) update_every=8, callbacks=callbacks)
trainer.train() trainer.train()


from fastNLP import Tester
tester = Tester(data_bundle.get_dataset('test'), model, batch_size=8, metrics=AccuracyMetric()) tester = Tester(data_bundle.get_dataset('test'), model, batch_size=8, metrics=AccuracyMetric())
tester.test() tester.test()


@@ -174,7 +177,7 @@ Bert自从在 `BERT: Pre-training of Deep Bidirectional Transformers for Languag
} }
] ]


您可以通过以下的代码训练 `CMRC2018 <https://github.com/ymcui/cmrc2018>`_
您可以通过以下的代码训练 (原文代码:`CMRC2018 <https://github.com/ymcui/cmrc2018>`_)


.. code-block:: python .. code-block:: python


@@ -186,7 +189,7 @@ Bert自从在 `BERT: Pre-training of Deep Bidirectional Transformers for Languag
from fastNLP import Trainer, BucketSampler from fastNLP import Trainer, BucketSampler
from fastNLP import WarmupCallback, GradientClipCallback from fastNLP import WarmupCallback, GradientClipCallback
from fastNLP.core.optimizer import AdamW from fastNLP.core.optimizer import AdamW
import torch


data_bundle = CMRC2018BertPipe().process_from_file() data_bundle = CMRC2018BertPipe().process_from_file()
data_bundle.rename_field('chars', 'words') data_bundle.rename_field('chars', 'words')
@@ -205,14 +208,15 @@ Bert自从在 `BERT: Pre-training of Deep Bidirectional Transformers for Languag


optimizer = AdamW(model.parameters(), lr=5e-5) optimizer = AdamW(model.parameters(), lr=5e-5)


device = 0 if torch.cuda.is_available() else 'cpu'
trainer = Trainer(data_bundle.get_dataset('train'), model, loss=loss, optimizer=optimizer, trainer = Trainer(data_bundle.get_dataset('train'), model, loss=loss, optimizer=optimizer,
sampler=BucketSampler(seq_len_field_name='context_len'), sampler=BucketSampler(seq_len_field_name='context_len'),
dev_data=data_bundle.get_dataset('dev'), metrics=metric, dev_data=data_bundle.get_dataset('dev'), metrics=metric,
callbacks=callbacks, device=0, batch_size=6, num_workers=2, n_epochs=2, print_every=1,
callbacks=callbacks, device=device, batch_size=6, num_workers=2, n_epochs=2, print_every=1,
test_use_tqdm=False, update_every=10) test_use_tqdm=False, update_every=10)
trainer.train(load_best_model=False) trainer.train(load_best_model=False)


训练结果(和论文中报道的基本一致)::
训练结果(和论文中报道的基本一致)::


In Epoch:2/Step:1692, got best dev performance: In Epoch:2/Step:1692, got best dev performance:
CMRC2018Metric: f1=85.61, em=66.08 CMRC2018Metric: f1=85.61, em=66.08


docs/source/tutorials/tutorial_1_data_preprocess.rst (+21 / -13)

@@ -16,10 +16,10 @@ fastNLP中的DataSet
每一行是一个instance (在fastNLP中被称为 :mod:`~fastNLP.core.Instance` ), 每一行是一个instance (在fastNLP中被称为 :mod:`~fastNLP.core.Instance` ),
每一列是一个field (在fastNLP中称为 :mod:`~fastNLP.core.FieldArray` )。 每一列是一个field (在fastNLP中称为 :mod:`~fastNLP.core.FieldArray` )。


DataSet构建和删除
DataSet构建
----------------------------- -----------------------------


我们使用传入字典的方式构建一个数据集,这是 :class:`~fastNLP.DataSet` 初始化的最基础的方式
我们使用传入字典的方式初始化一个DataSet,这是 :class:`~fastNLP.DataSet` 初始化的最基础的方式


.. code-block:: python .. code-block:: python


@@ -42,7 +42,7 @@ DataSet构建和删除
+------------------------------+------------------------------------------------+---------+ +------------------------------+------------------------------------------------+---------+




我们还可以使用 :func:`~fastNLP.DataSet.append` 方法向数据集内增加数据
我们还可以使用 :func:`~fastNLP.DataSet.append` 方法向DataSet增加数据


.. code-block:: python .. code-block:: python


@@ -55,7 +55,7 @@ DataSet构建和删除
dataset.append(instance) dataset.append(instance)
# 可以继续append更多内容,但是append的instance应该和前面的instance拥有完全相同的field # 可以继续append更多内容,但是append的instance应该和前面的instance拥有完全相同的field


另外,我们还可以用 :class:`~fastNLP.Instance` 数组的方式构建数据集
另外,我们还可以用 :class:`~fastNLP.Instance` 数组的方式构建DataSet


.. code-block:: python .. code-block:: python


@@ -70,23 +70,32 @@ DataSet构建和删除
seq_len=3) seq_len=3)
]) ])


在初步构建完数据集之后,我们可以通过 `for` 循环遍历 :class:`~fastNLP.DataSet` 中的内容。
在初步构建完DataSet之后,我们可以通过 `for` 循环遍历 :class:`~fastNLP.DataSet` 中的内容。


.. code-block:: python .. code-block:: python


for instance in dataset: for instance in dataset:
# do something # do something


DataSet的删除
-----------------------------

FastNLP 同样提供了多种删除数据的方法 :func:`~fastNLP.DataSet.drop` 、 :func:`~fastNLP.DataSet.delete_instance` 和 :func:`~fastNLP.DataSet.delete_field` FastNLP 同样提供了多种删除数据的方法 :func:`~fastNLP.DataSet.drop` 、 :func:`~fastNLP.DataSet.delete_instance` 和 :func:`~fastNLP.DataSet.delete_field`
我们先用下面的代码生成一个只有两列的样例DataSet,第一列的值分别为 -5 ~ 4,第二列的值均为 0.


.. code-block:: python .. code-block:: python


from fastNLP import DataSet from fastNLP import DataSet
dataset = DataSet({'a': list(range(-5, 5))})
# 返回满足条件的instance,并放入DataSet中
dataset = DataSet({'a': range(-5, 5), 'c': [0]*10})

然后我们使用三种方法进行删除,删除后的DataSet仅包含名为 c 的一列,包含4个值为0 的数据。

.. code-block:: python

# 不改变dataset,生成一个删除了满足条件的instance的新 DataSet
dropped_dataset = dataset.drop(lambda ins:ins['a']<0, inplace=False) dropped_dataset = dataset.drop(lambda ins:ins['a']<0, inplace=False)
# 在dataset中删除满足条件的instance # 在dataset中删除满足条件的instance
dataset.drop(lambda ins:ins['a']<0) # dataset的instance数量减少
dataset.drop(lambda ins:ins['a']<0)
# 删除第3个instance # 删除第3个instance
dataset.delete_instance(2) dataset.delete_instance(2)
# 删除名为'a'的field # 删除名为'a'的field
@@ -103,15 +112,14 @@ FastNLP 同样提供了多种删除数据的方法 :func:`~fastNLP.DataSet.drop`


# 检查是否存在名为'a'的field # 检查是否存在名为'a'的field
dataset.has_field('a') # 或 ('a' in dataset) dataset.has_field('a') # 或 ('a' in dataset)
# 将名为'a'的field改名为'b'
dataset.rename_field('a', 'b')
# 将名为'c'的field改名为'b'
dataset.rename_field('c', 'b')
# DataSet的长度 # DataSet的长度
len(dataset) len(dataset)


其次,我们可以使用 :func:`~fastNLP.DataSet.apply` 或 :func:`~fastNLP.DataSet.apply_field` 进行数据预处理操作操作。 其次,我们可以使用 :func:`~fastNLP.DataSet.apply` 或 :func:`~fastNLP.DataSet.apply_field` 进行数据预处理操作操作。
这两个方法通过传入一个对单一 :mod:`~fastNLP.core.instance` 操作的函数,
自动地帮助你对一个 :mod:`~fastNLP.core.field` 中的每个 :mod:`~fastNLP.core.instance` 调用这个函数,完成整体的操作。
这个传入的函数可以是 lambda 匿名函数,也可以是完整定义的函数。同时,你还可以用 ``new_field_name`` 参数指定数据处理后存储的 :mod:`~fastNLP.core.field` 的名称。
使用以上的两个方法需要传入一个函数,函数可以是 lambda 匿名函数,也可以是完整定义的函数,fastNLP将对DataSet遍历地应用该函数。
同时,你还可以用 ``new_field_name`` 参数指定函数返回值组成的新 :mod:`~fastNLP.core.field` 的名称。


.. code-block:: python .. code-block:: python
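A minimal, self-contained sketch of the ``apply`` / ``apply_field`` usage just described (the field names mirror the ones used elsewhere in this tutorial):

.. code-block:: python

    from fastNLP import DataSet

    dataset = DataSet({'raw_words': ["This is the first instance .", "Second instance ."]})

    # apply: the function receives a whole instance
    dataset.apply(lambda ins: ins['raw_words'].split(), new_field_name='words')

    # apply_field: the function receives only the value of the named field
    dataset.apply_field(lambda sent: sent.split(), field_name='raw_words', new_field_name='words')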




docs/source/tutorials/tutorial_2_vocabulary.rst (+9 / -7)

@@ -24,7 +24,7 @@ fastNLP中的Vocabulary
vocab.to_index('positive') # 输出0 vocab.to_index('positive') # 输出0
vocab.to_index('neutral') # 会报错,因为没有unk这种情况 vocab.to_index('neutral') # 会报错,因为没有unk这种情况


除了通过以上的方式建立词表,Vocabulary还可以通过使用下面的函数直从 :class:`~fastNLP.DataSet` 中的某一列建立词表以及将该列转换为index
除了通过以上的方式建立词表,Vocabulary还可以通过使用下面的函数直从 :class:`~fastNLP.DataSet` 中的某一列建立词表以及将该列转换为index


.. code-block:: python .. code-block:: python


@@ -39,7 +39,9 @@ fastNLP中的Vocabulary
}) })


vocab = Vocabulary() vocab = Vocabulary()
# 从该dataset中的chars列建立词表
vocab.from_dataset(dataset, field_name='chars') vocab.from_dataset(dataset, field_name='chars')
# 使用vocabulary将chars列转换为index
vocab.index_dataset(dataset, field_name='chars') vocab.index_dataset(dataset, field_name='chars')


target_vocab = Vocabulary(padding=None, unknown=None) target_vocab = Vocabulary(padding=None, unknown=None)
@@ -60,7 +62,7 @@ fastNLP中的Vocabulary
一些使用tips 一些使用tips
----------------------------- -----------------------------


通过使用from_dataset()函数在DataSet上建立词表时,将测试集和验证集放入参数no_create_entry_dataset中,如下所示
在使用from_dataset()函数建立词表时,将测试集和验证集放入参数no_create_entry_dataset中,如下所示


.. code-block:: python .. code-block:: python


@@ -84,18 +86,18 @@ fastNLP中的Vocabulary
# 将验证集或者测试集在建立词表是放入no_create_entry_dataset这个参数中。 # 将验证集或者测试集在建立词表是放入no_create_entry_dataset这个参数中。
vocab.from_dataset(tr_data, field_name='chars', no_create_entry_dataset=[dev_data]) vocab.from_dataset(tr_data, field_name='chars', no_create_entry_dataset=[dev_data])



:class:`~fastNLP.Vocabulary` 中的 `no_create_entry` , 建议在添加来自于测试集和验证集的词的时候将该参数置为True, 或将验证集和测试集
:class:`~fastNLP.Vocabulary` 中的 `no_create_entry` , 建议在添加来自于测试集和验证集的词的时候将该参数置为True, 或将验证集和测试集
传入 `no_create_entry_dataset` 参数。它们的意义是在接下来的模型会使用pretrain的embedding(包括glove, word2vec, elmo与bert)且会finetune的 传入 `no_create_entry_dataset` 参数。它们的意义是在接下来的模型会使用pretrain的embedding(包括glove, word2vec, elmo与bert)且会finetune的
情况下,如果仅使用来自于train的数据建立vocabulary,会导致只出现在test与dev中的词语无法充分利用到来自于预训练embedding的信息(因为他们 情况下,如果仅使用来自于train的数据建立vocabulary,会导致只出现在test与dev中的词语无法充分利用到来自于预训练embedding的信息(因为他们
会被认为是unk),所以在建立词表的时候将test与dev考虑进来会使得最终的结果更好。通过与fastNLP中的各种Embedding配合使用,会有如下的效果,
会被认为是unk),所以在建立词表的时候将test与dev考虑进来会使得最终的结果更好。

通过与fastNLP中的各种Embedding配合使用,会有如下的效果,
如果一个词出现在了train中,但是没在预训练模型中,embedding会为随机初始化,且它单独的一个vector,如果finetune embedding的话, 如果一个词出现在了train中,但是没在预训练模型中,embedding会为随机初始化,且它单独的一个vector,如果finetune embedding的话,
这个词在更新之后可能会有更好的表示; 而如果这个词仅出现在了dev或test中,那么就不能为它们单独建立vector,而应该让它指向unk这个vector的 这个词在更新之后可能会有更好的表示; 而如果这个词仅出现在了dev或test中,那么就不能为它们单独建立vector,而应该让它指向unk这个vector的
值(当unk的值更新时,这个词也使用的是更新之后的vector)。所以被认为是no_create_entry的token,将首先从预训练的词表中寻找它的表示,如 值(当unk的值更新时,这个词也使用的是更新之后的vector)。所以被认为是no_create_entry的token,将首先从预训练的词表中寻找它的表示,如
果找到了,就使用该表示; 如果没有找到,则认为该词的表示应该为unk的表示。 果找到了,就使用该表示; 如果没有找到,则认为该词的表示应该为unk的表示。


下面我们结合部分 :class:`~fastNLP.embeddings.StaticEmbedding` 的例子来说明下该值造成的影响,如果您对
:class:`~fastNLP.embeddings.StaticEmbedding` 不太了解,您可以先参考 :doc:`使用Embedding模块将文本转成向量 </tutorials/tutorial_3_embedding>` 部分再来阅读该部分
下面我们结合部分 :class:`~fastNLP.embeddings.StaticEmbedding` 的例子来说明下该值造成的影响,如果您对 :class:`~fastNLP.embeddings.StaticEmbedding` 不太了解,您可以先参考 :doc:`使用Embedding模块将文本转成向量 </tutorials/tutorial_3_embedding>` 部分再来阅读该部分


.. code-block:: python .. code-block:: python
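A hedged, self-contained illustration of the ``no_create_entry`` behaviour described above (the toy sentences and the ``en-glove-6b-50d`` embedding name are assumptions for the example, not lines taken from this patch):

.. code-block:: python

    from fastNLP import DataSet, Vocabulary
    from fastNLP.embeddings import StaticEmbedding

    tr_data = DataSet({'words': [['this', 'movie', 'is', 'great'], ['I', 'like', 'it']]})
    dev_data = DataSet({'words': [['this', 'movie', 'is', 'amazing']]})

    vocab = Vocabulary()
    # words that only appear in dev/test are registered with no_create_entry=True
    vocab.from_dataset(tr_data, field_name='words', no_create_entry_dataset=[dev_data])

    # a no_create_entry token such as 'amazing' reuses the pretrained (or unk) vector
    # instead of getting its own trainable vector
    embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')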




docs/source/tutorials/tutorial_3_embedding.rst (+2 / -2)

@@ -254,14 +254,14 @@ CNNCharEmbedding的使用例子如下:


.. code-block:: python

- from fastNLP.embeddings import LSTMCharEmbeddding
+ from fastNLP.embeddings import LSTMCharEmbedding
from fastNLP import Vocabulary

vocab = Vocabulary()
vocab.add_word_lst("this is a demo .".split())

# character的embedding维度大小为50,返回的embedding结果维度大小为64。
- embed = LSTMCharEmbeddding(vocab, embed_size=64, char_emb_size=50)
+ embed = LSTMCharEmbedding(vocab, embed_size=64, char_emb_size=50)
words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo .".split()]])
print(embed(words).size())




docs/source/tutorials/tutorial_4_load_dataset.rst (+1 / -1)

@@ -187,7 +187,7 @@ Part V: 不同格式类型的基础Loader
.. code-block:: python

from fastNLP.io.loader import JsonLoader
- oader = JsonLoader(
+ loader = JsonLoader(
    fields={'sentence1': 'raw_words1', 'sentence2': 'raw_words2', 'gold_label': 'target'}
)
# 表示将Json对象中'sentence1'、'sentence2'和'gold_label'对应的值赋给'raw_words1'、'raw_words2'、'target'这三个fields
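A hedged usage sketch for the loader configured above (``'snli_style.jsonl'`` is a placeholder path, not a file shipped with this repository):

.. code-block:: python

    # load() returns a DataBundle; a single file path is exposed as the 'train' dataset
    data_bundle = loader.load('snli_style.jsonl')
    print(data_bundle.get_dataset('train'))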


docs/source/tutorials/tutorial_5_loss_optimizer.rst (+30 / -27)

@@ -1,17 +1,20 @@
============================================================================== ==============================================================================
动手实现一个文本分类器I-使用Trainer和Tester快速训练和测试
使用Trainer和Tester快速训练和测试
============================================================================== ==============================================================================


我们使用和 :doc:`/user/quickstart` 中一样的任务来进行详细的介绍。给出一段评价性文字,预测其情感倾向是积极的(label=0)、
还是消极的(label=1),使用 :class:`~fastNLP.Trainer` 和 :class:`~fastNLP.Tester` 来进行快速训练和测试。
我们使用前面介绍过的 :doc:`/tutorials/文本分类` 任务来进行详细的介绍。这里我们把数据集换成了SST2,使用 :class:`~fastNLP.Trainer` 和 :class:`~fastNLP.Tester` 来进行快速训练和测试。

.. note::

本教程中的代码没有使用 GPU 。读者可以自行修改代码,扩大数据量并使用 GPU 进行训练。


数据读入和处理 数据读入和处理
----------------- -----------------


数据读入 数据读入
我们可以使用 fastNLP :mod:`fastNLP.io` 模块中的 :class:`~fastNLP.io.SST2Pipe` 类,轻松地读取以及预处理SST2数据集。:class:`~fastNLP.io.SST2Pipe` 对象的 我们可以使用 fastNLP :mod:`fastNLP.io` 模块中的 :class:`~fastNLP.io.SST2Pipe` 类,轻松地读取以及预处理SST2数据集。:class:`~fastNLP.io.SST2Pipe` 对象的
:meth:`~fastNLP.io.SST2Pipe.process_from_file` 方法能够对读入的SST2数据集进行数据的预处理,方法的参数为paths, 指要处理的文件所在目录,如果paths为None,则会自动下载数 据集,函数默认paths值为None。
此函数返回一个 :class:`~fastNLP.io.DataBundle`,包含SST2数据集的训练集、测试集、验证集以及source端和target端的字典。其训练、测试、验证数据集含有四个 :mod:`~fastNLP.core.field` :
:meth:`~fastNLP.io.SST2Pipe.process_from_file` 方法能够对读入的SST2数据集进行数据的预处理,方法的参数为paths, 指要处理的文件所在目录,如果paths为None,则会自动下载数据集,函数默认paths值为None。
此函数返回一个 :class:`~fastNLP.io.DataBundle`,包含SST2数据集的训练集、测试集、验证集以及source端和target端的字典。其训练、测试、验证数据集含有四个 :mod:`~fastNLP.core.field` :


* raw_words: 原source句子 * raw_words: 原source句子
* target: 标签值 * target: 标签值
@@ -50,27 +53,27 @@
Vocabulary(['hide', 'new', 'secretions', 'from', 'the']...) Vocabulary(['hide', 'new', 'secretions', 'from', 'the']...)


除了可以对数据进行读入的Pipe类,fastNLP还提供了读入和下载数据的Loader类,不同数据集的Pipe和Loader及其用法详见 :doc:` </tutorials/tutorial_4_load_dataset>` 。
除了可以对数据进行读入的Pipe类,fastNLP还提供了读入和下载数据的Loader类,不同数据集的Pipe和Loader及其用法详见 :doc:`/tutorials/tutorial_4_load_dataset` 。
数据集分割 数据集分割
由于SST2数据集的测试集并不带有标签数值,故我们分割出一部分训练集作为测试集。下面这段代码展示了 :meth:`~fastNLP.DataSet.split` 的使用方法
由于SST2数据集的测试集并不带有标签数值,故我们分割出一部分训练集作为测试集。下面这段代码展示了 :meth:`~fastNLP.DataSet.split` 的使用方法,
为了能让读者快速运行完整个教程,我们只取了训练集的前5000个数据。


.. code-block:: python .. code-block:: python


train_data = databundle.get_dataset('train')
train_data = databundle.get_dataset('train')[:5000]
train_data, test_data = train_data.split(0.015) train_data, test_data = train_data.split(0.015)
dev_data = databundle.get_dataset('dev') dev_data = databundle.get_dataset('dev')
print(len(train_data),len(dev_data),len(test_data)) print(len(train_data),len(dev_data),len(test_data))


输出结果为:: 输出结果为::
66339 872 1010
4925 872 75


数据集 :meth:`~fastNLP.DataSet.set_input` 和 :meth:`~fastNLP.DataSet.set_target` 函数 数据集 :meth:`~fastNLP.DataSet.set_input` 和 :meth:`~fastNLP.DataSet.set_target` 函数
:class:`~fastNLP.io.SST2Pipe` 类的 :meth:`~fastNLP.io.SST2Pipe.process_from_file` 方法在预处理过程中还将训练、测试、验证 :class:`~fastNLP.io.SST2Pipe` 类的 :meth:`~fastNLP.io.SST2Pipe.process_from_file` 方法在预处理过程中还将训练、测试、验证
集的 `words` 、`seq_len` :mod:`~fastNLP.core.field` 设定为input,同时将 `target` :mod:`~fastNLP.core.field` 设定 集的 `words` 、`seq_len` :mod:`~fastNLP.core.field` 设定为input,同时将 `target` :mod:`~fastNLP.core.field` 设定
为target。我们可以通过 :class:`~fastNLP.core.Dataset` 类的 :meth:`~fastNLP.core.Dataset.print_field_meta` 方法查看各个
:mod:`~fastNLP.core.field` 的设定情况,代码如下:
为target。我们可以通过 :class:`~fastNLP.core.Dataset` 类的 :meth:`~fastNLP.core.Dataset.print_field_meta` 方法查看各个 :mod:`~fastNLP.core.field` 的设定情况,代码如下:


.. code-block:: python .. code-block:: python


@@ -92,7 +95,7 @@
当 :mod:`~fastNLP.core.field` 设定为input或者target的时候才有存在的意义。 当 :mod:`~fastNLP.core.field` 设定为input或者target的时候才有存在的意义。


is_input为true的 :mod:`~fastNLP.core.field` 在 :class:`~fastNLP.DataSetIter` 迭代取出的batch_x 中,而is_target为true is_input为true的 :mod:`~fastNLP.core.field` 在 :class:`~fastNLP.DataSetIter` 迭代取出的batch_x 中,而is_target为true
的 :mod:`~fastNLP.core.field` 在:class:`~fastNLP.DataSetIter` 迭代取出的 batch_y 中。
的 :mod:`~fastNLP.core.field` 在 :class:`~fastNLP.DataSetIter` 迭代取出的 batch_y 中。
具体分析见 :doc:`使用DataSetIter实现自定义训练过程 </tutorials/tutorial_6_datasetiter>` 。 具体分析见 :doc:`使用DataSetIter实现自定义训练过程 </tutorials/tutorial_6_datasetiter>` 。


使用内置模型训练 使用内置模型训练
@@ -111,7 +114,7 @@
#还可以传入 kernel_nums, kernel_sizes, padding, dropout的自定义值 #还可以传入 kernel_nums, kernel_sizes, padding, dropout的自定义值
model_cnn = CNNText((len(vocab),EMBED_DIM), num_classes=2, dropout=0.1) model_cnn = CNNText((len(vocab),EMBED_DIM), num_classes=2, dropout=0.1)


使用fastNLP快速搭建自己的模型详见 :doc:`</tutorials/tutorial_8_modules_models>` 。
使用fastNLP快速搭建自己的模型详见 :doc:`/tutorials/tutorial_8_modules_models` 。


评价指标 评价指标
训练模型需要提供一个评价指标。这里使用准确率做为评价指标。 训练模型需要提供一个评价指标。这里使用准确率做为评价指标。
@@ -199,25 +202,25 @@
训练过程的输出如下:: 训练过程的输出如下::


input fields after batch(if batch size is 2): input fields after batch(if batch size is 2):
words: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 16])
words: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 13])
seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2])
target fields after batch(if batch size is 2): target fields after batch(if batch size is 2):
target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2])


training epochs started 2019-09-17-14-29-00
training epochs started 2020-02-26-16-45-40
Evaluate data in 0.5 seconds!
Evaluation on dev at Epoch 1/10. Step:308/3080:
AccuracyMetric: acc=0.677752


Evaluate data in 0.11 seconds!
Evaluation on dev at Epoch 1/10. Step:4147/41470:
AccuracyMetric: acc=0.762615
......


...
Evaluate data in 0.44 seconds!
Evaluation on dev at Epoch 10/10. Step:3080/3080:
AccuracyMetric: acc=0.725917


Evaluate data in 0.2 seconds!
Evaluation on dev at Epoch 10/10. Step:41470/41470:
AccuracyMetric: acc=0.769495


In Epoch:2/Step:8294, got best dev performance:
AccuracyMetric: acc=0.800459
In Epoch:5/Step:1540, got best dev performance:
AccuracyMetric: acc=0.740826
Reloaded the best model. Reloaded the best model.


快速测试 快速测试
@@ -232,6 +235,6 @@
训练过程输出如下:: 训练过程输出如下::
Evaluate data in 0.19 seconds!
[tester]
AccuracyMetric: acc=0.889109
Evaluate data in 0.43 seconds!
[tester]
AccuracyMetric: acc=0.773333
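Putting the pieces of this tutorial together, a condensed end-to-end sketch (the SST2 download, the 5000-instance cap and the 10-epoch run follow the text above; the embedding size of 100 is an assumption, so treat this as illustrative rather than the exact script behind these logs):

.. code-block:: python

    from fastNLP.io import SST2Pipe
    from fastNLP.models import CNNText
    from fastNLP import Trainer, Tester, CrossEntropyLoss, AccuracyMetric

    databundle = SST2Pipe().process_from_file()
    train_data = databundle.get_dataset('train')[:5000]
    train_data, test_data = train_data.split(0.015)
    dev_data = databundle.get_dataset('dev')
    vocab = databundle.get_vocab('words')

    model_cnn = CNNText((len(vocab), 100), num_classes=2, dropout=0.1)

    trainer = Trainer(train_data=train_data, dev_data=dev_data, model=model_cnn,
                      loss=CrossEntropyLoss(), metrics=AccuracyMetric(), n_epochs=10)
    trainer.train()

    tester = Tester(test_data, model_cnn, metrics=AccuracyMetric())
    tester.test()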

docs/source/tutorials/tutorial_6_datasetiter.rst (+109 / -107)

@@ -1,11 +1,13 @@
============================================================================== ==============================================================================
动手实现一个文本分类器II-使用DataSetIter实现自定义训练过程
使用DataSetIter实现自定义训练过程
============================================================================== ==============================================================================


我们使用和 :doc:`/user/quickstart` 中一样的任务来进行详细的介绍。给出一段评价性文字,预测其情感倾向是积极的(label=0)、
还是消极的(label=1),使用 :class:`~fastNLP.DataSetIter` 类来编写自己的训练过程。
我们使用前面介绍过的 :doc:`/tutorials/文本分类` 任务来进行详细的介绍。这里我们把数据集换成了SST2,使用 :class:`~fastNLP.DataSetIter` 类来编写自己的训练过程。
DataSetIter初探之前的内容与 :doc:`/tutorials/tutorial_5_loss_optimizer` 中的完全一样,如已经阅读过可以跳过。 DataSetIter初探之前的内容与 :doc:`/tutorials/tutorial_5_loss_optimizer` 中的完全一样,如已经阅读过可以跳过。


.. note::

本教程中的代码没有使用 GPU 。读者可以自行修改代码,扩大数据量并使用 GPU 进行训练。


数据读入和预处理 数据读入和预处理
-------------------- --------------------
@@ -55,18 +57,19 @@ DataSetIter初探之前的内容与 :doc:`/tutorials/tutorial_5_loss_optimizer`
除了可以对数据进行读入的Pipe类,fastNLP还提供了读入和下载数据的Loader类,不同数据集的Pipe和Loader及其用法详见 :doc:`/tutorials/tutorial_4_load_dataset` 。 除了可以对数据进行读入的Pipe类,fastNLP还提供了读入和下载数据的Loader类,不同数据集的Pipe和Loader及其用法详见 :doc:`/tutorials/tutorial_4_load_dataset` 。
数据集分割 数据集分割
由于SST2数据集的测试集并不带有标签数值,故我们分割出一部分训练集作为测试集。下面这段代码展示了 :meth:`~fastNLP.DataSet.split` 的使用方法
由于SST2数据集的测试集并不带有标签数值,故我们分割出一部分训练集作为测试集。下面这段代码展示了 :meth:`~fastNLP.DataSet.split` 的使用方法,
为了能让读者快速运行完整个教程,我们只取了训练集的前5000个数据。


.. code-block:: python .. code-block:: python


train_data = databundle.get_dataset('train')
train_data = databundle.get_dataset('train')[:5000]
train_data, test_data = train_data.split(0.015) train_data, test_data = train_data.split(0.015)
dev_data = databundle.get_dataset('dev') dev_data = databundle.get_dataset('dev')
print(len(train_data),len(dev_data),len(test_data)) print(len(train_data),len(dev_data),len(test_data))


输出结果为:: 输出结果为::
66339 872 1010
4925 872 75


数据集 :meth:`~fastNLP.DataSet.set_input` 和 :meth:`~fastNLP.DataSet.set_target` 函数 数据集 :meth:`~fastNLP.DataSet.set_input` 和 :meth:`~fastNLP.DataSet.set_target` 函数
:class:`~fastNLP.io.SST2Pipe` 类的 :meth:`~fastNLP.io.SST2Pipe.process_from_file` 方法在预处理过程中还将训练、测试、验证集 :class:`~fastNLP.io.SST2Pipe` 类的 :meth:`~fastNLP.io.SST2Pipe.process_from_file` 方法在预处理过程中还将训练、测试、验证集
@@ -162,33 +165,33 @@ DataSetIter自动padding
输出结果如下:: 输出结果如下::


batch_x: {'words': tensor([[ 4, 278, 686, 18, 7],
[15619, 3205, 5, 1676, 0]]), 'seq_len': tensor([5, 4])}
batch_y: {'target': tensor([1, 1])}
batch_x: {'words': tensor([[ 44, 753, 328, 181, 10, 15622, 16, 71, 8905, 9,
1218, 7, 0, 0, 0, 0, 0, 0, 0, 0],
[ 880, 97, 8, 1027, 12, 8068, 11, 13624, 8, 15620,
4, 674, 663, 15, 4, 1155, 241, 640, 418, 7]]), 'seq_len': tensor([12, 20])}
batch_y: {'target': tensor([1, 0])}
batch_x: {'words': tensor([[ 1046, 11114, 16, 105, 5, 4, 177, 1825, 1705, 3,
2, 18, 11, 4, 1019, 433, 144, 32, 246, 309,
batch_x: {'words': tensor([[ 13, 830, 7746, 174, 3, 47, 6, 83, 5752, 15,
2177, 15, 63, 57, 406, 84, 1009, 4973, 27, 17,
13785, 3, 533, 3687, 15623, 39, 375, 8, 15624, 8,
1323, 4398, 7],
[ 1045, 11113, 16, 104, 5, 4, 176, 1824, 1704, 3,
2, 18, 11, 4, 1018, 432, 143, 33, 245, 308,
7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0],
[ 13, 831, 7747, 175, 3, 46, 6, 84, 5753, 15,
2178, 15, 62, 56, 407, 85, 1010, 4974, 26, 17,
13786, 3, 534, 3688, 15624, 38, 376, 8, 15625, 8,
1324, 4399, 7]]), 'seq_len': tensor([21, 33])}
batch_y: {'target': tensor([0, 1])}
batch_x: {'words': tensor([[ 14, 10, 438, 31, 78, 3, 78, 438, 7],
[ 14, 10, 4, 312, 5, 155, 1419, 610, 7]]), 'seq_len': tensor([9, 9])}
0, 0, 0]]), 'seq_len': tensor([33, 21])}
batch_y: {'target': tensor([1, 0])} batch_y: {'target': tensor([1, 0])}
batch_x: {'words': tensor([[ 24, 96, 27, 45, 8, 337, 37, 240, 8, 2134,
2, 18, 10, 15623, 1422, 6, 60, 5, 388, 7],
[ 2, 156, 3, 4427, 3, 240, 3, 740, 5, 1137,
40, 42, 2428, 737, 2, 649, 10, 15621, 2286, 7]]), 'seq_len': tensor([20, 20])}
batch_x: {'words': tensor([[ 14, 10, 4, 311, 5, 154, 1418, 609, 7],
[ 14, 10, 437, 32, 78, 3, 78, 437, 7]]), 'seq_len': tensor([9, 9])}
batch_y: {'target': tensor([0, 1])}
batch_x: {'words': tensor([[ 4, 277, 685, 18, 7],
[15618, 3204, 5, 1675, 0]]), 'seq_len': tensor([5, 4])}
batch_y: {'target': tensor([1, 1])}
batch_x: {'words': tensor([[ 2, 155, 3, 4426, 3, 239, 3, 739, 5, 1136,
41, 43, 2427, 736, 2, 648, 10, 15620, 2285, 7],
[ 24, 95, 28, 46, 8, 336, 38, 239, 8, 2133,
2, 18, 10, 15622, 1421, 6, 61, 5, 387, 7]]), 'seq_len': tensor([20, 20])}
batch_y: {'target': tensor([0, 0])} batch_y: {'target': tensor([0, 0])}
batch_x: {'words': tensor([[ 879, 96, 8, 1026, 12, 8067, 11, 13623, 8, 15619,
4, 673, 662, 15, 4, 1154, 240, 639, 417, 7],
[ 45, 752, 327, 180, 10, 15621, 16, 72, 8904, 9,
1217, 7, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([20, 12])}
batch_y: {'target': tensor([0, 1])}


可以看到那些设定为input的 :mod:`~fastNLP.core.field` 都出现在batch_x中,而设定为target的 :mod:`~fastNLP.core.field` 则出现在batch_y中。同时对于同一个batch_x中的两个数 据,长度偏短的那个会被自动padding到和长度偏长的句子长度一致,默认的padding值为0。
可以看到那些设定为input的 :mod:`~fastNLP.core.field` 都出现在batch_x中,而设定为target的 :mod:`~fastNLP.core.field` 则出现在batch_y中。同时对于同一个batch_x中的两个数据,长度偏短的那个会被自动padding到和长度偏长的句子长度一致,默认的padding值为0。


Dataset改变padding值 Dataset改变padding值
可以通过 :meth:`~fastNLP.core.Dataset.set_pad_val` 方法修改默认的pad值,代码如下: 可以通过 :meth:`~fastNLP.core.Dataset.set_pad_val` 方法修改默认的pad值,代码如下:
@@ -203,36 +206,36 @@ Dataset改变padding值


输出结果如下:: 输出结果如下::


batch_x: {'words': tensor([[15619, 3205, 5, 1676, -1],
[ 4, 278, 686, 18, 7]]), 'seq_len': tensor([4, 5])}
batch_y: {'target': tensor([1, 1])}
batch_x: {'words': tensor([[ 1046, 11114, 16, 105, 5, 4, 177, 1825, 1705, 3,
2, 18, 11, 4, 1019, 433, 144, 32, 246, 309,
batch_x: {'words': tensor([[ 13, 830, 7746, 174, 3, 47, 6, 83, 5752, 15,
2177, 15, 63, 57, 406, 84, 1009, 4973, 27, 17,
13785, 3, 533, 3687, 15623, 39, 375, 8, 15624, 8,
1323, 4398, 7],
[ 1045, 11113, 16, 104, 5, 4, 176, 1824, 1704, 3,
2, 18, 11, 4, 1018, 432, 143, 33, 245, 308,
7, -1, -1, -1, -1, -1, -1, -1, -1, -1, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1],
[ 13, 831, 7747, 175, 3, 46, 6, 84, 5753, 15,
2178, 15, 62, 56, 407, 85, 1010, 4974, 26, 17,
13786, 3, 534, 3688, 15624, 38, 376, 8, 15625, 8,
1324, 4399, 7]]), 'seq_len': tensor([21, 33])}
batch_y: {'target': tensor([0, 1])}
batch_x: {'words': tensor([[ 14, 10, 4, 312, 5, 155, 1419, 610, 7],
[ 14, 10, 438, 31, 78, 3, 78, 438, 7]]), 'seq_len': tensor([9, 9])}
-1, -1, -1]]), 'seq_len': tensor([33, 21])}
batch_y: {'target': tensor([1, 0])}
batch_x: {'words': tensor([[ 14, 10, 4, 311, 5, 154, 1418, 609, 7],
[ 14, 10, 437, 32, 78, 3, 78, 437, 7]]), 'seq_len': tensor([9, 9])}
batch_y: {'target': tensor([0, 1])} batch_y: {'target': tensor([0, 1])}
batch_x: {'words': tensor([[ 2, 156, 3, 4427, 3, 240, 3, 740, 5, 1137,
40, 42, 2428, 737, 2, 649, 10, 15621, 2286, 7],
[ 24, 96, 27, 45, 8, 337, 37, 240, 8, 2134,
2, 18, 10, 15623, 1422, 6, 60, 5, 388, 7]]), 'seq_len': tensor([20, 20])}
batch_x: {'words': tensor([[ 2, 155, 3, 4426, 3, 239, 3, 739, 5, 1136,
41, 43, 2427, 736, 2, 648, 10, 15620, 2285, 7],
[ 24, 95, 28, 46, 8, 336, 38, 239, 8, 2133,
2, 18, 10, 15622, 1421, 6, 61, 5, 387, 7]]), 'seq_len': tensor([20, 20])}
batch_y: {'target': tensor([0, 0])} batch_y: {'target': tensor([0, 0])}
batch_x: {'words': tensor([[ 44, 753, 328, 181, 10, 15622, 16, 71, 8905, 9,
1218, 7, -1, -1, -1, -1, -1, -1, -1, -1],
[ 880, 97, 8, 1027, 12, 8068, 11, 13624, 8, 15620,
4, 674, 663, 15, 4, 1155, 241, 640, 418, 7]]), 'seq_len': tensor([12, 20])}
batch_y: {'target': tensor([1, 0])}
batch_x: {'words': tensor([[ 4, 277, 685, 18, 7],
[15618, 3204, 5, 1675, -1]]), 'seq_len': tensor([5, 4])}
batch_y: {'target': tensor([1, 1])}
batch_x: {'words': tensor([[ 879, 96, 8, 1026, 12, 8067, 11, 13623, 8, 15619,
4, 673, 662, 15, 4, 1154, 240, 639, 417, 7],
[ 45, 752, 327, 180, 10, 15621, 16, 72, 8904, 9,
1217, 7, -1, -1, -1, -1, -1, -1, -1, -1]]), 'seq_len': tensor([20, 12])}
batch_y: {'target': tensor([0, 1])}
可以看到使用了-1进行padding。 可以看到使用了-1进行padding。


Dataset个性化padding Dataset个性化padding
如果我们希望对某一些 :mod:`~fastNLP.core.field` 进行个性化padding,可以自己构造Padder类,并使用 :meth:`~fastNLP.core.Dataset.set_padder` 函数修改padder来实现。下面通 过构造一个将数据padding到固定长度的padder进行展示:
如果我们希望对某一些 :mod:`~fastNLP.core.field` 进行个性化padding,可以自己构造Padder类,并使用 :meth:`~fastNLP.core.Dataset.set_padder` 函数修改padder来实现。下面通过构造一个将数据padding到固定长度的padder进行展示:


.. code-block:: python .. code-block:: python


@@ -265,53 +268,53 @@ Dataset个性化padding


输出结果如下:: 输出结果如下::


batch_x: {'words': tensor([[ 4, 278, 686, 18, 7, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[15619, 3205, 5, 1676, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([5, 4])}
batch_y: {'target': tensor([1, 1])}
batch_x: {'words': tensor([[ 2, 156, 3, 4427, 3, 240, 3, 740, 5, 1137,
40, 42, 2428, 737, 2, 649, 10, 15621, 2286, 7,
batch_x: {'words': tensor([[ 45, 752, 327, 180, 10, 15621, 16, 72, 8904, 9,
1217, 7, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[ 24, 96, 27, 45, 8, 337, 37, 240, 8, 2134,
2, 18, 10, 15623, 1422, 6, 60, 5, 388, 7,
[ 879, 96, 8, 1026, 12, 8067, 11, 13623, 8, 15619,
4, 673, 662, 15, 4, 1154, 240, 639, 417, 7,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([20, 20])}
batch_y: {'target': tensor([0, 0])}
batch_x: {'words': tensor([[ 13, 831, 7747, 175, 3, 46, 6, 84, 5753, 15,
2178, 15, 62, 56, 407, 85, 1010, 4974, 26, 17,
13786, 3, 534, 3688, 15624, 38, 376, 8, 15625, 8,
1324, 4399, 7, 0, 0, 0, 0, 0, 0, 0],
[ 1046, 11114, 16, 105, 5, 4, 177, 1825, 1705, 3,
2, 18, 11, 4, 1019, 433, 144, 32, 246, 309,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([12, 20])}
batch_y: {'target': tensor([1, 0])}
batch_x: {'words': tensor([[ 13, 830, 7746, 174, 3, 47, 6, 83, 5752, 15,
2177, 15, 63, 57, 406, 84, 1009, 4973, 27, 17,
13785, 3, 533, 3687, 15623, 39, 375, 8, 15624, 8,
1323, 4398, 7, 0, 0, 0, 0, 0, 0, 0],
[ 1045, 11113, 16, 104, 5, 4, 176, 1824, 1704, 3,
2, 18, 11, 4, 1018, 432, 143, 33, 245, 308,
7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([33, 21])} 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([33, 21])}
batch_y: {'target': tensor([1, 0])} batch_y: {'target': tensor([1, 0])}
batch_x: {'words': tensor([[ 14, 10, 4, 312, 5, 155, 1419, 610, 7, 0, 0, 0,
batch_x: {'words': tensor([[ 14, 10, 4, 311, 5, 154, 1418, 609, 7, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0], 0, 0, 0, 0],
[ 14, 10, 438, 31, 78, 3, 78, 438, 7, 0, 0, 0,
[ 14, 10, 437, 32, 78, 3, 78, 437, 7, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0]]), 'seq_len': tensor([9, 9])} 0, 0, 0, 0]]), 'seq_len': tensor([9, 9])}
batch_y: {'target': tensor([0, 1])} batch_y: {'target': tensor([0, 1])}
batch_x: {'words': tensor([[ 44, 753, 328, 181, 10, 15622, 16, 71, 8905, 9,
1218, 7, 0, 0, 0, 0, 0, 0, 0, 0,
batch_x: {'words': tensor([[ 2, 155, 3, 4426, 3, 239, 3, 739, 5, 1136,
41, 43, 2427, 736, 2, 648, 10, 15620, 2285, 7,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[ 880, 97, 8, 1027, 12, 8068, 11, 13624, 8, 15620,
4, 674, 663, 15, 4, 1155, 241, 640, 418, 7,
[ 24, 95, 28, 46, 8, 336, 38, 239, 8, 2133,
2, 18, 10, 15622, 1421, 6, 61, 5, 387, 7,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([12, 20])}
batch_y: {'target': tensor([1, 0])}
0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([20, 20])}
batch_y: {'target': tensor([0, 0])}
batch_x: {'words': tensor([[ 4, 277, 685, 18, 7, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[15618, 3204, 5, 1675, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([5, 4])}
batch_y: {'target': tensor([1, 1])}


在这里所有的`words`都被pad成了长度为40的list。
在这里所有的 `words` 都被pad成了长度为40的list。




使用DataSetIter自己编写训练过程 使用DataSetIter自己编写训练过程
@@ -375,39 +378,38 @@ Dataset个性化padding


-----start training----- -----start training-----


Evaluate data in 0.2 seconds!
Epoch 0 Avg Loss: 0.33 AccuracyMetric: acc=0.825688 48895ms
Evaluate data in 2.68 seconds!
Epoch 0 Avg Loss: 0.66 AccuracyMetric: acc=0.708716 29307ms


Evaluate data in 0.19 seconds!
Epoch 1 Avg Loss: 0.16 AccuracyMetric: acc=0.829128 102081ms
Evaluate data in 0.38 seconds!
Epoch 1 Avg Loss: 0.41 AccuracyMetric: acc=0.770642 52200ms


Evaluate data in 0.18 seconds!
Epoch 2 Avg Loss: 0.10 AccuracyMetric: acc=0.822248 152853ms
Evaluate data in 0.51 seconds!
Epoch 2 Avg Loss: 0.16 AccuracyMetric: acc=0.747706 70268ms


Evaluate data in 0.17 seconds!
Epoch 3 Avg Loss: 0.08 AccuracyMetric: acc=0.821101 200184ms
Evaluate data in 0.96 seconds!
Epoch 3 Avg Loss: 0.06 AccuracyMetric: acc=0.741972 90349ms


Evaluate data in 0.17 seconds!
Epoch 4 Avg Loss: 0.06 AccuracyMetric: acc=0.827982 253097ms
Evaluate data in 1.04 seconds!
Epoch 4 Avg Loss: 0.03 AccuracyMetric: acc=0.740826 114250ms


Evaluate data in 0.27 seconds!
Epoch 5 Avg Loss: 0.05 AccuracyMetric: acc=0.806193 303883ms
Evaluate data in 0.8 seconds!
Epoch 5 Avg Loss: 0.02 AccuracyMetric: acc=0.738532 134742ms


Evaluate data in 0.26 seconds!
Epoch 6 Avg Loss: 0.04 AccuracyMetric: acc=0.803899 392315ms
Evaluate data in 0.65 seconds!
Epoch 6 Avg Loss: 0.01 AccuracyMetric: acc=0.731651 154503ms


Evaluate data in 0.36 seconds!
Epoch 7 Avg Loss: 0.04 AccuracyMetric: acc=0.802752 527211ms
Evaluate data in 0.8 seconds!
Epoch 7 Avg Loss: 0.01 AccuracyMetric: acc=0.738532 175397ms


Evaluate data in 0.15 seconds!
Epoch 8 Avg Loss: 0.03 AccuracyMetric: acc=0.809633 661533ms
Evaluate data in 0.36 seconds!
Epoch 8 Avg Loss: 0.01 AccuracyMetric: acc=0.733945 192384ms


Evaluate data in 0.31 seconds!
Epoch 9 Avg Loss: 0.03 AccuracyMetric: acc=0.797018 812232ms
Evaluate data in 0.84 seconds!
Epoch 9 Avg Loss: 0.01 AccuracyMetric: acc=0.744266 214417ms


Evaluate data in 0.25 seconds!
[tester]
AccuracyMetric: acc=0.917822
Evaluate data in 0.04 seconds!
[tester]
AccuracyMetric: acc=0.786667
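For readers who skimmed the logs above, a condensed sketch of the hand-written loop that produces this kind of output (``model``, ``optimizer``, ``train_data`` and ``n_epochs`` are assumed to be defined as earlier in the tutorial):

.. code-block:: python

    import torch
    from fastNLP import DataSetIter, BucketSampler

    # batches are drawn with similar sequence lengths grouped together
    train_batch = DataSetIter(dataset=train_data, batch_size=16,
                              sampler=BucketSampler(seq_len_field_name='seq_len'))

    for epoch in range(n_epochs):
        for batch_x, batch_y in train_batch:
            optimizer.zero_grad()
            output = model(batch_x['words'])  # fastNLP models return a dict with a 'pred' key
            loss = torch.nn.functional.cross_entropy(output['pred'], batch_y['target'])
            loss.backward()
            optimizer.step()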





docs/source/tutorials/tutorial_7_metrics.rst (+40 / -34)

@@ -3,14 +3,12 @@
=============================== ===============================


在进行训练时,fastNLP提供了各种各样的 :mod:`~fastNLP.core.metrics` 。 在进行训练时,fastNLP提供了各种各样的 :mod:`~fastNLP.core.metrics` 。
:doc:`/user/quickstart` 中所介绍的,:class:`~fastNLP.AccuracyMetric` 类的对象被直接传到 :class:`~fastNLP.Trainer` 中用于训练
前面的教程中所介绍,:class:`~fastNLP.AccuracyMetric` 类的对象被直接传到 :class:`~fastNLP.Trainer` 中用于训练


.. code-block:: python .. code-block:: python


from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric

trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data,
loss=CrossEntropyLoss(), metrics=AccuracyMetric())
trainer = Trainer(train_data=train_data, dev_data=dev_data, model=model,
loss=loss, device=device, metrics=metric)
trainer.train() trainer.train()


除了 :class:`~fastNLP.AccuracyMetric` 之外,:class:`~fastNLP.SpanFPreRecMetric` 也是一种非常见的评价指标, 除了 :class:`~fastNLP.AccuracyMetric` 之外,:class:`~fastNLP.SpanFPreRecMetric` 也是一种非常见的评价指标,
@@ -40,7 +38,7 @@


get_metric(xxx) 当所有数据处理完毕时调用该方法,它将根据 evaluate函数累计的评价指标统计量来计算最终的评价结果 get_metric(xxx) 当所有数据处理完毕时调用该方法,它将根据 evaluate函数累计的评价指标统计量来计算最终的评价结果


以分类问题中,Accuracy计算为例,假设model的forward返回dict中包含 `pred` 这个key, 并且该key需要用于Accuracy::
以分类问题中,accuracy 计算为例,假设 model 的 `forward` 返回 dict 中包含 `pred` 这个 key , 并且该 key 需要用于 accuracy::


class Model(nn.Module): class Model(nn.Module):
def __init__(xxx): def __init__(xxx):
@@ -49,58 +47,67 @@
# do something # do something
return {'pred': pred, 'other_keys':xxx} # pred's shape: batch_size x num_classes return {'pred': pred, 'other_keys':xxx} # pred's shape: batch_size x num_classes


假设dataset中 `label` 这个field是需要预测的值,并且该field被设置为了target
对应的AccMetric可以按如下的定义, version1, 只使用这一次::
假设dataset中 `target` 这个 field 是需要预测的值,并且该 field 被设置为了 target 对应的 `AccMetric` 可以按如下的定义( Version 1, 只使用这一次)::

from fastNLP import MetricBase


class AccMetric(MetricBase): class AccMetric(MetricBase):

def __init__(self): def __init__(self):
super().__init__() super().__init__()

# 根据你的情况自定义指标 # 根据你的情况自定义指标
self.corr_num = 0
self.total = 0 self.total = 0
self.acc_count = 0


def evaluate(self, label, pred): # 这里的名称需要和dataset中target field与model返回的key是一样的,不然找不到对应的value
# evaluate的参数需要和DataSet 中 field 名以及模型输出的结果 field 名一致,不然找不到对应的value
# pred, target 的参数是 fastNLP 的默认配置
def evaluate(self, pred, target):
# dev或test时,每个batch结束会调用一次该方法,需要实现如何根据每个batch累加metric # dev或test时,每个batch结束会调用一次该方法,需要实现如何根据每个batch累加metric
self.total += label.size(0)
self.corr_num += label.eq(pred).sum().item()
self.total += target.size(0)
self.acc_count += target.eq(pred).sum().item()


def get_metric(self, reset=True): # 在这里定义如何计算metric def get_metric(self, reset=True): # 在这里定义如何计算metric
acc = self.corr_num/self.total
acc = self.acc_count/self.total
if reset: # 是否清零以便重新计算 if reset: # 是否清零以便重新计算
self.corr_num = 0
self.acc_count = 0
self.total = 0 self.total = 0
return {'acc': acc} # 需要返回一个dict,key为该metric的名称,该名称会显示到Trainer的progress bar中
return {'acc': acc}
# 需要返回一个dict,key为该metric的名称,该名称会显示到Trainer的progress bar中




version2,如果需要复用Metric,比如下一次使用AccMetric时,dataset中目标field不叫label而叫y,或者model的输出不是pred::
如果需要复用 metric,比如下一次使用 `AccMetric` 时,dataset中目标field不叫 `target` 而叫 `y` ,或者model的输出不是 `pred` (Version 2)::


class AccMetric(MetricBase): class AccMetric(MetricBase):
def __init__(self, label=None, pred=None):
# 假设在另一场景使用时,目标field叫y,model给出的key为pred_y。则只需要在初始化AccMetric时,
# acc_metric = AccMetric(label='y', pred='pred_y')即可。
# 当初始化为acc_metric = AccMetric(),即label=None, pred=None, fastNLP会直接使用'label', 'pred'作为key去索取对
# 应的的值
def __init__(self, pred=None, target=None):
"""
假设在另一场景使用时,目标field叫y,model给出的key为pred_y。则只需要在初始化AccMetric时,
acc_metric = AccMetric(pred='pred_y', target='y')即可。
当初始化为acc_metric = AccMetric() 时,fastNLP会直接使用 'pred', 'target' 作为key去索取对应的的值
"""

super().__init__() super().__init__()
self._init_param_map(label=label, pred=pred) # 该方法会注册label和pred. 仅需要注册evaluate()方法会用到的参数名即可
# 如果没有注册该则效果与version1就是一样的

# 如果没有注册该则效果与 Version 1 就是一样的
self._init_param_map(pred=pred, target=target) # 该方法会注册 pred 和 target . 仅需要注册evaluate()方法会用到的参数名即可


# 根据你的情况自定义指标 # 根据你的情况自定义指标
self.corr_num = 0
self.total = 0 self.total = 0
self.acc_count = 0


def evaluate(self, label, pred): # 这里的参数名称需要和self._init_param_map()注册时一致。
# evaluate的参数需要和DataSet 中 field 名以及模型输出的结果 field 名一致,不然找不到对应的value
# pred, target 的参数是 fastNLP 的默认配置
def evaluate(self, pred, target):
# dev或test时,每个batch结束会调用一次该方法,需要实现如何根据每个batch累加metric # dev或test时,每个batch结束会调用一次该方法,需要实现如何根据每个batch累加metric
self.total += label.size(0)
self.corr_num += label.eq(pred).sum().item()
self.total += target.size(0)
self.acc_count += target.eq(pred).sum().item()


def get_metric(self, reset=True): # 在这里定义如何计算metric def get_metric(self, reset=True): # 在这里定义如何计算metric
acc = self.corr_num/self.total
acc = self.acc_count/self.total
if reset: # 是否清零以便重新计算 if reset: # 是否清零以便重新计算
self.corr_num = 0
self.acc_count = 0
self.total = 0 self.total = 0
return {'acc': acc} # 需要返回一个dict,key为该metric的名称,该名称会显示到Trainer的progress bar中
return {'acc': acc}
# 需要返回一个dict,key为该metric的名称,该名称会显示到Trainer的progress bar中


``MetricBase`` 将会在输入的字典 ``pred_dict`` 和 ``target_dict`` 中进行检查. ``MetricBase`` 将会在输入的字典 ``pred_dict`` 和 ``target_dict`` 中进行检查.
``pred_dict`` 是模型当中 ``forward()`` 函数或者 ``predict()`` 函数的返回值. ``pred_dict`` 是模型当中 ``forward()`` 函数或者 ``predict()`` 函数的返回值.
@@ -108,14 +115,13 @@ version2,如果需要复用Metric,比如下一次使用AccMetric时,datase


``MetricBase`` 会进行以下的类型检测: ``MetricBase`` 会进行以下的类型检测:


1. self.evaluate当中是否有varargs, 这是不支持的.
1. self.evaluate当中是否有 varargs, 这是不支持的.
2. self.evaluate当中所需要的参数是否既不在 ``pred_dict`` 也不在 ``target_dict`` . 2. self.evaluate当中所需要的参数是否既不在 ``pred_dict`` 也不在 ``target_dict`` .
3. self.evaluate当中所需要的参数是否既在 ``pred_dict`` 也在 ``target_dict`` . 3. self.evaluate当中所需要的参数是否既在 ``pred_dict`` 也在 ``target_dict`` .


除此以外,在参数被传入self.evaluate以前,这个函数会检测 ``pred_dict`` 和 ``target_dict`` 当中没有被用到的参数 除此以外,在参数被传入self.evaluate以前,这个函数会检测 ``pred_dict`` 和 ``target_dict`` 当中没有被用到的参数
如果kwargs是self.evaluate的参数,则不会检测 如果kwargs是self.evaluate的参数,则不会检测



self.evaluate将计算一个批次(batch)的评价指标,并累计。 没有返回值 self.evaluate将计算一个批次(batch)的评价指标,并累计。 没有返回值
self.get_metric将统计当前的评价指标并返回评价结果, 返回值需要是一个dict, key是指标名称,value是指标的值 self.get_metric将统计当前的评价指标并返回评价结果, 返回值需要是一个dict, key是指标名称,value是指标的值
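A brief usage sketch for the custom metric defined above (``model`` and the DataSets are assumed to be prepared as in the earlier tutorials):

.. code-block:: python

    from fastNLP import Trainer, CrossEntropyLoss

    # Version 1: the field names already match ('pred' from the model, 'target' in the DataSet)
    trainer = Trainer(train_data=train_data, dev_data=dev_data, model=model,
                      loss=CrossEntropyLoss(), metrics=AccMetric())
    trainer.train()

    # Version 2: map differently named fields when constructing the metric
    # metric = AccMetric(pred='pred_y', target='y')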



docs/source/tutorials/tutorial_8_modules_models.rst (+22 / -45)

@@ -11,39 +11,16 @@


fastNLP 在 :mod:`~fastNLP.models` 模块中内置了如 :class:`~fastNLP.models.CNNText` 、 fastNLP 在 :mod:`~fastNLP.models` 模块中内置了如 :class:`~fastNLP.models.CNNText` 、
:class:`~fastNLP.models.SeqLabeling` 等完整的模型,以供用户直接使用。 :class:`~fastNLP.models.SeqLabeling` 等完整的模型,以供用户直接使用。
以 :class:`~fastNLP.models.CNNText` 为例,我们看一个简单的文本分类的任务的实现过程。

首先是数据读入和处理部分,这里的代码和 :doc:`快速入门 </user/quickstart>` 中一致。

.. code-block:: python

from fastNLP.io import CSVLoader
from fastNLP import Vocabulary, CrossEntropyLoss, AccuracyMetric

loader = CSVLoader(headers=('raw_sentence', 'label'), sep='\t')
dataset = loader.load("./sample_data/tutorial_sample_dataset.csv")

dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='sentence')
dataset.apply_field(lambda x: x.split(), field_name='sentence', new_field_name='words', is_input=True)
dataset.apply(lambda x: int(x['label']), new_field_name='target', is_target=True)

train_dev_data, test_data = dataset.split(0.1)
train_data, dev_data = train_dev_data.split(0.1)

vocab = Vocabulary(min_freq=2).from_dataset(train_data, field_name='words')
vocab.index_dataset(train_data, dev_data, test_data, field_name='words', new_field_name='words')

然后我们从 :mod:`~fastNLP.models` 中导入 ``CNNText`` 模型,用它进行训练
以文本分类的任务为例,我们从 models 中导入 :class:`~fastNLP.models.CNNText` 模型,用它进行训练。


.. code-block:: python .. code-block:: python


from fastNLP.models import CNNText from fastNLP.models import CNNText
from fastNLP import Trainer


model_cnn = CNNText((len(vocab),50), num_classes=5, padding=2, dropout=0.1)
model_cnn = CNNText((len(vocab),100), num_classes=2, dropout=0.1)


trainer = Trainer(model=model_cnn, train_data=train_data, dev_data=dev_data,
loss=CrossEntropyLoss(), metrics=AccuracyMetric())
trainer = Trainer(train_data=train_data, dev_data=dev_data, metrics=metric,
loss=loss, device=device, model=model_cnn)
trainer.train() trainer.train()


在 iPython 环境输入 `model_cnn` ,我们可以看到 ``model_cnn`` 的网络结构 在 iPython 环境输入 `model_cnn` ,我们可以看到 ``model_cnn`` 的网络结构
@@ -52,18 +29,18 @@ fastNLP 在 :mod:`~fastNLP.models` 模块中内置了如 :class:`~fastNLP.models


CNNText( CNNText(
(embed): Embedding( (embed): Embedding(
169, 50
(dropout): Dropout(p=0.0)
(embed): Embedding(16292, 100)
(dropout): Dropout(p=0.0, inplace=False)
) )
(conv_pool): ConvMaxpool( (conv_pool): ConvMaxpool(
(convs): ModuleList( (convs): ModuleList(
(0): Conv1d(50, 3, kernel_size=(3,), stride=(1,), padding=(2,))
(1): Conv1d(50, 4, kernel_size=(4,), stride=(1,), padding=(2,))
(2): Conv1d(50, 5, kernel_size=(5,), stride=(1,), padding=(2,))
(0): Conv1d(100, 30, kernel_size=(1,), stride=(1,), bias=False)
(1): Conv1d(100, 40, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
(2): Conv1d(100, 50, kernel_size=(5,), stride=(1,), padding=(2,), bias=False)
) )
) )
(dropout): Dropout(p=0.1)
(fc): Linear(in_features=12, out_features=5, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(fc): Linear(in_features=120, out_features=2, bias=True)
) )


FastNLP 中内置的 models 如下表所示,您可以点击具体的名称查看详细的 API: FastNLP 中内置的 models 如下表所示,您可以点击具体的名称查看详细的 API:
@@ -131,10 +108,10 @@ FastNLP 完全支持使用 pyTorch 编写的模型,但与 pyTorch 中编写模
.. parsed-literal:: .. parsed-literal::


LSTMText( LSTMText(
(embedding): Embedding(169, 50)
(lstm): LSTM(50, 64, num_layers=2, dropout=0.5, bidirectional=True)
(fc): Linear(in_features=128, out_features=5, bias=True)
(dropout): Dropout(p=0.5)
(embedding): Embedding(16292, 100)
(lstm): LSTM(100, 64, num_layers=2, dropout=0.5, bidirectional=True)
(fc): Linear(in_features=128, out_features=2, bias=True)
(dropout): Dropout(p=0.5, inplace=False)
) )




@@ -148,7 +125,7 @@ FastNLP 完全支持使用 pyTorch 编写的模型,但与 pyTorch 中编写模


from fastNLP.modules import Embedding, LSTM, MLP from fastNLP.modules import Embedding, LSTM, MLP


class Model(nn.Module):
class MyText(nn.Module):
def __init__(self, vocab_size, embedding_dim, output_dim, hidden_dim=64, num_layers=2, dropout=0.5): def __init__(self, vocab_size, embedding_dim, output_dim, hidden_dim=64, num_layers=2, dropout=0.5):
super().__init__() super().__init__()


@@ -166,18 +143,18 @@ FastNLP 完全支持使用 pyTorch 编写的模型,但与 pyTorch 中编写模


.. parsed-literal:: .. parsed-literal::


Model(
MyText(
(embedding): Embedding( (embedding): Embedding(
169, 50
(dropout): Dropout(p=0.0)
(embed): Embedding(16292, 100)
(dropout): Dropout(p=0.0, inplace=False)
) )
(lstm): LSTM( (lstm): LSTM(
(lstm): LSTM(50, 64, num_layers=2, batch_first=True, bidirectional=True)
(lstm): LSTM(100, 64, num_layers=2, batch_first=True, bidirectional=True)
) )
(mlp): MLP( (mlp): MLP(
(hiddens): ModuleList() (hiddens): ModuleList()
(output): Linear(in_features=128, out_features=5, bias=True)
(dropout): Dropout(p=0.5)
(output): Linear(in_features=128, out_features=2, bias=True)
(dropout): Dropout(p=0.5, inplace=False)
) )
) )
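As a hedged follow-up, the custom model can be handed to Trainer exactly like the built-in ones (this assumes ``MyText.forward`` returns a dict with a ``'pred'`` key, and reuses the vocab, data splits and metric prepared earlier in the tutorial):

.. code-block:: python

    from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric

    model_text = MyText(vocab_size=len(vocab), embedding_dim=100, output_dim=2)

    trainer = Trainer(train_data=train_data, dev_data=dev_data, model=model_text,
                      loss=CrossEntropyLoss(), metrics=AccuracyMetric())
    trainer.train()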




docs/source/tutorials/tutorial_10_callback.rst → docs/source/tutorials/tutorial_9_callback.rst


docs/source/tutorials/tutorial_9_seq_labeling.rst → docs/source/tutorials/序列标注.rst

@@ -1,9 +1,13 @@
===================== =====================
快速实现序列标注模型
序列标注
===================== =====================


这一部分的内容主要展示如何使用fastNLP实现序列标注任务。您可以使用fastNLP的各个组件快捷,方便地完成序列标注任务,达到出色的效果。
在阅读这篇Tutorial前,希望您已经熟悉了fastNLP的基础使用,尤其是数据的载入以及模型的构建,通过这个小任务的能让您进一步熟悉fastNLP的使用。
这一部分的内容主要展示如何使用fastNLP实现序列标注(Sequence labeling)任务。您可以使用fastNLP的各个组件快捷,方便地完成序列标注任务,达到出色的效果。
在阅读这篇教程前,希望您已经熟悉了fastNLP的基础使用,尤其是数据的载入以及模型的构建。通过这个小任务,能让您进一步熟悉fastNLP的使用。

.. note::

本教程推荐使用 GPU 进行实验


命名实体识别(name entity recognition, NER) 命名实体识别(name entity recognition, NER)
------------------------------------------ ------------------------------------------
@@ -16,8 +20,8 @@
其中“复旦大学”就是一个机构名,命名实体识别就是要从中识别出“复旦大学”这四个字是一个整体,且属于机构名这个类别。这个问题在实际做的时候会被 其中“复旦大学”就是一个机构名,命名实体识别就是要从中识别出“复旦大学”这四个字是一个整体,且属于机构名这个类别。这个问题在实际做的时候会被
转换为序列标注问题 转换为序列标注问题


针对"我来自复旦大学"这句话,我们的预测目标将是[O, O, O, B-ORG, I-ORG, I-ORG, I-ORG],其中O表示out,即不是一个实体,B-ORG是ORG(
organization的缩写)这个类别的开头(Begin),I-ORG是ORG类别的中间(Inside)。
针对"我来自复旦大学"这句话,我们的预测目标将是[O, O, O, B-ORG, I-ORG, I-ORG, I-ORG],其中O表示out,即不是一个实体,B-ORG是ORG(
organization的缩写)这个类别的开头(Begin),I-ORG是ORG类别的中间(Inside)。


在本tutorial中我们将通过fastNLP尝试写出一个能够执行以上任务的模型。 在本tutorial中我们将通过fastNLP尝试写出一个能够执行以上任务的模型。


@@ -64,6 +68,9 @@ fastNLP的数据载入主要是由Loader与Pipe两个基类衔接完成的,您
model = BiLSTMCRF(embed=embed, num_classes=len(data_bundle.get_vocab('target')), num_layers=1, hidden_size=200, dropout=0.5, model = BiLSTMCRF(embed=embed, num_classes=len(data_bundle.get_vocab('target')), num_layers=1, hidden_size=200, dropout=0.5,
target_vocab=data_bundle.get_vocab('target')) target_vocab=data_bundle.get_vocab('target'))


进行训练
--------------------------------

下面我们选择用来评估模型的metric,以及优化用到的优化函数。 下面我们选择用来评估模型的metric,以及优化用到的优化函数。


.. code-block:: python .. code-block:: python
@@ -76,7 +83,7 @@ fastNLP的数据载入主要是由Loader与Pipe两个基类衔接完成的,您
optimizer = Adam(model.parameters(), lr=1e-2) optimizer = Adam(model.parameters(), lr=1e-2)
loss = LossInForward() loss = LossInForward()


使用Trainer进行训练
使用Trainer进行训练, 您可以通过修改 device 的值来选择显卡。


.. code-block:: python .. code-block:: python


@@ -114,9 +121,12 @@ fastNLP的数据载入主要是由Loader与Pipe两个基类衔接完成的,您
SpanFPreRecMetric: f=0.515528, pre=0.65098, rec=0.426735 SpanFPreRecMetric: f=0.515528, pre=0.65098, rec=0.426735
Reloaded the best model. Reloaded the best model.


进行测试
--------------------------------

训练结束之后过,可以通过 :class:`~fastNLP.Tester` 测试其在测试集上的性能 训练结束之后过,可以通过 :class:`~fastNLP.Tester` 测试其在测试集上的性能


.. code-block::python
.. code-block:: python


from fastNLP import Tester from fastNLP import Tester


@@ -132,7 +142,7 @@ fastNLP的数据载入主要是由Loader与Pipe两个基类衔接完成的,您
使用更强的Bert做序列标注 使用更强的Bert做序列标注
-------------------------------- --------------------------------


在fastNLP使用Bert进行任务,您只需要切换为 :class:`fastNLP.embeddings.BertEmbedding` 即可
在fastNLP使用Bert进行任务,您只需要把 :class:`fastNLP.embeddings.StaticEmbedding` 切换为 :class:`fastNLP.embeddings.BertEmbedding` (可修改 device 选择显卡)


.. code-block:: python .. code-block:: python
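The swap described above is a one-line change; a hedged sketch (the ``'chars'`` field name and the ``'cn-wwm'`` model tag follow the conventions used elsewhere in these tutorials and are assumptions here, not lines from the patch):

.. code-block:: python

    from fastNLP.embeddings import BertEmbedding

    # replaces the StaticEmbedding used earlier; everything downstream stays unchanged
    embed = BertEmbedding(vocab=data_bundle.get_vocab('chars'), model_dir_or_name='cn-wwm')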



docs/source/quickstart/文本分类.rst → docs/source/tutorials/文本分类.rst

@@ -1,7 +1,11 @@
- 文本分类(Text classification)
+ 文本分类
=============================

- 文本分类任务是将一句话或一段话划分到某个具体的类别。比如垃圾邮件识别,文本情绪分类等。
+ 文本分类(Text classification)任务是将一句话或一段话划分到某个具体的类别。比如垃圾邮件识别,文本情绪分类等。这篇教程可以带你从零开始了解 fastNLP 的使用
+
+ .. note::
+
+     本教程推荐使用 GPU 进行实验

.. code-block:: text

@@ -15,8 +19,6 @@
.. figure:: ./cn_cls_example.png
    :alt: jupyter

- jupyter

步骤
----



docs/source/user/quickstart.rst (+4 / -4)

@@ -2,13 +2,13 @@
快速入门
===============

- 如果你想用 fastNLP 来快速地解决某类自然语言处理问题,你可以参考以下教程之一
+ 如果你想用 fastNLP 来快速地解决某类 NLP 问题,你可以参考以下教程之一:

.. toctree::
   :maxdepth: 1

- /quickstart/文本分类
+ /tutorials/文本分类
+ /tutorials/序列标注

- 这些教程是简单地介绍了使用 fastNLP 的流程,更多的教程分析见 :doc:`/user/tutorials`
+ 这些教程是简单地介绍了 fastNLP 的使用流程,其中文本分类相对简单,序列标注则较为复杂。更多的教程分析见 :doc:`/user/tutorials`



docs/source/user/tutorials.rst (+3 / -4)

@@ -11,12 +11,11 @@ fastNLP 详细使用教程
使用Vocabulary转换文本与index </tutorials/tutorial_2_vocabulary>
使用Embedding模块将文本转成向量 </tutorials/tutorial_3_embedding>
使用Loader和Pipe加载并处理数据集 </tutorials/tutorial_4_load_dataset>
- 动手实现一个文本分类器I-使用Trainer和Tester快速训练和测试 </tutorials/tutorial_5_loss_optimizer>
- 动手实现一个文本分类器II-使用DataSetIter实现自定义训练过程 </tutorials/tutorial_6_datasetiter>
+ 使用Trainer和Tester快速训练和测试 </tutorials/tutorial_5_loss_optimizer>
+ 使用DataSetIter实现自定义训练过程 </tutorials/tutorial_6_datasetiter>
使用Metric快速评测你的模型 </tutorials/tutorial_7_metrics>
使用Modules和Models快速搭建自定义模型 </tutorials/tutorial_8_modules_models>
- 快速实现序列标注模型 </tutorials/tutorial_9_seq_labeling>
- 使用Callback自定义你的训练过程 </tutorials/tutorial_10_callback>
+ 使用Callback自定义你的训练过程 </tutorials/tutorial_9_callback>

.. toctree::
   :maxdepth: 1


test/test_tutorials.py (+75 / -0)

@@ -7,7 +7,82 @@ from fastNLP.core.losses import CrossEntropyLoss
from fastNLP.core.metrics import AccuracyMetric from fastNLP.core.metrics import AccuracyMetric
from fastNLP.io.loader import CSVLoader from fastNLP.io.loader import CSVLoader



class TestTutorial(unittest.TestCase): class TestTutorial(unittest.TestCase):
def test_tutorial_1_data_preprocess(self):
from fastNLP import DataSet
data = {'raw_words': ["This is the first instance .", "Second instance .", "Third instance ."],
'words': [['this', 'is', 'the', 'first', 'instance', '.'], ['Second', 'instance', '.'],
['Third', 'instance', '.']],
'seq_len': [6, 3, 3]}
dataset = DataSet(data)
# 传入的dict的每个key的value应该为具有相同长度的list

from fastNLP import DataSet
from fastNLP import Instance
dataset = DataSet()
instance = Instance(raw_words="This is the first instance",
words=['this', 'is', 'the', 'first', 'instance', '.'],
seq_len=6)
dataset.append(instance)

from fastNLP import DataSet
from fastNLP import Instance
dataset = DataSet([
Instance(raw_words="This is the first instance",
words=['this', 'is', 'the', 'first', 'instance', '.'],
seq_len=6),
Instance(raw_words="Second instance .",
words=['Second', 'instance', '.'],
seq_len=3)
])

from fastNLP import DataSet
dataset = DataSet({'a': range(-5, 5), 'c': [0] * 10})

# 不改变dataset,生成一个删除了满足条件的instance的新 DataSet
dropped_dataset = dataset.drop(lambda ins: ins['a'] < 0, inplace=False)
# 在dataset中删除满足条件的instance
dataset.drop(lambda ins: ins['a'] < 0)
# 删除第3个instance
dataset.delete_instance(2)
# 删除名为'a'的field
dataset.delete_field('a')

# 检查是否存在名为'a'的field
print(dataset.has_field('a')) # 或 ('a' in dataset)
# 将名为'a'的field改名为'b'
dataset.rename_field('c', 'b')
# DataSet的长度
len(dataset)

from fastNLP import DataSet
data = {'raw_words': ["This is the first instance .", "Second instance .", "Third instance ."]}
dataset = DataSet(data)

# 将句子分成单词形式, 详见DataSet.apply()方法
dataset.apply(lambda ins: ins['raw_words'].split(), new_field_name='words')

# 或使用DataSet.apply_field()
dataset.apply_field(lambda sent: sent.split(), field_name='raw_words', new_field_name='words')

# 除了匿名函数,也可以定义函数传递进去
def get_words(instance):
sentence = instance['raw_words']
words = sentence.split()
return words

dataset.apply(get_words, new_field_name='words')
def setUp(self):
import os
self._init_wd = os.path.abspath(os.curdir)

def tearDown(self):
import os
os.chdir(self._init_wd)
class TestOldTutorial(unittest.TestCase):
def test_fastnlp_10min_tutorial(self): def test_fastnlp_10min_tutorial(self):
# 从csv读取数据到DataSet # 从csv读取数据到DataSet
sample_path = "test/data_for_tests/tutorial_sample_dataset.csv" sample_path = "test/data_for_tests/tutorial_sample_dataset.csv"
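For reference, the new tutorial test above types `seq_len` in by hand in the data dict; a minimal sketch of deriving it from the tokenized field instead, assuming `DataSet.add_seq_len` is available in this fastNLP release (the call is not part of the test itself):

```python
from fastNLP import DataSet

dataset = DataSet({'raw_words': ["This is the first instance .", "Second instance ."]})
# tokenize into a 'words' field, then derive 'seq_len' from it
dataset.apply_field(str.split, field_name='raw_words', new_field_name='words')
dataset.add_seq_len('words')  # assumption: DataSet.add_seq_len(field_name) exists in this release
print(dataset)
```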


+ 1
- 1
tutorials/README.md

@@ -1,3 +1,3 @@
# fastNLP 教程 # fastNLP 教程


这里只保留了部分的
这里是 fastNLP 文档中的**快速入门**和**详细教程**部分的 jupyter notebook 文件。

+ 0
- 470
tutorials/bert_embedding_tutorial.ipynb

@@ -1,470 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# BertEmbedding的各种用法\n",
"fastNLP的BertEmbedding以pytorch-transformer.BertModel的代码为基础,是一个使用BERT对words进行编码的Embedding。\n",
"\n",
"使用BertEmbedding和fastNLP.models.bert里面模型可以搭建BERT应用到五种下游任务的模型。\n",
"\n",
"*预训练好的Embedding参数及数据集的介绍和自动下载功能见 [Embedding教程](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_3_embedding.html) 和 [数据处理教程](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_4_load_dataset.html)。*\n",
"\n",
"## 1. BERT for Squence Classification\n",
"在文本分类任务中,我们采用SST数据集作为例子来介绍BertEmbedding的使用方法。"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import warnings\n",
"import torch\n",
"warnings.filterwarnings(\"ignore\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"In total 3 datasets:\n",
"\ttest has 2210 instances.\n",
"\ttrain has 8544 instances.\n",
"\tdev has 1101 instances.\n",
"In total 2 vocabs:\n",
"\twords has 21701 entries.\n",
"\ttarget has 5 entries."
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 载入数据集\n",
"from fastNLP.io import SSTPipe\n",
"data_bundle = SSTPipe(subtree=False, train_subtree=False, lower=False, tokenizer='raw').process_from_file()\n",
"data_bundle"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/source/fastnlp_caches/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/source/fastnlp_caches/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 21701 words out of 21701.\n"
]
}
],
"source": [
"# 载入BertEmbedding\n",
"from fastNLP.embeddings import BertEmbedding\n",
"embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='en-base-cased', include_cls_sep=True)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# 载入模型\n",
"from fastNLP.models import BertForSequenceClassification\n",
"model = BertForSequenceClassification(embed, len(data_bundle.get_vocab('target')))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"input fields after batch(if batch size is 2):\n",
"\twords: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 37]) \n",
"\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"target fields after batch(if batch size is 2):\n",
"\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"\n",
"training epochs started 2019-09-11-17-35-26\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=268), HTML(value='')), layout=Layout(display=…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=18), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 2.08 seconds!\n",
"Evaluation on dev at Epoch 1/2. Step:134/268: \n",
"AccuracyMetric: acc=0.459582\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=18), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 2.2 seconds!\n",
"Evaluation on dev at Epoch 2/2. Step:268/268: \n",
"AccuracyMetric: acc=0.468665\n",
"\n",
"\n",
"In Epoch:2/Step:268, got best dev performance:\n",
"AccuracyMetric: acc=0.468665\n",
"Reloaded the best model.\n"
]
},
{
"data": {
"text/plain": [
"{'best_eval': {'AccuracyMetric': {'acc': 0.468665}},\n",
" 'best_epoch': 2,\n",
" 'best_step': 268,\n",
" 'seconds': 114.5}"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 训练模型\n",
"from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam\n",
"trainer = Trainer(data_bundle.get_dataset('train'), model, \n",
" optimizer=Adam(model_params=model.parameters(), lr=2e-5), \n",
" loss=CrossEntropyLoss(), device=[0],\n",
" batch_size=64, dev_data=data_bundle.get_dataset('dev'), \n",
" metrics=AccuracyMetric(), n_epochs=2, print_every=1)\n",
"trainer.train()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=18), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 4.52 seconds!\n",
"[tester] \n",
"AccuracyMetric: acc=0.504072\n"
]
},
{
"data": {
"text/plain": [
"{'AccuracyMetric': {'acc': 0.504072}}"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 测试结果并删除模型\n",
"from fastNLP import Tester\n",
"tester = Tester(data_bundle.get_dataset('test'), model, batch_size=128, metrics=AccuracyMetric())\n",
"tester.test()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"## 2. BERT for Sentence Matching\n",
"在Matching任务中,我们采用RTE数据集作为例子来介绍BertEmbedding的使用方法。"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"In total 3 datasets:\n",
"\ttest has 3000 instances.\n",
"\ttrain has 2490 instances.\n",
"\tdev has 277 instances.\n",
"In total 2 vocabs:\n",
"\twords has 41281 entries.\n",
"\ttarget has 2 entries."
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 载入数据集\n",
"from fastNLP.io import RTEBertPipe\n",
"data_bundle = RTEBertPipe(lower=False, tokenizer='raw').process_from_file()\n",
"data_bundle"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/source/fastnlp_caches/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/source/fastnlp_caches/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 41279 words out of 41281.\n"
]
}
],
"source": [
"# 载入BertEmbedding\n",
"from fastNLP.embeddings import BertEmbedding\n",
"embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='en-base-cased', include_cls_sep=True)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# 载入模型\n",
"from fastNLP.models import BertForSentenceMatching\n",
"model = BertForSentenceMatching(embed, len(data_bundle.get_vocab('target')))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"input fields after batch(if batch size is 2):\n",
"\twords: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 45]) \n",
"\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"target fields after batch(if batch size is 2):\n",
"\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"\n",
"training epochs started 2019-09-11-17-37-36\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=312), HTML(value='')), layout=Layout(display=…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=18), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 1.72 seconds!\n",
"Evaluation on dev at Epoch 1/2. Step:156/312: \n",
"AccuracyMetric: acc=0.624549\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(IntProgress(value=0, layout=Layout(flex='2'), max=18), HTML(value='')), layout=Layout(display='…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 1.74 seconds!\n",
"Evaluation on dev at Epoch 2/2. Step:312/312: \n",
"AccuracyMetric: acc=0.649819\n",
"\n",
"\n",
"In Epoch:2/Step:312, got best dev performance:\n",
"AccuracyMetric: acc=0.649819\n",
"Reloaded the best model.\n"
]
},
{
"data": {
"text/plain": [
"{'best_eval': {'AccuracyMetric': {'acc': 0.649819}},\n",
" 'best_epoch': 2,\n",
" 'best_step': 312,\n",
" 'seconds': 109.87}"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 训练模型\n",
"from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam\n",
"trainer = Trainer(data_bundle.get_dataset('train'), model, \n",
" optimizer=Adam(model_params=model.parameters(), lr=2e-5), \n",
" loss=CrossEntropyLoss(), device=[0],\n",
" batch_size=16, dev_data=data_bundle.get_dataset('dev'), \n",
" metrics=AccuracyMetric(), n_epochs=2, print_every=1)\n",
"trainer.train()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

+ 260
- 0
tutorials/extend_1_bert_embedding.ipynb

@@ -0,0 +1,260 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# BertEmbedding的各种用法\n",
"Bert自从在 BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding 中被提出后,因其性能卓越受到了极大的关注,在这里我们展示一下在fastNLP中如何使用Bert进行各类任务。其中中文Bert我们使用的模型的权重来自于 中文Bert预训练 。\n",
"\n",
"为了方便大家的使用,fastNLP提供了预训练的Embedding权重及数据集的自动下载,支持自动下载的Embedding和数据集见 数据集 。或您可从 使用Embedding模块将文本转成向量 与 使用Loader和Pipe加载并处理数据集 了解更多相关信息\n",
"\n",
"\n",
"下面我们将介绍通过使用Bert来进行文本分类, 中文命名实体识别, 文本匹配, 中文问答。\n",
"\n",
"## 1. 使用Bert进行文本分类\n",
"\n",
"文本分类是指给定一段文字,判定其所属的类别。例如下面的文本情感分类\n",
"\n",
" *1, 商务大床房,房间很大,床有2M宽,整体感觉经济实惠不错!*\n",
"\n",
"这里我们使用fastNLP提供自动下载的微博分类进行测试"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP.io import WeiboSenti100kPipe\n",
"from fastNLP.embeddings import BertEmbedding\n",
"from fastNLP.models import BertForSequenceClassification\n",
"from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam\n",
"import torch\n",
"\n",
"data_bundle =WeiboSenti100kPipe().process_from_file()\n",
"data_bundle.rename_field('chars', 'words')\n",
"\n",
"# 载入BertEmbedding\n",
"embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='cn-wwm', include_cls_sep=True)\n",
"\n",
"# 载入模型\n",
"model = BertForSequenceClassification(embed, len(data_bundle.get_vocab('target')))\n",
"\n",
"# 训练模型\n",
"device = 0 if torch.cuda.is_available() else 'cpu' \n",
"trainer = Trainer(data_bundle.get_dataset('train'), model,\n",
" optimizer=Adam(model_params=model.parameters(), lr=2e-5),\n",
" loss=CrossEntropyLoss(), device=device,\n",
" batch_size=8, dev_data=data_bundle.get_dataset('dev'),\n",
" metrics=AccuracyMetric(), n_epochs=2, print_every=1)\n",
"trainer.train()\n",
"\n",
"# 测试结果\n",
"from fastNLP import Tester\n",
"\n",
"tester = Tester(data_bundle.get_dataset('test'), model, batch_size=128, metrics=AccuracyMetric())\n",
"tester.test()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. 使用Bert进行命名实体识别\n",
"\n",
"命名实体识别是给定一句话,标记出其中的实体。一般序列标注的任务都使用conll格式,conll格式是至一行中通过制表符分隔不同的内容,使用空行分隔 两句话,例如下面的例子\n",
"\n",
"```\n",
" 中 B-ORG\n",
" 共 I-ORG\n",
" 中 I-ORG\n",
" 央 I-ORG\n",
" 致 O\n",
" 中 B-ORG\n",
" 国 I-ORG\n",
" 致 I-ORG\n",
" 公 I-ORG\n",
" 党 I-ORG\n",
" 十 I-ORG\n",
" 一 I-ORG\n",
" 大 I-ORG\n",
" 的 O\n",
" 贺 O\n",
" 词 O\n",
"```\n",
"\n",
"这部分内容请参考 快速实现序列标注模型\n",
"\n",
"## 3. 使用Bert进行文本匹配\n",
"\n",
"文本匹配任务是指给定两句话判断他们的关系。比如,给定两句话判断前一句是否和后一句具有因果关系或是否是矛盾关系;或者给定两句话判断两句话是否 具有相同的意思。这里我们使用"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP.io import CNXNLIBertPipe\n",
"from fastNLP.embeddings import BertEmbedding\n",
"from fastNLP.models import BertForSentenceMatching\n",
"from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam\n",
"from fastNLP.core.optimizer import AdamW\n",
"from fastNLP.core.callback import WarmupCallback\n",
"from fastNLP import Tester\n",
"import torch\n",
"\n",
"data_bundle = CNXNLIBertPipe().process_from_file()\n",
"data_bundle.rename_field('chars', 'words')\n",
"print(data_bundle)\n",
"\n",
"# 载入BertEmbedding\n",
"embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='cn-wwm', include_cls_sep=True)\n",
"\n",
"# 载入模型\n",
"model = BertForSentenceMatching(embed, len(data_bundle.get_vocab('target')))\n",
"\n",
"# 训练模型\n",
"callbacks = [WarmupCallback(warmup=0.1, schedule='linear'), ]\n",
"device = 0 if torch.cuda.is_available() else 'cpu' \n",
"trainer = Trainer(data_bundle.get_dataset('train'), model,\n",
" optimizer=AdamW(params=model.parameters(), lr=4e-5),\n",
" loss=CrossEntropyLoss(), device=device,\n",
" batch_size=8, dev_data=data_bundle.get_dataset('dev'),\n",
" metrics=AccuracyMetric(), n_epochs=5, print_every=1,\n",
" update_every=8, callbacks=callbacks)\n",
"trainer.train()\n",
"\n",
"tester = Tester(data_bundle.get_dataset('test'), model, batch_size=8, metrics=AccuracyMetric())\n",
"tester.test()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. 使用Bert进行中文问答\n",
"\n",
"问答任务是给定一段内容,以及一个问题,需要从这段内容中找到答案。 例如:\n",
"\n",
"```\n",
"\"context\": \"锣鼓经是大陆传统器乐及戏曲里面常用的打击乐记谱方法,以中文字的声音模拟敲击乐的声音,纪录打击乐的各种不同的演奏方法。常\n",
"用的节奏型称为「锣鼓点」。而锣鼓是戏曲节奏的支柱,除了加强演员身段动作的节奏感,也作为音乐的引子和尾声,提示音乐的板式和速度,以及\n",
"作为唱腔和念白的伴奏,令诗句的韵律更加抑扬顿锉,段落分明。锣鼓的运用有约定俗成的程式,依照角色行当的身份、性格、情绪以及环境,配合\n",
"相应的锣鼓点。锣鼓亦可以模仿大自然的音响效果,如雷电、波浪等等。戏曲锣鼓所运用的敲击乐器主要分为鼓、锣、钹和板四类型:鼓类包括有单\n",
"皮鼓(板鼓)、大鼓、大堂鼓(唐鼓)、小堂鼓、怀鼓、花盆鼓等;锣类有大锣、小锣(手锣)、钲锣、筛锣、马锣、镗锣、云锣;钹类有铙钹、大\n",
"钹、小钹、水钹、齐钹、镲钹、铰子、碰钟等;打拍子用的檀板、木鱼、梆子等。因为京剧的锣鼓通常由四位乐师负责,又称为四大件,领奏的师\n",
"傅称为:「鼓佬」,其职责有如西方乐队的指挥,负责控制速度以及利用各种手势提示乐师演奏不同的锣鼓点。粤剧吸收了部份京剧的锣鼓,但以木鱼\n",
"和沙的代替了京剧的板和鼓,作为打拍子的主要乐器。以下是京剧、昆剧和粤剧锣鼓中乐器对应的口诀用字:\",\n",
"\"question\": \"锣鼓经是什么?\",\n",
"\"answers\": [\n",
" {\n",
" \"text\": \"大陆传统器乐及戏曲里面常用的打击乐记谱方法\",\n",
" \"answer_start\": 4\n",
" },\n",
" {\n",
" \"text\": \"大陆传统器乐及戏曲里面常用的打击乐记谱方法\",\n",
" \"answer_start\": 4\n",
" },\n",
" {\n",
" \"text\": \"大陆传统器乐及戏曲里面常用的打击乐记谱方法\",\n",
" \"answer_start\": 4\n",
" }\n",
"]\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"您可以通过以下的代码训练 (原文代码:[CMRC2018](https://github.com/ymcui/cmrc2018) )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP.embeddings import BertEmbedding\n",
"from fastNLP.models import BertForQuestionAnswering\n",
"from fastNLP.core.losses import CMRC2018Loss\n",
"from fastNLP.core.metrics import CMRC2018Metric\n",
"from fastNLP.io.pipe.qa import CMRC2018BertPipe\n",
"from fastNLP import Trainer, BucketSampler\n",
"from fastNLP import WarmupCallback, GradientClipCallback\n",
"from fastNLP.core.optimizer import AdamW\n",
"import torch\n",
"\n",
"data_bundle = CMRC2018BertPipe().process_from_file()\n",
"data_bundle.rename_field('chars', 'words')\n",
"\n",
"print(data_bundle)\n",
"\n",
"embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='cn', requires_grad=True, include_cls_sep=False, auto_truncate=True,\n",
" dropout=0.5, word_dropout=0.01)\n",
"model = BertForQuestionAnswering(embed)\n",
"loss = CMRC2018Loss()\n",
"metric = CMRC2018Metric()\n",
"\n",
"wm_callback = WarmupCallback(schedule='linear')\n",
"gc_callback = GradientClipCallback(clip_value=1, clip_type='norm')\n",
"callbacks = [wm_callback, gc_callback]\n",
"\n",
"optimizer = AdamW(model.parameters(), lr=5e-5)\n",
"\n",
"device = 0 if torch.cuda.is_available() else 'cpu' \n",
"trainer = Trainer(data_bundle.get_dataset('train'), model, loss=loss, optimizer=optimizer,\n",
" sampler=BucketSampler(seq_len_field_name='context_len'),\n",
" dev_data=data_bundle.get_dataset('dev'), metrics=metric,\n",
" callbacks=callbacks, device=device, batch_size=6, num_workers=2, n_epochs=2, print_every=1,\n",
" test_use_tqdm=False, update_every=10)\n",
"trainer.train(load_best_model=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"训练结果(和论文中报道的基本一致):\n",
"\n",
"```\n",
" In Epoch:2/Step:1692, got best dev performance:\n",
" CMRC2018Metric: f1=85.61, em=66.08\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python Now",
"language": "python",
"name": "now"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
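The NER section of this notebook only refers the reader to the sequence-labelling tutorial. Below is a minimal sketch of such a setup with `BertEmbedding`, assuming `WeiboNERPipe`, `BiLSTMCRF`, `SpanFPreRecMetric` and `LossInForward` as shipped with this fastNLP release; the referenced tutorial may use a different model and hyper-parameters.

```python
from fastNLP.io import WeiboNERPipe
from fastNLP.embeddings import BertEmbedding
from fastNLP.models import BiLSTMCRF
from fastNLP import Trainer, SpanFPreRecMetric, LossInForward
import torch

data_bundle = WeiboNERPipe().process_from_file()
data_bundle.rename_field('chars', 'words')  # the model and embedding expect a 'words' field

embed = BertEmbedding(data_bundle.get_vocab('words'), model_dir_or_name='cn-wwm')
model = BiLSTMCRF(embed, num_classes=len(data_bundle.get_vocab('target')))

device = 0 if torch.cuda.is_available() else 'cpu'
trainer = Trainer(data_bundle.get_dataset('train'), model,
                  loss=LossInForward(),  # the CRF layer computes the loss inside forward()
                  metrics=SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target')),
                  dev_data=data_bundle.get_dataset('dev'),
                  device=device, batch_size=8, n_epochs=2)
trainer.train()
```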

+ 292
- 0
tutorials/tutorial_1_data_preprocess.ipynb

@@ -0,0 +1,292 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# fastNLP中的DataSet"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------------------------+---------------------------------------------+---------+\n",
"| raw_words | words | seq_len |\n",
"+------------------------------+---------------------------------------------+---------+\n",
"| This is the first instance . | ['this', 'is', 'the', 'first', 'instance... | 6 |\n",
"| Second instance . | ['Second', 'instance', '.'] | 3 |\n",
"| Third instance . | ['Third', 'instance', '.'] | 3 |\n",
"+------------------------------+---------------------------------------------+---------+\n"
]
}
],
"source": [
"from fastNLP import DataSet\n",
"data = {'raw_words':[\"This is the first instance .\", \"Second instance .\", \"Third instance .\"],\n",
" 'words': [['this', 'is', 'the', 'first', 'instance', '.'], ['Second', 'instance', '.'], ['Third', 'instance', '.']],\n",
" 'seq_len': [6, 3, 3]}\n",
"dataset = DataSet(data)\n",
"# 传入的dict的每个key的value应该为具有相同长度的list\n",
"print(dataset)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## DataSet的构建"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+----------------------------+---------------------------------------------+---------+\n",
"| raw_words | words | seq_len |\n",
"+----------------------------+---------------------------------------------+---------+\n",
"| This is the first instance | ['this', 'is', 'the', 'first', 'instance... | 6 |\n",
"+----------------------------+---------------------------------------------+---------+"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import DataSet\n",
"from fastNLP import Instance\n",
"dataset = DataSet()\n",
"instance = Instance(raw_words=\"This is the first instance\",\n",
" words=['this', 'is', 'the', 'first', 'instance', '.'],\n",
" seq_len=6)\n",
"dataset.append(instance)\n",
"dataset"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+----------------------------+---------------------------------------------+---------+\n",
"| raw_words | words | seq_len |\n",
"+----------------------------+---------------------------------------------+---------+\n",
"| This is the first instance | ['this', 'is', 'the', 'first', 'instance... | 6 |\n",
"| Second instance . | ['Second', 'instance', '.'] | 3 |\n",
"+----------------------------+---------------------------------------------+---------+"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import DataSet\n",
"from fastNLP import Instance\n",
"dataset = DataSet([\n",
" Instance(raw_words=\"This is the first instance\",\n",
" words=['this', 'is', 'the', 'first', 'instance', '.'],\n",
" seq_len=6),\n",
" Instance(raw_words=\"Second instance .\",\n",
" words=['Second', 'instance', '.'],\n",
" seq_len=3)\n",
" ])\n",
"dataset"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## DataSet的删除"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+----+---+\n",
"| a | c |\n",
"+----+---+\n",
"| -5 | 0 |\n",
"| -4 | 0 |\n",
"| -3 | 0 |\n",
"| -2 | 0 |\n",
"| -1 | 0 |\n",
"| 0 | 0 |\n",
"| 1 | 0 |\n",
"| 2 | 0 |\n",
"| 3 | 0 |\n",
"| 4 | 0 |\n",
"+----+---+"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import DataSet\n",
"dataset = DataSet({'a': range(-5, 5), 'c': [0]*10})\n",
"dataset"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+---+\n",
"| c |\n",
"+---+\n",
"| 0 |\n",
"| 0 |\n",
"| 0 |\n",
"| 0 |\n",
"+---+"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 不改变dataset,生成一个删除了满足条件的instance的新 DataSet\n",
"dropped_dataset = dataset.drop(lambda ins:ins['a']<0, inplace=False)\n",
"# 在dataset中删除满足条件的instance\n",
"dataset.drop(lambda ins:ins['a']<0)\n",
"# 删除第3个instance\n",
"dataset.delete_instance(2)\n",
"# 删除名为'a'的field\n",
"dataset.delete_field('a')\n",
"dataset"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 简单的数据预处理"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"False\n"
]
},
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 检查是否存在名为'a'的field\n",
"print(dataset.has_field('a')) # 或 ('a' in dataset)\n",
"# 将名为'a'的field改名为'b'\n",
"dataset.rename_field('c', 'b')\n",
"# DataSet的长度\n",
"len(dataset)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+------------------------------+-------------------------------------------------+\n",
"| raw_words | words |\n",
"+------------------------------+-------------------------------------------------+\n",
"| This is the first instance . | ['This', 'is', 'the', 'first', 'instance', '.'] |\n",
"| Second instance . | ['Second', 'instance', '.'] |\n",
"| Third instance . | ['Third', 'instance', '.'] |\n",
"+------------------------------+-------------------------------------------------+"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import DataSet\n",
"data = {'raw_words':[\"This is the first instance .\", \"Second instance .\", \"Third instance .\"]}\n",
"dataset = DataSet(data)\n",
"\n",
"# 将句子分成单词形式, 详见DataSet.apply()方法\n",
"dataset.apply(lambda ins: ins['raw_words'].split(), new_field_name='words')\n",
"\n",
"# 或使用DataSet.apply_field()\n",
"dataset.apply_field(lambda sent:sent.split(), field_name='raw_words', new_field_name='words')\n",
"\n",
"# 除了匿名函数,也可以定义函数传递进去\n",
"def get_words(instance):\n",
" sentence = instance['raw_words']\n",
" words = sentence.split()\n",
" return words\n",
"dataset.apply(get_words, new_field_name='words')\n",
"dataset"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python Now",
"language": "python",
"name": "now"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
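The preprocessing notebook stops at `apply()`; before a `DataSet` is handed to a `Trainer`, its fields also have to be flagged as inputs and targets. A small sketch on toy data, assuming the `set_input`/`set_target` API of this release (the field names are only for illustration):

```python
from fastNLP import DataSet, Instance

dataset = DataSet([
    Instance(raw_words="This is the first instance .", target=0),
    Instance(raw_words="Second instance .", target=1),
])
dataset.apply_field(str.split, field_name='raw_words', new_field_name='words')

# fields marked as input are fed to the model's forward(); targets go to loss and metrics
# (in a real pipeline 'words' would first be indexed through a Vocabulary)
dataset.set_input('words')
dataset.set_target('target')
print(dataset)
```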

+ 343
- 0
tutorials/tutorial_2_vocabulary.ipynb

@@ -0,0 +1,343 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# fastNLP中的 Vocabulary\n",
"## 构建 Vocabulary"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(['复', '旦', '大', '学']) # 加入新的字\n",
"vocab.add_word('上海') # `上海`会作为一个整体\n",
"vocab.to_index('复') # 应该会为3\n",
"vocab.to_index('我') # 会输出1,Vocabulary中默认pad的index为0, unk(没有找到的词)的index为1\n",
"\n",
"# 在构建target的Vocabulary时,词表中应该用不上pad和unk,可以通过以下的初始化\n",
"vocab = Vocabulary(unknown=None, padding=None)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Vocabulary(['positive', 'negative']...)"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab.add_word_lst(['positive', 'negative'])"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab.to_index('positive')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 没有设置 unk 的情况"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [
{
"ename": "ValueError",
"evalue": "word `neutral` not in vocabulary",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-4-c6d424040b45>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mvocab\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'neutral'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# 会报错,因为没有unk这种情况\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/anaconda3/envs/now/lib/python3.8/site-packages/FastNLP-0.5.0-py3.8.egg/fastNLP/core/vocabulary.py\u001b[0m in \u001b[0;36mto_index\u001b[0;34m(self, w)\u001b[0m\n\u001b[1;32m 414\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;32mreturn\u001b[0m \u001b[0mint\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mnumber\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 415\u001b[0m \"\"\"\n\u001b[0;32m--> 416\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 417\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 418\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/envs/now/lib/python3.8/site-packages/FastNLP-0.5.0-py3.8.egg/fastNLP/core/vocabulary.py\u001b[0m in \u001b[0;36m_wrapper\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_word2idx\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrebuild\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 43\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuild_vocab\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 44\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 45\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 46\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0m_wrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/envs/now/lib/python3.8/site-packages/FastNLP-0.5.0-py3.8.egg/fastNLP/core/vocabulary.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, w)\u001b[0m\n\u001b[1;32m 272\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_word2idx\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munknown\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 273\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 274\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"word `{}` not in vocabulary\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 275\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 276\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0m_check_build_vocab\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: word `neutral` not in vocabulary"
]
}
],
"source": [
"vocab.to_index('neutral') # 会报错,因为没有unk这种情况"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 设置 unk 的情况"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(0, '<unk>')"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary(unknown='<unk>', padding=None)\n",
"vocab.add_word_lst(['positive', 'negative'])\n",
"vocab.to_index('neutral'), vocab.to_word(vocab.to_index('neutral'))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Vocabulary(['positive', 'negative']...)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---------------------------------------------------+--------+\n",
"| chars | target |\n",
"+---------------------------------------------------+--------+\n",
"| [4, 2, 2, 5, 6, 7, 3] | 0 |\n",
"| [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 3] | 1 |\n",
"+---------------------------------------------------+--------+\n"
]
}
],
"source": [
"from fastNLP import Vocabulary\n",
"from fastNLP import DataSet\n",
"\n",
"dataset = DataSet({'chars': [\n",
" ['今', '天', '天', '气', '很', '好', '。'],\n",
" ['被', '这', '部', '电', '影', '浪', '费', '了', '两', '个', '小', '时', '。']\n",
" ],\n",
" 'target': ['neutral', 'negative']\n",
"})\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.from_dataset(dataset, field_name='chars')\n",
"vocab.index_dataset(dataset, field_name='chars')\n",
"\n",
"target_vocab = Vocabulary(padding=None, unknown=None)\n",
"target_vocab.from_dataset(dataset, field_name='target')\n",
"target_vocab.index_dataset(dataset, field_name='target')\n",
"print(dataset)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Vocabulary(['今', '天', '心', '情', '很']...)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Vocabulary\n",
"from fastNLP import DataSet\n",
"\n",
"tr_data = DataSet({'chars': [\n",
" ['今', '天', '心', '情', '很', '好', '。'],\n",
" ['被', '这', '部', '电', '影', '浪', '费', '了', '两', '个', '小', '时', '。']\n",
" ],\n",
" 'target': ['positive', 'negative']\n",
"})\n",
"dev_data = DataSet({'chars': [\n",
" ['住', '宿', '条', '件', '还', '不', '错'],\n",
" ['糟', '糕', '的', '天', '气', ',', '无', '法', '出', '行', '。']\n",
" ],\n",
" 'target': ['positive', 'negative']\n",
"})\n",
"\n",
"vocab = Vocabulary()\n",
"# 将验证集或者测试集在建立词表是放入no_create_entry_dataset这个参数中。\n",
"vocab.from_dataset(tr_data, field_name='chars', no_create_entry_dataset=[dev_data])\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 4%|▎ | 2.31M/63.5M [00:00<00:02, 22.9MB/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"http://212.129.155.247/embedding/glove.6B.50d.zip not found in cache, downloading to /tmp/tmpvziobj_e\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 63.5M/63.5M [00:01<00:00, 41.3MB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Finish download from http://212.129.155.247/embedding/glove.6B.50d.zip\n",
"Copy file to /remote-home/ynzheng/.fastNLP/embedding/glove.6B.50d\n",
"Found 2 out of 6 words in the pre-training embedding.\n",
"tensor([[ 0.9497, 0.3433, 0.8450, -0.8852, -0.7208, -0.2931, -0.7468, 0.6512,\n",
" 0.4730, -0.7401, 0.1877, -0.3828, -0.5590, 0.4295, -0.2698, -0.4238,\n",
" -0.3124, 1.3423, -0.7857, -0.6302, 0.9182, 0.2113, -0.5744, 1.4549,\n",
" 0.7546, -1.6165, -0.0085, 0.0029, 0.5130, -0.4745, 2.5306, 0.8594,\n",
" -0.3067, 0.0578, 0.6623, 0.2080, 0.6424, -0.5246, -0.0534, 1.1404,\n",
" -0.1370, -0.1836, 0.4546, -0.5096, -0.0255, -0.0286, 0.1805, -0.4483,\n",
" 0.4053, -0.3682]], grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.1320, -0.2392, 0.1732, -0.2390, -0.0463, 0.0494, 0.0488, -0.0886,\n",
" 0.0224, -0.1300, 0.0369, 0.1800, 0.0750, -0.0183, 0.2264, 0.1628,\n",
" 0.1261, -0.1259, 0.1663, -0.1230, -0.1904, -0.0532, 0.1397, -0.0259,\n",
" -0.1799, 0.0226, 0.1858, 0.1981, 0.1338, 0.2394, 0.0248, 0.0203,\n",
" -0.1722, -0.1683, -0.1892, 0.0874, 0.0562, -0.0394, 0.0306, -0.1761,\n",
" 0.1015, -0.0171, 0.1172, 0.1357, 0.1519, -0.0011, 0.1572, 0.1265,\n",
" -0.2391, -0.0258]], grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.1318, -0.2552, -0.0679, 0.2619, -0.2616, 0.2357, 0.1308, -0.0118,\n",
" 1.7659, 0.2078, 0.2620, -0.1643, -0.8464, 0.0201, 0.0702, 0.3978,\n",
" 0.1528, -0.2021, -1.6184, -0.5433, -0.1786, 0.5389, 0.4987, -0.1017,\n",
" 0.6626, -1.7051, 0.0572, -0.3241, -0.6683, 0.2665, 2.8420, 0.2684,\n",
" -0.5954, -0.5004, 1.5199, 0.0396, 1.6659, 0.9976, -0.5597, -0.7049,\n",
" -0.0309, -0.2830, -0.1356, 0.6429, 0.4149, 1.2362, 0.7659, 0.9780,\n",
" 0.5851, -0.3018]], grad_fn=<EmbeddingBackward>)\n",
"tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0.]], grad_fn=<EmbeddingBackward>)\n",
"tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0.]], grad_fn=<EmbeddingBackward>)\n"
]
}
],
"source": [
"import torch\n",
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word('train')\n",
"vocab.add_word('only_in_train') # 仅在train出现,但肯定在预训练词表中不存在\n",
"vocab.add_word('test', no_create_entry=True) # 该词只在dev或test中出现\n",
"vocab.add_word('only_in_test', no_create_entry=True) # 这个词在预训练的词表中找不到\n",
"\n",
"embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')\n",
"print(embed(torch.LongTensor([vocab.to_index('train')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('only_in_train')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('test')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('only_in_test')])))\n",
"print(embed(torch.LongTensor([vocab.unknown_idx])))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python Now",
"language": "python",
"name": "now"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
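The last cell above builds the vocabulary with `no_create_entry_dataset` but never writes indices back into the two `DataSet`s; a short sketch of that follow-up step, assuming `index_dataset` accepts several datasets at once as in the earlier cells:

```python
from fastNLP import DataSet, Vocabulary

tr_data = DataSet({'chars': [['今', '天', '很', '好'], ['天', '气', '糟', '糕']],
                   'target': ['positive', 'negative']})
dev_data = DataSet({'chars': [['还', '不', '错']], 'target': ['positive']})

vocab = Vocabulary()
vocab.from_dataset(tr_data, field_name='chars', no_create_entry_dataset=[dev_data])
vocab.index_dataset(tr_data, dev_data, field_name='chars')  # indices are written back in place

target_vocab = Vocabulary(padding=None, unknown=None)
target_vocab.from_dataset(tr_data, field_name='target')
target_vocab.index_dataset(tr_data, dev_data, field_name='target')
print(tr_data)
```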

+ 524
- 0
tutorials/tutorial_3_embedding.ipynb

@@ -0,0 +1,524 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 5 out of 7 words in the pre-training embedding.\n",
"torch.Size([1, 5, 50])\n"
]
}
],
"source": [
"import torch\n",
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')\n",
"\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]]) # 将文本转为index\n",
"print(embed(words).size()) # StaticEmbedding的使用和pytorch的nn.Embedding是类似的"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([1, 5, 30])\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=30)\n",
"\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"22 out of 22 characters were found in pretrained elmo embedding.\n",
"torch.Size([1, 5, 256])\n"
]
}
],
"source": [
"from fastNLP.embeddings import ElmoEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=False)\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"22 out of 22 characters were found in pretrained elmo embedding.\n",
"torch.Size([1, 5, 512])\n"
]
}
],
"source": [
"embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=False, layers='1,2')\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"22 out of 22 characters were found in pretrained elmo embedding.\n",
"torch.Size([1, 5, 256])\n"
]
}
],
"source": [
"embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=True, layers='mix')\n",
"print(embed(words).size()) # 三层输出按照权重element-wise的加起来"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"torch.Size([1, 5, 768])\n"
]
}
],
"source": [
"from fastNLP.embeddings import BertEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased')\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"torch.Size([1, 5, 1536])\n"
]
}
],
"source": [
"# 使用后面两层的输出\n",
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='10,11')\n",
"print(embed(words).size()) # 结果将是在最后一维做拼接"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"torch.Size([1, 7, 768])\n"
]
}
],
"source": [
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', include_cls_sep=True)\n",
"print(embed(words).size()) # 结果将在序列维度上增加2\n",
"# 取出句子的cls表示\n",
"cls_reps = embed(words)[:, 0] # shape: [batch_size, 768]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"torch.Size([1, 5, 768])\n"
]
}
],
"source": [
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', pool_method='max')\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 10 words out of 10.\n",
"torch.Size([1, 9, 768])\n"
]
}
],
"source": [
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo . [SEP] another sentence .\".split())\n",
"\n",
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', pool_method='max')\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo . [SEP] another sentence .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Start constructing character vocabulary.\n",
"In total, there are 8 distinct characters.\n",
"torch.Size([1, 5, 64])\n"
]
}
],
"source": [
"from fastNLP.embeddings import CNNCharEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"# character的embedding维度大小为50,返回的embedding结果维度大小为64。\n",
"embed = CNNCharEmbedding(vocab, embed_size=64, char_emb_size=50)\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Start constructing character vocabulary.\n",
"In total, there are 8 distinct characters.\n",
"torch.Size([1, 5, 64])\n"
]
}
],
"source": [
"from fastNLP.embeddings import LSTMCharEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"# character的embedding维度大小为50,返回的embedding结果维度大小为64。\n",
"embed = LSTMCharEmbedding(vocab, embed_size=64, char_emb_size=50)\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 5 out of 7 words in the pre-training embedding.\n",
"50\n",
"Start constructing character vocabulary.\n",
"In total, there are 8 distinct characters.\n",
"30\n",
"22 out of 22 characters were found in pretrained elmo embedding.\n",
"256\n",
"22 out of 22 characters were found in pretrained elmo embedding.\n",
"512\n",
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"768\n",
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"1536\n",
"80\n"
]
}
],
"source": [
"from fastNLP.embeddings import *\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"static_embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')\n",
"print(static_embed.embedding_dim) # 50\n",
"char_embed = CNNCharEmbedding(vocab, embed_size=30)\n",
"print(char_embed.embedding_dim) # 30\n",
"elmo_embed_1 = ElmoEmbedding(vocab, model_dir_or_name='en-small', layers='2')\n",
"print(elmo_embed_1.embedding_dim) # 256\n",
"elmo_embed_2 = ElmoEmbedding(vocab, model_dir_or_name='en-small', layers='1,2')\n",
"print(elmo_embed_2.embedding_dim) # 512\n",
"bert_embed_1 = BertEmbedding(vocab, layers='-1', model_dir_or_name='en-base-cased')\n",
"print(bert_embed_1.embedding_dim) # 768\n",
"bert_embed_2 = BertEmbedding(vocab, layers='2,-1', model_dir_or_name='en-base-cased')\n",
"print(bert_embed_2.embedding_dim) # 1536\n",
"stack_embed = StackEmbedding([static_embed, char_embed])\n",
"print(stack_embed.embedding_dim) # 80"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n"
]
}
],
"source": [
"from fastNLP.embeddings import *\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', requires_grad=True) # 初始化时设定为需要更新\n",
"embed.requires_grad = False # 修改BertEmbedding的权重为不更新"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([[ 0.3633, -0.2091, -0.0353, -0.3771, -0.5193]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.0926, -0.4812, -0.7744, 0.4836, -0.5475]],\n",
" grad_fn=<EmbeddingBackward>)\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary().add_word_lst(\"The the a A\".split())\n",
"# 下面用随机的StaticEmbedding演示,但与使用预训练词向量时效果是一致的\n",
"embed = StaticEmbedding(vocab, model_name_or_dir=None, embedding_dim=5)\n",
"print(embed(torch.LongTensor([vocab.to_index('The')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('the')])))"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"All word in the vocab have been lowered. There are 6 words, 4 unique lowered words.\n",
"tensor([[ 0.4530, -0.1558, -0.1941, 0.3203, 0.0355]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.4530, -0.1558, -0.1941, 0.3203, 0.0355]],\n",
" grad_fn=<EmbeddingBackward>)\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary().add_word_lst(\"The the a A\".split())\n",
"# 下面用随机的StaticEmbedding演示,但与使用预训练时效果是一致的\n",
"embed = StaticEmbedding(vocab, model_name_or_dir=None, embedding_dim=5, lower=True)\n",
"print(embed(torch.LongTensor([vocab.to_index('The')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('the')])))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 out of 4 words have frequency less than 2.\n",
"tensor([[ 0.4724, -0.7277, -0.6350, -0.5258, -0.6063]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.7638, -0.0552, 0.1625, -0.2210, 0.4993]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.7638, -0.0552, 0.1625, -0.2210, 0.4993]],\n",
" grad_fn=<EmbeddingBackward>)\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary().add_word_lst(\"the the the a\".split())\n",
"# 下面用随机的StaticEmbedding演示,但与使用预训练时效果是一致的\n",
"embed = StaticEmbedding(vocab, model_name_or_dir=None, embedding_dim=5, min_freq=2)\n",
"print(embed(torch.LongTensor([vocab.to_index('the')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('a')])))\n",
"print(embed(torch.LongTensor([vocab.unknown_idx])))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 out of 5 words have frequency less than 2.\n",
"All word in the vocab have been lowered. There are 5 words, 4 unique lowered words.\n",
"tensor([[ 0.1943, 0.3739, 0.2769, -0.4746, -0.3181]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.5892, -0.6916, 0.7319, -0.3803, 0.4979]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.5892, -0.6916, 0.7319, -0.3803, 0.4979]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[-0.1348, -0.2172, -0.0071, 0.5704, -0.2607]],\n",
" grad_fn=<EmbeddingBackward>)\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary().add_word_lst(\"the the the a A\".split())\n",
"# 下面用随机的StaticEmbedding演示,但与使用预训练时效果是一致的\n",
"embed = StaticEmbedding(vocab, model_name_or_dir=None, embedding_dim=5, min_freq=2, lower=True)\n",
"print(embed(torch.LongTensor([vocab.to_index('the')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('a')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('A')])))\n",
"print(embed(torch.LongTensor([vocab.unknown_idx])))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python Now",
"language": "python",
"name": "now"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
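Every cell above uses an embedding object on its own; as context, here is a minimal sketch of plugging such an object into a model, assuming `CNNText` from `fastNLP.models` takes the embedding as its first argument:

```python
import torch
from fastNLP import Vocabulary
from fastNLP.embeddings import StaticEmbedding
from fastNLP.models import CNNText

vocab = Vocabulary()
vocab.add_word_lst("this is a demo .".split())

# any embedding from this notebook (Static/Elmo/Bert/Stack...) could be dropped in here
embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=50)
model = CNNText(embed, num_classes=2)

words = torch.LongTensor([[vocab.to_index(w) for w in "this is a demo .".split()]])
print(model(words))  # a dict holding the class scores for the batch
```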

+ 309
- 0
tutorials/tutorial_4_load_dataset.ipynb

@@ -0,0 +1,309 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 使用Loader和Pipe加载并处理数据集\n",
"\n",
"这一部分是关于如何加载数据集的教程\n",
"\n",
"## Part I: 数据集容器DataBundle\n",
"\n",
"而由于对于同一个任务,训练集,验证集和测试集会共用同一个词表以及具有相同的目标值,所以在fastNLP中我们使用了 DataBundle 来承载同一个任务的多个数据集 DataSet 以及它们的词表 Vocabulary 。下面会有例子介绍 DataBundle 的相关使用。\n",
"\n",
"DataBundle 在fastNLP中主要在各个 Loader 和 Pipe 中被使用。 下面我们先介绍一下 Loader 和 Pipe 。\n",
"\n",
"## Part II: 加载的各种数据集的Loader\n",
"\n",
"在fastNLP中,所有的 Loader 都可以通过其文档判断其支持读取的数据格式,以及读取之后返回的 DataSet 的格式, 例如 ChnSentiCorpLoader \n",
"\n",
"- download() 函数:自动将该数据集下载到缓存地址,默认缓存地址为~/.fastNLP/datasets/。由于版权等原因,不是所有的Loader都实现了该方法。该方法会返回下载后文件所处的缓存地址。\n",
"\n",
"- _load() 函数:从一个数据文件中读取数据,返回一个 DataSet 。返回的DataSet的格式可从Loader文档判断。\n",
"\n",
"- load() 函数:从文件或者文件夹中读取数据为 DataSet 并将它们组装成 DataBundle。支持接受的参数类型有以下的几种\n",
"\n",
" - None, 将尝试读取自动缓存的数据,仅支持提供了自动下载数据的Loader\n",
" - 文件夹路径, 默认将尝试在该文件夹下匹配文件名中含有 train , test , dev 的文件,如果有多个文件含有相同的关键字,将无法通过该方式读取\n",
" - dict, 例如{'train':\"/path/to/tr.conll\", 'dev':\"/to/validate.conll\", \"test\":\"/to/te.conll\"}。"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"In total 3 datasets:\n",
"\ttest has 1944 instances.\n",
"\ttrain has 17196 instances.\n",
"\tdev has 1858 instances.\n",
"\n"
]
}
],
"source": [
"from fastNLP.io import CWSLoader\n",
"\n",
"loader = CWSLoader(dataset_name='pku')\n",
"data_bundle = loader.load()\n",
"print(data_bundle)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"这里表示一共有3个数据集。其中:\n",
"\n",
" 3个数据集的名称分别为train、dev、test,分别有17223、1831、1944个instance\n",
"\n",
"也可以取出DataSet,并打印DataSet中的具体内容"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+----------------------------------------------------------------+\n",
"| raw_words |\n",
"+----------------------------------------------------------------+\n",
"| 迈向 充满 希望 的 新 世纪 —— 一九九八年 新年 讲话 ... |\n",
"| 中共中央 总书记 、 国家 主席 江 泽民 |\n",
"+----------------------------------------------------------------+\n"
]
}
],
"source": [
"tr_data = data_bundle.get_dataset('train')\n",
"print(tr_data[:2])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Part III: 使用Pipe对数据集进行预处理\n",
"\n",
"通过 Loader 可以将文本数据读入,但并不能直接被神经网络使用,还需要进行一定的预处理。\n",
"\n",
"在fastNLP中,我们使用 Pipe 的子类作为数据预处理的类, Loader 和 Pipe 一般具备一一对应的关系,该关系可以从其名称判断, 例如 CWSLoader 与 CWSPipe 是一一对应的。一般情况下Pipe处理包含以下的几个过程,\n",
"1. 将raw_words或 raw_chars进行tokenize以切分成不同的词或字; \n",
"2. 再建立词或字的 Vocabulary , 并将词或字转换为index; \n",
"3. 将target 列建立词表并将target列转为index;\n",
"\n",
"所有的Pipe都可通过其文档查看该Pipe支持处理的 DataSet 以及返回的 DataBundle 中的Vocabulary的情况; 如 OntoNotesNERPipe\n",
"\n",
"各种数据集的Pipe当中,都包含了以下的两个函数:\n",
"\n",
"- process() 函数:对输入的 DataBundle 进行处理, 然后返回处理之后的 DataBundle 。process函数的文档中包含了该Pipe支持处理的DataSet的格式。\n",
"- process_from_file() 函数:输入数据集所在文件夹,使用对应的Loader读取数据(所以该函数支持的参数类型是由于其对应的Loader的load函数决定的),然后调用相对应的process函数对数据进行预处理。相当于是把Load和process放在一个函数中执行。\n",
"\n",
"接着上面 CWSLoader 的例子,我们展示一下 CWSPipe 的功能:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"In total 3 datasets:\n",
"\ttest has 1944 instances.\n",
"\ttrain has 17196 instances.\n",
"\tdev has 1858 instances.\n",
"In total 2 vocabs:\n",
"\tchars has 4777 entries.\n",
"\ttarget has 4 entries.\n",
"\n"
]
}
],
"source": [
"from fastNLP.io import CWSPipe\n",
"\n",
"data_bundle = CWSPipe().process(data_bundle)\n",
"print(data_bundle)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"表示一共有3个数据集和2个词表。其中:\n",
"\n",
"- 3个数据集的名称分别为train、dev、test,分别有17223、1831、1944个instance\n",
"- 2个词表分别为chars词表与target词表。其中chars词表为句子文本所构建的词表,一共有4777个不同的字;target词表为目标标签所构建的词表,一共有4种标签。\n",
"\n",
"相较于之前CWSLoader读取的DataBundle,新增了两个Vocabulary。 我们可以打印一下处理之后的DataSet"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---------------------+---------------------+---------------------+---------+\n",
"| raw_words | chars | target | seq_len |\n",
"+---------------------+---------------------+---------------------+---------+\n",
"| 迈向 充满 希望... | [1224, 178, 674,... | [0, 1, 0, 1, 0, ... | 29 |\n",
"| 中共中央 总书记... | [11, 212, 11, 33... | [0, 3, 3, 1, 0, ... | 15 |\n",
"+---------------------+---------------------+---------------------+---------+\n"
]
}
],
"source": [
"tr_data = data_bundle.get_dataset('train')\n",
"print(tr_data[:2])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"可以看到有两列为int的field: chars和target。这两列的名称同时也是DataBundle中的Vocabulary的名称。可以通过下列的代码获取并查看Vocabulary的 信息"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Vocabulary(['B', 'E', 'S', 'M']...)\n"
]
}
],
"source": [
"vocab = data_bundle.get_vocab('target')\n",
"print(vocab)"
]
},
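{
"cell_type": "markdown",
"metadata": {},
"source": [
"上面是先用 Loader 读入、再用 Pipe 处理的两步写法;也可以直接调用 Pipe 的 process_from_file 一步完成加载与预处理。下面是一个用法示意(文件夹路径仅为占位;若对应的 Loader 支持自动下载,也可以不传路径):\n",
"\n",
"```python\n",
"from fastNLP.io import CWSPipe\n",
"\n",
"# 路径仅为示意,需替换为实际的数据文件夹\n",
"data_bundle = CWSPipe().process_from_file('/path/to/cws_data')\n",
"```"
]
},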
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Part IV: fastNLP封装好的Loader和Pipe\n",
"\n",
"fastNLP封装了多种任务/数据集的 Loader 和 Pipe 并提供自动下载功能,具体参见文档 [数据集](https://docs.qq.com/sheet/DVnpkTnF6VW9UeXdh?c=A1A0A0)\n",
"\n",
"## Part V: 不同格式类型的基础Loader\n",
"\n",
"除了上面提到的针对具体任务的Loader,我们还提供了CSV格式和JSON格式的Loader\n",
"\n",
"**CSVLoader** 读取CSV类型的数据集文件。例子如下:\n",
"\n",
"```python\n",
"from fastNLP.io.loader import CSVLoader\n",
"data_set_loader = CSVLoader(\n",
" headers=('raw_words', 'target'), sep='\\t'\n",
")\n",
"```\n",
"\n",
"表示将CSV文件中每一行的第一项将填入'raw_words' field,第二项填入'target' field。其中项之间由'\\t'分割开来\n",
"\n",
"```python\n",
"data_set = data_set_loader._load('path/to/your/file')\n",
"```\n",
"\n",
"文件内容样例如下\n",
"\n",
"```csv\n",
"But it does not leave you with much . 1\n",
"You could hate it for the same reason . 1\n",
"The performances are an absolute joy . 4\n",
"```\n",
"\n",
"读取之后的DataSet具有以下的field\n",
"\n",
"| raw_words | target |\n",
"| --------------------------------------- | ------ |\n",
"| But it does not leave you with much . | 1 |\n",
"| You could hate it for the same reason . | 1 |\n",
"| The performances are an absolute joy . | 4 |\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**JsonLoader** 读取Json类型的数据集文件,数据必须按行存储,每行是一个包含各类属性的Json对象。例子如下\n",
"\n",
"```python\n",
"from fastNLP.io.loader import JsonLoader\n",
"loader = JsonLoader(\n",
" fields={'sentence1': 'raw_words1', 'sentence2': 'raw_words2', 'gold_label': 'target'}\n",
")\n",
"```\n",
"\n",
"表示将Json对象中'sentence1'、'sentence2'和'gold_label'对应的值赋给'raw_words1'、'raw_words2'、'target'这三个fields\n",
"\n",
"```python\n",
"data_set = loader._load('path/to/your/file')\n",
"```\n",
"\n",
"数据集内容样例如下\n",
"```\n",
"{\"annotator_labels\": [\"neutral\"], \"captionID\": \"3416050480.jpg#4\", \"gold_label\": \"neutral\", ... }\n",
"{\"annotator_labels\": [\"contradiction\"], \"captionID\": \"3416050480.jpg#4\", \"gold_label\": \"contradiction\", ... }\n",
"{\"annotator_labels\": [\"entailment\"], \"captionID\": \"3416050480.jpg#4\", \"gold_label\": \"entailment\", ... }\n",
"```\n",
"\n",
"读取之后的DataSet具有以下的field\n",
"\n",
"| raw_words0 | raw_words1 | target |\n",
"| ------------------------------------------------------ | ------------------------------------------------- | ------------- |\n",
"| A person on a horse jumps over a broken down airplane. | A person is training his horse for a competition. | neutral |\n",
"| A person on a horse jumps over a broken down airplane. | A person is at a diner, ordering an omelette. | contradiction |\n",
"| A person on a horse jumps over a broken down airplane. | A person is outdoors, on a horse. | entailment |"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python Now",
"language": "python",
"name": "now"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

+ 603
- 0
tutorials/tutorial_5_loss_optimizer.ipynb View File

@@ -0,0 +1,603 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 使用Trainer和Tester快速训练和测试"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 数据读入和处理"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/remote-home/ynzheng/anaconda3/envs/now/lib/python3.8/site-packages/FastNLP-0.5.0-py3.8.egg/fastNLP/io/loader/classification.py:340: UserWarning: SST2's test file has no target.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"In total 3 datasets:\n",
"\ttest has 1821 instances.\n",
"\ttrain has 67349 instances.\n",
"\tdev has 872 instances.\n",
"In total 2 vocabs:\n",
"\twords has 16292 entries.\n",
"\ttarget has 2 entries.\n",
"\n",
"+-----------------------------------+--------+-----------------------------------+---------+\n",
"| raw_words | target | words | seq_len |\n",
"+-----------------------------------+--------+-----------------------------------+---------+\n",
"| hide new secretions from the p... | 1 | [4110, 97, 12009, 39, 2, 6843,... | 7 |\n",
"+-----------------------------------+--------+-----------------------------------+---------+\n",
"Vocabulary(['hide', 'new', 'secretions', 'from', 'the']...)\n"
]
}
],
"source": [
"from fastNLP.io import SST2Pipe\n",
"\n",
"pipe = SST2Pipe()\n",
"databundle = pipe.process_from_file()\n",
"vocab = databundle.get_vocab('words')\n",
"print(databundle)\n",
"print(databundle.get_dataset('train')[0])\n",
"print(databundle.get_vocab('words'))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4925 872 75\n"
]
}
],
"source": [
"train_data = databundle.get_dataset('train')[:5000]\n",
"train_data, test_data = train_data.split(0.015)\n",
"dev_data = databundle.get_dataset('dev')\n",
"print(len(train_data),len(dev_data),len(test_data))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------------+-----------+--------+-------+---------+\n",
"| field_names | raw_words | target | words | seq_len |\n",
"+-------------+-----------+--------+-------+---------+\n",
"| is_input | False | False | True | True |\n",
"| is_target | False | True | False | False |\n",
"| ignore_type | | False | False | False |\n",
"| pad_value | | 0 | 0 | 0 |\n",
"+-------------+-----------+--------+-------+---------+\n"
]
},
{
"data": {
"text/plain": [
"<prettytable.PrettyTable at 0x7f49ec540160>"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_data.print_field_meta()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 使用内置模型训练"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP.models import CNNText\n",
"\n",
"#词嵌入的维度\n",
"EMBED_DIM = 100\n",
"\n",
"#使用CNNText的时候第一个参数输入一个tuple,作为模型定义embedding的参数\n",
"#还可以传入 kernel_nums, kernel_sizes, padding, dropout的自定义值\n",
"model_cnn = CNNText((len(vocab),EMBED_DIM), num_classes=2, dropout=0.1)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP import AccuracyMetric\n",
"from fastNLP import Const\n",
"\n",
"# metrics=AccuracyMetric() 在本例中与下面这行代码等价\n",
"metrics=AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP import CrossEntropyLoss\n",
"\n",
"# loss = CrossEntropyLoss() 在本例中与下面这行代码等价\n",
"loss = CrossEntropyLoss(pred=Const.OUTPUT, target=Const.TARGET)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# 这表示构建了一个损失函数类,由func计算损失函数,其中将从模型返回值或者DataSet的target=True的field\n",
"# 当中找到一个参数名为`pred`的参数传入func一个参数名为`input`的参数;找到一个参数名为`label`的参数\n",
"# 传入func作为一个名为`target`的参数\n",
"#下面自己构建了一个交叉熵函数,和之后直接使用fastNLP中的交叉熵函数是一个效果\n",
"import torch\n",
"from fastNLP import LossFunc\n",
"func = torch.nn.functional.cross_entropy\n",
"loss_func = LossFunc(func, input=Const.OUTPUT, target=Const.TARGET)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"import torch.optim as optim\n",
"\n",
"#使用 torch.optim 定义优化器\n",
"optimizer=optim.RMSprop(model_cnn.parameters(), lr=0.01, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"input fields after batch(if batch size is 2):\n",
"\twords: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 4]) \n",
"\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"target fields after batch(if batch size is 2):\n",
"\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"\n",
"training epochs started 2020-02-27-11-31-25\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=3080.0), HTML(value='')), layout=Layout(d…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.75 seconds!\n",
"\r",
"Evaluation on dev at Epoch 1/10. Step:308/3080: \n",
"\r",
"AccuracyMetric: acc=0.751147\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.83 seconds!\n",
"\r",
"Evaluation on dev at Epoch 2/10. Step:616/3080: \n",
"\r",
"AccuracyMetric: acc=0.755734\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 1.32 seconds!\n",
"\r",
"Evaluation on dev at Epoch 3/10. Step:924/3080: \n",
"\r",
"AccuracyMetric: acc=0.758028\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.88 seconds!\n",
"\r",
"Evaluation on dev at Epoch 4/10. Step:1232/3080: \n",
"\r",
"AccuracyMetric: acc=0.741972\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.96 seconds!\n",
"\r",
"Evaluation on dev at Epoch 5/10. Step:1540/3080: \n",
"\r",
"AccuracyMetric: acc=0.728211\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.87 seconds!\n",
"\r",
"Evaluation on dev at Epoch 6/10. Step:1848/3080: \n",
"\r",
"AccuracyMetric: acc=0.755734\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 1.04 seconds!\n",
"\r",
"Evaluation on dev at Epoch 7/10. Step:2156/3080: \n",
"\r",
"AccuracyMetric: acc=0.732798\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.57 seconds!\n",
"\r",
"Evaluation on dev at Epoch 8/10. Step:2464/3080: \n",
"\r",
"AccuracyMetric: acc=0.747706\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.48 seconds!\n",
"\r",
"Evaluation on dev at Epoch 9/10. Step:2772/3080: \n",
"\r",
"AccuracyMetric: acc=0.732798\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.48 seconds!\n",
"\r",
"Evaluation on dev at Epoch 10/10. Step:3080/3080: \n",
"\r",
"AccuracyMetric: acc=0.740826\n",
"\n",
"\r\n",
"In Epoch:3/Step:924, got best dev performance:\n",
"AccuracyMetric: acc=0.758028\n",
"Reloaded the best model.\n"
]
},
{
"data": {
"text/plain": [
"{'best_eval': {'AccuracyMetric': {'acc': 0.758028}},\n",
" 'best_epoch': 3,\n",
" 'best_step': 924,\n",
" 'seconds': 160.58}"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Trainer\n",
"\n",
"#训练的轮数和batch size\n",
"N_EPOCHS = 10\n",
"BATCH_SIZE = 16\n",
"\n",
"#如果在定义trainer的时候没有传入optimizer参数,模型默认的优化器为torch.optim.Adam且learning rate为lr=4e-3\n",
"#这里只使用了loss作为损失函数输入,感兴趣可以尝试其他损失函数(如之前自定义的loss_func)作为输入\n",
"trainer = Trainer(model=model_cnn, train_data=train_data, dev_data=dev_data, loss=loss, metrics=metrics,\n",
"optimizer=optimizer,n_epochs=N_EPOCHS, batch_size=BATCH_SIZE)\n",
"trainer.train()"
]
},
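{
"cell_type": "markdown",
"metadata": {},
"source": [
"如果想使用前面自定义的 loss_func,只需在构造 Trainer 时把 loss 参数替换掉即可,其余参数保持不变(仅作示意,效果与使用 CrossEntropyLoss 相同):\n",
"\n",
"```python\n",
"trainer = Trainer(model=model_cnn, train_data=train_data, dev_data=dev_data,\n",
"                  loss=loss_func, metrics=metrics,\n",
"                  optimizer=optimizer, n_epochs=N_EPOCHS, batch_size=BATCH_SIZE)\n",
"```"
]
},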
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=5.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.43 seconds!\n",
"[tester] \n",
"AccuracyMetric: acc=0.773333\n"
]
},
{
"data": {
"text/plain": [
"{'AccuracyMetric': {'acc': 0.773333}}"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Tester\n",
"\n",
"tester = Tester(test_data, model_cnn, metrics=AccuracyMetric())\n",
"tester.test()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python Now",
"language": "python",
"name": "now"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

+ 681
- 0
tutorials/tutorial_6_datasetiter.ipynb View File

@@ -0,0 +1,681 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 使用Trainer和Tester快速训练和测试"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 数据读入和处理"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/remote-home/ynzheng/anaconda3/envs/now/lib/python3.8/site-packages/FastNLP-0.5.0-py3.8.egg/fastNLP/io/loader/classification.py:340: UserWarning: SST2's test file has no target.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"In total 3 datasets:\n",
"\ttest has 1821 instances.\n",
"\ttrain has 67349 instances.\n",
"\tdev has 872 instances.\n",
"In total 2 vocabs:\n",
"\twords has 16292 entries.\n",
"\ttarget has 2 entries.\n",
"\n",
"+-----------------------------------+--------+-----------------------------------+---------+\n",
"| raw_words | target | words | seq_len |\n",
"+-----------------------------------+--------+-----------------------------------+---------+\n",
"| hide new secretions from the p... | 1 | [4110, 97, 12009, 39, 2, 6843,... | 7 |\n",
"+-----------------------------------+--------+-----------------------------------+---------+\n",
"Vocabulary(['hide', 'new', 'secretions', 'from', 'the']...)\n"
]
}
],
"source": [
"from fastNLP.io import SST2Pipe\n",
"\n",
"pipe = SST2Pipe()\n",
"databundle = pipe.process_from_file()\n",
"vocab = databundle.get_vocab('words')\n",
"print(databundle)\n",
"print(databundle.get_dataset('train')[0])\n",
"print(databundle.get_vocab('words'))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4925 872 75\n"
]
}
],
"source": [
"train_data = databundle.get_dataset('train')[:5000]\n",
"train_data, test_data = train_data.split(0.015)\n",
"dev_data = databundle.get_dataset('dev')\n",
"print(len(train_data),len(dev_data),len(test_data))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------------+-----------+--------+-------+---------+\n",
"| field_names | raw_words | target | words | seq_len |\n",
"+-------------+-----------+--------+-------+---------+\n",
"| is_input | False | False | True | True |\n",
"| is_target | False | True | False | False |\n",
"| ignore_type | | False | False | False |\n",
"| pad_value | | 0 | 0 | 0 |\n",
"+-------------+-----------+--------+-------+---------+\n"
]
},
{
"data": {
"text/plain": [
"<prettytable.PrettyTable at 0x7f0db03d0640>"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_data.print_field_meta()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP import AccuracyMetric\n",
"from fastNLP import Const\n",
"\n",
"# metrics=AccuracyMetric() 在本例中与下面这行代码等价\n",
"metrics=AccuracyMetric(pred=Const.OUTPUT, target=Const.TARGET)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## DataSetIter初探"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"batch_x: {'words': tensor([[ 13, 830, 7746, 174, 3, 47, 6, 83, 5752, 15,\n",
" 2177, 15, 63, 57, 406, 84, 1009, 4973, 27, 17,\n",
" 13785, 3, 533, 3687, 15623, 39, 375, 8, 15624, 8,\n",
" 1323, 4398, 7],\n",
" [ 1045, 11113, 16, 104, 5, 4, 176, 1824, 1704, 3,\n",
" 2, 18, 11, 4, 1018, 432, 143, 33, 245, 308,\n",
" 7, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0]]), 'seq_len': tensor([33, 21])}\n",
"batch_y: {'target': tensor([1, 0])}\n",
"batch_x: {'words': tensor([[ 14, 10, 4, 311, 5, 154, 1418, 609, 7],\n",
" [ 14, 10, 437, 32, 78, 3, 78, 437, 7]]), 'seq_len': tensor([9, 9])}\n",
"batch_y: {'target': tensor([0, 1])}\n",
"batch_x: {'words': tensor([[ 4, 277, 685, 18, 7],\n",
" [15618, 3204, 5, 1675, 0]]), 'seq_len': tensor([5, 4])}\n",
"batch_y: {'target': tensor([1, 1])}\n",
"batch_x: {'words': tensor([[ 2, 155, 3, 4426, 3, 239, 3, 739, 5, 1136,\n",
" 41, 43, 2427, 736, 2, 648, 10, 15620, 2285, 7],\n",
" [ 24, 95, 28, 46, 8, 336, 38, 239, 8, 2133,\n",
" 2, 18, 10, 15622, 1421, 6, 61, 5, 387, 7]]), 'seq_len': tensor([20, 20])}\n",
"batch_y: {'target': tensor([0, 0])}\n",
"batch_x: {'words': tensor([[ 879, 96, 8, 1026, 12, 8067, 11, 13623, 8, 15619,\n",
" 4, 673, 662, 15, 4, 1154, 240, 639, 417, 7],\n",
" [ 45, 752, 327, 180, 10, 15621, 16, 72, 8904, 9,\n",
" 1217, 7, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([20, 12])}\n",
"batch_y: {'target': tensor([0, 1])}\n"
]
}
],
"source": [
"from fastNLP import BucketSampler\n",
"from fastNLP import DataSetIter\n",
"\n",
"tmp_data = dev_data[:10]\n",
"# 定义一个Batch,传入DataSet,规定batch_size和去batch的规则。\n",
"# 顺序(Sequential),随机(Random),相似长度组成一个batch(Bucket)\n",
"sampler = BucketSampler(batch_size=2, seq_len_field_name='seq_len')\n",
"batch = DataSetIter(batch_size=2, dataset=tmp_data, sampler=sampler)\n",
"for batch_x, batch_y in batch:\n",
" print(\"batch_x: \",batch_x)\n",
" print(\"batch_y: \", batch_y)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"batch_x: {'words': tensor([[ 13, 830, 7746, 174, 3, 47, 6, 83, 5752, 15,\n",
" 2177, 15, 63, 57, 406, 84, 1009, 4973, 27, 17,\n",
" 13785, 3, 533, 3687, 15623, 39, 375, 8, 15624, 8,\n",
" 1323, 4398, 7],\n",
" [ 1045, 11113, 16, 104, 5, 4, 176, 1824, 1704, 3,\n",
" 2, 18, 11, 4, 1018, 432, 143, 33, 245, 308,\n",
" 7, -1, -1, -1, -1, -1, -1, -1, -1, -1,\n",
" -1, -1, -1]]), 'seq_len': tensor([33, 21])}\n",
"batch_y: {'target': tensor([1, 0])}\n",
"batch_x: {'words': tensor([[ 14, 10, 4, 311, 5, 154, 1418, 609, 7],\n",
" [ 14, 10, 437, 32, 78, 3, 78, 437, 7]]), 'seq_len': tensor([9, 9])}\n",
"batch_y: {'target': tensor([0, 1])}\n",
"batch_x: {'words': tensor([[ 2, 155, 3, 4426, 3, 239, 3, 739, 5, 1136,\n",
" 41, 43, 2427, 736, 2, 648, 10, 15620, 2285, 7],\n",
" [ 24, 95, 28, 46, 8, 336, 38, 239, 8, 2133,\n",
" 2, 18, 10, 15622, 1421, 6, 61, 5, 387, 7]]), 'seq_len': tensor([20, 20])}\n",
"batch_y: {'target': tensor([0, 0])}\n",
"batch_x: {'words': tensor([[ 4, 277, 685, 18, 7],\n",
" [15618, 3204, 5, 1675, -1]]), 'seq_len': tensor([5, 4])}\n",
"batch_y: {'target': tensor([1, 1])}\n",
"batch_x: {'words': tensor([[ 879, 96, 8, 1026, 12, 8067, 11, 13623, 8, 15619,\n",
" 4, 673, 662, 15, 4, 1154, 240, 639, 417, 7],\n",
" [ 45, 752, 327, 180, 10, 15621, 16, 72, 8904, 9,\n",
" 1217, 7, -1, -1, -1, -1, -1, -1, -1, -1]]), 'seq_len': tensor([20, 12])}\n",
"batch_y: {'target': tensor([0, 1])}\n"
]
}
],
"source": [
"tmp_data.set_pad_val('words',-1)\n",
"batch = DataSetIter(batch_size=2, dataset=tmp_data, sampler=sampler)\n",
"for batch_x, batch_y in batch:\n",
" print(\"batch_x: \",batch_x)\n",
" print(\"batch_y: \", batch_y)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"batch_x: {'words': tensor([[ 45, 752, 327, 180, 10, 15621, 16, 72, 8904, 9,\n",
" 1217, 7, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
" [ 879, 96, 8, 1026, 12, 8067, 11, 13623, 8, 15619,\n",
" 4, 673, 662, 15, 4, 1154, 240, 639, 417, 7,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([12, 20])}\n",
"batch_y: {'target': tensor([1, 0])}\n",
"batch_x: {'words': tensor([[ 13, 830, 7746, 174, 3, 47, 6, 83, 5752, 15,\n",
" 2177, 15, 63, 57, 406, 84, 1009, 4973, 27, 17,\n",
" 13785, 3, 533, 3687, 15623, 39, 375, 8, 15624, 8,\n",
" 1323, 4398, 7, 0, 0, 0, 0, 0, 0, 0],\n",
" [ 1045, 11113, 16, 104, 5, 4, 176, 1824, 1704, 3,\n",
" 2, 18, 11, 4, 1018, 432, 143, 33, 245, 308,\n",
" 7, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([33, 21])}\n",
"batch_y: {'target': tensor([1, 0])}\n",
"batch_x: {'words': tensor([[ 14, 10, 4, 311, 5, 154, 1418, 609, 7, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0],\n",
" [ 14, 10, 437, 32, 78, 3, 78, 437, 7, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0]]), 'seq_len': tensor([9, 9])}\n",
"batch_y: {'target': tensor([0, 1])}\n",
"batch_x: {'words': tensor([[ 2, 155, 3, 4426, 3, 239, 3, 739, 5, 1136,\n",
" 41, 43, 2427, 736, 2, 648, 10, 15620, 2285, 7,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
" [ 24, 95, 28, 46, 8, 336, 38, 239, 8, 2133,\n",
" 2, 18, 10, 15622, 1421, 6, 61, 5, 387, 7,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([20, 20])}\n",
"batch_y: {'target': tensor([0, 0])}\n",
"batch_x: {'words': tensor([[ 4, 277, 685, 18, 7, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
" [15618, 3204, 5, 1675, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'seq_len': tensor([5, 4])}\n",
"batch_y: {'target': tensor([1, 1])}\n"
]
}
],
"source": [
"from fastNLP.core.field import Padder\n",
"import numpy as np\n",
"class FixLengthPadder(Padder):\n",
" def __init__(self, pad_val=0, length=None):\n",
" super().__init__(pad_val=pad_val)\n",
" self.length = length\n",
" assert self.length is not None, \"Creating FixLengthPadder with no specific length!\"\n",
"\n",
" def __call__(self, contents, field_name, field_ele_dtype, dim):\n",
" #计算当前contents中的最大长度\n",
" max_len = max(map(len, contents))\n",
" #如果当前contents中的最大长度大于指定的padder length的话就报错\n",
" assert max_len <= self.length, \"Fixed padder length smaller than actual length! with length {}\".format(max_len)\n",
" array = np.full((len(contents), self.length), self.pad_val, dtype=field_ele_dtype)\n",
" for i, content_i in enumerate(contents):\n",
" array[i, :len(content_i)] = content_i\n",
" return array\n",
"\n",
"#设定FixLengthPadder的固定长度为40\n",
"tmp_padder = FixLengthPadder(pad_val=0,length=40)\n",
"#利用dataset的set_padder函数设定words field的padder\n",
"tmp_data.set_padder('words',tmp_padder)\n",
"batch = DataSetIter(batch_size=2, dataset=tmp_data, sampler=sampler)\n",
"for batch_x, batch_y in batch:\n",
" print(\"batch_x: \",batch_x)\n",
" print(\"batch_y: \", batch_y)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 使用DataSetIter自己编写训练过程\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"-----start training-----\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 2.68 seconds!\n",
"Epoch 0 Avg Loss: 0.66 AccuracyMetric: acc=0.708716 29307ms\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.38 seconds!\n",
"Epoch 1 Avg Loss: 0.41 AccuracyMetric: acc=0.770642 52200ms\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.51 seconds!\n",
"Epoch 2 Avg Loss: 0.16 AccuracyMetric: acc=0.747706 70268ms\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.96 seconds!\n",
"Epoch 3 Avg Loss: 0.06 AccuracyMetric: acc=0.741972 90349ms\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 1.04 seconds!\n",
"Epoch 4 Avg Loss: 0.03 AccuracyMetric: acc=0.740826 114250ms\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.8 seconds!\n",
"Epoch 5 Avg Loss: 0.02 AccuracyMetric: acc=0.738532 134742ms\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.65 seconds!\n",
"Epoch 6 Avg Loss: 0.01 AccuracyMetric: acc=0.731651 154503ms\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.8 seconds!\n",
"Epoch 7 Avg Loss: 0.01 AccuracyMetric: acc=0.738532 175397ms\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.36 seconds!\n",
"Epoch 8 Avg Loss: 0.01 AccuracyMetric: acc=0.733945 192384ms\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=55.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.84 seconds!\n",
"Epoch 9 Avg Loss: 0.01 AccuracyMetric: acc=0.744266 214417ms\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=5.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.04 seconds!\n",
"[tester] \n",
"AccuracyMetric: acc=0.786667\n"
]
},
{
"data": {
"text/plain": [
"{'AccuracyMetric': {'acc': 0.786667}}"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import BucketSampler\n",
"from fastNLP import DataSetIter\n",
"from fastNLP.models import CNNText\n",
"from fastNLP import Tester\n",
"import torch\n",
"import time\n",
"\n",
"embed_dim = 100\n",
"model = CNNText((len(vocab),embed_dim), num_classes=2, dropout=0.1)\n",
"\n",
"def train(epoch, data, devdata):\n",
" optimizer = torch.optim.Adam(model.parameters(), lr=0.001)\n",
" lossfunc = torch.nn.CrossEntropyLoss()\n",
" batch_size = 32\n",
"\n",
" # 定义一个Batch,传入DataSet,规定batch_size和去batch的规则。\n",
" # 顺序(Sequential),随机(Random),相似长度组成一个batch(Bucket)\n",
" train_sampler = BucketSampler(batch_size=batch_size, seq_len_field_name='seq_len')\n",
" train_batch = DataSetIter(batch_size=batch_size, dataset=data, sampler=train_sampler)\n",
"\n",
" start_time = time.time()\n",
" print(\"-\"*5+\"start training\"+\"-\"*5)\n",
" for i in range(epoch):\n",
" loss_list = []\n",
" for batch_x, batch_y in train_batch:\n",
" optimizer.zero_grad()\n",
" output = model(batch_x['words'])\n",
" loss = lossfunc(output['pred'], batch_y['target'])\n",
" loss.backward()\n",
" optimizer.step()\n",
" loss_list.append(loss.item())\n",
"\n",
" #这里verbose如果为0,在调用Tester对象的test()函数时不输出任何信息,返回评估信息; 如果为1,打印出验证结果,返回评估信息\n",
" #在调用过Tester对象的test()函数后,调用其_format_eval_results(res)函数,结构化输出验证结果\n",
" tester_tmp = Tester(devdata, model, metrics=AccuracyMetric(), verbose=0)\n",
" res=tester_tmp.test()\n",
"\n",
" print('Epoch {:d} Avg Loss: {:.2f}'.format(i, sum(loss_list) / len(loss_list)),end=\" \")\n",
" print(tester_tmp._format_eval_results(res),end=\" \")\n",
" print('{:d}ms'.format(round((time.time()-start_time)*1000)))\n",
" loss_list.clear()\n",
"\n",
"train(10, train_data, dev_data)\n",
"#使用tester进行快速测试\n",
"tester = Tester(test_data, model, metrics=AccuracyMetric())\n",
"tester.test()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python Now",
"language": "python",
"name": "now"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

+ 1206
- 0
tutorials/tutorial_7_metrics.ipynb
File diff suppressed because it is too large
View File


+ 1014
- 0
tutorials/tutorial_8_modules_models.ipynb
File diff suppressed because it is too large
View File


tutorials/tutorial_10_callback.ipynb → tutorials/tutorial_9_callback.ipynb View File


+ 912
- 0
tutorials/序列标注.ipynb View File

@@ -0,0 +1,912 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 序列标注\n",
"\n",
"这一部分的内容主要展示如何使用fastNLP实现序列标注(Sequence labeling)任务。您可以使用fastNLP的各个组件快捷,方便地完成序列标注任务,达到出色的效果。 在阅读这篇教程前,希望您已经熟悉了fastNLP的基础使用,尤其是数据的载入以及模型的构建,通过这个小任务的能让您进一步熟悉fastNLP的使用。\n",
"\n",
"## 命名实体识别(name entity recognition, NER)\n",
"\n",
"命名实体识别任务是从文本中抽取出具有特殊意义或者指代性非常强的实体,通常包括人名、地名、机构名和时间等。 如下面的例子中\n",
"\n",
"*我来自复旦大学*\n",
"\n",
"其中“复旦大学”就是一个机构名,命名实体识别就是要从中识别出“复旦大学”这四个字是一个整体,且属于机构名这个类别。这个问题在实际做的时候会被 转换为序列标注问题\n",
"\n",
"针对\"我来自复旦大学\"这句话,我们的预测目标将是[O, O, O, B-ORG, I-ORG, I-ORG, I-ORG],其中O表示out,即不是一个实体,B-ORG是ORG( organization的缩写)这个类别的开头(Begin),I-ORG是ORG类别的中间(Inside)。\n",
"\n",
"在本tutorial中我们将通过fastNLP尝试写出一个能够执行以上任务的模型。\n",
"\n",
"## 载入数据\n",
"\n",
"fastNLP的数据载入主要是由Loader与Pipe两个基类衔接完成的,您可以通过《使用Loader和Pipe处理数据》了解如何使用fastNLP提供的数据加载函数。下面我们以微博命名实体任务来演示一下在fastNLP进行序列标注任务。"
]
},
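{
"cell_type": "markdown",
"metadata": {},
"source": [
"为了更直观地理解字与标签的对应关系,下面用一小段代码把上面的例句和标注序列打印出来(仅作示意,不属于后续流程):\n",
"\n",
"```python\n",
"chars = ['我', '来', '自', '复', '旦', '大', '学']\n",
"tags = ['O', 'O', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'I-ORG']\n",
"for ch, tag in zip(chars, tags):\n",
"    print(ch, tag)\n",
"```"
]
},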
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----------------------------------+-----------------------------------+-----------------------------------+---------+\n",
"| raw_chars | target | chars | seq_len |\n",
"+-----------------------------------+-----------------------------------+-----------------------------------+---------+\n",
"| ['科', '技', '全', '方', '位',... | [0, 0, 0, 0, 0, 0, 0, 0, 0, 0,... | [792, 1015, 156, 198, 291, 714... | 26 |\n",
"| ['对', ',', '输', '给', '一',... | [0, 0, 0, 0, 0, 0, 3, 1, 0, 0,... | [123, 2, 1205, 115, 8, 24, 101... | 15 |\n",
"+-----------------------------------+-----------------------------------+-----------------------------------+---------+\n"
]
}
],
"source": [
"from fastNLP.io import WeiboNERPipe\n",
"data_bundle = WeiboNERPipe().process_from_file()\n",
"print(data_bundle.get_dataset('train')[:2])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 模型构建\n",
"\n",
"首先选择需要使用的Embedding类型。关于Embedding的相关说明可以参见《使用Embedding模块将文本转成向量》。 在这里我们使用通过word2vec预训练的中文汉字embedding。"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 3321 out of 3471 words in the pre-training embedding.\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"\n",
"embed = StaticEmbedding(vocab=data_bundle.get_vocab('chars'), model_dir_or_name='cn-char-fastnlp-100d')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"选择好Embedding之后,我们可以使用fastNLP中自带的 fastNLP.models.BiLSTMCRF 作为模型。"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP.models import BiLSTMCRF\n",
"\n",
"data_bundle.rename_field('chars', 'words') # 这是由于BiLSTMCRF模型的forward函数接受的words,而不是chars,所以需要把这一列重新命名\n",
"model = BiLSTMCRF(embed=embed, num_classes=len(data_bundle.get_vocab('target')), num_layers=1, hidden_size=200, dropout=0.5,\n",
" target_vocab=data_bundle.get_vocab('target'))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 进行训练\n",
"下面我们选择用来评估模型的metric,以及优化用到的优化函数。"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP import SpanFPreRecMetric\n",
"from torch.optim import Adam\n",
"from fastNLP import LossInForward\n",
"\n",
"metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'))\n",
"optimizer = Adam(model.parameters(), lr=1e-2)\n",
"loss = LossInForward()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"使用Trainer进行训练, 您可以通过修改 device 的值来选择显卡。"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"input fields after batch(if batch size is 2):\n",
"\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26]) \n",
"\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"\twords: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26]) \n",
"target fields after batch(if batch size is 2):\n",
"\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26]) \n",
"\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"\n",
"training epochs started 2020-02-27-13-53-24\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=430.0), HTML(value='')), layout=Layout(di…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.89 seconds!\n",
"\r",
"Evaluation on dev at Epoch 1/10. Step:43/430: \n",
"\r",
"SpanFPreRecMetric: f=0.067797, pre=0.192771, rec=0.041131\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.9 seconds!\n",
"\r",
"Evaluation on dev at Epoch 2/10. Step:86/430: \n",
"\r",
"SpanFPreRecMetric: f=0.344086, pre=0.568047, rec=0.246787\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.88 seconds!\n",
"\r",
"Evaluation on dev at Epoch 3/10. Step:129/430: \n",
"\r",
"SpanFPreRecMetric: f=0.446701, pre=0.653465, rec=0.339332\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.81 seconds!\n",
"\r",
"Evaluation on dev at Epoch 4/10. Step:172/430: \n",
"\r",
"SpanFPreRecMetric: f=0.479871, pre=0.642241, rec=0.383033\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.91 seconds!\n",
"\r",
"Evaluation on dev at Epoch 5/10. Step:215/430: \n",
"\r",
"SpanFPreRecMetric: f=0.486312, pre=0.650862, rec=0.388175\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.87 seconds!\n",
"\r",
"Evaluation on dev at Epoch 6/10. Step:258/430: \n",
"\r",
"SpanFPreRecMetric: f=0.541401, pre=0.711297, rec=0.437018\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.86 seconds!\n",
"\r",
"Evaluation on dev at Epoch 7/10. Step:301/430: \n",
"\r",
"SpanFPreRecMetric: f=0.430335, pre=0.685393, rec=0.313625\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.82 seconds!\n",
"\r",
"Evaluation on dev at Epoch 8/10. Step:344/430: \n",
"\r",
"SpanFPreRecMetric: f=0.477759, pre=0.665138, rec=0.372751\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.81 seconds!\n",
"\r",
"Evaluation on dev at Epoch 9/10. Step:387/430: \n",
"\r",
"SpanFPreRecMetric: f=0.500759, pre=0.611111, rec=0.424165\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=9.0), HTML(value='')), layout=Layout(disp…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 0.8 seconds!\n",
"\r",
"Evaluation on dev at Epoch 10/10. Step:430/430: \n",
"\r",
"SpanFPreRecMetric: f=0.496025, pre=0.65, rec=0.401028\n",
"\n",
"\r\n",
"In Epoch:6/Step:258, got best dev performance:\n",
"SpanFPreRecMetric: f=0.541401, pre=0.711297, rec=0.437018\n",
"Reloaded the best model.\n"
]
},
{
"data": {
"text/plain": [
"{'best_eval': {'SpanFPreRecMetric': {'f': 0.541401,\n",
" 'pre': 0.711297,\n",
" 'rec': 0.437018}},\n",
" 'best_epoch': 6,\n",
" 'best_step': 258,\n",
" 'seconds': 121.39}"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Trainer\n",
"import torch\n",
"\n",
"device= 0 if torch.cuda.is_available() else 'cpu'\n",
"trainer = Trainer(data_bundle.get_dataset('train'), model, loss=loss, optimizer=optimizer,\n",
" dev_data=data_bundle.get_dataset('dev'), metrics=metric, device=device)\n",
"trainer.train()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 进行测试\n",
"训练结束之后过,可以通过 Tester 测试其在测试集上的性能"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=17.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 1.54 seconds!\n",
"[tester] \n",
"SpanFPreRecMetric: f=0.439024, pre=0.685279, rec=0.322967\n"
]
},
{
"data": {
"text/plain": [
"{'SpanFPreRecMetric': {'f': 0.439024, 'pre': 0.685279, 'rec': 0.322967}}"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Tester\n",
"tester = Tester(data_bundle.get_dataset('test'), model, metrics=metric)\n",
"tester.test()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 使用更强的Bert做序列标注\n",
"\n",
"在fastNLP使用Bert进行任务,您只需要把fastNLP.embeddings.StaticEmbedding 切换为 fastNLP.embeddings.BertEmbedding(可修改 device 选择显卡)。"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-chinese-wwm/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-chinese-wwm/chinese_wwm_pytorch.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 3384 words out of 3471.\n",
"input fields after batch(if batch size is 2):\n",
"\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26]) \n",
"\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"\twords: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26]) \n",
"target fields after batch(if batch size is 2):\n",
"\ttarget: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 26]) \n",
"\tseq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2]) \n",
"\n",
"training epochs started 2020-02-27-13-58-51\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=1130.0), HTML(value='')), layout=Layout(d…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=23.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 2.7 seconds!\n",
"Evaluation on dev at Epoch 1/10. Step:113/1130: \n",
"SpanFPreRecMetric: f=0.008114, pre=0.019231, rec=0.005141\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=23.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 2.49 seconds!\n",
"Evaluation on dev at Epoch 2/10. Step:226/1130: \n",
"SpanFPreRecMetric: f=0.467866, pre=0.467866, rec=0.467866\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=23.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 2.6 seconds!\n",
"Evaluation on dev at Epoch 3/10. Step:339/1130: \n",
"SpanFPreRecMetric: f=0.566879, pre=0.482821, rec=0.686375\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=23.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 2.56 seconds!\n",
"Evaluation on dev at Epoch 4/10. Step:452/1130: \n",
"SpanFPreRecMetric: f=0.651972, pre=0.59408, rec=0.722365\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=23.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 2.69 seconds!\n",
"\r",
"Evaluation on dev at Epoch 5/10. Step:565/1130: \n",
"\r",
"SpanFPreRecMetric: f=0.640909, pre=0.574338, rec=0.724936\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=23.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 2.52 seconds!\n",
"Evaluation on dev at Epoch 6/10. Step:678/1130: \n",
"SpanFPreRecMetric: f=0.661836, pre=0.624146, rec=0.70437\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=23.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 2.67 seconds!\n",
"Evaluation on dev at Epoch 7/10. Step:791/1130: \n",
"SpanFPreRecMetric: f=0.683429, pre=0.615226, rec=0.768638\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=23.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 2.37 seconds!\n",
"\r",
"Evaluation on dev at Epoch 8/10. Step:904/1130: \n",
"\r",
"SpanFPreRecMetric: f=0.674699, pre=0.634921, rec=0.719794\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=23.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Evaluate data in 2.42 seconds!\n",
"Evaluation on dev at Epoch 9/10. Step:1017/1130: \n",
"SpanFPreRecMetric: f=0.693878, pre=0.650901, rec=0.742931\n",
"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=23.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 2.46 seconds!\n",
"\r",
"Evaluation on dev at Epoch 10/10. Step:1130/1130: \n",
"\r",
"SpanFPreRecMetric: f=0.686845, pre=0.62766, rec=0.758355\n",
"\n",
"\r\n",
"In Epoch:9/Step:1017, got best dev performance:\n",
"SpanFPreRecMetric: f=0.693878, pre=0.650901, rec=0.742931\n",
"Reloaded the best model.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=17.0), HTML(value='')), layout=Layout(dis…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r",
"Evaluate data in 1.96 seconds!\n",
"[tester] \n",
"SpanFPreRecMetric: f=0.626561, pre=0.596112, rec=0.660287\n"
]
},
{
"data": {
"text/plain": [
"{'SpanFPreRecMetric': {'f': 0.626561, 'pre': 0.596112, 'rec': 0.660287}}"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"from fastNLP.io import WeiboNERPipe\n",
"data_bundle = WeiboNERPipe().process_from_file()\n",
"data_bundle.rename_field('chars', 'words')\n",
"\n",
"from fastNLP.embeddings import BertEmbedding\n",
"embed = BertEmbedding(vocab=data_bundle.get_vocab('words'), model_dir_or_name='cn')\n",
"model = BiLSTMCRF(embed=embed, num_classes=len(data_bundle.get_vocab('target')), num_layers=1, hidden_size=200, dropout=0.5,\n",
" target_vocab=data_bundle.get_vocab('target'))\n",
"\n",
"from fastNLP import SpanFPreRecMetric\n",
"from torch.optim import Adam\n",
"from fastNLP import LossInForward\n",
"metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'))\n",
"optimizer = Adam(model.parameters(), lr=2e-5)\n",
"loss = LossInForward()\n",
"\n",
"from fastNLP import Trainer\n",
"import torch\n",
"device= 5 if torch.cuda.is_available() else 'cpu'\n",
"trainer = Trainer(data_bundle.get_dataset('train'), model, loss=loss, optimizer=optimizer, batch_size=12,\n",
" dev_data=data_bundle.get_dataset('dev'), metrics=metric, device=device)\n",
"trainer.train()\n",
"\n",
"from fastNLP import Tester\n",
"tester = Tester(data_bundle.get_dataset('test'), model, metrics=metric)\n",
"tester.test()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python Now",
"language": "python",
"name": "now"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
