diff --git a/tutorials/fastnlp_tutorial_1.ipynb b/tutorials/fastnlp_tutorial_1.ipynb index c378b54a..ba7452b9 100644 --- a/tutorials/fastnlp_tutorial_1.ipynb +++ b/tutorials/fastnlp_tutorial_1.ipynb @@ -80,7 +80,7 @@ } ], "source": [ - "from fastNLP.core.dataset import DataSet\n", + "from fastNLP import DataSet\n", "\n", "data = {'idx': [0, 1, 2], \n", " 'sentence':[\"This is an apple .\", \"I like apples .\", \"Apples are good for our health .\"],\n", @@ -153,7 +153,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "1608199516936 1607874531400\n", + "1630555358408 1630228349768\n", "+-----+------------------------+------------------------+-----+\n", "| idx | sentence | words | num |\n", "+-----+------------------------+------------------------+-----+\n", @@ -198,7 +198,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "1607874531400 1607874531400\n", + "1630228349768 1630228349768\n", "+-----+------------------------+------------------------+-----+\n", "| idx | sentence | words | num |\n", "+-----+------------------------+------------------------+-----+\n", @@ -302,6 +302,7 @@ "\n", "  **`apply`和`apply_field`返回的是个列表**,**`apply_more`和`apply_field_more`返回的是个字典**\n", "\n", + "    预处理过程中,通过`progress_bar`参数设置显示进度条类型,通过`num_proc`设置多进程\n", "***\n", "\n", "`apply`的参数包括一个函数`func`和一个新字段名`new_field_name`,函数`func`的处理对象是`dataset`模块中\n", @@ -311,30 +312,45 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "72a0b5f9", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "8532c5609a394c19b60315663a6f0f4a", + "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Output()" + "Processing: 0%| | 0/3 [00:00,\n", + " 'words': ,\n", + " 'num': }" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "dataset.get_all_fields()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "5433815c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['num', 'sentence', 'words']" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "dataset.get_field_names()" ] @@ -553,10 +729,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "id": "25ce5488", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3 False\n", + "6 True\n", + "+------------------------------+------------------------------+--------+\n", + "| sentence | words | length |\n", + "+------------------------------+------------------------------+--------+\n", + "| This is an apple . | ['This', 'is', 'an', 'app... | 5 |\n", + "| I like apples . | ['I', 'like', 'apples', '... | 4 |\n", + "| Apples are good for our h... | ['Apples', 'are', 'good',... | 7 |\n", + "| This is an apple . | ['This', 'is', 'an', 'app... | 5 |\n", + "| I like apples . | ['I', 'like', 'apples', '... | 4 |\n", + "| Apples are good for our h... | ['Apples', 'are', 'good',... 
| 7 |\n", + "+------------------------------+------------------------------+--------+\n" + ] + } + ], "source": [ "print(len(dataset), dataset.has_field('length')) \n", "if 'num' in dataset:\n", @@ -588,12 +783,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "id": "3515e096", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Vocabulary([]...)\n", + "{'': 0, '': 1}\n", + " 0\n", + " 1\n" + ] + } + ], "source": [ - "from fastNLP.core.vocabulary import Vocabulary\n", + "from fastNLP import Vocabulary\n", "\n", "vocab = Vocabulary()\n", "print(vocab)\n", @@ -614,10 +820,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "id": "88c7472a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "5 Counter({'生活': 1, '就像': 1, '海洋': 1})\n", + "6 Counter({'生活': 1, '就像': 1, '海洋': 1, '只有': 1})\n", + "6 {'': 0, '': 1, '生活': 2, '就像': 3, '海洋': 4, '只有': 5}\n" + ] + } + ], "source": [ "vocab.add_word_lst(['生活', '就像', '海洋'])\n", "print(len(vocab), vocab.word_count)\n", @@ -640,10 +856,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "id": "3447acde", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 0\n", + " 1\n", + "生活 2\n", + "彼岸 1 False\n" + ] + } + ], "source": [ "print(vocab.to_word(0), vocab.to_index(''))\n", "print(vocab.to_word(1), vocab.to_index(''))\n", @@ -665,10 +892,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "id": "490b101c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "生活 2\n", + "彼岸 12 True\n", + "13 Counter({'人': 4, '生活': 2, '就像': 2, '海洋': 2, '只有': 2, '意志': 1, '坚强的': 1, '才': 1, '能': 1, '到达': 1, '彼岸': 1})\n", + "13 {'': 0, '': 1, '生活': 2, '就像': 3, '海洋': 4, '只有': 5, '人': 6, '意志': 7, '坚强的': 8, '才': 9, '能': 10, '到达': 11, '彼岸': 12}\n" + ] + } + ], "source": [ "vocab.add_word_lst(['生活', '就像', '海洋', '只有', '意志', '坚强的', '人', '人', '人', '人', '才', '能', '到达', '彼岸'])\n", "print(vocab.to_word(2), vocab.to_index('生活'))\n", @@ -691,10 +929,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "id": "a99ff909", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'positive': 0, 'negative': 1}\n", + "ValueError: word `neutral` not in vocabulary\n" + ] + } + ], "source": [ "vocab = Vocabulary(unknown=None, padding=None)\n", "\n", @@ -717,10 +964,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "id": "432f74c1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'': 0, 'positive': 1, 'negative': 2}\n", + "0 \n" + ] + } + ], "source": [ "vocab = Vocabulary(unknown='', padding=None)\n", "\n", @@ -746,10 +1002,92 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "id": "3dbd985d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SentenceIdSentenceSentiment
01A series of escapades demonstrating the adage ...negative
12This quiet , introspective and entertaining in...positive
23Even fans of Ismail Merchant 's work , I suspe...negative
34A positively thrilling combination of ethnogra...neutral
45A comedy-drama of nearly epic proportions root...positive
56The Importance of Being Earnest , so thick wit...neutral
\n", + "
" + ], + "text/plain": [ + " SentenceId Sentence Sentiment\n", + "0 1 A series of escapades demonstrating the adage ... negative\n", + "1 2 This quiet , introspective and entertaining in... positive\n", + "2 3 Even fans of Ismail Merchant 's work , I suspe... negative\n", + "3 4 A positively thrilling combination of ethnogra... neutral\n", + "4 5 A comedy-drama of nearly epic proportions root... positive\n", + "5 6 The Importance of Being Earnest , so thick wit... neutral" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import pandas as pd\n", "\n", @@ -767,17 +1105,49 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "id": "4f634586", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Processing: 0%| | 0/6 [00:00': 0, '': 1, 'a': 2, 'of': 3, ',': 4, 'the': 5, '.': 6, 'is': 7, 'and': 8, 'good': 9, 'for': 10, 'which': 11, 'this': 12, \"'s\": 13, 'series': 14, 'escapades': 15, 'demonstrating': 16, 'adage': 17, 'that': 18, 'what': 19, 'goose': 20, 'also': 21, 'gander': 22, 'some': 23, 'occasionally': 24, 'amuses': 25, 'but': 26, 'none': 27, 'amounts': 28, 'to': 29, 'much': 30, 'story': 31, 'quiet': 32, 'introspective': 33, 'entertaining': 34, 'independent': 35, 'worth': 36, 'seeking': 37, 'even': 38, 'fans': 39, 'ismail': 40, 'merchant': 41, 'work': 42, 'i': 43, 'suspect': 44, 'would': 45, 'have': 46, 'hard': 47, 'time': 48, 'sitting': 49, 'through': 50, 'one': 51, 'positively': 52, 'thrilling': 53, 'combination': 54, 'ethnography': 55, 'all': 56, 'intrigue': 57, 'betrayal': 58, 'deceit': 59, 'murder': 60, 'shakespearean': 61, 'tragedy': 62, 'or': 63, 'juicy': 64, 'soap': 65, 'opera': 66, 'comedy-drama': 67, 'nearly': 68, 'epic': 69, 'proportions': 70, 'rooted': 71, 'in': 72, 'sincere': 73, 'performance': 74, 'by': 75, 'title': 76, 'character': 77, 'undergoing': 78, 'midlife': 79, 'crisis': 80, 'importance': 81, 'being': 82, 'earnest': 83, 'so': 84, 'thick': 85, 'with': 86, 'wit': 87, 'it': 88, 'plays': 89, 'like': 90, 'reading': 91, 'from': 92, 'bartlett': 93, 'familiar': 94, 'quotations': 95} \n", + "\n", + "Vocabulary(['a', 'series', 'of', 'escapades', 'demonstrating']...)\n" + ] + } + ], "source": [ - "from fastNLP.core.vocabulary import Vocabulary\n", + "from fastNLP import Vocabulary\n", "\n", "vocab = Vocabulary()\n", "vocab = vocab.from_dataset(dataset, field_name='Sentence')\n", @@ -841,10 +1223,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "id": "2f9a04b2", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------------+------------------------------+-----------+\n", + "| SentenceId | Sentence | Sentiment |\n", + "+------------+------------------------------+-----------+\n", + "| 1 | [2, 14, 3, 15, 16, 5, 17,... | negative |\n", + "| 2 | [12, 32, 4, 33, 8, 34, 35... | positive |\n", + "| 3 | [38, 39, 3, 40, 41, 13, 4... | negative |\n", + "| 4 | [2, 52, 53, 54, 3, 55, 8,... | neutral |\n", + "| 5 | [2, 67, 3, 68, 69, 70, 71... | positive |\n", + "| 6 | [5, 81, 3, 82, 83, 4, 84,... 
| neutral |\n", + "+------------+------------------------------+-----------+\n" + ] + } + ], "source": [ "vocab.index_dataset(dataset, field_name='Sentence')\n", "print(dataset)" @@ -860,10 +1259,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "id": "5f5eed18", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'negative': 0, 'positive': 1, 'neutral': 2}\n", + "+------------+------------------------------+-----------+\n", + "| SentenceId | Sentence | Sentiment |\n", + "+------------+------------------------------+-----------+\n", + "| 1 | [2, 14, 3, 15, 16, 5, 17,... | 0 |\n", + "| 2 | [12, 32, 4, 33, 8, 34, 35... | 1 |\n", + "| 3 | [38, 39, 3, 40, 41, 13, 4... | 0 |\n", + "| 4 | [2, 52, 53, 54, 3, 55, 8,... | 2 |\n", + "| 5 | [2, 67, 3, 68, 69, 70, 71... | 1 |\n", + "| 6 | [5, 81, 3, 82, 83, 4, 84,... | 2 |\n", + "+------------+------------------------------+-----------+\n" + ] + } + ], "source": [ "target_vocab = Vocabulary(padding=None, unknown=None)\n", "\n", diff --git a/tutorials/fastnlp_tutorial_2.ipynb b/tutorials/fastnlp_tutorial_2.ipynb index 260d5bf4..ba9ad109 100644 --- a/tutorials/fastnlp_tutorial_2.ipynb +++ b/tutorials/fastnlp_tutorial_2.ipynb @@ -1,20 +1,327 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# T2. dataloader 和 tokenizer 的基本使用\n", + "\n", + "  1   fastNLP 中的 dataloader\n", + "\n", + "    1.1   databundle 的结构与使用\n", + "\n", + "    1.2   dataloader 的结构与使用\n", + "\n", + "  2   fastNLP 中的 tokenizer\n", + " \n", + "    2.1   传统 GloVe 词嵌入的加载\n", + " \n", + "    2.2   PreTrainedTokenizer 的概念\n", + "\n", + "    2.3   BertTokenizer 的基本使用\n", + "\n", + "  3   实例:NG20 数据集的完整加载过程\n", + " \n", + "    3.1   \n", + "\n", + "    3.2   " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. fastNLP 中的 dataloader\n", + "\n", + "### 1.1 databundle 的结构与使用\n", + "\n", + "在`fastNLP 0.8`中,在常用的数据加载模块`DataLoader`和数据集`DataSet`模块之间,还存在\n", + "\n", + "  一个中间模块,即 **数据包`DataBundle`模块**,可以从`fastNLP.io`路径中导入该模块\n", + "\n", + "在`fastNLP 0.8`中,**一个`databundle`数据包包含若干`dataset`数据集和`vocabulary`词汇表**\n", + "\n", + "  分别存储在`datasets`和`vocabs`两个变量中,所以了解`databundle`数据包之前\n", + "\n", + "  需要首先**复习`dataset`数据集和`vocabulary`词汇表**,**下面的一串代码**,**你知道其大概含义吗?**\n", + "\n", + "必要提示:`NG20`,全称[`News Group 20`](http://qwone.com/~jason/20Newsgroups/),是一个新闻文本分类数据集,包含20个大类以及若干小类\n", + "\n", + "  数据集包含训练集`'ng20_train.csv'`和测试集`'ng20_test.csv'`两部分,每条数据\n", + "\n", + "  包括`'label'`标签和`'text'`文本两个条目,通过`sample(frac=1)[:10]`随机采样并读取前十条" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Processing: 0%| | 0/10 [00:00': 0, '': 1, 'rec': 2, 'talk': 3, 'comp': 4, 'soc': 5, 'misc': 6, 'sci': 7}\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "from fastNLP import DataSet\n", + "from fastNLP import Vocabulary\n", + "from fastNLP.io import DataBundle\n", + "\n", + "datasets = {}\n", + "datasets['train'] = DataSet.from_pandas(pd.read_csv('./data/ng20_train.csv').sample(frac=1)[:10])\n", + "datasets['train'].apply_more(lambda ins:{'label': ins['label'].lower().split('.')[0], \n", + " 'text': ins['text'].lower().split()},\n", + " progress_bar='tqdm')\n", + "datasets['test'] = DataSet.from_pandas(pd.read_csv('./data/ng20_test.csv').sample(frac=1)[:10])\n", + "datasets['test'].apply_more(lambda ins:{'label': ins['label'].lower().split('.')[0], \n", + " 'text': ins['text'].lower().split()},\n", + " progress_bar='tqdm')\n", + "print(datasets['train'])\n", + "\n", + "vocabs = {}\n", + "vocabs['label'] = Vocabulary().from_dataset(datasets['train'].concat(datasets['test'], inplace=False), field_name='label')\n", + "vocabs['text'] = Vocabulary().from_dataset(datasets['train'].concat(datasets['test'], inplace=False), field_name='text')\n", + "print(vocabs['label'].word2idx)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "数据集(比如:分开的训练集、验证集和测试集)以及各个field对应的vocabulary。\n", + " 该对象一般由fastNLP中各种Loader的load函数生成,可以通过以下的方法获取里面的内容" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "In total 2 datasets:\n", + "\ttrain has 10 instances.\n", + "\ttest has 10 instances.\n", + "In total 2 vocabs:\n", + "\tlabel has 8 entries.\n", + "\ttext has 1687 entries.\n", + "\n" + ] + } + ], + "source": [ + "data_bundle = DataBundle(datasets=datasets, vocabs=vocabs)\n", + "print(data_bundle)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1.2 dataloader 的结构与使用" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. 
fastNLP 中的 tokenizer\n",
+    "\n",
+    "### 2.1 传统 GloVe 词嵌入的加载"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.2 PreTrainedTokenizer 的概念\n",
+    "\n",
+    "在`fastNLP 0.8`中,**使用`PreTrainedTokenizer`模块对数据集中的文本进行分词和编号标注**\n",
+    "\n",
+    "  需要注意的是,`PreTrainedTokenizer`模块的下载和导入**需要确保环境安装了`transformers`模块**\n",
+    "\n",
+    "  这是因为 `fastNLP 0.8`中`PreTrainedTokenizer`模块的实现基于`Huggingface Transformers`库\n",
+    "\n",
+    "**`Huggingface Transformers`是一个开源的**,**基于`transformer`模型结构的预训练语言模型库**\n",
+    "\n",
+    "  包含了多种经典的基于`transformer`的预训练模型,如`BERT`、`BART`、`RoBERTa`、`GPT2`、`CPT`\n",
+    "\n",
+    "  更多相关内容可以参考`Huggingface Transformers`的[相关论文](https://arxiv.org/pdf/1910.03771.pdf)、[官方文档](https://huggingface.co/transformers/)以及[代码仓库](https://github.com/huggingface/transformers)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2.3 BertTokenizer 的基本使用\n",
+    "\n",
+    "在`fastNLP 0.8`中,以`PreTrainedTokenizer`为基类,泛化出多个子类,实现基于`BERT`等模型的标注\n",
+    "\n",
+    "  本节以`BertTokenizer`模块为例,展示`PreTrainedTokenizer`模块的使用方法与应用实例\n",
+    "\n",
+    "**`BertTokenizer`的初始化包括 导入模块和加载模型 两步**,先从`fastNLP.transformers.torch`中\n",
+    "\n",
+    "  导入`BertTokenizer`模块,再通过`from_pretrained`方法指定预训练模型的类型并自动下载\n",
+    "\n",
+    "  其中,**`'bert-base-uncased'`指定`tokenizer`使用的预训练`BERT`类型**:单词不区分大小写\n",
+    "\n",
+    "    **模块层数`L=12`**,**隐藏层维度`H=768`**,**自注意力头数`A=12`**,**总参数量`110M`**\n",
+    "\n",
+    "  另外,模型参数自动下载至 home 目录下的`~/.cache/huggingface/transformers`文件夹中"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "collapsed": true
+    "scrolled": false
    },
    "outputs": [],
-   "source": []
+   "source": [
+    "from fastNLP.transformers.torch import BertTokenizer\n",
+    "\n",
+    "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')"
+   ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "dir(tokenizer)"
+   ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. 实例:NG20 数据集的完整加载过程\n",
+    "\n",
+    "### 3.1 使用 BertTokenizer 处理数据集\n",
+    "\n",
+    "完成`BertTokenizer`的初始化后,就可以用它对`NG20`数据集进行预处理,具体步骤如下\n",
+    "\n",
+    "  首先,通过`DataSet.from_pandas`读入`'ng20_test.csv'`,构造`dataset`数据集\n",
+    "\n",
+    "  接着,通过`apply_field_more`对`'text'`字段逐条调用`tokenizer.encode_plus`\n",
+    "\n",
+    "    从而为每条数据新增`'input_ids'`、`'attention_mask'`、`'token_type_ids'`三个字段\n",
+    "\n",
+    "  最后,通过`set_pad`将`'input_ids'`的填充值对齐到`tokenizer`的`pad`值,并用`set_ignore`忽略不再需要的原始字段\n",
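+    "\n",
+    "下面是一个极简的示意代码(**假设**`tokenizer`已按上文方式加载完毕,示例句子与参数仅用于演示):\n",
+    "\n",
+    "```python\n",
+    "res = tokenizer.encode_plus('this is an apple', max_length=10,\n",
+    "                            truncation=True, return_attention_mask=True)\n",
+    "# res 的行为类似字典,包含 input_ids、token_type_ids、attention_mask 三个键\n",
+    "# 对 bert-base-uncased 而言,input_ids 首尾分别是 [CLS] 和 [SEP] 对应的编号 101 和 102\n",
+    "print(res['input_ids'])\n",
+    "```"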
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "from fastNLP import DataSet\n", + "from fastNLP import Vocabulary\n", + "\n", + "dataset = DataSet.from_pandas(pd.read_csv('./data/ng20_test.csv'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from functools import partial\n", + "\n", + "encode = partial(tokenizer.encode_plus, max_length=100, truncation=True,\n", + " return_attention_mask=True)\n", + "# 会新增 input_ids 、 attention_mask 和 token_type_ids 这三个 field\n", + "dataset.apply_field_more(encode, field_name='text')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "target_vocab = Vocabulary(padding=None, unknown=None)\n", + "\n", + "target_vocab.from_dataset(*[ds for _, ds in data_bundle.iter_datasets()], field_name='label')\n", + "target_vocab.index_dataset(*[ds for _, ds in data_bundle.iter_datasets()], field_name='label',\n", + " new_field_name='labels')\n", + "# 需要将 input_ids 的 pad 值设置为 tokenizer 的 pad 值\n", + "dataset.set_pad('input_ids', pad_val=tokenizer.pad_token_id)\n", + "dataset.set_ignore('label', 'text') # 因为 label 是原始的不需要的 str ,所以我们可以忽略它,让它不要在 batch 的输出中出现" + ] } ], "metadata": { diff --git a/tutorials/fastnlp_tutorial_3.ipynb b/tutorials/fastnlp_tutorial_3.ipynb new file mode 100644 index 00000000..dbc0d42c --- /dev/null +++ b/tutorials/fastnlp_tutorial_3.ipynb @@ -0,0 +1,59 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "213d538c", + "metadata": {}, + "source": [ + "# T3. \n", + "\n", + "  1   \n", + " \n", + "    1.1   \n", + "\n", + "    1.2   \n", + "\n", + "  2   \n", + "\n", + "    2.1   \n", + "\n", + "    2.2   \n", + "\n", + "  3   \n", + " \n", + "    3.1   \n", + "\n", + "    3.2   " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b369137f", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tutorials/figures/T0-fig-parameter-matching.png b/tutorials/figures/T0-fig-parameter-matching.png index 410256ae..24013cc1 100644 Binary files a/tutorials/figures/T0-fig-parameter-matching.png and b/tutorials/figures/T0-fig-parameter-matching.png differ diff --git a/tutorials/figures/T0-fig-training-structure.png b/tutorials/figures/T0-fig-training-structure.png index 6569f3d4..edc2e2ff 100644 Binary files a/tutorials/figures/T0-fig-training-structure.png and b/tutorials/figures/T0-fig-training-structure.png differ