
update tutorial-12 lxr 220514

tags/v1.0.0alpha
lxr-tech x54-729, 2 years ago
commit 0f9d0758c6
5 changed files with 845 additions and 62 deletions:
  1. tutorials/fastnlp_tutorial_1.ipynb (+476, -59)
  2. tutorials/fastnlp_tutorial_2.ipynb (+310, -3)
  3. tutorials/fastnlp_tutorial_3.ipynb (+59, -0)
  4. tutorials/figures/T0-fig-parameter-matching.png (BIN)
  5. tutorials/figures/T0-fig-training-structure.png (BIN)

tutorials/fastnlp_tutorial_1.ipynb (+476, -59)

@@ -80,7 +80,7 @@
}
],
"source": [
"from fastNLP.core.dataset import DataSet\n",
"from fastNLP import DataSet\n",
"\n",
"data = {'idx': [0, 1, 2], \n",
" 'sentence':[\"This is an apple .\", \"I like apples .\", \"Apples are good for our health .\"],\n",
@@ -153,7 +153,7 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"1608199516936 1607874531400\n",
"1630555358408 1630228349768\n",
"+-----+------------------------+------------------------+-----+\n", "+-----+------------------------+------------------------+-----+\n",
"| idx | sentence | words | num |\n", "| idx | sentence | words | num |\n",
"+-----+------------------------+------------------------+-----+\n", "+-----+------------------------+------------------------+-----+\n",
@@ -198,7 +198,7 @@
"name": "stdout", "name": "stdout",
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"1607874531400 1607874531400\n",
"1630228349768 1630228349768\n",
"+-----+------------------------+------------------------+-----+\n", "+-----+------------------------+------------------------+-----+\n",
"| idx | sentence | words | num |\n", "| idx | sentence | words | num |\n",
"+-----+------------------------+------------------------+-----+\n", "+-----+------------------------+------------------------+-----+\n",
@@ -302,6 +302,7 @@
"\n", "\n",
"  **`apply`和`apply_field`返回的是个列表**,**`apply_more`和`apply_field_more`返回的是个字典**\n", "  **`apply`和`apply_field`返回的是个列表**,**`apply_more`和`apply_field_more`返回的是个字典**\n",
"\n", "\n",
"    预处理过程中,通过`progress_bar`参数设置显示进度条类型,通过`num_proc`设置多进程\n",
"***\n", "***\n",
"\n", "\n",
"`apply`的参数包括一个函数`func`和一个新字段名`new_field_name`,函数`func`的处理对象是`dataset`模块中\n", "`apply`的参数包括一个函数`func`和一个新字段名`new_field_name`,函数`func`的处理对象是`dataset`模块中\n",
@@ -311,30 +312,45 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"id": "72a0b5f9",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "8532c5609a394c19b60315663a6f0f4a",
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Output()"
"Processing: 0%| | 0/3 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----+------------------------------+------------------------------+\n",
"| idx | sentence | words |\n",
"+-----+------------------------------+------------------------------+\n",
"| 0 | This is an apple . | ['This', 'is', 'an', 'app... |\n",
"| 1 | I like apples . | ['I', 'like', 'apples', '... |\n",
"| 2 | Apples are good for our h... | ['Apples', 'are', 'good',... |\n",
"+-----+------------------------------+------------------------------+\n"
]
}
],
"source": [
"from fastNLP import DataSet\n",
"\n",
"data = {'idx': [0, 1, 2], \n", "data = {'idx': [0, 1, 2], \n",
" 'sentence':[\"This is an apple .\", \"I like apples .\", \"Apples are good for our health .\"], }\n", " 'sentence':[\"This is an apple .\", \"I like apples .\", \"Apples are good for our health .\"], }\n",
"dataset = DataSet(data)\n", "dataset = DataSet(data)\n",
"dataset.apply(lambda ins: ins['sentence'].split(), new_field_name='words')\n",
"dataset.apply(lambda ins: ins['sentence'].split(), new_field_name='words', progress_bar=\"tqdm\") #\n",
"print(dataset)" "print(dataset)"
] ]
}, },
@@ -348,10 +364,38 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"id": "b1a8631f",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Processing: 0%| | 0/3 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----+------------------------------+------------------------------+\n",
"| idx | sentence | words |\n",
"+-----+------------------------------+------------------------------+\n",
"| 0 | This is an apple . | ['This', 'is', 'an', 'app... |\n",
"| 1 | I like apples . | ['I', 'like', 'apples', '... |\n",
"| 2 | Apples are good for our h... | ['Apples', 'are', 'good',... |\n",
"+-----+------------------------------+------------------------------+\n"
]
}
],
"source": [ "source": [
"dataset = DataSet(data)\n", "dataset = DataSet(data)\n",
"\n", "\n",
@@ -360,7 +404,7 @@
" words = sentence.split()\n", " words = sentence.split()\n",
" return words\n", " return words\n",
"\n", "\n",
"dataset.apply(get_words, new_field_name='words')\n",
"dataset.apply(get_words, new_field_name='words', progress_bar=\"tqdm\")\n",
"print(dataset)" "print(dataset)"
] ]
}, },
@@ -376,13 +420,42 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"id": "057c1d2c",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Processing: 0%| | 0/3 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----+------------------------------+------------------------------+\n",
"| idx | sentence | words |\n",
"+-----+------------------------------+------------------------------+\n",
"| 0 | This is an apple . | ['This', 'is', 'an', 'app... |\n",
"| 1 | I like apples . | ['I', 'like', 'apples', '... |\n",
"| 2 | Apples are good for our h... | ['Apples', 'are', 'good',... |\n",
"+-----+------------------------------+------------------------------+\n"
]
}
],
"source": [ "source": [
"dataset = DataSet(data)\n", "dataset = DataSet(data)\n",
"dataset.apply_field(lambda sent:sent.split(), field_name='sentence', new_field_name='words')\n",
"dataset.apply_field(lambda sent:sent.split(), field_name='sentence', new_field_name='words', \n",
" progress_bar=\"tqdm\")\n",
"print(dataset)" "print(dataset)"
] ]
}, },
@@ -398,13 +471,42 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 10,
"id": "51e2f02c",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Processing: 0%| | 0/3 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----+------------------------+------------------------+-----+\n",
"| idx | sentence | words | num |\n",
"+-----+------------------------+------------------------+-----+\n",
"| 0 | This is an apple . | ['This', 'is', 'an'... | 5 |\n",
"| 1 | I like apples . | ['I', 'like', 'appl... | 4 |\n",
"| 2 | Apples are good for... | ['Apples', 'are', '... | 7 |\n",
"+-----+------------------------+------------------------+-----+\n"
]
}
],
"source": [ "source": [
"dataset = DataSet(data)\n", "dataset = DataSet(data)\n",
"dataset.apply_more(lambda ins:{'words': ins['sentence'].split(), 'num': len(ins['sentence'].split())})\n",
"dataset.apply_more(lambda ins:{'words': ins['sentence'].split(), 'num': len(ins['sentence'].split())}, \n",
" progress_bar=\"tqdm\")\n",
"print(dataset)" "print(dataset)"
] ]
}, },
@@ -420,14 +522,42 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"id": "db4295d5",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Processing: 0%| | 0/3 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-----+------------------------+------------------------+-----+\n",
"| idx | sentence | words | num |\n",
"+-----+------------------------+------------------------+-----+\n",
"| 0 | This is an apple . | ['This', 'is', 'an'... | 5 |\n",
"| 1 | I like apples . | ['I', 'like', 'appl... | 4 |\n",
"| 2 | Apples are good for... | ['Apples', 'are', '... | 7 |\n",
"+-----+------------------------+------------------------+-----+\n"
]
}
],
"source": [ "source": [
"dataset = DataSet(data)\n", "dataset = DataSet(data)\n",
"dataset.apply_field_more(lambda sent:{'words': sent.split(), 'num': len(sent.split())}, \n", "dataset.apply_field_more(lambda sent:{'words': sent.split(), 'num': len(sent.split())}, \n",
" field_name='sentence')\n",
" field_name='sentence', progress_bar=\"tqdm\")\n",
"print(dataset)" "print(dataset)"
] ]
}, },
@@ -445,13 +575,13 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"id": "012f537c",
"metadata": {},
"outputs": [],
"source": [
"from fastNLP.core.dataset import DataSet\n",
"from fastNLP.core.dataset import Instance\n",
"from fastNLP import DataSet\n",
"from fastNLP import Instance\n",
"\n",
"dataset = DataSet([\n",
" Instance(sentence=\"This is an apple .\",\n",
@@ -476,10 +606,20 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 13,
"id": "a4c1c10d",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"dict_items([('sentence', 'This is an apple .'), ('words', ['This', 'is', 'an', 'apple', '.']), ('num', 5)])\n",
"dict_keys(['sentence', 'words', 'num'])\n",
"dict_values(['This is an apple .', ['This', 'is', 'an', 'apple', '.'], 5])\n"
]
}
],
"source": [ "source": [
"ins = Instance(sentence=\"This is an apple .\", words=['This', 'is', 'an', 'apple', '.'], num=5)\n", "ins = Instance(sentence=\"This is an apple .\", words=['This', 'is', 'an', 'apple', '.'], num=5)\n",
"\n", "\n",
@@ -498,10 +638,22 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 14,
"id": "55376402",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+--------------------+------------------------+-----+-----+\n",
"| sentence | words | num | idx |\n",
"+--------------------+------------------------+-----+-----+\n",
"| This is an apple . | ['This', 'is', 'an'... | 5 | 0 |\n",
"+--------------------+------------------------+-----+-----+\n"
]
}
],
"source": [ "source": [
"ins.add_field(field_name='idx', field=0)\n", "ins.add_field(field_name='idx', field=0)\n",
"print(ins)" "print(ins)"
@@ -521,20 +673,44 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 15,
"id": "fe15f4c1",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"{'sentence': <fastNLP.core.dataset.field.FieldArray at 0x17ba4b3a648>,\n",
" 'words': <fastNLP.core.dataset.field.FieldArray at 0x17ba4b3a6c8>,\n",
" 'num': <fastNLP.core.dataset.field.FieldArray at 0x17ba4b3a748>}"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"dataset.get_all_fields()" "dataset.get_all_fields()"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null,
"execution_count": 16,
"id": "5433815c", "id": "5433815c",
"metadata": {}, "metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"['num', 'sentence', 'words']"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"dataset.get_field_names()" "dataset.get_field_names()"
] ]
@@ -553,10 +729,29 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 17,
"id": "25ce5488",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3 False\n",
"6 True\n",
"+------------------------------+------------------------------+--------+\n",
"| sentence | words | length |\n",
"+------------------------------+------------------------------+--------+\n",
"| This is an apple . | ['This', 'is', 'an', 'app... | 5 |\n",
"| I like apples . | ['I', 'like', 'apples', '... | 4 |\n",
"| Apples are good for our h... | ['Apples', 'are', 'good',... | 7 |\n",
"| This is an apple . | ['This', 'is', 'an', 'app... | 5 |\n",
"| I like apples . | ['I', 'like', 'apples', '... | 4 |\n",
"| Apples are good for our h... | ['Apples', 'are', 'good',... | 7 |\n",
"+------------------------------+------------------------------+--------+\n"
]
}
],
"source": [ "source": [
"print(len(dataset), dataset.has_field('length')) \n", "print(len(dataset), dataset.has_field('length')) \n",
"if 'num' in dataset:\n", "if 'num' in dataset:\n",
@@ -588,12 +783,23 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 18,
"id": "3515e096",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Vocabulary([]...)\n",
"{'<pad>': 0, '<unk>': 1}\n",
"<pad> 0\n",
"<unk> 1\n"
]
}
],
"source": [ "source": [
"from fastNLP.core.vocabulary import Vocabulary\n",
"from fastNLP import Vocabulary\n",
"\n", "\n",
"vocab = Vocabulary()\n", "vocab = Vocabulary()\n",
"print(vocab)\n", "print(vocab)\n",
@@ -614,10 +820,20 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 19,
"id": "88c7472a",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"5 Counter({'生活': 1, '就像': 1, '海洋': 1})\n",
"6 Counter({'生活': 1, '就像': 1, '海洋': 1, '只有': 1})\n",
"6 {'<pad>': 0, '<unk>': 1, '生活': 2, '就像': 3, '海洋': 4, '只有': 5}\n"
]
}
],
"source": [ "source": [
"vocab.add_word_lst(['生活', '就像', '海洋'])\n", "vocab.add_word_lst(['生活', '就像', '海洋'])\n",
"print(len(vocab), vocab.word_count)\n", "print(len(vocab), vocab.word_count)\n",
@@ -640,10 +856,21 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 20,
"id": "3447acde",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<pad> 0\n",
"<unk> 1\n",
"生活 2\n",
"彼岸 1 False\n"
]
}
],
"source": [ "source": [
"print(vocab.to_word(0), vocab.to_index('<pad>'))\n", "print(vocab.to_word(0), vocab.to_index('<pad>'))\n",
"print(vocab.to_word(1), vocab.to_index('<unk>'))\n", "print(vocab.to_word(1), vocab.to_index('<unk>'))\n",
@@ -665,10 +892,21 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 21,
"id": "490b101c",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"生活 2\n",
"彼岸 12 True\n",
"13 Counter({'人': 4, '生活': 2, '就像': 2, '海洋': 2, '只有': 2, '意志': 1, '坚强的': 1, '才': 1, '能': 1, '到达': 1, '彼岸': 1})\n",
"13 {'<pad>': 0, '<unk>': 1, '生活': 2, '就像': 3, '海洋': 4, '只有': 5, '人': 6, '意志': 7, '坚强的': 8, '才': 9, '能': 10, '到达': 11, '彼岸': 12}\n"
]
}
],
"source": [ "source": [
"vocab.add_word_lst(['生活', '就像', '海洋', '只有', '意志', '坚强的', '人', '人', '人', '人', '才', '能', '到达', '彼岸'])\n", "vocab.add_word_lst(['生活', '就像', '海洋', '只有', '意志', '坚强的', '人', '人', '人', '人', '才', '能', '到达', '彼岸'])\n",
"print(vocab.to_word(2), vocab.to_index('生活'))\n", "print(vocab.to_word(2), vocab.to_index('生活'))\n",
@@ -691,10 +929,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 22,
"id": "a99ff909",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'positive': 0, 'negative': 1}\n",
"ValueError: word `neutral` not in vocabulary\n"
]
}
],
"source": [ "source": [
"vocab = Vocabulary(unknown=None, padding=None)\n", "vocab = Vocabulary(unknown=None, padding=None)\n",
"\n", "\n",
@@ -717,10 +964,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 23,
"id": "432f74c1",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'<unk>': 0, 'positive': 1, 'negative': 2}\n",
"0 <unk>\n"
]
}
],
"source": [ "source": [
"vocab = Vocabulary(unknown='<unk>', padding=None)\n", "vocab = Vocabulary(unknown='<unk>', padding=None)\n",
"\n", "\n",
@@ -746,10 +1002,92 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 24,
"id": "3dbd985d",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>SentenceId</th>\n",
" <th>Sentence</th>\n",
" <th>Sentiment</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>A series of escapades demonstrating the adage ...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>This quiet , introspective and entertaining in...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>Even fans of Ismail Merchant 's work , I suspe...</td>\n",
" <td>negative</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>A positively thrilling combination of ethnogra...</td>\n",
" <td>neutral</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>A comedy-drama of nearly epic proportions root...</td>\n",
" <td>positive</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>6</td>\n",
" <td>The Importance of Being Earnest , so thick wit...</td>\n",
" <td>neutral</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" SentenceId Sentence Sentiment\n",
"0 1 A series of escapades demonstrating the adage ... negative\n",
"1 2 This quiet , introspective and entertaining in... positive\n",
"2 3 Even fans of Ismail Merchant 's work , I suspe... negative\n",
"3 4 A positively thrilling combination of ethnogra... neutral\n",
"4 5 A comedy-drama of nearly epic proportions root... positive\n",
"5 6 The Importance of Being Earnest , so thick wit... neutral"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [ "source": [
"import pandas as pd\n", "import pandas as pd\n",
"\n", "\n",
@@ -767,17 +1105,49 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 25,
"id": "4f634586",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Processing: 0%| | 0/6 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------+------------------------------+-----------+\n",
"| SentenceId | Sentence | Sentiment |\n",
"+------------+------------------------------+-----------+\n",
"| 1 | ['a', 'series', 'of', 'es... | negative |\n",
"| 2 | ['this', 'quiet', ',', 'i... | positive |\n",
"| 3 | ['even', 'fans', 'of', 'i... | negative |\n",
"| 4 | ['a', 'positively', 'thri... | neutral |\n",
"| 5 | ['a', 'comedy-drama', 'of... | positive |\n",
"| 6 | ['the', 'importance', 'of... | neutral |\n",
"+------------+------------------------------+-----------+\n"
]
}
],
"source": [ "source": [
"from fastNLP.core.dataset import DataSet\n",
"from fastNLP import DataSet\n",
"\n", "\n",
"dataset = DataSet()\n", "dataset = DataSet()\n",
"dataset = dataset.from_pandas(df)\n", "dataset = dataset.from_pandas(df)\n",
"dataset.apply_more(lambda ins:{'SentenceId': ins['SentenceId'], \n", "dataset.apply_more(lambda ins:{'SentenceId': ins['SentenceId'], \n",
" 'Sentence': ins['Sentence'].lower().split(), 'Sentiment': ins['Sentiment']})\n",
" 'Sentence': ins['Sentence'].lower().split(), 'Sentiment': ins['Sentiment']}, \n",
" progress_bar=\"tqdm\")\n",
"print(dataset)" "print(dataset)"
] ]
}, },
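
A sketch of the pandas round trip in this cell, assuming a DataFrame `df` with the SentenceId/Sentence/Sentiment columns shown above (two rows here for brevity):

```python
import pandas as pd
from fastNLP import DataSet

df = pd.DataFrame({'SentenceId': [1, 2],
                   'Sentence': ['A series of escapades demonstrating the adage .',
                                'This quiet , introspective film is worth seeking .'],
                   'Sentiment': ['negative', 'positive']})

dataset = DataSet().from_pandas(df)          # DataFrame -> DataSet
dataset.apply_more(lambda ins: {'SentenceId': ins['SentenceId'],
                                'Sentence': ins['Sentence'].lower().split(),
                                'Sentiment': ins['Sentiment']},
                   progress_bar="tqdm")      # lower-case and tokenize in place
print(dataset)
```
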
@@ -791,7 +1161,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 26,
"id": "46722efc",
"metadata": {},
"outputs": [],
@@ -815,12 +1185,24 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 27,
"id": "a2de615b",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Counter({'a': 9, 'of': 9, ',': 7, 'the': 6, '.': 5, 'is': 3, 'and': 3, 'good': 2, 'for': 2, 'which': 2, 'this': 2, \"'s\": 2, 'series': 1, 'escapades': 1, 'demonstrating': 1, 'adage': 1, 'that': 1, 'what': 1, 'goose': 1, 'also': 1, 'gander': 1, 'some': 1, 'occasionally': 1, 'amuses': 1, 'but': 1, 'none': 1, 'amounts': 1, 'to': 1, 'much': 1, 'story': 1, 'quiet': 1, 'introspective': 1, 'entertaining': 1, 'independent': 1, 'worth': 1, 'seeking': 1, 'even': 1, 'fans': 1, 'ismail': 1, 'merchant': 1, 'work': 1, 'i': 1, 'suspect': 1, 'would': 1, 'have': 1, 'hard': 1, 'time': 1, 'sitting': 1, 'through': 1, 'one': 1, 'positively': 1, 'thrilling': 1, 'combination': 1, 'ethnography': 1, 'all': 1, 'intrigue': 1, 'betrayal': 1, 'deceit': 1, 'murder': 1, 'shakespearean': 1, 'tragedy': 1, 'or': 1, 'juicy': 1, 'soap': 1, 'opera': 1, 'comedy-drama': 1, 'nearly': 1, 'epic': 1, 'proportions': 1, 'rooted': 1, 'in': 1, 'sincere': 1, 'performance': 1, 'by': 1, 'title': 1, 'character': 1, 'undergoing': 1, 'midlife': 1, 'crisis': 1, 'importance': 1, 'being': 1, 'earnest': 1, 'so': 1, 'thick': 1, 'with': 1, 'wit': 1, 'it': 1, 'plays': 1, 'like': 1, 'reading': 1, 'from': 1, 'bartlett': 1, 'familiar': 1, 'quotations': 1}) \n",
"\n",
"{'<pad>': 0, '<unk>': 1, 'a': 2, 'of': 3, ',': 4, 'the': 5, '.': 6, 'is': 7, 'and': 8, 'good': 9, 'for': 10, 'which': 11, 'this': 12, \"'s\": 13, 'series': 14, 'escapades': 15, 'demonstrating': 16, 'adage': 17, 'that': 18, 'what': 19, 'goose': 20, 'also': 21, 'gander': 22, 'some': 23, 'occasionally': 24, 'amuses': 25, 'but': 26, 'none': 27, 'amounts': 28, 'to': 29, 'much': 30, 'story': 31, 'quiet': 32, 'introspective': 33, 'entertaining': 34, 'independent': 35, 'worth': 36, 'seeking': 37, 'even': 38, 'fans': 39, 'ismail': 40, 'merchant': 41, 'work': 42, 'i': 43, 'suspect': 44, 'would': 45, 'have': 46, 'hard': 47, 'time': 48, 'sitting': 49, 'through': 50, 'one': 51, 'positively': 52, 'thrilling': 53, 'combination': 54, 'ethnography': 55, 'all': 56, 'intrigue': 57, 'betrayal': 58, 'deceit': 59, 'murder': 60, 'shakespearean': 61, 'tragedy': 62, 'or': 63, 'juicy': 64, 'soap': 65, 'opera': 66, 'comedy-drama': 67, 'nearly': 68, 'epic': 69, 'proportions': 70, 'rooted': 71, 'in': 72, 'sincere': 73, 'performance': 74, 'by': 75, 'title': 76, 'character': 77, 'undergoing': 78, 'midlife': 79, 'crisis': 80, 'importance': 81, 'being': 82, 'earnest': 83, 'so': 84, 'thick': 85, 'with': 86, 'wit': 87, 'it': 88, 'plays': 89, 'like': 90, 'reading': 91, 'from': 92, 'bartlett': 93, 'familiar': 94, 'quotations': 95} \n",
"\n",
"Vocabulary(['a', 'series', 'of', 'escapades', 'demonstrating']...)\n"
]
}
],
"source": [ "source": [
"from fastNLP.core.vocabulary import Vocabulary\n",
"from fastNLP import Vocabulary\n",
"\n", "\n",
"vocab = Vocabulary()\n", "vocab = Vocabulary()\n",
"vocab = vocab.from_dataset(dataset, field_name='Sentence')\n", "vocab = vocab.from_dataset(dataset, field_name='Sentence')\n",
@@ -841,10 +1223,27 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 28,
"id": "2f9a04b2",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------+------------------------------+-----------+\n",
"| SentenceId | Sentence | Sentiment |\n",
"+------------+------------------------------+-----------+\n",
"| 1 | [2, 14, 3, 15, 16, 5, 17,... | negative |\n",
"| 2 | [12, 32, 4, 33, 8, 34, 35... | positive |\n",
"| 3 | [38, 39, 3, 40, 41, 13, 4... | negative |\n",
"| 4 | [2, 52, 53, 54, 3, 55, 8,... | neutral |\n",
"| 5 | [2, 67, 3, 68, 69, 70, 71... | positive |\n",
"| 6 | [5, 81, 3, 82, 83, 4, 84,... | neutral |\n",
"+------------+------------------------------+-----------+\n"
]
}
],
"source": [ "source": [
"vocab.index_dataset(dataset, field_name='Sentence')\n", "vocab.index_dataset(dataset, field_name='Sentence')\n",
"print(dataset)" "print(dataset)"
@@ -860,10 +1259,28 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 29,
"id": "5f5eed18",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'negative': 0, 'positive': 1, 'neutral': 2}\n",
"+------------+------------------------------+-----------+\n",
"| SentenceId | Sentence | Sentiment |\n",
"+------------+------------------------------+-----------+\n",
"| 1 | [2, 14, 3, 15, 16, 5, 17,... | 0 |\n",
"| 2 | [12, 32, 4, 33, 8, 34, 35... | 1 |\n",
"| 3 | [38, 39, 3, 40, 41, 13, 4... | 0 |\n",
"| 4 | [2, 52, 53, 54, 3, 55, 8,... | 2 |\n",
"| 5 | [2, 67, 3, 68, 69, 70, 71... | 1 |\n",
"| 6 | [5, 81, 3, 82, 83, 4, 84,... | 2 |\n",
"+------------+------------------------------+-----------+\n"
]
}
],
"source": [ "source": [
"target_vocab = Vocabulary(padding=None, unknown=None)\n", "target_vocab = Vocabulary(padding=None, unknown=None)\n",
"\n", "\n",


tutorials/fastnlp_tutorial_2.ipynb (+310, -3)

@@ -1,20 +1,327 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# T2. dataloader 和 tokenizer 的基本使用\n",
"\n",
"&emsp; 1 &ensp; fastNLP 中的 dataloader\n",
"\n",
"&emsp; &emsp; 1.1 &ensp; databundle 的结构与使用\n",
"\n",
"&emsp; &emsp; 1.2 &ensp; dataloader 的结构与使用\n",
"\n",
"&emsp; 2 &ensp; fastNLP 中的 tokenizer\n",
" \n",
"&emsp; &emsp; 2.1 &ensp; 传统 GloVe 词嵌入的加载\n",
" \n",
"&emsp; &emsp; 2.2 &ensp; PreTrainedTokenizer 的概念\n",
"\n",
"&emsp; &emsp; 2.3 &ensp; BertTokenizer 的基本使用\n",
"\n",
"&emsp; 3 &ensp; 实例:NG20 数据集的完整加载过程\n",
" \n",
"&emsp; &emsp; 3.1 &ensp; \n",
"\n",
"&emsp; &emsp; 3.2 &ensp; "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. fastNLP 中的 dataloader\n",
"\n",
"### 1.1 databundle 的结构与使用\n",
"\n",
"在`fastNLP 0.8`中,在常用的数据加载模块`DataLoader`和数据集`DataSet`模块之间,还存在\n",
"\n",
"&emsp; 一个中间模块,即 **数据包`DataBundle`模块**,可以从`fastNLP.io`路径中导入该模块\n",
"\n",
"在`fastNLP 0.8`中,**一个`databundle`数据包包含若干`dataset`数据集和`vocabulary`词汇表**\n",
"\n",
"&emsp; 分别存储在`datasets`和`vocabs`两个变量中,所以了解`databundle`数据包之前\n",
"\n",
"&emsp; 需要首先**复习`dataset`数据集和`vocabulary`词汇表**,**下面的一串代码**,**你知道其大概含义吗?**\n",
"\n",
"必要提示:`NG20`,全称[`News Group 20`](http://qwone.com/~jason/20Newsgroups/),是一个新闻文本分类数据集,包含20个大类以及若干小类\n",
"\n",
"&emsp; 数据集包含训练集`'ng20_train.csv'`和测试集`'ng20_test.csv'`两部分,每条数据\n",
"\n",
"&emsp; 包括`'label'`标签和`'text'`文本两个条目,通过`sample(frac=1)[:10]`随机采样并读取前十条"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
"</pre>\n"
],
"text/plain": [
"\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Processing: 0%| | 0/10 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Processing: 0%| | 0/10 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"+-------+------------------------------------------+\n",
"| label | text |\n",
"+-------+------------------------------------------+\n",
"| talk | ['mwilson', 'ncratl', 'atlantaga', 'n... |\n",
"| talk | ['ch981', 'cleveland', 'freenet', 'ed... |\n",
"| rec | ['mbeaving', 'bnr', 'ca', '\\\\(', 'bea... |\n",
"| soc | ['jayne', 'mmalt', 'guild', 'org', '\\... |\n",
"| talk | ['jrutledg', 'cs', 'ulowell', 'edu', ... |\n",
"| talk | ['cramer', 'optilink', 'com', '\\\\(', ... |\n",
"| comp | ['triton', 'unm', 'edu', '\\\\(', 'larr... |\n",
"| rec | ['ingres', 'com', '\\\\(', 'bruce', '\\\\... |\n",
"| comp | ['ldo', 'waikato', 'ac', 'nz', '\\\\(',... |\n",
"| misc | ['rebecca', 'rpi', 'edu', '\\\\(', 'ezr... |\n",
"+-------+------------------------------------------+\n",
"{'<pad>': 0, '<unk>': 1, 'rec': 2, 'talk': 3, 'comp': 4, 'soc': 5, 'misc': 6, 'sci': 7}\n"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"from fastNLP import DataSet\n",
"from fastNLP import Vocabulary\n",
"from fastNLP.io import DataBundle\n",
"\n",
"datasets = {}\n",
"datasets['train'] = DataSet.from_pandas(pd.read_csv('./data/ng20_train.csv').sample(frac=1)[:10])\n",
"datasets['train'].apply_more(lambda ins:{'label': ins['label'].lower().split('.')[0], \n",
" 'text': ins['text'].lower().split()},\n",
" progress_bar='tqdm')\n",
"datasets['test'] = DataSet.from_pandas(pd.read_csv('./data/ng20_test.csv').sample(frac=1)[:10])\n",
"datasets['test'].apply_more(lambda ins:{'label': ins['label'].lower().split('.')[0], \n",
" 'text': ins['text'].lower().split()},\n",
" progress_bar='tqdm')\n",
"print(datasets['train'])\n",
"\n",
"vocabs = {}\n",
"vocabs['label'] = Vocabulary().from_dataset(datasets['train'].concat(datasets['test'], inplace=False), field_name='label')\n",
"vocabs['text'] = Vocabulary().from_dataset(datasets['train'].concat(datasets['test'], inplace=False), field_name='text')\n",
"print(vocabs['label'].word2idx)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"数据集(比如:分开的训练集、验证集和测试集)以及各个field对应的vocabulary。\n",
" 该对象一般由fastNLP中各种Loader的load函数生成,可以通过以下的方法获取里面的内容"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"In total 2 datasets:\n",
"\ttrain has 10 instances.\n",
"\ttest has 10 instances.\n",
"In total 2 vocabs:\n",
"\tlabel has 8 entries.\n",
"\ttext has 1687 entries.\n",
"\n"
]
}
],
"source": [
"data_bundle = DataBundle(datasets=datasets, vocabs=vocabs)\n",
"print(data_bundle)"
]
},
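
A sketch of the usual accessors implied by "the following methods" (`get_dataset`/`get_vocab` are assumptions based on the fastNLP 0.8 `DataBundle` API; `iter_datasets` is used later in this notebook):

```python
from fastNLP import DataSet, Vocabulary
from fastNLP.io import DataBundle

datasets = {'train': DataSet({'label': ['rec', 'talk'], 'text': [['a'], ['b']]}),
            'test':  DataSet({'label': ['comp'], 'text': [['c']]})}
vocabs = {'label': Vocabulary().from_dataset(datasets['train'], field_name='label')}

data_bundle = DataBundle(datasets=datasets, vocabs=vocabs)
print(data_bundle)                       # summary like the one printed above
print(data_bundle.get_dataset('train'))  # one DataSet by name (assumed accessor)
print(data_bundle.get_vocab('label'))    # one Vocabulary by field (assumed accessor)
for name, ds in data_bundle.iter_datasets():
    print(name, len(ds))                 # iterate over all datasets
```
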
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1.2 dataloader 的结构与使用"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. fastNLP 中的 tokenizer\n",
"\n",
"### 2.1 传统 GloVe 词嵌入的加载"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.2 PreTrainTokenizer 的提出\n",
"\n",
"在`fastNLP 0.8`中,**使用`PreTrainedTokenizer`模块来为数据集中的词语进行词向量的标注**\n",
"\n",
"&emsp; 需要注意的是,`PreTrainedTokenizer`模块的下载和导入**需要确保环境安装了`transformers`模块**\n",
"\n",
"&emsp; 这是因为 `fastNLP 0.8`中`PreTrainedTokenizer`模块的实现基于`Huggingface Transformers`库\n",
"\n",
"**`Huggingface Transformers`是基于一个开源的**,**基于`transformer`模型结构提供的预训练语言库**\n",
"\n",
"&emsp; 包含了多种经典的基于`transformer`的预训练模型,如`BERT`、`BART`、`RoBERTa`、`GPT2`、`CPT`\n",
"\n",
"&emsp; 更多相关内容可以参考`Huggingface Transformers`的[相关论文](https://arxiv.org/pdf/1910.03771.pdf)、[官方文档](https://huggingface.co/transformers/)以及[的代码仓库](https://github.com/huggingface/transformers)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.3 BertTokenizer 的基本使用\n",
"\n",
"在`fastNLP 0.8`中,以`PreTrainedTokenizer`为基类,泛化出多个子类,实现基于`BERT`等模型的标注\n",
"\n",
"&emsp; 本节以`BertTokenizer`模块为例,展示`PreTrainedTokenizer`模块的使用方法与应用实例\n",
"\n",
"**`BertTokenizer`的初始化包括 导入模块和导入数据 两步**,先通过从`fastNLP.transformers.torch`中\n",
"\n",
"&emsp; 导入`BertTokenizer`模块,再通过`from_pretrained`方法指定`tokenizer`参数类型下载\n",
"\n",
"&emsp; 其中,**`'bert-base-uncased'`指定`tokenizer`使用的预训练`BERT`类型**:单词不区分大小写\n",
"\n",
"&emsp; &emsp; **模块层数`L=12`**,**隐藏层维度`H=768`**,**自注意力头数`A=12`**,**总参数量`110M`**\n",
"\n",
"&emsp; 另外,模型参数自动下载至 home 目录下的`~\\.cache\\huggingface\\transformers`文件夹中"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
"scrolled": false
},
"outputs": [],
"source": []
"source": [
"from fastNLP.transformers.torch import BertTokenizer\n",
"\n",
"tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"dir(tokenizer)"
]
},
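
What the loaded tokenizer does, as a short sketch (these are standard `BertTokenizer` calls from the transformers API; the exact ids depend on the downloaded vocabulary, and the `transformers` package must be installed):

```python
from fastNLP.transformers.torch import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokens = tokenizer.tokenize("This is an apple .")
print(tokens)                                   # ['this', 'is', 'an', 'apple', '.']
print(tokenizer.convert_tokens_to_ids(tokens))  # vocabulary ids for the tokens
print(tokenizer.pad_token_id)                   # pad id, used with set_pad below
```
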
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. 实例:NG20 数据集的完整加载过程\n",
"\n",
"### 3.1 使用 BertTokenizer 处理数据集\n",
"\n",
"在`fastNLP 0.8`中,**`Trainer`模块和`Evaluator`模块分别表示“训练器”和“评测器”**\n",
"\n",
"&emsp; 对应于之前的`fastNLP`版本中的`Trainer`模块和`Tester`模块,其定义方法如下所示\n",
"\n",
"在`fastNLP 0.8`中,需要注意,在同个`python`脚本中先使用`Trainer`训练,然后使用`Evaluator`评测\n",
"\n",
"&emsp; 非常关键的问题在于**如何正确设置二者的`driver`**。这就引入了另一个问题:什么是 `driver`?"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"from fastNLP import DataSet\n",
"from fastNLP import Vocabulary\n",
"\n",
"dataset = DataSet.from_pandas(pd.read_csv('./data/ng20_test.csv'))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from functools import partial\n",
"\n",
"encode = partial(tokenizer.encode_plus, max_length=100, truncation=True,\n",
" return_attention_mask=True)\n",
"# 会新增 input_ids 、 attention_mask 和 token_type_ids 这三个 field\n",
"dataset.apply_field_more(encode, field_name='text')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"target_vocab = Vocabulary(padding=None, unknown=None)\n",
"\n",
"target_vocab.from_dataset(*[ds for _, ds in data_bundle.iter_datasets()], field_name='label')\n",
"target_vocab.index_dataset(*[ds for _, ds in data_bundle.iter_datasets()], field_name='label',\n",
" new_field_name='labels')\n",
"# 需要将 input_ids 的 pad 值设置为 tokenizer 的 pad 值\n",
"dataset.set_pad('input_ids', pad_val=tokenizer.pad_token_id)\n",
"dataset.set_ignore('label', 'text') # 因为 label 是原始的不需要的 str ,所以我们可以忽略它,让它不要在 batch 的输出中出现"
]
}
],
"metadata": {


tutorials/fastnlp_tutorial_3.ipynb (+59, -0)

@@ -0,0 +1,59 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "213d538c",
"metadata": {},
"source": [
"# T3. \n",
"\n",
"&emsp; 1 &ensp; \n",
" \n",
"&emsp; &emsp; 1.1 &ensp; \n",
"\n",
"&emsp; &emsp; 1.2 &ensp; \n",
"\n",
"&emsp; 2 &ensp; \n",
"\n",
"&emsp; &emsp; 2.1 &ensp; \n",
"\n",
"&emsp; &emsp; 2.2 &ensp; \n",
"\n",
"&emsp; 3 &ensp; \n",
" \n",
"&emsp; &emsp; 3.1 &ensp; \n",
"\n",
"&emsp; &emsp; 3.2 &ensp; "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b369137f",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

BIN
tutorials/figures/T0-fig-parameter-matching.png
Width: 1265 | Height: 736 | Size: 96 kB (same before and after)

BIN
tutorials/figures/T0-fig-training-structure.png
Width: 1160 | Height: 732 | Size: 80 kB (same before and after)
