|
@@ -80,7 +80,7 @@ |
|
|
} |
|
|
} |
|
|
], |
|
|
], |
|
|
"source": [ |
|
|
"source": [ |
|
|
"from fastNLP.core.dataset import DataSet\n", |
|
|
|
|
|
|
|
|
"from fastNLP import DataSet\n", |
|
|
"\n", |
|
|
"\n", |
|
|
"data = {'idx': [0, 1, 2], \n", |
|
|
"data = {'idx': [0, 1, 2], \n", |
|
|
" 'sentence':[\"This is an apple .\", \"I like apples .\", \"Apples are good for our health .\"],\n", |
|
|
" 'sentence':[\"This is an apple .\", \"I like apples .\", \"Apples are good for our health .\"],\n", |
|
@@ -153,7 +153,7 @@ |
|
|
"name": "stdout", |
|
|
"name": "stdout", |
|
|
"output_type": "stream", |
|
|
"output_type": "stream", |
|
|
"text": [ |
|
|
"text": [ |
|
|
"1608199516936 1607874531400\n", |
|
|
|
|
|
|
|
|
"1630555358408 1630228349768\n", |
|
|
"+-----+------------------------+------------------------+-----+\n", |
|
|
"+-----+------------------------+------------------------+-----+\n", |
|
|
"| idx | sentence | words | num |\n", |
|
|
"| idx | sentence | words | num |\n", |
|
|
"+-----+------------------------+------------------------+-----+\n", |
|
|
"+-----+------------------------+------------------------+-----+\n", |
|
@@ -198,7 +198,7 @@ |
|
|
"name": "stdout", |
|
|
"name": "stdout", |
|
|
"output_type": "stream", |
|
|
"output_type": "stream", |
|
|
"text": [ |
|
|
"text": [ |
|
|
"1607874531400 1607874531400\n", |
|
|
|
|
|
|
|
|
"1630228349768 1630228349768\n", |
|
|
"+-----+------------------------+------------------------+-----+\n", |
|
|
"+-----+------------------------+------------------------+-----+\n", |
|
|
"| idx | sentence | words | num |\n", |
|
|
"| idx | sentence | words | num |\n", |
|
|
"+-----+------------------------+------------------------+-----+\n", |
|
|
"+-----+------------------------+------------------------+-----+\n", |
|
@@ -302,6 +302,7 @@ |
|
|
"\n", |
|
|
"\n", |
|
|
"  **`apply`和`apply_field`返回的是个列表**,**`apply_more`和`apply_field_more`返回的是个字典**\n", |
|
|
"  **`apply`和`apply_field`返回的是个列表**,**`apply_more`和`apply_field_more`返回的是个字典**\n", |
|
|
"\n", |
|
|
"\n", |
|
|
|
|
|
"    预处理过程中,通过`progress_bar`参数设置显示进度条类型,通过`num_proc`设置多进程\n", |
|
|
"***\n", |
|
|
"***\n", |
|
|
"\n", |
|
|
"\n", |
|
|
"`apply`的参数包括一个函数`func`和一个新字段名`new_field_name`,函数`func`的处理对象是`dataset`模块中\n", |
|
|
"`apply`的参数包括一个函数`func`和一个新字段名`new_field_name`,函数`func`的处理对象是`dataset`模块中\n", |
|
@@ -311,30 +312,45 @@ |
|
|
}, |
|
|
}, |
|
|
{ |
|
|
{ |
|
|
"cell_type": "code", |
|
|
"cell_type": "code", |
|
|
"execution_count": null, |
|
|
|
|
|
|
|
|
"execution_count": 7, |
|
|
"id": "72a0b5f9", |
|
|
"id": "72a0b5f9", |
|
|
"metadata": {}, |
|
|
"metadata": {}, |
|
|
"outputs": [ |
|
|
"outputs": [ |
|
|
{ |
|
|
{ |
|
|
"data": { |
|
|
"data": { |
|
|
"application/vnd.jupyter.widget-view+json": { |
|
|
"application/vnd.jupyter.widget-view+json": { |
|
|
"model_id": "8532c5609a394c19b60315663a6f0f4a", |
|
|
|
|
|
|
|
|
"model_id": "", |
|
|
"version_major": 2, |
|
|
"version_major": 2, |
|
|
"version_minor": 0 |
|
|
"version_minor": 0 |
|
|
}, |
|
|
}, |
|
|
"text/plain": [ |
|
|
"text/plain": [ |
|
|
"Output()" |
|
|
|
|
|
|
|
|
"Processing: 0%| | 0/3 [00:00<?, ?it/s]" |
|
|
] |
|
|
] |
|
|
}, |
|
|
}, |
|
|
"metadata": {}, |
|
|
"metadata": {}, |
|
|
"output_type": "display_data" |
|
|
"output_type": "display_data" |
|
|
|
|
|
}, |
|
|
|
|
|
{ |
|
|
|
|
|
"name": "stdout", |
|
|
|
|
|
"output_type": "stream", |
|
|
|
|
|
"text": [ |
|
|
|
|
|
"+-----+------------------------------+------------------------------+\n", |
|
|
|
|
|
"| idx | sentence | words |\n", |
|
|
|
|
|
"+-----+------------------------------+------------------------------+\n", |
|
|
|
|
|
"| 0 | This is an apple . | ['This', 'is', 'an', 'app... |\n", |
|
|
|
|
|
"| 1 | I like apples . | ['I', 'like', 'apples', '... |\n", |
|
|
|
|
|
"| 2 | Apples are good for our h... | ['Apples', 'are', 'good',... |\n", |
|
|
|
|
|
"+-----+------------------------------+------------------------------+\n" |
|
|
|
|
|
] |
|
|
} |
|
|
} |
|
|
], |
|
|
], |
|
|
"source": [ |
|
|
"source": [ |
|
|
|
|
|
"from fastNLP import DataSet\n", |
|
|
|
|
|
"\n", |
|
|
"data = {'idx': [0, 1, 2], \n", |
|
|
"data = {'idx': [0, 1, 2], \n", |
|
|
" 'sentence':[\"This is an apple .\", \"I like apples .\", \"Apples are good for our health .\"], }\n", |
|
|
" 'sentence':[\"This is an apple .\", \"I like apples .\", \"Apples are good for our health .\"], }\n", |
|
|
"dataset = DataSet(data)\n", |
|
|
"dataset = DataSet(data)\n", |
|
|
"dataset.apply(lambda ins: ins['sentence'].split(), new_field_name='words')\n", |
|
|
|
|
|
|
|
|
"dataset.apply(lambda ins: ins['sentence'].split(), new_field_name='words', progress_bar=\"tqdm\") #\n", |
|
|
"print(dataset)" |
|
|
"print(dataset)" |
|
|
] |
|
|
] |
|
|
}, |
|
|
}, |
|
@@ -348,10 +364,38 @@ |
|
|
}, |
|
|
}, |
|
|
{ |
|
|
{ |
|
|
"cell_type": "code", |
|
|
"cell_type": "code", |
|
|
"execution_count": null, |
|
|
|
|
|
|
|
|
"execution_count": 8, |
|
|
"id": "b1a8631f", |
|
|
"id": "b1a8631f", |
|
|
"metadata": {}, |
|
|
"metadata": {}, |
|
|
"outputs": [], |
|
|
|
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"data": { |
|
|
|
|
|
"application/vnd.jupyter.widget-view+json": { |
|
|
|
|
|
"model_id": "", |
|
|
|
|
|
"version_major": 2, |
|
|
|
|
|
"version_minor": 0 |
|
|
|
|
|
}, |
|
|
|
|
|
"text/plain": [ |
|
|
|
|
|
"Processing: 0%| | 0/3 [00:00<?, ?it/s]" |
|
|
|
|
|
] |
|
|
|
|
|
}, |
|
|
|
|
|
"metadata": {}, |
|
|
|
|
|
"output_type": "display_data" |
|
|
|
|
|
}, |
|
|
|
|
|
{ |
|
|
|
|
|
"name": "stdout", |
|
|
|
|
|
"output_type": "stream", |
|
|
|
|
|
"text": [ |
|
|
|
|
|
"+-----+------------------------------+------------------------------+\n", |
|
|
|
|
|
"| idx | sentence | words |\n", |
|
|
|
|
|
"+-----+------------------------------+------------------------------+\n", |
|
|
|
|
|
"| 0 | This is an apple . | ['This', 'is', 'an', 'app... |\n", |
|
|
|
|
|
"| 1 | I like apples . | ['I', 'like', 'apples', '... |\n", |
|
|
|
|
|
"| 2 | Apples are good for our h... | ['Apples', 'are', 'good',... |\n", |
|
|
|
|
|
"+-----+------------------------------+------------------------------+\n" |
|
|
|
|
|
] |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
"source": [ |
|
|
"source": [ |
|
|
"dataset = DataSet(data)\n", |
|
|
"dataset = DataSet(data)\n", |
|
|
"\n", |
|
|
"\n", |
|
@@ -360,7 +404,7 @@ |
|
|
" words = sentence.split()\n", |
|
|
" words = sentence.split()\n", |
|
|
" return words\n", |
|
|
" return words\n", |
|
|
"\n", |
|
|
"\n", |
|
|
"dataset.apply(get_words, new_field_name='words')\n", |
|
|
|
|
|
|
|
|
"dataset.apply(get_words, new_field_name='words', progress_bar=\"tqdm\")\n", |
|
|
"print(dataset)" |
|
|
"print(dataset)" |
|
|
] |
|
|
] |
|
|
}, |
|
|
}, |
|
@@ -376,13 +420,42 @@ |
|
|
}, |
|
|
}, |
|
|
{ |
|
|
{ |
|
|
"cell_type": "code", |
|
|
"cell_type": "code", |
|
|
"execution_count": null, |
|
|
|
|
|
|
|
|
"execution_count": 9, |
|
|
"id": "057c1d2c", |
|
|
"id": "057c1d2c", |
|
|
"metadata": {}, |
|
|
"metadata": {}, |
|
|
"outputs": [], |
|
|
|
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"data": { |
|
|
|
|
|
"application/vnd.jupyter.widget-view+json": { |
|
|
|
|
|
"model_id": "", |
|
|
|
|
|
"version_major": 2, |
|
|
|
|
|
"version_minor": 0 |
|
|
|
|
|
}, |
|
|
|
|
|
"text/plain": [ |
|
|
|
|
|
"Processing: 0%| | 0/3 [00:00<?, ?it/s]" |
|
|
|
|
|
] |
|
|
|
|
|
}, |
|
|
|
|
|
"metadata": {}, |
|
|
|
|
|
"output_type": "display_data" |
|
|
|
|
|
}, |
|
|
|
|
|
{ |
|
|
|
|
|
"name": "stdout", |
|
|
|
|
|
"output_type": "stream", |
|
|
|
|
|
"text": [ |
|
|
|
|
|
"+-----+------------------------------+------------------------------+\n", |
|
|
|
|
|
"| idx | sentence | words |\n", |
|
|
|
|
|
"+-----+------------------------------+------------------------------+\n", |
|
|
|
|
|
"| 0 | This is an apple . | ['This', 'is', 'an', 'app... |\n", |
|
|
|
|
|
"| 1 | I like apples . | ['I', 'like', 'apples', '... |\n", |
|
|
|
|
|
"| 2 | Apples are good for our h... | ['Apples', 'are', 'good',... |\n", |
|
|
|
|
|
"+-----+------------------------------+------------------------------+\n" |
|
|
|
|
|
] |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
"source": [ |
|
|
"source": [ |
|
|
"dataset = DataSet(data)\n", |
|
|
"dataset = DataSet(data)\n", |
|
|
"dataset.apply_field(lambda sent:sent.split(), field_name='sentence', new_field_name='words')\n", |
|
|
|
|
|
|
|
|
"dataset.apply_field(lambda sent:sent.split(), field_name='sentence', new_field_name='words', \n", |
|
|
|
|
|
" progress_bar=\"tqdm\")\n", |
|
|
"print(dataset)" |
|
|
"print(dataset)" |
|
|
] |
|
|
] |
|
|
}, |
|
|
}, |
|
@@ -398,13 +471,42 @@ |
|
|
}, |
|
|
}, |
|
|
{ |
|
|
{ |
|
|
"cell_type": "code", |
|
|
"cell_type": "code", |
|
|
"execution_count": null, |
|
|
|
|
|
|
|
|
"execution_count": 10, |
|
|
"id": "51e2f02c", |
|
|
"id": "51e2f02c", |
|
|
"metadata": {}, |
|
|
"metadata": {}, |
|
|
"outputs": [], |
|
|
|
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"data": { |
|
|
|
|
|
"application/vnd.jupyter.widget-view+json": { |
|
|
|
|
|
"model_id": "", |
|
|
|
|
|
"version_major": 2, |
|
|
|
|
|
"version_minor": 0 |
|
|
|
|
|
}, |
|
|
|
|
|
"text/plain": [ |
|
|
|
|
|
"Processing: 0%| | 0/3 [00:00<?, ?it/s]" |
|
|
|
|
|
] |
|
|
|
|
|
}, |
|
|
|
|
|
"metadata": {}, |
|
|
|
|
|
"output_type": "display_data" |
|
|
|
|
|
}, |
|
|
|
|
|
{ |
|
|
|
|
|
"name": "stdout", |
|
|
|
|
|
"output_type": "stream", |
|
|
|
|
|
"text": [ |
|
|
|
|
|
"+-----+------------------------+------------------------+-----+\n", |
|
|
|
|
|
"| idx | sentence | words | num |\n", |
|
|
|
|
|
"+-----+------------------------+------------------------+-----+\n", |
|
|
|
|
|
"| 0 | This is an apple . | ['This', 'is', 'an'... | 5 |\n", |
|
|
|
|
|
"| 1 | I like apples . | ['I', 'like', 'appl... | 4 |\n", |
|
|
|
|
|
"| 2 | Apples are good for... | ['Apples', 'are', '... | 7 |\n", |
|
|
|
|
|
"+-----+------------------------+------------------------+-----+\n" |
|
|
|
|
|
] |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
"source": [ |
|
|
"source": [ |
|
|
"dataset = DataSet(data)\n", |
|
|
"dataset = DataSet(data)\n", |
|
|
"dataset.apply_more(lambda ins:{'words': ins['sentence'].split(), 'num': len(ins['sentence'].split())})\n", |
|
|
|
|
|
|
|
|
"dataset.apply_more(lambda ins:{'words': ins['sentence'].split(), 'num': len(ins['sentence'].split())}, \n", |
|
|
|
|
|
" progress_bar=\"tqdm\")\n", |
|
|
"print(dataset)" |
|
|
"print(dataset)" |
|
|
] |
|
|
] |
|
|
}, |
|
|
}, |
|
@@ -420,14 +522,42 @@ |
|
|
}, |
|
|
}, |
|
|
{ |
|
|
{ |
|
|
"cell_type": "code", |
|
|
"cell_type": "code", |
|
|
"execution_count": null, |
|
|
|
|
|
|
|
|
"execution_count": 11, |
|
|
"id": "db4295d5", |
|
|
"id": "db4295d5", |
|
|
"metadata": {}, |
|
|
"metadata": {}, |
|
|
"outputs": [], |
|
|
|
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"data": { |
|
|
|
|
|
"application/vnd.jupyter.widget-view+json": { |
|
|
|
|
|
"model_id": "", |
|
|
|
|
|
"version_major": 2, |
|
|
|
|
|
"version_minor": 0 |
|
|
|
|
|
}, |
|
|
|
|
|
"text/plain": [ |
|
|
|
|
|
"Processing: 0%| | 0/3 [00:00<?, ?it/s]" |
|
|
|
|
|
] |
|
|
|
|
|
}, |
|
|
|
|
|
"metadata": {}, |
|
|
|
|
|
"output_type": "display_data" |
|
|
|
|
|
}, |
|
|
|
|
|
{ |
|
|
|
|
|
"name": "stdout", |
|
|
|
|
|
"output_type": "stream", |
|
|
|
|
|
"text": [ |
|
|
|
|
|
"+-----+------------------------+------------------------+-----+\n", |
|
|
|
|
|
"| idx | sentence | words | num |\n", |
|
|
|
|
|
"+-----+------------------------+------------------------+-----+\n", |
|
|
|
|
|
"| 0 | This is an apple . | ['This', 'is', 'an'... | 5 |\n", |
|
|
|
|
|
"| 1 | I like apples . | ['I', 'like', 'appl... | 4 |\n", |
|
|
|
|
|
"| 2 | Apples are good for... | ['Apples', 'are', '... | 7 |\n", |
|
|
|
|
|
"+-----+------------------------+------------------------+-----+\n" |
|
|
|
|
|
] |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
"source": [ |
|
|
"source": [ |
|
|
"dataset = DataSet(data)\n", |
|
|
"dataset = DataSet(data)\n", |
|
|
"dataset.apply_field_more(lambda sent:{'words': sent.split(), 'num': len(sent.split())}, \n", |
|
|
"dataset.apply_field_more(lambda sent:{'words': sent.split(), 'num': len(sent.split())}, \n", |
|
|
" field_name='sentence')\n", |
|
|
|
|
|
|
|
|
" field_name='sentence', progress_bar=\"tqdm\")\n", |
|
|
"print(dataset)" |
|
|
"print(dataset)" |
|
|
] |
|
|
] |
|
|
}, |
|
|
}, |
|
@@ -445,13 +575,13 @@ |
|
|
}, |
|
|
}, |
|
|
{ |
|
|
{ |
|
|
"cell_type": "code", |
|
|
"cell_type": "code", |
|
|
"execution_count": null, |
|
|
|
|
|
|
|
|
"execution_count": 12, |
|
|
"id": "012f537c", |
|
|
"id": "012f537c", |
|
|
"metadata": {}, |
|
|
"metadata": {}, |
|
|
"outputs": [], |
|
|
"outputs": [], |
|
|
"source": [ |
|
|
"source": [ |
|
|
"from fastNLP.core.dataset import DataSet\n", |
|
|
|
|
|
"from fastNLP.core.dataset import Instance\n", |
|
|
|
|
|
|
|
|
"from fastNLP import DataSet\n", |
|
|
|
|
|
"from fastNLP import Instance\n", |
|
|
"\n", |
|
|
"\n", |
|
|
"dataset = DataSet([\n", |
|
|
"dataset = DataSet([\n", |
|
|
" Instance(sentence=\"This is an apple .\",\n", |
|
|
" Instance(sentence=\"This is an apple .\",\n", |
|
@@ -476,10 +606,20 @@ |
|
|
}, |
|
|
}, |
|
|
{ |
|
|
{ |
|
|
"cell_type": "code", |
|
|
"cell_type": "code", |
|
|
"execution_count": null, |
|
|
|
|
|
|
|
|
"execution_count": 13, |
|
|
"id": "a4c1c10d", |
|
|
"id": "a4c1c10d", |
|
|
"metadata": {}, |
|
|
"metadata": {}, |
|
|
"outputs": [], |
|
|
|
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"name": "stdout", |
|
|
|
|
|
"output_type": "stream", |
|
|
|
|
|
"text": [ |
|
|
|
|
|
"dict_items([('sentence', 'This is an apple .'), ('words', ['This', 'is', 'an', 'apple', '.']), ('num', 5)])\n", |
|
|
|
|
|
"dict_keys(['sentence', 'words', 'num'])\n", |
|
|
|
|
|
"dict_values(['This is an apple .', ['This', 'is', 'an', 'apple', '.'], 5])\n" |
|
|
|
|
|
] |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
"source": [ |
|
|
"source": [ |
|
|
"ins = Instance(sentence=\"This is an apple .\", words=['This', 'is', 'an', 'apple', '.'], num=5)\n", |
|
|
"ins = Instance(sentence=\"This is an apple .\", words=['This', 'is', 'an', 'apple', '.'], num=5)\n", |
|
|
"\n", |
|
|
"\n", |
|
@@ -498,10 +638,22 @@ |
|
|
}, |
|
|
}, |
|
|
{ |
|
|
{ |
|
|
"cell_type": "code", |
|
|
"cell_type": "code", |
|
|
"execution_count": null, |
|
|
|
|
|
|
|
|
"execution_count": 14, |
|
|
"id": "55376402", |
|
|
"id": "55376402", |
|
|
"metadata": {}, |
|
|
"metadata": {}, |
|
|
"outputs": [], |
|
|
|
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"name": "stdout", |
|
|
|
|
|
"output_type": "stream", |
|
|
|
|
|
"text": [ |
|
|
|
|
|
"+--------------------+------------------------+-----+-----+\n", |
|
|
|
|
|
"| sentence | words | num | idx |\n", |
|
|
|
|
|
"+--------------------+------------------------+-----+-----+\n", |
|
|
|
|
|
"| This is an apple . | ['This', 'is', 'an'... | 5 | 0 |\n", |
|
|
|
|
|
"+--------------------+------------------------+-----+-----+\n" |
|
|
|
|
|
] |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
"source": [ |
|
|
"source": [ |
|
|
"ins.add_field(field_name='idx', field=0)\n", |
|
|
"ins.add_field(field_name='idx', field=0)\n", |
|
|
"print(ins)" |
|
|
"print(ins)" |
|
@@ -521,20 +673,44 @@ |
|
|
}, |
|
|
}, |
|
|
{ |
|
|
{ |
|
|
"cell_type": "code", |
|
|
"cell_type": "code", |
|
|
"execution_count": null, |
|
|
|
|
|
|
|
|
"execution_count": 15, |
|
|
"id": "fe15f4c1", |
|
|
"id": "fe15f4c1", |
|
|
"metadata": {}, |
|
|
"metadata": {}, |
|
|
"outputs": [], |
|
|
|
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"data": { |
|
|
|
|
|
"text/plain": [ |
|
|
|
|
|
"{'sentence': <fastNLP.core.dataset.field.FieldArray at 0x17ba4b3a648>,\n", |
|
|
|
|
|
" 'words': <fastNLP.core.dataset.field.FieldArray at 0x17ba4b3a6c8>,\n", |
|
|
|
|
|
" 'num': <fastNLP.core.dataset.field.FieldArray at 0x17ba4b3a748>}" |
|
|
|
|
|
] |
|
|
|
|
|
}, |
|
|
|
|
|
"execution_count": 15, |
|
|
|
|
|
"metadata": {}, |
|
|
|
|
|
"output_type": "execute_result" |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
"source": [ |
|
|
"source": [ |
|
|
"dataset.get_all_fields()" |
|
|
"dataset.get_all_fields()" |
|
|
] |
|
|
] |
|
|
}, |
|
|
}, |
|
|
{ |
|
|
{ |
|
|
"cell_type": "code", |
|
|
"cell_type": "code", |
|
|
"execution_count": null, |
|
|
|
|
|
|
|
|
"execution_count": 16, |
|
|
"id": "5433815c", |
|
|
"id": "5433815c", |
|
|
"metadata": {}, |
|
|
"metadata": {}, |
|
|
"outputs": [], |
|
|
|
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"data": { |
|
|
|
|
|
"text/plain": [ |
|
|
|
|
|
"['num', 'sentence', 'words']" |
|
|
|
|
|
] |
|
|
|
|
|
}, |
|
|
|
|
|
"execution_count": 16, |
|
|
|
|
|
"metadata": {}, |
|
|
|
|
|
"output_type": "execute_result" |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
"source": [ |
|
|
"source": [ |
|
|
"dataset.get_field_names()" |
|
|
"dataset.get_field_names()" |
|
|
] |
|
|
] |
|
@@ -553,10 +729,29 @@ |
|
|
}, |
|
|
}, |
|
|
{ |
|
|
{ |
|
|
"cell_type": "code", |
|
|
"cell_type": "code", |
|
|
"execution_count": null, |
|
|
|
|
|
|
|
|
"execution_count": 17, |
|
|
"id": "25ce5488", |
|
|
"id": "25ce5488", |
|
|
"metadata": {}, |
|
|
"metadata": {}, |
|
|
"outputs": [], |
|
|
|
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"name": "stdout", |
|
|
|
|
|
"output_type": "stream", |
|
|
|
|
|
"text": [ |
|
|
|
|
|
"3 False\n", |
|
|
|
|
|
"6 True\n", |
|
|
|
|
|
"+------------------------------+------------------------------+--------+\n", |
|
|
|
|
|
"| sentence | words | length |\n", |
|
|
|
|
|
"+------------------------------+------------------------------+--------+\n", |
|
|
|
|
|
"| This is an apple . | ['This', 'is', 'an', 'app... | 5 |\n", |
|
|
|
|
|
"| I like apples . | ['I', 'like', 'apples', '... | 4 |\n", |
|
|
|
|
|
"| Apples are good for our h... | ['Apples', 'are', 'good',... | 7 |\n", |
|
|
|
|
|
"| This is an apple . | ['This', 'is', 'an', 'app... | 5 |\n", |
|
|
|
|
|
"| I like apples . | ['I', 'like', 'apples', '... | 4 |\n", |
|
|
|
|
|
"| Apples are good for our h... | ['Apples', 'are', 'good',... | 7 |\n", |
|
|
|
|
|
"+------------------------------+------------------------------+--------+\n" |
|
|
|
|
|
] |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
"source": [ |
|
|
"source": [ |
|
|
"print(len(dataset), dataset.has_field('length')) \n", |
|
|
"print(len(dataset), dataset.has_field('length')) \n", |
|
|
"if 'num' in dataset:\n", |
|
|
"if 'num' in dataset:\n", |
|
@@ -588,12 +783,23 @@ |
|
|
}, |
|
|
}, |
|
|
{ |
|
|
{ |
|
|
"cell_type": "code", |
|
|
"cell_type": "code", |
|
|
"execution_count": null, |
|
|
|
|
|
|
|
|
"execution_count": 18, |
|
|
"id": "3515e096", |
|
|
"id": "3515e096", |
|
|
"metadata": {}, |
|
|
"metadata": {}, |
|
|
"outputs": [], |
|
|
|
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"name": "stdout", |
|
|
|
|
|
"output_type": "stream", |
|
|
|
|
|
"text": [ |
|
|
|
|
|
"Vocabulary([]...)\n", |
|
|
|
|
|
"{'<pad>': 0, '<unk>': 1}\n", |
|
|
|
|
|
"<pad> 0\n", |
|
|
|
|
|
"<unk> 1\n" |
|
|
|
|
|
] |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
"source": [ |
|
|
"source": [ |
|
|
"from fastNLP.core.vocabulary import Vocabulary\n", |
|
|
|
|
|
|
|
|
"from fastNLP import Vocabulary\n", |
|
|
"\n", |
|
|
"\n", |
|
|
"vocab = Vocabulary()\n", |
|
|
"vocab = Vocabulary()\n", |
|
|
"print(vocab)\n", |
|
|
"print(vocab)\n", |
|
@@ -614,10 +820,20 @@ |
|
|
}, |
|
|
}, |
|
|
{ |
|
|
{ |
|
|
"cell_type": "code", |
|
|
"cell_type": "code", |
|
|
"execution_count": null, |
|
|
|
|
|
|
|
|
"execution_count": 19, |
|
|
"id": "88c7472a", |
|
|
"id": "88c7472a", |
|
|
"metadata": {}, |
|
|
"metadata": {}, |
|
|
"outputs": [], |
|
|
|
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"name": "stdout", |
|
|
|
|
|
"output_type": "stream", |
|
|
|
|
|
"text": [ |
|
|
|
|
|
"5 Counter({'生活': 1, '就像': 1, '海洋': 1})\n", |
|
|
|
|
|
"6 Counter({'生活': 1, '就像': 1, '海洋': 1, '只有': 1})\n", |
|
|
|
|
|
"6 {'<pad>': 0, '<unk>': 1, '生活': 2, '就像': 3, '海洋': 4, '只有': 5}\n" |
|
|
|
|
|
] |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
"source": [ |
|
|
"source": [ |
|
|
"vocab.add_word_lst(['生活', '就像', '海洋'])\n", |
|
|
"vocab.add_word_lst(['生活', '就像', '海洋'])\n", |
|
|
"print(len(vocab), vocab.word_count)\n", |
|
|
"print(len(vocab), vocab.word_count)\n", |
|
@@ -640,10 +856,21 @@ |
|
|
}, |
|
|
}, |
|
|
{ |
|
|
{ |
|
|
"cell_type": "code", |
|
|
"cell_type": "code", |
|
|
"execution_count": null, |
|
|
|
|
|
|
|
|
"execution_count": 20, |
|
|
"id": "3447acde", |
|
|
"id": "3447acde", |
|
|
"metadata": {}, |
|
|
"metadata": {}, |
|
|
"outputs": [], |
|
|
|
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"name": "stdout", |
|
|
|
|
|
"output_type": "stream", |
|
|
|
|
|
"text": [ |
|
|
|
|
|
"<pad> 0\n", |
|
|
|
|
|
"<unk> 1\n", |
|
|
|
|
|
"生活 2\n", |
|
|
|
|
|
"彼岸 1 False\n" |
|
|
|
|
|
] |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
"source": [ |
|
|
"source": [ |
|
|
"print(vocab.to_word(0), vocab.to_index('<pad>'))\n", |
|
|
"print(vocab.to_word(0), vocab.to_index('<pad>'))\n", |
|
|
"print(vocab.to_word(1), vocab.to_index('<unk>'))\n", |
|
|
"print(vocab.to_word(1), vocab.to_index('<unk>'))\n", |
|
@@ -665,10 +892,21 @@ |
|
|
}, |
|
|
}, |
|
|
{ |
|
|
{ |
|
|
"cell_type": "code", |
|
|
"cell_type": "code", |
|
|
"execution_count": null, |
|
|
|
|
|
|
|
|
"execution_count": 21, |
|
|
"id": "490b101c", |
|
|
"id": "490b101c", |
|
|
"metadata": {}, |
|
|
"metadata": {}, |
|
|
"outputs": [], |
|
|
|
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"name": "stdout", |
|
|
|
|
|
"output_type": "stream", |
|
|
|
|
|
"text": [ |
|
|
|
|
|
"生活 2\n", |
|
|
|
|
|
"彼岸 12 True\n", |
|
|
|
|
|
"13 Counter({'人': 4, '生活': 2, '就像': 2, '海洋': 2, '只有': 2, '意志': 1, '坚强的': 1, '才': 1, '能': 1, '到达': 1, '彼岸': 1})\n", |
|
|
|
|
|
"13 {'<pad>': 0, '<unk>': 1, '生活': 2, '就像': 3, '海洋': 4, '只有': 5, '人': 6, '意志': 7, '坚强的': 8, '才': 9, '能': 10, '到达': 11, '彼岸': 12}\n" |
|
|
|
|
|
] |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
"source": [ |
|
|
"source": [ |
|
|
"vocab.add_word_lst(['生活', '就像', '海洋', '只有', '意志', '坚强的', '人', '人', '人', '人', '才', '能', '到达', '彼岸'])\n", |
|
|
"vocab.add_word_lst(['生活', '就像', '海洋', '只有', '意志', '坚强的', '人', '人', '人', '人', '才', '能', '到达', '彼岸'])\n", |
|
|
"print(vocab.to_word(2), vocab.to_index('生活'))\n", |
|
|
"print(vocab.to_word(2), vocab.to_index('生活'))\n", |
|
@@ -691,10 +929,19 @@ |
|
|
}, |
|
|
}, |
|
|
{ |
|
|
{ |
|
|
"cell_type": "code", |
|
|
"cell_type": "code", |
|
|
"execution_count": null, |
|
|
|
|
|
|
|
|
"execution_count": 22, |
|
|
"id": "a99ff909", |
|
|
"id": "a99ff909", |
|
|
"metadata": {}, |
|
|
"metadata": {}, |
|
|
"outputs": [], |
|
|
|
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"name": "stdout", |
|
|
|
|
|
"output_type": "stream", |
|
|
|
|
|
"text": [ |
|
|
|
|
|
"{'positive': 0, 'negative': 1}\n", |
|
|
|
|
|
"ValueError: word `neutral` not in vocabulary\n" |
|
|
|
|
|
] |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
"source": [ |
|
|
"source": [ |
|
|
"vocab = Vocabulary(unknown=None, padding=None)\n", |
|
|
"vocab = Vocabulary(unknown=None, padding=None)\n", |
|
|
"\n", |
|
|
"\n", |
|
@@ -717,10 +964,19 @@ |
|
|
}, |
|
|
}, |
|
|
{ |
|
|
{ |
|
|
"cell_type": "code", |
|
|
"cell_type": "code", |
|
|
"execution_count": null, |
|
|
|
|
|
|
|
|
"execution_count": 23, |
|
|
"id": "432f74c1", |
|
|
"id": "432f74c1", |
|
|
"metadata": {}, |
|
|
"metadata": {}, |
|
|
"outputs": [], |
|
|
|
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"name": "stdout", |
|
|
|
|
|
"output_type": "stream", |
|
|
|
|
|
"text": [ |
|
|
|
|
|
"{'<unk>': 0, 'positive': 1, 'negative': 2}\n", |
|
|
|
|
|
"0 <unk>\n" |
|
|
|
|
|
] |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
"source": [ |
|
|
"source": [ |
|
|
"vocab = Vocabulary(unknown='<unk>', padding=None)\n", |
|
|
"vocab = Vocabulary(unknown='<unk>', padding=None)\n", |
|
|
"\n", |
|
|
"\n", |
|
@@ -746,10 +1002,92 @@ |
|
|
}, |
|
|
}, |
|
|
{ |
|
|
{ |
|
|
"cell_type": "code", |
|
|
"cell_type": "code", |
|
|
"execution_count": null, |
|
|
|
|
|
|
|
|
"execution_count": 24, |
|
|
"id": "3dbd985d", |
|
|
"id": "3dbd985d", |
|
|
"metadata": {}, |
|
|
"metadata": {}, |
|
|
"outputs": [], |
|
|
|
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"data": { |
|
|
|
|
|
"text/html": [ |
|
|
|
|
|
"<div>\n", |
|
|
|
|
|
"<style scoped>\n", |
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n", |
|
|
|
|
|
" vertical-align: middle;\n", |
|
|
|
|
|
" }\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
" .dataframe tbody tr th {\n", |
|
|
|
|
|
" vertical-align: top;\n", |
|
|
|
|
|
" }\n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
" .dataframe thead th {\n", |
|
|
|
|
|
" text-align: right;\n", |
|
|
|
|
|
" }\n", |
|
|
|
|
|
"</style>\n", |
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n", |
|
|
|
|
|
" <thead>\n", |
|
|
|
|
|
" <tr style=\"text-align: right;\">\n", |
|
|
|
|
|
" <th></th>\n", |
|
|
|
|
|
" <th>SentenceId</th>\n", |
|
|
|
|
|
" <th>Sentence</th>\n", |
|
|
|
|
|
" <th>Sentiment</th>\n", |
|
|
|
|
|
" </tr>\n", |
|
|
|
|
|
" </thead>\n", |
|
|
|
|
|
" <tbody>\n", |
|
|
|
|
|
" <tr>\n", |
|
|
|
|
|
" <th>0</th>\n", |
|
|
|
|
|
" <td>1</td>\n", |
|
|
|
|
|
" <td>A series of escapades demonstrating the adage ...</td>\n", |
|
|
|
|
|
" <td>negative</td>\n", |
|
|
|
|
|
" </tr>\n", |
|
|
|
|
|
" <tr>\n", |
|
|
|
|
|
" <th>1</th>\n", |
|
|
|
|
|
" <td>2</td>\n", |
|
|
|
|
|
" <td>This quiet , introspective and entertaining in...</td>\n", |
|
|
|
|
|
" <td>positive</td>\n", |
|
|
|
|
|
" </tr>\n", |
|
|
|
|
|
" <tr>\n", |
|
|
|
|
|
" <th>2</th>\n", |
|
|
|
|
|
" <td>3</td>\n", |
|
|
|
|
|
" <td>Even fans of Ismail Merchant 's work , I suspe...</td>\n", |
|
|
|
|
|
" <td>negative</td>\n", |
|
|
|
|
|
" </tr>\n", |
|
|
|
|
|
" <tr>\n", |
|
|
|
|
|
" <th>3</th>\n", |
|
|
|
|
|
" <td>4</td>\n", |
|
|
|
|
|
" <td>A positively thrilling combination of ethnogra...</td>\n", |
|
|
|
|
|
" <td>neutral</td>\n", |
|
|
|
|
|
" </tr>\n", |
|
|
|
|
|
" <tr>\n", |
|
|
|
|
|
" <th>4</th>\n", |
|
|
|
|
|
" <td>5</td>\n", |
|
|
|
|
|
" <td>A comedy-drama of nearly epic proportions root...</td>\n", |
|
|
|
|
|
" <td>positive</td>\n", |
|
|
|
|
|
" </tr>\n", |
|
|
|
|
|
" <tr>\n", |
|
|
|
|
|
" <th>5</th>\n", |
|
|
|
|
|
" <td>6</td>\n", |
|
|
|
|
|
" <td>The Importance of Being Earnest , so thick wit...</td>\n", |
|
|
|
|
|
" <td>neutral</td>\n", |
|
|
|
|
|
" </tr>\n", |
|
|
|
|
|
" </tbody>\n", |
|
|
|
|
|
"</table>\n", |
|
|
|
|
|
"</div>" |
|
|
|
|
|
], |
|
|
|
|
|
"text/plain": [ |
|
|
|
|
|
" SentenceId Sentence Sentiment\n", |
|
|
|
|
|
"0 1 A series of escapades demonstrating the adage ... negative\n", |
|
|
|
|
|
"1 2 This quiet , introspective and entertaining in... positive\n", |
|
|
|
|
|
"2 3 Even fans of Ismail Merchant 's work , I suspe... negative\n", |
|
|
|
|
|
"3 4 A positively thrilling combination of ethnogra... neutral\n", |
|
|
|
|
|
"4 5 A comedy-drama of nearly epic proportions root... positive\n", |
|
|
|
|
|
"5 6 The Importance of Being Earnest , so thick wit... neutral" |
|
|
|
|
|
] |
|
|
|
|
|
}, |
|
|
|
|
|
"execution_count": 24, |
|
|
|
|
|
"metadata": {}, |
|
|
|
|
|
"output_type": "execute_result" |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
"source": [ |
|
|
"source": [ |
|
|
"import pandas as pd\n", |
|
|
"import pandas as pd\n", |
|
|
"\n", |
|
|
"\n", |
|
@@ -767,17 +1105,49 @@ |
|
|
}, |
|
|
}, |
|
|
{ |
|
|
{ |
|
|
"cell_type": "code", |
|
|
"cell_type": "code", |
|
|
"execution_count": null, |
|
|
|
|
|
|
|
|
"execution_count": 25, |
|
|
"id": "4f634586", |
|
|
"id": "4f634586", |
|
|
"metadata": {}, |
|
|
"metadata": {}, |
|
|
"outputs": [], |
|
|
|
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"data": { |
|
|
|
|
|
"application/vnd.jupyter.widget-view+json": { |
|
|
|
|
|
"model_id": "", |
|
|
|
|
|
"version_major": 2, |
|
|
|
|
|
"version_minor": 0 |
|
|
|
|
|
}, |
|
|
|
|
|
"text/plain": [ |
|
|
|
|
|
"Processing: 0%| | 0/6 [00:00<?, ?it/s]" |
|
|
|
|
|
] |
|
|
|
|
|
}, |
|
|
|
|
|
"metadata": {}, |
|
|
|
|
|
"output_type": "display_data" |
|
|
|
|
|
}, |
|
|
|
|
|
{ |
|
|
|
|
|
"name": "stdout", |
|
|
|
|
|
"output_type": "stream", |
|
|
|
|
|
"text": [ |
|
|
|
|
|
"+------------+------------------------------+-----------+\n", |
|
|
|
|
|
"| SentenceId | Sentence | Sentiment |\n", |
|
|
|
|
|
"+------------+------------------------------+-----------+\n", |
|
|
|
|
|
"| 1 | ['a', 'series', 'of', 'es... | negative |\n", |
|
|
|
|
|
"| 2 | ['this', 'quiet', ',', 'i... | positive |\n", |
|
|
|
|
|
"| 3 | ['even', 'fans', 'of', 'i... | negative |\n", |
|
|
|
|
|
"| 4 | ['a', 'positively', 'thri... | neutral |\n", |
|
|
|
|
|
"| 5 | ['a', 'comedy-drama', 'of... | positive |\n", |
|
|
|
|
|
"| 6 | ['the', 'importance', 'of... | neutral |\n", |
|
|
|
|
|
"+------------+------------------------------+-----------+\n" |
|
|
|
|
|
] |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
"source": [ |
|
|
"source": [ |
|
|
"from fastNLP.core.dataset import DataSet\n", |
|
|
|
|
|
|
|
|
"from fastNLP import DataSet\n", |
|
|
"\n", |
|
|
"\n", |
|
|
"dataset = DataSet()\n", |
|
|
"dataset = DataSet()\n", |
|
|
"dataset = dataset.from_pandas(df)\n", |
|
|
"dataset = dataset.from_pandas(df)\n", |
|
|
"dataset.apply_more(lambda ins:{'SentenceId': ins['SentenceId'], \n", |
|
|
"dataset.apply_more(lambda ins:{'SentenceId': ins['SentenceId'], \n", |
|
|
" 'Sentence': ins['Sentence'].lower().split(), 'Sentiment': ins['Sentiment']})\n", |
|
|
|
|
|
|
|
|
" 'Sentence': ins['Sentence'].lower().split(), 'Sentiment': ins['Sentiment']}, \n", |
|
|
|
|
|
" progress_bar=\"tqdm\")\n", |
|
|
"print(dataset)" |
|
|
"print(dataset)" |
|
|
] |
|
|
] |
|
|
}, |
|
|
}, |
|
@@ -791,7 +1161,7 @@ |
|
|
}, |
|
|
}, |
|
|
{ |
|
|
{ |
|
|
"cell_type": "code", |
|
|
"cell_type": "code", |
|
|
"execution_count": null, |
|
|
|
|
|
|
|
|
"execution_count": 26, |
|
|
"id": "46722efc", |
|
|
"id": "46722efc", |
|
|
"metadata": {}, |
|
|
"metadata": {}, |
|
|
"outputs": [], |
|
|
"outputs": [], |
|
@@ -815,12 +1185,24 @@ |
|
|
}, |
|
|
}, |
|
|
{ |
|
|
{ |
|
|
"cell_type": "code", |
|
|
"cell_type": "code", |
|
|
"execution_count": null, |
|
|
|
|
|
|
|
|
"execution_count": 27, |
|
|
"id": "a2de615b", |
|
|
"id": "a2de615b", |
|
|
"metadata": {}, |
|
|
"metadata": {}, |
|
|
"outputs": [], |
|
|
|
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"name": "stdout", |
|
|
|
|
|
"output_type": "stream", |
|
|
|
|
|
"text": [ |
|
|
|
|
|
"Counter({'a': 9, 'of': 9, ',': 7, 'the': 6, '.': 5, 'is': 3, 'and': 3, 'good': 2, 'for': 2, 'which': 2, 'this': 2, \"'s\": 2, 'series': 1, 'escapades': 1, 'demonstrating': 1, 'adage': 1, 'that': 1, 'what': 1, 'goose': 1, 'also': 1, 'gander': 1, 'some': 1, 'occasionally': 1, 'amuses': 1, 'but': 1, 'none': 1, 'amounts': 1, 'to': 1, 'much': 1, 'story': 1, 'quiet': 1, 'introspective': 1, 'entertaining': 1, 'independent': 1, 'worth': 1, 'seeking': 1, 'even': 1, 'fans': 1, 'ismail': 1, 'merchant': 1, 'work': 1, 'i': 1, 'suspect': 1, 'would': 1, 'have': 1, 'hard': 1, 'time': 1, 'sitting': 1, 'through': 1, 'one': 1, 'positively': 1, 'thrilling': 1, 'combination': 1, 'ethnography': 1, 'all': 1, 'intrigue': 1, 'betrayal': 1, 'deceit': 1, 'murder': 1, 'shakespearean': 1, 'tragedy': 1, 'or': 1, 'juicy': 1, 'soap': 1, 'opera': 1, 'comedy-drama': 1, 'nearly': 1, 'epic': 1, 'proportions': 1, 'rooted': 1, 'in': 1, 'sincere': 1, 'performance': 1, 'by': 1, 'title': 1, 'character': 1, 'undergoing': 1, 'midlife': 1, 'crisis': 1, 'importance': 1, 'being': 1, 'earnest': 1, 'so': 1, 'thick': 1, 'with': 1, 'wit': 1, 'it': 1, 'plays': 1, 'like': 1, 'reading': 1, 'from': 1, 'bartlett': 1, 'familiar': 1, 'quotations': 1}) \n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"{'<pad>': 0, '<unk>': 1, 'a': 2, 'of': 3, ',': 4, 'the': 5, '.': 6, 'is': 7, 'and': 8, 'good': 9, 'for': 10, 'which': 11, 'this': 12, \"'s\": 13, 'series': 14, 'escapades': 15, 'demonstrating': 16, 'adage': 17, 'that': 18, 'what': 19, 'goose': 20, 'also': 21, 'gander': 22, 'some': 23, 'occasionally': 24, 'amuses': 25, 'but': 26, 'none': 27, 'amounts': 28, 'to': 29, 'much': 30, 'story': 31, 'quiet': 32, 'introspective': 33, 'entertaining': 34, 'independent': 35, 'worth': 36, 'seeking': 37, 'even': 38, 'fans': 39, 'ismail': 40, 'merchant': 41, 'work': 42, 'i': 43, 'suspect': 44, 'would': 45, 'have': 46, 'hard': 47, 'time': 48, 'sitting': 49, 'through': 50, 'one': 51, 'positively': 52, 'thrilling': 53, 'combination': 54, 'ethnography': 55, 'all': 56, 'intrigue': 57, 'betrayal': 58, 'deceit': 59, 'murder': 60, 'shakespearean': 61, 'tragedy': 62, 'or': 63, 'juicy': 64, 'soap': 65, 'opera': 66, 'comedy-drama': 67, 'nearly': 68, 'epic': 69, 'proportions': 70, 'rooted': 71, 'in': 72, 'sincere': 73, 'performance': 74, 'by': 75, 'title': 76, 'character': 77, 'undergoing': 78, 'midlife': 79, 'crisis': 80, 'importance': 81, 'being': 82, 'earnest': 83, 'so': 84, 'thick': 85, 'with': 86, 'wit': 87, 'it': 88, 'plays': 89, 'like': 90, 'reading': 91, 'from': 92, 'bartlett': 93, 'familiar': 94, 'quotations': 95} \n", |
|
|
|
|
|
"\n", |
|
|
|
|
|
"Vocabulary(['a', 'series', 'of', 'escapades', 'demonstrating']...)\n" |
|
|
|
|
|
] |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
"source": [ |
|
|
"source": [ |
|
|
"from fastNLP.core.vocabulary import Vocabulary\n", |
|
|
|
|
|
|
|
|
"from fastNLP import Vocabulary\n", |
|
|
"\n", |
|
|
"\n", |
|
|
"vocab = Vocabulary()\n", |
|
|
"vocab = Vocabulary()\n", |
|
|
"vocab = vocab.from_dataset(dataset, field_name='Sentence')\n", |
|
|
"vocab = vocab.from_dataset(dataset, field_name='Sentence')\n", |
|
@@ -841,10 +1223,27 @@ |
|
|
}, |
|
|
}, |
|
|
{ |
|
|
{ |
|
|
"cell_type": "code", |
|
|
"cell_type": "code", |
|
|
"execution_count": null, |
|
|
|
|
|
|
|
|
"execution_count": 28, |
|
|
"id": "2f9a04b2", |
|
|
"id": "2f9a04b2", |
|
|
"metadata": {}, |
|
|
"metadata": {}, |
|
|
"outputs": [], |
|
|
|
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"name": "stdout", |
|
|
|
|
|
"output_type": "stream", |
|
|
|
|
|
"text": [ |
|
|
|
|
|
"+------------+------------------------------+-----------+\n", |
|
|
|
|
|
"| SentenceId | Sentence | Sentiment |\n", |
|
|
|
|
|
"+------------+------------------------------+-----------+\n", |
|
|
|
|
|
"| 1 | [2, 14, 3, 15, 16, 5, 17,... | negative |\n", |
|
|
|
|
|
"| 2 | [12, 32, 4, 33, 8, 34, 35... | positive |\n", |
|
|
|
|
|
"| 3 | [38, 39, 3, 40, 41, 13, 4... | negative |\n", |
|
|
|
|
|
"| 4 | [2, 52, 53, 54, 3, 55, 8,... | neutral |\n", |
|
|
|
|
|
"| 5 | [2, 67, 3, 68, 69, 70, 71... | positive |\n", |
|
|
|
|
|
"| 6 | [5, 81, 3, 82, 83, 4, 84,... | neutral |\n", |
|
|
|
|
|
"+------------+------------------------------+-----------+\n" |
|
|
|
|
|
] |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
"source": [ |
|
|
"source": [ |
|
|
"vocab.index_dataset(dataset, field_name='Sentence')\n", |
|
|
"vocab.index_dataset(dataset, field_name='Sentence')\n", |
|
|
"print(dataset)" |
|
|
"print(dataset)" |
|
@@ -860,10 +1259,28 @@ |
|
|
}, |
|
|
}, |
|
|
{ |
|
|
{ |
|
|
"cell_type": "code", |
|
|
"cell_type": "code", |
|
|
"execution_count": null, |
|
|
|
|
|
|
|
|
"execution_count": 29, |
|
|
"id": "5f5eed18", |
|
|
"id": "5f5eed18", |
|
|
"metadata": {}, |
|
|
"metadata": {}, |
|
|
"outputs": [], |
|
|
|
|
|
|
|
|
"outputs": [ |
|
|
|
|
|
{ |
|
|
|
|
|
"name": "stdout", |
|
|
|
|
|
"output_type": "stream", |
|
|
|
|
|
"text": [ |
|
|
|
|
|
"{'negative': 0, 'positive': 1, 'neutral': 2}\n", |
|
|
|
|
|
"+------------+------------------------------+-----------+\n", |
|
|
|
|
|
"| SentenceId | Sentence | Sentiment |\n", |
|
|
|
|
|
"+------------+------------------------------+-----------+\n", |
|
|
|
|
|
"| 1 | [2, 14, 3, 15, 16, 5, 17,... | 0 |\n", |
|
|
|
|
|
"| 2 | [12, 32, 4, 33, 8, 34, 35... | 1 |\n", |
|
|
|
|
|
"| 3 | [38, 39, 3, 40, 41, 13, 4... | 0 |\n", |
|
|
|
|
|
"| 4 | [2, 52, 53, 54, 3, 55, 8,... | 2 |\n", |
|
|
|
|
|
"| 5 | [2, 67, 3, 68, 69, 70, 71... | 1 |\n", |
|
|
|
|
|
"| 6 | [5, 81, 3, 82, 83, 4, 84,... | 2 |\n", |
|
|
|
|
|
"+------------+------------------------------+-----------+\n" |
|
|
|
|
|
] |
|
|
|
|
|
} |
|
|
|
|
|
], |
|
|
"source": [ |
|
|
"source": [ |
|
|
"target_vocab = Vocabulary(padding=None, unknown=None)\n", |
|
|
"target_vocab = Vocabulary(padding=None, unknown=None)\n", |
|
|
"\n", |
|
|
"\n", |
|
|