|
|
@@ -0,0 +1,292 @@ |
|
|
|
{ |
|
|
|
"cells": [ |
|
|
|
{ |
|
|
|
"cell_type": "markdown", |
|
|
|
"metadata": {}, |
|
|
|
"source": [ |
|
|
|
"# fastNLP中的DataSet" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 1, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"name": "stdout", |
|
|
|
"output_type": "stream", |
|
|
|
"text": [ |
|
|
|
"+------------------------------+---------------------------------------------+---------+\n", |
|
|
|
"| raw_words | words | seq_len |\n", |
|
|
|
"+------------------------------+---------------------------------------------+---------+\n", |
|
|
|
"| This is the first instance . | ['this', 'is', 'the', 'first', 'instance... | 6 |\n", |
|
|
|
"| Second instance . | ['Second', 'instance', '.'] | 3 |\n", |
|
|
|
"| Third instance . | ['Third', 'instance', '.'] | 3 |\n", |
|
|
|
"+------------------------------+---------------------------------------------+---------+\n" |
|
|
|
] |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"from fastNLP import DataSet\n", |
|
|
|
"data = {'raw_words':[\"This is the first instance .\", \"Second instance .\", \"Third instance .\"],\n", |
|
|
|
" 'words': [['this', 'is', 'the', 'first', 'instance', '.'], ['Second', 'instance', '.'], ['Third', 'instance', '.']],\n", |
|
|
|
" 'seq_len': [6, 3, 3]}\n", |
|
|
|
"dataset = DataSet(data)\n", |
|
|
|
"# 传入的dict的每个key的value应该为具有相同长度的list\n", |
|
|
|
"print(dataset)" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "markdown", |
|
|
|
"metadata": {}, |
|
|
|
"source": [ |
|
|
|
"## DataSet的构建" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 2, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"data": { |
|
|
|
"text/plain": [ |
|
|
|
"+----------------------------+---------------------------------------------+---------+\n", |
|
|
|
"| raw_words | words | seq_len |\n", |
|
|
|
"+----------------------------+---------------------------------------------+---------+\n", |
|
|
|
"| This is the first instance | ['this', 'is', 'the', 'first', 'instance... | 6 |\n", |
|
|
|
"+----------------------------+---------------------------------------------+---------+" |
|
|
|
] |
|
|
|
}, |
|
|
|
"execution_count": 2, |
|
|
|
"metadata": {}, |
|
|
|
"output_type": "execute_result" |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"from fastNLP import DataSet\n", |
|
|
|
"from fastNLP import Instance\n", |
|
|
|
"dataset = DataSet()\n", |
|
|
|
"instance = Instance(raw_words=\"This is the first instance\",\n", |
|
|
|
" words=['this', 'is', 'the', 'first', 'instance', '.'],\n", |
|
|
|
" seq_len=6)\n", |
|
|
|
"dataset.append(instance)\n", |
|
|
|
"dataset" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 3, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"data": { |
|
|
|
"text/plain": [ |
|
|
|
"+----------------------------+---------------------------------------------+---------+\n", |
|
|
|
"| raw_words | words | seq_len |\n", |
|
|
|
"+----------------------------+---------------------------------------------+---------+\n", |
|
|
|
"| This is the first instance | ['this', 'is', 'the', 'first', 'instance... | 6 |\n", |
|
|
|
"| Second instance . | ['Second', 'instance', '.'] | 3 |\n", |
|
|
|
"+----------------------------+---------------------------------------------+---------+" |
|
|
|
] |
|
|
|
}, |
|
|
|
"execution_count": 3, |
|
|
|
"metadata": {}, |
|
|
|
"output_type": "execute_result" |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"from fastNLP import DataSet\n", |
|
|
|
"from fastNLP import Instance\n", |
|
|
|
"dataset = DataSet([\n", |
|
|
|
" Instance(raw_words=\"This is the first instance\",\n", |
|
|
|
" words=['this', 'is', 'the', 'first', 'instance', '.'],\n", |
|
|
|
" seq_len=6),\n", |
|
|
|
" Instance(raw_words=\"Second instance .\",\n", |
|
|
|
" words=['Second', 'instance', '.'],\n", |
|
|
|
" seq_len=3)\n", |
|
|
|
" ])\n", |
|
|
|
"dataset" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "markdown", |
|
|
|
"metadata": {}, |
|
|
|
"source": [ |
|
|
|
"## DataSet的删除" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 4, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"data": { |
|
|
|
"text/plain": [ |
|
|
|
"+----+---+\n", |
|
|
|
"| a | c |\n", |
|
|
|
"+----+---+\n", |
|
|
|
"| -5 | 0 |\n", |
|
|
|
"| -4 | 0 |\n", |
|
|
|
"| -3 | 0 |\n", |
|
|
|
"| -2 | 0 |\n", |
|
|
|
"| -1 | 0 |\n", |
|
|
|
"| 0 | 0 |\n", |
|
|
|
"| 1 | 0 |\n", |
|
|
|
"| 2 | 0 |\n", |
|
|
|
"| 3 | 0 |\n", |
|
|
|
"| 4 | 0 |\n", |
|
|
|
"+----+---+" |
|
|
|
] |
|
|
|
}, |
|
|
|
"execution_count": 4, |
|
|
|
"metadata": {}, |
|
|
|
"output_type": "execute_result" |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"from fastNLP import DataSet\n", |
|
|
|
"dataset = DataSet({'a': range(-5, 5), 'c': [0]*10})\n", |
|
|
|
"dataset" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 5, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"data": { |
|
|
|
"text/plain": [ |
|
|
|
"+---+\n", |
|
|
|
"| c |\n", |
|
|
|
"+---+\n", |
|
|
|
"| 0 |\n", |
|
|
|
"| 0 |\n", |
|
|
|
"| 0 |\n", |
|
|
|
"| 0 |\n", |
|
|
|
"+---+" |
|
|
|
] |
|
|
|
}, |
|
|
|
"execution_count": 5, |
|
|
|
"metadata": {}, |
|
|
|
"output_type": "execute_result" |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"# 不改变dataset,生成一个删除了满足条件的instance的新 DataSet\n", |
|
|
|
"dropped_dataset = dataset.drop(lambda ins:ins['a']<0, inplace=False)\n", |
|
|
|
"# 在dataset中删除满足条件的instance\n", |
|
|
|
"dataset.drop(lambda ins:ins['a']<0)\n", |
|
|
|
"# 删除第3个instance\n", |
|
|
|
"dataset.delete_instance(2)\n", |
|
|
|
"# 删除名为'a'的field\n", |
|
|
|
"dataset.delete_field('a')\n", |
|
|
|
"dataset" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "markdown", |
|
|
|
"metadata": {}, |
|
|
|
"source": [ |
|
|
|
"## 简单的数据预处理" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 6, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"name": "stdout", |
|
|
|
"output_type": "stream", |
|
|
|
"text": [ |
|
|
|
"False\n" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"data": { |
|
|
|
"text/plain": [ |
|
|
|
"4" |
|
|
|
] |
|
|
|
}, |
|
|
|
"execution_count": 6, |
|
|
|
"metadata": {}, |
|
|
|
"output_type": "execute_result" |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"# 检查是否存在名为'a'的field\n", |
|
|
|
"print(dataset.has_field('a')) # 或 ('a' in dataset)\n", |
|
|
|
"# 将名为'a'的field改名为'b'\n", |
|
|
|
"dataset.rename_field('c', 'b')\n", |
|
|
|
"# DataSet的长度\n", |
|
|
|
"len(dataset)" |
|
|
|
] |
|
|
|
}, |
|
|
|
{ |
|
|
|
"cell_type": "code", |
|
|
|
"execution_count": 7, |
|
|
|
"metadata": {}, |
|
|
|
"outputs": [ |
|
|
|
{ |
|
|
|
"data": { |
|
|
|
"text/plain": [ |
|
|
|
"+------------------------------+-------------------------------------------------+\n", |
|
|
|
"| raw_words | words |\n", |
|
|
|
"+------------------------------+-------------------------------------------------+\n", |
|
|
|
"| This is the first instance . | ['This', 'is', 'the', 'first', 'instance', '.'] |\n", |
|
|
|
"| Second instance . | ['Second', 'instance', '.'] |\n", |
|
|
|
"| Third instance . | ['Third', 'instance', '.'] |\n", |
|
|
|
"+------------------------------+-------------------------------------------------+" |
|
|
|
] |
|
|
|
}, |
|
|
|
"execution_count": 7, |
|
|
|
"metadata": {}, |
|
|
|
"output_type": "execute_result" |
|
|
|
} |
|
|
|
], |
|
|
|
"source": [ |
|
|
|
"from fastNLP import DataSet\n", |
|
|
|
"data = {'raw_words':[\"This is the first instance .\", \"Second instance .\", \"Third instance .\"]}\n", |
|
|
|
"dataset = DataSet(data)\n", |
|
|
|
"\n", |
|
|
|
"# 将句子分成单词形式, 详见DataSet.apply()方法\n", |
|
|
|
"dataset.apply(lambda ins: ins['raw_words'].split(), new_field_name='words')\n", |
|
|
|
"\n", |
|
|
|
"# 或使用DataSet.apply_field()\n", |
|
|
|
"dataset.apply_field(lambda sent:sent.split(), field_name='raw_words', new_field_name='words')\n", |
|
|
|
"\n", |
|
|
|
"# 除了匿名函数,也可以定义函数传递进去\n", |
|
|
|
"def get_words(instance):\n", |
|
|
|
" sentence = instance['raw_words']\n", |
|
|
|
" words = sentence.split()\n", |
|
|
|
" return words\n", |
|
|
|
"dataset.apply(get_words, new_field_name='words')\n", |
|
|
|
"dataset" |
|
|
|
] |
|
|
|
} |
|
|
|
], |
|
|
|
"metadata": { |
|
|
|
"kernelspec": { |
|
|
|
"display_name": "Python Now", |
|
|
|
"language": "python", |
|
|
|
"name": "now" |
|
|
|
}, |
|
|
|
"language_info": { |
|
|
|
"codemirror_mode": { |
|
|
|
"name": "ipython", |
|
|
|
"version": 3 |
|
|
|
}, |
|
|
|
"file_extension": ".py", |
|
|
|
"mimetype": "text/x-python", |
|
|
|
"name": "python", |
|
|
|
"nbconvert_exporter": "python", |
|
|
|
"pygments_lexer": "ipython3", |
|
|
|
"version": "3.8.0" |
|
|
|
} |
|
|
|
}, |
|
|
|
"nbformat": 4, |
|
|
|
"nbformat_minor": 2 |
|
|
|
} |