From 2d1500651fb812a89d4aa7e385e7d3bc82a42b72 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Wed, 26 Feb 2020 16:39:00 +0800 Subject: [PATCH] =?UTF-8?q?=E4=B8=89=E4=B8=AA=E7=BA=AC=E5=BA=A6=20check:?= =?UTF-8?q?=20DataSet=20Tutorial?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cn_cls_example.png | Bin .../tutorials/tutorial_1_data_preprocess.rst | 21 +- .../序列标注.rst | 0 .../文本分类.rst | 0 docs/source/user/quickstart.rst | 4 +- test/test_tutorials.py | 75 +++++ tutorials/tutorial_1_data_preprocess.ipynb | 292 ++++++++++++++++++ 7 files changed, 384 insertions(+), 8 deletions(-) rename docs/source/{quickstart => tutorials}/cn_cls_example.png (100%) rename docs/source/{quickstart => tutorials}/序列标注.rst (100%) rename docs/source/{quickstart => tutorials}/文本分类.rst (100%) create mode 100644 tutorials/tutorial_1_data_preprocess.ipynb diff --git a/docs/source/quickstart/cn_cls_example.png b/docs/source/tutorials/cn_cls_example.png similarity index 100% rename from docs/source/quickstart/cn_cls_example.png rename to docs/source/tutorials/cn_cls_example.png diff --git a/docs/source/tutorials/tutorial_1_data_preprocess.rst b/docs/source/tutorials/tutorial_1_data_preprocess.rst index 005f23f1..448cb62e 100644 --- a/docs/source/tutorials/tutorial_1_data_preprocess.rst +++ b/docs/source/tutorials/tutorial_1_data_preprocess.rst @@ -16,7 +16,7 @@ fastNLP中的DataSet 每一行是一个instance (在fastNLP中被称为 :mod:`~fastNLP.core.Instance` ), 每一列是一个field (在fastNLP中称为 :mod:`~fastNLP.core.FieldArray` )。 -DataSet构建和删除 +DataSet的构建 ----------------------------- 我们使用传入字典的方式构建一个数据集,这是 :class:`~fastNLP.DataSet` 初始化的最基础的方式 @@ -77,16 +77,25 @@ DataSet构建和删除 for instance in dataset: # do something +DataSet的删除 +----------------------------- + FastNLP 同样提供了多种删除数据的方法 :func:`~fastNLP.DataSet.drop` 、 :func:`~fastNLP.DataSet.delete_instance` 和 :func:`~fastNLP.DataSet.delete_field` +我们先用下面的代码生成一个只有两列的样例数据集,第一列的值分别为 -5 ~ 4,第二列的值均为 0. .. 
code-block:: python from fastNLP import DataSet - dataset = DataSet({'a': list(range(-5, 5))}) - # 返回满足条件的instance,并放入DataSet中 + dataset = DataSet({'a': range(-5, 5), 'c': [0]*10}) + +然后我们使用三种方法进行删除,删除后的数据集仅包含名为 c 的一列,包含4个值为0 的数据。 + +.. code-block:: python + + # 不改变dataset,生成一个删除了满足条件的instance的新 DataSet dropped_dataset = dataset.drop(lambda ins:ins['a']<0, inplace=False) # 在dataset中删除满足条件的instance - dataset.drop(lambda ins:ins['a']<0) # dataset的instance数量减少 + dataset.drop(lambda ins:ins['a']<0) # 删除第3个instance dataset.delete_instance(2) # 删除名为'a'的field @@ -103,8 +112,8 @@ FastNLP 同样提供了多种删除数据的方法 :func:`~fastNLP.DataSet.drop` # 检查是否存在名为'a'的field dataset.has_field('a') # 或 ('a' in dataset) - # 将名为'a'的field改名为'b' - dataset.rename_field('a', 'b') + # 将名为'c'的field改名为'b' + dataset.rename_field('c', 'b') # DataSet的长度 len(dataset) diff --git a/docs/source/quickstart/序列标注.rst b/docs/source/tutorials/序列标注.rst similarity index 100% rename from docs/source/quickstart/序列标注.rst rename to docs/source/tutorials/序列标注.rst diff --git a/docs/source/quickstart/文本分类.rst b/docs/source/tutorials/文本分类.rst similarity index 100% rename from docs/source/quickstart/文本分类.rst rename to docs/source/tutorials/文本分类.rst diff --git a/docs/source/user/quickstart.rst b/docs/source/user/quickstart.rst index 28b76c4d..40039af6 100644 --- a/docs/source/user/quickstart.rst +++ b/docs/source/user/quickstart.rst @@ -7,8 +7,8 @@ .. 
toctree:: :maxdepth: 1 - /quickstart/文本分类 - /quickstart/序列标注 + /tutorials/文本分类 + /tutorials/序列标注 这些教程是简单地介绍了 fastNLP 的使用流程,其中文本分类相对简单,序列标注则较为复杂。更多的教程分析见 :doc:`/user/tutorials` diff --git a/test/test_tutorials.py b/test/test_tutorials.py index 3ec0e381..aa7c4a60 100644 --- a/test/test_tutorials.py +++ b/test/test_tutorials.py @@ -7,7 +7,82 @@ from fastNLP.core.losses import CrossEntropyLoss from fastNLP.core.metrics import AccuracyMetric from fastNLP.io.loader import CSVLoader + class TestTutorial(unittest.TestCase): + def test_tutorial_1_data_preprocess(self): + from fastNLP import DataSet + data = {'raw_words': ["This is the first instance .", "Second instance .", "Third instance ."], + 'words': [['this', 'is', 'the', 'first', 'instance', '.'], ['Second', 'instance', '.'], + ['Third', 'instance', '.']], + 'seq_len': [6, 3, 3]} + dataset = DataSet(data) + # 传入的dict的每个key的value应该为具有相同长度的list + + from fastNLP import DataSet + from fastNLP import Instance + dataset = DataSet() + instance = Instance(raw_words="This is the first instance", + words=['this', 'is', 'the', 'first', 'instance', '.'], + seq_len=6) + dataset.append(instance) + + from fastNLP import DataSet + from fastNLP import Instance + dataset = DataSet([ + Instance(raw_words="This is the first instance", + words=['this', 'is', 'the', 'first', 'instance', '.'], + seq_len=6), + Instance(raw_words="Second instance .", + words=['Second', 'instance', '.'], + seq_len=3) + ]) + + from fastNLP import DataSet + dataset = DataSet({'a': range(-5, 5), 'c': [0] * 10}) + + # 不改变dataset,生成一个删除了满足条件的instance的新 DataSet + dropped_dataset = dataset.drop(lambda ins: ins['a'] < 0, inplace=False) + # 在dataset中删除满足条件的instance + dataset.drop(lambda ins: ins['a'] < 0) + # 删除第3个instance + dataset.delete_instance(2) + # 删除名为'a'的field + dataset.delete_field('a') + + # 检查是否存在名为'a'的field + print(dataset.has_field('a')) # 或 ('a' in dataset) + # 将名为'c'的field改名为'b' + dataset.rename_field('c', 'b') + # DataSet的长度 + len(dataset) + + from 
fastNLP import DataSet + data = {'raw_words': ["This is the first instance .", "Second instance .", "Third instance ."]} + dataset = DataSet(data) + + # 将句子分成单词形式, 详见DataSet.apply()方法 + dataset.apply(lambda ins: ins['raw_words'].split(), new_field_name='words') + + # 或使用DataSet.apply_field() + dataset.apply_field(lambda sent: sent.split(), field_name='raw_words', new_field_name='words') + + # 除了匿名函数,也可以定义函数传递进去 + def get_words(instance): + sentence = instance['raw_words'] + words = sentence.split() + return words + + dataset.apply(get_words, new_field_name='words') + + def setUp(self): + import os + self._init_wd = os.path.abspath(os.curdir) + + def tearDown(self): + import os + os.chdir(self._init_wd) + +class TestOldTutorial(unittest.TestCase): def test_fastnlp_10min_tutorial(self): # 从csv读取数据到DataSet sample_path = "test/data_for_tests/tutorial_sample_dataset.csv" diff --git a/tutorials/tutorial_1_data_preprocess.ipynb b/tutorials/tutorial_1_data_preprocess.ipynb new file mode 100644 index 00000000..a987e7f2 --- /dev/null +++ b/tutorials/tutorial_1_data_preprocess.ipynb @@ -0,0 +1,292 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# fastNLP中的DataSet" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+------------------------------+---------------------------------------------+---------+\n", + "| raw_words | words | seq_len |\n", + "+------------------------------+---------------------------------------------+---------+\n", + "| This is the first instance . | ['this', 'is', 'the', 'first', 'instance... | 6 |\n", + "| Second instance . | ['Second', 'instance', '.'] | 3 |\n", + "| Third instance . 
| ['Third', 'instance', '.'] | 3 |\n", + "+------------------------------+---------------------------------------------+---------+\n" + ] + } + ], + "source": [ + "from fastNLP import DataSet\n", + "data = {'raw_words':[\"This is the first instance .\", \"Second instance .\", \"Third instance .\"],\n", + " 'words': [['this', 'is', 'the', 'first', 'instance', '.'], ['Second', 'instance', '.'], ['Third', 'instance', '.']],\n", + " 'seq_len': [6, 3, 3]}\n", + "dataset = DataSet(data)\n", + "# 传入的dict的每个key的value应该为具有相同长度的list\n", + "print(dataset)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DataSet的构建" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "+----------------------------+---------------------------------------------+---------+\n", + "| raw_words | words | seq_len |\n", + "+----------------------------+---------------------------------------------+---------+\n", + "| This is the first instance | ['this', 'is', 'the', 'first', 'instance... 
| 6 |\n", + "+----------------------------+---------------------------------------------+---------+" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from fastNLP import DataSet\n", + "from fastNLP import Instance\n", + "dataset = DataSet()\n", + "instance = Instance(raw_words=\"This is the first instance\",\n", + " words=['this', 'is', 'the', 'first', 'instance', '.'],\n", + " seq_len=6)\n", + "dataset.append(instance)\n", + "dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "+----------------------------+---------------------------------------------+---------+\n", + "| raw_words | words | seq_len |\n", + "+----------------------------+---------------------------------------------+---------+\n", + "| This is the first instance | ['this', 'is', 'the', 'first', 'instance... | 6 |\n", + "| Second instance . | ['Second', 'instance', '.'] | 3 |\n", + "+----------------------------+---------------------------------------------+---------+" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from fastNLP import DataSet\n", + "from fastNLP import Instance\n", + "dataset = DataSet([\n", + " Instance(raw_words=\"This is the first instance\",\n", + " words=['this', 'is', 'the', 'first', 'instance', '.'],\n", + " seq_len=6),\n", + " Instance(raw_words=\"Second instance .\",\n", + " words=['Second', 'instance', '.'],\n", + " seq_len=3)\n", + " ])\n", + "dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DataSet的删除" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "+----+---+\n", + "| a | c |\n", + "+----+---+\n", + "| -5 | 0 |\n", + "| -4 | 0 |\n", + "| -3 | 0 |\n", + "| -2 | 0 |\n", + "| -1 | 0 |\n", + "| 0 | 0 |\n", + "| 1 | 0 |\n", + "| 2 | 0 |\n", + "| 3 
| 0 |\n", + "| 4 | 0 |\n", + "+----+---+" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from fastNLP import DataSet\n", + "dataset = DataSet({'a': range(-5, 5), 'c': [0]*10})\n", + "dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "+---+\n", + "| c |\n", + "+---+\n", + "| 0 |\n", + "| 0 |\n", + "| 0 |\n", + "| 0 |\n", + "+---+" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 不改变dataset,生成一个删除了满足条件的instance的新 DataSet\n", + "dropped_dataset = dataset.drop(lambda ins:ins['a']<0, inplace=False)\n", + "# 在dataset中删除满足条件的instance\n", + "dataset.drop(lambda ins:ins['a']<0)\n", + "# 删除第3个instance\n", + "dataset.delete_instance(2)\n", + "# 删除名为'a'的field\n", + "dataset.delete_field('a')\n", + "dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 简单的数据预处理" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "False\n" + ] + }, + { + "data": { + "text/plain": [ + "4" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 检查是否存在名为'a'的field\n", + "print(dataset.has_field('a')) # 或 ('a' in dataset)\n", + "# 将名为'c'的field改名为'b'\n", + "dataset.rename_field('c', 'b')\n", + "# DataSet的长度\n", + "len(dataset)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "+------------------------------+-------------------------------------------------+\n", + "| raw_words | words |\n", + "+------------------------------+-------------------------------------------------+\n", + "| This is the first instance . | ['This', 'is', 'the', 'first', 'instance', '.'] |\n", + "| Second instance . 
| ['Second', 'instance', '.'] |\n", + "| Third instance . | ['Third', 'instance', '.'] |\n", + "+------------------------------+-------------------------------------------------+" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from fastNLP import DataSet\n", + "data = {'raw_words':[\"This is the first instance .\", \"Second instance .\", \"Third instance .\"]}\n", + "dataset = DataSet(data)\n", + "\n", + "# 将句子分成单词形式, 详见DataSet.apply()方法\n", + "dataset.apply(lambda ins: ins['raw_words'].split(), new_field_name='words')\n", + "\n", + "# 或使用DataSet.apply_field()\n", + "dataset.apply_field(lambda sent:sent.split(), field_name='raw_words', new_field_name='words')\n", + "\n", + "# 除了匿名函数,也可以定义函数传递进去\n", + "def get_words(instance):\n", + " sentence = instance['raw_words']\n", + " words = sentence.split()\n", + " return words\n", + "dataset.apply(get_words, new_field_name='words')\n", + "dataset" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python Now", + "language": "python", + "name": "now" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}