三个维度 check: DataSet Tutorial

5 years ago · 2d1500651f
--- a/docs/source/quickstart/cn_cls_example.png
+++ b/docs/source/quickstart/cn_cls_example.png
--- a/docs/source/tutorials/tutorial_1_data_preprocess.rst
+++ b/docs/source/tutorials/tutorial_1_data_preprocess.rst
@@ -16,7 +16,7 @@ fastNLP中的DataSet
 每一行是一个instance (在fastNLP中被称为 :mod:`~fastNLP.core.Instance` )，
 每一列是一个field (在fastNLP中称为 :mod:`~fastNLP.core.FieldArray` )。

 DataSet构建和删除
 DataSet的构建
 -----------------------------

 我们使用传入字典的方式构建一个数据集，这是 :class:`~fastNLP.DataSet` 初始化的最基础的方式
@@ -77,16 +77,25 @@ DataSet构建和删除
    for instance in dataset:
        # do something

 DataSet的删除
 -----------------------------

 fastNLP 同样提供了多种删除数据的方法： :func:`~fastNLP.DataSet.drop` 、 :func:`~fastNLP.DataSet.delete_instance` 和 :func:`~fastNLP.DataSet.delete_field` 。
 我们先用下面的代码生成一个只有两列的样例数据集，第一列的值分别为 -5 ~ 4，第二列的值均为 0。

 .. code-block:: python

    from fastNLP import DataSet
    dataset = DataSet({'a': list(range(-5, 5))})
    # 返回满足条件的instance,并放入DataSet中
    dataset = DataSet({'a': range(-5, 5), 'c': [0]*10})

 然后我们使用三种方法进行删除，删除后的数据集仅包含名为 c 的一列，包含4个值为0 的数据。

 .. code-block:: python

    # 不改变dataset，生成一个删除了满足条件的instance的新 DataSet
    dropped_dataset = dataset.drop(lambda ins:ins['a']<0, inplace=False)
    # 在dataset中删除满足条件的instance
    dataset.drop(lambda ins:ins['a']<0)  # dataset的instance数量减少
    dataset.drop(lambda ins:ins['a']<0)
    #  删除第3个instance
    dataset.delete_instance(2)
    #  删除名为'a'的field
@@ -103,8 +112,8 @@ FastNLP 同样提供了多种删除数据的方法 :func:`~fastNLP.DataSet.drop`

    #  检查是否存在名为'a'的field
    dataset.has_field('a')  # 或 ('a' in dataset)
    #  将名为'a'的field改名为'b'
    dataset.rename_field('a', 'b')
    #  将名为'c'的field改名为'b'
    dataset.rename_field('c', 'b')
    #  DataSet的长度
    len(dataset)

--- a/docs/source/quickstart/序列标注.rst
+++ b/docs/source/quickstart/序列标注.rst
--- a/docs/source/quickstart/文本分类.rst
+++ b/docs/source/quickstart/文本分类.rst
--- a/docs/source/user/quickstart.rst
+++ b/docs/source/user/quickstart.rst
@@ -7,8 +7,8 @@
 .. toctree::
   :maxdepth: 1

   /quickstart/文本分类
   /quickstart/序列标注
   /tutorials/文本分类
   /tutorials/序列标注

 这些教程简单地介绍了 fastNLP 的使用流程，其中文本分类相对简单，序列标注则较为复杂。更多的教程分析见 :doc:`/user/tutorials`

--- a/test/test_tutorials.py
+++ b/test/test_tutorials.py
@@ -7,7 +7,82 @@ from fastNLP.core.losses import CrossEntropyLoss
 from fastNLP.core.metrics import AccuracyMetric
 from fastNLP.io.loader import CSVLoader


 class TestTutorial(unittest.TestCase):
    """Run the code of the DataSet tutorial (tutorial_1_data_preprocess) as a test.

    The expected values asserted below mirror the outputs recorded in the
    accompanying tutorial notebook.
    """

    def setUp(self):
        """Record the current working directory so tearDown can restore it."""
        import os
        self._init_wd = os.path.abspath(os.curdir)

    def tearDown(self):
        """Change back to the working directory recorded in setUp."""
        import os
        os.chdir(self._init_wd)

    def test_tutorial_1_data_preprocess(self):
        """Build, shrink, rename and preprocess small DataSets as the tutorial does."""
        # Imported inside the test (once) so that merely importing this module
        # does not require these names.
        from fastNLP import DataSet
        from fastNLP import Instance

        # --- Construction from a dict -------------------------------------
        # Every key of the dict must map to a list of the same length.
        data = {'raw_words': ["This is the first instance .", "Second instance .", "Third instance ."],
                'words': [['this', 'is', 'the', 'first', 'instance', '.'], ['Second', 'instance', '.'],
                          ['Third', 'instance', '.']],
                'seq_len': [6, 3, 3]}
        dataset = DataSet(data)
        self.assertEqual(len(dataset), 3)

        # --- Construction by appending a single Instance -------------------
        dataset = DataSet()
        instance = Instance(raw_words="This is the first instance",
                            words=['this', 'is', 'the', 'first', 'instance', '.'],
                            seq_len=6)
        dataset.append(instance)
        self.assertEqual(len(dataset), 1)

        # --- Construction from a list of Instances -------------------------
        dataset = DataSet([
            Instance(raw_words="This is the first instance",
                     words=['this', 'is', 'the', 'first', 'instance', '.'],
                     seq_len=6),
            Instance(raw_words="Second instance .",
                     words=['Second', 'instance', '.'],
                     seq_len=3)
        ])
        self.assertEqual(len(dataset), 2)

        # --- Deletion -------------------------------------------------------
        # Two columns: 'a' holds -5..4, 'c' holds ten zeros.
        dataset = DataSet({'a': range(-5, 5), 'c': [0] * 10})

        # Leave dataset untouched and get a new DataSet without the
        # instances that satisfy the condition.
        dropped_dataset = dataset.drop(lambda ins: ins['a'] < 0, inplace=False)
        self.assertEqual(len(dropped_dataset), 5)
        self.assertEqual(len(dataset), 10)
        # Remove the instances that satisfy the condition from dataset itself.
        dataset.drop(lambda ins: ins['a'] < 0)
        self.assertEqual(len(dataset), 5)
        # Delete the 3rd instance.
        dataset.delete_instance(2)
        # Delete the field named 'a'.
        dataset.delete_field('a')

        # --- Simple inspection / renaming ----------------------------------
        # Check whether a field named 'a' exists (equivalent: 'a' in dataset).
        self.assertFalse(dataset.has_field('a'))
        # Rename the field named 'c' to 'b'.
        dataset.rename_field('c', 'b')
        self.assertTrue(dataset.has_field('b'))
        # Length of the DataSet: 5 non-negative rows minus the deleted one.
        self.assertEqual(len(dataset), 4)

        # --- Preprocessing with apply / apply_field ------------------------
        data = {'raw_words': ["This is the first instance .", "Second instance .", "Third instance ."]}
        dataset = DataSet(data)

        # Split each sentence into words, see DataSet.apply().
        dataset.apply(lambda ins: ins['raw_words'].split(), new_field_name='words')

        # Or use DataSet.apply_field().
        dataset.apply_field(lambda sent: sent.split(), field_name='raw_words', new_field_name='words')

        # Besides anonymous functions, a named function can be passed as well.
        def get_words(instance):
            sentence = instance['raw_words']
            words = sentence.split()
            return words

        dataset.apply(get_words, new_field_name='words')
        self.assertTrue(dataset.has_field('words'))
        self.assertEqual(len(dataset), 3)
        
 class TestOldTutorial(unittest.TestCase):
    def test_fastnlp_10min_tutorial(self):
        # 从csv读取数据到DataSet
        sample_path = "test/data_for_tests/tutorial_sample_dataset.csv"
--- a/tutorials/tutorial_1_data_preprocess.ipynb
+++ b/tutorials/tutorial_1_data_preprocess.ipynb
@@ -0,0 +1,292 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# fastNLP中的DataSet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------------------------+---------------------------------------------+---------+\n",
      "| raw_words                    | words                                       | seq_len |\n",
      "+------------------------------+---------------------------------------------+---------+\n",
      "| This is the first instance . | ['this', 'is', 'the', 'first', 'instance... | 6       |\n",
      "| Second instance .            | ['Second', 'instance', '.']                 | 3       |\n",
      "| Third instance .             | ['Third', 'instance', '.']                  | 3       |\n",
      "+------------------------------+---------------------------------------------+---------+\n"
     ]
    }
   ],
   "source": [
    "from fastNLP import DataSet\n",
    "data = {'raw_words':[\"This is the first instance .\", \"Second instance .\", \"Third instance .\"],\n",
    "        'words': [['this', 'is', 'the', 'first', 'instance', '.'], ['Second', 'instance', '.'], ['Third', 'instance', '.']],\n",
    "        'seq_len': [6, 3, 3]}\n",
    "dataset = DataSet(data)\n",
    "# 传入的dict的每个key的value应该为具有相同长度的list\n",
    "print(dataset)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## DataSet的构建"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "+----------------------------+---------------------------------------------+---------+\n",
       "| raw_words                  | words                                       | seq_len |\n",
       "+----------------------------+---------------------------------------------+---------+\n",
       "| This is the first instance | ['this', 'is', 'the', 'first', 'instance... | 6       |\n",
       "+----------------------------+---------------------------------------------+---------+"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from fastNLP import DataSet\n",
    "from fastNLP import Instance\n",
    "dataset = DataSet()\n",
    "instance = Instance(raw_words=\"This is the first instance\",\n",
    "                    words=['this', 'is', 'the', 'first', 'instance', '.'],\n",
    "                    seq_len=6)\n",
    "dataset.append(instance)\n",
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "+----------------------------+---------------------------------------------+---------+\n",
       "| raw_words                  | words                                       | seq_len |\n",
       "+----------------------------+---------------------------------------------+---------+\n",
       "| This is the first instance | ['this', 'is', 'the', 'first', 'instance... | 6       |\n",
       "| Second instance .          | ['Second', 'instance', '.']                 | 3       |\n",
       "+----------------------------+---------------------------------------------+---------+"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from fastNLP import DataSet\n",
    "from fastNLP import Instance\n",
    "dataset = DataSet([\n",
    "    Instance(raw_words=\"This is the first instance\",\n",
    "        words=['this', 'is', 'the', 'first', 'instance', '.'],\n",
    "        seq_len=6),\n",
    "    Instance(raw_words=\"Second instance .\",\n",
    "        words=['Second', 'instance', '.'],\n",
    "        seq_len=3)\n",
    "    ])\n",
    "dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## DataSet的删除"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "+----+---+\n",
       "| a  | c |\n",
       "+----+---+\n",
       "| -5 | 0 |\n",
       "| -4 | 0 |\n",
       "| -3 | 0 |\n",
       "| -2 | 0 |\n",
       "| -1 | 0 |\n",
       "| 0  | 0 |\n",
       "| 1  | 0 |\n",
       "| 2  | 0 |\n",
       "| 3  | 0 |\n",
       "| 4  | 0 |\n",
       "+----+---+"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from fastNLP import DataSet\n",
    "dataset = DataSet({'a': range(-5, 5), 'c': [0]*10})\n",
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "+---+\n",
       "| c |\n",
       "+---+\n",
       "| 0 |\n",
       "| 0 |\n",
       "| 0 |\n",
       "| 0 |\n",
       "+---+"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 不改变dataset，生成一个删除了满足条件的instance的新 DataSet\n",
    "dropped_dataset = dataset.drop(lambda ins:ins['a']<0, inplace=False)\n",
    "# 在dataset中删除满足条件的instance\n",
    "dataset.drop(lambda ins:ins['a']<0)\n",
    "#  删除第3个instance\n",
    "dataset.delete_instance(2)\n",
    "#  删除名为'a'的field\n",
    "dataset.delete_field('a')\n",
    "dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 简单的数据预处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "False\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "4"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#  检查是否存在名为'a'的field\n",
    "print(dataset.has_field('a'))  # 或 ('a' in dataset)\n",
    "#  将名为'c'的field改名为'b'\n",
    "dataset.rename_field('c', 'b')\n",
    "#  DataSet的长度\n",
    "len(dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "+------------------------------+-------------------------------------------------+\n",
       "| raw_words                    | words                                           |\n",
       "+------------------------------+-------------------------------------------------+\n",
       "| This is the first instance . | ['This', 'is', 'the', 'first', 'instance', '.'] |\n",
       "| Second instance .            | ['Second', 'instance', '.']                     |\n",
       "| Third instance .             | ['Third', 'instance', '.']                      |\n",
       "+------------------------------+-------------------------------------------------+"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from fastNLP import DataSet\n",
    "data = {'raw_words':[\"This is the first instance .\", \"Second instance .\", \"Third instance .\"]}\n",
    "dataset = DataSet(data)\n",
    "\n",
    "# 将句子分成单词形式, 详见DataSet.apply()方法\n",
    "dataset.apply(lambda ins: ins['raw_words'].split(), new_field_name='words')\n",
    "\n",
    "# 或使用DataSet.apply_field()\n",
    "dataset.apply_field(lambda sent:sent.split(), field_name='raw_words', new_field_name='words')\n",
    "\n",
    "# 除了匿名函数，也可以定义函数传递进去\n",
    "def get_words(instance):\n",
    "    sentence = instance['raw_words']\n",
    "    words = sentence.split()\n",
    "    return words\n",
    "dataset.apply(get_words, new_field_name='words')\n",
    "dataset"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python Now",
   "language": "python",
   "name": "now"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }