Browse Source

三个维度 check: DataSet Tutorial

tags/v0.5.5
ChenXin 4 years ago
parent
commit
2d1500651f
7 changed files with 384 additions and 8 deletions
  1. +0
    -0
      docs/source/tutorials/cn_cls_example.png
  2. +15
    -6
      docs/source/tutorials/tutorial_1_data_preprocess.rst
  3. +0
    -0
      docs/source/tutorials/序列标注.rst
  4. +0
    -0
      docs/source/tutorials/文本分类.rst
  5. +2
    -2
      docs/source/user/quickstart.rst
  6. +75
    -0
      test/test_tutorials.py
  7. +292
    -0
      tutorials/tutorial_1_data_preprocess.ipynb

docs/source/quickstart/cn_cls_example.png → docs/source/tutorials/cn_cls_example.png View File


+ 15
- 6
docs/source/tutorials/tutorial_1_data_preprocess.rst View File

@@ -16,7 +16,7 @@ fastNLP中的DataSet
每一行是一个instance (在fastNLP中被称为 :mod:`~fastNLP.core.Instance` ),
每一列是一个field (在fastNLP中称为 :mod:`~fastNLP.core.FieldArray` )。

DataSet构建和删除
DataSet构建
-----------------------------

我们使用传入字典的方式构建一个数据集,这是 :class:`~fastNLP.DataSet` 初始化的最基础的方式
@@ -77,16 +77,25 @@ DataSet构建和删除
for instance in dataset:
# do something

DataSet的删除
-----------------------------

fastNLP 同样提供了多种删除数据的方法 :func:`~fastNLP.DataSet.drop` 、 :func:`~fastNLP.DataSet.delete_instance` 和 :func:`~fastNLP.DataSet.delete_field` 。
我们先用下面的代码生成一个只有两列的样例数据集,第一列的值分别为 -5 ~ 4,第二列的值均为 0。

.. code-block:: python

from fastNLP import DataSet
dataset = DataSet({'a': list(range(-5, 5))})
# 返回满足条件的instance,并放入DataSet中
dataset = DataSet({'a': range(-5, 5), 'c': [0]*10})

然后我们使用三种方法进行删除,删除后的数据集仅包含名为 c 的一列,包含4个值为0 的数据。

.. code-block:: python

# 不改变dataset,生成一个删除了满足条件的instance的新 DataSet
dropped_dataset = dataset.drop(lambda ins:ins['a']<0, inplace=False)
# 在dataset中删除满足条件的instance
dataset.drop(lambda ins:ins['a']<0) # dataset的instance数量减少
dataset.drop(lambda ins:ins['a']<0)
# 删除第3个instance
dataset.delete_instance(2)
# 删除名为'a'的field
@@ -103,8 +112,8 @@ FastNLP 同样提供了多种删除数据的方法 :func:`~fastNLP.DataSet.drop`

# 检查是否存在名为'a'的field
dataset.has_field('a') # 或 ('a' in dataset)
# 将名为'a'的field改名为'b'
dataset.rename_field('a', 'b')
# 将名为'c'的field改名为'b'
dataset.rename_field('c', 'b')
# DataSet的长度
len(dataset)



docs/source/quickstart/序列标注.rst → docs/source/tutorials/序列标注.rst View File


docs/source/quickstart/文本分类.rst → docs/source/tutorials/文本分类.rst View File


+ 2
- 2
docs/source/user/quickstart.rst View File

@@ -7,8 +7,8 @@
.. toctree::
:maxdepth: 1

/quickstart/文本分类
/quickstart/序列标注
/tutorials/文本分类
/tutorials/序列标注

这些教程简单地介绍了 fastNLP 的使用流程,其中文本分类相对简单,序列标注则较为复杂。更多的教程分析见 :doc:`/user/tutorials` 。


+ 75
- 0
test/test_tutorials.py View File

@@ -7,7 +7,82 @@ from fastNLP.core.losses import CrossEntropyLoss
from fastNLP.core.metrics import AccuracyMetric
from fastNLP.io.loader import CSVLoader


class TestTutorial(unittest.TestCase):
    """Replay the code cells of the fastNLP tutorials and check their results."""

    def setUp(self):
        """Record the current working directory so tearDown can restore it."""
        import os
        self._init_wd = os.path.abspath(os.curdir)

    def tearDown(self):
        """Change back to the working directory recorded in setUp."""
        import os
        os.chdir(self._init_wd)

    def test_tutorial_1_data_preprocess(self):
        """Run the steps of tutorial_1_data_preprocess and verify each outcome.

        The expected values mirror the outputs shown in the tutorial notebook:
        the number of rows after each construction/deletion step, the fields
        that remain, and the tokenised ``words`` column.
        """
        from fastNLP import DataSet
        from fastNLP import Instance

        # --- Construction from a dict -------------------------------------
        # Every value in the dict must be a list of the same length.
        data = {'raw_words': ["This is the first instance .", "Second instance .", "Third instance ."],
                'words': [['this', 'is', 'the', 'first', 'instance', '.'], ['Second', 'instance', '.'],
                          ['Third', 'instance', '.']],
                'seq_len': [6, 3, 3]}
        dataset = DataSet(data)
        self.assertEqual(len(dataset), 3)

        # --- Construction by appending a single Instance ------------------
        dataset = DataSet()
        instance = Instance(raw_words="This is the first instance",
                            words=['this', 'is', 'the', 'first', 'instance', '.'],
                            seq_len=6)
        dataset.append(instance)
        self.assertEqual(len(dataset), 1)

        # --- Construction from a list of Instances ------------------------
        dataset = DataSet([
            Instance(raw_words="This is the first instance",
                     words=['this', 'is', 'the', 'first', 'instance', '.'],
                     seq_len=6),
            Instance(raw_words="Second instance .",
                     words=['Second', 'instance', '.'],
                     seq_len=3)
        ])
        self.assertEqual(len(dataset), 2)

        # --- Deletion -----------------------------------------------------
        dataset = DataSet({'a': range(-5, 5), 'c': [0] * 10})
        self.assertEqual(len(dataset), 10)

        # Leave dataset untouched and return a new DataSet without the
        # instances that satisfy the condition.
        dropped_dataset = dataset.drop(lambda ins: ins['a'] < 0, inplace=False)
        self.assertEqual(len(dataset), 10)
        self.assertEqual(len(dropped_dataset), 5)
        # Remove the instances that satisfy the condition from dataset itself.
        dataset.drop(lambda ins: ins['a'] < 0)
        self.assertEqual(len(dataset), 5)
        # Delete the third instance.
        dataset.delete_instance(2)
        self.assertEqual(len(dataset), 4)
        # Delete the field named 'a'.
        dataset.delete_field('a')

        # --- Simple inspection / renaming ---------------------------------
        # Check whether a field named 'a' exists (same as: 'a' in dataset).
        self.assertFalse(dataset.has_field('a'))
        self.assertTrue(dataset.has_field('c'))
        # Rename the field named 'c' to 'b'.
        dataset.rename_field('c', 'b')
        self.assertTrue(dataset.has_field('b'))
        self.assertFalse(dataset.has_field('c'))
        # Length of the DataSet: four instances are left after the deletions.
        self.assertEqual(len(dataset), 4)

        # --- Simple preprocessing -----------------------------------------
        data = {'raw_words': ["This is the first instance .", "Second instance .", "Third instance ."]}
        dataset = DataSet(data)
        expected_words = [sentence.split() for sentence in data['raw_words']]

        # Split each sentence into words; see DataSet.apply().
        dataset.apply(lambda ins: ins['raw_words'].split(), new_field_name='words')
        self.assertEqual([ins['words'] for ins in dataset], expected_words)

        # The same thing via DataSet.apply_field().
        dataset.apply_field(lambda sent: sent.split(), field_name='raw_words', new_field_name='words')
        self.assertEqual([ins['words'] for ins in dataset], expected_words)

        # A named function can be passed instead of a lambda.
        def get_words(instance):
            sentence = instance['raw_words']
            words = sentence.split()
            return words

        dataset.apply(get_words, new_field_name='words')
        self.assertEqual([ins['words'] for ins in dataset], expected_words)
class TestOldTutorial(unittest.TestCase):
def test_fastnlp_10min_tutorial(self):
# 从csv读取数据到DataSet
sample_path = "test/data_for_tests/tutorial_sample_dataset.csv"


+ 292
- 0
tutorials/tutorial_1_data_preprocess.ipynb View File

@@ -0,0 +1,292 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# fastNLP中的DataSet"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+------------------------------+---------------------------------------------+---------+\n",
"| raw_words | words | seq_len |\n",
"+------------------------------+---------------------------------------------+---------+\n",
"| This is the first instance . | ['this', 'is', 'the', 'first', 'instance... | 6 |\n",
"| Second instance . | ['Second', 'instance', '.'] | 3 |\n",
"| Third instance . | ['Third', 'instance', '.'] | 3 |\n",
"+------------------------------+---------------------------------------------+---------+\n"
]
}
],
"source": [
"from fastNLP import DataSet\n",
"data = {'raw_words':[\"This is the first instance .\", \"Second instance .\", \"Third instance .\"],\n",
" 'words': [['this', 'is', 'the', 'first', 'instance', '.'], ['Second', 'instance', '.'], ['Third', 'instance', '.']],\n",
" 'seq_len': [6, 3, 3]}\n",
"dataset = DataSet(data)\n",
"# 传入的dict的每个key的value应该为具有相同长度的list\n",
"print(dataset)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## DataSet的构建"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+----------------------------+---------------------------------------------+---------+\n",
"| raw_words | words | seq_len |\n",
"+----------------------------+---------------------------------------------+---------+\n",
"| This is the first instance | ['this', 'is', 'the', 'first', 'instance... | 6 |\n",
"+----------------------------+---------------------------------------------+---------+"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import DataSet\n",
"from fastNLP import Instance\n",
"dataset = DataSet()\n",
"instance = Instance(raw_words=\"This is the first instance\",\n",
" words=['this', 'is', 'the', 'first', 'instance', '.'],\n",
" seq_len=6)\n",
"dataset.append(instance)\n",
"dataset"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+----------------------------+---------------------------------------------+---------+\n",
"| raw_words | words | seq_len |\n",
"+----------------------------+---------------------------------------------+---------+\n",
"| This is the first instance | ['this', 'is', 'the', 'first', 'instance... | 6 |\n",
"| Second instance . | ['Second', 'instance', '.'] | 3 |\n",
"+----------------------------+---------------------------------------------+---------+"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import DataSet\n",
"from fastNLP import Instance\n",
"dataset = DataSet([\n",
" Instance(raw_words=\"This is the first instance\",\n",
" words=['this', 'is', 'the', 'first', 'instance', '.'],\n",
" seq_len=6),\n",
" Instance(raw_words=\"Second instance .\",\n",
" words=['Second', 'instance', '.'],\n",
" seq_len=3)\n",
" ])\n",
"dataset"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## DataSet的删除"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+----+---+\n",
"| a | c |\n",
"+----+---+\n",
"| -5 | 0 |\n",
"| -4 | 0 |\n",
"| -3 | 0 |\n",
"| -2 | 0 |\n",
"| -1 | 0 |\n",
"| 0 | 0 |\n",
"| 1 | 0 |\n",
"| 2 | 0 |\n",
"| 3 | 0 |\n",
"| 4 | 0 |\n",
"+----+---+"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import DataSet\n",
"dataset = DataSet({'a': range(-5, 5), 'c': [0]*10})\n",
"dataset"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+---+\n",
"| c |\n",
"+---+\n",
"| 0 |\n",
"| 0 |\n",
"| 0 |\n",
"| 0 |\n",
"+---+"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 不改变dataset,生成一个删除了满足条件的instance的新 DataSet\n",
"dropped_dataset = dataset.drop(lambda ins:ins['a']<0, inplace=False)\n",
"# 在dataset中删除满足条件的instance\n",
"dataset.drop(lambda ins:ins['a']<0)\n",
"# 删除第3个instance\n",
"dataset.delete_instance(2)\n",
"# 删除名为'a'的field\n",
"dataset.delete_field('a')\n",
"dataset"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 简单的数据预处理"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"False\n"
]
},
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 检查是否存在名为'a'的field\n",
"print(dataset.has_field('a')) # 或 ('a' in dataset)\n",
"# 将名为'c'的field改名为'b'\n",
"dataset.rename_field('c', 'b')\n",
"# DataSet的长度\n",
"len(dataset)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"+------------------------------+-------------------------------------------------+\n",
"| raw_words | words |\n",
"+------------------------------+-------------------------------------------------+\n",
"| This is the first instance . | ['This', 'is', 'the', 'first', 'instance', '.'] |\n",
"| Second instance . | ['Second', 'instance', '.'] |\n",
"| Third instance . | ['Third', 'instance', '.'] |\n",
"+------------------------------+-------------------------------------------------+"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import DataSet\n",
"data = {'raw_words':[\"This is the first instance .\", \"Second instance .\", \"Third instance .\"]}\n",
"dataset = DataSet(data)\n",
"\n",
"# 将句子分成单词形式, 详见DataSet.apply()方法\n",
"dataset.apply(lambda ins: ins['raw_words'].split(), new_field_name='words')\n",
"\n",
"# 或使用DataSet.apply_field()\n",
"dataset.apply_field(lambda sent:sent.split(), field_name='raw_words', new_field_name='words')\n",
"\n",
"# 除了匿名函数,也可以定义函数传递进去\n",
"def get_words(instance):\n",
" sentence = instance['raw_words']\n",
" words = sentence.split()\n",
" return words\n",
"dataset.apply(get_words, new_field_name='words')\n",
"dataset"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python Now",
"language": "python",
"name": "now"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

Loading…
Cancel
Save