{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# fastNLP中的DataSet" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------------------------------+---------------------------------------------+---------+\n", "| raw_words | words | seq_len |\n", "+------------------------------+---------------------------------------------+---------+\n", "| This is the first instance . | ['this', 'is', 'the', 'first', 'instance... | 6 |\n", "| Second instance . | ['Second', 'instance', '.'] | 3 |\n", "| Third instance . | ['Third', 'instance', '.'] | 3 |\n", "+------------------------------+---------------------------------------------+---------+\n" ] } ], "source": [ "from fastNLP import DataSet\n", "data = {'raw_words':[\"This is the first instance .\", \"Second instance .\", \"Third instance .\"],\n", " 'words': [['this', 'is', 'the', 'first', 'instance', '.'], ['Second', 'instance', '.'], ['Third', 'instance', '.']],\n", " 'seq_len': [6, 3, 3]}\n", "dataset = DataSet(data)\n", "# 传入的dict的每个key的value应该为具有相同长度的list\n", "print(dataset)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## DataSet的构建" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "+----------------------------+---------------------------------------------+---------+\n", "| raw_words | words | seq_len |\n", "+----------------------------+---------------------------------------------+---------+\n", "| This is the first instance | ['this', 'is', 'the', 'first', 'instance... 
| 6 |\n", "+----------------------------+---------------------------------------------+---------+" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from fastNLP import DataSet\n", "from fastNLP import Instance\n", "dataset = DataSet()\n", "instance = Instance(raw_words=\"This is the first instance\",\n", " words=['this', 'is', 'the', 'first', 'instance', '.'],\n", " seq_len=6)\n", "dataset.append(instance)\n", "dataset" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "+----------------------------+---------------------------------------------+---------+\n", "| raw_words | words | seq_len |\n", "+----------------------------+---------------------------------------------+---------+\n", "| This is the first instance | ['this', 'is', 'the', 'first', 'instance... | 6 |\n", "| Second instance . | ['Second', 'instance', '.'] | 3 |\n", "+----------------------------+---------------------------------------------+---------+" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from fastNLP import DataSet\n", "from fastNLP import Instance\n", "dataset = DataSet([\n", " Instance(raw_words=\"This is the first instance\",\n", " words=['this', 'is', 'the', 'first', 'instance', '.'],\n", " seq_len=6),\n", " Instance(raw_words=\"Second instance .\",\n", " words=['Second', 'instance', '.'],\n", " seq_len=3)\n", " ])\n", "dataset" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## DataSet的删除" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "+----+---+\n", "| a | c |\n", "+----+---+\n", "| -5 | 0 |\n", "| -4 | 0 |\n", "| -3 | 0 |\n", "| -2 | 0 |\n", "| -1 | 0 |\n", "| 0 | 0 |\n", "| 1 | 0 |\n", "| 2 | 0 |\n", "| 3 | 0 |\n", "| 4 | 0 |\n", "+----+---+" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from fastNLP import DataSet\n", 
"dataset = DataSet({'a': range(-5, 5), 'c': [0]*10})\n", "dataset" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "+---+\n", "| c |\n", "+---+\n", "| 0 |\n", "| 0 |\n", "| 0 |\n", "| 0 |\n", "+---+" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 不改变dataset,生成一个删除了满足条件的instance的新 DataSet\n", "dropped_dataset = dataset.drop(lambda ins:ins['a']<0, inplace=False)\n", "# 在dataset中删除满足条件的instance\n", "dataset.drop(lambda ins:ins['a']<0)\n", "# 删除第3个instance\n", "dataset.delete_instance(2)\n", "# 删除名为'a'的field\n", "dataset.delete_field('a')\n", "dataset" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 简单的数据预处理" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "False\n" ] }, { "data": { "text/plain": [ "4" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 检查是否存在名为'a'的field\n", "print(dataset.has_field('a')) # 或 ('a' in dataset)\n", "# 将名为'c'的field改名为'b'\n", "dataset.rename_field('c', 'b')\n", "# DataSet的长度\n", "len(dataset)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "+------------------------------+-------------------------------------------------+\n", "| raw_words | words |\n", "+------------------------------+-------------------------------------------------+\n", "| This is the first instance . | ['This', 'is', 'the', 'first', 'instance', '.'] |\n", "| Second instance . | ['Second', 'instance', '.'] |\n", "| Third instance . 
| ['Third', 'instance', '.'] |\n", "+------------------------------+-------------------------------------------------+" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from fastNLP import DataSet\n", "data = {'raw_words':[\"This is the first instance .\", \"Second instance .\", \"Third instance .\"]}\n", "dataset = DataSet(data)\n", "\n", "# 将句子分成单词形式, 详见DataSet.apply()方法\n", "dataset.apply(lambda ins: ins['raw_words'].split(), new_field_name='words')\n", "\n", "# 或使用DataSet.apply_field()\n", "dataset.apply_field(lambda sent:sent.split(), field_name='raw_words', new_field_name='words')\n", "\n", "# 除了匿名函数,也可以定义函数传递进去\n", "def get_words(instance):\n", " sentence = instance['raw_words']\n", " words = sentence.split()\n", " return words\n", "dataset.apply(get_words, new_field_name='words')\n", "dataset" ] } ], "metadata": { "kernelspec": { "display_name": "Python Now", "language": "python", "name": "now" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.0" } }, "nbformat": 4, "nbformat_minor": 2 }