Browse Source

测试文档

tags/v0.4.10
yh 6 years ago
parent
commit
2c202bb151
5 changed files with 133 additions and 76 deletions
  1. +8
    -0
      docs/source/fastNLP.io.rst
  2. +8
    -0
      docs/source/fastNLP.modules.decoder.rst
  3. +100
    -68
      fastNLP/core/dataset.py
  4. +12
    -8
      fastNLP/core/fieldarray.py
  5. +5
    -0
      test/core/test_dataset.py

+ 8
- 0
docs/source/fastNLP.io.rst View File

@@ -36,6 +36,14 @@ fastNLP.io.embed\_loader module
:undoc-members: :undoc-members:
:show-inheritance: :show-inheritance:


fastNLP.io.file\_reader module
------------------------------

.. automodule:: fastNLP.io.file_reader
:members:
:undoc-members:
:show-inheritance:

fastNLP.io.model\_io module fastNLP.io.model\_io module
--------------------------- ---------------------------




+ 8
- 0
docs/source/fastNLP.modules.decoder.rst View File

@@ -20,6 +20,14 @@ fastNLP.modules.decoder.MLP module
:undoc-members: :undoc-members:
:show-inheritance: :show-inheritance:


fastNLP.modules.decoder.utils module
------------------------------------

.. automodule:: fastNLP.modules.decoder.utils
:members:
:undoc-members:
:show-inheritance:



Module contents Module contents
--------------- ---------------


+ 100
- 68
fastNLP/core/dataset.py View File

@@ -1,3 +1,18 @@
"""
fastNLP.core.DataSet的介绍文档

DataSet是fastNLP中用于承载数据的容器。可以将DataSet看做是一个表格,每一行是一个instance(或sample),每一列是一个feature。

.. csv-table::
   :header: "Field1", "Field2", "Field3"
   :widths: 20, 10, 10

   "This is the first instance", "['This', 'is', 'the', 'first', 'instance']", 5
   "Second instance", "['Second', 'instance']", 2

"""


import _pickle as pickle import _pickle as pickle


import numpy as np import numpy as np
@@ -31,7 +46,7 @@ class DataSet(object):
length_set.add(len(value)) length_set.add(len(value))
assert len(length_set) == 1, "Arrays must all be same length." assert len(length_set) == 1, "Arrays must all be same length."
for key, value in data.items(): for key, value in data.items():
self.add_field(name=key, fields=value)
self.add_field(field_name=key, fields=value)
elif isinstance(data, list): elif isinstance(data, list):
for ins in data: for ins in data:
assert isinstance(ins, Instance), "Must be Instance type, not {}.".format(type(ins)) assert isinstance(ins, Instance), "Must be Instance type, not {}.".format(type(ins))
@@ -88,7 +103,7 @@ class DataSet(object):
raise RuntimeError(f"Start index {idx.start} out of range 0-{len(self)-1}") raise RuntimeError(f"Start index {idx.start} out of range 0-{len(self)-1}")
data_set = DataSet() data_set = DataSet()
for field in self.field_arrays.values(): for field in self.field_arrays.values():
data_set.add_field(name=field.name, fields=field.content[idx], padder=field.padder,
data_set.add_field(field_name=field.name, fields=field.content[idx], padder=field.padder,
is_input=field.is_input, is_target=field.is_target, ignore_type=field.ignore_type) is_input=field.is_input, is_target=field.is_target, ignore_type=field.ignore_type)
return data_set return data_set
elif isinstance(idx, str): elif isinstance(idx, str):
@@ -131,7 +146,7 @@ class DataSet(object):
return "DataSet(" + self.__inner_repr__() + ")" return "DataSet(" + self.__inner_repr__() + ")"


def append(self, ins): def append(self, ins):
"""Add an instance to the DataSet.
"""将一个instance对象append到DataSet后面。
If the DataSet is not empty, the instance must have the same field names as the rest instances in the DataSet. If the DataSet is not empty, the instance must have the same field names as the rest instances in the DataSet.


:param ins: an Instance object :param ins: an Instance object
@@ -151,57 +166,60 @@ class DataSet(object):
assert name in self.field_arrays assert name in self.field_arrays
self.field_arrays[name].append(field) self.field_arrays[name].append(field)


def add_field(self, name, fields, padder=None, is_input=False, is_target=False, ignore_type=False):
"""Add a new field to the DataSet.
def add_field(self, field_name, fields, padder=AutoPadder(), is_input=False, is_target=False, ignore_type=False):
"""新增一个field
:param str name: the name of the field.
:param fields: a list of int, float, or other objects.
:param padder: PadBase对象,如何对该Field进行padding。如果为None则使用
:param bool is_input: whether this field is model input.
:param bool is_target: whether this field is label or target.
:param bool ignore_type: If True, do not perform type check. (Default: False)
:param str field_name: 新增的field的名称
:param list fields: 需要新增的field的内容
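:param None, Padder padder: 该field使用的padder。如果为None,FieldArray会使用默认的AutoPadder(pad_val=0);如需取消padding,请在添加field之后调用set_padder(field_name, None)。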
:param bool is_input: 新加入的field是否是input
:param bool is_target: 新加入的field是否是target
:param bool ignore_type: 是否忽略对新加入的field的类型检查
""" """
if padder is None:
padder = AutoPadder(pad_val=0)


if len(self.field_arrays) != 0: if len(self.field_arrays) != 0:
if len(self) != len(fields): if len(self) != len(fields):
raise RuntimeError(f"The field to append must have the same size as dataset. " raise RuntimeError(f"The field to append must have the same size as dataset. "
f"Dataset size {len(self)} != field size {len(fields)}") f"Dataset size {len(self)} != field size {len(fields)}")
self.field_arrays[name] = FieldArray(name, fields, is_target=is_target, is_input=is_input,
padder=padder, ignore_type=ignore_type)
self.field_arrays[field_name] = FieldArray(field_name, fields, is_target=is_target, is_input=is_input,
padder=padder, ignore_type=ignore_type)


def delete_field(self, name):
"""Delete a field based on the field name.
def delete_field(self, field_name):
"""删除field


:param name: the name of the field to be deleted.
:param str field_name: 需要删除的field的名称.
""" """
self.field_arrays.pop(name)
self.field_arrays.pop(field_name)


def get_field(self, field_name): def get_field(self, field_name):
"""获取field_name这个field

:param str field_name: field的名称
:return: FieldArray
"""
if field_name not in self.field_arrays: if field_name not in self.field_arrays:
raise KeyError("Field name {} not found in DataSet".format(field_name)) raise KeyError("Field name {} not found in DataSet".format(field_name))
return self.field_arrays[field_name] return self.field_arrays[field_name]


def get_all_fields(self): def get_all_fields(self):
"""Return all the fields with their names.
"""返回一个dict,key为field_name, value为对应的FieldArray


:return field_arrays: the internal data structure of DataSet.
:return: dict:
""" """
return self.field_arrays return self.field_arrays


def get_length(self): def get_length(self):
"""Fetch the length of the dataset.
"""获取DataSet的元素数量


:return length:
:return: int length:
""" """
return len(self) return len(self)


def rename_field(self, old_name, new_name): def rename_field(self, old_name, new_name):
"""Rename a field.
"""将某个field重新命名.


:param str old_name:
:param str new_name:
:param str old_name: 原来的field名称
:param str new_name: 修改为new_name
""" """
if old_name in self.field_arrays: if old_name in self.field_arrays:
self.field_arrays[new_name] = self.field_arrays.pop(old_name) self.field_arrays[new_name] = self.field_arrays.pop(old_name)
@@ -216,8 +234,8 @@ class DataSet(object):
dataset.set_target('labels', 'seq_len') # 将labels和seq_len这两个field的target属性设置为True dataset.set_target('labels', 'seq_len') # 将labels和seq_len这两个field的target属性设置为True
dataset.set_target('labels', 'seq_len', flag=False) # 将labels和seq_len的target属性设置为False dataset.set_target('labels', 'seq_len', flag=False) # 将labels和seq_len的target属性设置为False


:param field_names: str, field的名称
:param flag: bool, 将field_name的target状态设置为flag
:param str field_names: field的名称
:param bool flag: 将field_name的target状态设置为flag
""" """
assert isinstance(flag, bool), "Only bool type supported." assert isinstance(flag, bool), "Only bool type supported."
for name in field_names: for name in field_names:
@@ -233,8 +251,8 @@ class DataSet(object):
dataset.set_input('words', 'seq_len') # 将words和seq_len这两个field的input属性设置为True dataset.set_input('words', 'seq_len') # 将words和seq_len这两个field的input属性设置为True
dataset.set_input('words', flag=False) # 将words这个field的input属性设置为False dataset.set_input('words', flag=False) # 将words这个field的input属性设置为False


:param field_names: str, field的名称
:param flag: bool, 将field_name的input状态设置为flag
:param str field_names: field的名称
:param bool flag: 将field_name的input状态设置为flag
""" """
for name in field_names: for name in field_names:
if name in self.field_arrays: if name in self.field_arrays:
@@ -245,8 +263,8 @@ class DataSet(object):
def set_ignore_type(self, *field_names, flag=True): def set_ignore_type(self, *field_names, flag=True):
"""将field_names的ignore_type设置为flag状态 """将field_names的ignore_type设置为flag状态


:param field_names: str, field的名称
:param flag: bool,
:param str field_names: field的名称
:param bool flag: 将field_name的ignore_type状态设置为flag
:return: :return:
""" """
assert isinstance(flag, bool), "Only bool type supported." assert isinstance(flag, bool), "Only bool type supported."
@@ -264,8 +282,8 @@ class DataSet(object):
padder = EngChar2DPadder() padder = EngChar2DPadder()
dataset.set_padder('chars', padder) # 则chars这个field会使用EngChar2DPadder进行pad操作 dataset.set_padder('chars', padder) # 则chars这个field会使用EngChar2DPadder进行pad操作


:param field_name: str, 设置field的padding方式为padder
:param padder: (None, PadderBase). 设置为None即删除padder, 即对该field不进行padding操作.
:param str field_name: 设置field的padding方式为padder
:param None, Padder padder: 设置为None即删除padder, 即对该field不进行pad操作.
:return: :return:
""" """
if field_name not in self.field_arrays: if field_name not in self.field_arrays:
@@ -275,8 +293,8 @@ class DataSet(object):
def set_pad_val(self, field_name, pad_val): def set_pad_val(self, field_name, pad_val):
"""为某个field设置对应的pad_val. """为某个field设置对应的pad_val.


:param field_name: str,修改该field的pad_val
:param pad_val: int,该field的padder会以pad_val作为padding index
:param str field_name: 修改该field的pad_val
:param int pad_val: 该field的padder会以pad_val作为padding index
:return: :return:
""" """
if field_name not in self.field_arrays: if field_name not in self.field_arrays:
@@ -286,7 +304,7 @@ class DataSet(object):
def get_input_name(self): def get_input_name(self):
"""返回所有is_input被设置为True的field名称 """返回所有is_input被设置为True的field名称


:return list, 里面的元素为被设置为input的field名称
:return: list, 里面的元素为被设置为input的field名称
""" """
return [name for name, field in self.field_arrays.items() if field.is_input] return [name for name, field in self.field_arrays.items() if field.is_input]


@@ -300,15 +318,22 @@ class DataSet(object):
def apply_field(self, func, field_name, new_field_name=None, **kwargs): def apply_field(self, func, field_name, new_field_name=None, **kwargs):
"""将DataSet中的每个instance中的`field_name`这个field传给func,并获取它的返回值. """将DataSet中的每个instance中的`field_name`这个field传给func,并获取它的返回值.


:param func: Callable, input是instance的`field_name`这个field.
:param field_name: str, 传入func的是哪个field.
:param new_field_name: (str, None). 如果不是None,将func的返回值放入这个名为`new_field_name`的新field中,如果名称与已有
的field相同,则覆盖之前的field.
:param **kwargs: 合法的参数有以下三个
(1) is_input: bool, 如果为True则将`new_field_name`这个field设置为input
(2) is_target: bool, 如果为True则将`new_field_name`这个field设置为target
(3) ignore_type: bool, 如果为True则将`new_field_name`这个field的ignore_type设置为true, 忽略其类型
:return: List[], 里面的元素为func的返回值,所以list长度为DataSet的长度
:param callable func: input是instance的`field_name`这个field.
:param str field_name: 传入func的是哪个field.
:param str, None new_field_name: 将func返回的内容放入到什么field中

1. str, 将func的返回值放入这个名为`new_field_name`的新field中,如果名称与已有的field相
同,则覆盖之前的field

2. None, 不创建新的field
:param kwargs: 合法的参数有以下三个

1. is_input: bool, 如果为True则将`new_field_name`的field设置为input

2. is_target: bool, 如果为True则将`new_field_name`的field设置为target

3. ignore_type: bool, 如果为True则将`new_field_name`的field的ignore_type设置为true, 忽略其类型
:return: list(Any), 里面的元素为func的返回值,所以list长度为DataSet的长度


""" """
assert len(self)!=0, "Null DataSet cannot use apply()." assert len(self)!=0, "Null DataSet cannot use apply()."
@@ -334,9 +359,9 @@ class DataSet(object):
def _add_apply_field(self, results, new_field_name, kwargs): def _add_apply_field(self, results, new_field_name, kwargs):
"""将results作为加入到新的field中,field名称为new_field_name """将results作为加入到新的field中,field名称为new_field_name


:param results: List[], 一般是apply*()之后的结果
:param new_field_name: str, 新加入的field的名称
:param kwargs: dict, 用户apply*()时传入的自定义参数
:param list(Any) results: 一般是apply*()之后的结果
:param str new_field_name: 新加入的field的名称
:param dict kwargs: 用户apply*()时传入的自定义参数
:return: :return:
""" """
extra_param = {} extra_param = {}
@@ -355,23 +380,30 @@ class DataSet(object):
extra_param['is_target'] = old_field.is_target extra_param['is_target'] = old_field.is_target
if 'ignore_type' not in extra_param: if 'ignore_type' not in extra_param:
extra_param['ignore_type'] = old_field.ignore_type extra_param['ignore_type'] = old_field.ignore_type
self.add_field(name=new_field_name, fields=results, is_input=extra_param["is_input"],
self.add_field(field_name=new_field_name, fields=results, is_input=extra_param["is_input"],
is_target=extra_param["is_target"], ignore_type=extra_param['ignore_type']) is_target=extra_param["is_target"], ignore_type=extra_param['ignore_type'])
else: else:
self.add_field(name=new_field_name, fields=results, is_input=extra_param.get("is_input", None),
self.add_field(field_name=new_field_name, fields=results, is_input=extra_param.get("is_input", None),
is_target=extra_param.get("is_target", None), is_target=extra_param.get("is_target", None),
ignore_type=extra_param.get("ignore_type", False)) ignore_type=extra_param.get("ignore_type", False))


def apply(self, func, new_field_name=None, **kwargs): def apply(self, func, new_field_name=None, **kwargs):
"""将DataSet中每个instance传入到func中,并获取它的返回值. """将DataSet中每个instance传入到func中,并获取它的返回值.


:param func: Callable, 参数是DataSet中的instance
:param new_field_name: (None, str). (1) None, 不创建新的field; (2) str,将func的返回值放入这个名为
`new_field_name`的新field中,如果名称与已有的field相同,则覆盖之前的field;
:param callable func: 参数是DataSet中的instance
:param str, None new_field_name: 将func返回的内容放入到什么field中

1. str, 将func的返回值放入这个名为`new_field_name`的新field中,如果名称与已有的field相
同,则覆盖之前的field

2. None, 不创建新的field
:param kwargs: 合法的参数有以下三个 :param kwargs: 合法的参数有以下三个
(1) is_input: bool, 如果为True则将`new_field_name`的field设置为input
(2) is_target: bool, 如果为True则将`new_field_name`的field设置为target
(3) ignore_type: bool, 如果为True则将`new_field_name`的field的ignore_type设置为true, 忽略其类型

1. is_input: bool, 如果为True则将`new_field_name`的field设置为input

2. is_target: bool, 如果为True则将`new_field_name`的field设置为target

3. ignore_type: bool, 如果为True则将`new_field_name`的field的ignore_type设置为true, 忽略其类型
:return: List[], 里面的元素为func的返回值,所以list长度为DataSet的长度 :return: List[], 里面的元素为func的返回值,所以list长度为DataSet的长度
""" """
assert len(self)!=0, "Null DataSet cannot use apply()." assert len(self)!=0, "Null DataSet cannot use apply()."
@@ -396,10 +428,10 @@ class DataSet(object):
def drop(self, func, inplace=True): def drop(self, func, inplace=True):
"""func接受一个instance,返回bool值,返回值为True时,该instance会被删除。 """func接受一个instance,返回bool值,返回值为True时,该instance会被删除。


:param func: Callable, 接受一个instance作为参数,返回bool值。为True时删除该instance
:param inplace: bool, 是否在当前DataSet中直接删除instance。如果为False,返回值为一个删除了相应instance的新的DataSet
:param callable func: 接受一个instance作为参数,返回bool值。为True时删除该instance
:param bool inplace: 是否在当前DataSet中直接删除instance。如果为False,返回值为一个删除了相应instance的新的DataSet


:return: DataSet.
:return: DataSet
""" """
if inplace: if inplace:
results = [ins for ins in self._inner_iter() if not func(ins)] results = [ins for ins in self._inner_iter() if not func(ins)]
@@ -408,16 +440,16 @@ class DataSet(object):
return self return self
else: else:
results = [ins for ins in self if not func(ins)] results = [ins for ins in self if not func(ins)]
data = DataSet(results)
dataset = DataSet(results)
for field_name, field in self.field_arrays.items(): for field_name, field in self.field_arrays.items():
data.field_arrays[field_name].to(field)
return data
dataset.field_arrays[field_name].to(field)
return dataset


def split(self, ratio): def split(self, ratio):
"""将DataSet按照ratio的比例拆分,返回两个DataSet """将DataSet按照ratio的比例拆分,返回两个DataSet


:param ratio: float, 0<ratio<1, 返回的第一个DataSet拥有ratio这么多数据,第二个DataSet拥有(1-ratio)这么多数据
:return (DataSet, DataSet)
:param float ratio: 0<ratio<1, 返回的第一个DataSet拥有ratio这么多数据,第二个DataSet拥有(1-ratio)这么多数据
:return: [DataSet, DataSet]
""" """
assert isinstance(ratio, float) assert isinstance(ratio, float)
assert 0 < ratio < 1 assert 0 < ratio < 1
@@ -480,7 +512,7 @@ class DataSet(object):
def save(self, path): def save(self, path):
"""保存DataSet. """保存DataSet.


:param path: str, 将DataSet存在哪个路径
:param str path: 将DataSet存在哪个路径
""" """
with open(path, 'wb') as f: with open(path, 'wb') as f:
pickle.dump(self, f) pickle.dump(self, f)
@@ -489,8 +521,8 @@ class DataSet(object):
def load(path): def load(path):
"""从保存的DataSet pickle路径中读取DataSet """从保存的DataSet pickle路径中读取DataSet


:param path: str, 读取路径
:return DataSet:
:param str path: 从哪里读取DataSet
:return: DataSet
""" """
with open(path, 'rb') as f: with open(path, 'rb') as f:
d = pickle.load(f) d = pickle.load(f)


+ 12
- 8
fastNLP/core/fieldarray.py View File

@@ -1,3 +1,6 @@



import numpy as np import numpy as np
from copy import deepcopy from copy import deepcopy


@@ -10,13 +13,14 @@ class FieldArray(object):
:param list content: a list of int, float, str or np.ndarray, or a list of list of one, or a np.ndarray. :param list content: a list of int, float, str or np.ndarray, or a list of list of one, or a np.ndarray.
:param bool is_target: If True, this FieldArray is used to compute loss. :param bool is_target: If True, this FieldArray is used to compute loss.
:param bool is_input: If True, this FieldArray is used as the model input. :param bool is_input: If True, this FieldArray is used as the model input.
:param PadderBase padder: PadderBase类型。赋值给fieldarray的padder的对象会被deepcopy一份,需要修改padder参数必须通过
:param Padder padder: Padder类型。赋值给fieldarray的padder的对象会被deepcopy一份,需要修改padder参数必须通过
fieldarray.set_pad_val()。 fieldarray.set_pad_val()。
默认为None,(1)如果某个field是scalar,则不进行任何padding;(2)如果为一维list, 且fieldarray的dtype为float或int类型 默认为None,(1)如果某个field是scalar,则不进行任何padding;(2)如果为一维list, 且fieldarray的dtype为float或int类型
则会进行padding;(3)其它情况不进行padder。 则会进行padding;(3)其它情况不进行padder。
假设需要对English word中character进行padding,则需要使用其他的padder。 假设需要对English word中character进行padding,则需要使用其他的padder。
或ignore_type为True但是需要进行padding。 或ignore_type为True但是需要进行padding。
:param bool ignore_type: whether to ignore type. If True, no type detection will rise for this FieldArray. (default: False)
:param bool ignore_type: whether to ignore type. If True, no type detection is performed for this FieldArray.
(default: False)
""" """


def __init__(self, name, content, is_target=None, is_input=None, padder=None, ignore_type=False): def __init__(self, name, content, is_target=None, is_input=None, padder=None, ignore_type=False):
@@ -59,7 +63,7 @@ class FieldArray(object):
if padder is None: if padder is None:
padder = AutoPadder(pad_val=0) padder = AutoPadder(pad_val=0)
else: else:
assert isinstance(padder, PadderBase), "padder must be of type PadderBase."
assert isinstance(padder, Padder), "padder must be of type Padder."
padder = deepcopy(padder) padder = deepcopy(padder)
self.set_padder(padder) self.set_padder(padder)
self.ignore_type = ignore_type self.ignore_type = ignore_type
@@ -272,11 +276,11 @@ class FieldArray(object):
""" """
设置padder,在这个field进行pad的时候用这个padder进行pad,如果为None则不进行pad。 设置padder,在这个field进行pad的时候用这个padder进行pad,如果为None则不进行pad。


:param padder: (None, PadderBase). 设置为None即删除padder.
:param padder: (None, Padder). 设置为None即删除padder.
:return: :return:
""" """
if padder is not None: if padder is not None:
assert isinstance(padder, PadderBase), "padder must be of type PadderBase."
assert isinstance(padder, Padder), "padder must be of type Padder."
self.padder = deepcopy(padder) self.padder = deepcopy(padder)
else: else:
self.padder = None self.padder = None
@@ -323,7 +327,7 @@ def is_iterable(content):
return True return True




class PadderBase:
class Padder:
""" """
所有padder都需要继承这个类,并覆盖__call__()方法。 所有padder都需要继承这个类,并覆盖__call__()方法。
用于对batch进行padding操作。传入的element是inplace的,即直接修改element可能导致数据变化,建议inplace修改之前deepcopy一份。 用于对batch进行padding操作。传入的element是inplace的,即直接修改element可能导致数据变化,建议inplace修改之前deepcopy一份。
@@ -378,7 +382,7 @@ class PadderBase:
raise NotImplementedError raise NotImplementedError




class AutoPadder(PadderBase):
class AutoPadder(Padder):
""" """
根据contents的数据自动判定是否需要做padding。 根据contents的数据自动判定是否需要做padding。


@@ -428,7 +432,7 @@ class AutoPadder(PadderBase):
return array return array




class EngChar2DPadder(PadderBase):
class EngChar2DPadder(Padder):
""" """
用于为英语执行character级别的2D padding操作。对应的field内容应该类似[['T', 'h', 'i', 's'], ['a'], ['d', 'e', 'm', 'o']], 用于为英语执行character级别的2D padding操作。对应的field内容应该类似[['T', 'h', 'i', 's'], ['a'], ['d', 'e', 'm', 'o']],
但这个Padder只能处理index为int的情况。 但这个Padder只能处理index为int的情况。


+ 5
- 0
test/core/test_dataset.py View File

@@ -163,6 +163,11 @@ class TestDataSetMethods(unittest.TestCase):
ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10}) ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
self.assertEqual(ds.get_target_name(), [_ for _ in ds.field_arrays if ds.field_arrays[_].is_target]) self.assertEqual(ds.get_target_name(), [_ for _ in ds.field_arrays if ds.field_arrays[_].is_target])


def test_split(self):
ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
d1, d2 = ds.split(0.1)


def test_apply2(self): def test_apply2(self):
def split_sent(ins): def split_sent(ins):
return ins['raw_sentence'].split() return ins['raw_sentence'].split()


Loading…
Cancel
Save