@@ -1,6 +1,20 @@ | |||||
""" | |||||
core 模块里实现了 fastNLP 的核心框架,常用的组件都可以从 fastNLP 包中直接 import。当然你也同样可以从 core 模块的子模块中 import, | |||||
例如 Batch 组件有两种 import 的方式:: | |||||
# 直接从 fastNLP 中 import | |||||
from fastNLP import Batch | |||||
# 从 core 模块的子模块 batch 中 import | |||||
from fastNLP.core.batch import Batch | |||||
对于常用的功能,你只需要在 :doc:`fastNLP` 中查看即可。如果想了解各个子模块的分工,你可以阅读以下文档: | |||||
""" | |||||
from .batch import Batch | from .batch import Batch | ||||
from .dataset import DataSet | from .dataset import DataSet | ||||
from .fieldarray import FieldArray | |||||
from .field import FieldArray, Padder, AutoPadder, EngChar2DPadder | |||||
from .instance import Instance | from .instance import Instance | ||||
from .losses import LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, LossInForward | from .losses import LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, LossInForward | ||||
from .metrics import AccuracyMetric | from .metrics import AccuracyMetric | ||||
@@ -1,24 +1,34 @@ | |||||
""" | |||||
batch 模块实现了 fastNLP 所需的 Batch 类。 | |||||
""" | |||||
__all__ = ["Batch"] | |||||
import numpy as np | import numpy as np | ||||
import torch | import torch | ||||
import atexit | import atexit | ||||
from fastNLP.core.sampler import RandomSampler, Sampler | |||||
from .sampler import RandomSampler, Sampler | |||||
import torch.multiprocessing as mp | import torch.multiprocessing as mp | ||||
_python_is_exit = False | _python_is_exit = False | ||||
def _set_python_is_exit(): | def _set_python_is_exit(): | ||||
global _python_is_exit | global _python_is_exit | ||||
_python_is_exit = True | _python_is_exit = True | ||||
atexit.register(_set_python_is_exit) | atexit.register(_set_python_is_exit) | ||||
class Batch(object): | class Batch(object): | ||||
""" | """ | ||||
.. _Batch: | |||||
别名::class:`fastNLP.Batch` :class:`fastNLP.core.batch.Batch` | |||||
Batch 用于从 `DataSet` 中按一定的顺序, 依次按 ``batch_size`` 的大小将数据取出. | Batch 用于从 `DataSet` 中按一定的顺序, 依次按 ``batch_size`` 的大小将数据取出. | ||||
组成 `x` 和 `y` | 组成 `x` 和 `y` | ||||
Example:: | Example:: | ||||
batch = Batch(data_set, batch_size=16, sampler=SequentialSampler()) | batch = Batch(data_set, batch_size=16, sampler=SequentialSampler()) | ||||
@@ -26,16 +36,19 @@ class Batch(object): | |||||
for batch_x, batch_y in batch: | for batch_x, batch_y in batch: | ||||
# do stuff ... | # do stuff ... | ||||
:param DataSet dataset: `DataSet` 对象, 数据集 | |||||
:param dataset: :class:`~fastNLP.DataSet` 对象, 数据集 | |||||
:param int batch_size: 取出的batch大小 | :param int batch_size: 取出的batch大小 | ||||
:param Sampler sampler: 规定使用的 Sample 方式. 若为 ``None`` , 使用 RandomSampler. | |||||
:param sampler: 规定使用的 :class:`~fastNLP.Sampler` 方式. 若为 ``None`` , 使用 :class:`~fastNLP.RandomSampler`. | |||||
Default: ``None`` | Default: ``None`` | ||||
:param bool as_numpy: 若为 ``True`` , 输出batch为 numpy.array. 否则为 torch.Tensor. | |||||
:param bool as_numpy: 若为 ``True`` , 输出batch为 numpy.array. 否则为 :class:`torch.Tensor`. | |||||
Default: ``False`` | Default: ``False`` | ||||
:param bool prefetch: 若为 ``True`` 使用多进程预先取出下一batch. | :param bool prefetch: 若为 ``True`` 使用多进程预先取出下一batch. | ||||
Default: ``False`` | Default: ``False`` | ||||
""" | """ | ||||
def __init__(self, dataset, batch_size, sampler=None, as_numpy=False, prefetch=False): | def __init__(self, dataset, batch_size, sampler=None, as_numpy=False, prefetch=False): | ||||
self.dataset = dataset | self.dataset = dataset | ||||
self.batch_size = batch_size | self.batch_size = batch_size | ||||
@@ -49,17 +62,17 @@ class Batch(object): | |||||
self.cur_batch_indices = None | self.cur_batch_indices = None | ||||
self.prefetch = prefetch | self.prefetch = prefetch | ||||
self.lengths = 0 | self.lengths = 0 | ||||
def _fetch_one(self): | |||||
def fetch_one(self): | |||||
if self.curidx >= len(self.idx_list): | if self.curidx >= len(self.idx_list): | ||||
return None | return None | ||||
else: | else: | ||||
endidx = min(self.curidx + self.batch_size, len(self.idx_list)) | endidx = min(self.curidx + self.batch_size, len(self.idx_list)) | ||||
batch_x, batch_y = {}, {} | batch_x, batch_y = {}, {} | ||||
indices = self.idx_list[self.curidx:endidx] | indices = self.idx_list[self.curidx:endidx] | ||||
self.cur_batch_indices = indices | self.cur_batch_indices = indices | ||||
for field_name, field in self.dataset.get_all_fields().items(): | for field_name, field in self.dataset.get_all_fields().items(): | ||||
if field.is_target or field.is_input: | if field.is_target or field.is_input: | ||||
batch = field.get(indices) | batch = field.get(indices) | ||||
@@ -69,10 +82,10 @@ class Batch(object): | |||||
batch_y[field_name] = batch | batch_y[field_name] = batch | ||||
if field.is_input: | if field.is_input: | ||||
batch_x[field_name] = batch | batch_x[field_name] = batch | ||||
self.curidx = endidx | self.curidx = endidx | ||||
return batch_x, batch_y | return batch_x, batch_y | ||||
def __iter__(self): | def __iter__(self): | ||||
""" | """ | ||||
Iterate on dataset, fetch batch data. The fetch process doesn't block the iteration process | Iterate on dataset, fetch batch data. The fetch process doesn't block the iteration process | ||||
@@ -80,25 +93,28 @@ class Batch(object): | |||||
""" | """ | ||||
if self.prefetch: | if self.prefetch: | ||||
return _run_batch_iter(self) | return _run_batch_iter(self) | ||||
def batch_iter(): | def batch_iter(): | ||||
self._init_iter() | |||||
self.init_iter() | |||||
while 1: | while 1: | ||||
res = self._fetch_one() | |||||
res = self.fetch_one() | |||||
if res is None: | if res is None: | ||||
break | break | ||||
yield res | yield res | ||||
return batch_iter() | return batch_iter() | ||||
def _init_iter(self): | |||||
def init_iter(self): | |||||
self.idx_list = self.sampler(self.dataset) | self.idx_list = self.sampler(self.dataset) | ||||
self.curidx = 0 | self.curidx = 0 | ||||
self.lengths = self.dataset.get_length() | self.lengths = self.dataset.get_length() | ||||
def __len__(self): | def __len__(self): | ||||
return self.num_batches | return self.num_batches | ||||
def get_batch_indices(self): | def get_batch_indices(self): | ||||
"""取得当前batch在DataSet中所在的index下标序列 | |||||
""" | |||||
取得当前batch在DataSet中所在的index下标序列 | |||||
:return list(int) indexes: 下标序列 | :return list(int) indexes: 下标序列 | ||||
""" | """ | ||||
@@ -118,16 +134,16 @@ def _to_tensor(batch, dtype): | |||||
def _run_fetch(batch, q): | def _run_fetch(batch, q): | ||||
global _python_is_exit | global _python_is_exit | ||||
batch._init_iter() | |||||
batch.init_iter() | |||||
# print('start fetch') | # print('start fetch') | ||||
while 1: | while 1: | ||||
res = batch._fetch_one() | |||||
res = batch.fetch_one() | |||||
# print('fetch one') | # print('fetch one') | ||||
while 1: | while 1: | ||||
try: | try: | ||||
q.put(res, timeout=3) | q.put(res, timeout=3) | ||||
break | break | ||||
except Exception as e: | |||||
except: | |||||
if _python_is_exit: | if _python_is_exit: | ||||
return | return | ||||
if res is None: | if res is None: | ||||
@@ -159,4 +175,3 @@ def _run_batch_iter(batch): | |||||
fetch_p.terminate() | fetch_p.terminate() | ||||
fetch_p.join() | fetch_p.join() | ||||
# print('iter done') | # print('iter done') | ||||
@@ -1,7 +1,6 @@ | |||||
""" | """ | ||||
FieldArray是 DataSet_ 中一列的存储方式,原理部分请参考 DataSet_ 处 | |||||
.. _FieldArray: | |||||
field模块实现了 FieldArray 和若干 Padder。 FieldArray 是 :class:`~fastNLP.DataSet` 中一列的存储方式, | |||||
原理部分请参考 :doc:`fastNLP.core.dataset` | |||||
""" | """ | ||||
@@ -11,19 +10,21 @@ from copy import deepcopy | |||||
class FieldArray(object): | class FieldArray(object): | ||||
""" | |||||
别名::class:`fastNLP.FieldArray` :class:`fastNLP.core.field.FieldArray` | |||||
FieldArray 是用于保存 :class:`~fastNLP.DataSet` 中一个field的类型。 | |||||
:param str name: FieldArray的名称 | |||||
:param list,numpy.ndarray content: 列表的元素可以为list,int,float, | |||||
:param bool is_target: 这个field是否是一个target field。 | |||||
:param bool is_input: 这个field是否是一个input field。 | |||||
:param padder: :class:`~fastNLP.Padder` 类型。赋值给fieldarray的padder的对象会被deepcopy一份,需要修改padder参数必须通过 | |||||
fieldarray.set_pad_val()。默认为None,即使用 :class:`~fastNLP.AutoPadder` 。 | |||||
:param bool ignore_type: 是否忽略该field的type,一般如果这个field不需要转为torch.FloatTensor或torch.LongTensor, | |||||
就可以设置为True。具体意义请参考 :class:`~fastNLP.DataSet` 。 | |||||
""" | |||||
def __init__(self, name, content, is_target=None, is_input=None, padder=None, ignore_type=False): | def __init__(self, name, content, is_target=None, is_input=None, padder=None, ignore_type=False): | ||||
"""FieldArray是用于保存 DataSet_ 中一个field的实体。 | |||||
:param str name: FieldArray的名称 | |||||
:param list,numpy.ndarray content: 列表的元素可以为list,int,float, | |||||
:param bool is_target: 这个field是否是一个target field。 | |||||
:param bool is_input: 这个field是否是一个input field。 | |||||
:param Padder padder: PadderBase类型。赋值给fieldarray的padder的对象会被deepcopy一份,需要修改padder参数必须通过 | |||||
fieldarray.set_pad_val()。默认为None,即使用 AutoPadder_ 。 | |||||
:param bool ignore_type: 是否忽略该field的type,一般如果这个field不需要转为torch.FloatTensor或torch.LongTensor, 就 | |||||
可以设置为True。具体意义请参考 DataSet_ 。 | |||||
""" | |||||
self.name = name | self.name = name | ||||
if isinstance(content, list): | if isinstance(content, list): | ||||
# 如果DataSet使用dict初始化, content 可能是二维list/二维array/三维list | # 如果DataSet使用dict初始化, content 可能是二维list/二维array/三维list | ||||
@@ -87,14 +88,15 @@ class FieldArray(object): | |||||
@is_target.setter | @is_target.setter | ||||
def is_target(self, value): | def is_target(self, value): | ||||
""" | """ | ||||
当 field_array.is_target = True / False 时被调用 | |||||
当 field_array.is_target = True / False 时被调用 | |||||
""" | """ | ||||
if value is True: | if value is True: | ||||
self._set_dtype() | self._set_dtype() | ||||
self._is_target = value | self._is_target = value | ||||
def _type_detection(self, content): | def _type_detection(self, content): | ||||
"""当该field被设置为is_input或者is_target时被调用 | |||||
""" | |||||
当该field被设置为is_input或者is_target时被调用 | |||||
""" | """ | ||||
if len(content) == 0: | if len(content) == 0: | ||||
@@ -238,11 +240,12 @@ class FieldArray(object): | |||||
self.content[idx] = val | self.content[idx] = val | ||||
def get(self, indices, pad=True): | def get(self, indices, pad=True): | ||||
"""根据给定的indices返回内容 | |||||
""" | |||||
根据给定的indices返回内容 | |||||
:param int,list(int) indices:, 获取indices对应的内容。 | |||||
:param bool pad: , 是否对返回的结果进行padding。仅对indices为List[int]时有效 | |||||
:return: (single, List) | |||||
:param int,List[int] indices: 获取indices对应的内容。 | |||||
:param bool pad: 是否对返回的结果进行padding。仅对indices为List[int]时有效 | |||||
:return: 根据给定的indices返回的内容,可能是单个值或List | |||||
""" | """ | ||||
if isinstance(indices, int): | if isinstance(indices, int): | ||||
return self.content[indices] | return self.content[indices] | ||||
@@ -259,8 +262,7 @@ class FieldArray(object): | |||||
""" | """ | ||||
设置padder,在这个field进行pad的时候用这个padder进行pad,如果为None则不进行pad。 | 设置padder,在这个field进行pad的时候用这个padder进行pad,如果为None则不进行pad。 | ||||
:param None,Padder padder:. 设置为None即删除padder。 | |||||
:return: | |||||
:param padder: :class:`~fastNLP.Padder` 类型,设置为None即删除padder。 | |||||
""" | """ | ||||
if padder is not None: | if padder is not None: | ||||
assert isinstance(padder, Padder), "padder must be of type Padder." | assert isinstance(padder, Padder), "padder must be of type Padder." | ||||
@@ -269,10 +271,10 @@ class FieldArray(object): | |||||
self.padder = None | self.padder = None | ||||
def set_pad_val(self, pad_val): | def set_pad_val(self, pad_val): | ||||
"""修改padder的pad_val. | |||||
""" | |||||
修改padder的pad_val. | |||||
:param int pad_val: 该field的pad值设置为该值。 | :param int pad_val: 该field的pad值设置为该值。 | ||||
:return: | |||||
""" | """ | ||||
if self.padder is not None: | if self.padder is not None: | ||||
self.padder.set_pad_val(pad_val) | self.padder.set_pad_val(pad_val) | ||||
@@ -280,7 +282,8 @@ class FieldArray(object): | |||||
def __len__(self): | def __len__(self): | ||||
"""Returns the size of FieldArray. | |||||
""" | |||||
Returns the size of FieldArray. | |||||
:return int length: | :return int length: | ||||
""" | """ | ||||
@@ -288,10 +291,11 @@ class FieldArray(object): | |||||
def to(self, other): | def to(self, other): | ||||
""" | """ | ||||
将other的属性复制给本FieldArray(other必须为FieldArray类型).属性包括 is_input, is_target, padder, ignore_type | |||||
将other的属性复制给本FieldArray(other必须为FieldArray类型). | |||||
属性包括 is_input, is_target, padder, ignore_type | |||||
:param FieldArray other: 从哪个field拷贝属性 | |||||
:return: FieldArray | |||||
:param other: :class:`~fastNLP.FieldArray` 从哪个field拷贝属性 | |||||
:return: :class:`~fastNLP.FieldArray` | |||||
""" | """ | ||||
assert isinstance(other, FieldArray), "Only support FieldArray type, not {}.".format(type(other)) | assert isinstance(other, FieldArray), "Only support FieldArray type, not {}.".format(type(other)) | ||||
@@ -312,10 +316,20 @@ def _is_iterable(content): | |||||
class Padder: | class Padder: | ||||
""" | """ | ||||
.. _Padder: | |||||
别名::class:`fastNLP.Padder` :class:`fastNLP.core.field.Padder` | |||||
所有padder都需要继承这个类,并覆盖__call__()方法。 | |||||
用于对batch进行padding操作。传入的element是inplace的,即直接修改element可能导致数据变化,建议inplace修改之前deepcopy一份。 | |||||
所有padder都需要继承这个类,并覆盖__call__方法。 | |||||
用于对batch进行padding操作。传入的element是inplace的,即直接修改element可能导致数据变化,建议inplace修改之前deepcopy一份。 | |||||
.. py:function:: __call__(self, contents, field_name, field_ele_dtype): | |||||
传入的是List内容。假设有以下的DataSet。 | |||||
:param list(Any) contents: 传入的element是inplace的,即直接修改element可能导致数据变化,建议inplace修改之前 | |||||
deepcopy一份。 | |||||
:param str field_name: field的名称。 | |||||
:param np.int64,np.float64,np.str,None field_ele_dtype: 该field的内层元素的类型。如果该field的ignore_type为True,则这个值为None。 | |||||
:return: np.array([padded_element]) | |||||
""" | """ | ||||
def __init__(self, pad_val=0, **kwargs): | def __init__(self, pad_val=0, **kwargs): | ||||
@@ -368,7 +382,7 @@ class Padder: | |||||
class AutoPadder(Padder): | class AutoPadder(Padder): | ||||
""" | """ | ||||
.. _AutoPadder: | |||||
别名::class:`fastNLP.AutoPadder` :class:`fastNLP.core.field.AutoPadder` | |||||
根据contents的数据自动判定是否需要做padding。 | 根据contents的数据自动判定是否需要做padding。 | ||||
@@ -420,7 +434,7 @@ class AutoPadder(Padder): | |||||
class EngChar2DPadder(Padder): | class EngChar2DPadder(Padder): | ||||
""" | """ | ||||
.. _EngChar2DPadder: | |||||
别名::class:`fastNLP.EngChar2DPadder` :class:`fastNLP.core.field.EngChar2DPadder` | |||||
用于为英语执行character级别的2D padding操作。对应的field内容应该类似[['T', 'h', 'i', 's'], ['a'], ['d', 'e', 'm', 'o']], | 用于为英语执行character级别的2D padding操作。对应的field内容应该类似[['T', 'h', 'i', 's'], ['a'], ['d', 'e', 'm', 'o']], | ||||
但这个Padder只能处理index为int的情况。 | 但这个Padder只能处理index为int的情况。 |
@@ -1,47 +1,50 @@ | |||||
""" | """ | ||||
Instance文档 | |||||
.. _Instance: | |||||
Instance是fastNLP中对应于一个sample的类。一个sample可以认为是fastNLP中的一个Instance对象。一个具像化的表示类似与 DataSet_ | |||||
出那个表中所展示的一行。 | |||||
instance 模块实现了 Instance 类,它在fastNLP中对应一个sample。一个sample可以认为是一个Instance类型的对象。 | |||||
便于理解的例子可以参考文档 :doc:`fastNLP.core.dataset` 中的表格 | |||||
""" | """ | ||||
__all__ = ["Instance"] | |||||
class Instance(object): | class Instance(object): | ||||
""" | |||||
别名::class:`fastNLP.Instance` :class:`fastNLP.core.instance.Instance` | |||||
Instance是fastNLP中对应一个sample的类。每个sample在fastNLP中是一个Instance对象。 | |||||
Instance一般与 :class:`~fastNLP.DataSet` 一起使用, Instance的初始化如下面的Example所示 | |||||
Example:: | |||||
>>> from fastNLP import Instance | |||||
>>> ins = Instance(field_1=[1, 1, 1], field_2=[2, 2, 2]) | |||||
>>> ins["field_1"] | |||||
[1, 1, 1] | |||||
>>> ins.add_field("field_3", [3, 3, 3]) | |||||
>>> ins = Instance(**{'x1': 1, 'x2':np.zeros((3, 4))}) | |||||
""" | |||||
def __init__(self, **fields): | def __init__(self, **fields): | ||||
"""Instance的初始化如下面的Example所示 | |||||
Example:: | |||||
ins = Instance(field_1=[1, 1, 1], field_2=[2, 2, 2]) | |||||
ins["field_1"] | |||||
>>[1, 1, 1] | |||||
ins.add_field("field_3", [3, 3, 3]) | |||||
ins = Instance(**{'x1': 1, 'x2':np.zeros((3, 4))}) | |||||
""" | |||||
self.fields = fields | self.fields = fields | ||||
def add_field(self, field_name, field): | def add_field(self, field_name, field): | ||||
"""向Instance中增加一个field | |||||
""" | |||||
向Instance中增加一个field | |||||
:param str field_name: 新增field的名称 | :param str field_name: 新增field的名称 | ||||
:param Any field: 新增field的内容 | :param Any field: 新增field的内容 | ||||
""" | """ | ||||
self.fields[field_name] = field | self.fields[field_name] = field | ||||
def __getitem__(self, name): | def __getitem__(self, name): | ||||
if name in self.fields: | if name in self.fields: | ||||
return self.fields[name] | return self.fields[name] | ||||
else: | else: | ||||
raise KeyError("{} not found".format(name)) | raise KeyError("{} not found".format(name)) | ||||
def __setitem__(self, name, field): | def __setitem__(self, name, field): | ||||
return self.add_field(name, field) | return self.add_field(name, field) | ||||
def __repr__(self): | def __repr__(self): | ||||
s = '\'' | s = '\'' | ||||
return "{" + ",\n".join( | return "{" + ",\n".join( | ||||
@@ -34,6 +34,8 @@ def _check_build_status(func): | |||||
class Vocabulary(object): | class Vocabulary(object): | ||||
""" | """ | ||||
别名::class:`fastNLP.Vocabulary` :class:`fastNLP.core.vocabulary.Vocabulary` | |||||
用于构建, 存储和使用 `str` 到 `int` 的一一映射 | 用于构建, 存储和使用 `str` 到 `int` 的一一映射 | ||||
Example:: | Example:: | ||||
@@ -98,7 +100,7 @@ class Vocabulary(object): | |||||
""" | """ | ||||
依次增加序列中词在词典中的出现频率 | 依次增加序列中词在词典中的出现频率 | ||||
:param list(str) word_lst: 词的序列 | |||||
:param list[str] word_lst: 词的序列 | |||||
""" | """ | ||||
self.update(word_lst) | self.update(word_lst) | ||||
@@ -185,12 +187,11 @@ class Vocabulary(object): | |||||
# remember to use `field_name` | # remember to use `field_name` | ||||
vocab.index_dataset(train_data, dev_data, test_data, field_name='words') | vocab.index_dataset(train_data, dev_data, test_data, field_name='words') | ||||
:param DataSet datasets: 需要转index的 DataSet, 支持一个或多个 | |||||
:param datasets: 需要转index的 :class:`~fastNLP.DataSet` , 支持一个或多个(list) | |||||
:param str field_name: 需要转index的field, 若有多个 DataSet, 每个DataSet都必须有此 field. | :param str field_name: 需要转index的field, 若有多个 DataSet, 每个DataSet都必须有此 field. | ||||
目前仅支持 ``str`` , ``list(str)`` , ``list(list(str))`` | 目前仅支持 ``str`` , ``list(str)`` , ``list(list(str))`` | ||||
:param str new_field_name: 保存结果的field_name. 若为 ``None`` , 将覆盖原field. | :param str new_field_name: 保存结果的field_name. 若为 ``None`` , 将覆盖原field. | ||||
Default: ``None`` | Default: ``None`` | ||||
:return self: | |||||
""" | """ | ||||
def index_instance(ins): | def index_instance(ins): | ||||
""" | """ | ||||
@@ -230,7 +231,7 @@ class Vocabulary(object): | |||||
# remember to use `field_name` | # remember to use `field_name` | ||||
vocab.from_dataset(train_data1, train_data2, field_name='words') | vocab.from_dataset(train_data1, train_data2, field_name='words') | ||||
:param DataSet datasets: 需要转index的 DataSet, 支持一个或多个. | |||||
:param datasets: 需要转index的 :class:`~fastNLP.DataSet` , 支持一个或多个(list) | |||||
:param field_name: 可为 ``str`` 或 ``list(str)`` . | :param field_name: 可为 ``str`` 或 ``list(str)`` . | ||||
构建词典所使用的 field(s), 支持一个或多个field | 构建词典所使用的 field(s), 支持一个或多个field | ||||
若有多个 DataSet, 每个DataSet都必须有这些field. | 若有多个 DataSet, 每个DataSet都必须有这些field. | ||||