2. FieldArray默认使用AutoPadder, AutoPadder的行为与之前不使用padder是一致的 3. 为了解决二维padding的问题,引入了EngChar2DPadder用于对character进行padding 4. 增加一份padding的tutorial。tags/v0.3.1^2
@@ -48,7 +48,7 @@ class Batch(object): | |||||
for field_name, field in self.dataset.get_all_fields().items(): | for field_name, field in self.dataset.get_all_fields().items(): | ||||
if field.is_target or field.is_input: | if field.is_target or field.is_input: | ||||
batch = field.get(indices) | batch = field.get(indices) | ||||
if not self.as_numpy: | |||||
if not self.as_numpy and field.padder is not None: | |||||
batch = to_tensor(batch, field.dtype) | batch = to_tensor(batch, field.dtype) | ||||
if field.is_target: | if field.is_target: | ||||
batch_y[field_name] = batch | batch_y[field_name] = batch | ||||
@@ -67,8 +67,11 @@ class Batch(object): | |||||
def to_tensor(batch, dtype):
    """Convert a padded batch to a torch tensor when its dtype allows it.

    :param batch: the batch content, typically an np.array produced by a padder.
    :param dtype: the python/numpy element type recorded on the FieldArray.
    :return: a ``torch.LongTensor`` for integer dtypes, a ``torch.FloatTensor``
        for float dtypes, or ``batch`` unchanged when no conversion applies
        (e.g. str content) or the conversion fails (e.g. ragged object arrays).
    """
    try:
        if dtype in (int, np.int8, np.int16, np.int32, np.int64):
            batch = torch.LongTensor(batch)
        if dtype in (float, np.float32, np.float64):
            batch = torch.FloatTensor(batch)
    # Was a bare `except: pass`, which also swallowed KeyboardInterrupt /
    # SystemExit. Only conversion failures should fall back to raw content.
    except (TypeError, ValueError, RuntimeError):
        pass
    return batch
@@ -3,6 +3,7 @@ import _pickle as pickle | |||||
import numpy as np | import numpy as np | ||||
from fastNLP.core.fieldarray import FieldArray | from fastNLP.core.fieldarray import FieldArray | ||||
from fastNLP.core.fieldarray import AutoPadder | |||||
from fastNLP.core.instance import Instance | from fastNLP.core.instance import Instance | ||||
from fastNLP.core.utils import get_func_signature | from fastNLP.core.utils import get_func_signature | ||||
from fastNLP.io.base_loader import DataLoaderRegister | from fastNLP.io.base_loader import DataLoaderRegister | ||||
@@ -88,11 +89,8 @@ class DataSet(object): | |||||
raise RuntimeError(f"Start index {idx.start} out of range 0-{len(self)-1}") | raise RuntimeError(f"Start index {idx.start} out of range 0-{len(self)-1}") | ||||
data_set = DataSet() | data_set = DataSet() | ||||
for field in self.field_arrays.values(): | for field in self.field_arrays.values(): | ||||
data_set.add_field(name=field.name, | |||||
fields=field.content[idx], | |||||
padding_val=field.padding_val, | |||||
is_input=field.is_input, | |||||
is_target=field.is_target) | |||||
data_set.add_field(name=field.name, fields=field.content[idx], padder=field.padder, | |||||
is_input=field.is_input, is_target=field.is_target) | |||||
return data_set | return data_set | ||||
else: | else: | ||||
raise KeyError("Unrecognized type {} for idx in __getitem__ method".format(type(idx))) | raise KeyError("Unrecognized type {} for idx in __getitem__ method".format(type(idx))) | ||||
@@ -151,12 +149,12 @@ class DataSet(object): | |||||
assert name in self.field_arrays | assert name in self.field_arrays | ||||
self.field_arrays[name].append(field) | self.field_arrays[name].append(field) | ||||
def add_field(self, name, fields, padding_val=0, is_input=False, is_target=False): | |||||
def add_field(self, name, fields, padder=AutoPadder(pad_val=0), is_input=False, is_target=False): | |||||
"""Add a new field to the DataSet. | """Add a new field to the DataSet. | ||||
:param str name: the name of the field. | :param str name: the name of the field. | ||||
:param fields: a list of int, float, or other objects. | :param fields: a list of int, float, or other objects. | ||||
:param int padding_val: integer for padding. | |||||
:param int padder: PadBase对象,如何对该Field进行padding。大部分情况使用默认值即可 | |||||
:param bool is_input: whether this field is model input. | :param bool is_input: whether this field is model input. | ||||
:param bool is_target: whether this field is label or target. | :param bool is_target: whether this field is label or target. | ||||
""" | """ | ||||
@@ -164,8 +162,8 @@ class DataSet(object): | |||||
if len(self) != len(fields): | if len(self) != len(fields): | ||||
raise RuntimeError(f"The field to append must have the same size as dataset. " | raise RuntimeError(f"The field to append must have the same size as dataset. " | ||||
f"Dataset size {len(self)} != field size {len(fields)}") | f"Dataset size {len(self)} != field size {len(fields)}") | ||||
self.field_arrays[name] = FieldArray(name, fields, padding_val=padding_val, is_target=is_target, | |||||
is_input=is_input) | |||||
self.field_arrays[name] = FieldArray(name, fields, is_target=is_target, is_input=is_input, | |||||
padder=padder) | |||||
def delete_field(self, name): | def delete_field(self, name): | ||||
"""Delete a field based on the field name. | """Delete a field based on the field name. | ||||
@@ -229,6 +227,25 @@ class DataSet(object): | |||||
else: | else: | ||||
raise KeyError("{} is not a valid field name.".format(name)) | raise KeyError("{} is not a valid field name.".format(name)) | ||||
def set_padder(self, field_name, padder):
    """Assign the padder used when batching ``field_name``.

    :param field_name: str, the field whose padding behaviour is changed.
    :param padder: a PadderBase instance, or None to disable padding for
        this field entirely.
    :return: None
    """
    self.field_arrays[field_name].set_padder(padder)
def set_pad_val(self, field_name, pad_val):
    """Change the padding value one field's padder fills with.

    :param field_name: str, the field to modify.
    :param pad_val: int, the index the field's padder will use for padding.
    :return: None
    """
    self.field_arrays[field_name].set_pad_val(pad_val)
def get_input_name(self): | def get_input_name(self): | ||||
"""Get all field names with `is_input` as True. | """Get all field names with `is_input` as True. | ||||
@@ -270,12 +287,9 @@ class DataSet(object): | |||||
extra_param['is_input'] = old_field.is_input | extra_param['is_input'] = old_field.is_input | ||||
if 'is_target' not in extra_param: | if 'is_target' not in extra_param: | ||||
extra_param['is_target'] = old_field.is_target | extra_param['is_target'] = old_field.is_target | ||||
self.add_field(name=new_field_name, | |||||
fields=results, | |||||
padding_val=old_field.padding_val, | |||||
**extra_param) | |||||
self.add_field(name=new_field_name, fields=results) | |||||
else: | else: | ||||
self.add_field(name=new_field_name, fields=results, **extra_param) | |||||
self.add_field(name=new_field_name, fields=results) | |||||
else: | else: | ||||
return results | return results | ||||
@@ -314,8 +328,16 @@ class DataSet(object): | |||||
for field_name in self.field_arrays: | for field_name in self.field_arrays: | ||||
train_set.field_arrays[field_name].is_input = self.field_arrays[field_name].is_input | train_set.field_arrays[field_name].is_input = self.field_arrays[field_name].is_input | ||||
train_set.field_arrays[field_name].is_target = self.field_arrays[field_name].is_target | train_set.field_arrays[field_name].is_target = self.field_arrays[field_name].is_target | ||||
train_set.field_arrays[field_name].padder = self.field_arrays[field_name].padder | |||||
train_set.field_arrays[field_name].dtype = self.field_arrays[field_name].dtype | |||||
train_set.field_arrays[field_name].pytype = self.field_arrays[field_name].pytype | |||||
train_set.field_arrays[field_name].is_2d_list = self.field_arrays[field_name].is_2d_list | |||||
dev_set.field_arrays[field_name].is_input = self.field_arrays[field_name].is_input | dev_set.field_arrays[field_name].is_input = self.field_arrays[field_name].is_input | ||||
dev_set.field_arrays[field_name].is_target = self.field_arrays[field_name].is_target | dev_set.field_arrays[field_name].is_target = self.field_arrays[field_name].is_target | ||||
dev_set.field_arrays[field_name].padder = self.field_arrays[field_name].padder | |||||
dev_set.field_arrays[field_name].dtype = self.field_arrays[field_name].dtype | |||||
dev_set.field_arrays[field_name].pytype = self.field_arrays[field_name].pytype | |||||
dev_set.field_arrays[field_name].is_2d_list = self.field_arrays[field_name].is_2d_list | |||||
return train_set, dev_set | return train_set, dev_set | ||||
@@ -1,19 +1,105 @@ | |||||
import numpy as np | import numpy as np | ||||
class PadderBase:
    """Base class every padder must inherit from, overriding ``__call__``.

    A padder pads one batch worth of a single field's content. The elements
    handed to ``__call__`` are NOT copies — mutating them in place changes
    the underlying data, so deepcopy first if in-place edits are needed.
    """

    def __init__(self, pad_val=0, **kwargs):
        self.pad_val = pad_val

    def set_pad_val(self, pad_val):
        """Replace the value written into padded positions."""
        self.pad_val = pad_val

    def __call__(self, contents, field_name, field_ele_dtype):
        """Pad one batch of field content.

        Example: given a DataSet built from
        ``Instance(word='this is a demo', length=4, chars=[['t','h','i','s'], ...])``
        and ``Instance(word='another one', length=2, chars=[...])`` batched
        together (shown as str for clarity; in practice words/chars are
        usually indices), the ``word`` padder receives
        ``['this is a demo', 'another one']``, the ``length`` padder
        receives ``[4, 2]``, and the ``chars`` padder receives the two
        per-instance char lists — i.e. one entry per instance.

        :param contents: list with one element per instance. Elements are
            not copies; deepcopy before any in-place modification.
        :param field_name: str, name of the field — used in error messages.
        :param field_ele_dtype: np.int64, np.float64 or np.str; the dtype of
            the innermost elements, mostly used to decide whether to pad.
        :return: a list of padded elements or an np.array of them.
        """
        raise NotImplementedError
class AutoPadder(PadderBase):
    """Padder that decides automatically whether padding is needed.

    (1) If the innermost element dtype (see ``FieldArray.dtype``; e.g.
        np.str for ['This', 'is', ...], np.int64 for [[1, 2], ...]) is not
        np.int64/np.float64, no padding is performed.
    (2) For np.int64/np.float64 content:
        (2.1) scalar fields (e.g. sequence_length) are left as-is;
        (2.2) 1-d list fields are padded to the longest list in the batch.
        Deeper nesting (e.g. [[1, 2], [3, 4]] within one instance) is NOT
        padded here — use a dedicated padder such as EngChar2DPadder.
    """

    def __init__(self, pad_val=0):
        """
        :param pad_val: int, value written into padded positions.
        """
        super().__init__(pad_val=pad_val)

    def _is_two_dimension(self, contents):
        """Return True iff ``contents`` is exactly two levels deep.

        [[1, 2], [3]] -> True; [[[1, 2], [3, 4, 5]], [[4, 5]]] -> False.
        """
        first = contents[0]
        if not isinstance(first, (np.ndarray, list)):
            return False
        inner = first[0]
        return not isinstance(inner, (np.ndarray, list))

    def __call__(self, contents, field_name, field_ele_dtype):
        if not is_iterable(contents[0]):
            # One scalar per instance: just stack with the known dtype.
            return np.array(list(contents), dtype=field_ele_dtype)
        if field_ele_dtype in (np.int64, np.float64) and self._is_two_dimension(contents):
            batch_max = max(len(row) for row in contents)
            padded = np.full((len(contents), batch_max), self.pad_val, dtype=field_ele_dtype)
            for row_idx, row in enumerate(contents):
                padded[row_idx][:len(row)] = row
            return padded
        # str content, or nesting we refuse to guess about: hand back unpadded.
        return np.array(list(contents))
class FieldArray(object): | class FieldArray(object): | ||||
"""``FieldArray`` is the collection of ``Instance``s of the same field. | """``FieldArray`` is the collection of ``Instance``s of the same field. | ||||
It is the basic element of ``DataSet`` class. | It is the basic element of ``DataSet`` class. | ||||
:param str name: the name of the FieldArray | :param str name: the name of the FieldArray | ||||
:param list content: a list of int, float, str or np.ndarray, or a list of list of one, or a np.ndarray. | :param list content: a list of int, float, str or np.ndarray, or a list of list of one, or a np.ndarray. | ||||
:param int padding_val: the integer for padding. Default: 0. | |||||
:param bool is_target: If True, this FieldArray is used to compute loss. | :param bool is_target: If True, this FieldArray is used to compute loss. | ||||
:param bool is_input: If True, this FieldArray is used to the model input. | :param bool is_input: If True, this FieldArray is used to the model input. | ||||
:param padder: PadderBase类型。大多数情况下都不需要设置该值,除非需要在多个维度上进行padding(比如英文中对character进行padding) | |||||
""" | """ | ||||
def __init__(self, name, content, padding_val=0, is_target=None, is_input=None): | |||||
def __init__(self, name, content, is_target=None, is_input=None, padder=AutoPadder(pad_val=0)): | |||||
self.name = name | self.name = name | ||||
if isinstance(content, list): | if isinstance(content, list): | ||||
content = content | content = content | ||||
@@ -22,7 +108,7 @@ class FieldArray(object): | |||||
else: | else: | ||||
raise TypeError("content in FieldArray can only be list or numpy.ndarray, got {}.".format(type(content))) | raise TypeError("content in FieldArray can only be list or numpy.ndarray, got {}.".format(type(content))) | ||||
self.content = content | self.content = content | ||||
self.padding_val = padding_val | |||||
self.set_padder(padder) | |||||
self._is_target = None | self._is_target = None | ||||
self._is_input = None | self._is_input = None | ||||
@@ -149,28 +235,44 @@ class FieldArray(object): | |||||
assert isinstance(idx, int) | assert isinstance(idx, int) | ||||
self.content[idx] = val | self.content[idx] = val | ||||
def get(self, indices, pad=True):
    """Fetch a single instance, or a (possibly padded) batch of instances.

    :param indices: an int (returns that one raw element) or a list of ints.
    :param pad: bool, whether to run the field's padder on the batch.
    :return: the raw element for an int index; otherwise an np.array, padded
        by ``self.padder`` unless ``pad`` is False or no padder is set.
    """
    if isinstance(indices, int):
        return self.content[indices]
    if self.is_input is False and self.is_target is False:
        raise RuntimeError("Please specify either is_input or is_target is True for {}".format(self.name))

    batch = [self.content[idx] for idx in indices]
    if pad is False or self.padder is None:
        return np.array(batch)
    return self.padder(batch, field_name=self.name, field_ele_dtype=self.dtype)
def set_padder(self, padder):
    """Set how this field is padded.

    :param padder: a PadderBase instance, or None to disable padding.
    :return: None
    """
    if padder is None:
        self.padder = None
        return
    assert isinstance(padder, PadderBase), "padder must be of type PadderBase."
    self.padder = padder
def set_pad_val(self, pad_val):
    """Change the padder's fill value (a no-op when the field has no padder).

    :param pad_val: int, the new padding value.
    :return: None
    """
    if self.padder is None:
        return
    self.padder.set_pad_val(pad_val)
def __len__(self): | def __len__(self): | ||||
"""Returns the size of FieldArray. | """Returns the size of FieldArray. | ||||
@@ -186,3 +288,80 @@ def is_iterable(content): | |||||
except TypeError: | except TypeError: | ||||
return False | return False | ||||
return True | return True | ||||
class EngChar2DPadder(PadderBase):
    """2-D padder for English character-level features.

    One instance's field content should look like
    [['T', 'h', 'i', 's'], ['a'], ['d', 'e', 'm', 'o']] (shown as str for
    clarity; in practice these are character indices). The padded batch has
    shape (batch_size, max_sentence_length, max_word_length), where
    max_sentence_length is the longest sentence and max_word_length the
    longest word in the batch.
    """

    def __init__(self, pad_val=0, pad_length=0):
        """
        :param pad_val: int, value written into padded positions.
        :param pad_length: int, fixed word length every word is padded or
            truncated to; 0 means use the longest word of each batch.
        """
        super().__init__(pad_val=pad_val)
        self.pad_length = pad_length

    def _exactly_three_dims(self, contents, field_name):
        """Raise unless ``contents`` is exactly 3-dimensional.

        Dimension 1 is the batch, dimension 2 the words, dimension 3 the
        characters.

        :param contents: the batch content to check.
        :param field_name: str, used in error messages.
        :raises TypeError: if contents is not a list.
        :raises ValueError: if contents is not exactly 3-d.
        """
        if not isinstance(contents, list):
            raise TypeError("contents should be a list, not {}.".format(type(contents)))
        value = contents[0]
        try:
            value = value[0]
        except (TypeError, IndexError, KeyError):
            raise ValueError("Field:{} only has one dimension.".format(field_name))
        try:
            # BUGFIX: probe index 0, not 1 — with the old `value[1]` a batch
            # whose first word has a single character (e.g. [[5], ...]) was
            # wrongly rejected as 2-dimensional.
            value = value[0]
        except (TypeError, IndexError, KeyError):
            raise ValueError("Field:{} only has two dimensions.".format(field_name))
        if is_iterable(value):
            raise ValueError("Field:{} has more than 3 dimension.".format(field_name))

    def __call__(self, contents, field_name, field_ele_dtype):
        """Pad a batch shaped like
        [
            [[0, 2], [2, 3, 4], ...],
            [[9, 8, 2, 4], [1, 2], ...],
            ...
        ]
        into an np.array of shape (batch, max_sent_len, max_char_len).

        :param contents: list, one list-of-char-lists per instance.
        :param field_name: str, used in error messages.
        :param field_ele_dtype: must be np.int64 or np.float64.
        :return: np.array filled with ``self.pad_val`` in padded positions.
        """
        if field_ele_dtype not in (np.int64, np.float64):
            raise TypeError('dtype of Field:{} should be np.int64 or np.float64 to do 2D padding, get {}.'.format(
                field_name, field_ele_dtype
            ))
        self._exactly_three_dims(contents, field_name)
        if self.pad_length < 1:
            # BUGFIX: the old `max(max([[len(c) for c in w] for w in contents]))`
            # compared the per-sentence length *lists* lexicographically and
            # could pick a too-small width (e.g. [[2, 9], [3]] -> 3, not 9),
            # silently truncating long words. Take the true maximum over
            # every word in the batch instead.
            max_char_length = max(len(char_lst) for word_lst in contents
                                  for char_lst in word_lst)
        else:
            max_char_length = self.pad_length
        max_sent_length = max(len(word_lst) for word_lst in contents)
        batch_size = len(contents)
        dtype = type(contents[0][0][0])

        padded_array = np.full((batch_size, max_sent_length, max_char_length), fill_value=self.pad_val,
                               dtype=dtype)
        for b_idx, word_lst in enumerate(contents):
            for c_idx, char_lst in enumerate(word_lst):
                # Truncate only when a fixed pad_length was requested.
                chars = char_lst[:max_char_length]
                padded_array[b_idx, c_idx, :len(chars)] = chars
        return padded_array
@@ -39,7 +39,6 @@ class TransformerCWS(nn.Module): | |||||
allowed_transitions=allowed_trans) | allowed_transitions=allowed_trans) | ||||
def forward(self, chars, target, seq_lens, bigrams=None): | def forward(self, chars, target, seq_lens, bigrams=None): | ||||
seq_lens = seq_lens | |||||
masks = seq_len_to_byte_mask(seq_lens).float() | masks = seq_len_to_byte_mask(seq_lens).float() | ||||
x = self.embedding(chars) | x = self.embedding(chars) | ||||
batch_size = x.size(0) | batch_size = x.size(0) | ||||
@@ -59,8 +58,59 @@ class TransformerCWS(nn.Module): | |||||
return pred_dict | return pred_dict | ||||
def predict(self, chars, seq_lens, bigrams=None):
    """Viterbi-decode the best tag sequence for each sentence (no loss).

    :param chars: LongTensor, batch_size x max_len, character indices.
    :param seq_lens: LongTensor, batch_size, true length of each sentence.
    :param bigrams: LongTensor or None; per-character bigram features,
        required when the model was built with a bigram embedding.
    :return: dict with 'pred' (decoded tag sequences) and 'seq_lens'.
    """
    masks = seq_len_to_byte_mask(seq_lens).float()
    x = self.embedding(chars)
    batch_size = x.size(0)
    length = x.size(1)
    if hasattr(self, 'bigram_embedding'):
        bigrams = self.bigram_embedding(bigrams)  # batch_size x seq_lens x per_char x embed_size
        x = torch.cat([x, bigrams.view(batch_size, length, -1)], dim=-1)
    # BUGFIX: the dropout result was discarded (`self.drop(x)` without
    # assignment), so the features fed to fc1 were not the same pipeline as
    # in forward(). Keep the assignment for parity; in eval mode dropout is
    # an identity so decoded output is unchanged.
    x = self.drop(x)
    x = self.fc1(x)
    feats = self.transformer(x, masks)
    feats = self.fc2(feats)
    probs = self.crf.viterbi_decode(feats, masks, get_score=False)
    return {'pred': probs, 'seq_lens': seq_lens}
class NoamOpt(torch.optim.Optimizer):
    """Optimizer wrapper implementing the "Noam" warmup learning-rate
    schedule: lr = factor * model_size^-0.5 * min(step^-0.5, step * warmup^-1.5).
    """

    def __init__(self, model_size, factor, warmup, optimizer):
        # A single dummy parameter keeps torch.optim.Optimizer's constructor
        # satisfied; the real parameters live inside the wrapped optimizer.
        super().__init__([torch.nn.Parameter(torch.ones(1))], {})
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0

    def step(self, **kwargs):
        """Advance one step: refresh every param group's lr, then delegate
        the actual update to the wrapped optimizer."""
        self._step += 1
        new_rate = self.rate()
        for group in self.optimizer.param_groups:
            group['lr'] = new_rate
        self._rate = new_rate
        self.optimizer.step()

    def rate(self, step=None):
        """Learning rate at ``step`` (defaults to the current step count)."""
        if step is None:
            step = self._step
        warm_term = step * self.warmup ** (-1.5)
        return self.factor * self.model_size ** (-0.5) * min(step ** (-0.5), warm_term)
if __name__ == '__main__': | if __name__ == '__main__': | ||||
transformer = TransformerCWS(10, embed_dim=100, bigram_vocab_num=10, bigram_embed_dim=100, num_bigram_per_char=8, | transformer = TransformerCWS(10, embed_dim=100, bigram_vocab_num=10, bigram_embed_dim=100, num_bigram_per_char=8, | ||||
hidden_size=200, embed_drop_p=0.3, num_layers=1, num_heads=8, tag_size=4) | hidden_size=200, embed_drop_p=0.3, num_layers=1, num_heads=8, tag_size=4) | ||||
chars = torch.randint(10, size=(4, 7)).long() | chars = torch.randint(10, size=(4, 7)).long() | ||||
@@ -68,4 +118,8 @@ if __name__ == '__main__': | |||||
seq_lens = torch.ones(4).long()*7 | seq_lens = torch.ones(4).long()*7 | ||||
target = torch.randint(4, size=(4, 7)) | target = torch.randint(4, size=(4, 7)) | ||||
print(transformer(chars, target, seq_lens, bigrams)) | |||||
print(transformer(chars, target, seq_lens, bigrams)) | |||||
optimizer = torch.optim.Adam(transformer.parameters()) | |||||
opt = NoamOpt(10 ,1, 400, optimizer) |
@@ -97,3 +97,64 @@ class TestFieldArray(unittest.TestCase): | |||||
fa.append([1.2, 2.3, 3.4, 4.5, 5.6]) | fa.append([1.2, 2.3, 3.4, 4.5, 5.6]) | ||||
self.assertEqual(len(fa), 3) | self.assertEqual(len(fa), 3) | ||||
self.assertEqual(fa[2], [1.2, 2.3, 3.4, 4.5, 5.6]) | self.assertEqual(fa[2], [1.2, 2.3, 3.4, 4.5, 5.6]) | ||||
class TestPadder(unittest.TestCase):
    def test01(self):
        """AutoPadder: str and scalar content pass through; 1-d int lists
        get padded to the batch maximum; deeper nesting is left alone."""
        from fastNLP.core.fieldarray import AutoPadder
        padder = AutoPadder()
        str_batch = ['This is a str', 'this is another str']
        self.assertListEqual(str_batch, padder(str_batch, None, np.str).tolist())

        scalar_batch = [1, 2]
        self.assertListEqual(scalar_batch, padder(scalar_batch, None, np.int64).tolist())

        ragged_batch = [[1, 2], [3], [4]]
        self.assertListEqual([[1, 2], [3, 0], [4, 0]],
                             padder(ragged_batch, None, np.int64).tolist())

        nested_batch = [
            [[1, 2, 3], [4, 5], [7, 8, 9, 10]],
            [[1]]
        ]
        print(padder(nested_batch, None, np.int64))

    def test02(self):
        """EngChar2DPadder: rejects non-3-d input, pads to batch-local or
        fixed word length, and honours a custom pad value."""
        from fastNLP.core.fieldarray import EngChar2DPadder
        padder = EngChar2DPadder(pad_length=0)

        with self.assertRaises(ValueError):  # 1-d is refused
            padder([1, 2], None, np.int64)
        with self.assertRaises(ValueError):  # 2-d is refused
            padder([[1, 2]], None, np.int64)
        with self.assertRaises(ValueError):  # 4-d is refused
            padder([[[[1, 2]]]], None, np.int64)

        char_batch = [
            [[1, 2, 3], [4, 5], [7, 8, 9, 10]],
            [[1]]
        ]
        self.assertListEqual([[[1, 2, 3, 0], [4, 5, 0, 0], [7, 8, 9, 10]], [[1, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]],
                             padder(char_batch, None, np.int64).tolist())

        padder = EngChar2DPadder(pad_length=5, pad_val=-100)
        self.assertListEqual(
            [[[1, 2, 3, -100, -100], [4, 5, -100, -100, -100], [7, 8, 9, 10, -100]],
             [[1, -100, -100, -100, -100], [-100, -100, -100, -100, -100], [-100, -100, -100, -100, -100]]],
            padder(char_batch, None, np.int64).tolist()
        )
@@ -0,0 +1,370 @@ | |||||
{ | |||||
"cells": [ | |||||
{ | |||||
"cell_type": "code", | |||||
"execution_count": 1, | |||||
"metadata": {}, | |||||
"outputs": [ | |||||
{ | |||||
"name": "stderr", | |||||
"output_type": "stream", | |||||
"text": [ | |||||
"/Users/yh/miniconda2/envs/python3/lib/python3.6/site-packages/tqdm/autonotebook/__init__.py:14: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n", | |||||
" \" (e.g. in jupyter console)\", TqdmExperimentalWarning)\n" | |||||
] | |||||
}, | |||||
{ | |||||
"data": { | |||||
"text/plain": [ | |||||
"DataSet({'raw_sent': this is a bad idea . type=str,\n", | |||||
"'label': 0 type=int,\n", | |||||
"'word_str_lst': ['this', 'is', 'a', 'bad', 'idea', '.'] type=list,\n", | |||||
"'words': [4, 2, 5, 6, 7, 3] type=list},\n", | |||||
"{'raw_sent': it is great . type=str,\n", | |||||
"'label': 1 type=int,\n", | |||||
"'word_str_lst': ['it', 'is', 'great', '.'] type=list,\n", | |||||
"'words': [8, 2, 9, 3] type=list})" | |||||
] | |||||
}, | |||||
"execution_count": 1, | |||||
"metadata": {}, | |||||
"output_type": "execute_result" | |||||
} | |||||
], | |||||
"source": [ | |||||
"# 假设有以下的DataSet, 这里只是为了举例所以只选择了两个sample\n", | |||||
"import sys\n", | |||||
"import os\n", | |||||
"sys.path.append('/Users/yh/Desktop/fastNLP/fastNLP')\n", | |||||
"\n", | |||||
"from fastNLP import DataSet\n", | |||||
"from fastNLP import Instance\n", | |||||
"from fastNLP import Vocabulary\n", | |||||
"\n", | |||||
"dataset = DataSet()\n", | |||||
"dataset.append(Instance(raw_sent='This is a bad idea .', label=0))\n", | |||||
"dataset.append(Instance(raw_sent='It is great .', label=1))\n", | |||||
"\n", | |||||
"# 按照fastNLP_10min_tutorial.ipynb的步骤,对数据进行一些处理。这里为了演示padding操作,把field的名称做了一些改变\n", | |||||
"dataset.apply(lambda x:x['raw_sent'].lower(), new_field_name='raw_sent')\n", | |||||
"dataset.apply(lambda x:x['raw_sent'].split(), new_field_name='word_str_lst')\n", | |||||
"\n", | |||||
"# 建立Vocabulary\n", | |||||
"word_vocab = Vocabulary()\n", | |||||
"dataset.apply(lambda x:word_vocab.update(x['word_str_lst']))\n", | |||||
"dataset.apply(lambda x:[word_vocab.to_index(word) for word in x['word_str_lst']], new_field_name='words')\n", | |||||
"\n", | |||||
"# 检查以下是否得到我们想要的结果了\n", | |||||
"dataset[:2]" | |||||
] | |||||
}, | |||||
{ | |||||
"cell_type": "code", | |||||
"execution_count": 2, | |||||
"metadata": {}, | |||||
"outputs": [ | |||||
{ | |||||
"name": "stdout", | |||||
"output_type": "stream", | |||||
"text": [ | |||||
"batch_x has: {'word_str_lst': array([list(['this', 'is', 'a', 'bad', 'idea', '.']),\n", | |||||
" list(['it', 'is', 'great', '.'])], dtype=object), 'words': tensor([[4, 2, 5, 6, 7, 3],\n", | |||||
" [8, 2, 9, 3, 0, 0]])}\n", | |||||
"batch_y has: {'label': tensor([0, 1])}\n" | |||||
] | |||||
}, | |||||
{ | |||||
"data": { | |||||
"text/plain": [ | |||||
"'\"\\n结果中\\n Batch会对元素类型(元素即最内层的数据,raw_sent为str,word_str_lst为str,words为int, label为int)为int或者float的数据进行默认\\n padding,而非int或float的则不进行padding。但若每个Instance中该field为二维数据,也不进行padding。因为二维数据的padding涉及到\\n 两个维度的padding,不容易自动判断padding的形式。\\n'" | |||||
] | |||||
}, | |||||
"execution_count": 2, | |||||
"metadata": {}, | |||||
"output_type": "execute_result" | |||||
} | |||||
], | |||||
"source": [ | |||||
"# 将field设置为input或者target\n", | |||||
"dataset.set_input('word_str_lst')\n", | |||||
"dataset.set_input('words')\n", | |||||
"dataset.set_target('label')\n", | |||||
"\n", | |||||
"# 使用Batch取出batch数据\n", | |||||
"from fastNLP.core.batch import Batch\n", | |||||
"from fastNLP.core.sampler import RandomSampler\n", | |||||
"\n", | |||||
"batch_iterator = Batch(dataset=dataset, batch_size=2, sampler=RandomSampler())\n", | |||||
"for batch_x, batch_y in batch_iterator:\n", | |||||
" print(\"batch_x has: \", batch_x)\n", | |||||
" print(\"batch_y has: \", batch_y)\n", | |||||
"\"\"\"\"\n", | |||||
"结果中\n", | |||||
" Batch会对元素类型(元素即最内层的数据,raw_sent为str,word_str_lst为str,words为int, label为int)为int或者float的数据进行默认\n", | |||||
" padding,而非int或float的则不进行padding。但若每个Instance中该field为二维数据,也不进行padding。因为二维数据的padding涉及到\n", | |||||
" 两个维度的padding,不容易自动判断padding的形式。\n", | |||||
"\"\"\"" | |||||
] | |||||
}, | |||||
{ | |||||
"cell_type": "code", | |||||
"execution_count": 3, | |||||
"metadata": {}, | |||||
"outputs": [ | |||||
{ | |||||
"name": "stdout", | |||||
"output_type": "stream", | |||||
"text": [ | |||||
"batch_x has: {'word_str_lst': array([list(['it', 'is', 'great', '.']),\n", | |||||
" list(['this', 'is', 'a', 'bad', 'idea', '.'])], dtype=object), 'words': tensor([[ 8, 2, 9, 3, -100, -100],\n", | |||||
" [ 4, 2, 5, 6, 7, 3]])}\n", | |||||
"batch_y has: {'label': tensor([1, 0])}\n" | |||||
] | |||||
} | |||||
], | |||||
"source": [ | |||||
"# 所有的pad_val都默认为0,如果需要修改某一个field的默认pad值,可以通过DataSet.set_pad_val(field_name, pad_val)进行修改\n", | |||||
"# 若需要将word的padding修改为-100\n", | |||||
"dataset.set_pad_val('words', pad_val=-100)\n", | |||||
"batch_iterator = Batch(dataset=dataset, batch_size=2, sampler=RandomSampler())\n", | |||||
"for batch_x, batch_y in batch_iterator:\n", | |||||
" print(\"batch_x has: \", batch_x)\n", | |||||
" print(\"batch_y has: \", batch_y)\n", | |||||
"# pad的值修改为-100了" | |||||
] | |||||
}, | |||||
{ | |||||
"cell_type": "code", | |||||
"execution_count": 4, | |||||
"metadata": {}, | |||||
"outputs": [ | |||||
{ | |||||
"data": { | |||||
"text/plain": [ | |||||
"DataSet({'raw_sent': this is a bad idea . type=str,\n", | |||||
"'label': 0 type=int,\n", | |||||
"'word_str_lst': ['this', 'is', 'a', 'bad', 'idea', '.'] type=list,\n", | |||||
"'words': [4, 2, 5, 6, 7, 3] type=list,\n", | |||||
"'char_str_lst': [['t', 'h', 'i', 's'], ['i', 's'], ['a'], ['b', 'a', 'd'], ['i', 'd', 'e', 'a'], ['.']] type=list,\n", | |||||
"'chars': [[4, 9, 2, 5], [2, 5], [3], [10, 3, 6], [2, 6, 7, 3], [8]] type=list},\n", | |||||
"{'raw_sent': it is great . type=str,\n", | |||||
"'label': 1 type=int,\n", | |||||
"'word_str_lst': ['it', 'is', 'great', '.'] type=list,\n", | |||||
"'words': [8, 2, 9, 3] type=list,\n", | |||||
"'char_str_lst': [['i', 't'], ['i', 's'], ['g', 'r', 'e', 'a', 't'], ['.']] type=list,\n", | |||||
"'chars': [[2, 4], [2, 5], [11, 12, 7, 3, 4], [8]] type=list})" | |||||
] | |||||
}, | |||||
"execution_count": 4, | |||||
"metadata": {}, | |||||
"output_type": "execute_result" | |||||
} | |||||
], | |||||
"source": [ | |||||
"# 若需要使用二维padding或指定padding方式,可以通过设置该field的padder实现,下面以英文的character padding为例。在某些场景下,可能想要\n", | |||||
"# 使用英文word的character作为特征,character的padding为二维padding,fastNLP默认只会进行一维padding。\n", | |||||
"\n", | |||||
"dataset.apply(lambda x: [[c for c in word] for word in x['word_str_lst']], new_field_name='char_str_lst')\n", | |||||
"char_vocab = Vocabulary()\n", | |||||
"dataset.apply(lambda x:[char_vocab.update(chars) for chars in x['char_str_lst']])\n", | |||||
"dataset.apply(lambda x:[[char_vocab.to_index(c) for c in chars] for chars in x['char_str_lst']],new_field_name='chars')\n", | |||||
"dataset[:2]" | |||||
] | |||||
}, | |||||
{ | |||||
"cell_type": "code", | |||||
"execution_count": 5, | |||||
"metadata": {}, | |||||
"outputs": [ | |||||
{ | |||||
"name": "stdout", | |||||
"output_type": "stream", | |||||
"text": [ | |||||
"batch_x has: {'word_str_lst': array([list(['this', 'is', 'a', 'bad', 'idea', '.']),\n", | |||||
" list(['it', 'is', 'great', '.'])], dtype=object), 'words': tensor([[ 4, 2, 5, 6, 7, 3],\n", | |||||
" [ 8, 2, 9, 3, -100, -100]]), 'chars': array([list([[4, 9, 2, 5], [2, 5], [3], [10, 3, 6], [2, 6, 7, 3], [8]]),\n", | |||||
" list([[2, 4], [2, 5], [11, 12, 7, 3, 4], [8]])], dtype=object)}\n", | |||||
"batch_y has: {'label': tensor([0, 1])}\n" | |||||
] | |||||
}, | |||||
{ | |||||
"data": { | |||||
"text/plain": [ | |||||
"'\\n 其它field与之前的是相同的。chars因为存在两个维度需要padding,不能自动决定padding方式,所以直接输出了原始形式。\\n'" | |||||
] | |||||
}, | |||||
"execution_count": 5, | |||||
"metadata": {}, | |||||
"output_type": "execute_result" | |||||
} | |||||
], | |||||
"source": [ | |||||
"# 如果不针对二维的character指定padding方法\n", | |||||
"dataset.set_input('chars')\n", | |||||
"batch_iterator = Batch(dataset=dataset, batch_size=2, sampler=RandomSampler())\n", | |||||
"for batch_x, batch_y in batch_iterator:\n", | |||||
" print(\"batch_x has: \", batch_x)\n", | |||||
" print(\"batch_y has: \", batch_y)\n", | |||||
" \n", | |||||
"\"\"\"\n", | |||||
" 其它field与之前的是相同的。chars因为存在两个维度需要padding,不能自动决定padding方式,所以直接输出了原始形式。\n", | |||||
"\"\"\"" | |||||
] | |||||
}, | |||||
{ | |||||
"cell_type": "code", | |||||
"execution_count": 6, | |||||
"metadata": {}, | |||||
"outputs": [ | |||||
{ | |||||
"name": "stdout", | |||||
"output_type": "stream", | |||||
"text": [ | |||||
"batch_x has: {'word_str_lst': array([list(['this', 'is', 'a', 'bad', 'idea', '.']),\n", | |||||
" list(['it', 'is', 'great', '.'])], dtype=object), 'words': tensor([[ 4, 2, 5, 6, 7, 3],\n", | |||||
" [ 8, 2, 9, 3, -100, -100]]), 'chars': tensor([[[ 4, 9, 2, 5],\n", | |||||
" [ 2, 5, 0, 0],\n", | |||||
" [ 3, 0, 0, 0],\n", | |||||
" [10, 3, 6, 0],\n", | |||||
" [ 2, 6, 7, 3],\n", | |||||
" [ 8, 0, 0, 0]],\n", | |||||
"\n", | |||||
" [[ 2, 4, 0, 0],\n", | |||||
" [ 2, 5, 0, 0],\n", | |||||
" [11, 12, 7, 3],\n", | |||||
" [ 8, 0, 0, 0],\n", | |||||
" [ 0, 0, 0, 0],\n", | |||||
" [ 0, 0, 0, 0]]])}\n", | |||||
"batch_y has: {'label': tensor([0, 1])}\n" | |||||
] | |||||
}, | |||||
{ | |||||
"data": { | |||||
"text/plain": [ | |||||
"'\\n chars被正确padding了\\n'" | |||||
] | |||||
}, | |||||
"execution_count": 6, | |||||
"metadata": {}, | |||||
"output_type": "execute_result" | |||||
} | |||||
], | |||||
"source": [ | |||||
"# 若要使用二维padding,需要手动设置padding方式\n", | |||||
"from fastNLP.core.fieldarray import EngChar2DPadder\n", | |||||
"dataset.set_padder('chars', EngChar2DPadder())\n", | |||||
"batch_iterator = Batch(dataset=dataset, batch_size=2, sampler=RandomSampler())\n", | |||||
"for batch_x, batch_y in batch_iterator:\n", | |||||
" print(\"batch_x has: \", batch_x)\n", | |||||
" print(\"batch_y has: \", batch_y)\n", | |||||
" \n", | |||||
"\"\"\"\n", | |||||
" chars被正确padding了\n", | |||||
"\"\"\"" | |||||
] | |||||
}, | |||||
{ | |||||
"cell_type": "code", | |||||
"execution_count": 7, | |||||
"metadata": {}, | |||||
"outputs": [ | |||||
{ | |||||
"name": "stdout", | |||||
"output_type": "stream", | |||||
"text": [ | |||||
"batch_x has: {'raw_sent': ['this is a bad idea .', 'it is great . '], 'word_str_lst': array([list(['this', 'is', 'a', 'bad', 'idea', '.']),\n", | |||||
" list(['it', 'is', 'great', '.'])], dtype=object), 'words': tensor([[ 4, 2, 5, 6, 7, 3],\n", | |||||
" [ 8, 2, 9, 3, -100, -100]]), 'chars': tensor([[[ 4, 9, 2, 5],\n", | |||||
" [ 2, 5, 0, 0],\n", | |||||
" [ 3, 0, 0, 0],\n", | |||||
" [10, 3, 6, 0],\n", | |||||
" [ 2, 6, 7, 3],\n", | |||||
" [ 8, 0, 0, 0]],\n", | |||||
"\n", | |||||
" [[ 2, 4, 0, 0],\n", | |||||
" [ 2, 5, 0, 0],\n", | |||||
" [11, 12, 7, 3],\n", | |||||
" [ 8, 0, 0, 0],\n", | |||||
" [ 0, 0, 0, 0],\n", | |||||
" [ 0, 0, 0, 0]]])}\n", | |||||
"batch_y has: {'label': tensor([0, 1])}\n" | |||||
] | |||||
}, | |||||
{ | |||||
"data": { | |||||
"text/plain": [ | |||||
"'\\n raw_sent正确输出,对应内容也进行了pad。\\n'" | |||||
] | |||||
}, | |||||
"execution_count": 7, | |||||
"metadata": {}, | |||||
"output_type": "execute_result" | |||||
} | |||||
], | |||||
"source": [ | |||||
    "# 如果AutoPadder与EngChar2DPadder不能满足需要,可以自己实现Padder对象。这里举一个例子,比如需要把raw_sentence pad到一样长\n",
"from fastNLP.core.fieldarray import PadderBase\n", | |||||
"\n", | |||||
"class PadStr(PadderBase):\n", | |||||
" def __init__(self, pad_val=' '):\n", | |||||
" super().__init__(pad_val=pad_val) #让父类管理pad_val的值,这样可以通过DataSet.set_pad_val()修改到该值\n", | |||||
" \n", | |||||
" def __call__(self, contents, field_name, field_ele_dtype):\n", | |||||
" \"\"\"\n", | |||||
" 如果以上面的例子举例,在raw_sent这个field进行pad时,传入的\n", | |||||
" contents:\n", | |||||
" [\n", | |||||
" 'This is a bad idea .',\n", | |||||
" 'It is great .'\n", | |||||
" ]\n", | |||||
" field_name: 'raw_sent',当前field的名称,主要用于帮助debug。\n", | |||||
" field_ele_dtype: np.str. 这个参数基本都用不上,是该field中内部元素的类型\n", | |||||
" \"\"\"\n", | |||||
" max_len = max([len(str_) for str_ in contents])\n", | |||||
" pad_strs = []\n", | |||||
" for content in contents:\n", | |||||
" pad_strs.append(content + (max_len-len(content))*self.pad_val)\n", | |||||
" return pad_strs\n", | |||||
"\n", | |||||
"dataset.set_input('raw_sent')\n", | |||||
"dataset.set_padder('raw_sent', PadStr())\n", | |||||
"batch_iterator = Batch(dataset=dataset, batch_size=2, sampler=RandomSampler())\n", | |||||
"for batch_x, batch_y in batch_iterator:\n", | |||||
" print(\"batch_x has: \", batch_x)\n", | |||||
" print(\"batch_y has: \", batch_y)\n", | |||||
"\n", | |||||
"\"\"\"\n", | |||||
" raw_sent正确输出,对应内容也进行了pad。\n", | |||||
"\"\"\"" | |||||
] | |||||
}, | |||||
{ | |||||
"cell_type": "code", | |||||
"execution_count": null, | |||||
"metadata": {}, | |||||
"outputs": [], | |||||
"source": [] | |||||
} | |||||
], | |||||
"metadata": { | |||||
"kernelspec": { | |||||
"display_name": "Python 3", | |||||
"language": "python", | |||||
"name": "python3" | |||||
}, | |||||
"language_info": { | |||||
"codemirror_mode": { | |||||
"name": "ipython", | |||||
"version": 3 | |||||
}, | |||||
"file_extension": ".py", | |||||
"mimetype": "text/x-python", | |||||
"name": "python", | |||||
"nbconvert_exporter": "python", | |||||
"pygments_lexer": "ipython3", | |||||
"version": "3.6.7" | |||||
} | |||||
}, | |||||
"nbformat": 4, | |||||
"nbformat_minor": 2 | |||||
} |