Browse Source

1. 将pad的功能从FieldArray中剥离出来,使用Padder完成各种padding操作。

2. FieldArray默认使用AutoPadder, AutoPadder的行为与之前不使用padder是一致的
3. 为了解决二维padding的问题,引入了EngChar2dPadder用于对character进行padding
4. 增加一份padding的tutorial。
tags/v0.3.1^2
yh 6 years ago
parent
commit
8091a734ee
8 changed files with 727 additions and 38 deletions
  1. +8
    -5
      fastNLP/core/batch.py
  2. +36
    -14
      fastNLP/core/dataset.py
  3. +196
    -17
      fastNLP/core/fieldarray.py
  4. +0
    -0
      reproduction/Chinese_word_segmentation/__init__.py
  5. +56
    -2
      reproduction/Chinese_word_segmentation/models/cws_transformer.py
  6. +0
    -0
      reproduction/__init__.py
  7. +61
    -0
      test/core/test_fieldarray.py
  8. +370
    -0
      tutorials/fastNLP_padding_tutorial.ipynb

+ 8
- 5
fastNLP/core/batch.py View File

@@ -48,7 +48,7 @@ class Batch(object):
for field_name, field in self.dataset.get_all_fields().items(): for field_name, field in self.dataset.get_all_fields().items():
if field.is_target or field.is_input: if field.is_target or field.is_input:
batch = field.get(indices) batch = field.get(indices)
if not self.as_numpy:
if not self.as_numpy and field.padder is not None:
batch = to_tensor(batch, field.dtype) batch = to_tensor(batch, field.dtype)
if field.is_target: if field.is_target:
batch_y[field_name] = batch batch_y[field_name] = batch
@@ -67,8 +67,11 @@ class Batch(object):




def to_tensor(batch, dtype): def to_tensor(batch, dtype):
if dtype in (int, np.int8, np.int16, np.int32, np.int64):
batch = torch.LongTensor(batch)
if dtype in (float, np.float32, np.float64):
batch = torch.FloatTensor(batch)
try:
if dtype in (int, np.int8, np.int16, np.int32, np.int64):
batch = torch.LongTensor(batch)
if dtype in (float, np.float32, np.float64):
batch = torch.FloatTensor(batch)
except:
pass
return batch return batch

+ 36
- 14
fastNLP/core/dataset.py View File

@@ -3,6 +3,7 @@ import _pickle as pickle
import numpy as np import numpy as np


from fastNLP.core.fieldarray import FieldArray from fastNLP.core.fieldarray import FieldArray
from fastNLP.core.fieldarray import AutoPadder
from fastNLP.core.instance import Instance from fastNLP.core.instance import Instance
from fastNLP.core.utils import get_func_signature from fastNLP.core.utils import get_func_signature
from fastNLP.io.base_loader import DataLoaderRegister from fastNLP.io.base_loader import DataLoaderRegister
@@ -88,11 +89,8 @@ class DataSet(object):
raise RuntimeError(f"Start index {idx.start} out of range 0-{len(self)-1}") raise RuntimeError(f"Start index {idx.start} out of range 0-{len(self)-1}")
data_set = DataSet() data_set = DataSet()
for field in self.field_arrays.values(): for field in self.field_arrays.values():
data_set.add_field(name=field.name,
fields=field.content[idx],
padding_val=field.padding_val,
is_input=field.is_input,
is_target=field.is_target)
data_set.add_field(name=field.name, fields=field.content[idx], padder=field.padder,
is_input=field.is_input, is_target=field.is_target)
return data_set return data_set
else: else:
raise KeyError("Unrecognized type {} for idx in __getitem__ method".format(type(idx))) raise KeyError("Unrecognized type {} for idx in __getitem__ method".format(type(idx)))
@@ -151,12 +149,12 @@ class DataSet(object):
assert name in self.field_arrays assert name in self.field_arrays
self.field_arrays[name].append(field) self.field_arrays[name].append(field)


def add_field(self, name, fields, padding_val=0, is_input=False, is_target=False):
def add_field(self, name, fields, padder=AutoPadder(pad_val=0), is_input=False, is_target=False):
"""Add a new field to the DataSet. """Add a new field to the DataSet.
:param str name: the name of the field. :param str name: the name of the field.
:param fields: a list of int, float, or other objects. :param fields: a list of int, float, or other objects.
:param int padding_val: integer for padding.
:param int padder: PadBase对象,如何对该Field进行padding。大部分情况使用默认值即可
:param bool is_input: whether this field is model input. :param bool is_input: whether this field is model input.
:param bool is_target: whether this field is label or target. :param bool is_target: whether this field is label or target.
""" """
@@ -164,8 +162,8 @@ class DataSet(object):
if len(self) != len(fields): if len(self) != len(fields):
raise RuntimeError(f"The field to append must have the same size as dataset. " raise RuntimeError(f"The field to append must have the same size as dataset. "
f"Dataset size {len(self)} != field size {len(fields)}") f"Dataset size {len(self)} != field size {len(fields)}")
self.field_arrays[name] = FieldArray(name, fields, padding_val=padding_val, is_target=is_target,
is_input=is_input)
self.field_arrays[name] = FieldArray(name, fields, is_target=is_target, is_input=is_input,
padder=padder)


def delete_field(self, name): def delete_field(self, name):
"""Delete a field based on the field name. """Delete a field based on the field name.
@@ -229,6 +227,25 @@ class DataSet(object):
else: else:
raise KeyError("{} is not a valid field name.".format(name)) raise KeyError("{} is not a valid field name.".format(name))


    def set_padder(self, field_name, padder):
        """Set the padder used for ``field_name``.

        :param field_name: str, name of the field whose padding behaviour is changed
        :param padder: a PadderBase instance, or None to remove the padder so that
            no padding is applied to this field at all.
        :return: None
        """
        self.field_arrays[field_name].set_padder(padder)

    def set_pad_val(self, field_name, pad_val):
        """Set the padding value used for a field.

        :param field_name: str, name of the field whose pad value is changed
        :param pad_val: int, the field's padder will use this value as the padding index
        :return: None
        """
        self.field_arrays[field_name].set_pad_val(pad_val)

def get_input_name(self): def get_input_name(self):
"""Get all field names with `is_input` as True. """Get all field names with `is_input` as True.


@@ -270,12 +287,9 @@ class DataSet(object):
extra_param['is_input'] = old_field.is_input extra_param['is_input'] = old_field.is_input
if 'is_target' not in extra_param: if 'is_target' not in extra_param:
extra_param['is_target'] = old_field.is_target extra_param['is_target'] = old_field.is_target
self.add_field(name=new_field_name,
fields=results,
padding_val=old_field.padding_val,
**extra_param)
self.add_field(name=new_field_name, fields=results)
else: else:
self.add_field(name=new_field_name, fields=results, **extra_param)
self.add_field(name=new_field_name, fields=results)
else: else:
return results return results


@@ -314,8 +328,16 @@ class DataSet(object):
for field_name in self.field_arrays: for field_name in self.field_arrays:
train_set.field_arrays[field_name].is_input = self.field_arrays[field_name].is_input train_set.field_arrays[field_name].is_input = self.field_arrays[field_name].is_input
train_set.field_arrays[field_name].is_target = self.field_arrays[field_name].is_target train_set.field_arrays[field_name].is_target = self.field_arrays[field_name].is_target
train_set.field_arrays[field_name].padder = self.field_arrays[field_name].padder
train_set.field_arrays[field_name].dtype = self.field_arrays[field_name].dtype
train_set.field_arrays[field_name].pytype = self.field_arrays[field_name].pytype
train_set.field_arrays[field_name].is_2d_list = self.field_arrays[field_name].is_2d_list
dev_set.field_arrays[field_name].is_input = self.field_arrays[field_name].is_input dev_set.field_arrays[field_name].is_input = self.field_arrays[field_name].is_input
dev_set.field_arrays[field_name].is_target = self.field_arrays[field_name].is_target dev_set.field_arrays[field_name].is_target = self.field_arrays[field_name].is_target
dev_set.field_arrays[field_name].padder = self.field_arrays[field_name].padder
dev_set.field_arrays[field_name].dtype = self.field_arrays[field_name].dtype
dev_set.field_arrays[field_name].pytype = self.field_arrays[field_name].pytype
dev_set.field_arrays[field_name].is_2d_list = self.field_arrays[field_name].is_2d_list


return train_set, dev_set return train_set, dev_set




+ 196
- 17
fastNLP/core/fieldarray.py View File

@@ -1,19 +1,105 @@
import numpy as np import numpy as np




class PadderBase:
    """Base class of all padders; pads one batch of a single field's contents.

    Subclasses must override ``__call__()``.  The elements handed to a padder
    are NOT copies: mutating them in place changes the underlying dataset, so
    deepcopy before any in-place edit.
    """
    def __init__(self, pad_val=0, **kwargs):
        self.pad_val = pad_val

    def set_pad_val(self, pad_val):
        """Change the value written into padded positions."""
        self.pad_val = pad_val

    def __call__(self, contents, field_name, field_ele_dtype):
        """Pad one batch of contents for a single field.

        Given a DataSet with instances like
        ``word='this is a demo', length=4, chars=[['t','h','i','s'], ...]``
        and batch_size=2, the ``word`` padder receives
        ``['this is a demo', 'another one']``, the ``length`` padder receives
        ``[4, 2]`` and the ``chars`` padder receives the list of per-instance
        char lists -- i.e. each field's contents across the batch are gathered
        into one list and passed here.

        :param contents: List[element], one element per instance.  Elements are
            shared with the dataset; deepcopy before in-place changes.
        :param field_name: str, the field's name, used for error messages.
        :param field_ele_dtype: np.int64, np.float64 or np.str -- dtype of the
            innermost elements; helps decide whether padding applies.
        :return: List[padded_element] or np.array([padded_element])
        """
        raise NotImplementedError


class AutoPadder(PadderBase):
    """Decide automatically from the contents whether padding is needed.

    (1) If the innermost element dtype (see ``FieldArray.dtype``; e.g.
        ``['This', 'is', ...]`` has element type np.str, ``[[1, 2], ...]`` has
        np.int64) is not np.int64 / np.float64, no padding is done.
    (2) For np.int64 / np.float64 elements:
        (2.1) scalar fields (e.g. sequence_length) are stacked unpadded;
        (2.2) flat per-instance lists (e.g. [1, 2, 3]) are padded to the batch
              maximum length.  Deeper nesting such as [[1, 2], [3, 4]] is NOT
              handled here -- use a dedicated padder (e.g. EngChar2DPadder).
    """
    def __init__(self, pad_val=0):
        """
        :param pad_val: int, value written into the padded positions
        """
        super().__init__(pad_val=pad_val)

    @staticmethod
    def _is_iterable(value):
        # Self-contained iterability probe (mirrors module-level is_iterable),
        # so this class does not depend on other module names.
        try:
            iter(value)
            return True
        except TypeError:
            return False

    def _is_two_dimension(self, contents):
        """Return True when contents is exactly 2-D.

        ``[[1, 2], [3]]`` is 2-D; ``[[[1, 2], [3, 4, 5]], [[4, 5]]]`` is 3-D.
        Empty rows carry no depth information and are skipped; a batch of only
        empty rows is treated as 2-D so it can still be padded.
        """
        for row in contents:
            if not isinstance(row, (np.ndarray, list)):
                return False
            if len(row) == 0:
                continue  # BUG FIX: old code crashed (IndexError) on an empty first row
            return not isinstance(row[0], (np.ndarray, list))
        return True

    def __call__(self, contents, field_name, field_ele_dtype):
        if not self._is_iterable(contents[0]):
            # Scalar field (e.g. a sequence length): just stack the values.
            return np.array(contents, dtype=field_ele_dtype)
        if field_ele_dtype in (np.int64, np.float64) and self._is_two_dimension(contents):
            max_len = max(len(row) for row in contents)
            array = np.full((len(contents), max_len), self.pad_val, dtype=field_ele_dtype)
            for i, row in enumerate(contents):
                array[i, :len(row)] = row
            return array
        # Non-numeric contents (e.g. str) or deeper nesting: return unpadded.
        return np.array(contents)


class FieldArray(object): class FieldArray(object):
"""``FieldArray`` is the collection of ``Instance``s of the same field. """``FieldArray`` is the collection of ``Instance``s of the same field.
It is the basic element of ``DataSet`` class. It is the basic element of ``DataSet`` class.


:param str name: the name of the FieldArray :param str name: the name of the FieldArray
:param list content: a list of int, float, str or np.ndarray, or a list of list of one, or a np.ndarray. :param list content: a list of int, float, str or np.ndarray, or a list of list of one, or a np.ndarray.
:param int padding_val: the integer for padding. Default: 0.
:param bool is_target: If True, this FieldArray is used to compute loss. :param bool is_target: If True, this FieldArray is used to compute loss.
:param bool is_input: If True, this FieldArray is used to the model input. :param bool is_input: If True, this FieldArray is used to the model input.

:param padder: PadderBase类型。大多数情况下都不需要设置该值,除非需要在多个维度上进行padding(比如英文中对character进行padding)
""" """


def __init__(self, name, content, padding_val=0, is_target=None, is_input=None):
def __init__(self, name, content, is_target=None, is_input=None, padder=AutoPadder(pad_val=0)):
self.name = name self.name = name
if isinstance(content, list): if isinstance(content, list):
content = content content = content
@@ -22,7 +108,7 @@ class FieldArray(object):
else: else:
raise TypeError("content in FieldArray can only be list or numpy.ndarray, got {}.".format(type(content))) raise TypeError("content in FieldArray can only be list or numpy.ndarray, got {}.".format(type(content)))
self.content = content self.content = content
self.padding_val = padding_val
self.set_padder(padder)


self._is_target = None self._is_target = None
self._is_input = None self._is_input = None
@@ -149,28 +235,44 @@ class FieldArray(object):
assert isinstance(idx, int) assert isinstance(idx, int)
self.content[idx] = val self.content[idx] = val


def get(self, indices):
def get(self, indices, pad=True):
"""Fetch instances based on indices. """Fetch instances based on indices.


:param indices: an int, or a list of int. :param indices: an int, or a list of int.
:param pad: bool, 是否对返回的结果进行padding。
:return: :return:
""" """
if isinstance(indices, int): if isinstance(indices, int):
return self.content[indices] return self.content[indices]
if self.is_input is False and self.is_target is False: if self.is_input is False and self.is_target is False:
raise RuntimeError("Please specify either is_input or is_target is True for {}".format(self.name)) raise RuntimeError("Please specify either is_input or is_target is True for {}".format(self.name))
batch_size = len(indices)

if not is_iterable(self.content[0]):
array = np.array([self.content[i] for i in indices], dtype=self.dtype)
elif self.dtype in (np.int64, np.float64):
max_len = max([len(self.content[i]) for i in indices])
array = np.full((batch_size, max_len), self.padding_val, dtype=self.dtype)
for i, idx in enumerate(indices):
array[i][:len(self.content[idx])] = self.content[idx]
else: # should only be str
array = np.array([self.content[i] for i in indices])
return array

contents = [self.content[i] for i in indices]
if self.padder is None or pad is False:
return np.array(contents)
else:
return self.padder(contents, field_name=self.name, field_ele_dtype=self.dtype)

    def set_padder(self, padder):
        """Set how this field is padded when batched.

        :param padder: a PadderBase instance, or None to disable padding for
            this field entirely.
        :return: None
        """
        if padder is not None:
            assert isinstance(padder, PadderBase), "padder must be of type PadderBase."
        self.padder = padder

    def set_pad_val(self, pad_val):
        """Change the pad value used by this field's padder.

        No-op when the field has no padder (padder is None).
        :param pad_val: int
        :return: None
        """
        if self.padder is not None:
            self.padder.set_pad_val(pad_val)



def __len__(self): def __len__(self):
"""Returns the size of FieldArray. """Returns the size of FieldArray.
@@ -186,3 +288,80 @@ def is_iterable(content):
except TypeError: except TypeError:
return False return False
return True return True


class EngChar2DPadder(PadderBase):
    """2-D padder for English character-level fields.

    The field content should look like
    ``[['T', 'h', 'i', 's'], ['a'], ['d', 'e', 'm', 'o']]`` (written as str
    here for readability; in practice the elements are character indices).
    The padded batch has shape (batch_size, max_sentence_length,
    max_word_length), where max_sentence_length is the longest sentence and
    max_word_length the longest word in the batch.
    """
    def __init__(self, pad_val=0, pad_length=0):
        """
        :param pad_val: int, value written into the padded positions
        :param pad_length: int, 0 means pad to the longest word in the batch;
            a positive value pads or truncates every word to exactly that length.
        """
        super().__init__(pad_val=pad_val)

        self.pad_length = pad_length

    def _exactly_three_dims(self, contents, field_name):
        """Raise unless contents is exactly 3-D (batch x word x character).

        :param contents: candidate batch contents
        :param field_name: str, for error messages
        """
        if not isinstance(contents, list):
            raise TypeError("contents should be a list, not {}.".format(type(contents)))
        value = contents[0]
        try:
            value = value[0]
        except (TypeError, IndexError, KeyError):
            raise ValueError("Field:{} only has one dimension.".format(field_name))
        try:
            # BUG FIX: old code indexed [1] here, which wrongly rejected a
            # valid batch whose first word has a single character.
            value = value[0]
        except (TypeError, IndexError, KeyError):
            raise ValueError("Field:{} only has two dimensions.".format(field_name))

        try:
            iter(value)
        except TypeError:
            return  # innermost element is a scalar: exactly three dimensions
        raise ValueError("Field:{} has more than 3 dimension.".format(field_name))

    def __call__(self, contents, field_name, field_ele_dtype):
        """Pad a 3-D batch, e.g.
        [
            [[0, 2], [2, 3, 4], ..],
            [[9, 8, 2, 4], [1, 2,], ...],
            ....
        ]

        :param contents: list, batch x word x character indices
        :param field_name: str, for error messages
        :param field_ele_dtype: must be np.int64 or np.float64
        :return: np.ndarray of shape (batch, max_sent_len, max_char_len)
        """
        if field_ele_dtype not in (np.int64, np.float64):
            raise TypeError('dtype of Field:{} should be np.int64 or np.float64 to do 2D padding, get {}.'.format(
                field_name, field_ele_dtype
            ))
        self._exactly_three_dims(contents, field_name)
        if self.pad_length < 1:
            # BUG FIX: the old max(max([...])) compared the per-sentence length
            # lists lexicographically and could pick a non-maximal word length,
            # silently truncating longer words.  Take the true global maximum.
            max_char_length = max(len(char_lst) for word_lst in contents for char_lst in word_lst)
        else:
            max_char_length = self.pad_length
        max_sent_length = max(len(word_lst) for word_lst in contents)
        batch_size = len(contents)
        dtype = type(contents[0][0][0])

        padded_array = np.full((batch_size, max_sent_length, max_char_length), fill_value=self.pad_val,
                               dtype=dtype)
        for b_idx, word_lst in enumerate(contents):
            for c_idx, char_lst in enumerate(word_lst):
                chars = char_lst[:max_char_length]  # truncate when pad_length is fixed
                padded_array[b_idx, c_idx, :len(chars)] = chars

        return padded_array

+ 0
- 0
reproduction/Chinese_word_segmentation/__init__.py View File


reproduction/chinese_word_segment/models/cws_transformer.py → reproduction/Chinese_word_segmentation/models/cws_transformer.py View File

@@ -39,7 +39,6 @@ class TransformerCWS(nn.Module):
allowed_transitions=allowed_trans) allowed_transitions=allowed_trans)


def forward(self, chars, target, seq_lens, bigrams=None): def forward(self, chars, target, seq_lens, bigrams=None):
seq_lens = seq_lens
masks = seq_len_to_byte_mask(seq_lens).float() masks = seq_len_to_byte_mask(seq_lens).float()
x = self.embedding(chars) x = self.embedding(chars)
batch_size = x.size(0) batch_size = x.size(0)
@@ -59,8 +58,59 @@ class TransformerCWS(nn.Module):


return pred_dict return pred_dict


def predict(self, chars, seq_lens, bigrams=None):
masks = seq_len_to_byte_mask(seq_lens).float()

x = self.embedding(chars)
batch_size = x.size(0)
length = x.size(1)
if hasattr(self, 'bigram_embedding'):
bigrams = self.bigram_embedding(bigrams) # batch_size x seq_lens x per_char x embed_size
x = torch.cat([x, bigrams.view(batch_size, length, -1)], dim=-1)
self.drop(x)
x = self.fc1(x)
feats = self.transformer(x, masks)
feats = self.fc2(feats)

probs = self.crf.viterbi_decode(feats, masks, get_score=False)

return {'pred': probs, 'seq_lens':seq_lens}


class NoamOpt(torch.optim.Optimizer):
    "Optim wrapper that implements rate."

    def __init__(self, model_size, factor, warmup, optimizer):
        # NOTE(review): registers a single dummy parameter so that the
        # torch.optim.Optimizer base __init__ accepts this wrapper; the real
        # parameters live inside the wrapped `optimizer`.
        super().__init__([torch.nn.Parameter(torch.ones(1))], {})

        self.optimizer = optimizer    # wrapped optimizer that performs the updates
        self._step = 0                # number of step() calls made so far
        self.warmup = warmup          # warm-up steps before the rate starts decaying
        self.factor = factor          # overall learning-rate scale
        self.model_size = model_size  # d_model from "Attention Is All You Need"
        self._rate = 0                # last learning rate that was applied

    def step(self, **kwargs):
        "Update parameters and rate"
        # Recompute the schedule rate, push it into every param group of the
        # wrapped optimizer, then delegate the actual parameter update.
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()

    def rate(self, step=None):
        "Implement `lrate` above"
        # lr = factor * d_model^-0.5 * min(step^-0.5, step * warmup^-1.5)
        # (linear warm-up followed by inverse-sqrt decay)
        if step is None:
            step = self._step
        return self.factor * \
            (self.model_size ** (-0.5) *
             min(step ** (-0.5), step * self.warmup ** (-1.5)))



if __name__ == '__main__': if __name__ == '__main__':


transformer = TransformerCWS(10, embed_dim=100, bigram_vocab_num=10, bigram_embed_dim=100, num_bigram_per_char=8, transformer = TransformerCWS(10, embed_dim=100, bigram_vocab_num=10, bigram_embed_dim=100, num_bigram_per_char=8,
hidden_size=200, embed_drop_p=0.3, num_layers=1, num_heads=8, tag_size=4) hidden_size=200, embed_drop_p=0.3, num_layers=1, num_heads=8, tag_size=4)
chars = torch.randint(10, size=(4, 7)).long() chars = torch.randint(10, size=(4, 7)).long()
@@ -68,4 +118,8 @@ if __name__ == '__main__':
seq_lens = torch.ones(4).long()*7 seq_lens = torch.ones(4).long()*7
target = torch.randint(4, size=(4, 7)) target = torch.randint(4, size=(4, 7))


print(transformer(chars, target, seq_lens, bigrams))
print(transformer(chars, target, seq_lens, bigrams))

optimizer = torch.optim.Adam(transformer.parameters())

opt = NoamOpt(10 ,1, 400, optimizer)

+ 0
- 0
reproduction/__init__.py View File


+ 61
- 0
test/core/test_fieldarray.py View File

@@ -97,3 +97,64 @@ class TestFieldArray(unittest.TestCase):
fa.append([1.2, 2.3, 3.4, 4.5, 5.6]) fa.append([1.2, 2.3, 3.4, 4.5, 5.6])
self.assertEqual(len(fa), 3) self.assertEqual(len(fa), 3)
self.assertEqual(fa[2], [1.2, 2.3, 3.4, 4.5, 5.6]) self.assertEqual(fa[2], [1.2, 2.3, 3.4, 4.5, 5.6])


class TestPadder(unittest.TestCase):

    def test01(self):
        """
        AutoPadder should stack scalars, pad flat int lists to the batch
        maximum, and leave strings / deeper-nested contents unpadded.
        :return:
        """
        from fastNLP.core.fieldarray import AutoPadder
        padder = AutoPadder()
        # BUG FIX: np.str was a deprecated alias of the builtin str and was
        # removed in NumPy 1.20 -- pass str directly.
        content = ['This is a str', 'this is another str']
        self.assertListEqual(content, padder(content, None, str).tolist())

        content = [1, 2]
        self.assertListEqual(content, padder(content, None, np.int64).tolist())

        content = [[1, 2], [3], [4]]
        self.assertListEqual([[1, 2], [3, 0], [4, 0]],
                             padder(content, None, np.int64).tolist())

        # 3-D contents are out of AutoPadder's scope and returned unpadded
        contents = [
            [[1, 2, 3], [4, 5], [7, 8, 9, 10]],
            [[1]]
        ]
        print(padder(contents, None, np.int64))

    def test02(self):
        """
        EngChar2DPadder should reject anything that is not exactly 3-D and pad
        both the word and the character dimension.
        :return:
        """
        from fastNLP.core.fieldarray import EngChar2DPadder
        padder = EngChar2DPadder(pad_length=0)

        contents = [1, 2]
        # 1-D input is rejected
        with self.assertRaises(ValueError):
            padder(contents, None, np.int64)
        contents = [[1, 2]]
        # 2-D input is rejected
        with self.assertRaises(ValueError):
            padder(contents, None, np.int64)
        contents = [[[[1, 2]]]]
        # more than 3 dimensions is rejected
        with self.assertRaises(ValueError):
            padder(contents, None, np.int64)

        contents = [
            [[1, 2, 3], [4, 5], [7, 8, 9, 10]],
            [[1]]
        ]
        self.assertListEqual(
            [[[1, 2, 3, 0], [4, 5, 0, 0], [7, 8, 9, 10]],
             [[1, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]],
            padder(contents, None, np.int64).tolist())

        # fixed pad_length pads/truncates every word to exactly that length
        padder = EngChar2DPadder(pad_length=5, pad_val=-100)
        self.assertListEqual(
            [[[1, 2, 3, -100, -100], [4, 5, -100, -100, -100], [7, 8, 9, 10, -100]],
             [[1, -100, -100, -100, -100], [-100, -100, -100, -100, -100], [-100, -100, -100, -100, -100]]],
            padder(contents, None, np.int64).tolist()
        )

+ 370
- 0
tutorials/fastNLP_padding_tutorial.ipynb View File

@@ -0,0 +1,370 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/yh/miniconda2/envs/python3/lib/python3.6/site-packages/tqdm/autonotebook/__init__.py:14: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n",
" \" (e.g. in jupyter console)\", TqdmExperimentalWarning)\n"
]
},
{
"data": {
"text/plain": [
"DataSet({'raw_sent': this is a bad idea . type=str,\n",
"'label': 0 type=int,\n",
"'word_str_lst': ['this', 'is', 'a', 'bad', 'idea', '.'] type=list,\n",
"'words': [4, 2, 5, 6, 7, 3] type=list},\n",
"{'raw_sent': it is great . type=str,\n",
"'label': 1 type=int,\n",
"'word_str_lst': ['it', 'is', 'great', '.'] type=list,\n",
"'words': [8, 2, 9, 3] type=list})"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 假设有以下的DataSet, 这里只是为了举例所以只选择了两个sample\n",
"import sys\n",
"import os\n",
"sys.path.append('/Users/yh/Desktop/fastNLP/fastNLP')\n",
"\n",
"from fastNLP import DataSet\n",
"from fastNLP import Instance\n",
"from fastNLP import Vocabulary\n",
"\n",
"dataset = DataSet()\n",
"dataset.append(Instance(raw_sent='This is a bad idea .', label=0))\n",
"dataset.append(Instance(raw_sent='It is great .', label=1))\n",
"\n",
"# 按照fastNLP_10min_tutorial.ipynb的步骤,对数据进行一些处理。这里为了演示padding操作,把field的名称做了一些改变\n",
"dataset.apply(lambda x:x['raw_sent'].lower(), new_field_name='raw_sent')\n",
"dataset.apply(lambda x:x['raw_sent'].split(), new_field_name='word_str_lst')\n",
"\n",
"# 建立Vocabulary\n",
"word_vocab = Vocabulary()\n",
"dataset.apply(lambda x:word_vocab.update(x['word_str_lst']))\n",
"dataset.apply(lambda x:[word_vocab.to_index(word) for word in x['word_str_lst']], new_field_name='words')\n",
"\n",
"# 检查以下是否得到我们想要的结果了\n",
"dataset[:2]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"batch_x has: {'word_str_lst': array([list(['this', 'is', 'a', 'bad', 'idea', '.']),\n",
" list(['it', 'is', 'great', '.'])], dtype=object), 'words': tensor([[4, 2, 5, 6, 7, 3],\n",
" [8, 2, 9, 3, 0, 0]])}\n",
"batch_y has: {'label': tensor([0, 1])}\n"
]
},
{
"data": {
"text/plain": [
"'\"\\n结果中\\n Batch会对元素类型(元素即最内层的数据,raw_sent为str,word_str_lst为str,words为int, label为int)为int或者float的数据进行默认\\n padding,而非int或float的则不进行padding。但若每个Instance中该field为二维数据,也不进行padding。因为二维数据的padding涉及到\\n 两个维度的padding,不容易自动判断padding的形式。\\n'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 将field设置为input或者target\n",
"dataset.set_input('word_str_lst')\n",
"dataset.set_input('words')\n",
"dataset.set_target('label')\n",
"\n",
"# 使用Batch取出batch数据\n",
"from fastNLP.core.batch import Batch\n",
"from fastNLP.core.sampler import RandomSampler\n",
"\n",
"batch_iterator = Batch(dataset=dataset, batch_size=2, sampler=RandomSampler())\n",
"for batch_x, batch_y in batch_iterator:\n",
" print(\"batch_x has: \", batch_x)\n",
" print(\"batch_y has: \", batch_y)\n",
"\"\"\"\n",
"结果中\n",
" Batch会对元素类型(元素即最内层的数据,raw_sent为str,word_str_lst为str,words为int, label为int)为int或者float的数据进行默认\n",
" padding,而非int或float的则不进行padding。但若每个Instance中该field为二维数据,也不进行padding。因为二维数据的padding涉及到\n",
" 两个维度的padding,不容易自动判断padding的形式。\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"batch_x has: {'word_str_lst': array([list(['it', 'is', 'great', '.']),\n",
" list(['this', 'is', 'a', 'bad', 'idea', '.'])], dtype=object), 'words': tensor([[ 8, 2, 9, 3, -100, -100],\n",
" [ 4, 2, 5, 6, 7, 3]])}\n",
"batch_y has: {'label': tensor([1, 0])}\n"
]
}
],
"source": [
"# 所有的pad_val都默认为0,如果需要修改某一个field的默认pad值,可以通过DataSet.set_pad_val(field_name, pad_val)进行修改\n",
"# 若需要将word的padding修改为-100\n",
"dataset.set_pad_val('words', pad_val=-100)\n",
"batch_iterator = Batch(dataset=dataset, batch_size=2, sampler=RandomSampler())\n",
"for batch_x, batch_y in batch_iterator:\n",
" print(\"batch_x has: \", batch_x)\n",
" print(\"batch_y has: \", batch_y)\n",
"# pad的值修改为-100了"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DataSet({'raw_sent': this is a bad idea . type=str,\n",
"'label': 0 type=int,\n",
"'word_str_lst': ['this', 'is', 'a', 'bad', 'idea', '.'] type=list,\n",
"'words': [4, 2, 5, 6, 7, 3] type=list,\n",
"'char_str_lst': [['t', 'h', 'i', 's'], ['i', 's'], ['a'], ['b', 'a', 'd'], ['i', 'd', 'e', 'a'], ['.']] type=list,\n",
"'chars': [[4, 9, 2, 5], [2, 5], [3], [10, 3, 6], [2, 6, 7, 3], [8]] type=list},\n",
"{'raw_sent': it is great . type=str,\n",
"'label': 1 type=int,\n",
"'word_str_lst': ['it', 'is', 'great', '.'] type=list,\n",
"'words': [8, 2, 9, 3] type=list,\n",
"'char_str_lst': [['i', 't'], ['i', 's'], ['g', 'r', 'e', 'a', 't'], ['.']] type=list,\n",
"'chars': [[2, 4], [2, 5], [11, 12, 7, 3, 4], [8]] type=list})"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 若需要使用二维padding或指定padding方式,可以通过设置该field的padder实现,下面以英文的character padding为例。在某些场景下,可能想要\n",
"# 使用英文word的character作为特征,character的padding为二维padding,fastNLP默认只会进行一维padding。\n",
"\n",
"dataset.apply(lambda x: [[c for c in word] for word in x['word_str_lst']], new_field_name='char_str_lst')\n",
"char_vocab = Vocabulary()\n",
"dataset.apply(lambda x:[char_vocab.update(chars) for chars in x['char_str_lst']])\n",
"dataset.apply(lambda x:[[char_vocab.to_index(c) for c in chars] for chars in x['char_str_lst']],new_field_name='chars')\n",
"dataset[:2]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"batch_x has: {'word_str_lst': array([list(['this', 'is', 'a', 'bad', 'idea', '.']),\n",
" list(['it', 'is', 'great', '.'])], dtype=object), 'words': tensor([[ 4, 2, 5, 6, 7, 3],\n",
" [ 8, 2, 9, 3, -100, -100]]), 'chars': array([list([[4, 9, 2, 5], [2, 5], [3], [10, 3, 6], [2, 6, 7, 3], [8]]),\n",
" list([[2, 4], [2, 5], [11, 12, 7, 3, 4], [8]])], dtype=object)}\n",
"batch_y has: {'label': tensor([0, 1])}\n"
]
},
{
"data": {
"text/plain": [
"'\\n 其它field与之前的是相同的。chars因为存在两个维度需要padding,不能自动决定padding方式,所以直接输出了原始形式。\\n'"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 如果不针对二维的character指定padding方法\n",
"dataset.set_input('chars')\n",
"batch_iterator = Batch(dataset=dataset, batch_size=2, sampler=RandomSampler())\n",
"for batch_x, batch_y in batch_iterator:\n",
" print(\"batch_x has: \", batch_x)\n",
" print(\"batch_y has: \", batch_y)\n",
" \n",
"\"\"\"\n",
" 其它field与之前的是相同的。chars因为存在两个维度需要padding,不能自动决定padding方式,所以直接输出了原始形式。\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"batch_x has: {'word_str_lst': array([list(['this', 'is', 'a', 'bad', 'idea', '.']),\n",
" list(['it', 'is', 'great', '.'])], dtype=object), 'words': tensor([[ 4, 2, 5, 6, 7, 3],\n",
" [ 8, 2, 9, 3, -100, -100]]), 'chars': tensor([[[ 4, 9, 2, 5],\n",
" [ 2, 5, 0, 0],\n",
" [ 3, 0, 0, 0],\n",
" [10, 3, 6, 0],\n",
" [ 2, 6, 7, 3],\n",
" [ 8, 0, 0, 0]],\n",
"\n",
" [[ 2, 4, 0, 0],\n",
" [ 2, 5, 0, 0],\n",
" [11, 12, 7, 3],\n",
" [ 8, 0, 0, 0],\n",
" [ 0, 0, 0, 0],\n",
" [ 0, 0, 0, 0]]])}\n",
"batch_y has: {'label': tensor([0, 1])}\n"
]
},
{
"data": {
"text/plain": [
"'\\n chars被正确padding了\\n'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 若要使用二维padding,需要手动设置padding方式\n",
"from fastNLP.core.fieldarray import EngChar2DPadder\n",
"dataset.set_padder('chars', EngChar2DPadder())\n",
"batch_iterator = Batch(dataset=dataset, batch_size=2, sampler=RandomSampler())\n",
"for batch_x, batch_y in batch_iterator:\n",
" print(\"batch_x has: \", batch_x)\n",
" print(\"batch_y has: \", batch_y)\n",
" \n",
"\"\"\"\n",
" chars被正确padding了\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"batch_x has: {'raw_sent': ['this is a bad idea .', 'it is great . '], 'word_str_lst': array([list(['this', 'is', 'a', 'bad', 'idea', '.']),\n",
" list(['it', 'is', 'great', '.'])], dtype=object), 'words': tensor([[ 4, 2, 5, 6, 7, 3],\n",
" [ 8, 2, 9, 3, -100, -100]]), 'chars': tensor([[[ 4, 9, 2, 5],\n",
" [ 2, 5, 0, 0],\n",
" [ 3, 0, 0, 0],\n",
" [10, 3, 6, 0],\n",
" [ 2, 6, 7, 3],\n",
" [ 8, 0, 0, 0]],\n",
"\n",
" [[ 2, 4, 0, 0],\n",
" [ 2, 5, 0, 0],\n",
" [11, 12, 7, 3],\n",
" [ 8, 0, 0, 0],\n",
" [ 0, 0, 0, 0],\n",
" [ 0, 0, 0, 0]]])}\n",
"batch_y has: {'label': tensor([0, 1])}\n"
]
},
{
"data": {
"text/plain": [
"'\\n raw_sent正确输出,对应内容也进行了pad。\\n'"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 如果AutoPad与EngChar2DPadder不能满足需要,可以自己实现Padder对象。这里举一个例子,比如需要把raw_sentence pad到一样长\n",
"from fastNLP.core.fieldarray import PadderBase\n",
"\n",
"class PadStr(PadderBase):\n",
" def __init__(self, pad_val=' '):\n",
" super().__init__(pad_val=pad_val) #让父类管理pad_val的值,这样可以通过DataSet.set_pad_val()修改到该值\n",
" \n",
" def __call__(self, contents, field_name, field_ele_dtype):\n",
" \"\"\"\n",
" 如果以上面的例子举例,在raw_sent这个field进行pad时,传入的\n",
" contents:\n",
" [\n",
" 'This is a bad idea .',\n",
" 'It is great .'\n",
" ]\n",
" field_name: 'raw_sent',当前field的名称,主要用于帮助debug。\n",
" field_ele_dtype: np.str. 这个参数基本都用不上,是该field中内部元素的类型\n",
" \"\"\"\n",
" max_len = max([len(str_) for str_ in contents])\n",
" pad_strs = []\n",
" for content in contents:\n",
" pad_strs.append(content + (max_len-len(content))*self.pad_val)\n",
" return pad_strs\n",
"\n",
"dataset.set_input('raw_sent')\n",
"dataset.set_padder('raw_sent', PadStr())\n",
"batch_iterator = Batch(dataset=dataset, batch_size=2, sampler=RandomSampler())\n",
"for batch_x, batch_y in batch_iterator:\n",
" print(\"batch_x has: \", batch_x)\n",
" print(\"batch_y has: \", batch_y)\n",
"\n",
"\"\"\"\n",
" raw_sent正确输出,对应内容也进行了pad。\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

Loading…
Cancel
Save