From d0354d8e2883b4433378c4a6a247972de664a06d Mon Sep 17 00:00:00 2001 From: ChenXin Date: Tue, 20 Aug 2019 00:00:31 +0800 Subject: [PATCH] fix some importing bugs --- fastNLP/__init__.py | 5 +- fastNLP/core/__init__.py | 2 +- fastNLP/core/field.py | 191 +++++++++++++++++++++------------------ 3 files changed, 108 insertions(+), 90 deletions(-) diff --git a/fastNLP/__init__.py b/fastNLP/__init__.py index a6767088..879fd644 100644 --- a/fastNLP/__init__.py +++ b/fastNLP/__init__.py @@ -14,6 +14,7 @@ __all__ = [ "Instance", "FieldArray", + "DataSetIter", "BatchIter", "TorchLoaderIter", @@ -31,6 +32,7 @@ __all__ = [ "TensorboardCallback", "LRScheduler", "ControlC", + "LRFinder", "Padder", "AutoPadder", @@ -43,7 +45,8 @@ __all__ = [ "Optimizer", "SGD", "Adam", - + "AdamW", + "Sampler", "SequentialSampler", "BucketSampler", diff --git a/fastNLP/core/__init__.py b/fastNLP/core/__init__.py index acf0efc4..4a43b73d 100644 --- a/fastNLP/core/__init__.py +++ b/fastNLP/core/__init__.py @@ -22,7 +22,7 @@ from .field import FieldArray, Padder, AutoPadder, EngChar2DPadder from .instance import Instance from .losses import LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, LossInForward from .metrics import AccuracyMetric, SpanFPreRecMetric, ExtractiveQAMetric -from .optimizer import Optimizer, SGD, Adam +from .optimizer import Optimizer, SGD, Adam, AdamW from .sampler import SequentialSampler, BucketSampler, RandomSampler, Sampler from .tester import Tester from .trainer import Trainer diff --git a/fastNLP/core/field.py b/fastNLP/core/field.py index 65bd9be4..26d22ada 100644 --- a/fastNLP/core/field.py +++ b/fastNLP/core/field.py @@ -1,4 +1,8 @@ - +__all__ = [ + "Padder", + "AutoPadder", + "EngChar2DPadder", +] from numbers import Number import torch @@ -9,24 +13,27 @@ from copy import deepcopy from collections import Counter from .utils import _is_iterable + class SetInputOrTargetException(Exception): def __init__(self, msg, index=None, field_name=None): super().__init__(msg) self.msg = msg self.index = index # 标示在哪个数据遭遇到问题了 - self.field_name = field_name # 标示当前field的名称 + self.field_name = field_name # 标示当前field的名称 + class AppendToTargetOrInputException(Exception): def __init__(self, msg, index=None, field_name=None): super().__init__(msg) self.msg = msg self.index = index # 标示在哪个数据遭遇到问题了 - self.field_name = field_name # 标示当前field的名称 + self.field_name = field_name # 标示当前field的名称 + class FieldArray: def __init__(self, name, content, is_target=False, is_input=False, padder=None, ignore_type=False, use_1st_ins_infer_dim_type=True): - if len(content)==0: + if len(content) == 0: raise RuntimeError("Empty fieldarray is not allowed.") _content = content try: @@ -43,34 +50,34 @@ class FieldArray: self._use_1st_ins_infer_dim_type = bool(use_1st_ins_infer_dim_type) self._is_input = False self._is_target = False - + if is_input: self.is_input = is_input if is_target: self.is_target = is_target - + if padder is None: padder = AutoPadder(pad_val=0) else: assert isinstance(padder, Padder), "padder must be of type fastNLP.Padder." padder = deepcopy(padder) self.set_padder(padder) - + @property def ignore_type(self): return self._ignore_type - + @ignore_type.setter def ignore_type(self, value): if value: self._cell_ndim = None self.dtype = None self._ignore_type = value - + @property def is_input(self): return self._is_input - + @is_input.setter def is_input(self, value): """ @@ -85,11 +92,11 @@ class FieldArray: self.dtype = None self._cell_ndim = None self._is_input = value - + @property def is_target(self): return self._is_target - + @is_target.setter def is_target(self, value): """ @@ -103,7 +110,7 @@ class FieldArray: self.dtype = None self._cell_ndim = None self._is_target = value - + def _check_dtype_and_ndim(self, only_check_1st_ins_dim_type=True): """ 检查当前content所有的element是否是同一个类型,且是否每个元素具有相同的维度。通过的话,设置_cell_ndim与_ele_type属性;没有 @@ -120,35 +127,37 @@ class FieldArray: for cell in self.content[1:]: index += 1 type_i, dim_i = _get_ele_type_and_dim(cell) - if type_i!=type_0: - raise SetInputOrTargetException("Type:{} in index {} is different from the first element with type:{}." - ".".format(type_i, index, type_0)) - if dim_0!=dim_i: - raise SetInputOrTargetException("Dimension:{} in index {} is different from the first element with " - "dimension:{}.".format(dim_i, index, dim_0)) + if type_i != type_0: + raise SetInputOrTargetException( + "Type:{} in index {} is different from the first element with type:{}." + ".".format(type_i, index, type_0)) + if dim_0 != dim_i: + raise SetInputOrTargetException( + "Dimension:{} in index {} is different from the first element with " + "dimension:{}.".format(dim_i, index, dim_0)) self._cell_ndim = dim_0 self.dtype = type_0 except SetInputOrTargetException as e: e.index = index raise e - - def append(self, val:Any): + + def append(self, val: Any): """ :param val: 把该val append到fieldarray。 :return: """ if (self._is_target or self._is_input) and self._ignore_type is False and not self._use_1st_ins_infer_dim_type: type_, dim_ = _get_ele_type_and_dim(val) - if self.dtype!=type_: + if self.dtype != type_: raise AppendToTargetOrInputException(f"Value(type:{type_}) are of different types with " f"previous values(type:{self.dtype}).") - if self._cell_ndim!=dim_: + if self._cell_ndim != dim_: raise AppendToTargetOrInputException(f"Value(dim:{dim_}) are of different dimensions with " f"previous values(dim:{self._cell_ndim}).") self.content.append(val) else: self.content.append(val) - + def pop(self, index): """ 删除该field中index处的元素 @@ -156,22 +165,22 @@ class FieldArray: :return: """ self.content.pop(index) - + def __getitem__(self, indices): return self.get(indices, pad=False) - + def __setitem__(self, idx, val): assert isinstance(idx, int) if (self._is_target or self._is_input) and self.ignore_type is False: # 需要检测类型 type_, dim_ = _get_ele_type_and_dim(val) - if self.dtype!=type_: + if self.dtype != type_: raise RuntimeError(f"Value(type:{type_}) are of different types with " - f"other values(type:{self.dtype}).") - if self._cell_ndim!=dim_: + f"other values(type:{self.dtype}).") + if self._cell_ndim != dim_: raise RuntimeError(f"Value(dim:{dim_}) are of different dimensions with " - f"previous values(dim:{self._cell_ndim}).") + f"previous values(dim:{self._cell_ndim}).") self.content[idx] = val - + def get(self, indices, pad=True): """ 根据给定的indices返回内容 @@ -184,16 +193,16 @@ class FieldArray: return self.content[indices] if self.is_input is False and self.is_target is False: raise RuntimeError("Please specify either is_input or is_target to True for {}".format(self.name)) - + contents = [self.content[i] for i in indices] if self.padder is None or pad is False: return np.array(contents) else: return self.pad(contents) - + def pad(self, contents): return self.padder(contents, field_name=self.name, field_ele_dtype=self.dtype, dim=self._cell_ndim) - + def set_padder(self, padder): """ 设置padder,在这个field进行pad的时候用这个padder进行pad,如果为None则不进行pad。 @@ -205,7 +214,7 @@ class FieldArray: self.padder = deepcopy(padder) else: self.padder = None - + def set_pad_val(self, pad_val): """ 修改padder的pad_val. @@ -215,7 +224,7 @@ class FieldArray: if self.padder is not None: self.padder.set_pad_val(pad_val) return self - + def __len__(self): """ Returns the size of FieldArray. @@ -223,7 +232,7 @@ class FieldArray: :return int length: """ return len(self.content) - + def to(self, other): """ 将other的属性复制给本FieldArray(other必须为FieldArray类型). @@ -233,15 +242,15 @@ class FieldArray: :return: :class:`~fastNLP.FieldArray` """ assert isinstance(other, FieldArray), "Only supports fastNLP.FieldArray type, not {}.".format(type(other)) - + self.ignore_type = other.ignore_type self.is_input = other.is_input self.is_target = other.is_target self.padder = other.padder - + return self - - def split(self, sep:str=None, inplace:bool=True): + + def split(self, sep: str = None, inplace: bool = True): """ 依次对自身的元素使用.split()方法,应该只有当本field的元素为str时,该方法才有用。将返回值 @@ -257,8 +266,8 @@ class FieldArray: print(f"Exception happens when process value in index {index}.") raise e return self._after_process(new_contents, inplace=inplace) - - def int(self, inplace:bool=True): + + def int(self, inplace: bool = True): """ 将本field中的值调用int(cell). 支持field中内容为以下两种情况(1)['1', '2', ...](即field中每个值为str的), (2) [['1', '2', ..], ['3', ..], ...](即field中每个值为一个list,list中的值会被依次转换。) @@ -277,7 +286,7 @@ class FieldArray: print(f"Exception happens when process value in index {index}.") print(e) return self._after_process(new_contents, inplace=inplace) - + def float(self, inplace=True): """ 将本field中的值调用float(cell). 支持field中内容为以下两种情况(1)['1', '2', ...](即field中每个值为str的), @@ -297,7 +306,7 @@ class FieldArray: print(f"Exception happens when process value in index {index}.") raise e return self._after_process(new_contents, inplace=inplace) - + def bool(self, inplace=True): """ 将本field中的值调用bool(cell). 支持field中内容为以下两种情况(1)['1', '2', ...](即field中每个值为str的), @@ -316,9 +325,9 @@ class FieldArray: except Exception as e: print(f"Exception happens when process value in index {index}.") raise e - + return self._after_process(new_contents, inplace=inplace) - + def lower(self, inplace=True): """ 将本field中的值调用cell.lower(). 支持field中内容为以下两种情况(1)['1', '2', ...](即field中每个值为str的), @@ -338,7 +347,7 @@ class FieldArray: print(f"Exception happens when process value in index {index}.") raise e return self._after_process(new_contents, inplace=inplace) - + def upper(self, inplace=True): """ 将本field中的值调用cell.lower(). 支持field中内容为以下两种情况(1)['1', '2', ...](即field中每个值为str的), @@ -358,7 +367,7 @@ class FieldArray: print(f"Exception happens when process value in index {index}.") raise e return self._after_process(new_contents, inplace=inplace) - + def value_count(self): """ 返回该field下不同value的数量。多用于统计label数量 @@ -366,17 +375,18 @@ class FieldArray: :return: Counter, key是label,value是出现次数 """ count = Counter() - + def cum(cell): if _is_iterable(cell) and not isinstance(cell, str): for cell_ in cell: cum(cell_) else: count[cell] += 1 + for cell in self.content: cum(cell) return count - + def _after_process(self, new_contents, inplace): """ 当调用处理函数之后,决定是否要替换field。 @@ -398,7 +408,7 @@ class FieldArray: return new_contents -def _get_ele_type_and_dim(cell:Any, dim=0): +def _get_ele_type_and_dim(cell: Any, dim=0): """ 识别cell的类别与dimension的数量 @@ -414,13 +424,13 @@ def _get_ele_type_and_dim(cell:Any, dim=0): elif isinstance(cell, list): dim += 1 res = [_get_ele_type_and_dim(cell_i, dim) for cell_i in cell] - types = set([i for i,j in res]) - dims = set([j for i,j in res]) - if len(types)>1: + types = set([i for i, j in res]) + dims = set([j for i, j in res]) + if len(types) > 1: raise SetInputOrTargetException("Mixed types detected: {}.".format(list(types))) - elif len(types)==0: + elif len(types) == 0: raise SetInputOrTargetException("Empty value encountered.") - if len(dims)>1: + if len(dims) > 1: raise SetInputOrTargetException("Mixed dimension detected: {}.".format(list(dims))) return types.pop(), dims.pop() elif isinstance(cell, torch.Tensor): @@ -431,16 +441,16 @@ def _get_ele_type_and_dim(cell:Any, dim=0): # 否则需要继续往下iterate dim += 1 res = [_get_ele_type_and_dim(cell_i, dim) for cell_i in cell] - types = set([i for i,j in res]) - dims = set([j for i,j in res]) - if len(types)>1: + types = set([i for i, j in res]) + dims = set([j for i, j in res]) + if len(types) > 1: raise SetInputOrTargetException("Mixed types detected: {}.".format(list(types))) - elif len(types)==0: + elif len(types) == 0: raise SetInputOrTargetException("Empty value encountered.") - if len(dims)>1: + if len(dims) > 1: raise SetInputOrTargetException("Mixed dimension detected: {}.".format(list(dims))) return types.pop(), dims.pop() - else: # 包含tuple, set, dict以及其它的类型 + else: # 包含tuple, set, dict以及其它的类型 raise SetInputOrTargetException(f"Cannot process type:{type(cell)}.") @@ -462,15 +472,15 @@ class Padder: :return: np.array([padded_element]) """ - + def __init__(self, pad_val=0, **kwargs): self.pad_val = pad_val - + def set_pad_val(self, pad_val): self.pad_val = pad_val - + @abstractmethod - def __call__(self, contents, field_name, field_ele_dtype, dim:int): + def __call__(self, contents, field_name, field_ele_dtype, dim: int): """ 传入的是List内容。假设有以下的DataSet。 @@ -537,23 +547,24 @@ class AutoPadder(Padder): 3 其它情况不进行处理,返回一个np.array类型。 """ + def __init__(self, pad_val=0): super().__init__(pad_val=pad_val) - + def __call__(self, contents, field_name, field_ele_dtype, dim): if field_ele_dtype: - if dim>3: + if dim > 3: return np.array(contents) if isinstance(field_ele_dtype, type) and \ (issubclass(field_ele_dtype, np.number) or issubclass(field_ele_dtype, Number)): - if dim==0: + if dim == 0: array = np.array(contents, dtype=field_ele_dtype) - elif dim==1: + elif dim == 1: max_len = max(map(len, contents)) array = np.full((len(contents), max_len), self.pad_val, dtype=field_ele_dtype) for i, content_i in enumerate(contents): array[i, :len(content_i)] = content_i - elif dim==2: + elif dim == 2: max_len = max(map(len, contents)) max_word_len = max([max([len(content_ii) for content_ii in content_i]) for content_i in contents]) @@ -563,20 +574,21 @@ class AutoPadder(Padder): array[i, j, :len(content_ii)] = content_ii else: shape = np.shape(contents) - if len(shape)==4: # 说明各dimension是相同的大小 + if len(shape) == 4: # 说明各dimension是相同的大小 array = np.array(contents, dtype=field_ele_dtype) else: - raise RuntimeError(f"Field:{field_name} has 3 dimensions, every sample should have the same shape.") + raise RuntimeError( + f"Field:{field_name} has 3 dimensions, every sample should have the same shape.") return array elif str(field_ele_dtype).startswith('torch'): - if dim==0: + if dim == 0: tensor = torch.tensor(contents).to(field_ele_dtype) - elif dim==1: + elif dim == 1: max_len = max(map(len, contents)) tensor = torch.full((len(contents), max_len), fill_value=self.pad_val, dtype=field_ele_dtype) for i, content_i in enumerate(contents): tensor[i, :len(content_i)] = torch.tensor(content_i) - elif dim==2: + elif dim == 2: max_len = max(map(len, contents)) max_word_len = max([max([len(content_ii) for content_ii in content_i]) for content_i in contents]) @@ -587,15 +599,18 @@ class AutoPadder(Padder): tensor[i, j, :len(content_ii)] = torch.tensor(content_ii) else: shapes = set([np.shape(content_i) for content_i in contents]) - if len(shapes)>1: - raise RuntimeError(f"Field:{field_name} has 3 dimensions, every sample should have the same shape.") + if len(shapes) > 1: + raise RuntimeError( + f"Field:{field_name} has 3 dimensions, every sample should have the same shape.") shape = shapes.pop() - if len(shape)==3: - tensor = torch.full([len(contents)]+list(shape), fill_value=self.pad_val, dtype=field_ele_dtype) + if len(shape) == 3: + tensor = torch.full([len(contents)] + list(shape), fill_value=self.pad_val, + dtype=field_ele_dtype) for i, content_i in enumerate(contents): tensor[i] = torch.tensor(content_i, dtype=field_ele_dtype) else: - raise RuntimeError(f"Field:{field_name} has 3 dimensions, every sample should have the same shape.") + raise RuntimeError( + f"Field:{field_name} has 3 dimensions, every sample should have the same shape.") return tensor else: return np.array(contents) # 不进行任何操作 @@ -626,7 +641,7 @@ class EngChar2DPadder(Padder): dataset.set_padder('chars', padder) # chars这个field的设置为了EnChar2DPadder """ - + def __init__(self, pad_val=0, pad_length=0): """ :param pad_val: int, pad的位置使用该index @@ -634,9 +649,9 @@ class EngChar2DPadder(Padder): 都pad或截取到该长度. """ super().__init__(pad_val=pad_val) - + self.pad_length = pad_length - + def __call__(self, contents, field_name, field_ele_dtype, dim): """ 期望输入类似于 @@ -655,7 +670,7 @@ class EngChar2DPadder(Padder): raise TypeError('dtype of Field:{} should be np.int64 or np.float64 to do 2D padding, get {}.'.format( field_name, field_ele_dtype )) - assert dim==2, f"Field:{field_name} has {dim}, EngChar2DPadder only supports input with 2 dimensions." + assert dim == 2, f"Field:{field_name} has {dim}, EngChar2DPadder only supports input with 2 dimensions." if self.pad_length < 1: max_char_length = max([max(len(char_lst) for char_lst in word_lst) for word_lst in contents]) else: @@ -663,12 +678,12 @@ class EngChar2DPadder(Padder): max_sent_length = max(len(word_lst) for word_lst in contents) batch_size = len(contents) dtype = type(contents[0][0][0]) - + padded_array = np.full((batch_size, max_sent_length, max_char_length), fill_value=self.pad_val, dtype=dtype) for b_idx, word_lst in enumerate(contents): for c_idx, char_lst in enumerate(word_lst): chars = char_lst[:max_char_length] padded_array[b_idx, c_idx, :len(chars)] = chars - + return padded_array