From 661780b9757586d4bd56b0f8437cbc0b5d497eec Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Tue, 4 Dec 2018 10:54:09 +0800 Subject: [PATCH 1/2] Improve FieldArray. Support nested list and a list of np.array --- fastNLP/core/fieldarray.py | 90 +++++++++++++++++++++--------------- fastNLP/core/losses.py | 1 + test/core/test_fieldarray.py | 18 ++++++-- 3 files changed, 69 insertions(+), 40 deletions(-) diff --git a/fastNLP/core/fieldarray.py b/fastNLP/core/fieldarray.py index 1b1a89c1..a1ece0aa 100644 --- a/fastNLP/core/fieldarray.py +++ b/fastNLP/core/fieldarray.py @@ -11,7 +11,7 @@ class FieldArray(object): """ :param str name: the name of the FieldArray - :param list content: a list of int, float, or a list of list. + :param list content: a list of int, float, str or np.ndarray, or a list of list of one. :param int padding_val: the integer for padding. Default: 0. :param bool is_target: If True, this FieldArray is used to compute loss. :param bool is_input: If True, this FieldArray is used to the model input. @@ -27,35 +27,46 @@ class FieldArray(object): self.padding_val = padding_val self.is_target = is_target self.is_input = is_input + + self.BASIC_TYPES = (int, float, str, np.ndarray) + self.is_2d_list = False self.pytype = self._type_detection(content) self.dtype = self._map_to_np_type(self.pytype) - @staticmethod - def _type_detection(content): + def _type_detection(self, content): + """ + :param content: a list of int, float, str or np.ndarray, or a list of list of one. + :return type: one of int, float, str, np.ndarray + + """ if isinstance(content, list) and len(content) > 0 and isinstance(content[0], list): - # 2-D list - # TODO: refactor - type_set = set([type(item) for item in content[0]]) - else: - # 1-D list + # content is a 2-D list + type_set = set([self._type_detection(x) for x in content]) + if len(type_set) > 1: + raise RuntimeError("Cannot create FieldArray with more than one type. Provided {}".format(type_set)) + self.is_2d_list = True + return type_set.pop() + + elif isinstance(content, list): + # content is a 1-D list if len(content) == 0: raise RuntimeError("Cannot create FieldArray with an empty list.") type_set = set([type(item) for item in content]) - if len(type_set) == 1 and any(basic_type in type_set for basic_type in (str, int, float)): - return type_set.pop() - elif len(type_set) == 2 and float in type_set and int in type_set: - # up-cast int to float - for idx, _ in enumerate(content): - content[idx] = float(content[idx]) - return float + if len(type_set) == 1 and tuple(type_set)[0] in self.BASIC_TYPES: + return type_set.pop() + elif len(type_set) == 2 and float in type_set and int in type_set: + # up-cast int to float + return float + else: + raise RuntimeError("Cannot create FieldArray with type {}".format(*type_set)) else: - raise ValueError("Unsupported type conversion detected in FieldArray: {}".format(*type_set)) + raise RuntimeError("Cannot create FieldArray with type {}".format(type(content))) @staticmethod def _map_to_np_type(basic_type): - type_mapping = {int: np.int64, float: np.float64, str: np.str} + type_mapping = {int: np.int64, float: np.float64, str: np.str, np.ndarray: np.ndarray} return type_mapping[basic_type] def __repr__(self): @@ -64,29 +75,35 @@ class FieldArray(object): def append(self, val): """Add a new item to the tail of FieldArray. - :param val: int, float, str, or a list of them. + :param val: int, float, str, or a list of one. """ val_type = type(val) - if val_type is int and self.pytype is float: - # up-cast the appended value - val = float(val) - elif val_type is float and self.pytype is int: - # up-cast all other values in the content - for idx, _ in enumerate(self.content): - self.content[idx] = float(self.content[idx]) - self.pytype = float - self.dtype = self._map_to_np_type(self.pytype) - elif val_type is list: + if val_type == list: # shape check + if self.is_2d_list is False: + raise RuntimeError("Cannot append a list into a 1-D FieldArray. Please provide an element.") if len(val) == 0: - raise ValueError("Cannot append an empty list.") + raise RuntimeError("Cannot append an empty list.") + val_list_type = [type(_) for _ in val] # type check + if len(val_list_type) == 2 and int in val_list_type and float in val_list_type: + # up-cast int to float + val_type = float + elif len(val_list_type) == 1: + val_type = val_list_type[0] else: - if type(val[0]) != self.pytype: - raise ValueError( - "Cannot append a list of {}-type value into a {}-tpye FieldArray.". - format(type(val[0]), self.pytype)) - elif val_type != self.pytype: - raise ValueError("Cannot append a {}-type value into a {}-tpye FieldArray.".format(val_type, self.pytype)) - + raise RuntimeError("Cannot append a list of {}".format(val_list_type)) + else: + if self.is_2d_list is True: + raise RuntimeError("Cannot append a non-list into a 2-D list. Please provide a list.") + if val_type == float and self.pytype == int: + # up-cast + self.pytype = float + self.dtype = self._map_to_np_type(self.pytype) + elif val_type == int and self.pytype == float: + pass + elif val_type == self.pytype: + pass + else: + raise RuntimeError("Cannot append type {} into type {}".format(val_type, self.pytype)) self.content.append(val) def __getitem__(self, indices): @@ -102,7 +119,6 @@ class FieldArray(object): :param indices: an int, or a list of int. :return: """ - # TODO: 返回行为不一致,有隐患 if isinstance(indices, int): return self.content[indices] assert self.is_input is True or self.is_target is True diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index f2fb16d0..af3d2ef0 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -126,6 +126,7 @@ class LossBase(object): for keys, val in target_dict.items(): param_val_dict.update({keys: val}) + # TODO: use the origin key to raise error if not self._checked: for keys in args: if param_map[keys] not in param_val_dict.keys(): diff --git a/test/core/test_fieldarray.py b/test/core/test_fieldarray.py index 883e1136..0264c2ff 100644 --- a/test/core/test_fieldarray.py +++ b/test/core/test_fieldarray.py @@ -24,19 +24,31 @@ class TestFieldArray(unittest.TestCase): def test_type_conversion(self): fa = FieldArray("x", [1.2, 2.2, 3, 4, 5], is_input=True) self.assertEqual(fa.pytype, float) - self.assertEqual(fa.dtype, np.double) + self.assertEqual(fa.dtype, np.float64) fa = FieldArray("x", [1, 2, 3, 4, 5], is_input=True) fa.append(1.3333) self.assertEqual(fa.pytype, float) - self.assertEqual(fa.dtype, np.double) + self.assertEqual(fa.dtype, np.float64) fa = FieldArray("y", [1.1, 2.2, 3.3, 4.4, 5.5], is_input=False) fa.append(10) self.assertEqual(fa.pytype, float) - self.assertEqual(fa.dtype, np.double) + self.assertEqual(fa.dtype, np.float64) fa = FieldArray("y", ["a", "b", "c", "d"], is_input=False) fa.append("e") self.assertEqual(fa.dtype, np.str) self.assertEqual(fa.pytype, str) + + def test_support_np_array(self): + fa = FieldArray("y", [np.array([1.1, 2.2, 3.3, 4.4, 5.5])], is_input=False) + self.assertEqual(fa.dtype, np.ndarray) + + fa.append(np.array([1.1, 2.2, 3.3, 4.4, 5.5])) + self.assertEqual(fa.pytype, np.ndarray) + + def test_nested_list(self): + fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1.1, 2.2, 3.3, 4.4, 5.5]], is_input=False) + self.assertEqual(fa.pytype, float) + self.assertEqual(fa.dtype, np.float64) From 4b099bb0ddee13e3414a18f1eccd19ecd9286248 Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Tue, 4 Dec 2018 11:16:24 +0800 Subject: [PATCH 2/2] * add tqdm in requirements.txt * fix FieldArray type check bugs --- fastNLP/core/fieldarray.py | 4 ++-- requirements.txt | 1 + test/core/test_trainer.py | 24 ++++++++++++++++++------ 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/fastNLP/core/fieldarray.py b/fastNLP/core/fieldarray.py index a1ece0aa..0a94b26c 100644 --- a/fastNLP/core/fieldarray.py +++ b/fastNLP/core/fieldarray.py @@ -83,12 +83,12 @@ class FieldArray(object): raise RuntimeError("Cannot append a list into a 1-D FieldArray. Please provide an element.") if len(val) == 0: raise RuntimeError("Cannot append an empty list.") - val_list_type = [type(_) for _ in val] # type check + val_list_type = set([type(_) for _ in val]) # type check if len(val_list_type) == 2 and int in val_list_type and float in val_list_type: # up-cast int to float val_type = float elif len(val_list_type) == 1: - val_type = val_list_type[0] + val_type = val_list_type.pop() else: raise RuntimeError("Cannot append a list of {}".format(val_list_type)) else: diff --git a/requirements.txt b/requirements.txt index 91a3f040..60ab7849 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ numpy>=1.14.2 torch>=0.4.0 tensorboardX +tqdm \ No newline at end of file diff --git a/test/core/test_trainer.py b/test/core/test_trainer.py index ed4cc38d..2b14aa11 100644 --- a/test/core/test_trainer.py +++ b/test/core/test_trainer.py @@ -1,8 +1,8 @@ import unittest import numpy as np -from torch import nn import torch.nn.functional as F +from torch import nn from fastNLP.core.dataset import DataSet from fastNLP.core.instance import Instance @@ -26,6 +26,7 @@ def prepare_fake_dataset(): [Instance(x=[float(item[0]), float(item[1])], y=[1.0]) for item in class_B]) return data_set + def prepare_fake_dataset2(*args, size=100): ys = np.random.randint(4, size=100) data = {'y': ys} @@ -33,6 +34,7 @@ def prepare_fake_dataset2(*args, size=100): data[arg] = np.random.randn(size, 5) return DataSet(data=data) + class TrainerTestGround(unittest.TestCase): def test_case(self): data_set = prepare_fake_dataset() @@ -55,15 +57,20 @@ class TrainerTestGround(unittest.TestCase): check_code_level=2, use_tqdm=True) trainer.train() + """ + # 应该正确运行 + """ def test_trainer_suggestion1(self): # 检查报错提示能否正确提醒用户。 # 这里没有传入forward需要的数据。需要trainer提醒用户如何设置。 dataset = prepare_fake_dataset2('x') + class Model(nn.Module): def __init__(self): super().__init__() self.fc = nn.Linear(5, 4) + def forward(self, x1, x2, y): x1 = self.fc(x1) x2 = self.fc(x2) @@ -72,10 +79,12 @@ class TrainerTestGround(unittest.TestCase): return {'loss': loss} model = Model() - trainer = Trainer( - train_data=dataset, - model=model - ) + + with self.assertRaises(NameError): + trainer = Trainer( + train_data=dataset, + model=model + ) """ # 应该获取到的报错提示 NameError: @@ -91,10 +100,12 @@ class TrainerTestGround(unittest.TestCase): # 这里传入forward需要的数据,看是否可以运行 dataset = prepare_fake_dataset2('x1', 'x2') dataset.set_input('x1', 'x2', 'y', flag=True) + class Model(nn.Module): def __init__(self): super().__init__() self.fc = nn.Linear(5, 4) + def forward(self, x1, x2, y): x1 = self.fc(x1) x2 = self.fc(x2) @@ -119,10 +130,12 @@ class TrainerTestGround(unittest.TestCase): # 这里传入forward需要的数据,但是forward没有返回loss这个key dataset = prepare_fake_dataset2('x1', 'x2') dataset.set_input('x1', 'x2', 'y', flag=True) + class Model(nn.Module): def __init__(self): super().__init__() self.fc = nn.Linear(5, 4) + def forward(self, x1, x2, y): x1 = self.fc(x1) x2 = self.fc(x2) @@ -142,7 +155,6 @@ class TrainerTestGround(unittest.TestCase): # 应该正确运行 """ - def test_case2(self): # check metrics Wrong data_set = prepare_fake_dataset2('x1', 'x2')