From 0257dc6dde4e13aea632267d5fcdae3a7576db08 Mon Sep 17 00:00:00 2001 From: yh Date: Thu, 6 Jun 2019 00:28:12 +0800 Subject: [PATCH 01/17] =?UTF-8?q?=E4=BF=AE=E5=A4=8DTrainer=E6=97=A0?= =?UTF-8?q?=E6=B3=95=E6=8D=95=E8=8E=B7Exception=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/trainer.py | 2 +- test/core/test_callbacks.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 57a31a69..d7694e00 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -532,7 +532,7 @@ class Trainer(object): self._train() self.callback_manager.on_train_end() - except Exception as e: + except BaseException as e: self.callback_manager.on_exception(e) if on_exception == 'auto': if not isinstance(e, (CallbackException, KeyboardInterrupt)): diff --git a/test/core/test_callbacks.py b/test/core/test_callbacks.py index e2aa5fa4..71a5565d 100644 --- a/test/core/test_callbacks.py +++ b/test/core/test_callbacks.py @@ -66,8 +66,7 @@ class TestCallback(unittest.TestCase): dev_data=data_set, metrics=AccuracyMetric(pred="predict", target="y"), callbacks=[EarlyStopCallback(5)]) - with self.assertRaises(EarlyStopError): - trainer.train() + trainer.train() def test_lr_scheduler(self): data_set, model = prepare_env() From e90bbbb3f1912b7e058f40a0a04e6ad377e38038 Mon Sep 17 00:00:00 2001 From: yh Date: Fri, 7 Jun 2019 23:28:05 +0800 Subject: [PATCH 02/17] =?UTF-8?q?=EF=BC=81=EF=BC=81=EF=BC=81=E9=87=8D?= =?UTF-8?q?=E8=A6=81=E6=9B=B4=E6=96=B0=EF=BC=8CDataSet=E7=90=86=E8=AE=BA?= =?UTF-8?q?=E4=B8=8A=E6=94=AF=E6=8C=81=E4=BB=BB=E6=84=8F=E7=B1=BB=E5=9E=8B?= =?UTF-8?q?=E7=9A=84=E6=95=B0=E6=8D=AE=E4=BA=86=EF=BC=8C=E4=BD=86=E6=98=AF?= =?UTF-8?q?=E5=9B=A0=E4=B8=BA=E6=94=B9=E5=8A=A8=E9=9D=9E=E5=B8=B8=E5=A4=A7?= =?UTF-8?q?=EF=BC=8C=E6=89=80=E4=BB=A5=E5=8F=AF=E8=83=BD=E4=BC=9A=E6=9C=89?= =?UTF-8?q?bug?= MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/batch.py | 17 +- fastNLP/core/dataset.py | 23 +- fastNLP/core/field.py | 572 ++++++++++++++++++------------------- fastNLP/io/embed_loader.py | 4 +- test/core/test_field.py | 205 +++++++++---- 5 files changed, 444 insertions(+), 377 deletions(-) diff --git a/fastNLP/core/batch.py b/fastNLP/core/batch.py index 109d4fe9..0ca920d4 100644 --- a/fastNLP/core/batch.py +++ b/fastNLP/core/batch.py @@ -12,6 +12,7 @@ from queue import Empty, Full import numpy as np import torch import torch.multiprocessing as mp +from numbers import Number from .sampler import RandomSampler @@ -78,8 +79,10 @@ class Batch(object): for field_name, field in self.dataset.get_all_fields().items(): if field.is_target or field.is_input: batch = field.get(indices) - if not self.as_numpy and field.padder is not None: - batch = _to_tensor(batch, field.dtype) + if not self.as_numpy and \ + field.dtype is not None and \ + issubclass(field.dtype, Number) and not isinstance(batch, torch.Tensor): + batch = _to_tensor(batch) if field.is_target: batch_y[field_name] = batch if field.is_input: @@ -174,12 +177,12 @@ class Batch(object): # print('iter done') -def _to_tensor(batch, dtype): +def _to_tensor(batch): try: - if dtype in (int, np.int8, np.int16, np.int32, np.int64): - batch = torch.LongTensor(batch) - if dtype in (float, np.float32, np.float64): - batch = torch.FloatTensor(batch) + if issubclass(batch.dtype.type, np.floating): + batch = torch.as_tensor(batch).float() # 默认使用float32 + else: + batch = torch.as_tensor(batch) # 复用内存地址,避免复制 except: pass return batch diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 9f24adf2..ab020ce4 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -285,7 +285,8 @@ from .field import AutoPadder from .field import FieldArray from .instance import Instance from .utils import _get_func_signature - +from .field import AppendToTargetOrInputException +from .field import 
SetInputOrTargetException class DataSet(object): """ @@ -422,7 +423,7 @@ class DataSet(object): if len(self.field_arrays) == 0: # DataSet has no field yet for name, field in instance.fields.items(): - field = field.tolist() if isinstance(field, np.ndarray) else field + # field = field.tolist() if isinstance(field, np.ndarray) else field self.field_arrays[name] = FieldArray(name, [field]) # 第一个样本,必须用list包装起来 else: if len(self.field_arrays) != len(instance.fields): @@ -431,7 +432,11 @@ class DataSet(object): .format(len(self.field_arrays), len(instance.fields))) for name, field in instance.fields.items(): assert name in self.field_arrays - self.field_arrays[name].append(field) + try: + self.field_arrays[name].append(field) + except AppendToTargetOrInputException as e: + print(f"Cannot append to field:{name}.") + raise e def add_fieldarray(self, field_name, fieldarray): """ @@ -565,7 +570,11 @@ class DataSet(object): assert isinstance(flag, bool), "Only bool type supported." for name in field_names: if name in self.field_arrays: - self.field_arrays[name].is_target = flag + try: + self.field_arrays[name].is_target = flag + except SetInputOrTargetException as e: + print(f"Cannot set field:{name} as target.") + raise e else: raise KeyError("{} is not a valid field name.".format(name)) @@ -581,7 +590,11 @@ class DataSet(object): """ for name in field_names: if name in self.field_arrays: - self.field_arrays[name].is_input = flag + try: + self.field_arrays[name].is_input = flag + except SetInputOrTargetException as e: + print(f"Cannot set field:{name} as input.") + raise e else: raise KeyError("{} is not a valid field name.".format(name)) diff --git a/fastNLP/core/field.py b/fastNLP/core/field.py index 9ef8d963..c47771df 100644 --- a/fastNLP/core/field.py +++ b/fastNLP/core/field.py @@ -1,251 +1,162 @@ -""" -field模块实现了 FieldArray 和若干 Padder。 FieldArray 是 :class:`~fastNLP.DataSet` 中一列的存储方式, -原理部分请参考 :doc:`fastNLP.core.dataset` - -""" -__all__ = [ - "FieldArray", - "Padder", 
- "AutoPadder", - "EngChar2DPadder" -] -from copy import deepcopy +from numbers import Number +import torch import numpy as np +from typing import Any +from abc import abstractmethod +from copy import deepcopy - -class FieldArray(object): - """ - 别名::class:`fastNLP.FieldArray` :class:`fastNLP.core.field.FieldArray` - - FieldArray 是用于保存 :class:`~fastNLP.DataSet` 中一个field的类型。 - - :param str name: FieldArray的名称 - :param list,numpy.ndarray content: 列表的元素可以为list,int,float, - :param bool is_target: 这个field是否是一个target field。 - :param bool is_input: 这个field是否是一个input field。 - :param padder: :class:`~fastNLP.Padder` 类型。赋值给fieldarray的padder的对象会被deepcopy一份,需要修改padder参数必须通过 - fieldarray.set_pad_val()。默认为None,即使用 :class:`~fastNLP.AutoPadder` 。 - :param bool ignore_type: 是否忽略该field的type,一般如果这个field不需要转为torch.FloatTensor或torch.LongTensor, - 就可以设置为True。具体意义请参考 :class:`~fastNLP.DataSet` 。 - """ - - def __init__(self, name, content, is_target=None, is_input=None, padder=None, ignore_type=False): +class SetInputOrTargetException(Exception): + def __init__(self, msg, index=None, field_name=None): + super().__init__(msg) + self.msg = msg + self.index = index # 标示在哪个数据遭遇到问题了 + self.field_name = field_name # 标示当前field的名称 + +class AppendToTargetOrInputException(Exception): + def __init__(self, msg, index=None, field_name=None): + super().__init__(msg) + self.msg = msg + self.index = index # 标示在哪个数据遭遇到问题了 + self.field_name = field_name # 标示当前field的名称 + +class FieldArray: + def __init__(self, name, content, is_target=False, is_input=False, padder=None, ignore_type=False): + if len(content)==0: + raise RuntimeError("Empty fieldarray is not allowed.") + _content = content + try: + _content = list(_content) + except BaseException as e: + print(f"Cannot convert content(of type:{type(content)}) into list.") + raise e self.name = name - if isinstance(content, list): - # 如果DataSet使用dict初始化, content 可能是二维list/二维array/三维list - # 如果DataSet使用list of Instance 初始化, content可能是 [list]/[array]/[2D list] - 
for idx, item in enumerate(content): - # 这是使用list of Instance 初始化时第一个样本:FieldArray(name, [field]) - # 将[np.array] 转化为 list of list - # 也可以支持[array, array, array]的情况 - if isinstance(item, np.ndarray): - content[idx] = content[idx].tolist() - elif isinstance(content, np.ndarray): - content = content.tolist() # convert np.ndarray into 2-D list - else: - raise TypeError("content in FieldArray can only be list or numpy.ndarray, got {}.".format(type(content))) - if len(content) == 0: - raise RuntimeError("Cannot initialize FieldArray with empty list.") - - self.content = content # 1维 或 2维 或 3维 list, 形状可能不对齐 - self.content_dim = None # 表示content是多少维的list + self.content = _content + self._ignore_type = ignore_type + # 根据input的情况设置input,target等 + self._cell_ndim = None # 多少维度 + self.dtype = None # 最内层的element都是什么类型的 + self._is_input = False + self._is_target = False + + if is_input: + self.is_input = is_input + if is_target: + self.is_target = is_target + if padder is None: padder = AutoPadder(pad_val=0) else: - assert isinstance(padder, Padder), "padder must be of type Padder." + assert isinstance(padder, Padder), "padder must be of type fastNLP.Padder." 
padder = deepcopy(padder) self.set_padder(padder) - self.ignore_type = ignore_type - - self.BASIC_TYPES = (int, float, str) # content中可接受的Python基本类型,这里没有np.array - - self.pytype = None - self.dtype = None - self._is_input = None - self._is_target = None - - if is_input is not None or is_target is not None: - self.is_input = is_input - self.is_target = is_target - - def _set_dtype(self): - if self.ignore_type is False: - self.pytype = self._type_detection(self.content) - self.dtype = self._map_to_np_type(self.pytype) - + + @property + def ignore_type(self): + return self._ignore_type + + @ignore_type.setter + def ignore_type(self, value): + if value: + self._cell_ndim = None + self.dtype = None + @property def is_input(self): return self._is_input - + @is_input.setter def is_input(self, value): """ 当 field_array.is_input = True / False 时被调用 """ - if value is True: - self._set_dtype() + # 如果(value为True)且(_is_input和_is_target都是False)且(ignore_type为False) + if value is True and \ + self._is_target is False and \ + self._ignore_type is False: + self._check_dtype_and_ndim() + if value is False and self._is_target is False: + self.dtype = None + self._cell_ndim = None self._is_input = value - + @property def is_target(self): return self._is_target - + @is_target.setter def is_target(self, value): """ 当 field_array.is_target = True / False 时被调用 """ - if value is True: - self._set_dtype() + if value is True and \ + self._is_input is False and \ + self._ignore_type is False: + self._check_dtype_and_ndim() + if value is False and self._is_input is False: + self.dtype = None + self._cell_ndim = None self._is_target = value - - def _type_detection(self, content): - """ - 当该field被设置为is_input或者is_target时被调用 + def _check_dtype_and_ndim(self): """ - if len(content) == 0: - raise RuntimeError("Empty list in Field {}.".format(self.name)) - - type_set = set([type(item) for item in content]) - - if list in type_set: - if len(type_set) > 1: - # list 跟 非list 混在一起 - raise 
RuntimeError("Mixed data types in Field {}: {}".format(self.name, list(type_set))) - # >1维list - inner_type_set = set() - for l in content: - [inner_type_set.add(type(obj)) for obj in l] - if list not in inner_type_set: - # 二维list - self.content_dim = 2 - return self._basic_type_detection(inner_type_set) - else: - if len(inner_type_set) == 1: - # >2维list - inner_inner_type_set = set() - for _2d_list in content: - for _1d_list in _2d_list: - [inner_inner_type_set.add(type(obj)) for obj in _1d_list] - if list in inner_inner_type_set: - raise RuntimeError("FieldArray cannot handle 4-D or more-D list.") - # 3维list - self.content_dim = 3 - return self._basic_type_detection(inner_inner_type_set) - else: - # list 跟 非list 混在一起 - raise RuntimeError("Mixed data types in Field {}: {}".format(self.name, list(inner_type_set))) - else: - # 一维list - for content_type in type_set: - if content_type not in self.BASIC_TYPES: - raise RuntimeError("Unexpected data type in Field '{}'. Expect one of {}. Got {}.".format( - self.name, self.BASIC_TYPES, content_type)) - self.content_dim = 1 - return self._basic_type_detection(type_set) - - def _basic_type_detection(self, type_set): + 检查当前content所有的element是否是同一个类型,且是否每个元素具有相同的维度。通过的话,设置_cell_ndim与_ele_type属性;没有 + 通过将直接报错. + + :return: """ - :param type_set: a set of Python types - :return: one of self.BASIC_TYPES + cell_0 = self.content[0] + index = 0 + try: + type_0, dim_0 = _get_ele_type_and_dim(cell_0) + for cell in self.content[1:]: + index += 1 + type_i, dim_i = _get_ele_type_and_dim(cell) + if type_i!=type_0: + raise SetInputOrTargetException("Type:{} in index {} is different from the first element with type:{}." 
+ ".".format(type_i, index, type_0)) + if dim_0!=dim_i: + raise SetInputOrTargetException("Dimension:{} in index {} is different from the first element with " + "dimension:{}.".format(dim_i, index, dim_0)) + self._cell_ndim = dim_0 + self.dtype = type_0 + except SetInputOrTargetException as e: + e.index = index + raise e + + def append(self, val:Any): + """ + :param val: 把该val append到fieldarray。 + :return: """ - if len(type_set) == 1: - return type_set.pop() - elif len(type_set) == 2: - # 有多个basic type; 可能需要up-cast - if float in type_set and int in type_set: - # up-cast int to float - return float - else: - # str 跟 int 或者 float 混在一起 - raise RuntimeError("Mixed data types in Field {}: {}".format(self.name, list(type_set))) + if (self._is_target or self._is_input) and self._ignore_type is False: + type_, dim_ = _get_ele_type_and_dim(val) + if self.dtype!=type_: + raise AppendToTargetOrInputException(f"Value(type:{type_}) are of different types with " + f"previous values(type:{self.dtype}).") + if self._cell_ndim!=dim_: + raise AppendToTargetOrInputException(f"Value(dim:{dim_}) are of different dimensions with " + f"previous values(dim:{self._cell_ndim}).") + self.content.append(val) else: - # str, int, float混在一起 - raise RuntimeError("Mixed data types in Field {}: {}".format(self.name, list(type_set))) - - def _1d_list_check(self, val): - """如果不是1D list就报错 - """ - type_set = set((type(obj) for obj in val)) - if any(obj not in self.BASIC_TYPES for obj in type_set): - raise ValueError("Mixed data types in Field {}: {}".format(self.name, list(type_set))) - self._basic_type_detection(type_set) - # otherwise: _basic_type_detection will raise error - return True - - def _2d_list_check(self, val): - """如果不是2D list 就报错 - """ - type_set = set(type(obj) for obj in val) - if list(type_set) != [list]: - raise ValueError("Mixed data types in Field {}: {}".format(self.name, type_set)) - inner_type_set = set() - for l in val: - for obj in l: - inner_type_set.add(type(obj)) - 
self._basic_type_detection(inner_type_set) - return True - - @staticmethod - def _map_to_np_type(basic_type): - type_mapping = {int: np.int64, float: np.float64, str: np.str, np.ndarray: np.ndarray} - return type_mapping[basic_type] - - def __repr__(self): - return "FieldArray {}: {}".format(self.name, self.content.__repr__()) - - def append(self, val): - """将val append到这个field的尾部。如果这个field已经被设置为input或者target,则在append之前会检查该类型是否与已有 - 的内容是匹配的。 - - :param Any val: 需要append的值。 - """ - if self.ignore_type is False: - if isinstance(val, list): - pass - elif isinstance(val, tuple): # 确保最外层是list - val = list(val) - elif isinstance(val, np.ndarray): - val = val.tolist() - elif any((isinstance(val, t) for t in self.BASIC_TYPES)): - pass - else: - raise RuntimeError( - "Unexpected data type {}. Should be list, np.array, or {}".format(type(val), self.BASIC_TYPES)) - - if self.is_input is True or self.is_target is True: - if type(val) == list: - if len(val) == 0: - raise ValueError("Cannot append an empty list.") - if self.content_dim == 2 and self._1d_list_check(val): - # 1维list检查 - pass - elif self.content_dim == 3 and self._2d_list_check(val): - # 2维list检查 - pass - else: - raise RuntimeError( - "Dimension not matched: expect dim={}, got {}.".format(self.content_dim - 1, val)) - elif type(val) in self.BASIC_TYPES and self.content_dim == 1: - # scalar检查 - if type(val) == float and self.pytype == int: - self.pytype = float - self.dtype = self._map_to_np_type(self.pytype) - else: - raise RuntimeError( - "Unexpected data type {}. 
Should be list, np.array, or {}".format(type(val), self.BASIC_TYPES)) - self.content.append(val) - + self.content.append(val) + def __getitem__(self, indices): return self.get(indices, pad=False) - + def __setitem__(self, idx, val): assert isinstance(idx, int) + if (self._is_target or self._is_input) and self.ignore_type is False: # 需要检测类型 + type_, dim_ = _get_ele_type_and_dim(val) + if self.dtype!=type_: + raise RuntimeError(f"Value(type:{type_}) are of different types with " + f"other values(type:{self.dtype}).") + if self._cell_ndim!=dim_: + raise RuntimeError(f"Value(dim:{dim_}) are of different dimensions with " + f"previous values(dim:{self._cell_ndim}).") self.content[idx] = val - + def get(self, indices, pad=True): """ 根据给定的indices返回内容 @@ -257,14 +168,14 @@ class FieldArray(object): if isinstance(indices, int): return self.content[indices] if self.is_input is False and self.is_target is False: - raise RuntimeError("Please specify either is_input or is_target is True for {}".format(self.name)) - + raise RuntimeError("Please specify either is_input or is_target to True for {}".format(self.name)) + contents = [self.content[i] for i in indices] if self.padder is None or pad is False: return np.array(contents) else: - return self.padder(contents, field_name=self.name, field_ele_dtype=self.dtype) - + return self.padder(contents, field_name=self.name, field_ele_dtype=self.dtype, dim=self._cell_ndim) + def set_padder(self, padder): """ 设置padder,在这个field进行pad的时候用这个padder进行pad,如果为None则不进行pad。 @@ -276,7 +187,7 @@ class FieldArray(object): self.padder = deepcopy(padder) else: self.padder = None - + def set_pad_val(self, pad_val): """ 修改padder的pad_val. @@ -286,7 +197,7 @@ class FieldArray(object): if self.padder is not None: self.padder.set_pad_val(pad_val) return self - + def __len__(self): """ Returns the size of FieldArray. 
@@ -294,7 +205,7 @@ class FieldArray(object): :return int length: """ return len(self.content) - + def to(self, other): """ 将other的属性复制给本FieldArray(other必须为FieldArray类型). @@ -303,22 +214,63 @@ class FieldArray(object): :param other: :class:`~fastNLP.FieldArray` 从哪个field拷贝属性 :return: :class:`~fastNLP.FieldArray` """ - assert isinstance(other, FieldArray), "Only support FieldArray type, not {}.".format(type(other)) - + assert isinstance(other, FieldArray), "Only supports fastNLP.FieldArray type, not {}.".format(type(other)) + + self.ignore_type = other.ignore_type self.is_input = other.is_input self.is_target = other.is_target self.padder = other.padder - self.ignore_type = other.ignore_type - + return self -def _is_iterable(content): +def _get_ele_type_and_dim(cell:Any, dim=0): + """ + 识别cell的类别与dimension的数量 + + numpy scalar type:https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.scalars.html + :param cell: + :param dim: + :return: + """ + if isinstance(cell, (str, Number, np.bool_)): + return type(cell), dim + elif isinstance(cell, list): + dim += 1 + res = [_get_ele_type_and_dim(cell_i, dim) for cell_i in cell] + types = set([i for i,j in res]) + dims = set([j for i,j in res]) + if len(types)>1: + raise SetInputOrTargetException("Mixed types detected: {}.".format(list(types))) + if len(dims)>1: + raise SetInputOrTargetException("Mixed dimension detected: {}.".format(list(dims))) + return types.pop(), dims.pop() + elif isinstance(cell, torch.Tensor): + return cell.dtype, cell.dim() + dim # 如果是torch.mean的结果是0 + elif isinstance(cell, np.ndarray): + if cell.dtype != np.dtype('O'): # 如果不是object的话说明是well-formatted的了 + return cell.dtype.type, cell.ndim + dim + # 否则需要继续往下iterate + dim += 1 + res = [_get_ele_type_and_dim(cell_i, dim) for cell_i in cell] + types = set([i for i,j in res]) + dims = set([j for i,j in res]) + if len(types)>1: + raise SetInputOrTargetException("Mixed types detected: {}.".format(list(types))) + if len(dims)>1: + raise 
SetInputOrTargetException("Mixed dimension detected: {}.".format(list(dims))) + return types.pop(), dims.pop() + else: # 包含tuple, set, dict以及其它的类型 + raise SetInputOrTargetException(f"Cannot process type:{type(cell)}.") + + +def _is_iterable(value): + # 检查是否是iterable的, duck typing try: - _ = (e for e in content) - except TypeError: + iter(value) + return True + except BaseException as e: return False - return True class Padder: @@ -327,32 +279,35 @@ class Padder: 所有padder都需要继承这个类,并覆盖__call__方法。 用于对batch进行padding操作。传入的element是inplace的,即直接修改element可能导致数据变化,建议inplace修改之前deepcopy一份。 - + .. py:function:: __call__(self, contents, field_name, field_ele_dtype): 传入的是List内容。假设有以下的DataSet。 - + :param list(Any) contents: 传入的element是inplace的,即直接修改element可能导致数据变化,建议inplace修改之前 deepcopy一份。 :param str, field_name: field的名称。 :param np.int64,np.float64,np.str,None, field_ele_dtype: 该field的内层元素的类型。如果该field的ignore_type为True,该这个值为None。 :return: np.array([padded_element]) - + """ - + def __init__(self, pad_val=0, **kwargs): self.pad_val = pad_val - + def set_pad_val(self, pad_val): self.pad_val = pad_val - - def __call__(self, contents, field_name, field_ele_dtype): + + @abstractmethod + def __call__(self, contents, field_name, field_ele_dtype, dim:int): """ 传入的是List内容。假设有以下的DataSet。 :param list(Any) contents: 传入的element是inplace的,即直接修改element可能导致数据变化,建议inplace修改之前 deepcopy一份。 :param str, field_name: field的名称。 - :param np.int64,np.float64,np.str,None, field_ele_dtype: 该field的内层元素的类型。如果该field的ignore_type为True,该这个值为None。 + :param np.int64,np.float64,np.str,None, field_ele_dtype: 该field的内层元素的类型。如果该field的ignore_type为True, + 该这个值为None。 + :param dim: 这个field的维度。当ignore_type为True时,该值为None :return: np.array([padded_element]) Example:: @@ -394,50 +349,87 @@ class AutoPadder(Padder): 根据contents的数据自动判定是否需要做padding。 1 如果元素类型(元素类型是指field中最里层元素的数据类型, 可以通过FieldArray.dtype查看,比如['This', 'is', ...]的元素类 - 型为np.str, [[1,2], ...]的元素类型为np.int64)的数据不为(np.int64, np.float64)则不会进行pad + 型为str, [[1,2], 
...]的元素类型为int)的数据不为数值类型则不会进行pad + + 2 如果元素类型为数值类型,比如np.int64, np.float64, int, float, torch.int64等 - 2 如果元素类型为(np.int64, np.float64), + 2.1 如果该field的内容为数值类型(包括int, float等),比如为seq_len, 则不进行padding - 2.1 如果该field的内容为(np.int64, np.float64),比如为seq_len, 则不进行padding + 2.2 如果该field的内容等价于一维list, 那么会将Batch中的List pad为一样长。 - 2.2 如果该field的内容为List, 那么会将Batch中的List pad为一样长。若该List下还有里层的List需要padding,请使用其它padder。 - 即如果Instance中field形如[1, 2, 3, ...],则可以pad;若为[[1,2], [3,4, ...]]则不能进行pad + 2.3 如果该field的内容等价于二维list,那么会按照英语character padding的方式进行padding。如果是character padding建议使用 + :class: fastNLP.EngChar2DPadder. + + 2.4 如果该field的内容等价于三维list,则如果每个instance在每个维度上相等,会组成一个batch的tensor返回,这种情况应该是为图片 + 的情况。 + + 3 其它情况不进行处理,返回一个np.array类型。 """ - def __init__(self, pad_val=0): - """ - :param pad_val: int, padding的位置使用该index - """ super().__init__(pad_val=pad_val) - - def _is_two_dimension(self, contents): - """ - 判断contents是不是只有两个维度。[[1,2], [3]]是两个维度. [[[1,2], [3, 4, 5]], [[4,5]]]有三个维度 - :param contents: - :return: - """ - value = contents[0] - if isinstance(value, (np.ndarray, list)): - value = value[0] - if isinstance(value, (np.ndarray, list)): - return False - return True - return False - - def __call__(self, contents, field_name, field_ele_dtype): - - if not _is_iterable(contents[0]): - array = np.array([content for content in contents], dtype=field_ele_dtype) - elif field_ele_dtype in (np.int64, np.float64) and self._is_two_dimension(contents): - max_len = max([len(content) for content in contents]) - array = np.full((len(contents), max_len), self.pad_val, dtype=field_ele_dtype) - for i, content in enumerate(contents): - array[i][:len(content)] = content - elif field_ele_dtype is None: - array = np.array(contents) # 当ignore_type=True时,直接返回contents - else: # should only be str - array = np.array([content for content in contents]) - return array + + def __call__(self, contents, field_name, field_ele_dtype, dim): + if field_ele_dtype: + if dim>3: + return np.array(contents) + if 
isinstance(field_ele_dtype, np.dtype) or field_ele_dtype in (float, int, bool, str): + if isinstance(field_ele_dtype, np.number) or field_ele_dtype in (float, int, bool): + if dim==0: + array = np.array(contents, dtype=field_ele_dtype) + elif dim==1: + max_len = max(map(len, contents)) + array = np.full((len(contents), max_len), self.pad_val, dtype=field_ele_dtype) + for i, content_i in enumerate(contents): + array[i, :len(content_i)] = content_i + elif dim==2: + max_len = max(map(len, contents)) + max_word_len = max([max([len(content_ii) for content_ii in content_i]) for + content_i in contents]) + array = np.full((len(contents), max_len, max_word_len), self.pad_val, dtype=field_ele_dtype) + for i, content_i in enumerate(contents): + for j, content_ii in enumerate(content_i): + array[i, j, :len(content_ii)] = content_ii + else: + shape = np.shape(contents) + if len(shape)==4: # 说明各dimension是相同的大小 + array = np.array(contents, dtype=field_ele_dtype) + else: + raise RuntimeError(f"Field:{field_name} has 3 dimensions, every sample should have the same shape.") + return array + return np.array(contents) + elif str(field_ele_dtype).startswith('torch'): + if dim==0: + tensor = torch.tensor(contents).to(field_ele_dtype) + elif dim==1: + max_len = max(map(len, contents)) + tensor = torch.full((len(contents), max_len), fill_value=self.pad_val, dtype=field_ele_dtype) + for i, content_i in enumerate(contents): + tensor[i, :len(content_i)] = torch.tensor(content_i) + elif dim==2: + max_len = max(map(len, contents)) + max_word_len = max([max([len(content_ii) for content_ii in content_i]) for + content_i in contents]) + tensor = torch.full((len(contents), max_len, max_word_len), fill_value=self.pad_val, + dtype=field_ele_dtype) + for i, content_i in enumerate(contents): + for j, content_ii in enumerate(content_i): + tensor[i, j, :len(content_ii)] = torch.tensor(content_ii) + else: + shapes = set([np.shape(content_i) for content_i in contents]) + if len(shapes)>1: + raise 
RuntimeError(f"Field:{field_name} has 3 dimensions, every sample should have the same shape.") + shape = shapes.pop() + if len(shape)==3: + tensor = torch.full([len(contents)]+list(shape), fill_value=self.pad_val, dtype=field_ele_dtype) + for i, content_i in enumerate(contents): + tensor[i] = torch.tensor(content_i, dtype=field_ele_dtype) + else: + raise RuntimeError(f"Field:{field_name} has 3 dimensions, every sample should have the same shape.") + return tensor + else: + return np.array(contents) # 不进行任何操作 + else: + return np.array(contents) class EngChar2DPadder(Padder): @@ -463,7 +455,7 @@ class EngChar2DPadder(Padder): dataset.set_padder('chars', padder) # chars这个field的设置为了EnChar2DPadder """ - + def __init__(self, pad_val=0, pad_length=0): """ :param pad_val: int, pad的位置使用该index @@ -471,32 +463,10 @@ class EngChar2DPadder(Padder): 都pad或截取到该长度. """ super().__init__(pad_val=pad_val) - + self.pad_length = pad_length - - def _exactly_three_dims(self, contents, field_name): - """ - 检查传入的contents是否刚好是3维,如果不是3维就报错。理论上,第一个维度是batch,第二个维度是word,第三个维度是character - :param contents: - :param field_name: str - :return: - """ - if not isinstance(contents, list): - raise TypeError("contents should be a list, not {}.".format(type(contents))) - value = contents[0] - try: - value = value[0] - except: - raise ValueError("Field:{} only has one dimension.".format(field_name)) - try: - value = value[0] - except: - raise ValueError("Field:{} only has two dimensions.".format(field_name)) - - if _is_iterable(value): - raise ValueError("Field:{} has more than 3 dimension.".format(field_name)) - - def __call__(self, contents, field_name, field_ele_dtype): + + def __call__(self, contents, field_name, field_ele_dtype, dim): """ 期望输入类似于 [ @@ -510,11 +480,11 @@ class EngChar2DPadder(Padder): :param field_ele_dtype :return: """ - if field_ele_dtype not in (np.int64, np.float64): + if field_ele_dtype not in (np.int64, np.float64, int, float): raise TypeError('dtype of Field:{} should be np.int64 
or np.float64 to do 2D padding, get {}.'.format( field_name, field_ele_dtype )) - self._exactly_three_dims(contents, field_name) + assert dim==2, f"Field:{field_name} has {dim}, EngChar2DPadder only supports input with 2 dimensions." if self.pad_length < 1: max_char_length = max([max(len(char_lst) for char_lst in word_lst) for word_lst in contents]) else: @@ -522,12 +492,12 @@ class EngChar2DPadder(Padder): max_sent_length = max(len(word_lst) for word_lst in contents) batch_size = len(contents) dtype = type(contents[0][0][0]) - + padded_array = np.full((batch_size, max_sent_length, max_char_length), fill_value=self.pad_val, dtype=dtype) for b_idx, word_lst in enumerate(contents): for c_idx, char_lst in enumerate(word_lst): chars = char_lst[:max_char_length] padded_array[b_idx, c_idx, :len(chars)] = chars - + return padded_array diff --git a/fastNLP/io/embed_loader.py b/fastNLP/io/embed_loader.py index bc37777e..4119d93f 100644 --- a/fastNLP/io/embed_loader.py +++ b/fastNLP/io/embed_loader.py @@ -107,9 +107,9 @@ class EmbedLoader(BaseLoader): :param bool normalize: 是否将每个vector归一化到norm为1 :param str error: `ignore` , `strict` ; 如果 `ignore` ,错误将自动跳过; 如果 `strict` , 错误将抛出。这里主要可能出错的地 方在于词表有空行或者词表出现了维度不一致。 - :return numpy.ndarray: shape为 [len(vocab), dimension], dimension由pretrain的embedding决定。 - :return numpy.ndarray: Vocabulary Embedding的shape是[词表大小+x, 词表维度], "词表大小+x"是由于最终的大小还取决与 + :return (numpy.ndarray, Vocabulary): Embedding的shape是[词表大小+x, 词表维度], "词表大小+x"是由于最终的大小还取决与 是否使用padding, 以及unknown有没有在词表中找到对应的词。 Vocabulary中的词的顺序与Embedding的顺序是一一对应的。 + """ vocab = Vocabulary(padding=padding, unknown=unknown) vec_dict = {} diff --git a/test/core/test_field.py b/test/core/test_field.py index 1f6580c1..e9053f37 100644 --- a/test/core/test_field.py +++ b/test/core/test_field.py @@ -1,8 +1,55 @@ import unittest import numpy as np +import torch from fastNLP import FieldArray +from fastNLP.core.field import _get_ele_type_and_dim +from fastNLP import AutoPadder + +class 
TestFieldArrayTyepDimDetect(unittest.TestCase): + """ + 检测FieldArray能否正确识别type与ndim + + """ + def test_case1(self): + # 1.1 常规类型测试 + for value in [1, True, 1.0, 'abc']: + type_ = type(value) + _type, _dim = _get_ele_type_and_dim(cell=value) + self.assertListEqual([_type, _dim], [type_, 0]) + # 1.2 mix类型报错 + with self.assertRaises(Exception): + value = [1, 2, 1.0] + self.assertRaises(_get_ele_type_and_dim(value)) + # 带有numpy的测试 + # 2.1 + value = np.array([1, 2, 3]) + type_ = value.dtype + dim_ = 1 + self.assertSequenceEqual(_get_ele_type_and_dim(cell=value), [type_, dim_]) + # 2.2 + value = np.array([[1, 2], [3, 4, 5]]) # char embedding的场景 + self.assertSequenceEqual([int, 2], _get_ele_type_and_dim(value)) + # 2.3 + value = np.zeros((3, 4)) + self.assertSequenceEqual([value.dtype, 2], _get_ele_type_and_dim(value)) + # 2.4 测试错误的dimension + with self.assertRaises(Exception): + value = np.array([[1, 2], [3, [1]]]) + _get_ele_type_and_dim(value) + # 2.5 测试混合类型 + with self.assertRaises(Exception): + value = np.array([[1, 2], [3.0]]) + _get_ele_type_and_dim(value) + + # 带有tensor的测试 + # 3.1 word embedding的场景 + value = torch.zeros(3, 10) + self.assertSequenceEqual([value.dtype, 2], _get_ele_type_and_dim(value)) + # 3.2 char embedding/image的场景 + value = torch.zeros(3, 32, 32) + self.assertSequenceEqual([value.dtype, 3], _get_ele_type_and_dim(value)) class TestFieldArrayInit(unittest.TestCase): @@ -31,12 +78,6 @@ class TestFieldArrayInit(unittest.TestCase): # 三维list fa = FieldArray("x", [[[1, 2], [3, 4]], [[1, 2], [3, 4]]], is_input=True) - def test_init_v7(self): - # list of array - fa = FieldArray("x", [np.array([[1, 2], [3, 4]]), np.array([[1, 2], [3, 4]])], is_input=True) - self.assertEqual(fa.pytype, int) - self.assertEqual(fa.dtype, np.int) - def test_init_v4(self): # 一维list val = [1, 2, 3, 4] @@ -56,6 +97,11 @@ class TestFieldArrayInit(unittest.TestCase): fa.append(val) def test_init_v7(self): + # list of array + fa = FieldArray("x", [np.array([[1, 2], [3, 4]]), 
np.array([[1, 2], [3, 4]])], is_input=True) + self.assertEqual(fa.dtype, np.array([1]).dtype) + + def test_init_v8(self): # 二维list val = np.array([[1, 2], [3, 4]]) fa = FieldArray("x", [val], is_input=True) @@ -79,33 +125,23 @@ class TestFieldArray(unittest.TestCase): self.assertListEqual(list(fa.get([0, 1, 2])), [1, 2, 3]) def test_type_conversion(self): - fa = FieldArray("x", [1.2, 2.2, 3, 4, 5], is_input=True) - self.assertEqual(fa.pytype, float) - self.assertEqual(fa.dtype, np.float64) - fa = FieldArray("x", [1, 2, 3, 4, 5], is_input=True) - fa.append(1.3333) - self.assertEqual(fa.pytype, float) - self.assertEqual(fa.dtype, np.float64) + self.assertEqual(fa.dtype, int) fa = FieldArray("y", [1.1, 2.2, 3.3, 4.4, 5.5], is_input=True) - fa.append(10) - self.assertEqual(fa.pytype, float) - self.assertEqual(fa.dtype, np.float64) + fa.append(10.0) + self.assertEqual(fa.dtype, float) fa = FieldArray("y", ["a", "b", "c", "d"], is_input=True) fa.append("e") - self.assertEqual(fa.dtype, np.str) - self.assertEqual(fa.pytype, str) + self.assertEqual(fa.dtype, str) def test_support_np_array(self): fa = FieldArray("y", np.array([[1.1, 2.2, 3.3, 4.4, 5.5]]), is_input=True) self.assertEqual(fa.dtype, np.float64) - self.assertEqual(fa.pytype, float) fa.append(np.array([1.1, 2.2, 3.3, 4.4, 5.5])) self.assertEqual(fa.dtype, np.float64) - self.assertEqual(fa.pytype, float) fa = FieldArray("my_field", np.random.rand(3, 5), is_input=True) # in this case, pytype is actually a float. We do not care about it. 
@@ -113,11 +149,10 @@ class TestFieldArray(unittest.TestCase): def test_nested_list(self): fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1.1, 2.2, 3.3, 4.4, 5.5]], is_input=True) - self.assertEqual(fa.pytype, float) - self.assertEqual(fa.dtype, np.float64) + self.assertEqual(fa.dtype, float) def test_getitem_v1(self): - fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True) + fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1.0, 2.0, 3.0, 4.0, 5.0]], is_input=True) self.assertEqual(fa[0], [1.1, 2.2, 3.3, 4.4, 5.5]) ans = fa[[0, 1]] self.assertTrue(isinstance(ans, np.ndarray)) @@ -150,7 +185,7 @@ class TestFieldArray(unittest.TestCase): fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True) fa.append(["str", 0, 0, 0, 1.89]) - fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True) + fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1.0, 2.0, 3.0, 4.0, 5.0]], is_input=True) fa.append([1.2, 2.3, 3.4, 4.5, 5.6]) self.assertEqual(len(fa), 3) self.assertEqual(fa[2], [1.2, 2.3, 3.4, 4.5, 5.6]) @@ -163,33 +198,86 @@ class TestFieldArray(unittest.TestCase): fa = FieldArray("y", [(1, "1"), (2, "2"), (3, "3"), (4, "4")], is_target=True, ignore_type=True) -class TestPadder(unittest.TestCase): +class TestAutoPadder(unittest.TestCase): + def test00(self): + padder = AutoPadder() + # 没有类型时 + contents = [(1, 2), ('str', 'a')] + padder(contents, None, None, None) def test01(self): - """ - 测试AutoPadder能否正常工作 - :return: - """ - from fastNLP import AutoPadder + # 测试使用多维的bool, int, str, float的情况 + # str padder = AutoPadder() content = ['This is a str', 'this is another str'] - self.assertListEqual(content, padder(content, None, np.str).tolist()) + self.assertListEqual(content, padder(content, None, str, 0).tolist()) - content = [1, 2] - self.assertListEqual(content, padder(content, None, np.int64).tolist()) - - content = [[1,2], [3], [4]] - self.assertListEqual([[1,2], [3, 0], [4, 0]], - 
padder(content, None, np.int64).tolist()) + # 1维int + content = [[1, 2, 3], [4,], [5, 6, 7, 8]] + padded_content = [[1, 2, 3, 0], [4, 0, 0, 0], [5, 6, 7, 8]] + self.assertListEqual(padder(content, None, int, 1).tolist(), padded_content) + # 二维int + padded_content = [[[1, 2, 3, 0], [4, 5, 0, 0], [7, 8, 9, 10]], [[1, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]] content = [ - [[1, 2, 3], [4, 5], [7,8,9,10]], - [[1]] - ] - self.assertListEqual(content, - padder(content, None, np.int64).tolist()) + [[1, 2, 3], [4, 5], [7, 8, 9, 10]], + [[1]] + ] + self.assertListEqual(padder(content, None, int, 2).tolist(), padded_content) + + # 3维图片 + contents = [np.random.rand(3, 4, 4).tolist() for _ in range(5)] + self.assertTrue(padder(contents, None, float, 3).shape==(5, 3, 4, 4)) + + # 更高维度直接返回 + contents = [np.random.rand(24, 3, 4, 4).tolist() for _ in range(5)] + self.assertTrue(isinstance(padder(contents, None, float, 4), np.ndarray)) def test02(self): + padder = AutoPadder() + # 测试numpy的情况 + # 0维 + contents = np.arange(12) + self.assertListEqual(padder(contents, None, contents.dtype, 0).tolist(), contents.tolist()) + + # 1维 + contents = np.arange(12).reshape((3, 4)) + self.assertListEqual(padder(contents, None, contents.dtype, 1).tolist(), contents.tolist()) + + # 2维 + contents = np.ones((3, 10, 5)) + self.assertListEqual(padder(contents, None, contents.dtype, 2).tolist(), contents.tolist()) + + # 3维 + contents = [np.random.rand(3, 4, 4) for _ in range(5)] + l_contents = [content.tolist() for content in contents] + self.assertListEqual(padder(contents, None, contents[0].dtype, 3).tolist(), l_contents) + + def test03(self): + padder = AutoPadder() + # 测试tensor的情况 + # 0维 + contents = torch.arange(12) + r_contents = padder(contents, None, contents.dtype, 0) + self.assertSequenceEqual(r_contents.tolist(), contents.tolist()) + self.assertTrue(r_contents.dtype==contents.dtype) + + # 0维 + contents = [torch.tensor(1) for _ in range(10)] + self.assertSequenceEqual(padder(contents, None, 
torch.int64, 0).tolist(), contents) + + # 1维 + contents = torch.randn(3, 4) + padder(contents, None, torch.float64, 1) + + # 3维 + contents = [torch.randn(3, 4, 4) for _ in range(5)] + padder(contents, None, torch.float64, 3) + + + +class TestEngChar2DPadder(unittest.TestCase): + def test01(self): """ 测试EngChar2DPadder能不能正确使用 :return: @@ -198,38 +286,31 @@ class TestPadder(unittest.TestCase): padder = EngChar2DPadder(pad_length=0) contents = [1, 2] - # 不能是1维 - with self.assertRaises(ValueError): - padder(contents, None, np.int64) + # 不能是0维 + with self.assertRaises(Exception): + padder(contents, None, np.int64, 0) contents = [[1, 2]] - # 不能是2维 - with self.assertRaises(ValueError): - padder(contents, None, np.int64) - contents = [[[[1, 2]]]] + # 不能是1维 + with self.assertRaises(Exception): + padder(contents, None, np.int64, 1) + contents = [ + [[[[1, 2]]]] + ] # 不能是3维以上 - with self.assertRaises(ValueError): - padder(contents, None, np.int64) + with self.assertRaises(Exception): + padder(contents, None, np.int64, 3) contents = [ [[1, 2, 3], [4, 5], [7,8,9,10]], [[1]] ] self.assertListEqual([[[1, 2, 3, 0], [4, 5, 0, 0], [7, 8, 9, 10]], [[1, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]], - padder(contents, None, np.int64).tolist()) + padder(contents, None, np.int64, 2).tolist()) padder = EngChar2DPadder(pad_length=5, pad_val=-100) self.assertListEqual( [[[1, 2, 3, -100, -100], [4, 5, -100, -100, -100], [7, 8, 9, 10, -100]], [[1, -100, -100, -100, -100], [-100, -100, -100, -100, -100], [-100, -100, -100, -100, -100]]], - padder(contents, None, np.int64).tolist() + padder(contents, None, np.int64, 2).tolist() ) - def test_None_dtype(self): - from fastNLP import AutoPadder - padder = AutoPadder() - content = [ - [[1, 2, 3], [4, 5], [7, 8, 9, 10]], - [[1]] - ] - ans = padder(content, None, None).tolist() - self.assertListEqual(content, ans) From bddce51b05ed9fce8f3e8827b187d78d7e8d32c4 Mon Sep 17 00:00:00 2001 From: yh_cc Date: Sat, 8 Jun 2019 09:47:39 +0800 Subject: [PATCH 03/17] 
merge update --- fastNLP/core/metrics.py | 8 +++++--- reproduction/seqence_labelling/cws/model/module.py | 3 +-- reproduction/seqence_labelling/cws/train_shift_relay.py | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index f994bd31..77695852 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -22,7 +22,7 @@ from .utils import _check_arg_dict_list from .utils import _get_func_signature from .utils import seq_len_to_mask from .vocabulary import Vocabulary - +from abc import abstractmethod class MetricBase(object): """ @@ -117,10 +117,12 @@ class MetricBase(object): def __init__(self): self.param_map = {} # key is param in function, value is input param. self._checked = False - + + @abstractmethod def evaluate(self, *args, **kwargs): raise NotImplementedError - + + @abstractmethod def get_metric(self, reset=True): raise NotImplemented diff --git a/reproduction/seqence_labelling/cws/model/module.py b/reproduction/seqence_labelling/cws/model/module.py index 6cd8b5e3..86149f39 100644 --- a/reproduction/seqence_labelling/cws/model/module.py +++ b/reproduction/seqence_labelling/cws/model/module.py @@ -1,11 +1,10 @@ from torch import nn import torch -from fastNLP.modules import Embedding import numpy as np class SemiCRFShiftRelay(nn.Module): """ - 该模块是一个decoder,但 + 该模块是一个decoder,但当前不支持含有tag的decode。 """ def __init__(self, L): diff --git a/reproduction/seqence_labelling/cws/train_shift_relay.py b/reproduction/seqence_labelling/cws/train_shift_relay.py index ed512252..c5d436fe 100644 --- a/reproduction/seqence_labelling/cws/train_shift_relay.py +++ b/reproduction/seqence_labelling/cws/train_shift_relay.py @@ -32,7 +32,7 @@ lr = 0.02 #########hyper device = 0 -# !!!!这里前往不要放完全路径,因为这样会暴露你们在服务器上的用户名,比较危险。所以一定要使用相对路径,最好把数据放到 +# !!!!这里千万不要放完全路径,因为这样会暴露你们在服务器上的用户名,比较危险。所以一定要使用相对路径,最好把数据放到 # 你们的reproduction路径下,然后设置.gitignore file_dir = '/path/to/pku' char_embed_path = 
'/path/to/1grams_t3_m50_corpus.txt' From 2edb2a1a007176025a76cfd68c8fe80a726d4f0b Mon Sep 17 00:00:00 2001 From: Violet Yao Date: Sat, 8 Jun 2019 14:27:52 +0800 Subject: [PATCH 04/17] added yelpLoader --- fastNLP/io/dataset_loader.py | 3 +- .../text_classification/data/yelpLoader.py | 68 +++++++++++++++++++ 2 files changed, 70 insertions(+), 1 deletion(-) create mode 100644 reproduction/text_classification/data/yelpLoader.py diff --git a/fastNLP/io/dataset_loader.py b/fastNLP/io/dataset_loader.py index e366c6ea..3b5e897c 100644 --- a/fastNLP/io/dataset_loader.py +++ b/fastNLP/io/dataset_loader.py @@ -29,6 +29,7 @@ from .file_reader import _read_csv, _read_json, _read_conll from .base_loader import DataSetLoader from .data_loader.sst import SSTLoader from ..core.const import Const +import ast class PeopleDailyCorpusLoader(DataSetLoader): @@ -239,7 +240,7 @@ class JsonLoader(DataSetLoader): if self.fields: ins = {self.fields[k]: v for k, v in d.items()} else: - ins = d + ins = ast.literal_eval(d) ds.append(Instance(**ins)) return ds diff --git a/reproduction/text_classification/data/yelpLoader.py b/reproduction/text_classification/data/yelpLoader.py new file mode 100644 index 00000000..ed5db021 --- /dev/null +++ b/reproduction/text_classification/data/yelpLoader.py @@ -0,0 +1,68 @@ +import ast +from fastNLP import DataSet, Instance, Vocabulary +from fastNLP.core.vocabulary import VocabularyOption +from fastNLP.io import JsonLoader +from fastNLP.io.base_loader import DataInfo +from fastNLP.io.embed_loader import EmbeddingOption +from fastNLP.io.file_reader import _read_json +from typing import Union, Dict +from reproduction.Star_transformer.datasets import EmbedLoader +from reproduction.utils import check_dataloader_paths + + +class yelpLoader(JsonLoader): + + """ + 读取Yelp数据集, DataSet包含fields: + + review_id: str, 22 character unique review id + user_id: str, 22 character unique user id + business_id: str, 22 character business id + useful: int, number of useful votes 
received + funny: int, number of funny votes received + cool: int, number of cool votes received + date: str, date formatted YYYY-MM-DD + words: list(str), 需要分类的文本 + target: str, 文本的标签 + + 数据来源: https://www.yelp.com/dataset/download + + :param fine_grained: 是否使用SST-5标准,若 ``False`` , 使用SST-2。Default: ``False`` + """ + + def __init__(self, fine_grained=False): + super(yelpLoader, self).__init__() + tag_v = {'1.0': 'very negative', '2.0': 'negative', '3.0': 'neutral', + '4.0': 'positive', '5.0': 'very positive'} + if not fine_grained: + tag_v['1.0'] = tag_v['2.0'] + tag_v['5.0'] = tag_v['4.0'] + self.fine_grained = fine_grained + self.tag_v = tag_v + + def _load(self, path): + ds = DataSet() + for idx, d in _read_json(path, fields=self.fields_list, dropna=self.dropna): + d = ast.literal_eval(d) + d["words"] = d.pop("text").split() + d["target"] = self.tag_v[str(d.pop("stars"))] + ds.append(Instance(**d)) + return ds + + def process(self, paths: Union[str, Dict[str, str]], vocab_opt: VocabularyOption = None, + embed_opt: EmbeddingOption = None): + paths = check_dataloader_paths(paths) + datasets = {} + info = DataInfo() + vocab = Vocabulary(min_freq=2) if vocab_opt is None else Vocabulary(**vocab_opt) + for name, path in paths.items(): + dataset = self.load(path) + datasets[name] = dataset + vocab.from_dataset(dataset, field_name="words") + info.vocabs = vocab + info.datasets = datasets + if embed_opt is not None: + embed = EmbedLoader.load_with_vocab(**embed_opt, vocab=vocab) + info.embeddings['words'] = embed + return info + From ad6a55ba2679dd631bd7acfbfb7a2b5900b02954 Mon Sep 17 00:00:00 2001 From: Violet Yao Date: Sat, 8 Jun 2019 14:32:25 +0800 Subject: [PATCH 05/17] fixed comment format --- .../text_classification/data/yelpLoader.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/reproduction/text_classification/data/yelpLoader.py b/reproduction/text_classification/data/yelpLoader.py index ed5db021..c47d48fd 100644 --- 
a/reproduction/text_classification/data/yelpLoader.py +++ b/reproduction/text_classification/data/yelpLoader.py @@ -13,8 +13,8 @@ from reproduction.utils import check_dataloader_paths class yelpLoader(JsonLoader): """ - 读取Yelp数据集, DataSet包含fields: - + 读取Yelp数据集, DataSet包含fields: + review_id: str, 22 character unique review id user_id: str, 22 character unique user id business_id: str, 22 character business id @@ -24,11 +24,11 @@ class yelpLoader(JsonLoader): date: str, date formatted YYYY-MM-DD words: list(str), 需要分类的文本 target: str, 文本的标签 - - 数据来源: https://www.yelp.com/dataset/download - - :param fine_grained: 是否使用SST-5标准,若 ``False`` , 使用SST-2。Default: ``False`` - """ + + 数据来源: https://www.yelp.com/dataset/download + + :param fine_grained: 是否使用SST-5标准,若 ``False`` , 使用SST-2。Default: ``False`` + """ def __init__(self, fine_grained=False): super(yelpLoader, self).__init__() From d17acd0cb1a5ec591cc475aad83ca1f8147d77b1 Mon Sep 17 00:00:00 2001 From: Violet Yao Date: Sun, 9 Jun 2019 16:45:08 +0800 Subject: [PATCH 06/17] remove ast --- fastNLP/io/dataset_loader.py | 2 +- test/data_for_tests/yelp_sample.json | 20 ++++++++++++++++++++ test/io/test_dataset_loader.py | 7 +++++++ 3 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 test/data_for_tests/yelp_sample.json diff --git a/fastNLP/io/dataset_loader.py b/fastNLP/io/dataset_loader.py index 3b5e897c..7d78669e 100644 --- a/fastNLP/io/dataset_loader.py +++ b/fastNLP/io/dataset_loader.py @@ -240,7 +240,7 @@ class JsonLoader(DataSetLoader): if self.fields: ins = {self.fields[k]: v for k, v in d.items()} else: - ins = ast.literal_eval(d) + ins = d ds.append(Instance(**ins)) return ds diff --git a/test/data_for_tests/yelp_sample.json b/test/data_for_tests/yelp_sample.json new file mode 100644 index 00000000..053dc4bc --- /dev/null +++ b/test/data_for_tests/yelp_sample.json @@ -0,0 +1,20 @@ 
+"{\"review_id\":\"Q1sbwvVQXV2734tPgoKj4Q\",\"user_id\":\"hG7b0MtEbXx5QzbzE6C_VA\",\"business_id\":\"ujmEBvifdJM6h6RLv4wQIg\",\"stars\":1.0,\"useful\":6,\"funny\":1,\"cool\":0,\"text\":\"Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! Avoid Hospital ERs at all costs.\",\"date\":\"2013-05-07 04:34:36\"}\n" +"{\"review_id\":\"GJXCdrto3ASJOqKeVWPi6Q\",\"user_id\":\"yXQM5uF2jS6es16SJzNHfg\",\"business_id\":\"NZnhc2sEQy3RmzKTZnqtwQ\",\"stars\":5.0,\"useful\":0,\"funny\":0,\"cool\":0,\"text\":\"I *adore* Travis at the Hard Rock's new Kelly Cardenas Salon! I'm always a fan of a great blowout and no stranger to the chains that offer this service; however, Travis has taken the flawless blowout to a whole new level! \\n\\nTravis's greets you with his perfectly green swoosh in his otherwise perfectly styled black hair and a Vegas-worthy rockstar outfit. Next comes the most relaxing and incredible shampoo -- where you get a full head message that could cure even the very worst migraine in minutes --- and the scented shampoo room. Travis has freakishly strong fingers (in a good way) and use the perfect amount of pressure. That was superb! Then starts the glorious blowout... where not one, not two, but THREE people were involved in doing the best round-brush action my hair has ever seen. The team of stylists clearly gets along extremely well, as it's evident from the way they talk to and help one another that it's really genuine and not some corporate requirement. It was so much fun to be there! \\n\\nNext Travis started with the flat iron. The way he flipped his wrist to get volume all around without over-doing it and making me look like a Texas pagent girl was admirable. It's also worth noting that he didn't fry my hair -- something that I've had happen before with less skilled stylists. 
At the end of the blowout & style my hair was perfectly bouncey and looked terrific. The only thing better? That this awesome blowout lasted for days! \\n\\nTravis, I will see you every single time I'm out in Vegas. You make me feel beauuuutiful!\",\"date\":\"2017-01-14 21:30:33\"}\n" +"{\"review_id\":\"2TzJjDVDEuAW6MR5Vuc1ug\",\"user_id\":\"n6-Gk65cPZL6Uz8qRm3NYw\",\"business_id\":\"WTqjgwHlXbSFevF32_DJVw\",\"stars\":5.0,\"useful\":3,\"funny\":0,\"cool\":0,\"text\":\"I have to say that this office really has it together, they are so organized and friendly! Dr. J. Phillipp is a great dentist, very friendly and professional. The dental assistants that helped in my procedure were amazing, Jewel and Bailey helped me to feel comfortable! I don't have dental insurance, but they have this insurance through their office you can purchase for $80 something a year and this gave me 25% off all of my dental work, plus they helped me get signed up for care credit which I knew nothing about before this visit! I highly recommend this office for the nice synergy the whole office has!\",\"date\":\"2016-11-09 20:09:03\"}\n" +"{\"review_id\":\"yi0R0Ugj_xUx_Nek0-_Qig\",\"user_id\":\"dacAIZ6fTM6mqwW5uxkskg\",\"business_id\":\"ikCg8xy5JIg_NGPx-MSIDA\",\"stars\":5.0,\"useful\":0,\"funny\":0,\"cool\":0,\"text\":\"Went in for a lunch. Steak sandwich was delicious, and the Caesar salad had an absolutely delicious dressing, with a perfect amount of dressing, and distributed perfectly across each leaf. I know I'm going on about the salad ... But it was perfect.\\n\\nDrink prices were pretty good.\\n\\nThe Server, Dawn, was friendly and accommodating. Very happy with her.\\n\\nIn summation, a great pub experience. 
Would go again!\",\"date\":\"2018-01-09 20:56:38\"}\n" +"{\"review_id\":\"11a8sVPMUFtaC7_ABRkmtw\",\"user_id\":\"ssoyf2_x0EQMed6fgHeMyQ\",\"business_id\":\"b1b1eb3uo-w561D0ZfCEiQ\",\"stars\":1.0,\"useful\":7,\"funny\":0,\"cool\":0,\"text\":\"Today was my second out of three sessions I had paid for. Although my first session went well, I could tell Meredith had a particular enjoyment for her male clients over her female. However, I returned because she did my teeth fine and I was pleased with the results. When I went in today, I was in the whitening room with three other gentlemen. My appointment started out well, although, being a person who is in the service industry, I always attend to my female clientele first when a couple arrives. Unbothered by those signs, I waited my turn. She checked on me once after my original 30 minute timer to ask if I was ok. She attended my boyfriend on numerous occasions, as well as the other men, and would exit the room without even asking me or looking to see if I had any irritation. Half way through, another woman had showed up who she was explaining the deals to in the lobby. While she admits timers must be reset half way through the process, she reset my boyfriends, left, rest the gentleman furthest away from me who had time to come in, redeem his deal, get set, and gave his timer done, before me, then left, and at this point my time was at 10 minutes. So, she should have reset it 5 minutes ago, according to her. While I sat there patiently this whole time with major pain in my gums, i watched the time until the lamp shut off. Not only had she reset two others, explained deals to other guest, but she never once checked on my time. When my light turned off, I released the stance of my mouth to a more relaxed state, assuming I was only getting a thirty minute session instead of the usual 45, because she had yet to come in. 
At this point, the teeth formula was not only burning the gum she neglected for 25 minutes now, but it began to burn my lips. I began squealing and slapping my chair trying to get her attention from the other room in a panic. I was in so much pain, that by the time she entered the room I was already out of my chair. She finally then acknowledged me, and asked if she could put vitamin E on my gum burn (pictured below). At this point, she has treated two other gums burns, while neglecting me, and I was so irritated that I had to suffer, all I wanted was to leave. While I waited for my boyfriend, she kept harassing me about the issue. Saying, \\\"well burns come with teeth whitening.\\\" While I totally agree, and under justifiable circumstances would not be as irritate, it could have easily been avoid if she had checked on me even a second time, so I could let her know. Not only did she never check on my physical health, she couldn't even take two seconds to reset the timer, which she even admitted to me. Her accuse was that she was coming in to do it, but I had the light off for a solid two minutes before I couldn't stand the pain. She admitted it should be reset every 15 minutes, which means for 25 minutes she did not bother to help me at all. Her guest in the lobby then proceeded to attack me as well, simply because I wanted to leave after the way I was treated. I also expected a refund for not getting a complete session today, due to the neglect, and the fact I won't be returning for my last, she had failed to do that. She was even screaming from the door, and continued to until my boyfriend and I were down the steps. 
I have never in my life been more appalled by a grown woman's behavior, who claims to be in the business for \\\"10 years.\\\" Admit your wrongs, but don't make your guest feel unwelcome because you can't do you job properly.\",\"date\":\"2018-01-30 23:07:38\"}\n" +"{\"review_id\":\"fdiNeiN_hoCxCMy2wTRW9g\",\"user_id\":\"w31MKYsNFMrjhWxxAb5wIw\",\"business_id\":\"eU_713ec6fTGNO4BegRaww\",\"stars\":4.0,\"useful\":0,\"funny\":0,\"cool\":0,\"text\":\"I'll be the first to admit that I was not excited about going to La Tavolta. Being a food snob, when a group of friends suggested we go for dinner I looked online at the menu and to me there was nothing special and it seemed overpriced. Im also not big on ordering pasta when I go out. Alas, I was outnumbered. Thank goodness! I ordered the sea bass special. It was to die for. Cooked perfectly, seasoned perfectly, perfect portion. I can not say enough good things about this dish. When the server asked how it was he seemed very proud of the dish and said, \\\" doesn't she (the chef) do an incredible job?\\\" She does. \\n\\nMy hubby got the crab tortellini and also loved his. I heard \\\"mmmm this is so good\\\" from all around the table. Our waiter was super nice and even gave us free desserts because we were some of the last people in the restaurant. Service was very slow and the place was PACKED but we had our jugs of wine and a large group with good conversation so it didn't seem to bother anyone.\\n\\nSo-\\n\\nDo order the calamari and fried zucchini appetizers. Leave out the mussels. \\n\\nIf they have the sea bass special, I highly recommend it. The chicken parm and crab tortellini were also very good and very big. The chicken Romano was a bit bland. The house salads were teeny. \\n\\nDo make a reservation but still expect to wait for your food. Go with a large group of people and plan for it to be loud. Don't go with a date unless you're fighting and don't feel like hearing anything they have to say. 
Ask to sit in the side room if it's available.\",\"date\":\"2013-01-20 13:25:59\"}\n" +"{\"review_id\":\"G7XHMxG0bx9oBJNECG4IFg\",\"user_id\":\"jlu4CztcSxrKx56ba1a5AQ\",\"business_id\":\"3fw2X5bZYeW9xCz_zGhOHg\",\"stars\":3.0,\"useful\":5,\"funny\":4,\"cool\":5,\"text\":\"Tracy dessert had a big name in Hong Kong and the one in First Markham place has been here for many years now! \\n\\nCame in for some Chinese dessert, and I must say their selection has increased tremendously over the years. I might as well add that the price has also increased tremendously as well. The waitress gave us tea, which I could taste had red date in it. Fancy!\\n\\nA simple taro with coconut with tapioca pearls was like $5.25 or something. Basically all the desserts were more than $5. That's crazy! I can literally just make this dessert at home and for a bowl, it would probably cost like $0.50. A few years ago, I think I can still get it for like $3-$4, which is more reasonable, but wow, more than $5 is a little over the top for this dessert. Though I must say, it is Tracy Dessert, and they are a little more on the expensive side. \\n\\nI also saw other items on the menu like fish balls, chicken wings, shaved ice. My friend got a mango drink with fresh mango in it! \\n\\nI'm also surprised how many people come to Tracy Dessert after work. We came on a Sunday and the tables were always filled. I think the amount of tables they had were just perfect because no one really waited for seats for a long time, but the tables kept filling up once a table was finished.\",\"date\":\"2016-05-07 01:21:02\"}\n" +"{\"review_id\":\"8e9HxxLjjqc9ez5ezzN7iQ\",\"user_id\":\"d6xvYpyzcfbF_AZ8vMB7QA\",\"business_id\":\"zvO-PJCpNk4fgAVUnExYAA\",\"stars\":1.0,\"useful\":3,\"funny\":1,\"cool\":1,\"text\":\"This place has gone down hill. Clearly they have cut back on staff and food quality\\n\\nMany of the reviews were written before the menu changed. 
I've been going for years and the food quality has gone down hill.\\n\\nThe service is slow & my salad, which was $15, was as bad as it gets.\\n\\nIt's just not worth spending the money on this place when there are so many other options.\",\"date\":\"2010-10-05 19:12:35\"}\n" +"{\"review_id\":\"qrffudO73zsslZbe8B9D3Q\",\"user_id\":\"sG_h0dIzTKWa3Q6fmb4u-g\",\"business_id\":\"b2jN2mm9Wf3RcrZCgfo1cg\",\"stars\":2.0,\"useful\":1,\"funny\":0,\"cool\":0,\"text\":\"I was really looking forward to visiting after having some of their beers. The \\\"Man O'War\\\" quickly became my favorite DIPA; the Rusulka Vanilla Stout is a good thick, sweet stout; and the Ironclad is a top notch IPA. \\nThe only big miss on their beers I've had is the Big Chuck Barleywine. It could probably benefit greatly with age, but at this age all there is to taste is the alcohol. \\nNonetheless, I had enough to convince me that the other beers I hadn't had from them would be top notch... and they are! \\nThe reason for the 2 stars should not reflect the quality of the brewers, they obviously know their craft well! \\nThe servers are great and friendly.... but relying on two servers to wait on 100+ customers says a lot about how inexperienced management must be. In fact, after waiting 15 mins at a dirty table I was finally able to track down someone I guessed was an employee to let them know we were even there! \\nAfter another 5+ mins, the GM finally stopped over to take our drink order. The smugness of this guy was amazing. The thought of offering a simple apology never seemed to enter into his head. \\nThis is the time a server finally stopped by to pick up the non-final check left by the party before us... who didn't seem very pleased when leaving. 
\\nThe toast & cheese was good, but by the time we were able to dig into their heartiest offering of food, saltines and butter may have been equally pleasing.\",\"date\":\"2015-01-18 14:04:18\"}\n" +"{\"review_id\":\"RS_GTIT6836bCaPy637kNQ\",\"user_id\":\"nMeCE5-xsdleyxYuNZ_7rA\",\"business_id\":\"oxwGyA17NL6c5t1Etg5WgQ\",\"stars\":3.0,\"useful\":1,\"funny\":0,\"cool\":1,\"text\":\"It's a giant Best Buy with 66 registers. I don't get it. What's the big deal about this place??\",\"date\":\"2012-02-29 21:52:43\"}\n" +"{\"review_id\":\"kbtscdyz6lvrtGjD1quQTg\",\"user_id\":\"FIk4lQQu1eTe2EpzQ4xhBA\",\"business_id\":\"8mIrX_LrOnAqWsB5JrOojQ\",\"stars\":4.0,\"useful\":0,\"funny\":0,\"cool\":0,\"text\":\"Like walking back in time, every Saturday morning my sister and I was in a bowling league and after we were done, we'd spend a few quarters playing the pin ball machines until our mother came to pick us up.\\n\\nMy sister was daring and play the machines hard, she was afraid of that \\\"tilt\\\" showing up and freezing the game. I, on the other hand was a bit more gentler and wanted to make sure I got my quarter's worth.\\n\\nThis place has rows and rows of machines, some are really old and some are more of a mid 80's theme. There is even a Ms pac man! It was fun to spend an afternoon playing the machines and remembering all the fun of my early teen years.\",\"date\":\"2011-11-30 02:11:15\"}\n" +"{\"review_id\":\"-I5umRTkhw15RqpKMl_o1Q\",\"user_id\":\"-mA3-1mN4JIEkqOtdbNXCQ\",\"business_id\":\"mRUVMJkUGxrByzMQ2MuOpA\",\"stars\":1.0,\"useful\":0,\"funny\":1,\"cool\":0,\"text\":\"Walked in around 4 on a Friday afternoon, we sat at a table just off the bar and walked out after 5 min or so. Don't even think they realized we walked in. However everyone at the bar noticed we walked in!!! Service was non existent at best. Not a good way for a new business to start out. 
Oh well, the location they are at has been about 5 different things over the past several years, so they will just be added to the list. SMDH!!!\",\"date\":\"2017-12-15 23:27:08\"}\n" +"{\"review_id\":\"Z7wgXp98wYB57QdRY3HQ3w\",\"user_id\":\"GYNnVehQeXjty0xH7-6Fhw\",\"business_id\":\"FxLfqxdYPA6Z85PFKaqLrg\",\"stars\":4.0,\"useful\":0,\"funny\":0,\"cool\":0,\"text\":\"Wow. So surprised at the one and two star reviews! We started with the most tender calamari. Although the marinara sauce was a bit bland, but a touch of salt made it just right. My husband had the veal with peppers and said it was so delicious and tender. The mashed potatoes were perfect. I had the salmon Diablo which was also delicious. Our salad was beautiful! Dressing was served on the salad and it was a nice amount. We ended our delicious meal with a piece of tiramisu. Our server Matt was right on!! Very pleasant and knowledgeable about the menu. Our appetizer, salad and entrees were timed perfectly. I love salad and did not mind that my entree was served while I was still eating it! No problem it let my dinner cool to just the right temp for me to eat it comfortably. \\nI wonder sometimes if people just don't appreciate relaxing and taking time to eat a wonderful and beautifully prepared meal. A wonderful atmosphere. So relaxing. The chairs are super comfortable too!!! We will certainly be back. \\nGive it a try. Don't always go by the reviews. \\nA bottle of Riesling, calamari app, two delicious entrees and dessert for $92! \\nWell with it.\",\"date\":\"2016-05-07 01:36:53\"}\n" +"{\"review_id\":\"qlXw1JQ0UodW7qrmVgwCXw\",\"user_id\":\"bAhqAPoWaZYcyYi7bs024Q\",\"business_id\":\"LUN6swQYa4xJKaM_UEUOEw\",\"stars\":4.0,\"useful\":0,\"funny\":0,\"cool\":0,\"text\":\"Michael from Red Carpet VIP is amazing ! I reached out because I needed help planning my soon to be sister in law's bachelorette. It was a group of 10 girls so I was a little overwhelmed but Michael saved the day! 
Everything was super smooth and easy! We got good deals and had the best time ever! We booked hotel and a bachelorette package for a great price. I have saved contact info because I will for sure reach out again on next Vegas trip!!!\",\"date\":\"2018-04-27 20:25:26\"}\n" +"{\"review_id\":\"JVcjMhlavKKn3UIt9p9OXA\",\"user_id\":\"TpyOT5E16YASd7EWjLQlrw\",\"business_id\":\"AakkkTuGZA2KBodKi2_u8A\",\"stars\":1.0,\"useful\":1,\"funny\":1,\"cool\":0,\"text\":\"I cannot believe how things have changed in 3 years. I picked up duck congee sometime in the winter when my hubby was sick. I was very disappointed because the ginger fish sauce tasted like it had gone bad (it should never be bitter). Today, my hubby wanted to eat there since he was craving the duck congee and most places don't serve the duck & coleslaw side. We waited about 10 minutes to get our menu. After we placed our orders, we waited another 5 minutes to get the tea that most places bring with the menu. I could go on with the details but the gist of the story is they were understaffed or the staff was slow. The worst part of it was that the service. The servers make us feel bad for asking for anything (like when they took our order). We had arrived and placed our order before another couple bside us at least 10 minutes ahead but somehow, this couple received their pho before mine. 
They were almost done eating their pho before mine came out.\",\"date\":\"2012-07-16 00:37:14\"}\n" +"{\"review_id\":\"svK3nBU7Rk8VfGorlrN52A\",\"user_id\":\"NJlxGtouq06hhC7sS2ECYw\",\"business_id\":\"YvrylyuWgbP90RgMqZQVnQ\",\"stars\":5.0,\"useful\":0,\"funny\":0,\"cool\":0,\"text\":\"You can't really find anything wrong with this place, the pastas and pizzas are both amazing and high quality, the price is very reasonable, the owner and the staff are very friendly, if you're in downtown check this place out, a lot of people think just because it's downtown there are lots of options around but that's not always the case as there is also a lot of poor quality food in downtown as well.\",\"date\":\"2017-04-07 21:27:49\"}\n" +"{\"review_id\":\"1wVA2-vQIuW_ClmXkDxqMQ\",\"user_id\":\"86J5DwcFk4f4In1Vxe2TvA\",\"business_id\":\"NyLYY8q1-H3hfsTwuwLPCg\",\"stars\":4.0,\"useful\":0,\"funny\":0,\"cool\":0,\"text\":\"Great lunch today. Staff was very helpful in assisting with selections and knowledgeable on the ingredients. We enjoyed the BBQ chicken with tika masala sauce and really good naan bread. The biryani with chicken was also yummy! Fun to see the food being prepared in the tandoori ovens. Great addition to the fast casual scene in Cleveland.\",\"date\":\"2015-01-03 22:47:34\"}\n" +"{\"review_id\":\"6BnQwlxRn7ZuWdzninM9sQ\",\"user_id\":\"JSrP-dUmLlwZiI7Dp3PQ2A\",\"business_id\":\"cHdJXLlKNWixBXpDwEGb_A\",\"stars\":3.0,\"useful\":1,\"funny\":7,\"cool\":1,\"text\":\"I love chinese food and I love mexican food. What can go wrong? A couple of things. First things first, this place is more of a \\\"rice bowl\\\" kind of place. I thought it was going to be more diverse as far as the menu goes, but its mainly rice bowls you get with different kinds of meats. The ordering was a little confusing at first, but one of the employees helped us out and I got the 2-item bowl and got the jade chicken and hengrenade chicken with all rice(jerk). 
I also ordered a jade chicken quesadilla on the side.\\n\\nI'm gonna admit, this place looks kinda dirty. I don't think Arizona uses those health department letter grade system like California does, but if I were to just judge by how it looked inside, i'd give it a \\\"C\\\" grade lol. We waited for about 15 minutes or so and finally got our food. We took it to go and ate at our hotel room. \\n\\nMmmm... the food was just alright. The jade chicken was nothing special. It tasted like any generic chinese fast food orange chicken\\/sesame chicken variant. The hengrenade chicken, although was the less spicier version of the jerk chicken, was still pretty spicy for me. Just be warned the jerk chicken is super spicy. If you aren't sure, ask for a sample at the restaurant before ordering, but it was way too spicy for me. \\n\\nThe jade chicken quesadilla was decent, but nothing special. Just imagine orange chicken in between a tortilla and cheese. A friend of mine ordered a jade chicken burrito and we were confused when we pulled it out of the bag because it was literally the size of Mcdonald's apple pie. If you order the burrito, be warned that it's a burrito for gnomes and smurfs, but he said it was tasty. \\n\\nThey provide a snicker doodle sugar cookie for each meal and it was decent, again nothing special. \\n\\nNot gonna lie, the next day my stomach felt like a little mexican dude and chinese dude were wrestling and throwing molotov cocktails inside. I used the bathroom like 5 times. I don't recommend eating this place if you have a lot to do the next day.\",\"date\":\"2015-04-01 16:30:00\"}\n" +"{\"review_id\":\"rEITo90tpyKmEfNDp3Ou3A\",\"user_id\":\"6Fz_nus_OG4gar721OKgZA\",\"business_id\":\"6lj2BJ4tJeu7db5asGHQ4w\",\"stars\":5.0,\"useful\":0,\"funny\":0,\"cool\":0,\"text\":\"We've been a huge Slim's fan since they opened one up in Texas about two years ago when we used to live there. This place never disappoints. They even have great salads and grilled chicken. 
Plus they have fresh brewed sweet tea, it's the best!\",\"date\":\"2017-05-26 01:23:19\"}\n" +"{\"review_id\":\"4bUyL7lzoWzDZaJETAKREg\",\"user_id\":\"_N7Ndn29bpll_961oPeEfw\",\"business_id\":\"y-Iw6dZflNix4BdwIyTNGA\",\"stars\":3.0,\"useful\":0,\"funny\":0,\"cool\":0,\"text\":\"Good selection of classes of beers and mains. I've been here twice.\\n\\nFirst time I had the fried chicken. It was delicious, but be warned, extremely salty. I couldn't even finish the last piece of chicken after experiencing a salt overload.\\n\\nSecond time we came on a wednesday. We didn't know it was BBQ night, where they have a completely different menu, and don't offer anything from their original vegetarian-friendly menu. This menu has one vegetarian-friendly option - an eggplant sandwich. The vegetarian in my party said it was awful. Also, on BBQ night you choose 2 sides. Except they were out of all their sides except 2 - fries and potato salad. I can't say I was thrilled to have carb heavy sides with my carb heavy main. 
How do you run out of sides so early in the evening?\\n\\nService not so great.\\n\\nI'd avoid coming here on wednesdays.\",\"date\":\"2014-06-27 21:19:23\"}\n" diff --git a/test/io/test_dataset_loader.py b/test/io/test_dataset_loader.py index 1ca2e672..83f16dcd 100644 --- a/test/io/test_dataset_loader.py +++ b/test/io/test_dataset_loader.py @@ -2,6 +2,8 @@ import unittest import os from fastNLP.io import Conll2003Loader, PeopleDailyCorpusLoader, CSVLoader, SNLILoader, JsonLoader from fastNLP.io.dataset_loader import SSTLoader +from reproduction.text_classification.data.yelpLoader import yelpLoader + class TestDatasetLoader(unittest.TestCase): @@ -59,3 +61,8 @@ class TestDatasetLoader(unittest.TestCase): print(info.vocabs) print(info.datasets) os.remove(train), os.remove(test) + + def test_yelp(self): + ds = yelpLoader().load('test/data_for_tests/yelp_sample.json') + assert len(ds) == 20 + From 189a0a066c34c6927fbaf75b7adfb96bfa54be15 Mon Sep 17 00:00:00 2001 From: Violet Yao Date: Sun, 9 Jun 2019 16:45:54 +0800 Subject: [PATCH 07/17] removed ast --- fastNLP/io/dataset_loader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/fastNLP/io/dataset_loader.py b/fastNLP/io/dataset_loader.py index 7d78669e..e366c6ea 100644 --- a/fastNLP/io/dataset_loader.py +++ b/fastNLP/io/dataset_loader.py @@ -29,7 +29,6 @@ from .file_reader import _read_csv, _read_json, _read_conll from .base_loader import DataSetLoader from .data_loader.sst import SSTLoader from ..core.const import Const -import ast class PeopleDailyCorpusLoader(DataSetLoader): From e1869ec0572be383f62fca989dd887252db4ce5f Mon Sep 17 00:00:00 2001 From: Violet Yao Date: Sun, 9 Jun 2019 21:25:56 +0800 Subject: [PATCH 08/17] Delete yelp_sample.json transfer to reproduction folder --- test/data_for_tests/yelp_sample.json | 20 -------------------- 1 file changed, 20 deletions(-) delete mode 100644 test/data_for_tests/yelp_sample.json diff --git a/test/data_for_tests/yelp_sample.json b/test/data_for_tests/yelp_sample.json 
deleted file mode 100644 index 053dc4bc..00000000 --- a/test/data_for_tests/yelp_sample.json +++ /dev/null @@ -1,20 +0,0 @@ -"{\"review_id\":\"Q1sbwvVQXV2734tPgoKj4Q\",\"user_id\":\"hG7b0MtEbXx5QzbzE6C_VA\",\"business_id\":\"ujmEBvifdJM6h6RLv4wQIg\",\"stars\":1.0,\"useful\":6,\"funny\":1,\"cool\":0,\"text\":\"Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! Avoid Hospital ERs at all costs.\",\"date\":\"2013-05-07 04:34:36\"}\n" -"{\"review_id\":\"GJXCdrto3ASJOqKeVWPi6Q\",\"user_id\":\"yXQM5uF2jS6es16SJzNHfg\",\"business_id\":\"NZnhc2sEQy3RmzKTZnqtwQ\",\"stars\":5.0,\"useful\":0,\"funny\":0,\"cool\":0,\"text\":\"I *adore* Travis at the Hard Rock's new Kelly Cardenas Salon! I'm always a fan of a great blowout and no stranger to the chains that offer this service; however, Travis has taken the flawless blowout to a whole new level! \\n\\nTravis's greets you with his perfectly green swoosh in his otherwise perfectly styled black hair and a Vegas-worthy rockstar outfit. Next comes the most relaxing and incredible shampoo -- where you get a full head message that could cure even the very worst migraine in minutes --- and the scented shampoo room. Travis has freakishly strong fingers (in a good way) and use the perfect amount of pressure. That was superb! Then starts the glorious blowout... where not one, not two, but THREE people were involved in doing the best round-brush action my hair has ever seen. The team of stylists clearly gets along extremely well, as it's evident from the way they talk to and help one another that it's really genuine and not some corporate requirement. It was so much fun to be there! \\n\\nNext Travis started with the flat iron. The way he flipped his wrist to get volume all around without over-doing it and making me look like a Texas pagent girl was admirable. 
It's also worth noting that he didn't fry my hair -- something that I've had happen before with less skilled stylists. At the end of the blowout & style my hair was perfectly bouncey and looked terrific. The only thing better? That this awesome blowout lasted for days! \\n\\nTravis, I will see you every single time I'm out in Vegas. You make me feel beauuuutiful!\",\"date\":\"2017-01-14 21:30:33\"}\n" -"{\"review_id\":\"2TzJjDVDEuAW6MR5Vuc1ug\",\"user_id\":\"n6-Gk65cPZL6Uz8qRm3NYw\",\"business_id\":\"WTqjgwHlXbSFevF32_DJVw\",\"stars\":5.0,\"useful\":3,\"funny\":0,\"cool\":0,\"text\":\"I have to say that this office really has it together, they are so organized and friendly! Dr. J. Phillipp is a great dentist, very friendly and professional. The dental assistants that helped in my procedure were amazing, Jewel and Bailey helped me to feel comfortable! I don't have dental insurance, but they have this insurance through their office you can purchase for $80 something a year and this gave me 25% off all of my dental work, plus they helped me get signed up for care credit which I knew nothing about before this visit! I highly recommend this office for the nice synergy the whole office has!\",\"date\":\"2016-11-09 20:09:03\"}\n" -"{\"review_id\":\"yi0R0Ugj_xUx_Nek0-_Qig\",\"user_id\":\"dacAIZ6fTM6mqwW5uxkskg\",\"business_id\":\"ikCg8xy5JIg_NGPx-MSIDA\",\"stars\":5.0,\"useful\":0,\"funny\":0,\"cool\":0,\"text\":\"Went in for a lunch. Steak sandwich was delicious, and the Caesar salad had an absolutely delicious dressing, with a perfect amount of dressing, and distributed perfectly across each leaf. I know I'm going on about the salad ... But it was perfect.\\n\\nDrink prices were pretty good.\\n\\nThe Server, Dawn, was friendly and accommodating. Very happy with her.\\n\\nIn summation, a great pub experience. 
Would go again!\",\"date\":\"2018-01-09 20:56:38\"}\n" -"{\"review_id\":\"11a8sVPMUFtaC7_ABRkmtw\",\"user_id\":\"ssoyf2_x0EQMed6fgHeMyQ\",\"business_id\":\"b1b1eb3uo-w561D0ZfCEiQ\",\"stars\":1.0,\"useful\":7,\"funny\":0,\"cool\":0,\"text\":\"Today was my second out of three sessions I had paid for. Although my first session went well, I could tell Meredith had a particular enjoyment for her male clients over her female. However, I returned because she did my teeth fine and I was pleased with the results. When I went in today, I was in the whitening room with three other gentlemen. My appointment started out well, although, being a person who is in the service industry, I always attend to my female clientele first when a couple arrives. Unbothered by those signs, I waited my turn. She checked on me once after my original 30 minute timer to ask if I was ok. She attended my boyfriend on numerous occasions, as well as the other men, and would exit the room without even asking me or looking to see if I had any irritation. Half way through, another woman had showed up who she was explaining the deals to in the lobby. While she admits timers must be reset half way through the process, she reset my boyfriends, left, rest the gentleman furthest away from me who had time to come in, redeem his deal, get set, and gave his timer done, before me, then left, and at this point my time was at 10 minutes. So, she should have reset it 5 minutes ago, according to her. While I sat there patiently this whole time with major pain in my gums, i watched the time until the lamp shut off. Not only had she reset two others, explained deals to other guest, but she never once checked on my time. When my light turned off, I released the stance of my mouth to a more relaxed state, assuming I was only getting a thirty minute session instead of the usual 45, because she had yet to come in. 
At this point, the teeth formula was not only burning the gum she neglected for 25 minutes now, but it began to burn my lips. I began squealing and slapping my chair trying to get her attention from the other room in a panic. I was in so much pain, that by the time she entered the room I was already out of my chair. She finally then acknowledged me, and asked if she could put vitamin E on my gum burn (pictured below). At this point, she has treated two other gums burns, while neglecting me, and I was so irritated that I had to suffer, all I wanted was to leave. While I waited for my boyfriend, she kept harassing me about the issue. Saying, \\\"well burns come with teeth whitening.\\\" While I totally agree, and under justifiable circumstances would not be as irritate, it could have easily been avoid if she had checked on me even a second time, so I could let her know. Not only did she never check on my physical health, she couldn't even take two seconds to reset the timer, which she even admitted to me. Her accuse was that she was coming in to do it, but I had the light off for a solid two minutes before I couldn't stand the pain. She admitted it should be reset every 15 minutes, which means for 25 minutes she did not bother to help me at all. Her guest in the lobby then proceeded to attack me as well, simply because I wanted to leave after the way I was treated. I also expected a refund for not getting a complete session today, due to the neglect, and the fact I won't be returning for my last, she had failed to do that. She was even screaming from the door, and continued to until my boyfriend and I were down the steps. 
I have never in my life been more appalled by a grown woman's behavior, who claims to be in the business for \\\"10 years.\\\" Admit your wrongs, but don't make your guest feel unwelcome because you can't do you job properly.\",\"date\":\"2018-01-30 23:07:38\"}\n" -"{\"review_id\":\"fdiNeiN_hoCxCMy2wTRW9g\",\"user_id\":\"w31MKYsNFMrjhWxxAb5wIw\",\"business_id\":\"eU_713ec6fTGNO4BegRaww\",\"stars\":4.0,\"useful\":0,\"funny\":0,\"cool\":0,\"text\":\"I'll be the first to admit that I was not excited about going to La Tavolta. Being a food snob, when a group of friends suggested we go for dinner I looked online at the menu and to me there was nothing special and it seemed overpriced. Im also not big on ordering pasta when I go out. Alas, I was outnumbered. Thank goodness! I ordered the sea bass special. It was to die for. Cooked perfectly, seasoned perfectly, perfect portion. I can not say enough good things about this dish. When the server asked how it was he seemed very proud of the dish and said, \\\" doesn't she (the chef) do an incredible job?\\\" She does. \\n\\nMy hubby got the crab tortellini and also loved his. I heard \\\"mmmm this is so good\\\" from all around the table. Our waiter was super nice and even gave us free desserts because we were some of the last people in the restaurant. Service was very slow and the place was PACKED but we had our jugs of wine and a large group with good conversation so it didn't seem to bother anyone.\\n\\nSo-\\n\\nDo order the calamari and fried zucchini appetizers. Leave out the mussels. \\n\\nIf they have the sea bass special, I highly recommend it. The chicken parm and crab tortellini were also very good and very big. The chicken Romano was a bit bland. The house salads were teeny. \\n\\nDo make a reservation but still expect to wait for your food. Go with a large group of people and plan for it to be loud. Don't go with a date unless you're fighting and don't feel like hearing anything they have to say. 
Ask to sit in the side room if it's available.\",\"date\":\"2013-01-20 13:25:59\"}\n" -"{\"review_id\":\"G7XHMxG0bx9oBJNECG4IFg\",\"user_id\":\"jlu4CztcSxrKx56ba1a5AQ\",\"business_id\":\"3fw2X5bZYeW9xCz_zGhOHg\",\"stars\":3.0,\"useful\":5,\"funny\":4,\"cool\":5,\"text\":\"Tracy dessert had a big name in Hong Kong and the one in First Markham place has been here for many years now! \\n\\nCame in for some Chinese dessert, and I must say their selection has increased tremendously over the years. I might as well add that the price has also increased tremendously as well. The waitress gave us tea, which I could taste had red date in it. Fancy!\\n\\nA simple taro with coconut with tapioca pearls was like $5.25 or something. Basically all the desserts were more than $5. That's crazy! I can literally just make this dessert at home and for a bowl, it would probably cost like $0.50. A few years ago, I think I can still get it for like $3-$4, which is more reasonable, but wow, more than $5 is a little over the top for this dessert. Though I must say, it is Tracy Dessert, and they are a little more on the expensive side. \\n\\nI also saw other items on the menu like fish balls, chicken wings, shaved ice. My friend got a mango drink with fresh mango in it! \\n\\nI'm also surprised how many people come to Tracy Dessert after work. We came on a Sunday and the tables were always filled. I think the amount of tables they had were just perfect because no one really waited for seats for a long time, but the tables kept filling up once a table was finished.\",\"date\":\"2016-05-07 01:21:02\"}\n" -"{\"review_id\":\"8e9HxxLjjqc9ez5ezzN7iQ\",\"user_id\":\"d6xvYpyzcfbF_AZ8vMB7QA\",\"business_id\":\"zvO-PJCpNk4fgAVUnExYAA\",\"stars\":1.0,\"useful\":3,\"funny\":1,\"cool\":1,\"text\":\"This place has gone down hill. Clearly they have cut back on staff and food quality\\n\\nMany of the reviews were written before the menu changed. 
I've been going for years and the food quality has gone down hill.\\n\\nThe service is slow & my salad, which was $15, was as bad as it gets.\\n\\nIt's just not worth spending the money on this place when there are so many other options.\",\"date\":\"2010-10-05 19:12:35\"}\n" -"{\"review_id\":\"qrffudO73zsslZbe8B9D3Q\",\"user_id\":\"sG_h0dIzTKWa3Q6fmb4u-g\",\"business_id\":\"b2jN2mm9Wf3RcrZCgfo1cg\",\"stars\":2.0,\"useful\":1,\"funny\":0,\"cool\":0,\"text\":\"I was really looking forward to visiting after having some of their beers. The \\\"Man O'War\\\" quickly became my favorite DIPA; the Rusulka Vanilla Stout is a good thick, sweet stout; and the Ironclad is a top notch IPA. \\nThe only big miss on their beers I've had is the Big Chuck Barleywine. It could probably benefit greatly with age, but at this age all there is to taste is the alcohol. \\nNonetheless, I had enough to convince me that the other beers I hadn't had from them would be top notch... and they are! \\nThe reason for the 2 stars should not reflect the quality of the brewers, they obviously know their craft well! \\nThe servers are great and friendly.... but relying on two servers to wait on 100+ customers says a lot about how inexperienced management must be. In fact, after waiting 15 mins at a dirty table I was finally able to track down someone I guessed was an employee to let them know we were even there! \\nAfter another 5+ mins, the GM finally stopped over to take our drink order. The smugness of this guy was amazing. The thought of offering a simple apology never seemed to enter into his head. \\nThis is the time a server finally stopped by to pick up the non-final check left by the party before us... who didn't seem very pleased when leaving. 
\\nThe toast & cheese was good, but by the time we were able to dig into their heartiest offering of food, saltines and butter may have been equally pleasing.\",\"date\":\"2015-01-18 14:04:18\"}\n" -"{\"review_id\":\"RS_GTIT6836bCaPy637kNQ\",\"user_id\":\"nMeCE5-xsdleyxYuNZ_7rA\",\"business_id\":\"oxwGyA17NL6c5t1Etg5WgQ\",\"stars\":3.0,\"useful\":1,\"funny\":0,\"cool\":1,\"text\":\"It's a giant Best Buy with 66 registers. I don't get it. What's the big deal about this place??\",\"date\":\"2012-02-29 21:52:43\"}\n" -"{\"review_id\":\"kbtscdyz6lvrtGjD1quQTg\",\"user_id\":\"FIk4lQQu1eTe2EpzQ4xhBA\",\"business_id\":\"8mIrX_LrOnAqWsB5JrOojQ\",\"stars\":4.0,\"useful\":0,\"funny\":0,\"cool\":0,\"text\":\"Like walking back in time, every Saturday morning my sister and I was in a bowling league and after we were done, we'd spend a few quarters playing the pin ball machines until our mother came to pick us up.\\n\\nMy sister was daring and play the machines hard, she was afraid of that \\\"tilt\\\" showing up and freezing the game. I, on the other hand was a bit more gentler and wanted to make sure I got my quarter's worth.\\n\\nThis place has rows and rows of machines, some are really old and some are more of a mid 80's theme. There is even a Ms pac man! It was fun to spend an afternoon playing the machines and remembering all the fun of my early teen years.\",\"date\":\"2011-11-30 02:11:15\"}\n" -"{\"review_id\":\"-I5umRTkhw15RqpKMl_o1Q\",\"user_id\":\"-mA3-1mN4JIEkqOtdbNXCQ\",\"business_id\":\"mRUVMJkUGxrByzMQ2MuOpA\",\"stars\":1.0,\"useful\":0,\"funny\":1,\"cool\":0,\"text\":\"Walked in around 4 on a Friday afternoon, we sat at a table just off the bar and walked out after 5 min or so. Don't even think they realized we walked in. However everyone at the bar noticed we walked in!!! Service was non existent at best. Not a good way for a new business to start out. 
Oh well, the location they are at has been about 5 different things over the past several years, so they will just be added to the list. SMDH!!!\",\"date\":\"2017-12-15 23:27:08\"}\n" -"{\"review_id\":\"Z7wgXp98wYB57QdRY3HQ3w\",\"user_id\":\"GYNnVehQeXjty0xH7-6Fhw\",\"business_id\":\"FxLfqxdYPA6Z85PFKaqLrg\",\"stars\":4.0,\"useful\":0,\"funny\":0,\"cool\":0,\"text\":\"Wow. So surprised at the one and two star reviews! We started with the most tender calamari. Although the marinara sauce was a bit bland, but a touch of salt made it just right. My husband had the veal with peppers and said it was so delicious and tender. The mashed potatoes were perfect. I had the salmon Diablo which was also delicious. Our salad was beautiful! Dressing was served on the salad and it was a nice amount. We ended our delicious meal with a piece of tiramisu. Our server Matt was right on!! Very pleasant and knowledgeable about the menu. Our appetizer, salad and entrees were timed perfectly. I love salad and did not mind that my entree was served while I was still eating it! No problem it let my dinner cool to just the right temp for me to eat it comfortably. \\nI wonder sometimes if people just don't appreciate relaxing and taking time to eat a wonderful and beautifully prepared meal. A wonderful atmosphere. So relaxing. The chairs are super comfortable too!!! We will certainly be back. \\nGive it a try. Don't always go by the reviews. \\nA bottle of Riesling, calamari app, two delicious entrees and dessert for $92! \\nWell with it.\",\"date\":\"2016-05-07 01:36:53\"}\n" -"{\"review_id\":\"qlXw1JQ0UodW7qrmVgwCXw\",\"user_id\":\"bAhqAPoWaZYcyYi7bs024Q\",\"business_id\":\"LUN6swQYa4xJKaM_UEUOEw\",\"stars\":4.0,\"useful\":0,\"funny\":0,\"cool\":0,\"text\":\"Michael from Red Carpet VIP is amazing ! I reached out because I needed help planning my soon to be sister in law's bachelorette. It was a group of 10 girls so I was a little overwhelmed but Michael saved the day! 
Everything was super smooth and easy! We got good deals and had the best time ever! We booked hotel and a bachelorette package for a great price. I have saved contact info because I will for sure reach out again on next Vegas trip!!!\",\"date\":\"2018-04-27 20:25:26\"}\n" -"{\"review_id\":\"JVcjMhlavKKn3UIt9p9OXA\",\"user_id\":\"TpyOT5E16YASd7EWjLQlrw\",\"business_id\":\"AakkkTuGZA2KBodKi2_u8A\",\"stars\":1.0,\"useful\":1,\"funny\":1,\"cool\":0,\"text\":\"I cannot believe how things have changed in 3 years. I picked up duck congee sometime in the winter when my hubby was sick. I was very disappointed because the ginger fish sauce tasted like it had gone bad (it should never be bitter). Today, my hubby wanted to eat there since he was craving the duck congee and most places don't serve the duck & coleslaw side. We waited about 10 minutes to get our menu. After we placed our orders, we waited another 5 minutes to get the tea that most places bring with the menu. I could go on with the details but the gist of the story is they were understaffed or the staff was slow. The worst part of it was that the service. The servers make us feel bad for asking for anything (like when they took our order). We had arrived and placed our order before another couple bside us at least 10 minutes ahead but somehow, this couple received their pho before mine. 
They were almost done eating their pho before mine came out.\",\"date\":\"2012-07-16 00:37:14\"}\n" -"{\"review_id\":\"svK3nBU7Rk8VfGorlrN52A\",\"user_id\":\"NJlxGtouq06hhC7sS2ECYw\",\"business_id\":\"YvrylyuWgbP90RgMqZQVnQ\",\"stars\":5.0,\"useful\":0,\"funny\":0,\"cool\":0,\"text\":\"You can't really find anything wrong with this place, the pastas and pizzas are both amazing and high quality, the price is very reasonable, the owner and the staff are very friendly, if you're in downtown check this place out, a lot of people think just because it's downtown there are lots of options around but that's not always the case as there is also a lot of poor quality food in downtown as well.\",\"date\":\"2017-04-07 21:27:49\"}\n" -"{\"review_id\":\"1wVA2-vQIuW_ClmXkDxqMQ\",\"user_id\":\"86J5DwcFk4f4In1Vxe2TvA\",\"business_id\":\"NyLYY8q1-H3hfsTwuwLPCg\",\"stars\":4.0,\"useful\":0,\"funny\":0,\"cool\":0,\"text\":\"Great lunch today. Staff was very helpful in assisting with selections and knowledgeable on the ingredients. We enjoyed the BBQ chicken with tika masala sauce and really good naan bread. The biryani with chicken was also yummy! Fun to see the food being prepared in the tandoori ovens. Great addition to the fast casual scene in Cleveland.\",\"date\":\"2015-01-03 22:47:34\"}\n" -"{\"review_id\":\"6BnQwlxRn7ZuWdzninM9sQ\",\"user_id\":\"JSrP-dUmLlwZiI7Dp3PQ2A\",\"business_id\":\"cHdJXLlKNWixBXpDwEGb_A\",\"stars\":3.0,\"useful\":1,\"funny\":7,\"cool\":1,\"text\":\"I love chinese food and I love mexican food. What can go wrong? A couple of things. First things first, this place is more of a \\\"rice bowl\\\" kind of place. I thought it was going to be more diverse as far as the menu goes, but its mainly rice bowls you get with different kinds of meats. The ordering was a little confusing at first, but one of the employees helped us out and I got the 2-item bowl and got the jade chicken and hengrenade chicken with all rice(jerk). 
I also ordered a jade chicken quesadilla on the side.\\n\\nI'm gonna admit, this place looks kinda dirty. I don't think Arizona uses those health department letter grade system like California does, but if I were to just judge by how it looked inside, i'd give it a \\\"C\\\" grade lol. We waited for about 15 minutes or so and finally got our food. We took it to go and ate at our hotel room. \\n\\nMmmm... the food was just alright. The jade chicken was nothing special. It tasted like any generic chinese fast food orange chicken\\/sesame chicken variant. The hengrenade chicken, although was the less spicier version of the jerk chicken, was still pretty spicy for me. Just be warned the jerk chicken is super spicy. If you aren't sure, ask for a sample at the restaurant before ordering, but it was way too spicy for me. \\n\\nThe jade chicken quesadilla was decent, but nothing special. Just imagine orange chicken in between a tortilla and cheese. A friend of mine ordered a jade chicken burrito and we were confused when we pulled it out of the bag because it was literally the size of Mcdonald's apple pie. If you order the burrito, be warned that it's a burrito for gnomes and smurfs, but he said it was tasty. \\n\\nThey provide a snicker doodle sugar cookie for each meal and it was decent, again nothing special. \\n\\nNot gonna lie, the next day my stomach felt like a little mexican dude and chinese dude were wrestling and throwing molotov cocktails inside. I used the bathroom like 5 times. I don't recommend eating this place if you have a lot to do the next day.\",\"date\":\"2015-04-01 16:30:00\"}\n" -"{\"review_id\":\"rEITo90tpyKmEfNDp3Ou3A\",\"user_id\":\"6Fz_nus_OG4gar721OKgZA\",\"business_id\":\"6lj2BJ4tJeu7db5asGHQ4w\",\"stars\":5.0,\"useful\":0,\"funny\":0,\"cool\":0,\"text\":\"We've been a huge Slim's fan since they opened one up in Texas about two years ago when we used to live there. This place never disappoints. They even have great salads and grilled chicken. 
Plus they have fresh brewed sweet tea, it's the best!\",\"date\":\"2017-05-26 01:23:19\"}\n" -"{\"review_id\":\"4bUyL7lzoWzDZaJETAKREg\",\"user_id\":\"_N7Ndn29bpll_961oPeEfw\",\"business_id\":\"y-Iw6dZflNix4BdwIyTNGA\",\"stars\":3.0,\"useful\":0,\"funny\":0,\"cool\":0,\"text\":\"Good selection of classes of beers and mains. I've been here twice.\\n\\nFirst time I had the fried chicken. It was delicious, but be warned, extremely salty. I couldn't even finish the last piece of chicken after experiencing a salt overload.\\n\\nSecond time we came on a wednesday. We didn't know it was BBQ night, where they have a completely different menu, and don't offer anything from their original vegetarian-friendly menu. This menu has one vegetarian-friendly option - an eggplant sandwich. The vegetarian in my party said it was awful. Also, on BBQ night you choose 2 sides. Except they were out of all their sides except 2 - fries and potato salad. I can't say I was thrilled to have carb heavy sides with my carb heavy main. 
How do you run out of sides so early in the evening?\\n\\nService not so great.\\n\\nI'd avoid coming here on wednesdays.\",\"date\":\"2014-06-27 21:19:23\"}\n" From 74925f8237e32c2bf6d026d31a174a09c5bf4b58 Mon Sep 17 00:00:00 2001 From: Violet Yao Date: Sun, 9 Jun 2019 21:30:40 +0800 Subject: [PATCH 09/17] wrong folder --- test/io/test_dataset_loader.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/test/io/test_dataset_loader.py b/test/io/test_dataset_loader.py index 83f16dcd..7cff3c12 100644 --- a/test/io/test_dataset_loader.py +++ b/test/io/test_dataset_loader.py @@ -61,8 +61,3 @@ class TestDatasetLoader(unittest.TestCase): print(info.vocabs) print(info.datasets) os.remove(train), os.remove(test) - - def test_yelp(self): - ds = yelpLoader().load('test/data_for_tests/yelp_sample.json') - assert len(ds) == 20 - From 83729dfc39ca809d25dc4458651ada14ee9deb9e Mon Sep 17 00:00:00 2001 From: Violet Yao Date: Sun, 9 Jun 2019 21:46:34 +0800 Subject: [PATCH 10/17] moved test to reproduction folder --- .../text_classification/test/sample_yelp.json | 20 +++++++++++++++++++ .../text_classification/test/test_yelp.py | 7 +++++++ 2 files changed, 27 insertions(+) create mode 100644 reproduction/text_classification/test/sample_yelp.json create mode 100644 reproduction/text_classification/test/test_yelp.py diff --git a/reproduction/text_classification/test/sample_yelp.json b/reproduction/text_classification/test/sample_yelp.json new file mode 100644 index 00000000..053dc4bc --- /dev/null +++ b/reproduction/text_classification/test/sample_yelp.json @@ -0,0 +1,20 @@ +"{\"review_id\":\"Q1sbwvVQXV2734tPgoKj4Q\",\"user_id\":\"hG7b0MtEbXx5QzbzE6C_VA\",\"business_id\":\"ujmEBvifdJM6h6RLv4wQIg\",\"stars\":1.0,\"useful\":6,\"funny\":1,\"cool\":0,\"text\":\"Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! 
Avoid Hospital ERs at all costs.\",\"date\":\"2013-05-07 04:34:36\"}\n" +"{\"review_id\":\"GJXCdrto3ASJOqKeVWPi6Q\",\"user_id\":\"yXQM5uF2jS6es16SJzNHfg\",\"business_id\":\"NZnhc2sEQy3RmzKTZnqtwQ\",\"stars\":5.0,\"useful\":0,\"funny\":0,\"cool\":0,\"text\":\"I *adore* Travis at the Hard Rock's new Kelly Cardenas Salon! I'm always a fan of a great blowout and no stranger to the chains that offer this service; however, Travis has taken the flawless blowout to a whole new level! \\n\\nTravis's greets you with his perfectly green swoosh in his otherwise perfectly styled black hair and a Vegas-worthy rockstar outfit. Next comes the most relaxing and incredible shampoo -- where you get a full head message that could cure even the very worst migraine in minutes --- and the scented shampoo room. Travis has freakishly strong fingers (in a good way) and use the perfect amount of pressure. That was superb! Then starts the glorious blowout... where not one, not two, but THREE people were involved in doing the best round-brush action my hair has ever seen. The team of stylists clearly gets along extremely well, as it's evident from the way they talk to and help one another that it's really genuine and not some corporate requirement. It was so much fun to be there! \\n\\nNext Travis started with the flat iron. The way he flipped his wrist to get volume all around without over-doing it and making me look like a Texas pagent girl was admirable. It's also worth noting that he didn't fry my hair -- something that I've had happen before with less skilled stylists. At the end of the blowout & style my hair was perfectly bouncey and looked terrific. The only thing better? That this awesome blowout lasted for days! \\n\\nTravis, I will see you every single time I'm out in Vegas. 
You make me feel beauuuutiful!\",\"date\":\"2017-01-14 21:30:33\"}\n" +"{\"review_id\":\"2TzJjDVDEuAW6MR5Vuc1ug\",\"user_id\":\"n6-Gk65cPZL6Uz8qRm3NYw\",\"business_id\":\"WTqjgwHlXbSFevF32_DJVw\",\"stars\":5.0,\"useful\":3,\"funny\":0,\"cool\":0,\"text\":\"I have to say that this office really has it together, they are so organized and friendly! Dr. J. Phillipp is a great dentist, very friendly and professional. The dental assistants that helped in my procedure were amazing, Jewel and Bailey helped me to feel comfortable! I don't have dental insurance, but they have this insurance through their office you can purchase for $80 something a year and this gave me 25% off all of my dental work, plus they helped me get signed up for care credit which I knew nothing about before this visit! I highly recommend this office for the nice synergy the whole office has!\",\"date\":\"2016-11-09 20:09:03\"}\n" +"{\"review_id\":\"yi0R0Ugj_xUx_Nek0-_Qig\",\"user_id\":\"dacAIZ6fTM6mqwW5uxkskg\",\"business_id\":\"ikCg8xy5JIg_NGPx-MSIDA\",\"stars\":5.0,\"useful\":0,\"funny\":0,\"cool\":0,\"text\":\"Went in for a lunch. Steak sandwich was delicious, and the Caesar salad had an absolutely delicious dressing, with a perfect amount of dressing, and distributed perfectly across each leaf. I know I'm going on about the salad ... But it was perfect.\\n\\nDrink prices were pretty good.\\n\\nThe Server, Dawn, was friendly and accommodating. Very happy with her.\\n\\nIn summation, a great pub experience. Would go again!\",\"date\":\"2018-01-09 20:56:38\"}\n" +"{\"review_id\":\"11a8sVPMUFtaC7_ABRkmtw\",\"user_id\":\"ssoyf2_x0EQMed6fgHeMyQ\",\"business_id\":\"b1b1eb3uo-w561D0ZfCEiQ\",\"stars\":1.0,\"useful\":7,\"funny\":0,\"cool\":0,\"text\":\"Today was my second out of three sessions I had paid for. Although my first session went well, I could tell Meredith had a particular enjoyment for her male clients over her female. 
However, I returned because she did my teeth fine and I was pleased with the results. When I went in today, I was in the whitening room with three other gentlemen. My appointment started out well, although, being a person who is in the service industry, I always attend to my female clientele first when a couple arrives. Unbothered by those signs, I waited my turn. She checked on me once after my original 30 minute timer to ask if I was ok. She attended my boyfriend on numerous occasions, as well as the other men, and would exit the room without even asking me or looking to see if I had any irritation. Half way through, another woman had showed up who she was explaining the deals to in the lobby. While she admits timers must be reset half way through the process, she reset my boyfriends, left, rest the gentleman furthest away from me who had time to come in, redeem his deal, get set, and gave his timer done, before me, then left, and at this point my time was at 10 minutes. So, she should have reset it 5 minutes ago, according to her. While I sat there patiently this whole time with major pain in my gums, i watched the time until the lamp shut off. Not only had she reset two others, explained deals to other guest, but she never once checked on my time. When my light turned off, I released the stance of my mouth to a more relaxed state, assuming I was only getting a thirty minute session instead of the usual 45, because she had yet to come in. At this point, the teeth formula was not only burning the gum she neglected for 25 minutes now, but it began to burn my lips. I began squealing and slapping my chair trying to get her attention from the other room in a panic. I was in so much pain, that by the time she entered the room I was already out of my chair. She finally then acknowledged me, and asked if she could put vitamin E on my gum burn (pictured below). 
At this point, she has treated two other gums burns, while neglecting me, and I was so irritated that I had to suffer, all I wanted was to leave. While I waited for my boyfriend, she kept harassing me about the issue. Saying, \\\"well burns come with teeth whitening.\\\" While I totally agree, and under justifiable circumstances would not be as irritate, it could have easily been avoid if she had checked on me even a second time, so I could let her know. Not only did she never check on my physical health, she couldn't even take two seconds to reset the timer, which she even admitted to me. Her accuse was that she was coming in to do it, but I had the light off for a solid two minutes before I couldn't stand the pain. She admitted it should be reset every 15 minutes, which means for 25 minutes she did not bother to help me at all. Her guest in the lobby then proceeded to attack me as well, simply because I wanted to leave after the way I was treated. I also expected a refund for not getting a complete session today, due to the neglect, and the fact I won't be returning for my last, she had failed to do that. She was even screaming from the door, and continued to until my boyfriend and I were down the steps. I have never in my life been more appalled by a grown woman's behavior, who claims to be in the business for \\\"10 years.\\\" Admit your wrongs, but don't make your guest feel unwelcome because you can't do you job properly.\",\"date\":\"2018-01-30 23:07:38\"}\n" +"{\"review_id\":\"fdiNeiN_hoCxCMy2wTRW9g\",\"user_id\":\"w31MKYsNFMrjhWxxAb5wIw\",\"business_id\":\"eU_713ec6fTGNO4BegRaww\",\"stars\":4.0,\"useful\":0,\"funny\":0,\"cool\":0,\"text\":\"I'll be the first to admit that I was not excited about going to La Tavolta. Being a food snob, when a group of friends suggested we go for dinner I looked online at the menu and to me there was nothing special and it seemed overpriced. Im also not big on ordering pasta when I go out. Alas, I was outnumbered. 
Thank goodness! I ordered the sea bass special. It was to die for. Cooked perfectly, seasoned perfectly, perfect portion. I can not say enough good things about this dish. When the server asked how it was he seemed very proud of the dish and said, \\\" doesn't she (the chef) do an incredible job?\\\" She does. \\n\\nMy hubby got the crab tortellini and also loved his. I heard \\\"mmmm this is so good\\\" from all around the table. Our waiter was super nice and even gave us free desserts because we were some of the last people in the restaurant. Service was very slow and the place was PACKED but we had our jugs of wine and a large group with good conversation so it didn't seem to bother anyone.\\n\\nSo-\\n\\nDo order the calamari and fried zucchini appetizers. Leave out the mussels. \\n\\nIf they have the sea bass special, I highly recommend it. The chicken parm and crab tortellini were also very good and very big. The chicken Romano was a bit bland. The house salads were teeny. \\n\\nDo make a reservation but still expect to wait for your food. Go with a large group of people and plan for it to be loud. Don't go with a date unless you're fighting and don't feel like hearing anything they have to say. Ask to sit in the side room if it's available.\",\"date\":\"2013-01-20 13:25:59\"}\n" +"{\"review_id\":\"G7XHMxG0bx9oBJNECG4IFg\",\"user_id\":\"jlu4CztcSxrKx56ba1a5AQ\",\"business_id\":\"3fw2X5bZYeW9xCz_zGhOHg\",\"stars\":3.0,\"useful\":5,\"funny\":4,\"cool\":5,\"text\":\"Tracy dessert had a big name in Hong Kong and the one in First Markham place has been here for many years now! \\n\\nCame in for some Chinese dessert, and I must say their selection has increased tremendously over the years. I might as well add that the price has also increased tremendously as well. The waitress gave us tea, which I could taste had red date in it. Fancy!\\n\\nA simple taro with coconut with tapioca pearls was like $5.25 or something. Basically all the desserts were more than $5. 
That's crazy! I can literally just make this dessert at home and for a bowl, it would probably cost like $0.50. A few years ago, I think I can still get it for like $3-$4, which is more reasonable, but wow, more than $5 is a little over the top for this dessert. Though I must say, it is Tracy Dessert, and they are a little more on the expensive side. \\n\\nI also saw other items on the menu like fish balls, chicken wings, shaved ice. My friend got a mango drink with fresh mango in it! \\n\\nI'm also surprised how many people come to Tracy Dessert after work. We came on a Sunday and the tables were always filled. I think the amount of tables they had were just perfect because no one really waited for seats for a long time, but the tables kept filling up once a table was finished.\",\"date\":\"2016-05-07 01:21:02\"}\n" +"{\"review_id\":\"8e9HxxLjjqc9ez5ezzN7iQ\",\"user_id\":\"d6xvYpyzcfbF_AZ8vMB7QA\",\"business_id\":\"zvO-PJCpNk4fgAVUnExYAA\",\"stars\":1.0,\"useful\":3,\"funny\":1,\"cool\":1,\"text\":\"This place has gone down hill. Clearly they have cut back on staff and food quality\\n\\nMany of the reviews were written before the menu changed. I've been going for years and the food quality has gone down hill.\\n\\nThe service is slow & my salad, which was $15, was as bad as it gets.\\n\\nIt's just not worth spending the money on this place when there are so many other options.\",\"date\":\"2010-10-05 19:12:35\"}\n" +"{\"review_id\":\"qrffudO73zsslZbe8B9D3Q\",\"user_id\":\"sG_h0dIzTKWa3Q6fmb4u-g\",\"business_id\":\"b2jN2mm9Wf3RcrZCgfo1cg\",\"stars\":2.0,\"useful\":1,\"funny\":0,\"cool\":0,\"text\":\"I was really looking forward to visiting after having some of their beers. The \\\"Man O'War\\\" quickly became my favorite DIPA; the Rusulka Vanilla Stout is a good thick, sweet stout; and the Ironclad is a top notch IPA. \\nThe only big miss on their beers I've had is the Big Chuck Barleywine. 
It could probably benefit greatly with age, but at this age all there is to taste is the alcohol. \\nNonetheless, I had enough to convince me that the other beers I hadn't had from them would be top notch... and they are! \\nThe reason for the 2 stars should not reflect the quality of the brewers, they obviously know their craft well! \\nThe servers are great and friendly.... but relying on two servers to wait on 100+ customers says a lot about how inexperienced management must be. In fact, after waiting 15 mins at a dirty table I was finally able to track down someone I guessed was an employee to let them know we were even there! \\nAfter another 5+ mins, the GM finally stopped over to take our drink order. The smugness of this guy was amazing. The thought of offering a simple apology never seemed to enter into his head. \\nThis is the time a server finally stopped by to pick up the non-final check left by the party before us... who didn't seem very pleased when leaving. \\nThe toast & cheese was good, but by the time we were able to dig into their heartiest offering of food, saltines and butter may have been equally pleasing.\",\"date\":\"2015-01-18 14:04:18\"}\n" +"{\"review_id\":\"RS_GTIT6836bCaPy637kNQ\",\"user_id\":\"nMeCE5-xsdleyxYuNZ_7rA\",\"business_id\":\"oxwGyA17NL6c5t1Etg5WgQ\",\"stars\":3.0,\"useful\":1,\"funny\":0,\"cool\":1,\"text\":\"It's a giant Best Buy with 66 registers. I don't get it. 
What's the big deal about this place??\",\"date\":\"2012-02-29 21:52:43\"}\n" +"{\"review_id\":\"kbtscdyz6lvrtGjD1quQTg\",\"user_id\":\"FIk4lQQu1eTe2EpzQ4xhBA\",\"business_id\":\"8mIrX_LrOnAqWsB5JrOojQ\",\"stars\":4.0,\"useful\":0,\"funny\":0,\"cool\":0,\"text\":\"Like walking back in time, every Saturday morning my sister and I was in a bowling league and after we were done, we'd spend a few quarters playing the pin ball machines until our mother came to pick us up.\\n\\nMy sister was daring and play the machines hard, she was afraid of that \\\"tilt\\\" showing up and freezing the game. I, on the other hand was a bit more gentler and wanted to make sure I got my quarter's worth.\\n\\nThis place has rows and rows of machines, some are really old and some are more of a mid 80's theme. There is even a Ms pac man! It was fun to spend an afternoon playing the machines and remembering all the fun of my early teen years.\",\"date\":\"2011-11-30 02:11:15\"}\n" +"{\"review_id\":\"-I5umRTkhw15RqpKMl_o1Q\",\"user_id\":\"-mA3-1mN4JIEkqOtdbNXCQ\",\"business_id\":\"mRUVMJkUGxrByzMQ2MuOpA\",\"stars\":1.0,\"useful\":0,\"funny\":1,\"cool\":0,\"text\":\"Walked in around 4 on a Friday afternoon, we sat at a table just off the bar and walked out after 5 min or so. Don't even think they realized we walked in. However everyone at the bar noticed we walked in!!! Service was non existent at best. Not a good way for a new business to start out. Oh well, the location they are at has been about 5 different things over the past several years, so they will just be added to the list. SMDH!!!\",\"date\":\"2017-12-15 23:27:08\"}\n" +"{\"review_id\":\"Z7wgXp98wYB57QdRY3HQ3w\",\"user_id\":\"GYNnVehQeXjty0xH7-6Fhw\",\"business_id\":\"FxLfqxdYPA6Z85PFKaqLrg\",\"stars\":4.0,\"useful\":0,\"funny\":0,\"cool\":0,\"text\":\"Wow. So surprised at the one and two star reviews! We started with the most tender calamari. Although the marinara sauce was a bit bland, but a touch of salt made it just right. 
My husband had the veal with peppers and said it was so delicious and tender. The mashed potatoes were perfect. I had the salmon Diablo which was also delicious. Our salad was beautiful! Dressing was served on the salad and it was a nice amount. We ended our delicious meal with a piece of tiramisu. Our server Matt was right on!! Very pleasant and knowledgeable about the menu. Our appetizer, salad and entrees were timed perfectly. I love salad and did not mind that my entree was served while I was still eating it! No problem it let my dinner cool to just the right temp for me to eat it comfortably. \\nI wonder sometimes if people just don't appreciate relaxing and taking time to eat a wonderful and beautifully prepared meal. A wonderful atmosphere. So relaxing. The chairs are super comfortable too!!! We will certainly be back. \\nGive it a try. Don't always go by the reviews. \\nA bottle of Riesling, calamari app, two delicious entrees and dessert for $92! \\nWell with it.\",\"date\":\"2016-05-07 01:36:53\"}\n" +"{\"review_id\":\"qlXw1JQ0UodW7qrmVgwCXw\",\"user_id\":\"bAhqAPoWaZYcyYi7bs024Q\",\"business_id\":\"LUN6swQYa4xJKaM_UEUOEw\",\"stars\":4.0,\"useful\":0,\"funny\":0,\"cool\":0,\"text\":\"Michael from Red Carpet VIP is amazing ! I reached out because I needed help planning my soon to be sister in law's bachelorette. It was a group of 10 girls so I was a little overwhelmed but Michael saved the day! Everything was super smooth and easy! We got good deals and had the best time ever! We booked hotel and a bachelorette package for a great price. I have saved contact info because I will for sure reach out again on next Vegas trip!!!\",\"date\":\"2018-04-27 20:25:26\"}\n" +"{\"review_id\":\"JVcjMhlavKKn3UIt9p9OXA\",\"user_id\":\"TpyOT5E16YASd7EWjLQlrw\",\"business_id\":\"AakkkTuGZA2KBodKi2_u8A\",\"stars\":1.0,\"useful\":1,\"funny\":1,\"cool\":0,\"text\":\"I cannot believe how things have changed in 3 years. 
I picked up duck congee sometime in the winter when my hubby was sick. I was very disappointed because the ginger fish sauce tasted like it had gone bad (it should never be bitter). Today, my hubby wanted to eat there since he was craving the duck congee and most places don't serve the duck & coleslaw side. We waited about 10 minutes to get our menu. After we placed our orders, we waited another 5 minutes to get the tea that most places bring with the menu. I could go on with the details but the gist of the story is they were understaffed or the staff was slow. The worst part of it was that the service. The servers make us feel bad for asking for anything (like when they took our order). We had arrived and placed our order before another couple bside us at least 10 minutes ahead but somehow, this couple received their pho before mine. They were almost done eating their pho before mine came out.\",\"date\":\"2012-07-16 00:37:14\"}\n" +"{\"review_id\":\"svK3nBU7Rk8VfGorlrN52A\",\"user_id\":\"NJlxGtouq06hhC7sS2ECYw\",\"business_id\":\"YvrylyuWgbP90RgMqZQVnQ\",\"stars\":5.0,\"useful\":0,\"funny\":0,\"cool\":0,\"text\":\"You can't really find anything wrong with this place, the pastas and pizzas are both amazing and high quality, the price is very reasonable, the owner and the staff are very friendly, if you're in downtown check this place out, a lot of people think just because it's downtown there are lots of options around but that's not always the case as there is also a lot of poor quality food in downtown as well.\",\"date\":\"2017-04-07 21:27:49\"}\n" +"{\"review_id\":\"1wVA2-vQIuW_ClmXkDxqMQ\",\"user_id\":\"86J5DwcFk4f4In1Vxe2TvA\",\"business_id\":\"NyLYY8q1-H3hfsTwuwLPCg\",\"stars\":4.0,\"useful\":0,\"funny\":0,\"cool\":0,\"text\":\"Great lunch today. Staff was very helpful in assisting with selections and knowledgeable on the ingredients. We enjoyed the BBQ chicken with tika masala sauce and really good naan bread. The biryani with chicken was also yummy! 
Fun to see the food being prepared in the tandoori ovens. Great addition to the fast casual scene in Cleveland.\",\"date\":\"2015-01-03 22:47:34\"}\n" +"{\"review_id\":\"6BnQwlxRn7ZuWdzninM9sQ\",\"user_id\":\"JSrP-dUmLlwZiI7Dp3PQ2A\",\"business_id\":\"cHdJXLlKNWixBXpDwEGb_A\",\"stars\":3.0,\"useful\":1,\"funny\":7,\"cool\":1,\"text\":\"I love chinese food and I love mexican food. What can go wrong? A couple of things. First things first, this place is more of a \\\"rice bowl\\\" kind of place. I thought it was going to be more diverse as far as the menu goes, but its mainly rice bowls you get with different kinds of meats. The ordering was a little confusing at first, but one of the employees helped us out and I got the 2-item bowl and got the jade chicken and hengrenade chicken with all rice(jerk). I also ordered a jade chicken quesadilla on the side.\\n\\nI'm gonna admit, this place looks kinda dirty. I don't think Arizona uses those health department letter grade system like California does, but if I were to just judge by how it looked inside, i'd give it a \\\"C\\\" grade lol. We waited for about 15 minutes or so and finally got our food. We took it to go and ate at our hotel room. \\n\\nMmmm... the food was just alright. The jade chicken was nothing special. It tasted like any generic chinese fast food orange chicken\\/sesame chicken variant. The hengrenade chicken, although was the less spicier version of the jerk chicken, was still pretty spicy for me. Just be warned the jerk chicken is super spicy. If you aren't sure, ask for a sample at the restaurant before ordering, but it was way too spicy for me. \\n\\nThe jade chicken quesadilla was decent, but nothing special. Just imagine orange chicken in between a tortilla and cheese. A friend of mine ordered a jade chicken burrito and we were confused when we pulled it out of the bag because it was literally the size of Mcdonald's apple pie. 
If you order the burrito, be warned that it's a burrito for gnomes and smurfs, but he said it was tasty. \\n\\nThey provide a snicker doodle sugar cookie for each meal and it was decent, again nothing special. \\n\\nNot gonna lie, the next day my stomach felt like a little mexican dude and chinese dude were wrestling and throwing molotov cocktails inside. I used the bathroom like 5 times. I don't recommend eating this place if you have a lot to do the next day.\",\"date\":\"2015-04-01 16:30:00\"}\n" +"{\"review_id\":\"rEITo90tpyKmEfNDp3Ou3A\",\"user_id\":\"6Fz_nus_OG4gar721OKgZA\",\"business_id\":\"6lj2BJ4tJeu7db5asGHQ4w\",\"stars\":5.0,\"useful\":0,\"funny\":0,\"cool\":0,\"text\":\"We've been a huge Slim's fan since they opened one up in Texas about two years ago when we used to live there. This place never disappoints. They even have great salads and grilled chicken. Plus they have fresh brewed sweet tea, it's the best!\",\"date\":\"2017-05-26 01:23:19\"}\n" +"{\"review_id\":\"4bUyL7lzoWzDZaJETAKREg\",\"user_id\":\"_N7Ndn29bpll_961oPeEfw\",\"business_id\":\"y-Iw6dZflNix4BdwIyTNGA\",\"stars\":3.0,\"useful\":0,\"funny\":0,\"cool\":0,\"text\":\"Good selection of classes of beers and mains. I've been here twice.\\n\\nFirst time I had the fried chicken. It was delicious, but be warned, extremely salty. I couldn't even finish the last piece of chicken after experiencing a salt overload.\\n\\nSecond time we came on a wednesday. We didn't know it was BBQ night, where they have a completely different menu, and don't offer anything from their original vegetarian-friendly menu. This menu has one vegetarian-friendly option - an eggplant sandwich. The vegetarian in my party said it was awful. Also, on BBQ night you choose 2 sides. Except they were out of all their sides except 2 - fries and potato salad. I can't say I was thrilled to have carb heavy sides with my carb heavy main. 
How do you run out of sides so early in the evening?\\n\\nService not so great.\\n\\nI'd avoid coming here on wednesdays.\",\"date\":\"2014-06-27 21:19:23\"}\n" diff --git a/reproduction/text_classification/test/test_yelp.py b/reproduction/text_classification/test/test_yelp.py new file mode 100644 index 00000000..2c390d46 --- /dev/null +++ b/reproduction/text_classification/test/test_yelp.py @@ -0,0 +1,7 @@ +import unittest +from reproduction.text_classification.data.yelpLoader import yelpLoader + +class TestDatasetLoader(unittest.TestCase): + def test_yelpLoader(self): + ds = yelpLoader().load('sample_yelp.json') + assert len(ds) == 20 \ No newline at end of file From 6309eafd25084c4c1f33113a05b9c03d2eaaf0b1 Mon Sep 17 00:00:00 2001 From: yh_cc Date: Wed, 12 Jun 2019 11:10:33 +0800 Subject: [PATCH 11/17] =?UTF-8?q?1.=20=E5=9C=A8fieldarray=E4=B8=AD?= =?UTF-8?q?=E6=94=AF=E6=8C=81split=EF=BC=8Cint=E7=AD=89handy=E7=9A=84funct?= =?UTF-8?q?ion=202.=20=E9=87=8D=E5=A4=A7=E6=9B=B4=E6=96=B0=EF=BC=8C?= =?UTF-8?q?=E6=94=AF=E6=8C=81ElmoEmbedding,=20BertEmbedding?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/dataset.py | 18 +- fastNLP/core/field.py | 154 ++++ fastNLP/core/utils.py | 8 + fastNLP/core/vocabulary.py | 6 +- fastNLP/io/embed_loader.py | 22 +- fastNLP/io/file_utils.py | 255 ++++++ fastNLP/models/cnn_text_classification.py | 16 +- fastNLP/modules/encoder/_bert.py | 625 ++++++++++++++ fastNLP/modules/encoder/_elmo.py | 774 ++++++++++++++++++ fastNLP/modules/encoder/bert.py | 3 +- fastNLP/modules/encoder/char_encoder.py | 4 +- fastNLP/modules/encoder/conv_maxpool.py | 34 +- fastNLP/modules/encoder/embedding.py | 759 ++++++++++++++++- fastNLP/modules/encoder/lstm.py | 2 +- fastNLP/modules/utils.py | 16 +- .../cws/train_shift_relay.py | 8 +- 16 files changed, 2633 insertions(+), 71 deletions(-) create mode 100644 fastNLP/io/file_utils.py create mode 100644 fastNLP/modules/encoder/_bert.py create mode 100644 
fastNLP/modules/encoder/_elmo.py diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index ab020ce4..b011d15a 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -554,6 +554,7 @@ class DataSet(object): self.field_arrays[new_name].name = new_name else: raise KeyError("DataSet has no field named {}.".format(old_name)) + return self def set_target(self, *field_names, flag=True): """ @@ -593,7 +594,7 @@ class DataSet(object): try: self.field_arrays[name].is_input = flag except SetInputOrTargetException as e: - print(f"Cannot set field:{name} as input.") + print(f"Cannot set field:{name} as input, exception happens at the {e.index} value.") raise e else: raise KeyError("{} is not a valid field name.".format(name)) @@ -761,7 +762,20 @@ class DataSet(object): self._add_apply_field(results, new_field_name, kwargs) return results - + + def add_seq_len(self, field_name:str, new_field_name='seq_len'): + """ + 将使用len()直接对field_name中每个元素作用,将其结果作为seqence length, 并放入seq_len这个field。 + + :param field_name: str. 
+ :return: + """ + if self.has_field(field_name=field_name): + self.apply_field(len, field_name, new_field_name=new_field_name) + else: + raise KeyError(f"Field:{field_name} not found.") + return self + def drop(self, func, inplace=True): """ func接受一个Instance,返回bool值。返回值为True时,该Instance会被移除或者加入到返回的DataSet中。 diff --git a/fastNLP/core/field.py b/fastNLP/core/field.py index c47771df..b9a8196c 100644 --- a/fastNLP/core/field.py +++ b/fastNLP/core/field.py @@ -6,6 +6,7 @@ import numpy as np from typing import Any from abc import abstractmethod from copy import deepcopy +from collections import Counter class SetInputOrTargetException(Exception): def __init__(self, msg, index=None, field_name=None): @@ -223,6 +224,155 @@ class FieldArray: return self + def split(self, sep:str=None, inplace:bool=True): + """ + 依次对自身的元素使用.split()方法,应该只有当本field的元素为str时,该方法才有用。将返回值 + + :param sep: 分割符,如果为None则直接调用str.split()。 + :param inplace: 如果为True,则将新生成值替换本field。否则返回list。 + :return: List[List[str]] or self + """ + new_contents = [] + for index, cell in enumerate(self.content): + try: + new_contents.append(cell.split(sep)) + except Exception as e: + print(f"Exception happens when process value in index {index}.") + print(e) + return self._after_process(new_contents, inplace=inplace) + + def int(self, inplace:bool=True): + """ + 将本field中的值调用int(cell). 
支持field中内容为以下两种情况(1)['1', '2', ...](即field中每个值为str的), + (2) [['1', '2', ..], ['3', ..], ...](即field中每个值为一个list,list中的值会被依次转换。) + + :param inplace: 如果为True,则将新生成值替换本field。否则返回list。 + :return: List[int], List[List[int]], self + """ + new_contents = [] + for index, cell in enumerate(self.content): + try: + if isinstance(cell, list): + new_contents.append([int(value) for value in cell]) + else: + new_contents.append(int(cell)) + except Exception as e: + print(f"Exception happens when process value in index {index}.") + print(e) + return self._after_process(new_contents, inplace=inplace) + + def float(self, inplace=True): + """ + 将本field中的值调用float(cell). 支持field中内容为以下两种情况(1)['1', '2', ...](即field中每个值为str的), + (2) [['1', '2', ..], ['3', ..], ...](即field中每个值为一个list,list中的值会被依次转换。) + + :param inplace: 如果为True,则将新生成值替换本field。否则返回list。 + :return: + """ + new_contents = [] + for index, cell in enumerate(self.content): + try: + if isinstance(cell, list): + new_contents.append([float(value) for value in cell]) + else: + new_contents.append(float(cell)) + except Exception as e: + print(f"Exception happens when process value in index {index}.") + print(e) + return self._after_process(new_contents, inplace=inplace) + + def bool(self, inplace=True): + """ + 将本field中的值调用bool(cell). 支持field中内容为以下两种情况(1)['1', '2', ...](即field中每个值为str的), + (2) [['1', '2', ..], ['3', ..], ...](即field中每个值为一个list,list中的值会被依次转换。) + + :param inplace: 如果为True,则将新生成值替换本field。否则返回list。 + :return: + """ + new_contents = [] + for index, cell in enumerate(self.content): + try: + if isinstance(cell, list): + new_contents.append([bool(value) for value in cell]) + else: + new_contents.append(bool(cell)) + except Exception as e: + print(f"Exception happens when process value in index {index}.") + print(e) + + return self._after_process(new_contents, inplace=inplace) + + def lower(self, inplace=True): + """ + 将本field中的值调用cell.lower(). 
支持field中内容为以下两种情况(1)['1', '2', ...](即field中每个值为str的), + (2) [['1', '2', ..], ['3', ..], ...](即field中每个值为一个list,list中的值会被依次转换。) + + :param inplace: 如果为True,则将新生成值替换本field。否则返回list。 + :return: List[int], List[List[int]], self + """ + new_contents = [] + for index, cell in enumerate(self.content): + try: + if isinstance(cell, list): + new_contents.append([value.lower() for value in cell]) + else: + new_contents.append(cell.lower()) + except Exception as e: + print(f"Exception happens when process value in index {index}.") + print(e) + return self._after_process(new_contents, inplace=inplace) + + def upper(self, inplace=True): + """ + 将本field中的值调用cell.lower(). 支持field中内容为以下两种情况(1)['1', '2', ...](即field中每个值为str的), + (2) [['1', '2', ..], ['3', ..], ...](即field中每个值为一个list,list中的值会被依次转换。) + + :param inplace: 如果为True,则将新生成值替换本field。否则返回list。 + :return: List[int], List[List[int]], self + """ + new_contents = [] + for index, cell in enumerate(self.content): + try: + if isinstance(cell, list): + new_contents.append([value.upper() for value in cell]) + else: + new_contents.append(cell.upper()) + except Exception as e: + print(f"Exception happens when process value in index {index}.") + print(e) + return self._after_process(new_contents, inplace=inplace) + + def value_count(self): + """ + 返回该field下不同value的数量。多用于统计label数量 + + :return: Counter, key是label,value是出现次数 + """ + count = Counter() + for cell in self.content: + count[cell] += 1 + return count + + def _after_process(self, new_contents, inplace): + """ + 当调用处理函数之后,决定是否要替换field。 + + :param new_contents: + :param inplace: + :return: self或者生成的content + """ + if inplace: + self.content = new_contents + try: + self.is_input = self.is_input + self.is_target = self.is_input + except SetInputOrTargetException as e: + print("The newly generated field cannot be set as input or target.") + raise e + return self + else: + return new_contents + def _get_ele_type_and_dim(cell:Any, dim=0): """ @@ -242,6 +392,8 @@ def 
_get_ele_type_and_dim(cell:Any, dim=0): dims = set([j for i,j in res]) if len(types)>1: raise SetInputOrTargetException("Mixed types detected: {}.".format(list(types))) + elif len(types)==0: + raise SetInputOrTargetException("Empty value encountered.") if len(dims)>1: raise SetInputOrTargetException("Mixed dimension detected: {}.".format(list(dims))) return types.pop(), dims.pop() @@ -257,6 +409,8 @@ def _get_ele_type_and_dim(cell:Any, dim=0): dims = set([j for i,j in res]) if len(types)>1: raise SetInputOrTargetException("Mixed types detected: {}.".format(list(types))) + elif len(types)==0: + raise SetInputOrTargetException("Empty value encountered.") if len(dims)>1: raise SetInputOrTargetException("Mixed dimension detected: {}.".format(list(dims))) return types.pop(), dims.pop() diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index 9dab47b5..1eb2b70e 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -285,6 +285,7 @@ def _get_model_device(model): :param model: nn.Module :return: torch.device,None 如果返回值为None,说明这个模型没有任何参数。 """ + # TODO 这个函数存在一定的风险,因为同一个模型可能存在某些parameter不在显卡中,比如BertEmbedding assert isinstance(model, nn.Module) parameters = list(model.parameters()) @@ -295,6 +296,13 @@ def _get_model_device(model): def _build_args(func, **kwargs): + """ + 根据func的初始化参数,从kwargs中选择func需要的参数 + + :param func: callable + :param kwargs: 参数 + :return:dict. func中用到的参数 + """ spect = inspect.getfullargspec(func) if spect.varkw is not None: return kwargs diff --git a/fastNLP/core/vocabulary.py b/fastNLP/core/vocabulary.py index bca28e10..1d5d6f32 100644 --- a/fastNLP/core/vocabulary.py +++ b/fastNLP/core/vocabulary.py @@ -148,7 +148,7 @@ class Vocabulary(object): self.word2idx.update({w: i + start_idx for i, (w, _) in enumerate(words)}) self.build_reverse_vocab() self.rebuild = False - + def build_reverse_vocab(self): """ 基于 "word to index" dict, 构建 "index to word" dict. 
@@ -359,5 +359,7 @@ class Vocabulary(object): def __repr__(self): return "Vocabulary({}...)".format(list(self.word_count.keys())[:5]) + @_check_build_vocab def __iter__(self): - return iter(list(self.word_count.keys())) + for word, index in self.word2idx.items(): + yield word, index diff --git a/fastNLP/io/embed_loader.py b/fastNLP/io/embed_loader.py index 4119d93f..34f66195 100644 --- a/fastNLP/io/embed_loader.py +++ b/fastNLP/io/embed_loader.py @@ -35,9 +35,9 @@ class EmbedLoader(BaseLoader): def __init__(self): super(EmbedLoader, self).__init__() - + @staticmethod - def load_with_vocab(embed_filepath, vocab, dtype=np.float32, normalize=True, error='ignore'): + def load_with_vocab(embed_filepath, vocab, dtype=np.float32, padding='', unknown='', normalize=True, error='ignore'): """ 从embed_filepath这个预训练的词向量中抽取出vocab这个词表的词的embedding。EmbedLoader将自动判断embed_filepath是 word2vec(第一行只有两个元素)还是glove格式的数据。 @@ -46,6 +46,8 @@ class EmbedLoader(BaseLoader): :param vocab: 词表 :class:`~fastNLP.Vocabulary` 类型,读取出现在vocab中的词的embedding。 没有出现在vocab中的词的embedding将通过找到的词的embedding的正态分布采样出来,以使得整个Embedding是同分布的。 :param dtype: 读出的embedding的类型 + :param str padding: 词表中padding的token + :param str unknown: 词表中unknown的token :param bool normalize: 是否将每个vector归一化到norm为1 :param str error: `ignore` , `strict` ; 如果 `ignore` ,错误将自动跳过; 如果 `strict` , 错误将抛出。 这里主要可能出错的地方在于词表有空行或者词表出现了维度不一致。 @@ -69,8 +71,14 @@ class EmbedLoader(BaseLoader): for idx, line in enumerate(f, start_idx): try: parts = line.strip().split() - if parts[0] in vocab: - index = vocab.to_index(parts[0]) + word = parts[0] + # 对齐unk与pad + if word==padding and vocab.padding is not None: + word = vocab.padding + elif word==unknown and vocab.unknown is not None: + word = vocab.unknown + if word in vocab: + index = vocab.to_index(word) matrix[index] = np.fromstring(' '.join(parts[1:]), sep=' ', dtype=dtype, count=dim) hit_flags[index] = True except Exception as e: @@ -102,8 +110,8 @@ class EmbedLoader(BaseLoader): :param str embed_filepath: 
预训练的embedding的路径。 :param dtype: 读出的embedding的类型 - :param str padding: the padding tag for vocabulary. - :param str unknown: the unknown tag for vocabulary. + :param str padding: 词表中的padding的token. 并以此用做vocab的padding。 + :param str unknown: 词表中的unknown的token. 并以此用做vocab的unknown。 :param bool normalize: 是否将每个vector归一化到norm为1 :param str error: `ignore` , `strict` ; 如果 `ignore` ,错误将自动跳过; 如果 `strict` , 错误将抛出。这里主要可能出错的地 方在于词表有空行或者词表出现了维度不一致。 @@ -134,7 +142,7 @@ class EmbedLoader(BaseLoader): vocab.add_word(word) if unknown is not None and unknown == word: found_unknown = True - if found_pad is not None and padding == word: + if padding is not None and padding == word: found_pad = True except Exception as e: if error == 'ignore': diff --git a/fastNLP/io/file_utils.py b/fastNLP/io/file_utils.py new file mode 100644 index 00000000..11c7ab64 --- /dev/null +++ b/fastNLP/io/file_utils.py @@ -0,0 +1,255 @@ + +import os +from pathlib import Path +from urllib.parse import urlparse +import re +import requests +import tempfile +from tqdm import tqdm +import shutil +import hashlib + + +def cached_path(url_or_filename: str, cache_dir: Path=None) -> Path: + """ + 给定一个url或者文件名(可以是具体的文件名,也可以是文件),先在cache_dir下寻找该文件是否存在,如果不存在则去下载, 并 + 将文件放入到 + """ + if cache_dir is None: + dataset_cache = Path(get_defalt_path()) + else: + dataset_cache = cache_dir + + parsed = urlparse(url_or_filename) + + if parsed.scheme in ("http", "https"): + # URL, so get it from the cache (downloading if necessary) + return get_from_cache(url_or_filename, dataset_cache) + elif parsed.scheme == "" and Path(os.path.join(dataset_cache, url_or_filename)).exists(): + # File, and it exists. + return Path(url_or_filename) + elif parsed.scheme == "": + # File, but it doesn't exist. 
+ raise FileNotFoundError("file {} not found".format(url_or_filename)) + else: + # Something unknown + raise ValueError( + "unable to parse {} as a URL or as a local path".format(url_or_filename) + ) + +def get_filepath(filepath): + """ + 如果filepath中只有一个文件,则直接返回对应的全路径 + :param filepath: + :return: + """ + if os.path.isdir(filepath): + files = os.listdir(filepath) + if len(files)==1: + return os.path.join(filepath, files[0]) + else: + return filepath + return filepath + +def get_defalt_path(): + """ + 获取默认的fastNLP存放路径, 如果将FASTNLP_CACHE_DIR设置在了环境变量中,将使用环境变量的值,使得不用每个用户都去下载。 + + :return: + """ + if 'FASTNLP_CACHE_DIR' in os.environ: + fastnlp_cache_dir = os.environ.get('FASTNLP_CACHE_DIR') + if os.path.exists(fastnlp_cache_dir): + return fastnlp_cache_dir + raise RuntimeError("Some errors happens on cache directory.") + else: + raise RuntimeError("There function is not available right now.") + fastnlp_cache_dir = os.path.expanduser(os.path.join("~", ".fastNLP")) + return fastnlp_cache_dir + +def _get_base_url(name): + # 返回的URL结尾必须是/ + if 'FASTNLP_BASE_URL' in os.environ: + fastnlp_base_url = os.environ['FASTNLP_BASE_URL'] + return fastnlp_base_url + raise RuntimeError("There function is not available right now.") + +def split_filename_suffix(filepath): + """ + 给定filepath返回对应的name和suffix + :param filepath: + :return: filename, suffix + """ + filename = os.path.basename(filepath) + if filename.endswith('.tar.gz'): + return filename[:-7], '.tar.gz' + return os.path.splitext(filename) + +def get_from_cache(url: str, cache_dir: Path = None) -> Path: + """ + 尝试在cache_dir中寻找url定义的资源; 如果没有找到,则从url下载并将结果放在cache_dir下,缓存的名称由url的结果推断而来。 + 如果从url中下载的资源解压后有多个文件,则返回directory的路径; 如果只有一个资源,则返回具体的路径 + + """ + cache_dir.mkdir(parents=True, exist_ok=True) + + filename = re.sub(r".+/", "", url) + dir_name, suffix = split_filename_suffix(filename) + sep_index = dir_name[::-1].index('-') + if sep_index<0: + check_sum = None + else: + check_sum = dir_name[-sep_index+1:] + sep_index =
len(dir_name) if sep_index==-1 else -sep_index-1 + dir_name = dir_name[:sep_index] + + # 寻找与它名字匹配的内容, 而不关心后缀 + match_dir_name = match_file(dir_name, cache_dir) + if match_dir_name: + dir_name = match_dir_name + cache_path = cache_dir / dir_name + + # get cache path to put the file + if cache_path.exists(): + return get_filepath(cache_path) + + # make HEAD request to check ETag TODO ETag可以用来判断资源是否已经更新了,之后需要加上 + response = requests.head(url, headers={"User-Agent": "fastNLP"}) + if response.status_code != 200: + raise IOError( + f"HEAD request failed for url {url} with status code {response.status_code}." + ) + + # add ETag to filename if it exists + # etag = response.headers.get("ETag") + + if not cache_path.exists(): + # Download to temporary file, then copy to cache dir once finished. + # Otherwise you get corrupt cache entries if the download gets interrupted. + fd, temp_filename = tempfile.mkstemp() + print("%s not found in cache, downloading to %s"%(url, temp_filename)) + + # GET file object + req = requests.get(url, stream=True, headers={"User-Agent": "fastNLP"}) + content_length = req.headers.get("Content-Length") + total = int(content_length) if content_length is not None else None + progress = tqdm(unit="B", total=total) + sha256 = hashlib.sha256() + with open(temp_filename, "wb") as temp_file: + for chunk in req.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + progress.update(len(chunk)) + temp_file.write(chunk) + sha256.update(chunk) + # check sum + digit = sha256.hexdigest()[:8] + if not check_sum: + assert digit == check_sum, "File corrupted when download." 
+ progress.close() + print(f"Finish download from {url}.") + + # 开始解压 + delete_temp_dir = None + if suffix in ('.zip', '.tar.gz'): + uncompress_temp_dir = tempfile.mkdtemp() + delete_temp_dir = uncompress_temp_dir + print(f"Start to uncompress file to {uncompress_temp_dir}.") + if suffix == '.zip': + unzip_file(Path(temp_filename), Path(uncompress_temp_dir)) + else: + untar_gz_file(Path(temp_filename), Path(uncompress_temp_dir)) + filenames = os.listdir(uncompress_temp_dir) + if len(filenames)==1: + if os.path.isdir(os.path.join(uncompress_temp_dir, filenames[0])): + uncompress_temp_dir = os.path.join(uncompress_temp_dir, filenames[0]) + + cache_path.mkdir(parents=True, exist_ok=True) + print("Finish un-compressing file.") + else: + uncompress_temp_dir = temp_filename + cache_path = str(cache_path) + suffix + success = False + try: + # 复制到指定的位置 + print(f"Copy file to {cache_path}.") + if os.path.isdir(uncompress_temp_dir): + for filename in os.listdir(uncompress_temp_dir): + shutil.copyfile(os.path.join(uncompress_temp_dir, filename), cache_path/filename) + else: + shutil.copyfile(uncompress_temp_dir, cache_path) + success = True + except Exception as e: + print(e) + raise e + finally: + if not success: + if cache_path.exists(): + if cache_path.is_file(): + os.remove(cache_path) + else: + shutil.rmtree(cache_path) + if delete_temp_dir: + shutil.rmtree(delete_temp_dir) + os.close(fd) + os.remove(temp_filename) + + return get_filepath(cache_path) + +def unzip_file(file: Path, to: Path): + # unpack and write out in CoNLL column-like format + from zipfile import ZipFile + + with ZipFile(file, "r") as zipObj: + # Extract all the contents of zip file in current directory + zipObj.extractall(to) + +def untar_gz_file(file:Path, to:Path): + import tarfile + + with tarfile.open(file, 'r:gz') as tar: + tar.extractall(to) + +def match_file(dir_name:str, cache_dir:str)->str: + """ + 匹配的原则是,在cache_dir下的文件: (1) 与dir_name完全一致; (2) 除了后缀以外和dir_name完全一致。 + 如果找到了两个匹配的结果将报错. 
如果找到了则返回匹配的文件的名称; 没有找到返回空字符串 + + :param dir_name: 需要匹配的名称 + :param cache_dir: 在该目录下找匹配dir_name是否存在 + :return: str + """ + files = os.listdir(cache_dir) + matched_filenames = [] + for file_name in files: + if re.match(dir_name+'$', file_name) or re.match(dir_name+'\\..*', file_name): + matched_filenames.append(file_name) + if len(matched_filenames)==0: + return '' + elif len(matched_filenames)==1: + return matched_filenames[-1] + else: + raise RuntimeError(f"Duplicate matched files:{matched_filenames}, this should be caused by a bug.") + +if __name__ == '__main__': + cache_dir = Path('caches') + cache_dir = None + # 需要对cache_dir进行测试 + base_url = 'http://0.0.0.0:8888/file/download' + # if True: + # for filename in os.listdir(cache_dir): + # if os.path.isdir(os.path.join(cache_dir, filename)): + # shutil.rmtree(os.path.join(cache_dir, filename)) + # else: + # os.remove(os.path.join(cache_dir, filename)) + # 1. 测试.txt文件 + print(cached_path(base_url + '/{}'.format('txt_test-bcb4fe65.txt'), cache_dir)) + # 2. 测试.zip文件(只有一个文件) + print(cached_path(base_url + '/{}'.format('zip_test-40966d39.zip'), cache_dir)) + # 3. 测试.zip文件(有多个文件) + print(cached_path(base_url + '/{}'.format('zip_pack_test-70c0b20d.zip'), cache_dir)) + # 4. 测试.tar.gz文件 + print(cached_path(base_url + '/{}'.format('tar_gz_test-3e2679cf.tar.gz'), cache_dir)) + # 5. 测试.tar.gz多个文件 + print(cached_path(base_url + '/{}'.format('tar_gz_pack_test-08dfdccd.tar.gz'), cache_dir)) + + # 6. 
测试.pkl文件 diff --git a/fastNLP/models/cnn_text_classification.py b/fastNLP/models/cnn_text_classification.py index 3a71a80a..081dd510 100644 --- a/fastNLP/models/cnn_text_classification.py +++ b/fastNLP/models/cnn_text_classification.py @@ -7,6 +7,7 @@ import torch.nn as nn from ..core.const import Const as C from ..modules import encoder +from fastNLP import seq_len_to_mask class CNNText(torch.nn.Module): @@ -21,15 +22,13 @@ class CNNText(torch.nn.Module): :param int num_classes: 一共有多少类 :param int,tuple(int) out_channels: 输出channel的数量。如果为list,则需要与kernel_sizes的数量保持一致 :param int,tuple(int) kernel_sizes: 输出channel的kernel大小。 - :param int padding: 对句子前后的pad的大小, 用0填充。 :param float dropout: Dropout的大小 """ def __init__(self, init_embed, num_classes, - kernel_nums=(3, 4, 5), - kernel_sizes=(3, 4, 5), - padding=0, + kernel_nums=(30, 40, 50), + kernel_sizes=(1, 3, 5), dropout=0.5): super(CNNText, self).__init__() @@ -38,8 +37,7 @@ class CNNText(torch.nn.Module): self.conv_pool = encoder.ConvMaxpool( in_channels=self.embed.embedding_dim, out_channels=kernel_nums, - kernel_sizes=kernel_sizes, - padding=padding) + kernel_sizes=kernel_sizes) self.dropout = nn.Dropout(dropout) self.fc = nn.Linear(sum(kernel_nums), num_classes) @@ -51,7 +49,11 @@ class CNNText(torch.nn.Module): :return output: dict of torch.LongTensor, [batch_size, num_classes] """ x = self.embed(words) # [N,L] -> [N,L,C] - x = self.conv_pool(x) # [N,L,C] -> [N,C] + if seq_len is not None: + mask = seq_len_to_mask(seq_len) + x = self.conv_pool(x, mask) + else: + x = self.conv_pool(x) # [N,L,C] -> [N,C] x = self.dropout(x) x = self.fc(x) # [N,C] -> [N, N_class] return {C.OUTPUT: x} diff --git a/fastNLP/modules/encoder/_bert.py b/fastNLP/modules/encoder/_bert.py new file mode 100644 index 00000000..fc62ea9c --- /dev/null +++ b/fastNLP/modules/encoder/_bert.py @@ -0,0 +1,625 @@ + + + +""" +这个页面的代码很大程度上参考了https://github.com/huggingface/pytorch-pretrained-BERT的代码 +""" + + +import torch +from torch import nn + +from ... 
import Vocabulary +import collections + +import os +import unicodedata +from ...io.file_utils import _get_base_url, cached_path +from .bert import BertModel +import numpy as np +from itertools import chain + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer`. + + Returns: + A list of wordpiece tokens. 
+ """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + index = 0 + with open(vocab_file, "r", encoding="utf-8") as reader: + while True: + token = reader.readline() + if not token: + break + token = token.strip() + vocab[token] = index + index += 1 + return vocab + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, + do_lower_case=True, + never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): + """Constructs a BasicTokenizer. + + Args: + do_lower_case: Whether to lower case the input. + """ + self.do_lower_case = do_lower_case + self.never_split = never_split + + def tokenize(self, text): + """Tokenizes a piece of text.""" + text = self._clean_text(text) + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). 
+ text = self._tokenize_chinese_chars(text) + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case and token not in self.never_split: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + if text in self.never_split: + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. 
Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like all of the other languages. + if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + +def _is_whitespace(char): + """Checks whether `char` is a whitespace character.""" + # \t, \n, and \r are technically control characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `char` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `char` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency.
+ if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False + + +class BertTokenizer(object): + """Runs end-to-end tokenization: punctuation splitting + wordpiece""" + + def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokenize=True, + never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): + """Constructs a BertTokenizer. + + Args: + vocab_file: Path to a one-wordpiece-per-line vocabulary file + do_lower_case: Whether to lower case the input + Only has an effect when do_wordpiece_only=False + do_basic_tokenize: Whether to do basic tokenization before wordpiece. + max_len: An artificial maximum length to truncate tokenized sequences to; + Effective maximum length is always the minimum of this + value (if specified) and the underlying BERT model's + sequence length. + never_split: List of tokens which will never be split during tokenization. + Only has an effect when do_wordpiece_only=False + """ + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. 
To load the vocabulary from a Google pretrained " + "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict( + [(ids, tok) for tok, ids in self.vocab.items()]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, + never_split=never_split) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + self.max_len = max_len if max_len is not None else int(1e12) + + def tokenize(self, text): + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def convert_tokens_to_ids(self, tokens): + """Converts a sequence of tokens into ids using the vocab.""" + ids = [] + for token in tokens: + ids.append(self.vocab[token]) + if len(ids) > self.max_len: + print( + "Token indices sequence length is longer than the specified maximum " + " sequence length for this BERT model ({} > {}). Running this" + " sequence through BERT will result in indexing errors".format(len(ids), self.max_len) + ) + return ids + + def convert_ids_to_tokens(self, ids): + """Converts a sequence of ids in wordpiece tokens using the vocab.""" + tokens = [] + for i in ids: + tokens.append(self.ids_to_tokens[i]) + return tokens + + def save_vocabulary(self, vocab_path): + """Save the tokenizer vocabulary to a directory or file.""" + index = 0 + if os.path.isdir(vocab_path): + vocab_file = os.path.join(vocab_path, VOCAB_NAME) + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + print("Saving vocabulary to {}: vocabulary indices are not consecutive." 
+ " Please check that the vocabulary is not corrupted!".format(vocab_file)) + index = token_index + writer.write(token + u'\n') + index += 1 + return vocab_file + + @classmethod + def from_pretrained(cls, model_dir, *inputs, **kwargs): + """ + 给定path,直接读取vocab. + + """ + pretrained_model_name_or_path = os.path.join(model_dir, VOCAB_NAME) + print("loading vocabulary file {}".format(pretrained_model_name_or_path)) + max_len = 512 + kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) + # Instantiate tokenizer. + tokenizer = cls(pretrained_model_name_or_path, *inputs, **kwargs) + return tokenizer + +VOCAB_NAME = 'vocab.txt' + +class _WordBertModel(nn.Module): + def __init__(self, model_dir:str, vocab:Vocabulary, layers:str='-1', pool_method:str='first', include_cls_sep:bool=False): + super().__init__() + + self.tokenzier = BertTokenizer.from_pretrained(model_dir) + self.encoder = BertModel.from_pretrained(model_dir) + # 检查encoder_layer_number是否合理 + encoder_layer_number = len(self.encoder.encoder.layer) + self.layers = list(map(int, layers.split(','))) + for layer in self.layers: + if layer<0: + assert -layer<=encoder_layer_number, f"The layer index:{layer} is out of scope for " \ + f"a bert model with {encoder_layer_number} layers." + else: + assert layer