
1. Trainer supports training with DistributedDataParallel; this has not been widely tested yet, so use with caution. 2. Fix the missing `import os` bug. 3. FitlogCallback no longer requires a DataSet to be passed in. 4. NullOptimizer's construct_from_pytorch returns self. 5. Fix the pooled_cls bug in Bert.
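As orientation for item 1, here is a minimal sketch of how the new DistributedDataParallel path might be driven, based only on the constraints this commit enforces (device is forced to None, the sampler must be None or a torch DistributedSampler, and save_path is rejected); the launch wiring and helper names here are assumptions, not part of the commit:

    # hypothetical launcher: python -m torch.distributed.launch --nproc_per_node=2 train_ddp.py
    import torch
    import torch.distributed as dist
    from torch.nn.parallel import DistributedDataParallel as DDP
    from fastNLP import Trainer

    def train_ddp(model, train_data, loss, metrics, local_rank):
        dist.init_process_group(backend='nccl')            # one process per GPU
        torch.cuda.set_device(local_rank)
        model = DDP(model.cuda(), device_ids=[local_rank])
        trainer = Trainer(train_data=train_data, model=model, loss=loss, metrics=metrics,
                          device=None,     # device is ignored (and must end up None) for DDP models
                          sampler=None,    # a DistributedSampler is created automatically when None
                          save_path=None)  # saving under DDP raises RuntimeError in this commit
        trainer.train()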

tags/v0.4.10
yh, 5 years ago
commit 22a8702d22
16 changed files with 139 additions and 128 deletions
 1. fastNLP/core/batch.py                  +12  -4
 2. fastNLP/core/callback.py                +1  -1
 3. fastNLP/core/dataset.py                 +9  -3
 4. fastNLP/core/losses.py                  +1  -1
 5. fastNLP/core/optimizer.py               +1  -1
 6. fastNLP/core/sampler.py                 +3  -3
 7. fastNLP/core/tester.py                  +8  -6
 8. fastNLP/core/trainer.py                +66 -38
 9. fastNLP/core/utils.py                  +12 -58
10. fastNLP/embeddings/bert_embedding.py    +3  -3
11. fastNLP/embeddings/elmo_embedding.py    +1  -1
12. fastNLP/embeddings/static_embedding.py  +2  -2
13. fastNLP/modules/encoder/bert.py         +2  -0
14. fastNLP/modules/utils.py                +2  -2
15. test/core/test_dataset.py              +11  -0
16. test/core/test_field.py                 +5  -5

fastNLP/core/batch.py (+12 -4)

@@ -93,9 +93,13 @@ class DataSetGetter:

class SamplerAdapter(torch.utils.data.Sampler):
def __init__(self, sampler, dataset):
super().__init__(dataset)
self.sampler = sampler
self.dataset = dataset

def __len__(self):
return len(self.dataset)

def __iter__(self):
return iter(self.sampler(self.dataset))

@@ -165,15 +169,19 @@ class DataSetIter(BatchIter):
timeout=0, worker_init_fn=None):
super().__init__()
assert isinstance(dataset, DataSet)
sampler = SamplerAdapter(sampler=sampler or SequentialSampler(), dataset=dataset)
if not isinstance(sampler, torch.utils.data.Sampler):
self.sampler = SamplerAdapter(sampler=sampler or SequentialSampler(), dataset=dataset)
else:
self.sampler = sampler
dataset = DataSetGetter(dataset, as_numpy)
collate_fn = dataset.collate_fn if hasattr(dataset, 'collate_fn') else None
self.dataiter = torch.utils.data.DataLoader(
dataset=dataset, batch_size=batch_size, sampler=sampler,
dataset=dataset, batch_size=batch_size, sampler=self.sampler,
collate_fn=collate_fn, num_workers=num_workers,
pin_memory=pin_memory, drop_last=drop_last,
timeout=timeout, worker_init_fn=worker_init_fn)
self.num_batches = self.get_num_batches(len(dataset), batch_size, drop_last)
# Count batches from the sampler's length: with a DistributedSampler, each process only sees part of the data
self.num_batches = self.get_num_batches(len(self.dataiter.sampler), batch_size, drop_last)
self.batch_size = batch_size


@@ -182,7 +190,7 @@ class TorchLoaderIter(BatchIter):
super().__init__()
assert isinstance(dataset, torch.utils.data.DataLoader)
self.dataiter = dataset
self.num_batches = self.get_num_batches(len(dataset), dataset.batch_size, dataset.drop_last)
self.num_batches = self.get_num_batches(len(dataset.sampler), dataset.batch_size, dataset.drop_last)
self.batch_size = dataset.batch_size
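Both hunks above switch the batch count from len(dataset) to the sampler's length. A small sketch of why that matters: with a DistributedSampler over N processes, each process only draws about len(dataset)/N indices (the exact semantics of get_num_batches are assumed here):

    import math

    def get_num_batches(num_samples, batch_size, drop_last):
        # assumed to mirror BatchIter.get_num_batches
        return num_samples // batch_size if drop_last else math.ceil(num_samples / batch_size)

    # 1000 examples split across 4 DDP processes: each sampler yields 250 indices
    assert get_num_batches(1000 // 4, batch_size=32, drop_last=False) == 8
    # counting from len(dataset) instead would wrongly report 32 batches per process
    assert get_num_batches(1000, batch_size=32, drop_last=False) == 32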




fastNLP/core/callback.py (+1 -1)

@@ -479,7 +479,7 @@ class FitlogCallback(Callback):
self.datasets[key] = value
elif isinstance(data, DataSet):
self.datasets['test'] = data
else:
elif data is not None:
raise TypeError("data receives dict[DataSet] or DataSet object.")
self.verbose = verbose
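With the `else` relaxed to `elif data is not None`, constructing the callback without any evaluation data becomes legal. A tiny illustration (assuming the callback's other arguments keep their defaults):

    from fastNLP import FitlogCallback

    cb = FitlogCallback()   # data=None: no DataSet required anymore
    # previously this raised TypeError("data receives dict[DataSet] or DataSet object.")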


fastNLP/core/dataset.py (+9 -3)

@@ -487,7 +487,7 @@ class DataSet(object):
"""
Delete the instance at position index

:param int index: index of the instance to delete, starting from 0
:param int index: index of the instance to delete; the numbering starts from 0
"""
assert isinstance(index, int), "Only integer supported."
if len(self) <= index:
@@ -566,7 +566,7 @@ class DataSet(object):
raise KeyError("DataSet has no field named {}.".format(old_name))
return self
def set_target(self, *field_names, flag=True):
def set_target(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True):
"""
Set the fields named in field_names as target

@@ -577,11 +577,14 @@ class DataSet(object):

:param str field_names: name of the field(s)
:param bool flag: set the target status of field_name to flag
:param bool use_1st_ins_infer_dim_type: if True, the column is not checked for every row having the same dimension
and type; the type and dimension of the column are inferred directly from the first row.
"""
assert isinstance(flag, bool), "Only bool type supported."
for name in field_names:
if name in self.field_arrays:
try:
self.field_arrays[name]._use_1st_ins_infer_dim_type = bool(use_1st_ins_infer_dim_type)
self.field_arrays[name].is_target = flag
except SetInputOrTargetException as e:
print(f"Cannot set field:{name} as target.")
@@ -589,7 +592,7 @@ class DataSet(object):
else:
raise KeyError("{} is not a valid field name.".format(name))
def set_input(self, *field_names, flag=True):
def set_input(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True):
"""
Set the fields named in field_names as input::

@@ -598,10 +601,13 @@ class DataSet(object):

:param str field_names: name of the field(s)
:param bool flag: set the input status of field_name to flag
:param bool use_1st_ins_infer_dim_type: if True, the column is not checked for every row having the same dimension
and type; the type and dimension of the column are inferred directly from the first row.
"""
for name in field_names:
if name in self.field_arrays:
try:
self.field_arrays[name]._use_1st_ins_infer_dim_type = bool(use_1st_ins_infer_dim_type)
self.field_arrays[name].is_input = flag
except SetInputOrTargetException as e:
print(f"Cannot set field:{name} as input, exception happens at the {e.index} value.")


fastNLP/core/losses.py (+1 -1)

@@ -225,7 +225,7 @@ class CrossEntropyLoss(LossBase):
def get_loss(self, pred, target, seq_len=None):
if pred.dim() > 2:
if pred.size(1) != target.size(1):
if pred.size(1) != target.size(1): # the dimension order may have been swapped
pred = pred.transpose(1, 2)
pred = pred.reshape(-1, pred.size(-1))
target = target.reshape(-1)
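The new comment marks the layout guard: sequence models may emit predictions as (batch, n_classes, seq_len) rather than (batch, seq_len, n_classes), and the transpose normalizes the former before flattening. A small shape check (values are arbitrary):

    import torch

    batch, seq_len, n_classes = 2, 7, 5
    target = torch.zeros(batch, seq_len, dtype=torch.long)

    pred = torch.randn(batch, n_classes, seq_len)   # dims 1 and 2 swapped
    if pred.size(1) != target.size(1):
        pred = pred.transpose(1, 2)
    assert pred.shape == (batch, seq_len, n_classes)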


fastNLP/core/optimizer.py (+1 -1)

@@ -49,7 +49,7 @@ class NullOptimizer(Optimizer):
super().__init__(None)

def construct_from_pytorch(self, model_params):
pass
return self

def __getattr__(self, item):
def pass_func(*args, **kwargs):


fastNLP/core/sampler.py (+3 -3)

@@ -25,9 +25,9 @@ class Sampler(object):
def __call__(self, data_set):
"""
:param DataSet data_set: the `DataSet` object holding the data to sample
:return result: list(int), a sequence of indices; elements of ``data_set`` are fetched in the order given by ``result``
"""
:param DataSet data_set: the `DataSet` object holding the data to sample
:return result: list(int), a sequence of indices; elements of ``data_set`` are fetched in the order given by ``result``
"""
raise NotImplementedError
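The (re-indented) docstring above states the sampler protocol: __call__ receives the DataSet and returns a list of indices, and elements are then fetched in that order. A minimal custom sampler under that protocol (illustrative only):

    from fastNLP.core.sampler import Sampler

    class ReversedSampler(Sampler):
        """Yield indices from the last instance back to the first."""
        def __call__(self, data_set):
            return list(range(len(data_set) - 1, -1, -1))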




fastNLP/core/tester.py (+8 -6)

@@ -47,6 +47,7 @@ from .utils import _get_func_signature
from .utils import _get_model_device
from .utils import _move_model_to_device
from ._parallel_utils import _data_parallel_wrapper
from .utils import _model_contains_inner_module
from functools import partial

__all__ = [
@@ -83,9 +84,7 @@ class Tester(object):
def __init__(self, data, model, metrics, batch_size=16, num_workers=0, device=None, verbose=1):
super(Tester, self).__init__()
if not isinstance(data, DataSet):
raise TypeError(f"The type of data must be `fastNLP.DataSet`, got `{type(data)}`.")

if not isinstance(model, nn.Module):
raise TypeError(f"The type of model must be `torch.nn.Module`, got `{type(model)}`.")
@@ -106,19 +105,22 @@ class Tester(object):

# check predict
if (hasattr(self._model, 'predict') and callable(self._model.predict)) or \
(isinstance(self._model, nn.DataParallel) and hasattr(self._model.module, 'predict') and
callable(self._model.module.predict)):
(_model_contains_inner_module(self._model) and hasattr(self._model.module, 'predict') and
callable(self._model.module.predict)):
if isinstance(self._model, nn.DataParallel):
self._predict_func_wrapper = partial(_data_parallel_wrapper('predict',
self._model.device_ids,
self._model.output_device),
network=self._model.module)
self._predict_func = self._model.module.predict # used to match parameters
elif isinstance(self._model, nn.parallel.DistributedDataParallel):
self._predict_func = self._model.module.predict
self._predict_func_wrapper = self._model.module.predict # used for the actual call
else:
self._predict_func = self._model.predict
self._predict_func_wrapper = self._model.predict
else:
if isinstance(self._model, nn.DataParallel):
if _model_contains_inner_module(model):
self._predict_func_wrapper = self._model.forward
self._predict_func = self._model.module.forward
else:


fastNLP/core/trainer.py (+66 -38)

@@ -352,7 +352,7 @@ from .utils import _move_dict_value_to_device
from .utils import _get_func_signature
from .utils import _get_model_device
from .utils import _move_model_to_device
from .utils import _model_contains_inner_module

class Trainer(object):
"""
@@ -389,8 +389,8 @@ class Trainer(object):
you need to specify which metric is the criterion. Also, some metrics are better when smaller, e.g. the perplexity of
a language model; in that case, prefix the key with '-' to indicate that smaller is better during validation (e.g. "-ppl"). Only effective when dev_data is passed in.
:param int validate_every: validate on the dev set every this many steps; if -1, validate once at the end of each epoch. Only effective when dev_data is passed in.
:param str,None save_path: path where the model is saved. If None, the model is not saved. If dev_data is None, the model of the last iteration is saved.
Both the parameters and the model structure are saved. Even when DataParallel is used, only the model itself is saved here.
:param str,None save_path: path where the model is saved; if the path does not exist, the folder is created automatically. If None, the model is not saved. If dev_data is None,
the model of the last iteration is saved. Both the parameters and the model structure are saved. Even when DataParallel is used, only the model itself is saved here.
:param bool use_tqdm: whether to use tqdm to display training progress; if False, the loss is printed to the terminal instead.
:param str,int,torch.device,list(int) device: which device to load the model onto. Defaults to None, i.e. the Trainer
does not manage where the model's computation happens. The following inputs are supported:
@@ -440,7 +440,7 @@ class Trainer(object):
# check update every
assert update_every >= 1, "update_every must be no less than 1."
self.update_every = int(update_every)
# check save_path
if not (save_path is None or isinstance(save_path, str)):
raise ValueError("save_path can only be None or `str`.")
@@ -458,30 +458,69 @@ class Trainer(object):
self.metric_key = None
# prepare loss
losser = _prepare_losser(loss)
# sampler check
if sampler is not None and not isinstance(sampler, Sampler):
raise ValueError("The type of sampler should be fastNLP.BaseSampler, got {}.".format(type(sampler)))

if sampler is None:
sampler = RandomSampler()
elif hasattr(sampler, 'set_batch_size'):
sampler.set_batch_size(batch_size)
if isinstance(train_data, BatchIter):
if sampler is not None:
warnings.warn("sampler is ignored when train_data is a BatchIter.")
if num_workers>0:
warnings.warn("num_workers is ignored when train_data is BatchIter.")
if drop_last:
warnings.warn("drop_last is ignored when train_data is BatchIter.")

if isinstance(model, nn.parallel.DistributedDataParallel): # the distributed case
# device must be None
if device is not None:
warnings.warn("device is ignored when model is nn.parallel.DistributedDataParallel.")
device = None
# the sampler must be a distributed one
if sampler is None:
sampler = torch.utils.data.DistributedSampler(train_data)
elif not isinstance(sampler, torch.utils.data.DistributedSampler):
raise TypeError("When using nn.parallel.DistributedDataParallel, "
"sampler must be None or torch.utils.data.DistributedSampler.")
# saving the model is not supported
if save_path:
raise RuntimeError("Saving model in Distributed situation is not allowed right now.")
else:
# sampler check
if sampler is not None and not isinstance(sampler, (Sampler, torch.utils.data.Sampler)):
raise ValueError(f"The type of sampler should be fastNLP.BaseSampler or pytorch's Sampler, got {type(sampler)}")
if sampler is None:
sampler = RandomSampler()
elif hasattr(sampler, 'set_batch_size'):
sampler.set_batch_size(batch_size)

if isinstance(train_data, DataSet):
self.data_iterator = DataSetIter(
dataset=train_data, batch_size=batch_size, num_workers=num_workers, sampler=sampler, drop_last=drop_last)
elif isinstance(train_data, BatchIter):
self.data_iterator = train_data
train_data = train_data.dataset
else:
raise TypeError("train_data type {} not support".format(type(train_data)))

if check_code_level > -1 and isinstance(self.data_iterator, DataSetIter):
_check_code(dataset=train_data, model=model, losser=losser, metrics=metrics, dev_data=dev_data,
metric_key=self.metric_key, check_level=check_code_level,
batch_size=min(batch_size, DEFAULT_CHECK_BATCH_SIZE))
# _check_code is fastNLP's way of helping you verify your code. If you see this comment in an error traceback, check your code carefully
self.model = _move_model_to_device(model, device=device)
if _model_contains_inner_module(self.model):
self._forward_func = self.model.module.forward
else:
self._forward_func = self.model.forward
if check_code_level > -1:
# _check_code is fastNLP's way of helping you verify your code. If you see this comment in an error traceback,
# check carefully whether your field names match the model's input names
dev_dataset = dev_data
if isinstance(dev_data, BatchIter):
dev_dataset = None
warnings.warn("dev_data is of BatchIter type, ignore validation checking.")
check_batch_size = min(batch_size, DEFAULT_CHECK_BATCH_SIZE)
if isinstance(self.model, nn.DataParallel):
_num_devices = len(self.model.device_ids)
if batch_size//_num_devices>1: # if each card gets more than one sample, give every card two samples for the check
check_batch_size = max(len(self.model.device_ids)*2, check_batch_size)
else:
check_batch_size = max(len(self.model.device_ids), check_batch_size)
_check_code(dataset=train_data, model=self.model, losser=losser, forward_func=self._forward_func, metrics=metrics,
dev_data=dev_dataset, metric_key=self.metric_key, check_level=check_code_level,
batch_size=check_batch_size)

self.train_data = train_data
self.dev_data = dev_data # If None, No validation.
@@ -496,8 +535,7 @@ class Trainer(object):
self.best_dev_epoch = None
self.best_dev_step = None
self.best_dev_perf = None
self.n_steps = (len(self.train_data) // self.batch_size + int(
len(self.train_data) % self.batch_size != 0)) * int(drop_last==0) * self.n_epochs
self.n_steps = len(self.data_iterator) * self.n_epochs

if isinstance(optimizer, torch.optim.Optimizer):
self.optimizer = optimizer
@@ -600,10 +638,6 @@ class Trainer(object):
self.step = 0
self.epoch = 0
start = time.time()
if isinstance(self.model, nn.DataParallel):
self._forward_func = self.model.module.forward
else:
self._forward_func = self.model.forward
with inner_tqdm(total=self.n_steps, postfix='loss:{0:<6.5f}', leave=False, dynamic_ncols=True) as pbar:
self.pbar = pbar
avg_loss = 0
@@ -745,7 +779,7 @@ class Trainer(object):
model_path = os.path.join(self.save_path, model_name)
if not os.path.exists(self.save_path):
os.makedirs(self.save_path, exist_ok=True)
if isinstance(model, nn.DataParallel):
if _model_contains_inner_module(model):
model = model.module
if only_param:
state_dict = model.state_dict()
@@ -765,7 +799,7 @@ class Trainer(object):
states = torch.load(model_path)
else:
states = torch.load(model_path).state_dict()
if isinstance(model, nn.DataParallel):
if _model_contains_inner_module(model):
model.module.load_state_dict(states)
else:
model.load_state_dict(states)
@@ -823,12 +857,10 @@ def _get_value_info(_dict):

from numbers import Number
from .batch import _to_tensor
def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_SIZE,
dev_data=None, metric_key=None,
check_level=0):
def _check_code(dataset, model, losser, metrics, forward_func, batch_size=DEFAULT_CHECK_BATCH_SIZE,
dev_data=None, metric_key=None, check_level=0):
# check get_loss 方法
model_devcie = _get_model_device(model=model)
model_device = _get_model_device(model=model)
def _iter():
start_idx = 0
while start_idx<len(dataset):
@@ -849,7 +881,7 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_
start_idx += batch_size

for batch_count, (batch_x, batch_y) in enumerate(_iter()):
_move_dict_value_to_device(batch_x, batch_y, device=model_devcie)
_move_dict_value_to_device(batch_x, batch_y, device=model_device)
# forward check
if batch_count == 0:
info_str = ""
@@ -868,15 +900,11 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_
else:
info_str += 'There is no target field.'
print(info_str)
_check_forward_error(forward_func=model.forward, dataset=dataset,
_check_forward_error(forward_func=forward_func, dataset=dataset,
batch_x=batch_x, check_level=check_level)
if isinstance(model, nn.DataParallel):
forward_func = model.module.forward
else:
forward_func = model.forward
refined_batch_x = _build_args(forward_func, **batch_x)
pred_dict = model(**refined_batch_x)
func_signature = _get_func_signature(model.forward)
func_signature = _get_func_signature(forward_func)
if not isinstance(pred_dict, dict):
raise TypeError(f"The return value of {func_signature} should be `dict`, not `{type(pred_dict)}`.")
@@ -896,7 +924,7 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_
loss.backward()
except _CheckError as e:
# TODO: another error raised if _CheckError caught
pre_func_signature = _get_func_signature(model.forward)
pre_func_signature = _get_func_signature(forward_func)
_check_loss_evaluate(prev_func_signature=pre_func_signature, func_signature=e.func_signature,
check_res=e.check_res, pred_dict=pred_dict, target_dict=batch_y,
dataset=dataset, check_level=check_level)
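One easy-to-miss fix in this file is the n_steps formula. The old expression multiplied the batch count by int(drop_last==0), so any run with drop_last=True reported zero total steps; delegating to len(self.data_iterator), which counts batches, avoids that. The arithmetic, worked through:

    n_examples, batch_size, n_epochs, drop_last = 100, 32, 3, True
    old = (n_examples // batch_size + int(n_examples % batch_size != 0)) \
          * int(drop_last == 0) * n_epochs
    assert old == 0                               # progress-bar total collapsed to zero
    new = (n_examples // batch_size) * n_epochs   # len(data_iterator) == 3 with drop_last
    assert new == 9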


fastNLP/core/utils.py (+12 -58)

@@ -62,7 +62,6 @@ def _prepare_cache_filepath(filepath):
os.makedirs(cache_dir)


# TODO: save the parameters used when caching, and warn at load time if they do not match.
def cache_results(_cache_fp, _refresh=False, _verbose=1):
"""
Alias: :class:`fastNLP.cache_results` :class:`fastNLP.core.utils.cache_results`
@@ -188,49 +187,17 @@ def _save_model(model, model_name, save_dir, only_param=False):
torch.save(model, model_path)
model.to(_model_device)

def _model_contains_inner_module(model):
"""

# def save_pickle(obj, pickle_path, file_name):
# """Save an object into a pickle file.
#
# :param obj: an object
# :param pickle_path: str, the directory where the pickle file is to be saved
# :param file_name: str, the name of the pickle file. In general, it should be ended by "pkl".
# """
# if not os.path.exists(pickle_path):
# os.mkdir(pickle_path)
# print("make dir {} before saving pickle file".format(pickle_path))
# with open(os.path.join(pickle_path, file_name), "wb") as f:
# _pickle.dump(obj, f)
# print("{} saved in {}".format(file_name, pickle_path))
#
#
# def load_pickle(pickle_path, file_name):
# """Load an object from a given pickle file.
#
# :param pickle_path: str, the directory where the pickle file is.
# :param file_name: str, the name of the pickle file.
# :return obj: an object stored in the pickle
# """
# with open(os.path.join(pickle_path, file_name), "rb") as f:
# obj = _pickle.load(f)
# print("{} loaded from {}".format(file_name, pickle_path))
# return obj
#
#
# def pickle_exist(pickle_path, pickle_name):
# """Check if a given pickle file exists in the directory.
#
# :param pickle_path: the directory of target pickle file
# :param pickle_name: the filename of target pickle file
# :return: True if file exists else False
# """
# if not os.path.exists(pickle_path):
# os.makedirs(pickle_path)
# file_name = os.path.join(pickle_path, pickle_name)
# if os.path.exists(file_name):
# return True
# else:
# return False
:param nn.Module model: the model; checks whether it contains an inner model.module. Mostly used to check whether the model is nn.DataParallel or
nn.parallel.DistributedDataParallel, since parameter matching needs the innermost model's function.
:return: bool
"""
if isinstance(model, nn.Module):
if isinstance(model, (nn.DataParallel, nn.parallel.DistributedDataParallel)):
return True
return False

def _move_model_to_device(model, device):
"""
@@ -254,8 +221,8 @@ def _move_model_to_device(model, device):

:return: torch.nn.DataParallel or torch.nn.Module
"""
if isinstance(model, torch.nn.parallel.DistributedDataParallel):
raise RuntimeError("model of `torch.nn.parallel.DistributedDataParallel` is not supported right now.")
# if isinstance(model, torch.nn.parallel.DistributedDataParallel):
# raise RuntimeError("model of `torch.nn.parallel.DistributedDataParallel` is not supported right now.")
if device is None:
if isinstance(model, torch.nn.DataParallel):
@@ -352,7 +319,6 @@ def _map_args(maps: dict, **kwargs):
output.update({name: val})
for keys in maps.keys():
if keys not in output.keys():
# TODO: add UNUSED warning.
pass
return output

@@ -570,18 +536,6 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re
else:
_tmp = f'Provide `{_miss}` in DataSet or output of {prev_func_signature}.'
suggestions.append(_tmp)
# for _miss in unmapped_missing:
# if _miss in dataset:
# suggestions.append(f"Set `{_miss}` as target.")
# else:
# _tmp = ''
# if check_res.unused:
# _tmp = f"Specify your assignment for `{input_func_map.get(_miss, _miss)}` when initialize {module_name}."
# if _tmp:
# _tmp += f' Or provide `{_miss}` in DataSet or output of {prev_func_signature}.'
# else:
# _tmp = f'Provide `{_miss}` in output of {prev_func_signature} or DataSet.'
# suggestions.append(_tmp)
if check_res.duplicated:
errs.append(f"\tduplicated param: {check_res.duplicated}.")
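The new _model_contains_inner_module helper centralizes the wrapper check that used to be a bare isinstance(model, nn.DataParallel) scattered through Trainer and Tester, and extends it to DistributedDataParallel. A quick usage sketch:

    import torch.nn as nn
    from fastNLP.core.utils import _model_contains_inner_module

    net = nn.Linear(4, 2)
    assert not _model_contains_inner_module(net)
    assert _model_contains_inner_module(nn.DataParallel(net))
    # nn.parallel.DistributedDataParallel would also return True, but constructing
    # one requires an initialized process group, so it is omitted here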


fastNLP/embeddings/bert_embedding.py (+3 -3)

@@ -37,8 +37,8 @@ class BertEmbedding(ContextualEmbedding):
:param ~fastNLP.Vocabulary vocab: the vocabulary
:param str model_dir_or_name: directory containing the model, or the model's name. When a directory is passed in, it should contain a vocabulary file
(.txt suffix), a weights file (.bin suffix), and a config file (.json suffix).
:param str layers: which layers the output embedding comes from; results from different layers are concatenated along the last dimension in the order
given in layers. Separate layer numbers with ','; negative numbers index layers from the end.
:param str layers: which layers the output embedding comes from; results from different layers are concatenated along the last dimension in the order
given in layers. Separate layer numbers with ','; layer numbers start from 0, and negative numbers index layers from the end.
:param str pool_method: in BERT each word is represented by several word pieces; this sets how a word's representation is computed from its word
pieces. Supports ``last``, ``first``, ``avg``, ``max``.
:param float word_dropout: the probability of replacing a word with unk; this both trains the unk embedding and acts as a mild regularizer.
@@ -334,7 +334,7 @@ class _WordBertModel(nn.Module):
start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j+1]
outputs[l_index, i, j+s_shift] = torch.mean(truncate_output_layer[i, start:end], dim=-2)
if self.include_cls_sep:
if l==len(bert_outputs) and self.pooled_cls:
if l in (len(bert_outputs)-1, -1) and self.pooled_cls:
outputs[l_index, :, 0] = pooled_cls
else:
outputs[l_index, :, 0] = output_layer[:, 0]
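The pooled_cls fix from the commit message is the changed condition above: `l` ranges over the user-requested layer indices, so the old test `l == len(bert_outputs)` could never be true (valid positive indices stop at len-1), and pooled_cls was never used. The new membership test accepts both spellings of the last layer:

    bert_outputs_len = 12                       # e.g. a 12-layer BERT
    for l in (11, -1):                          # both mean "the last encoder layer"
        assert l != bert_outputs_len            # old check: always False
        assert l in (bert_outputs_len - 1, -1)  # new check: True for either spelling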


fastNLP/embeddings/elmo_embedding.py (+1 -1)

@@ -37,7 +37,7 @@ class ElmoEmbedding(ContextualEmbedding):
:param model_dir_or_name: a pretrained ELMo embedding can be loaded in two ways: either pass the folder containing ELMo, which should hold two files
(a config file with a .json suffix and a weights file with a .pkl suffix), or pass the name of an ELMo release, in which case the cache is checked
for the model and it is downloaded and cached automatically if missing.
:param layers: str, which layers to return, separated by ','. For the second layer's result use '2'; for the last two layers use '1,2'. Results from different layers
:param layers: str, which layers to return (numbered from 0), separated by ','. For the second layer's result use '2'; for the last two layers use '1,2'. Results from different layers
are concatenated in this order; defaults to '2'. 'mix' combines the representations of different layers with learnable weights (whether the weights are
trainable follows requires_grad; the initial weights mean-pool the three layers' results; ElmoEmbedding.set_mix_weights_requires_grad() can make only the mix weights learnable.)
:param requires_grad: bool, whether this layer needs gradients; defaults to False.


fastNLP/embeddings/static_embedding.py (+2 -2)

@@ -43,7 +43,7 @@ class StaticEmbedding(TokenEmbedding):
If the input is None, an embedding is randomly initialized with dimension embedding_dim.
:param int embedding_dim: dimension of the randomly initialized embedding; only effective when model_dir_or_name is None.
:param bool requires_grad: whether gradients are needed. Defaults to True
:param callable init_method: how to initialize values that were not found. Any method from torch.nn.init.* can be used; it is called with a tensor object.
:param callable init_method: how to initialize values that were not found. Any method from torch.nn.init.* can be used; it is called with a tensor object.
:param bool lower: whether to lowercase the words in vocab before matching them against the pretrained vocabulary. If your vocabulary contains
uppercase words, or you specifically want a separate vector for the uppercase words, set lower to False.
:param float word_dropout: the probability of replacing a word with unk; this both trains the unk embedding and acts as a mild regularizer.
@@ -84,7 +84,7 @@ class StaticEmbedding(TokenEmbedding):
if lowered_word not in lowered_vocab.word_count:
lowered_vocab.add_word(lowered_word)
lowered_vocab._no_create_word[lowered_word] += 1
print(f"All word in vocab have been lowered. There are {len(vocab)} words, {len(lowered_vocab)} unique lowered "
print(f"All word in the vocab have been lowered. There are {len(vocab)} words, {len(lowered_vocab)} unique lowered "
f"words.")
if model_path:
embedding = self._load_with_vocab(model_path, vocab=lowered_vocab, init_method=init_method)


fastNLP/modules/encoder/bert.py (+2 -0)

@@ -563,6 +563,8 @@ class WordpieceTokenizer(object):
output_tokens.append(self.unk_token)
else:
output_tokens.extend(sub_tokens)
if len(output_tokens)==0:
return [self.unk_token]
return output_tokens
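The added guard covers inputs that produce no word pieces at all (for example, an empty or whitespace-only string), falling back to the unknown token instead of returning an empty list. Illustrative behaviour, assuming the standard BERT-style constructor signature:

    from fastNLP.modules.encoder.bert import WordpieceTokenizer

    # toy vocab; the unk_token must exist downstream
    tokenizer = WordpieceTokenizer(vocab={'hello': 0, '[UNK]': 1}, unk_token='[UNK]')
    assert tokenizer.tokenize('hello') == ['hello']
    assert tokenizer.tokenize('') == ['[UNK]']   # previously returned []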




fastNLP/modules/utils.py (+2 -2)

@@ -3,7 +3,8 @@ from functools import reduce
import torch
import torch.nn as nn
import torch.nn.init as init

import glob
import os

def initial_parameter(net, initial_method=None):
"""A method used to initialize the weights of PyTorch models.
@@ -119,7 +120,6 @@ def get_dropout_mask(drop_p: float, tensor: torch.Tensor):
training=False, inplace=True)
return mask_x

import glob

def _get_file_name_base_on_postfix(dir_path, postfix):
"""


test/core/test_dataset.py (+11 -0)

@@ -1,4 +1,5 @@
import os
import sys
import unittest

from fastNLP import DataSet
@@ -79,6 +80,16 @@ class TestDataSetMethods(unittest.TestCase):
self.assertFalse("x" in dd.field_arrays)
self.assertTrue("y" in dd.field_arrays)

def test_delete_instance(self):
dd = DataSet()
old_length = 2
dd.add_field("x", [[1, 2, 3]] * old_length)
dd.add_field("y", [[1, 2, 3, 4]] * old_length)
dd.delete_instance(0)
self.assertEqual(len(dd), old_length-1)
dd.delete_instance(0)
self.assertEqual(len(dd), old_length-2)

def test_getitem(self):
ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
ins_1, ins_0 = ds[0], ds[1]


test/core/test_field.py (+5 -5)

@@ -170,22 +170,22 @@ class TestFieldArray(unittest.TestCase):

def test_append(self):
with self.assertRaises(Exception):
fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True)
fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True, use_1st_ins_infer_dim_type=False)
fa.append(0)

with self.assertRaises(Exception):
fa = FieldArray("y", [1.1, 2.2, 3.3, 4.4, 5.5], is_input=True)
fa = FieldArray("y", [1.1, 2.2, 3.3, 4.4, 5.5], is_input=True, use_1st_ins_infer_dim_type=False)
fa.append([1, 2, 3, 4, 5])

with self.assertRaises(Exception):
fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True)
fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True, use_1st_ins_infer_dim_type=False)
fa.append([])

with self.assertRaises(Exception):
fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True)
fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True, use_1st_ins_infer_dim_type=False)
fa.append(["str", 0, 0, 0, 1.89])

fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1.0, 2.0, 3.0, 4.0, 5.0]], is_input=True)
fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1.0, 2.0, 3.0, 4.0, 5.0]], is_input=True, use_1st_ins_infer_dim_type=False)
fa.append([1.2, 2.3, 3.4, 4.5, 5.6])
self.assertEqual(len(fa), 3)
self.assertEqual(fa[2], [1.2, 2.3, 3.4, 4.5, 5.6])

