@@ -93,9 +93,13 @@ class DataSetGetter: | |||
class SamplerAdapter(torch.utils.data.Sampler): | |||
def __init__(self, sampler, dataset): | |||
super().__init__(dataset) | |||
self.sampler = sampler | |||
self.dataset = dataset | |||
def __len__(self): | |||
return len(self.dataset) | |||
def __iter__(self): | |||
return iter(self.sampler(self.dataset)) | |||
@@ -165,15 +169,19 @@ class DataSetIter(BatchIter): | |||
timeout=0, worker_init_fn=None): | |||
super().__init__() | |||
assert isinstance(dataset, DataSet) | |||
sampler = SamplerAdapter(sampler=sampler or SequentialSampler(), dataset=dataset) | |||
if not isinstance(sampler, torch.utils.data.Sampler): | |||
self.sampler = SamplerAdapter(sampler=sampler or SequentialSampler(), dataset=dataset) | |||
else: | |||
self.sampler = sampler | |||
dataset = DataSetGetter(dataset, as_numpy) | |||
collate_fn = dataset.collate_fn if hasattr(dataset, 'collate_fn') else None | |||
self.dataiter = torch.utils.data.DataLoader( | |||
dataset=dataset, batch_size=batch_size, sampler=sampler, | |||
dataset=dataset, batch_size=batch_size, sampler=self.sampler, | |||
collate_fn=collate_fn, num_workers=num_workers, | |||
pin_memory=pin_memory, drop_last=drop_last, | |||
timeout=timeout, worker_init_fn=worker_init_fn) | |||
self.num_batches = self.get_num_batches(len(dataset), batch_size, drop_last) | |||
# 以sampler的数量为准,因为DistributedSampler的时候每个进程上并不是所有的数据都用上了 | |||
self.num_batches = self.get_num_batches(len(self.dataiter.sampler), batch_size, drop_last) | |||
self.batch_size = batch_size | |||
@@ -182,7 +190,7 @@ class TorchLoaderIter(BatchIter): | |||
super().__init__() | |||
assert isinstance(dataset, torch.utils.data.DataLoader) | |||
self.dataiter = dataset | |||
self.num_batches = self.get_num_batches(len(dataset), dataset.batch_size, dataset.drop_last) | |||
self.num_batches = self.get_num_batches(len(dataset.sampler), dataset.batch_size, dataset.drop_last) | |||
self.batch_size = dataset.batch_size | |||
@@ -479,7 +479,7 @@ class FitlogCallback(Callback): | |||
self.datasets[key] = value | |||
elif isinstance(data, DataSet): | |||
self.datasets['test'] = data | |||
else: | |||
elif data is not None: | |||
raise TypeError("data receives dict[DataSet] or DataSet object.") | |||
self.verbose = verbose | |||
@@ -487,7 +487,7 @@ class DataSet(object): | |||
""" | |||
删除第index个instance | |||
:param int index: 需要删除的instance的index,从0开始 | |||
:param int index: 需要删除的instance的index,序号从0开始。 | |||
""" | |||
assert isinstance(index, int), "Only integer supported." | |||
if len(self) <= index: | |||
@@ -566,7 +566,7 @@ class DataSet(object): | |||
raise KeyError("DataSet has no field named {}.".format(old_name)) | |||
return self | |||
def set_target(self, *field_names, flag=True): | |||
def set_target(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True): | |||
""" | |||
将field_names的field设置为target | |||
@@ -577,11 +577,14 @@ class DataSet(object): | |||
:param str field_names: field的名称 | |||
:param bool flag: 将field_name的target状态设置为flag | |||
:param bool use_1st_ins_infer_dim_type: 如果为True,将不会check该列是否所有数据都是同样的维度,同样的类型。将直接使用第一 | |||
行的数据进行类型和维度推断本列的数据的类型和维度。 | |||
""" | |||
assert isinstance(flag, bool), "Only bool type supported." | |||
for name in field_names: | |||
if name in self.field_arrays: | |||
try: | |||
self.field_arrays[name]._use_1st_ins_infer_dim_type = bool(use_1st_ins_infer_dim_type) | |||
self.field_arrays[name].is_target = flag | |||
except SetInputOrTargetException as e: | |||
print(f"Cannot set field:{name} as target.") | |||
@@ -589,7 +592,7 @@ class DataSet(object): | |||
else: | |||
raise KeyError("{} is not a valid field name.".format(name)) | |||
def set_input(self, *field_names, flag=True): | |||
def set_input(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True): | |||
""" | |||
将field_names的field设置为input:: | |||
@@ -598,10 +601,13 @@ class DataSet(object): | |||
:param str field_names: field的名称 | |||
:param bool flag: 将field_name的input状态设置为flag | |||
:param bool use_1st_ins_infer_dim_type: 如果为True,将不会check该列是否所有数据都是同样的维度,同样的类型。将直接使用第一 | |||
行的数据进行类型和维度推断本列的数据的类型和维度。 | |||
""" | |||
for name in field_names: | |||
if name in self.field_arrays: | |||
try: | |||
self.field_arrays[name]._use_1st_ins_infer_dim_type = bool(use_1st_ins_infer_dim_type) | |||
self.field_arrays[name].is_input = flag | |||
except SetInputOrTargetException as e: | |||
print(f"Cannot set field:{name} as input, exception happens at the {e.index} value.") | |||
@@ -225,7 +225,7 @@ class CrossEntropyLoss(LossBase): | |||
def get_loss(self, pred, target, seq_len=None): | |||
if pred.dim() > 2: | |||
if pred.size(1) != target.size(1): | |||
if pred.size(1) != target.size(1): # 有可能顺序替换了 | |||
pred = pred.transpose(1, 2) | |||
pred = pred.reshape(-1, pred.size(-1)) | |||
target = target.reshape(-1) | |||
@@ -49,7 +49,7 @@ class NullOptimizer(Optimizer): | |||
super().__init__(None) | |||
def construct_from_pytorch(self, model_params): | |||
pass | |||
return self | |||
def __getattr__(self, item): | |||
def pass_func(*args, **kwargs): | |||
@@ -25,9 +25,9 @@ class Sampler(object): | |||
def __call__(self, data_set): | |||
""" | |||
:param DataSet data_set: `DataSet` 对象, 需要Sample的数据 | |||
:return result: list(int) 其中元素的下标序列, ``data_set`` 中元素会按 ``result`` 中顺序取出 | |||
""" | |||
:param DataSet data_set: `DataSet` 对象, 需要Sample的数据 | |||
:return result: list(int) 其中元素的下标序列, ``data_set`` 中元素会按 ``result`` 中顺序取出 | |||
""" | |||
raise NotImplementedError | |||
@@ -47,6 +47,7 @@ from .utils import _get_func_signature | |||
from .utils import _get_model_device | |||
from .utils import _move_model_to_device | |||
from ._parallel_utils import _data_parallel_wrapper | |||
from .utils import _model_contains_inner_module | |||
from functools import partial | |||
__all__ = [ | |||
@@ -83,9 +84,7 @@ class Tester(object): | |||
def __init__(self, data, model, metrics, batch_size=16, num_workers=0, device=None, verbose=1): | |||
super(Tester, self).__init__() | |||
if not isinstance(data, DataSet): | |||
raise TypeError(f"The type of data must be `fastNLP.DataSet`, got `{type(data)}`.") | |||
if not isinstance(model, nn.Module): | |||
raise TypeError(f"The type of model must be `torch.nn.Module`, got `{type(model)}`.") | |||
@@ -106,19 +105,22 @@ class Tester(object): | |||
# check predict | |||
if (hasattr(self._model, 'predict') and callable(self._model.predict)) or \ | |||
(isinstance(self._model, nn.DataParallel) and hasattr(self._model.module, 'predict') and | |||
callable(self._model.module.predict)): | |||
(_model_contains_inner_module(self._model) and hasattr(self._model.module, 'predict') and | |||
callable(self._model.module.predict)): | |||
if isinstance(self._model, nn.DataParallel): | |||
self._predict_func_wrapper = partial(_data_parallel_wrapper('predict', | |||
self._model.device_ids, | |||
self._model.output_device), | |||
network=self._model.module) | |||
self._predict_func = self._model.module.predict # 用于匹配参数 | |||
elif isinstance(self._model, nn.parallel.DistributedDataParallel): | |||
self._predict_func = self._model.module.predict | |||
self._predict_func_wrapper = self._model.module.predict # 用于调用 | |||
else: | |||
self._predict_func = self._model.predict | |||
self._predict_func_wrapper = self._model.predict | |||
else: | |||
if isinstance(self._model, nn.DataParallel): | |||
if _model_contains_inner_module(model): | |||
self._predict_func_wrapper = self._model.forward | |||
self._predict_func = self._model.module.forward | |||
else: | |||
@@ -352,7 +352,7 @@ from .utils import _move_dict_value_to_device | |||
from .utils import _get_func_signature | |||
from .utils import _get_model_device | |||
from .utils import _move_model_to_device | |||
from .utils import _model_contains_inner_module | |||
class Trainer(object): | |||
""" | |||
@@ -389,8 +389,8 @@ class Trainer(object): | |||
要指定以哪个指标为准。另外有些指标是越小效果越好,比如语言模型的困惑度,这种情况下,在key前面增加一个'-'来表 | |||
明验证时,值越小越好(比如: "-ppl")。仅在传入dev_data时有效。 | |||
:param int validate_every: 多少个step在验证集上验证一次; 如果为-1,则每个epoch结束验证一次。仅在传入dev_data时有效。 | |||
:param str,None save_path: 将模型保存路径。如果为None,则不保存模型。如果dev_data为None,则保存最后一次迭代的模型。 | |||
保存的时候不仅保存了参数,还保存了模型结构。即便使用DataParallel,这里也只保存模型。 | |||
:param str,None save_path: 将模型保存路径,如果路径不存在,将自动创建文件夹。如果为None,则不保存模型。如果dev_data为None,则保存 | |||
最后一次迭代的模型。保存的时候不仅保存了参数,还保存了模型结构。即便使用DataParallel,这里也只保存模型。 | |||
:param bool use_tqdm: 是否使用tqdm来显示训练进度; 如果为False,则将loss打印在终端中。 | |||
:param str,int,torch.device,list(int) device: 将模型load到哪个设备。默认为None,即Trainer不对模型 | |||
的计算位置进行管理。支持以下的输入: | |||
@@ -440,7 +440,7 @@ class Trainer(object): | |||
# check update every | |||
assert update_every >= 1, "update_every must be no less than 1." | |||
self.update_every = int(update_every) | |||
# check save_path | |||
if not (save_path is None or isinstance(save_path, str)): | |||
raise ValueError("save_path can only be None or `str`.") | |||
@@ -458,30 +458,69 @@ class Trainer(object): | |||
self.metric_key = None | |||
# prepare loss | |||
losser = _prepare_losser(loss) | |||
# sampler check | |||
if sampler is not None and not isinstance(sampler, Sampler): | |||
raise ValueError("The type of sampler should be fastNLP.BaseSampler, got {}.".format(type(sampler))) | |||
if sampler is None: | |||
sampler = RandomSampler() | |||
elif hasattr(sampler, 'set_batch_size'): | |||
sampler.set_batch_size(batch_size) | |||
if isinstance(train_data, BatchIter): | |||
if sampler is not None: | |||
warnings.warn("sampler is ignored when train_data is a BatchIter.") | |||
if num_workers>0: | |||
warnings.warn("num_workers is ignored when train_data is BatchIter.") | |||
if drop_last: | |||
warnings.warn("drop_last is ignored when train_data is BatchIter.") | |||
if isinstance(model, nn.parallel.DistributedDataParallel): # 如果是分布式的 | |||
# device为None | |||
if device is not None: | |||
warnings.warn("device is ignored when model is nn.parallel.DistributedDataParallel.") | |||
device = None | |||
# Sampler要是分布式的 | |||
if sampler is None: | |||
sampler = torch.utils.data.DistributedSampler(train_data) | |||
elif not isinstance(sampler, torch.utils.data.DistributedSampler): | |||
raise TypeError("When using nn.parallel.DistributedDataParallel, " | |||
"sampler must be None or torch.utils.data.DistributedSampler.") | |||
# 不能保存模型 | |||
if save_path: | |||
raise RuntimeError("Saving model in Distributed situation is not allowed right now.") | |||
else: | |||
# sampler check | |||
if sampler is not None and not isinstance(sampler, (Sampler, torch.utils.data.Sampler)): | |||
raise ValueError(f"The type of sampler should be fastNLP.BaseSampler or pytorch's Sampler, got {type(sampler)}") | |||
if sampler is None: | |||
sampler = RandomSampler() | |||
elif hasattr(sampler, 'set_batch_size'): | |||
sampler.set_batch_size(batch_size) | |||
if isinstance(train_data, DataSet): | |||
self.data_iterator = DataSetIter( | |||
dataset=train_data, batch_size=batch_size, num_workers=num_workers, sampler=sampler, drop_last=drop_last) | |||
elif isinstance(train_data, BatchIter): | |||
self.data_iterator = train_data | |||
train_data = train_data.dataset | |||
else: | |||
raise TypeError("train_data type {} not support".format(type(train_data))) | |||
if check_code_level > -1 and isinstance(self.data_iterator, DataSetIter): | |||
_check_code(dataset=train_data, model=model, losser=losser, metrics=metrics, dev_data=dev_data, | |||
metric_key=self.metric_key, check_level=check_code_level, | |||
batch_size=min(batch_size, DEFAULT_CHECK_BATCH_SIZE)) | |||
# _check_code 是 fastNLP 帮助你检查代码是否正确的方法 。如果你在错误栈中看到这行注释,请认真检查你的代码 | |||
self.model = _move_model_to_device(model, device=device) | |||
if _model_contains_inner_module(self.model): | |||
self._forward_func = self.model.module.forward | |||
else: | |||
self._forward_func = self.model.forward | |||
if check_code_level > -1: | |||
# _check_code 是 fastNLP 帮助你检查代码是否正确的方法 。如果你在错误栈中看到这行注释,请认真检查你的field名与模型的输入 | |||
# 名是否匹配 | |||
dev_dataset = dev_data | |||
if isinstance(dev_data, BatchIter): | |||
dev_dataset = None | |||
warnings.warn("dev_data is of BatchIter type, ignore validation checking.") | |||
check_batch_size = min(batch_size, DEFAULT_CHECK_BATCH_SIZE) | |||
if isinstance(self.model, nn.DataParallel): | |||
_num_devices = len(self.model.device_ids) | |||
if batch_size//_num_devices>1: # 如果多卡是每个卡可以分多个数据的,则用每个卡给两个sample | |||
check_batch_size = max(len(self.model.device_ids)*2, check_batch_size) | |||
else: | |||
check_batch_size = max(len(self.model.device_ids), check_batch_size) | |||
_check_code(dataset=train_data, model=self.model, losser=losser, forward_func=self._forward_func, metrics=metrics, | |||
dev_data=dev_dataset, metric_key=self.metric_key, check_level=check_code_level, | |||
batch_size=check_batch_size) | |||
self.train_data = train_data | |||
self.dev_data = dev_data # If None, No validation. | |||
@@ -496,8 +535,7 @@ class Trainer(object): | |||
self.best_dev_epoch = None | |||
self.best_dev_step = None | |||
self.best_dev_perf = None | |||
self.n_steps = (len(self.train_data) // self.batch_size + int( | |||
len(self.train_data) % self.batch_size != 0)) * int(drop_last==0) * self.n_epochs | |||
self.n_steps = len(self.data_iterator) * self.n_epochs | |||
if isinstance(optimizer, torch.optim.Optimizer): | |||
self.optimizer = optimizer | |||
@@ -600,10 +638,6 @@ class Trainer(object): | |||
self.step = 0 | |||
self.epoch = 0 | |||
start = time.time() | |||
if isinstance(self.model, nn.DataParallel): | |||
self._forward_func = self.model.module.forward | |||
else: | |||
self._forward_func = self.model.forward | |||
with inner_tqdm(total=self.n_steps, postfix='loss:{0:<6.5f}', leave=False, dynamic_ncols=True) as pbar: | |||
self.pbar = pbar | |||
avg_loss = 0 | |||
@@ -745,7 +779,7 @@ class Trainer(object): | |||
model_path = os.path.join(self.save_path, model_name) | |||
if not os.path.exists(self.save_path): | |||
os.makedirs(self.save_path, exist_ok=True) | |||
if isinstance(model, nn.DataParallel): | |||
if _model_contains_inner_module(model): | |||
model = model.module | |||
if only_param: | |||
state_dict = model.state_dict() | |||
@@ -765,7 +799,7 @@ class Trainer(object): | |||
states = torch.load(model_path) | |||
else: | |||
states = torch.load(model_path).state_dict() | |||
if isinstance(model, nn.DataParallel): | |||
if _model_contains_inner_module(model): | |||
model.module.load_state_dict(states) | |||
else: | |||
model.load_state_dict(states) | |||
@@ -823,12 +857,10 @@ def _get_value_info(_dict): | |||
from numbers import Number | |||
from .batch import _to_tensor | |||
def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_SIZE, | |||
dev_data=None, metric_key=None, | |||
check_level=0): | |||
def _check_code(dataset, model, losser, metrics, forward_func, batch_size=DEFAULT_CHECK_BATCH_SIZE, | |||
dev_data=None, metric_key=None, check_level=0): | |||
# check get_loss 方法 | |||
model_devcie = _get_model_device(model=model) | |||
model_device = _get_model_device(model=model) | |||
def _iter(): | |||
start_idx = 0 | |||
while start_idx<len(dataset): | |||
@@ -849,7 +881,7 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_ | |||
start_idx += batch_size | |||
for batch_count, (batch_x, batch_y) in enumerate(_iter()): | |||
_move_dict_value_to_device(batch_x, batch_y, device=model_devcie) | |||
_move_dict_value_to_device(batch_x, batch_y, device=model_device) | |||
# forward check | |||
if batch_count == 0: | |||
info_str = "" | |||
@@ -868,15 +900,11 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_ | |||
else: | |||
info_str += 'There is no target field.' | |||
print(info_str) | |||
_check_forward_error(forward_func=model.forward, dataset=dataset, | |||
_check_forward_error(forward_func=forward_func, dataset=dataset, | |||
batch_x=batch_x, check_level=check_level) | |||
if isinstance(model, nn.DataParallel): | |||
forward_func = model.module.forward | |||
else: | |||
forward_func = model.forward | |||
refined_batch_x = _build_args(forward_func, **batch_x) | |||
pred_dict = model(**refined_batch_x) | |||
func_signature = _get_func_signature(model.forward) | |||
func_signature = _get_func_signature(forward_func) | |||
if not isinstance(pred_dict, dict): | |||
raise TypeError(f"The return value of {func_signature} should be `dict`, not `{type(pred_dict)}`.") | |||
@@ -896,7 +924,7 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_ | |||
loss.backward() | |||
except _CheckError as e: | |||
# TODO: another error raised if _CheckError caught | |||
pre_func_signature = _get_func_signature(model.forward) | |||
pre_func_signature = _get_func_signature(forward_func) | |||
_check_loss_evaluate(prev_func_signature=pre_func_signature, func_signature=e.func_signature, | |||
check_res=e.check_res, pred_dict=pred_dict, target_dict=batch_y, | |||
dataset=dataset, check_level=check_level) | |||
@@ -62,7 +62,6 @@ def _prepare_cache_filepath(filepath): | |||
os.makedirs(cache_dir) | |||
# TODO 可以保存下缓存时的参数,如果load的时候发现参数不一致,发出警告。 | |||
def cache_results(_cache_fp, _refresh=False, _verbose=1): | |||
""" | |||
别名::class:`fastNLP.cache_results` :class:`fastNLP.core.uitls.cache_results` | |||
@@ -188,49 +187,17 @@ def _save_model(model, model_name, save_dir, only_param=False): | |||
torch.save(model, model_path) | |||
model.to(_model_device) | |||
def _model_contains_inner_module(model): | |||
""" | |||
# def save_pickle(obj, pickle_path, file_name): | |||
# """Save an object into a pickle file. | |||
# | |||
# :param obj: an object | |||
# :param pickle_path: str, the directory where the pickle file is to be saved | |||
# :param file_name: str, the name of the pickle file. In general, it should be ended by "pkl". | |||
# """ | |||
# if not os.path.exists(pickle_path): | |||
# os.mkdir(pickle_path) | |||
# print("make dir {} before saving pickle file".format(pickle_path)) | |||
# with open(os.path.join(pickle_path, file_name), "wb") as f: | |||
# _pickle.dump(obj, f) | |||
# print("{} saved in {}".format(file_name, pickle_path)) | |||
# | |||
# | |||
# def load_pickle(pickle_path, file_name): | |||
# """Load an object from a given pickle file. | |||
# | |||
# :param pickle_path: str, the directory where the pickle file is. | |||
# :param file_name: str, the name of the pickle file. | |||
# :return obj: an object stored in the pickle | |||
# """ | |||
# with open(os.path.join(pickle_path, file_name), "rb") as f: | |||
# obj = _pickle.load(f) | |||
# print("{} loaded from {}".format(file_name, pickle_path)) | |||
# return obj | |||
# | |||
# | |||
# def pickle_exist(pickle_path, pickle_name): | |||
# """Check if a given pickle file exists in the directory. | |||
# | |||
# :param pickle_path: the directory of target pickle file | |||
# :param pickle_name: the filename of target pickle file | |||
# :return: True if file exists else False | |||
# """ | |||
# if not os.path.exists(pickle_path): | |||
# os.makedirs(pickle_path) | |||
# file_name = os.path.join(pickle_path, pickle_name) | |||
# if os.path.exists(file_name): | |||
# return True | |||
# else: | |||
# return False | |||
:param nn.Module model: 模型文件,判断是否内部包含model.module, 多用于check模型是否是nn.DataParallel, | |||
nn.parallel.DistributedDataParallel。主要是在做形参匹配的时候需要使用最内部的model的function。 | |||
:return: bool | |||
""" | |||
if isinstance(model, nn.Module): | |||
if isinstance(model, (nn.DataParallel, nn.parallel.DistributedDataParallel)): | |||
return True | |||
return False | |||
def _move_model_to_device(model, device): | |||
""" | |||
@@ -254,8 +221,8 @@ def _move_model_to_device(model, device): | |||
:return: torch.nn.DataParallel or torch.nn.Module | |||
""" | |||
if isinstance(model, torch.nn.parallel.DistributedDataParallel): | |||
raise RuntimeError("model of `torch.nn.parallel.DistributedDataParallel` is not supported right now.") | |||
# if isinstance(model, torch.nn.parallel.DistributedDataParallel): | |||
# raise RuntimeError("model of `torch.nn.parallel.DistributedDataParallel` is not supported right now.") | |||
if device is None: | |||
if isinstance(model, torch.nn.DataParallel): | |||
@@ -352,7 +319,6 @@ def _map_args(maps: dict, **kwargs): | |||
output.update({name: val}) | |||
for keys in maps.keys(): | |||
if keys not in output.keys(): | |||
# TODO: add UNUSED warning. | |||
pass | |||
return output | |||
@@ -570,18 +536,6 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re | |||
else: | |||
_tmp = f'Provide `{_miss}` in DataSet or output of {prev_func_signature}.' | |||
suggestions.append(_tmp) | |||
# for _miss in unmapped_missing: | |||
# if _miss in dataset: | |||
# suggestions.append(f"Set `{_miss}` as target.") | |||
# else: | |||
# _tmp = '' | |||
# if check_res.unused: | |||
# _tmp = f"Specify your assignment for `{input_func_map.get(_miss, _miss)}` when initialize {module_name}." | |||
# if _tmp: | |||
# _tmp += f' Or provide `{_miss}` in DataSet or output of {prev_func_signature}.' | |||
# else: | |||
# _tmp = f'Provide `{_miss}` in output of {prev_func_signature} or DataSet.' | |||
# suggestions.append(_tmp) | |||
if check_res.duplicated: | |||
errs.append(f"\tduplicated param: {check_res.duplicated}.") | |||
@@ -37,8 +37,8 @@ class BertEmbedding(ContextualEmbedding): | |||
:param ~fastNLP.Vocabulary vocab: 词表 | |||
:param str model_dir_or_name: 模型所在目录或者模型的名称。当传入模型所在目录时,目录中应该包含一个词表文件(以.txt作为后缀名), | |||
权重文件(以.bin作为文件后缀名), 配置文件(以.json作为后缀名)。 | |||
:param str layers: 输出embedding表示来自于哪些层,不同层的结果按照layers中的顺序在最后一维concat起来。以','隔开层数,可以以负数 | |||
去索引倒数几层。 | |||
:param str layers: 输出embedding表示来自于哪些层,不同层的结果按照layers中的顺序在最后一维concat起来。以','隔开层数,层的序号是 | |||
从0开始,可以以负数去索引倒数几层。 | |||
:param str pool_method: 因为在bert中,每个word会被表示为多个word pieces, 当获取一个word的表示的时候,怎样从它的word pieces | |||
中计算得到它对应的表示。支持 ``last`` , ``first`` , ``avg`` , ``max``。 | |||
:param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 | |||
@@ -334,7 +334,7 @@ class _WordBertModel(nn.Module): | |||
start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j+1] | |||
outputs[l_index, i, j+s_shift] = torch.mean(truncate_output_layer[i, start:end], dim=-2) | |||
if self.include_cls_sep: | |||
if l==len(bert_outputs) and self.pooled_cls: | |||
if l in (len(bert_outputs)-1, -1) and self.pooled_cls: | |||
outputs[l_index, :, 0] = pooled_cls | |||
else: | |||
outputs[l_index, :, 0] = output_layer[:, 0] | |||
@@ -37,7 +37,7 @@ class ElmoEmbedding(ContextualEmbedding): | |||
:param model_dir_or_name: 可以有两种方式调用预训练好的ELMo embedding:第一种是传入ELMo所在文件夹,该文件夹下面应该有两个文件, | |||
其中一个是以json为后缀的配置文件,另一个是以pkl为后缀的权重文件;第二种是传入ELMo版本的名称,将自动查看缓存中是否存在该模型, | |||
没有的话将自动下载并缓存。 | |||
:param layers: str, 指定返回的层数, 以,隔开不同的层。如果要返回第二层的结果'2', 返回后两层的结果'1,2'。不同的层的结果 | |||
:param layers: str, 指定返回的层数(从0开始), 以,隔开不同的层。如果要返回第二层的结果'2', 返回后两层的结果'1,2'。不同的层的结果 | |||
按照这个顺序concat起来,默认为'2'。'mix'会使用可学习的权重结合不同层的表示(权重是否可训练与requires_grad保持一致, | |||
初始化权重对三层结果进行mean-pooling, 可以通过ElmoEmbedding.set_mix_weights_requires_grad()方法只将mix weights设置为可学习。) | |||
:param requires_grad: bool, 该层是否需要gradient, 默认为False. | |||
@@ -43,7 +43,7 @@ class StaticEmbedding(TokenEmbedding): | |||
如果输入为None则使用embedding_dim的维度随机初始化一个embedding。 | |||
:param int embedding_dim: 随机初始化的embedding的维度,仅在model_dir_or_name为None时有效。 | |||
:param bool requires_grad: 是否需要gradient. 默认为True | |||
:param callable init_method: 如何初始化没有找到的值。可以使用torch.nn.init.*中各种方法。调用该方法时传入一个tensor对象。 | |||
:param callable init_method: 如何初始化没有找到的值。可以使用torch.nn.init.*中各种方法。调用该方法时传入一个tensor对 | |||
:param bool lower: 是否将vocab中的词语小写后再和预训练的词表进行匹配。如果你的词表中包含大写的词语,或者就是需要单独 | |||
为大写的词语开辟一个vector表示,则将lower设置为False。 | |||
:param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 | |||
@@ -84,7 +84,7 @@ class StaticEmbedding(TokenEmbedding): | |||
if lowered_word not in lowered_vocab.word_count: | |||
lowered_vocab.add_word(lowered_word) | |||
lowered_vocab._no_create_word[lowered_word] += 1 | |||
print(f"All word in vocab have been lowered. There are {len(vocab)} words, {len(lowered_vocab)} unique lowered " | |||
print(f"All word in the vocab have been lowered. There are {len(vocab)} words, {len(lowered_vocab)} unique lowered " | |||
f"words.") | |||
if model_path: | |||
embedding = self._load_with_vocab(model_path, vocab=lowered_vocab, init_method=init_method) | |||
@@ -563,6 +563,8 @@ class WordpieceTokenizer(object): | |||
output_tokens.append(self.unk_token) | |||
else: | |||
output_tokens.extend(sub_tokens) | |||
if len(output_tokens)==0: | |||
return [self.unk_token] | |||
return output_tokens | |||
@@ -3,7 +3,8 @@ from functools import reduce | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.init as init | |||
import glob | |||
import os | |||
def initial_parameter(net, initial_method=None): | |||
"""A method used to initialize the weights of PyTorch models. | |||
@@ -119,7 +120,6 @@ def get_dropout_mask(drop_p: float, tensor: torch.Tensor): | |||
training=False, inplace=True) | |||
return mask_x | |||
import glob | |||
def _get_file_name_base_on_postfix(dir_path, postfix): | |||
""" | |||
@@ -1,4 +1,5 @@ | |||
import os | |||
import sys | |||
import unittest | |||
from fastNLP import DataSet | |||
@@ -79,6 +80,16 @@ class TestDataSetMethods(unittest.TestCase): | |||
self.assertFalse("x" in dd.field_arrays) | |||
self.assertTrue("y" in dd.field_arrays) | |||
def test_delete_instance(self): | |||
dd = DataSet() | |||
old_length = 2 | |||
dd.add_field("x", [[1, 2, 3]] * old_length) | |||
dd.add_field("y", [[1, 2, 3, 4]] * old_length) | |||
dd.delete_instance(0) | |||
self.assertEqual(len(dd), old_length-1) | |||
dd.delete_instance(0) | |||
self.assertEqual(len(dd), old_length-2) | |||
def test_getitem(self): | |||
ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40}) | |||
ins_1, ins_0 = ds[0], ds[1] | |||
@@ -170,22 +170,22 @@ class TestFieldArray(unittest.TestCase): | |||
def test_append(self): | |||
with self.assertRaises(Exception): | |||
fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True) | |||
fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True, use_1st_ins_infer_dim_type=False) | |||
fa.append(0) | |||
with self.assertRaises(Exception): | |||
fa = FieldArray("y", [1.1, 2.2, 3.3, 4.4, 5.5], is_input=True) | |||
fa = FieldArray("y", [1.1, 2.2, 3.3, 4.4, 5.5], is_input=True, use_1st_ins_infer_dim_type=False) | |||
fa.append([1, 2, 3, 4, 5]) | |||
with self.assertRaises(Exception): | |||
fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True) | |||
fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True, use_1st_ins_infer_dim_type=False) | |||
fa.append([]) | |||
with self.assertRaises(Exception): | |||
fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True) | |||
fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1, 2, 3, 4, 5]], is_input=True, use_1st_ins_infer_dim_type=False) | |||
fa.append(["str", 0, 0, 0, 1.89]) | |||
fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1.0, 2.0, 3.0, 4.0, 5.0]], is_input=True) | |||
fa = FieldArray("y", [[1.1, 2.2, 3.3, 4.4, 5.5], [1.0, 2.0, 3.0, 4.0, 5.0]], is_input=True, use_1st_ins_infer_dim_type=False) | |||
fa.append([1.2, 2.3, 3.4, 4.5, 5.6]) | |||
self.assertEqual(len(fa), 3) | |||
self.assertEqual(fa[2], [1.2, 2.3, 3.4, 4.5, 5.6]) | |||