From d9e9f9ca7c33ca41d1240516a042dde7f5672a92 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Fri, 12 Jul 2019 15:24:18 +0800 Subject: [PATCH 001/153] delete a todo item --- fastNLP/core/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/fastNLP/core/__init__.py b/fastNLP/core/__init__.py index c9f51123..b246c6a0 100644 --- a/fastNLP/core/__init__.py +++ b/fastNLP/core/__init__.py @@ -10,8 +10,6 @@ core 模块里实现了 fastNLP 的核心框架,常用的功能都可以从 fa 对于常用的功能,你只需要在 :doc:`fastNLP` 中查看即可。如果想了解各个子模块的具体作用,您可以在下面找到每个子模块的具体文档。 -.. todo:: - 介绍core 的子模块的分工,好像必要性不大 """ from .batch import DataSetIter, BatchIter, TorchLoaderIter From 3c2e419059b1d8c241030b62cf30daa18125503d Mon Sep 17 00:00:00 2001 From: yh Date: Wed, 17 Jul 2019 20:06:40 +0800 Subject: [PATCH 002/153] =?UTF-8?q?=E5=A2=9E=E5=8A=A0pooled=5Fcls=E9=80=89?= =?UTF-8?q?=E9=A1=B9=EF=BC=8C=E5=8F=AF=E4=BB=A5=E6=98=AFBert=E5=9C=A8?= =?UTF-8?q?=E5=81=9A=E5=88=86=E7=B1=BB=E6=97=B6=E5=8F=AF=E4=BB=A5=E4=BD=BF?= =?UTF-8?q?=E7=94=A8=E9=A2=84=E8=AE=AD=E7=BB=83=E7=9A=84=E6=9D=83=E9=87=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/embeddings/bert_embedding.py | 26 ++++++++++++++++++-------- fastNLP/modules/encoder/bert.py | 10 +++++++--- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index aa72898a..847366af 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -46,11 +46,13 @@ class BertEmbedding(ContextualEmbedding): :param bool include_cls_sep: bool,在bert计算句子的表示的时候,需要在前面加上[CLS]和[SEP], 是否在结果中保留这两个内容。 这样 会使得word embedding的结果比输入的结果长两个token。如果该值为True,则在使用 :class::StackEmbedding 可能会与其它类型的 embedding长度不匹配。 + :param bool pooled_cls: 返回的[CLS]是否使用预训练中的BertPool映射一下,仅在include_cls_sep时有效。如果下游任务只取[CLS]做预测, + 一般该值为True。 :param bool requires_grad: 是否需要gradient以更新Bert的权重。 """ def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en-base-uncased', layers: str='-1', - pool_method: str='first', word_dropout=0, dropout=0, requires_grad: bool=False, - include_cls_sep: bool=False): + pool_method: str='first', word_dropout=0, dropout=0, include_cls_sep: bool=False, + pooled_cls=True, requires_grad: bool=False): super(BertEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) # 根据model_dir_or_name检查是否存在并下载 @@ -66,7 +68,8 @@ class BertEmbedding(ContextualEmbedding): raise ValueError(f"Cannot recognize {model_dir_or_name}.") self.model = _WordBertModel(model_dir=model_dir, vocab=vocab, layers=layers, - pool_method=pool_method, include_cls_sep=include_cls_sep) + pool_method=pool_method, include_cls_sep=include_cls_sep, + pooled_cls=pooled_cls) self.requires_grad = requires_grad self._embed_size = len(self.model.layers)*self.model.encoder.hidden_size @@ -119,10 +122,12 @@ class BertWordPieceEncoder(nn.Module): :param str model_dir_or_name: 模型所在目录或者模型的名称。默认值为 ``en-base-uncased`` :param str layers: 最终结果中的表示。以','隔开层数,可以以负数去索引倒数几层 + :param bool pooled_cls: 返回的句子开头的[CLS]是否使用预训练中的BertPool映射一下,仅在include_cls_sep时有效。如果下游任务只取 + [CLS]做预测,一般该值为True。 :param bool requires_grad: 是否需要gradient。 """ def __init__(self, model_dir_or_name: str='en-base-uncased', layers: str='-1', - requires_grad: bool=False): + pooled_cls: bool = False, requires_grad: bool=False): super().__init__() PRETRAIN_URL = _get_base_url('bert') @@ -136,7 +141,7 @@ class BertWordPieceEncoder(nn.Module): else: raise ValueError(f"Cannot recognize {model_dir_or_name}.") - self.model = 
_WordPieceBertModel(model_dir=model_dir, layers=layers) + self.model = _WordPieceBertModel(model_dir=model_dir, layers=layers, pooled_cls=pooled_cls) self._embed_size = len(self.model.layers) * self.model.encoder.hidden_size self.requires_grad = requires_grad @@ -187,7 +192,8 @@ class BertWordPieceEncoder(nn.Module): class _WordBertModel(nn.Module): - def __init__(self, model_dir:str, vocab:Vocabulary, layers:str='-1', pool_method:str='first', include_cls_sep:bool=False): + def __init__(self, model_dir:str, vocab:Vocabulary, layers:str='-1', pool_method:str='first', + include_cls_sep:bool=False, pooled_cls:bool=False): super().__init__() self.tokenzier = BertTokenizer.from_pretrained(model_dir) @@ -206,6 +212,7 @@ class _WordBertModel(nn.Module): assert pool_method in ('avg', 'max', 'first', 'last') self.pool_method = pool_method self.include_cls_sep = include_cls_sep + self.pooled_cls = pooled_cls # 将所有vocab中word的wordpiece计算出来, 需要额外考虑[CLS]和[SEP] print("Start to generating word pieces for word.") @@ -289,7 +296,7 @@ class _WordBertModel(nn.Module): # TODO 截掉长度超过的部分。 # 2. 获取hidden的结果,根据word_pieces进行对应的pool计算 # all_outputs: [batch_size x max_len x hidden_size, batch_size x max_len x hidden_size, ...] - bert_outputs, _ = self.encoder(word_pieces, token_type_ids=None, attention_mask=attn_masks, + bert_outputs, pooled_cls = self.encoder(word_pieces, token_type_ids=None, attention_mask=attn_masks, output_all_encoded_layers=True) # output_layers = [self.layers] # len(self.layers) x batch_size x max_word_piece_length x hidden_size @@ -327,7 +334,10 @@ class _WordBertModel(nn.Module): start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j+1] outputs[l_index, i, j+s_shift] = torch.mean(truncate_output_layer[i, start:end], dim=-2) if self.include_cls_sep: - outputs[l_index, :, 0] = output_layer[:, 0] + if l==len(bert_outputs) and self.pooled_cls: + outputs[l_index, :, 0] = pooled_cls + else: + outputs[l_index, :, 0] = output_layer[:, 0] outputs[l_index, batch_indexes, seq_len+s_shift] = output_layer[batch_indexes, seq_len+s_shift] # 3. 
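# ----------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the patch series): how the
# new pooled_cls option of BertEmbedding introduced in PATCH 002 above might
# be used. pooled_cls only takes effect together with include_cls_sep; the
# model name 'en-base-uncased' is fetched on first use, and the toy vocabulary
# below is a placeholder.
from fastNLP import Vocabulary
from fastNLP.embeddings import BertEmbedding
import torch

vocab = Vocabulary()
vocab.add_word_lst("a tiny example sentence".split())
# With pooled_cls=True the [CLS] position is mapped through the pretrained
# BertPool head instead of using the raw hidden state, which is what a
# downstream classifier that only reads [CLS] usually wants.
embed = BertEmbedding(vocab, model_dir_or_name='en-base-uncased',
                      include_cls_sep=True, pooled_cls=True)
words = torch.LongTensor([[vocab.to_index(w) for w in "a tiny example".split()]])
out = embed(words)  # shape (1, 3 + 2, embed_size): two extra positions for [CLS]/[SEP]
# ----------------------------------------------------------------------------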
最终的embedding结果 return outputs diff --git a/fastNLP/modules/encoder/bert.py b/fastNLP/modules/encoder/bert.py index ce175df1..c5ad9a9c 100644 --- a/fastNLP/modules/encoder/bert.py +++ b/fastNLP/modules/encoder/bert.py @@ -848,7 +848,7 @@ class _WordPieceBertModel(nn.Module): """ - def __init__(self, model_dir: str, layers: str = '-1'): + def __init__(self, model_dir: str, layers: str = '-1', pooled_cls:bool=False): super().__init__() self.tokenzier = BertTokenizer.from_pretrained(model_dir) @@ -867,6 +867,7 @@ class _WordPieceBertModel(nn.Module): self._cls_index = self.tokenzier.vocab['[CLS]'] self._sep_index = self.tokenzier.vocab['[SEP]'] self._wordpiece_pad_index = self.tokenzier.vocab['[PAD]'] # 需要用于生成word_piece + self.pooled_cls = pooled_cls def index_dataset(self, *datasets, field_name): """ @@ -909,10 +910,13 @@ class _WordPieceBertModel(nn.Module): batch_size, max_len = word_pieces.size() attn_masks = word_pieces.ne(self._wordpiece_pad_index) - bert_outputs, _ = self.encoder(word_pieces, token_type_ids=token_type_ids, attention_mask=attn_masks, + bert_outputs, pooled_cls = self.encoder(word_pieces, token_type_ids=token_type_ids, attention_mask=attn_masks, output_all_encoded_layers=True) # output_layers = [self.layers] # len(self.layers) x batch_size x max_word_piece_length x hidden_size outputs = bert_outputs[0].new_zeros((len(self.layers), batch_size, max_len, bert_outputs[0].size(-1))) for l_index, l in enumerate(self.layers): - outputs[l_index] = bert_outputs[l] + bert_output = bert_outputs[l] + if l==len(bert_outputs) and self.pooled_cls: + bert_output[:, 0] = pooled_cls + outputs[l_index] = bert_output return outputs From c19499e60a314cd4e555361da54fe6c63f41b921 Mon Sep 17 00:00:00 2001 From: yh Date: Thu, 18 Jul 2019 22:53:30 +0800 Subject: [PATCH 003/153] =?UTF-8?q?1.=20=E4=BF=AE=E5=A4=8DDataSet.delete?= =?UTF-8?q?=5Finstance=E7=9A=84bug;=202.=20FieldArray=E4=B8=AD=E6=94=AF?= =?UTF-8?q?=E6=8C=81=E5=8F=AA=E4=BD=BF=E7=94=A8=E7=AC=AC=E4=B8=80=E4=B8=AA?= =?UTF-8?q?instance=E6=8E=A8=E6=96=ADdimension=E5=92=8Ctype=EF=BC=8C?= =?UTF-8?q?=E8=8A=82=E7=9C=81=E6=97=B6=E9=97=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/field.py | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/fastNLP/core/field.py b/fastNLP/core/field.py index bba854f5..d7d3bb8b 100644 --- a/fastNLP/core/field.py +++ b/fastNLP/core/field.py @@ -23,7 +23,8 @@ class AppendToTargetOrInputException(Exception): self.field_name = field_name # 标示当前field的名称 class FieldArray: - def __init__(self, name, content, is_target=False, is_input=False, padder=None, ignore_type=False): + def __init__(self, name, content, is_target=False, is_input=False, padder=None, ignore_type=False, + use_1st_ins_infer_dim_type=True): if len(content)==0: raise RuntimeError("Empty fieldarray is not allowed.") _content = content @@ -38,6 +39,7 @@ class FieldArray: # 根据input的情况设置input,target等 self._cell_ndim = None # 多少维度 self.dtype = None # 最内层的element都是什么类型的 + self._use_1st_ins_infer_dim_type = bool(use_1st_ins_infer_dim_type) self._is_input = False self._is_target = False @@ -77,7 +79,7 @@ class FieldArray: if value is True and \ self._is_target is False and \ self._ignore_type is False: - self._check_dtype_and_ndim() + self._check_dtype_and_ndim(only_check_1st_ins_dim_type=self._use_1st_ins_infer_dim_type) if value is False and self._is_target is False: self.dtype = None self._cell_ndim = None @@ -95,32 +97,34 @@ class 
FieldArray: if value is True and \ self._is_input is False and \ self._ignore_type is False: - self._check_dtype_and_ndim() + self._check_dtype_and_ndim(only_check_1st_ins_dim_type=self._use_1st_ins_infer_dim_type) if value is False and self._is_input is False: self.dtype = None self._cell_ndim = None self._is_target = value - def _check_dtype_and_ndim(self): + def _check_dtype_and_ndim(self, only_check_1st_ins_dim_type=True): """ 检查当前content所有的element是否是同一个类型,且是否每个元素具有相同的维度。通过的话,设置_cell_ndim与_ele_type属性;没有 通过将直接报错. + :param bool only_check_1st_ins_dim_type: 是否只检查第一个元素的type和dim :return: """ cell_0 = self.content[0] index = 0 try: type_0, dim_0 = _get_ele_type_and_dim(cell_0) - for cell in self.content[1:]: - index += 1 - type_i, dim_i = _get_ele_type_and_dim(cell) - if type_i!=type_0: - raise SetInputOrTargetException("Type:{} in index {} is different from the first element with type:{}." - ".".format(type_i, index, type_0)) - if dim_0!=dim_i: - raise SetInputOrTargetException("Dimension:{} in index {} is different from the first element with " - "dimension:{}.".format(dim_i, index, dim_0)) + if not only_check_1st_ins_dim_type: + for cell in self.content[1:]: + index += 1 + type_i, dim_i = _get_ele_type_and_dim(cell) + if type_i!=type_0: + raise SetInputOrTargetException("Type:{} in index {} is different from the first element with type:{}." + ".".format(type_i, index, type_0)) + if dim_0!=dim_i: + raise SetInputOrTargetException("Dimension:{} in index {} is different from the first element with " + "dimension:{}.".format(dim_i, index, dim_0)) self._cell_ndim = dim_0 self.dtype = type_0 except SetInputOrTargetException as e: @@ -132,7 +136,7 @@ class FieldArray: :param val: 把该val append到fieldarray。 :return: """ - if (self._is_target or self._is_input) and self._ignore_type is False: + if (self._is_target or self._is_input) and self._ignore_type is False and not self._use_1st_ins_infer_dim_type: type_, dim_ = _get_ele_type_and_dim(val) if self.dtype!=type_: raise AppendToTargetOrInputException(f"Value(type:{type_}) are of different types with " @@ -144,6 +148,14 @@ class FieldArray: else: self.content.append(val) + def pop(self, index): + """ + 删除该field中index处的元素 + :param int index: 从0开始的数据下标。 + :return: + """ + self.content.pop(index) + def __getitem__(self, indices): return self.get(indices, pad=False) From 22a8702d225e5d39f526daa3c56bd2f16ff7500f Mon Sep 17 00:00:00 2001 From: yh Date: Thu, 18 Jul 2019 23:43:10 +0800 Subject: [PATCH 004/153] =?UTF-8?q?1.=20Trainer=E6=94=AF=E6=8C=81=E4=BD=BF?= =?UTF-8?q?=E7=94=A8DistributedDataParallel=E8=AE=AD=E7=BB=83;=20=E4=BD=86?= =?UTF-8?q?=E6=98=AF=E8=BF=98=E6=B2=A1=E6=9C=89=E7=BB=8F=E8=BF=87=E5=B9=BF?= =?UTF-8?q?=E6=B3=9B=E6=B5=8B=E8=AF=95=EF=BC=8C=E8=B0=A8=E6=85=8E=E4=BD=BF?= =?UTF-8?q?=E7=94=A8;=202.=20=E4=BF=AE=E5=A4=8Dimport=20os=20bug;=203.Fitl?= =?UTF-8?q?ogCallback=E6=94=AF=E6=8C=81=E4=B8=8D=E4=BC=A0=E5=85=A5?= =?UTF-8?q?=E4=BB=BB=E4=BD=95DataSet;=204.=20NullOptimizer=E7=9A=84constru?= =?UTF-8?q?ct=5Ffrom=5Foptimer=E8=BF=94=E5=9B=9Eself;=205.=20=E4=BF=AE?= =?UTF-8?q?=E5=A4=8DBert=E4=B8=ADpooled=5Fcls=E7=9A=84bug;=E2=80=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/batch.py | 16 +++- fastNLP/core/callback.py | 2 +- fastNLP/core/dataset.py | 12 ++- fastNLP/core/losses.py | 2 +- fastNLP/core/optimizer.py | 2 +- fastNLP/core/sampler.py | 6 +- fastNLP/core/tester.py | 14 ++-- fastNLP/core/trainer.py | 104 ++++++++++++++++--------- fastNLP/core/utils.py | 70 +++-------------- 
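# ----------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the patch series): the intent
# of the FieldArray/DataSet change in PATCH 003 above. With the new
# use_1st_ins_infer_dim_type=True default, set_input()/set_target() infer a
# field's dtype and dimensionality from the first instance only instead of
# checking every instance, which saves time on large datasets. Field names and
# values below are placeholders.
from fastNLP import DataSet

ds = DataSet({'words': [[1, 2, 3], [4, 5]], 'label': [0, 1]})
ds.set_input('words')    # type/dim inferred from ds[0]['words'] only
ds.set_target('label', use_1st_ins_infer_dim_type=True)
ds.delete_instance(0)    # index counts from 0, per the corrected docstring
# ----------------------------------------------------------------------------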
fastNLP/embeddings/bert_embedding.py | 6 +- fastNLP/embeddings/elmo_embedding.py | 2 +- fastNLP/embeddings/static_embedding.py | 4 +- fastNLP/modules/encoder/bert.py | 2 + fastNLP/modules/utils.py | 4 +- test/core/test_dataset.py | 11 +++ test/core/test_field.py | 10 +-- 16 files changed, 139 insertions(+), 128 deletions(-) diff --git a/fastNLP/core/batch.py b/fastNLP/core/batch.py index 64c5f48e..538f583a 100644 --- a/fastNLP/core/batch.py +++ b/fastNLP/core/batch.py @@ -93,9 +93,13 @@ class DataSetGetter: class SamplerAdapter(torch.utils.data.Sampler): def __init__(self, sampler, dataset): + super().__init__(dataset) self.sampler = sampler self.dataset = dataset + def __len__(self): + return len(self.dataset) + def __iter__(self): return iter(self.sampler(self.dataset)) @@ -165,15 +169,19 @@ class DataSetIter(BatchIter): timeout=0, worker_init_fn=None): super().__init__() assert isinstance(dataset, DataSet) - sampler = SamplerAdapter(sampler=sampler or SequentialSampler(), dataset=dataset) + if not isinstance(sampler, torch.utils.data.Sampler): + self.sampler = SamplerAdapter(sampler=sampler or SequentialSampler(), dataset=dataset) + else: + self.sampler = sampler dataset = DataSetGetter(dataset, as_numpy) collate_fn = dataset.collate_fn if hasattr(dataset, 'collate_fn') else None self.dataiter = torch.utils.data.DataLoader( - dataset=dataset, batch_size=batch_size, sampler=sampler, + dataset=dataset, batch_size=batch_size, sampler=self.sampler, collate_fn=collate_fn, num_workers=num_workers, pin_memory=pin_memory, drop_last=drop_last, timeout=timeout, worker_init_fn=worker_init_fn) - self.num_batches = self.get_num_batches(len(dataset), batch_size, drop_last) + # 以sampler的数量为准,因为DistributedSampler的时候每个进程上并不是所有的数据都用上了 + self.num_batches = self.get_num_batches(len(self.dataiter.sampler), batch_size, drop_last) self.batch_size = batch_size @@ -182,7 +190,7 @@ class TorchLoaderIter(BatchIter): super().__init__() assert isinstance(dataset, torch.utils.data.DataLoader) self.dataiter = dataset - self.num_batches = self.get_num_batches(len(dataset), dataset.batch_size, dataset.drop_last) + self.num_batches = self.get_num_batches(len(dataset.sampler), dataset.batch_size, dataset.drop_last) self.batch_size = dataset.batch_size diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py index 6f855397..874d0ad9 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -479,7 +479,7 @@ class FitlogCallback(Callback): self.datasets[key] = value elif isinstance(data, DataSet): self.datasets['test'] = data - else: + elif data is not None: raise TypeError("data receives dict[DataSet] or DataSet object.") self.verbose = verbose diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 7b7fa87a..2955eff6 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -487,7 +487,7 @@ class DataSet(object): """ 删除第index个instance - :param int index: 需要删除的instance的index,从0开始 + :param int index: 需要删除的instance的index,序号从0开始。 """ assert isinstance(index, int), "Only integer supported." 
if len(self) <= index: @@ -566,7 +566,7 @@ class DataSet(object): raise KeyError("DataSet has no field named {}.".format(old_name)) return self - def set_target(self, *field_names, flag=True): + def set_target(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True): """ 将field_names的field设置为target @@ -577,11 +577,14 @@ class DataSet(object): :param str field_names: field的名称 :param bool flag: 将field_name的target状态设置为flag + :param bool use_1st_ins_infer_dim_type: 如果为True,将不会check该列是否所有数据都是同样的维度,同样的类型。将直接使用第一 + 行的数据进行类型和维度推断本列的数据的类型和维度。 """ assert isinstance(flag, bool), "Only bool type supported." for name in field_names: if name in self.field_arrays: try: + self.field_arrays[name]._use_1st_ins_infer_dim_type = bool(use_1st_ins_infer_dim_type) self.field_arrays[name].is_target = flag except SetInputOrTargetException as e: print(f"Cannot set field:{name} as target.") @@ -589,7 +592,7 @@ class DataSet(object): else: raise KeyError("{} is not a valid field name.".format(name)) - def set_input(self, *field_names, flag=True): + def set_input(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True): """ 将field_names的field设置为input:: @@ -598,10 +601,13 @@ class DataSet(object): :param str field_names: field的名称 :param bool flag: 将field_name的input状态设置为flag + :param bool use_1st_ins_infer_dim_type: 如果为True,将不会check该列是否所有数据都是同样的维度,同样的类型。将直接使用第一 + 行的数据进行类型和维度推断本列的数据的类型和维度。 """ for name in field_names: if name in self.field_arrays: try: + self.field_arrays[name]._use_1st_ins_infer_dim_type = bool(use_1st_ins_infer_dim_type) self.field_arrays[name].is_input = flag except SetInputOrTargetException as e: print(f"Cannot set field:{name} as input, exception happens at the {e.index} value.") diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index 1f8923eb..21c024f0 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -225,7 +225,7 @@ class CrossEntropyLoss(LossBase): def get_loss(self, pred, target, seq_len=None): if pred.dim() > 2: - if pred.size(1) != target.size(1): + if pred.size(1) != target.size(1): # 有可能顺序替换了 pred = pred.transpose(1, 2) pred = pred.reshape(-1, pred.size(-1)) target = target.reshape(-1) diff --git a/fastNLP/core/optimizer.py b/fastNLP/core/optimizer.py index 3036257c..e95047b4 100644 --- a/fastNLP/core/optimizer.py +++ b/fastNLP/core/optimizer.py @@ -49,7 +49,7 @@ class NullOptimizer(Optimizer): super().__init__(None) def construct_from_pytorch(self, model_params): - pass + return self def __getattr__(self, item): def pass_func(*args, **kwargs): diff --git a/fastNLP/core/sampler.py b/fastNLP/core/sampler.py index d8ba1ad1..9ca04fa0 100644 --- a/fastNLP/core/sampler.py +++ b/fastNLP/core/sampler.py @@ -25,9 +25,9 @@ class Sampler(object): def __call__(self, data_set): """ - :param DataSet data_set: `DataSet` 对象, 需要Sample的数据 - :return result: list(int) 其中元素的下标序列, ``data_set`` 中元素会按 ``result`` 中顺序取出 - """ + :param DataSet data_set: `DataSet` 对象, 需要Sample的数据 + :return result: list(int) 其中元素的下标序列, ``data_set`` 中元素会按 ``result`` 中顺序取出 + """ raise NotImplementedError diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index c1d270d1..3d672ccc 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -47,6 +47,7 @@ from .utils import _get_func_signature from .utils import _get_model_device from .utils import _move_model_to_device from ._parallel_utils import _data_parallel_wrapper +from .utils import _model_contains_inner_module from functools import partial __all__ = [ @@ -83,9 +84,7 @@ class Tester(object): def __init__(self, data, 
model, metrics, batch_size=16, num_workers=0, device=None, verbose=1): super(Tester, self).__init__() - - if not isinstance(data, DataSet): - raise TypeError(f"The type of data must be `fastNLP.DataSet`, got `{type(data)}`.") + if not isinstance(model, nn.Module): raise TypeError(f"The type of model must be `torch.nn.Module`, got `{type(model)}`.") @@ -106,19 +105,22 @@ class Tester(object): # check predict if (hasattr(self._model, 'predict') and callable(self._model.predict)) or \ - (isinstance(self._model, nn.DataParallel) and hasattr(self._model.module, 'predict') and - callable(self._model.module.predict)): + (_model_contains_inner_module(self._model) and hasattr(self._model.module, 'predict') and + callable(self._model.module.predict)): if isinstance(self._model, nn.DataParallel): self._predict_func_wrapper = partial(_data_parallel_wrapper('predict', self._model.device_ids, self._model.output_device), network=self._model.module) + self._predict_func = self._model.module.predict # 用于匹配参数 + elif isinstance(self._model, nn.parallel.DistributedDataParallel): self._predict_func = self._model.module.predict + self._predict_func_wrapper = self._model.module.predict # 用于调用 else: self._predict_func = self._model.predict self._predict_func_wrapper = self._model.predict else: - if isinstance(self._model, nn.DataParallel): + if _model_contains_inner_module(model): self._predict_func_wrapper = self._model.forward self._predict_func = self._model.module.forward else: diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 671e2736..09e8a437 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -352,7 +352,7 @@ from .utils import _move_dict_value_to_device from .utils import _get_func_signature from .utils import _get_model_device from .utils import _move_model_to_device - +from .utils import _model_contains_inner_module class Trainer(object): """ @@ -389,8 +389,8 @@ class Trainer(object): 要指定以哪个指标为准。另外有些指标是越小效果越好,比如语言模型的困惑度,这种情况下,在key前面增加一个'-'来表 明验证时,值越小越好(比如: "-ppl")。仅在传入dev_data时有效。 :param int validate_every: 多少个step在验证集上验证一次; 如果为-1,则每个epoch结束验证一次。仅在传入dev_data时有效。 - :param str,None save_path: 将模型保存路径。如果为None,则不保存模型。如果dev_data为None,则保存最后一次迭代的模型。 - 保存的时候不仅保存了参数,还保存了模型结构。即便使用DataParallel,这里也只保存模型。 + :param str,None save_path: 将模型保存路径,如果路径不存在,将自动创建文件夹。如果为None,则不保存模型。如果dev_data为None,则保存 + 最后一次迭代的模型。保存的时候不仅保存了参数,还保存了模型结构。即便使用DataParallel,这里也只保存模型。 :param bool use_tqdm: 是否使用tqdm来显示训练进度; 如果为False,则将loss打印在终端中。 :param str,int,torch.device,list(int) device: 将模型load到哪个设备。默认为None,即Trainer不对模型 的计算位置进行管理。支持以下的输入: @@ -440,7 +440,7 @@ class Trainer(object): # check update every assert update_every >= 1, "update_every must be no less than 1." 
self.update_every = int(update_every) - + # check save_path if not (save_path is None or isinstance(save_path, str)): raise ValueError("save_path can only be None or `str`.") @@ -458,30 +458,69 @@ class Trainer(object): self.metric_key = None # prepare loss losser = _prepare_losser(loss) - - # sampler check - if sampler is not None and not isinstance(sampler, Sampler): - raise ValueError("The type of sampler should be fastNLP.BaseSampler, got {}.".format(type(sampler))) - if sampler is None: - sampler = RandomSampler() - elif hasattr(sampler, 'set_batch_size'): - sampler.set_batch_size(batch_size) + if isinstance(train_data, BatchIter): + if sampler is not None: + warnings.warn("sampler is ignored when train_data is a BatchIter.") + if num_workers>0: + warnings.warn("num_workers is ignored when train_data is BatchIter.") + if drop_last: + warnings.warn("drop_last is ignored when train_data is BatchIter.") + + if isinstance(model, nn.parallel.DistributedDataParallel): # 如果是分布式的 + # device为None + if device is not None: + warnings.warn("device is ignored when model is nn.parallel.DistributedDataParallel.") + device = None + # Sampler要是分布式的 + if sampler is None: + sampler = torch.utils.data.DistributedSampler(train_data) + elif not isinstance(sampler, torch.utils.data.DistributedSampler): + raise TypeError("When using nn.parallel.DistributedDataParallel, " + "sampler must be None or torch.utils.data.DistributedSampler.") + # 不能保存模型 + if save_path: + raise RuntimeError("Saving model in Distributed situation is not allowed right now.") + else: + # sampler check + if sampler is not None and not isinstance(sampler, (Sampler, torch.utils.data.Sampler)): + raise ValueError(f"The type of sampler should be fastNLP.BaseSampler or pytorch's Sampler, got {type(sampler)}") + if sampler is None: + sampler = RandomSampler() + elif hasattr(sampler, 'set_batch_size'): + sampler.set_batch_size(batch_size) if isinstance(train_data, DataSet): self.data_iterator = DataSetIter( dataset=train_data, batch_size=batch_size, num_workers=num_workers, sampler=sampler, drop_last=drop_last) elif isinstance(train_data, BatchIter): self.data_iterator = train_data + train_data = train_data.dataset else: raise TypeError("train_data type {} not support".format(type(train_data))) - if check_code_level > -1 and isinstance(self.data_iterator, DataSetIter): - _check_code(dataset=train_data, model=model, losser=losser, metrics=metrics, dev_data=dev_data, - metric_key=self.metric_key, check_level=check_code_level, - batch_size=min(batch_size, DEFAULT_CHECK_BATCH_SIZE)) - # _check_code 是 fastNLP 帮助你检查代码是否正确的方法 。如果你在错误栈中看到这行注释,请认真检查你的代码 self.model = _move_model_to_device(model, device=device) + if _model_contains_inner_module(self.model): + self._forward_func = self.model.module.forward + else: + self._forward_func = self.model.forward + if check_code_level > -1: + # _check_code 是 fastNLP 帮助你检查代码是否正确的方法 。如果你在错误栈中看到这行注释,请认真检查你的field名与模型的输入 + # 名是否匹配 + dev_dataset = dev_data + if isinstance(dev_data, BatchIter): + dev_dataset = None + warnings.warn("dev_data is of BatchIter type, ignore validation checking.") + check_batch_size = min(batch_size, DEFAULT_CHECK_BATCH_SIZE) + if isinstance(self.model, nn.DataParallel): + _num_devices = len(self.model.device_ids) + if batch_size//_num_devices>1: # 如果多卡是每个卡可以分多个数据的,则用每个卡给两个sample + check_batch_size = max(len(self.model.device_ids)*2, check_batch_size) + else: + check_batch_size = max(len(self.model.device_ids), check_batch_size) + _check_code(dataset=train_data, model=self.model, 
losser=losser, forward_func=self._forward_func, metrics=metrics, + dev_data=dev_dataset, metric_key=self.metric_key, check_level=check_code_level, + batch_size=check_batch_size) self.train_data = train_data self.dev_data = dev_data # If None, No validation. @@ -496,8 +535,7 @@ class Trainer(object): self.best_dev_epoch = None self.best_dev_step = None self.best_dev_perf = None - self.n_steps = (len(self.train_data) // self.batch_size + int( - len(self.train_data) % self.batch_size != 0)) * int(drop_last==0) * self.n_epochs + self.n_steps = len(self.data_iterator) * self.n_epochs if isinstance(optimizer, torch.optim.Optimizer): self.optimizer = optimizer @@ -600,10 +638,6 @@ class Trainer(object): self.step = 0 self.epoch = 0 start = time.time() - if isinstance(self.model, nn.DataParallel): - self._forward_func = self.model.module.forward - else: - self._forward_func = self.model.forward with inner_tqdm(total=self.n_steps, postfix='loss:{0:<6.5f}', leave=False, dynamic_ncols=True) as pbar: self.pbar = pbar avg_loss = 0 @@ -745,7 +779,7 @@ class Trainer(object): model_path = os.path.join(self.save_path, model_name) if not os.path.exists(self.save_path): os.makedirs(self.save_path, exist_ok=True) - if isinstance(model, nn.DataParallel): + if _model_contains_inner_module(model): model = model.module if only_param: state_dict = model.state_dict() @@ -765,7 +799,7 @@ class Trainer(object): states = torch.load(model_path) else: states = torch.load(model_path).state_dict() - if isinstance(model, nn.DataParallel): + if _model_contains_inner_module(model): model.module.load_state_dict(states) else: model.load_state_dict(states) @@ -823,12 +857,10 @@ def _get_value_info(_dict): from numbers import Number from .batch import _to_tensor -def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_SIZE, - dev_data=None, metric_key=None, - check_level=0): +def _check_code(dataset, model, losser, metrics, forward_func, batch_size=DEFAULT_CHECK_BATCH_SIZE, + dev_data=None, metric_key=None, check_level=0): # check get_loss 方法 - model_devcie = _get_model_device(model=model) - + model_device = _get_model_device(model=model) def _iter(): start_idx = 0 while start_idx Date: Fri, 19 Jul 2019 02:05:33 +0800 Subject: [PATCH 005/153] fix bugs and add test codes for: 1. models.snli; 2. core.metrics.extractive_qa; 3. 
io.data_loader.mnli --- fastNLP/core/metrics.py | 4 +-- fastNLP/models/snli.py | 11 ++++--- test/core/test_metrics.py | 45 ++++++++++++++++++++++++++++- test/data_for_tests/sample_mnli.tsv | 12 ++++++++ test/io/test_data_loader.py | 15 ++++++++++ test/models/test_snli.py | 9 ++++++ 6 files changed, 89 insertions(+), 7 deletions(-) create mode 100644 test/data_for_tests/sample_mnli.tsv create mode 100644 test/io/test_data_loader.py create mode 100644 test/models/test_snli.py diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index f23eab91..94f50253 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -814,8 +814,8 @@ class ExtractiveQAMetric(MetricBase): if not self.right_open: e += 1 te += 1 - if ts == 0 and te == int(not self.right_open): - if s == 0 and e == int(not self.right_open): + if ts == 0 and te == 1: + if s == 0 and e == 1: self.no_ans_correct += 1 self.no2no += 1 else: diff --git a/fastNLP/models/snli.py b/fastNLP/models/snli.py index 8e35b6bc..3be942e8 100644 --- a/fastNLP/models/snli.py +++ b/fastNLP/models/snli.py @@ -9,7 +9,7 @@ import torch.nn.functional as F from torch.nn import CrossEntropyLoss from .base_model import BaseModel -from ..embeddings.embedding import TokenEmbedding +from ..embeddings.embedding import TokenEmbedding, Embedding from ..core.const import Const from ..core.utils import seq_len_to_mask @@ -21,18 +21,21 @@ class ESIM(BaseModel): ESIM model的一个PyTorch实现 论文参见: https://arxiv.org/pdf/1609.06038.pdf - :param fastNLP.TokenEmbedding init_embedding: 初始化的TokenEmbedding + :param init_embedding: 初始化的Embedding :param int hidden_size: 隐藏层大小,默认值为Embedding的维度 :param int num_labels: 目标标签种类数量,默认值为3 :param float dropout_rate: dropout的比率,默认值为0.3 :param float dropout_embed: 对Embedding的dropout比率,默认值为0.1 """ - def __init__(self, init_embedding: TokenEmbedding, hidden_size=None, num_labels=3, dropout_rate=0.3, + def __init__(self, init_embedding, hidden_size=None, num_labels=3, dropout_rate=0.3, dropout_embed=0.1): super(ESIM, self).__init__() - self.embedding = init_embedding + if isinstance(init_embedding, TokenEmbedding) or isinstance(init_embedding, Embedding): + self.embedding = init_embedding + else: + self.embedding = Embedding(init_embedding) self.dropout_embed = EmbedDropout(p=dropout_embed) if hidden_size is None: hidden_size = self.embedding.embed_size diff --git a/test/core/test_metrics.py b/test/core/test_metrics.py index 9c8a586c..236066d6 100644 --- a/test/core/test_metrics.py +++ b/test/core/test_metrics.py @@ -7,7 +7,7 @@ from fastNLP import AccuracyMetric from fastNLP.core.metrics import _pred_topk, _accuracy_topk from fastNLP.core.vocabulary import Vocabulary from collections import Counter -from fastNLP.core.metrics import SpanFPreRecMetric +from fastNLP.core.metrics import SpanFPreRecMetric, ExtractiveQAMetric def _generate_tags(encoding_type, number_labels=4): @@ -347,3 +347,46 @@ class TestUsefulFunctions(unittest.TestCase): _ = _pred_topk(np.random.randint(0, 3, size=(10, 1))) # 跑通即可 + + +class TestExtractiveQAMetric(unittest.TestCase): + + def test_cast_1(self): + qa_prediction = torch.FloatTensor([[[-0.4424, -0.4579, -0.7376, 1.8129, 0.1316, 1.6566, -1.2169, + -0.3782, 0.8240], + [-1.2348, -0.1876, -0.1462, -0.4834, -0.6692, -0.9735, -1.1563, + -0.3562, -1.4116], + [-1.6550, -0.9555, 0.3782, -1.3160, -1.5835, -0.3443, -1.7858, + -2.0023, 0.0075], + [-0.3772, -0.5447, -1.5631, 1.1614, 1.4598, -1.2764, 0.5186, + 0.3832, -0.1540], + [-0.1011, 0.0600, 1.1090, -0.3545, 0.1284, 1.1484, -1.0120, + -1.3508, -0.9513], + 
[1.8948, 0.8627, -2.1359, 1.3740, -0.7499, 1.5019, 0.6919, + -0.0842, -0.4294]], + + [[-0.2802, 0.6941, -0.4788, -0.3845, 1.7752, 1.2950, -1.9490, + -1.4138, -0.8853], + [-1.3752, -0.5457, -0.5305, 0.4018, 0.2934, 0.7931, 2.3845, + -1.0726, 0.0364], + [0.3621, 0.2609, 0.1269, -0.5950, 0.7212, 0.5959, 1.6264, + -0.8836, -0.9320], + [0.2003, -1.0758, -1.1560, -0.6472, -1.7549, 0.1264, 0.6044, + -1.6857, 1.1571], + [1.4277, -0.4915, 0.4496, 2.2027, 0.0730, -3.1792, -0.5125, + 3.5837, 1.0184], + [1.6495, 1.7145, -0.2143, -0.1230, -0.2205, 0.8250, 0.4943, + -0.9025, 0.0864]]]) + qa_prediction = qa_prediction.permute(1, 2, 0) + pred1, pred2 = qa_prediction.split(1, dim=-1) + pred1 = pred1.squeeze(-1) + pred2 = pred2.squeeze(-1) + target1 = torch.LongTensor([3, 0, 2, 4, 4, 0]) + target2 = torch.LongTensor([4, 1, 6, 8, 7, 1]) + metric = ExtractiveQAMetric() + metric.evaluate(pred1, pred2, target1, target2) + result = metric.get_metric() + truth = {'EM': 62.5, 'f_1': 72.5, 'noAns-f_1': 50.0, 'noAns-EM': 50.0, 'hasAns-f_1': 95.0, 'hasAns-EM': 75.0} + for k, v in truth.items(): + self.assertTrue(k in result) + self.assertEqual(v, result[k]) diff --git a/test/data_for_tests/sample_mnli.tsv b/test/data_for_tests/sample_mnli.tsv new file mode 100644 index 00000000..9a30b95b --- /dev/null +++ b/test/data_for_tests/sample_mnli.tsv @@ -0,0 +1,12 @@ +index promptID pairID genre sentence1_binary_parse sentence2_binary_parse sentence1_parse sentence2_parse sentence1 sentence2 label1 label2 label3 label4 label5 gold_label +0 63735 63735n slate ( ( The ( new rights ) ) ( are ( nice enough ) ) ) ( Everyone ( really ( likes ( the ( newest benefits ) ) ) ) ) (ROOT (S (NP (DT The) (JJ new) (NNS rights)) (VP (VBP are) (ADJP (JJ nice) (RB enough))))) (ROOT (S (NP (NN Everyone)) (VP (ADVP (RB really)) (VBZ likes) (NP (DT the) (JJS newest) (NNS benefits))))) The new rights are nice enough Everyone really likes the newest benefits neutral entailment neutral neutral neutral neutral +1 91383 91383c government ( ( This site ) ( ( includes ( ( ( ( a list ) ( of ( all ( award winners ) ) ) ) and ) ( ( a ( searchable database ) ) ( of ( Government ( Executive articles ) ) ) ) ) ) . ) ) ( ( ( The ( Government ( Executive articles ) ) ) ( housed ( on ( the website ) ) ) ) ( ( ( are not ) ( able ( to ( be searched ) ) ) ) . ) ) (ROOT (S (NP (DT This) (NN site)) (VP (VBZ includes) (NP (NP (NP (DT a) (NN list)) (PP (IN of) (NP (DT all) (NN award) (NNS winners)))) (CC and) (NP (NP (DT a) (JJ searchable) (NN database)) (PP (IN of) (NP (NNP Government) (NNP Executive) (NNS articles)))))) (. .))) (ROOT (S (NP (NP (DT The) (NNP Government) (NNP Executive) (NNS articles)) (VP (VBN housed) (PP (IN on) (NP (DT the) (NN website))))) (VP (VBP are) (RB not) (ADJP (JJ able) (S (VP (TO to) (VP (VB be) (ADJP (JJ searched))))))) (. .))) This site includes a list of all award winners and a searchable database of Government Executive articles. The Government Executive articles housed on the website are not able to be searched. contradiction contradiction contradiction contradiction contradiction contradiction +2 755 755e telephone ( ( ( ( uh ( i ( ( do n't ) ( know ( ( i i ) ( have ( ( mixed emotions ) ( about ( him ( ( uh sometimes ) ( i ( like him ) ) ) ) ) ) ) ) ) ) ) ) but ) ( ( at ( the ( same times ) ) ) ( i ( love ( to ( see somebody ) ) ) ) ) ) ( beat him ) ) ( I ( ( ( ( ( ( like him ) ( for ( the ( most part ) ) ) ) , ) but ) ( ( would still ) ( enjoy ( seeing ( someone ( beat him ) ) ) ) ) ) . 
) ) (ROOT (SINV (S (S (INTJ (UH uh)) (NP (FW i)) (VP (VBP do) (RB n't) (VP (VB know) (NP (NP (FW i) (FW i)) (SBAR (S (VP (VBP have) (VP (VBN mixed) (NP (NNS emotions)) (PP (IN about) (S (NP (PRP him)) (VP (VBG uh) (ADVP (RB sometimes)) (NP (NP (FW i)) (PP (IN like) (NP (PRP him))))))))))))))) (CC but) (S (PP (IN at) (NP (DT the) (JJ same) (NNS times))) (NP (FW i)) (VP (VBP love) (S (VP (TO to) (VP (VB see) (NP (NN somebody)))))))) (VP (VBD beat)) (NP (PRP him)))) (ROOT (S (NP (PRP I)) (VP (VP (VBP like) (NP (PRP him)) (PP (IN for) (NP (DT the) (JJS most) (NN part)))) (, ,) (CC but) (VP (MD would) (ADVP (RB still)) (VP (VB enjoy) (S (VP (VBG seeing) (S (NP (NN someone)) (VP (VB beat) (NP (PRP him))))))))) (. .))) uh i don't know i i have mixed emotions about him uh sometimes i like him but at the same times i love to see somebody beat him I like him for the most part, but would still enjoy seeing someone beat him. entailment entailment entailment entailment entailment entailment +3 78013 78013c telephone ( yeah ( ( i i ) ( think ( ( my ( favorite restaurant ) ) ( ( is always ) ( been ( ( the ( one closest ) ) ( you ( ( know ( the closest ) ) ( ( as long ) ( as ( it ( 's ( it ( meets ( ( the ( minimum criteria ) ) ( you ( know ( of ( good food ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ( ( My ( favorite restaurants ) ) ( ( ( ( are always ) ( ( ( ( ( at least ) a ) hundred ) miles ) away ) ) ( from ( my house ) ) ) . ) ) (ROOT (S (VP (VB yeah) (NP (NP (FW i) (FW i)) (SBAR (S (VP (VBP think) (SBAR (S (NP (PRP$ my) (JJ favorite) (NN restaurant)) (VP (VBZ is) (ADVP (RB always)) (VP (VBN been) (NP (NP (DT the) (CD one) (JJS closest)) (SBAR (S (NP (PRP you)) (VP (VBP know) (NP (DT the) (JJS closest)) (ADVP (ADVP (RB as) (RB long)) (SBAR (IN as) (S (NP (PRP it)) (VP (VBZ 's) (SBAR (S (NP (PRP it)) (VP (VBZ meets) (NP (NP (DT the) (JJ minimum) (NNS criteria)) (SBAR (S (NP (PRP you)) (VP (VBP know) (PP (IN of) (NP (JJ good) (NN food))))))))))))))))))))))))))))) (ROOT (S (NP (PRP$ My) (JJ favorite) (NNS restaurants)) (VP (VBP are) (ADVP (RB always)) (ADVP (NP (QP (IN at) (JJS least) (DT a) (CD hundred)) (NNS miles)) (RB away)) (PP (IN from) (NP (PRP$ my) (NN house)))) (. .))) yeah i i think my favorite restaurant is always been the one closest you know the closest as long as it's it meets the minimum criteria you know of good food My favorite restaurants are always at least a hundred miles away from my house. contradiction contradiction contradiction contradiction contradiction contradiction +4 96377 96377c telephone ( i ( ( do n't ) ( know ( um ( do ( you ( do ( ( a lot ) ( of camping ) ) ) ) ) ) ) ) ) ( I ( ( know exactly ) . ) ) (ROOT (S (NP (FW i)) (VP (VBP do) (RB n't) (VP (VB know) (SBAR (S (NP (NN um)) (VP (VBP do) (SBAR (S (NP (PRP you)) (VP (VBP do) (NP (NP (DT a) (NN lot)) (PP (IN of) (NP (NN camping)))))))))))))) (ROOT (S (NP (PRP I)) (VP (VBP know) (ADVP (RB exactly))) (. .))) i don't know um do you do a lot of camping I know exactly. contradiction contradiction contradiction contradiction contradiction contradiction +5 139749 139749c telephone ( well ( that ( would ( be ( ( a help ) ( i ( wish ( they ( would ( do ( that ( ( ( here ( we ( have ( got ( so ( ( little ( landfill space ) ) ( left ( that ( we ( 're ( going ( to ( ( run out ) ( before ( ( the end ) ( of ( this decade ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) and ) ( it ( ( 's really ) ( going ( to be ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ( We ( ( have ( plenty ( of ( space ( in ( the landfill ) ) ) ) ) ) . 
) ) (ROOT (FRAG (ADVP (RB well)) (SBAR (WHNP (WDT that)) (S (VP (MD would) (VP (VB be) (NP (NP (DT a) (NN help)) (SBAR (S (NP (FW i)) (VP (VBP wish) (SBAR (S (NP (PRP they)) (VP (MD would) (VP (VB do) (SBAR (IN that) (S (S (ADVP (RB here)) (NP (PRP we)) (VP (VBP have) (VP (VBN got) (SBAR (IN so) (S (NP (JJ little) (NN landfill) (NN space)) (VP (VBD left) (SBAR (IN that) (S (NP (PRP we)) (VP (VBP 're) (VP (VBG going) (S (VP (TO to) (VP (VB run) (PRT (RP out)) (PP (IN before) (NP (NP (DT the) (NN end)) (PP (IN of) (NP (DT this) (NN decade)))))))))))))))))) (CC and) (S (NP (PRP it)) (VP (VBZ 's) (ADVP (RB really)) (VP (VBG going) (S (VP (TO to) (VP (VB be))))))))))))))))))))))) (ROOT (S (NP (PRP We)) (VP (VBP have) (NP (NP (RB plenty)) (PP (IN of) (NP (NP (NN space)) (PP (IN in) (NP (DT the) (NN landfill))))))) (. .))) well that would be a help i wish they would do that here we have got so little landfill space left that we're going to run out before the end of this decade and it's really going to be We have plenty of space in the landfill. contradiction contradiction contradiction contradiction contradiction contradiction +6 101415 101415c telephone ( yeah ( ( ( i know ) and ) ( i ( did ( that ( ( ( all ( through college ) ) and ) ( it ( worked too ) ) ) ) ) ) ) ) ( I ( ( ( did ( that all ) ) ( through college ) ) ( but ( it ( never worked ) ) ) ) ) (ROOT (S (VP (VB yeah) (S (S (NP (FW i)) (VP (VBP know))) (CC and) (S (NP (FW i)) (VP (VBD did) (SBAR (IN that) (S (S (NP (DT all)) (PP (IN through) (NP (NN college)))) (CC and) (S (NP (PRP it)) (VP (VBD worked) (ADVP (RB too)))))))))))) (ROOT (S (NP (PRP I)) (VP (VBD did) (ADVP (IN that) (DT all)) (PP (IN through) (NP (NN college))) (SBAR (CC but) (S (NP (PRP it)) (ADVP (RB never)) (VP (VBD worked))))))) yeah i know and i did that all through college and it worked too I did that all through college but it never worked contradiction contradiction contradiction contradiction contradiction contradiction +7 93958 93958n travel ( ( ( ( ( Calcutta ( seems ( to ( be ( ( the ( only ( other ( production center ) ) ) ) ( ( having ( any pretensions ) ) ( to ( ( artistic creativity ) ( at all ) ) ) ) ) ) ) ) ) , ) but ) ( ironically ( you ( ( 're actually ) ( ( more ( likely ( to ( see ( ( the works ) ( of ( ( ( Satyajit Ray ) or ) ( ( Mrinal Sen ) ( shown ( in ( Europe ( or ( North America ) ) ) ) ) ) ) ) ) ) ) ) ) ( than ( in ( India itself ) ) ) ) ) ) ) ) . ) ( ( Most ( of ( ( Mrinal ( Sen 's ) ) work ) ) ) ( ( can ( be ( found ( in ( European collections ) ) ) ) ) . ) ) (ROOT (S (S (NP (NNP Calcutta)) (VP (VBZ seems) (S (VP (TO to) (VP (VB be) (NP (NP (DT the) (JJ only) (JJ other) (NN production) (NN center)) (VP (VBG having) (NP (DT any) (NNS pretensions)) (PP (TO to) (NP (NP (JJ artistic) (NN creativity)) (ADVP (IN at) (DT all))))))))))) (, ,) (CC but) (S (ADVP (RB ironically)) (NP (PRP you)) (VP (VBP 're) (ADVP (RB actually)) (ADJP (ADJP (RBR more) (JJ likely) (S (VP (TO to) (VP (VB see) (NP (NP (DT the) (NNS works)) (PP (IN of) (NP (NP (NNP Satyajit) (NNP Ray)) (CC or) (NP (NP (NNP Mrinal) (NNP Sen)) (VP (VBN shown) (PP (IN in) (NP (NNP Europe) (CC or) (NNP North) (NNP America)))))))))))) (ADVP (IN than) (PP (IN in) (S (VP (VBG India) (NP (PRP itself))))))))) (. .))) (ROOT (S (NP (NP (JJS Most)) (PP (IN of) (NP (NP (NNP Mrinal) (NNP Sen) (POS 's)) (NN work)))) (VP (MD can) (VP (VB be) (VP (VBN found) (PP (IN in) (NP (JJ European) (NNS collections)))))) (. 
.))) Calcutta seems to be the only other production center having any pretensions to artistic creativity at all, but ironically you're actually more likely to see the works of Satyajit Ray or Mrinal Sen shown in Europe or North America than in India itself. Most of Mrinal Sen's work can be found in European collections. neutral neutral entailment neutral neutral neutral +8 12567 12567c slate ( ( If ( ( that investor ) ( were ( willing ( to ( pay ( extra ( for ( ( the security ) ( of ( limited downside ) ) ) ) ) ) ) ) ) ) ) ( , ( she ( ( could ( ( buy ( put options ) ) ( with ( ( a ( strike price ) ) ( of ( ( ( $ 98 ) , ) ( which ( would ( ( ( lock ( in ( ( her profit ) ( on ( ( the shares ) ( at ( $ 18 ) ) ) ) ) ) ) , ) ( less ( whatever ( ( the options ) cost ) ) ) ) ) ) ) ) ) ) ) ) . ) ) ) ) ( ( THe ( strike price ) ) ( ( could ( be ( $ 8 ) ) ) . ) ) (ROOT (S (SBAR (IN If) (S (NP (DT that) (NN investor)) (VP (VBD were) (ADJP (JJ willing) (S (VP (TO to) (VP (VB pay) (NP (NP (JJ extra)) (PP (IN for) (NP (NP (DT the) (NN security)) (PP (IN of) (NP (JJ limited) (NN downside))))))))))))) (, ,) (NP (PRP she)) (VP (MD could) (VP (VB buy) (NP (NN put) (NNS options)) (PP (IN with) (NP (NP (DT a) (NN strike) (NN price)) (PP (IN of) (NP (NP ($ $) (CD 98)) (, ,) (SBAR (WHNP (WDT which)) (S (VP (MD would) (VP (VB lock) (PP (IN in) (NP (NP (PRP$ her) (NN profit)) (PP (IN on) (NP (NP (DT the) (NNS shares)) (PP (IN at) (NP ($ $) (CD 18))))))) (, ,) (ADVP (ADVP (RBR less)) (SBAR (WHNP (WDT whatever)) (S (NP (DT the) (NNS options)) (VP (VBD cost))))))))))))))) (. .))) (ROOT (S (NP (NNP THe) (NN strike) (NN price)) (VP (MD could) (VP (VB be) (NP ($ $) (CD 8)))) (. .))) If that investor were willing to pay extra for the security of limited downside, she could buy put options with a strike price of $98, which would lock in her profit on the shares at $18, less whatever the options cost. THe strike price could be $8. contradiction contradiction contradiction contradiction contradiction contradiction +9 117487 117487n slate ( ( 3 -RRB- ) ( ( Dare ( you ( ( ( rise ( to ( ( ( ( the occasion ) , ) ( like Raskolnikov ) ) , ) ) ) and ) ( reject ( ( the ( petty rules ) ) ( that ( govern ( lesser men ) ) ) ) ) ) ) ) ? ) ) ( ( ( Would you ) ( ( ( rise up ) and ) ( defeaat ( ( all ( evil lords ) ) ( in ( the town ) ) ) ) ) ) ? ) (ROOT (S (LST (LS 3) (-RRB- -RRB-)) (VP (VB Dare) (S (NP (PRP you)) (VP (VP (VB rise) (PP (TO to) (NP (NP (DT the) (NN occasion)) (, ,) (PP (IN like) (NP (NNP Raskolnikov))) (, ,)))) (CC and) (VP (VB reject) (NP (NP (DT the) (JJ petty) (NNS rules)) (SBAR (WHNP (WDT that)) (S (VP (VBP govern) (NP (JJR lesser) (NNS men)))))))))) (. ?))) (ROOT (SQ (MD Would) (NP (PRP you)) (VP (VP (VB rise) (PRT (RP up))) (CC and) (VP (VB defeaat) (NP (NP (DT all) (JJ evil) (NNS lords)) (PP (IN in) (NP (DT the) (NN town)))))) (. ?))) 3) Dare you rise to the occasion, like Raskolnikov, and reject the petty rules that govern lesser men? Would you rise up and defeaat all evil lords in the town? neutral neutral neutral neutral neutral neutral +10 9616 9616c travel ( ( The ( ( most important ) directions ) ) ( ( ( are ( simply ( ( up and ) up ) ) ) ( ( ( ( ( ( ( ( leads eventually ) ( to ( the cathedral ) ) ) and ) ( fortress ( commanding ( the hilltop ) ) ) ) , ) and ) down ) ( inevitably ( ( leads ( to ( one ( of ( three gates ) ) ) ) ) ( through ( ( the wall ) ( to ( the ( new town ) ) ) ) ) ) ) ) ) . 
) ) ( Go ( ( downwards ( to ( one ( of ( ( ( the gates ) , ) ( ( all ( of which ) ) ( will ( ( lead you ) ( into ( the cathedral ) ) ) ) ) ) ) ) ) ) . ) ) (ROOT (S (NP (DT The) (ADJP (RBS most) (JJ important)) (NNS directions)) (VP (VBP are) (PRN (ADVP (RB simply)) (ADVP (RB up) (CC and) (RB up))) (VP (VP (VBZ leads) (ADVP (RB eventually)) (PP (TO to) (NP (DT the) (NN cathedral)))) (CC and) (VP (VBZ fortress) (NP (JJ commanding) (DT the) (NN hilltop))) (, ,) (CC and) (ADVP (RB down)) (VP (ADVP (RB inevitably)) (VBZ leads) (PP (TO to) (NP (NP (CD one)) (PP (IN of) (NP (CD three) (NNS gates))))) (PP (IN through) (NP (NP (DT the) (NN wall)) (PP (TO to) (NP (DT the) (JJ new) (NN town)))))))) (. .))) (ROOT (S (NP (NNP Go)) (VP (VBZ downwards) (PP (TO to) (NP (NP (CD one)) (PP (IN of) (NP (NP (DT the) (NNS gates)) (, ,) (SBAR (WHNP (DT all) (WHPP (IN of) (WHNP (WDT which)))) (S (VP (MD will) (VP (VB lead) (NP (PRP you)) (PP (IN into) (NP (DT the) (NN cathedral)))))))))))) (. .))) The most important directions are simply up and up leads eventually to the cathedral and fortress commanding the hilltop, and down inevitably leads to one of three gates through the wall to the new town. Go downwards to one of the gates, all of which will lead you into the cathedral. contradiction contradiction entailment contradiction contradiction contradiction diff --git a/test/io/test_data_loader.py b/test/io/test_data_loader.py new file mode 100644 index 00000000..5b1bb749 --- /dev/null +++ b/test/io/test_data_loader.py @@ -0,0 +1,15 @@ +import unittest + +from fastNLP.core.const import Const +from fastNLP.io.data_loader import MNLILoader + + +class TestDataLoader(unittest.TestCase): + + def test_mnli_loader(self): + ds = MNLILoader().process('test/data_for_tests/sample_mnli.tsv', + to_lower=True, get_index=True, seq_len_type='mask') + self.assertTrue('train' in ds.datasets) + self.assertTrue(len(ds.datasets) == 1) + self.assertTrue(len(ds.datasets['train']) == 11) + self.assertTrue(isinstance(ds.datasets['train'][0][Const.INPUT_LENS(0)], list)) diff --git a/test/models/test_snli.py b/test/models/test_snli.py new file mode 100644 index 00000000..7a588a4c --- /dev/null +++ b/test/models/test_snli.py @@ -0,0 +1,9 @@ +import unittest +from .model_runner import * +from fastNLP.models.snli import ESIM + + +class TestSNLIModel(unittest.TestCase): + def test_snli(self): + model = ESIM((VOCAB_SIZE, 10), num_labels=NUM_CLS, dropout_rate=0) + RUNNER.run_model_with_task(NLI, model) From 4718804e2208e6642d8e2440ddcfa9998296c3e9 Mon Sep 17 00:00:00 2001 From: yh Date: Fri, 19 Jul 2019 17:33:41 +0800 Subject: [PATCH 006/153] =?UTF-8?q?1.=20=E4=BF=AE=E6=94=B9=E5=86=85?= =?UTF-8?q?=E9=83=A8=E5=87=BD=E6=95=B0=E4=BD=8D=E7=BD=AE;=202.=E4=BF=AE?= =?UTF-8?q?=E6=94=B9BertWordPieceEncoder=E7=9A=84=E9=83=A8=E5=88=86?= =?UTF-8?q?=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/_parallel_utils.py | 14 ++++++++++++++ fastNLP/core/tester.py | 2 +- fastNLP/core/trainer.py | 3 ++- fastNLP/core/utils.py | 12 ------------ fastNLP/embeddings/bert_embedding.py | 25 +++++++++++++++++-------- fastNLP/embeddings/char_embedding.py | 4 ++-- fastNLP/embeddings/embedding.py | 7 ++++++- fastNLP/modules/encoder/bert.py | 11 ++++++----- 8 files changed, 48 insertions(+), 30 deletions(-) diff --git a/fastNLP/core/_parallel_utils.py b/fastNLP/core/_parallel_utils.py index 4a7757d3..6b24d9f9 100644 --- a/fastNLP/core/_parallel_utils.py +++ b/fastNLP/core/_parallel_utils.py @@ -1,6 
+1,7 @@ import threading import torch +from torch import nn from torch.nn.parallel.parallel_apply import get_a_var from torch.nn.parallel.scatter_gather import scatter_kwargs, gather @@ -86,3 +87,16 @@ def _data_parallel_wrapper(func_name, device_ids, output_device): outputs = parallel_apply(replicas, func_name, inputs, kwargs, device_ids[:len(replicas)]) return gather(outputs, output_device) return wrapper + + +def _model_contains_inner_module(model): + """ + + :param nn.Module model: 模型文件,判断是否内部包含model.module, 多用于check模型是否是nn.DataParallel, + nn.parallel.DistributedDataParallel。主要是在做形参匹配的时候需要使用最内部的model的function。 + :return: bool + """ + if isinstance(model, nn.Module): + if isinstance(model, (nn.DataParallel, nn.parallel.DistributedDataParallel)): + return True + return False \ No newline at end of file diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index 3d672ccc..067ff30c 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -47,7 +47,7 @@ from .utils import _get_func_signature from .utils import _get_model_device from .utils import _move_model_to_device from ._parallel_utils import _data_parallel_wrapper -from .utils import _model_contains_inner_module +from fastNLP.core._parallel_utils import _model_contains_inner_module from functools import partial __all__ = [ diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 09e8a437..4ec3d0f4 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -352,7 +352,8 @@ from .utils import _move_dict_value_to_device from .utils import _get_func_signature from .utils import _get_model_device from .utils import _move_model_to_device -from .utils import _model_contains_inner_module +from fastNLP.core._parallel_utils import _model_contains_inner_module + class Trainer(object): """ diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index b849687b..8483f9f2 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -187,18 +187,6 @@ def _save_model(model, model_name, save_dir, only_param=False): torch.save(model, model_path) model.to(_model_device) -def _model_contains_inner_module(model): - """ - - :param nn.Module model: 模型文件,判断是否内部包含model.module, 多用于check模型是否是nn.DataParallel, - nn.parallel.DistributedDataParallel。主要是在做形参匹配的时候需要使用最内部的model的function。 - :return: bool - """ - if isinstance(model, nn.Module): - if isinstance(model, (nn.DataParallel, nn.parallel.DistributedDataParallel)): - return True - return False - def _move_model_to_device(model, device): """ 将model移动到device diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index 010b464d..21944570 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -129,14 +129,14 @@ class BertWordPieceEncoder(nn.Module): def __init__(self, model_dir_or_name: str='en-base-uncased', layers: str='-1', pooled_cls: bool = False, requires_grad: bool=False): super().__init__() - PRETRAIN_URL = _get_base_url('bert') if model_dir_or_name in PRETRAINED_BERT_MODEL_DIR: + PRETRAIN_URL = _get_base_url('bert') model_name = PRETRAINED_BERT_MODEL_DIR[model_dir_or_name] model_url = PRETRAIN_URL + model_name model_dir = cached_path(model_url) # 检查是否存在 - elif os.path.isdir(model_dir_or_name): + elif os.path.isdir(os.path.expanduser(os.path.abspath(model_dir_or_name))): model_dir = model_dir_or_name else: raise ValueError(f"Cannot recognize {model_dir_or_name}.") @@ -166,16 +166,25 @@ class BertWordPieceEncoder(nn.Module): def embed_size(self): return self._embed_size - def 
index_datasets(self, *datasets, field_name): + @property + def embedding_dim(self): + return self._embed_size + + @property + def num_embedding(self): + return self.model.encoder.config.vocab_size + + def index_datasets(self, *datasets, field_name, add_cls_sep=True): """ - 使用bert的tokenizer新生成word_pieces列加入到datasets中,并将他们设置为input。如果首尾不是 - [CLS]与[SEP]会在首尾额外加入[CLS]与[SEP], 且将word_pieces这一列的pad value设置为了bert的pad value。 + 使用bert的tokenizer新生成word_pieces列加入到datasets中,并将他们设置为input,且将word_pieces这一列的pad value设置为了 + bert的pad value。 - :param datasets: DataSet对象 - :param field_name: 基于哪一列的内容生成word_pieces列。这一列中每个数据应该是List[str]的形式。 + :param DataSet datasets: DataSet对象 + :param str field_name: 基于哪一列的内容生成word_pieces列。这一列中每个数据应该是List[str]的形式。 + :param bool add_cls_sep: 如果首尾不是[CLS]与[SEP]会在首尾额外加入[CLS]与[SEP]。 :return: """ - self.model.index_dataset(*datasets, field_name=field_name) + self.model.index_dataset(*datasets, field_name=field_name, add_cls_sep=add_cls_sep) def forward(self, word_pieces, token_type_ids=None): """ diff --git a/fastNLP/embeddings/char_embedding.py b/fastNLP/embeddings/char_embedding.py index b9e6659e..b670313e 100644 --- a/fastNLP/embeddings/char_embedding.py +++ b/fastNLP/embeddings/char_embedding.py @@ -92,7 +92,7 @@ class CNNCharEmbedding(TokenEmbedding): for i in range(len(kernel_sizes))]) self._embed_size = embed_size self.fc = nn.Linear(sum(filter_nums), embed_size) - self.init_param() + self.reset_parameters() def forward(self, words): """ @@ -149,7 +149,7 @@ class CNNCharEmbedding(TokenEmbedding): continue param.requires_grad = value - def init_param(self): + def reset_parameters(self): for name, param in self.named_parameters(): if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能reset continue diff --git a/fastNLP/embeddings/embedding.py b/fastNLP/embeddings/embedding.py index 111bacd0..a9f228fb 100644 --- a/fastNLP/embeddings/embedding.py +++ b/fastNLP/embeddings/embedding.py @@ -41,7 +41,12 @@ class Embedding(nn.Module): self.dropout = nn.Dropout(dropout) if not isinstance(self.embed, TokenEmbedding): - self._embed_size = self.embed.weight.size(1) + if hasattr(self, 'embed_size'): + self._embed_size = self.embed.embed_size + elif hasattr(self, 'embedding_dim'): + self._embed_size = self.embed.embedding_dim + else: + self._embed_size = self.embed.weight.size(1) if word_dropout>0 and not isinstance(unk_index, int): raise ValueError("When drop word is set, you need to pass in the unk_index.") else: diff --git a/fastNLP/modules/encoder/bert.py b/fastNLP/modules/encoder/bert.py index 1bd810a8..9a990d9d 100644 --- a/fastNLP/modules/encoder/bert.py +++ b/fastNLP/modules/encoder/bert.py @@ -871,7 +871,7 @@ class _WordPieceBertModel(nn.Module): self._wordpiece_pad_index = self.tokenzier.vocab['[PAD]'] # 需要用于生成word_piece self.pooled_cls = pooled_cls - def index_dataset(self, *datasets, field_name): + def index_dataset(self, *datasets, field_name, add_cls_sep=True): """ 使用bert的tokenizer新生成word_pieces列加入到datasets中,并将他们设置为input。如果首尾不是 [CLS]与[SEP]会在首尾额外加入[CLS]与[SEP], 且将word_pieces这一列的pad value设置为了bert的pad value。 @@ -887,10 +887,11 @@ class _WordPieceBertModel(nn.Module): tokens = self.tokenzier.wordpiece_tokenizer.tokenize(word) word_piece_ids = self.tokenzier.convert_tokens_to_ids(tokens) word_pieces.extend(word_piece_ids) - if word_pieces[0] != self._cls_index: - word_pieces.insert(0, self._cls_index) - if word_pieces[-1] != self._sep_index: - word_pieces.insert(-1, self._sep_index) + if add_cls_sep: + if word_pieces[0] != self._cls_index: + word_pieces.insert(0, 
self._cls_index) + if word_pieces[-1] != self._sep_index: + word_pieces.insert(-1, self._sep_index) return word_pieces for index, dataset in enumerate(datasets): From 861f5387a4125d1847c133596f75f1b71b03d2b0 Mon Sep 17 00:00:00 2001 From: yunfan Date: Fri, 19 Jul 2019 22:40:26 +0800 Subject: [PATCH 007/153] [add] very first version of distributed trainer --- fastNLP/core/callback.py | 12 ++ fastNLP/core/dist_trainer.py | 302 +++++++++++++++++++++++++++++++++ test/core/test_dist_trainer.py | 110 ++++++++++++ 3 files changed, 424 insertions(+) create mode 100644 fastNLP/core/dist_trainer.py create mode 100644 test/core/test_dist_trainer.py diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py index 874d0ad9..cf3b158c 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -250,6 +250,14 @@ class Callback(object): :return: """ pass + + def on_validation(self): + """ + 如果Trainer中设置了验证,则会在每次需要验证时调用该函数 + + :return: + """ + pass def on_epoch_end(self): """ @@ -352,6 +360,10 @@ class CallbackManager(Callback): @_transfer def on_valid_end(self, eval_result, metric_key, optimizer, is_better_eval): pass + + @_transfer + def on_validation(self): + pass @_transfer def on_epoch_end(self): diff --git a/fastNLP/core/dist_trainer.py b/fastNLP/core/dist_trainer.py new file mode 100644 index 00000000..1d782733 --- /dev/null +++ b/fastNLP/core/dist_trainer.py @@ -0,0 +1,302 @@ +import torch +import torch.cuda +import torch.optim +import torch.distributed as dist +from torch.utils.data.distributed import DistributedSampler +from torch.nn.parallel import DistributedDataParallel as DDP +import os +from tqdm import tqdm +import logging +import time +from datetime import datetime, timedelta + +from .batch import DataSetIter, BatchIter +from .callback import CallbackManager, CallbackException +from .dataset import DataSet +from .losses import _prepare_losser +from .optimizer import Optimizer +from .utils import _build_args +from .utils import _move_dict_value_to_device +from .utils import _get_func_signature + +__all__ = [ + 'get_local_rank', + 'DistTrainer', +] + + +def get_local_rank(): + if 'LOCAL_RANK' in os.environ: + return int(os.environ['LOCAL_RANK']) + from argparse import ArgumentParser + parser = ArgumentParser() + parser.add_argument('--local_rank', type=int) + args, _ = parser.parse_known_args() + if 'local_rank' in args and args.local_rank: + os.environ['LOCAL_RANK'] = str(args.local_rank) # for multiple calls for this function + return args.local_rank + raise RuntimeError('Please use "python -m torch.distributed.launch train_script.py') + + +class DistTrainer(): + def __init__(self, model, train_data, optimizer, loss, callbacks=None, + batch_size_per_gpu=8, n_epochs=1, + num_workers=1, drop_last=False, + update_every=1, print_every=10, validate_every=-1, + save_every=-1, save_path=None, + logging_level=logging.INFO, + fp16='', backend='nccl', init_method=None): + self.model = model + self.train_data = train_data + self.batch_size_per_gpu = int(batch_size_per_gpu) + self.n_epochs = int(n_epochs) + self.num_workers = int(num_workers) + self.drop_last = drop_last + self.update_every = int(update_every) + self.print_every = int(print_every) + self.validate_every = int(validate_every) + self.save_every = int(save_every) + self.save_path = save_path + self.losser = _prepare_losser(loss) + self.fp16 = fp16 + self.init_method = init_method + self.backend = backend + self.local_rank = get_local_rank() + self.callback_manager = CallbackManager(env={"trainer": self}, 
callbacks=callbacks) + self._forward_func = model.forward + + assert torch.cuda.is_available(), "Distributed Trainer requires cuda to be enabled." + # init distributed + torch.cuda.set_device(self.local_rank) + self.device = torch.device("cuda", self.local_rank) + dist.init_process_group(backend=self.backend, init_method=self.init_method) + model.to(self.device) + optimizer = self.get_optimizer(optimizer) + + # init fp16, must before DataParallel init + if len(self.fp16): + assert isinstance(self.fp16, str), "Please set Apex AMP optimization level selected in ['O0', 'O1', 'O2', 'O3']" + try: + from apex import amp + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") + assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled." + model, optimizer = amp.initialize(model, optimizer, opt_level=self.fp16) + + # init DataParallel + self.model = DDP(model, device_ids=[self.local_rank], + output_device=self.local_rank) + self.optimizer = optimizer + self.world_size = dist.get_world_size() + self.rank = dist.get_rank() # unique id for each process + self.sampler = DistributedSampler(self.train_data) + self.data_iterator = self.get_data_iter(self.train_data) + self.n_steps = self.get_n_steps() + + # Setup logging + logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%m/%d/%Y %H:%M:%S', + level=logging_level) + self.logger = logging.getLogger(__name__) + self.logger.info("Process pid: {}, rank: {}, local rank: {}, device: {}, fp16: {}".format( + os.getpid(), self.rank, self.local_rank, self.device, self.fp16 if self.fp16 else False)) + if self.is_master: + self.logger.info('Total epochs: %d'% self.n_epochs) + self.logger.info('Total steps: %d'% self.n_steps) + self.logger.info('Num instances per GPU %d'% self.batch_size_per_gpu) + self.logger.info('Total batch_size: %d'% self.batch_size_per_gpu * dist.get_world_size()) + self.logger.info('Total num of samples: %d'% len(self.train_data)) + self.logger.info("Num of callbacks: {}".format(len(self.callback_manager.callbacks))) + self.logger.info( + "Use callbacks: {}".format([repr(cb) for cb in self.callback_manager.callbacks])) + + # only master process save model + if self.save_path: + self.save_path = os.path.join( + self.save_path, + datetime.now().strftime('%m_%d_%y-%H_%M_%S')+'-'+str(os.getpid())) + + def get_n_steps(self): + batch_size = self.world_size * self.batch_size_per_gpu + return (len(self.train_data) // batch_size + int( + len(self.train_data) % batch_size != 0)) * int(self.drop_last == 0) * self.n_epochs + + def get_data_iter(self, dataset): + if isinstance(dataset, DataSet): + return DataSetIter( + dataset=dataset, batch_size=self.batch_size_per_gpu, + num_workers=self.num_workers, sampler=self.sampler, + drop_last=self.drop_last + ) + elif isinstance(dataset, BatchIter): + return dataset + else: + raise TypeError("train_data type {} not support".format(type(dataset))) + + def get_optimizer(self, optimizer): + if isinstance(optimizer, torch.optim.Optimizer): + return optimizer + elif isinstance(optimizer, Optimizer): + return optimizer.construct_from_pytorch(self.model.parameters()) + elif optimizer is None: + return torch.optim.Adam(self.model.parameters(), lr=4e-3) + else: + raise TypeError("optimizer can only be torch.optim.Optimizer type, not {}.".format(type(optimizer))) + + @property + def is_master(self): + return self.rank == 0 + + def train(self, on_exception='auto'): + start_time = 
time.time() + results = {} + if self.n_epochs <= 0: + if self.is_master: + self.logger.info("Training epoch is {}, nothing was done.".format(self.n_epochs)) + results['seconds'] = 0. + return results + + if self.is_master: + self.logger.info("###### Training epochs started ######") + + try: + self.callback_manager.on_train_begin() + self._train() + self.callback_manager.on_train_end() + + except BaseException as e: + self.callback_manager.on_exception(e) + if on_exception == 'auto': + if not isinstance(e, (CallbackException, KeyboardInterrupt)): + raise e + else: + self.logger.info('Catch {}, ignored.'.format(e.__class__.__name__)) + elif on_exception == 'raise': + raise e + + results['seconds'] = round(time.time() - start_time, 2) + if self.is_master: + self.logger.info("###### Train finished ######") + self.logger.info('Total train time: {} seconds.'. format(results['seconds'])) + return results + + def _train(self): + if self.fp16: + # skip check, done in __init__() + from apex import amp + self.step = 0 + self.epoch = 0 + self.pbar = tqdm(total=self.n_steps, postfix='loss:{0:<6.5f}', + leave=False, dynamic_ncols=True, disable=not self.is_master) + pbar = self.pbar + avg_loss = 0 + data_iterator = self.data_iterator + self.model.zero_grad() + for epoch in range(1, self.n_epochs + 1): + self.epoch = epoch + pbar.set_description_str(desc="Epoch {}/{}".format(epoch, self.n_epochs)) + # early stopping + self.callback_manager.on_epoch_begin() + for batch_x, batch_y in data_iterator: + self.model.train() + self.step += 1 + _move_dict_value_to_device(batch_x, batch_y, device=self.device) + indices = data_iterator.get_batch_indices() + # negative sampling; replace unknown; re-weight batch_y + self.callback_manager.on_batch_begin(batch_x, batch_y, indices) + prediction = self._data_forward(self.model, batch_x) + + # edit prediction + self.callback_manager.on_loss_begin(batch_y, prediction) + loss = self._compute_loss(prediction, batch_y) + avg_loss += loss.item() + + # Is loss NaN or inf? requires_grad = False + self.callback_manager.on_backward_begin(loss) + + if self.fp16: + with amp.scale_loss(loss, self.optimizer) as scale_loss: + scale_loss.backward() + else: + loss.backward() + + self.callback_manager.on_backward_end() + + self._update() + self.callback_manager.on_step_end() + + if self.step % self.print_every == 0: + avg_loss = float(avg_loss) / self.print_every + print_output = "loss:{:<6.5f}".format(avg_loss) + pbar.update(self.print_every) + pbar.set_postfix_str(print_output) + avg_loss = 0 + + self.callback_manager.on_batch_end() + + if ((self.validate_every > 0 and self.step % self.validate_every == 0) or + (self.validate_every < 0 and self.step % len(data_iterator) == 0)): + eval_str = "Evaluation at Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step, + self.n_steps) + if self.is_master: + self.logger.info(eval_str) + self.callback_manager.on_validation() + dist.barrier() + + if self.save_path and \ + self.save_every > 0 and \ + self.step % self.save_every == 0: + self.save_check_point() + + # ================= mini-batch end ==================== # + if self.save_path and self.save_every < 0: + self.save_check_point() + # lr decay; early stopping + self.callback_manager.on_epoch_end() + # =============== epochs end =================== # + pbar.close() + self.pbar = None + # ============ tqdm end ============== # + + def _update(self): + """Perform weight update on a model. 
+ + """ + if self.step % self.update_every == 0: + self.optimizer.step() + self.model.zero_grad() + + def _data_forward(self, network, x): + x = _build_args(self._forward_func, **x) + y = network(**x) + if not isinstance(y, dict): + raise TypeError( + f"The return value of {_get_func_signature(self._forward_func)} should be dict, got {type(y)}.") + return y + + def _compute_loss(self, predict, truth): + """Compute loss given prediction and ground truth. + + :param predict: prediction dict, produced by model.forward + :param truth: ground truth dict, produced by batch_y + :return: a scalar + """ + loss = self.losser(predict, truth) + if self.update_every > 1: + loss = loss / self.update_every + return loss.mean() + + def save_check_point(self, only_params=False): + if self.is_master: + if not os.path.exists(self.save_path): + os.makedirs(self.save_path) + path = os.path.join(self.save_path, 'checkpoint-{}.bin'.format(self.step)) + self.logger.info("Save checkpoint to {}".format(path)) + model_to_save = self.model.module + if only_params: + model_to_save = model_to_save.state_dict() + torch.save(model_to_save, path) + dist.barrier() + + def close(self): + dist.destroy_process_group() diff --git a/test/core/test_dist_trainer.py b/test/core/test_dist_trainer.py new file mode 100644 index 00000000..59be35c6 --- /dev/null +++ b/test/core/test_dist_trainer.py @@ -0,0 +1,110 @@ +import unittest + +import numpy as np +import torch.cuda +from fastNLP import DataSet +from fastNLP import Instance +from fastNLP import CrossEntropyLoss +from fastNLP import SGD +from fastNLP.core.dist_trainer import DistTrainer, get_local_rank +from fastNLP.models.base_model import NaiveClassifier +import shutil +import os +import subprocess +from argparse import ArgumentParser + +def prepare_fake_dataset(): + mean = np.array([-3, -3]) + cov = np.array([[1, 0], [0, 1]]) + class_A = np.random.multivariate_normal(mean, cov, size=(1000,)) + + mean = np.array([3, 3]) + cov = np.array([[1, 0], [0, 1]]) + class_B = np.random.multivariate_normal(mean, cov, size=(1000,)) + + data_set = DataSet([Instance(x=[float(item[0]), float(item[1])], y=0) for item in class_A] + + [Instance(x=[float(item[0]), float(item[1])], y=1) for item in class_B]) + return data_set + +def prepare_fake_dataset2(*args, size=100): + ys = np.random.randint(4, size=100, dtype=np.int64) + data = {'y': ys} + for arg in args: + data[arg] = np.random.randn(size, 5) + return DataSet(data=data) + +def set_rng_seed(seed): + np.random.seed(seed) + +class TestDistTrainer(unittest.TestCase): + save_path = './save_cp' + + def run1(self): + # test distributed training + print('local rank', get_local_rank()) + set_rng_seed(100) + data_set = prepare_fake_dataset() + data_set.set_input("x", flag=True) + data_set.set_target("y", flag=True) + + model = NaiveClassifier(2, 2) + + trainer = DistTrainer( + model=model, train_data=data_set, optimizer=SGD(lr=0.1), + loss=CrossEntropyLoss(pred="predict", target="y"), + batch_size_per_gpu=8, n_epochs=3, print_every=50, save_path=self.save_path, + ) + trainer.train() + """ + # 应该正确运行 + """ + if trainer.is_master and os.path.exists(self.save_path): + shutil.rmtree(self.save_path) + + def run2(self): + # test fp16 with distributed training + print('local rank', get_local_rank()) + set_rng_seed(100) + data_set = prepare_fake_dataset() + data_set.set_input("x", flag=True) + data_set.set_target("y", flag=True) + + model = NaiveClassifier(2, 2) + + trainer = DistTrainer( + model=model, train_data=data_set, optimizer=SGD(lr=0.1), + 
loss=CrossEntropyLoss(pred="predict", target="y"), + batch_size_per_gpu=8, n_epochs=3, print_every=50, save_path=self.save_path, + fp16='O1' + ) + trainer.train() + """ + # 应该正确运行 + """ + if trainer.is_master and os.path.exists(self.save_path): + shutil.rmtree(self.save_path) + + def run_dist(self, run_id): + if torch.cuda.is_available(): + ngpu = min(4, torch.cuda.device_count()) + path = __file__ + cmd = ['python', '-m', 'torch.distributed.launch', + '--nproc_per_node', str(ngpu), path, '--test', str(run_id)] + print(' '.join(cmd)) + retcode = subprocess.call(cmd) + if retcode: + raise RuntimeError('subprocess got non-zero exit status %d' % retcode) + + def test1(self): + self.run_dist(1) + + def test2(self): + self.run_dist(2) + +if __name__ == '__main__': + runner = TestDistTrainer() + parser = ArgumentParser() + parser.add_argument('--test', type=int) + args, _ = parser.parse_known_args() + if args.test and hasattr(runner, 'run%s'%args.test): + getattr(runner, 'run%s'%args.test)() From 606d63a5a4d3a3d9bc37b4a39ba72939163b15ca Mon Sep 17 00:00:00 2001 From: yunfan Date: Sat, 20 Jul 2019 16:18:18 +0800 Subject: [PATCH 008/153] [update] distributed trainer --- fastNLP/core/callback.py | 54 ++++++++++- fastNLP/core/dist_trainer.py | 169 +++++++++++++++++++-------------- fastNLP/core/trainer.py | 57 +++++------ test/core/test_dist_trainer.py | 47 +++++++-- 4 files changed, 218 insertions(+), 109 deletions(-) diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py index cf3b158c..dd493567 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -100,7 +100,8 @@ class Callback(object): def __init__(self): super(Callback, self).__init__() self._trainer = None # 在Trainer内部被重新赋值 - + self._disabled = False + @property def trainer(self): """ @@ -158,6 +159,14 @@ class Callback(object): def batch_per_epoch(self): """每个epoch一共有多少个batch,只有在on_epoch_begin之后才能调用该属性。""" return self._trainer.batch_per_epoch + + @property + def is_master(self): + return self._trainer.is_master() + + @property + def disabled(self): + return self._disabled def on_train_begin(self): """ @@ -289,6 +298,8 @@ def _transfer(func): def wrapper(manager, *arg): returns = [] for callback in manager.callbacks: + if callback.disabled: + continue returns.append(getattr(callback, func.__name__)(*arg)) return returns @@ -320,7 +331,7 @@ class CallbackManager(Callback): for env_name, env_val in env.items(): for callback in self.callbacks: setattr(callback, '_' + env_name, env_val) # Callback.trainer - + @_transfer def on_train_begin(self): pass @@ -378,6 +389,24 @@ class CallbackManager(Callback): pass +class DistCallbackManager(CallbackManager): + def __init__(self, env, callbacks_all=None, callbacks_master=None): + assert 'trainer' in env + is_master = env['trainer'].is_master + self.patch_callback(callbacks_master, disabled=not is_master) + self.callbacks_all = CallbackManager(env, callbacks_all).callbacks + self.callbacks_master = CallbackManager(env, callbacks_master).callbacks + self.callbacks = self.callbacks_all + self.callbacks_master + + def patch_callback(self, callbacks, disabled): + if not callbacks: + return + if not isinstance(callbacks, (list, tuple)): + callbacks = [callbacks] + for cb in callbacks: + cb._disabled = disabled + + class GradientClipCallback(Callback): """ 别名::class:`fastNLP.GradientClipCallback` :class:`fastNLP.core.callback.GradientClipCallback` @@ -415,6 +444,9 @@ class GradientClipCallback(Callback): def on_backward_end(self): if self.step%self.update_every==0: if self.parameters 
is None: + if getattr(self.trainer, 'fp16', default=''): + from apex import amp + self.clip_fun(amp.master_params(self.optimizer), self.clip_value) self.clip_fun(self.model.parameters(), self.clip_value) else: self.clip_fun(self.parameters, self.clip_value) @@ -896,3 +928,21 @@ class EarlyStopError(CallbackException): def __init__(self, msg): super(EarlyStopError, self).__init__(msg) + + +class EchoCallback(Callback): + def __init__(self, name, out=sys.stdout): + super(EchoCallback, self).__init__() + self.name = name + self.out = out + + def __getattribute__(self, item): + if item.startswith('on_'): + print('{}.{} has been called at pid: {}'.format(self.name, item, os.getpid()), + file=self.out) + return super(EchoCallback, self).__getattribute__(item) + + +class TesterCallback(Callback): + def __init__(self, data, model, metrics, batch_size=16, num_workers=None): + self.tester = Tester(data, model) diff --git a/fastNLP/core/dist_trainer.py b/fastNLP/core/dist_trainer.py index 1d782733..700dcf38 100644 --- a/fastNLP/core/dist_trainer.py +++ b/fastNLP/core/dist_trainer.py @@ -11,7 +11,7 @@ import time from datetime import datetime, timedelta from .batch import DataSetIter, BatchIter -from .callback import CallbackManager, CallbackException +from .callback import DistCallbackManager, CallbackException from .dataset import DataSet from .losses import _prepare_losser from .optimizer import Optimizer @@ -39,18 +39,36 @@ def get_local_rank(): class DistTrainer(): - def __init__(self, model, train_data, optimizer, loss, callbacks=None, + def __init__(self, train_data, model, optimizer=None, loss=None, + callbacks_all=None, callbacks_master=None, batch_size_per_gpu=8, n_epochs=1, - num_workers=1, drop_last=False, + num_data_workers=1, drop_last=False, update_every=1, print_every=10, validate_every=-1, - save_every=-1, save_path=None, - logging_level=logging.INFO, - fp16='', backend='nccl', init_method=None): + save_every=-1, save_path=None, device='auto', + fp16='', backend=None, init_method=None): + + assert device in ['auto', 'cuda', 'cpu'], "Please set correct device in [auto', 'cuda', 'cpu']" + if device == 'auto': + device = 'cuda' if torch.cuda.is_available() else 'cpu' + if backend is None: + backend = 'nccl' if device == 'cuda' else 'gloo' + + # init distributed + if device == 'cuda': + torch.cuda.set_device(get_local_rank()) + self.device = torch.device("cuda", get_local_rank()) + else: + self.device = torch.device(device) + + dist.init_process_group(backend=backend, init_method=init_method) + self.world_size = dist.get_world_size() + self.rank = dist.get_rank() # unique id for each process + self.model = model self.train_data = train_data self.batch_size_per_gpu = int(batch_size_per_gpu) self.n_epochs = int(n_epochs) - self.num_workers = int(num_workers) + self.num_data_workers = int(num_data_workers) self.drop_last = drop_last self.update_every = int(update_every) self.print_every = int(print_every) @@ -62,16 +80,13 @@ class DistTrainer(): self.init_method = init_method self.backend = backend self.local_rank = get_local_rank() - self.callback_manager = CallbackManager(env={"trainer": self}, callbacks=callbacks) self._forward_func = model.forward + self.callback_manager = DistCallbackManager( + env={"trainer": self}, callbacks_all=callbacks_all, + callbacks_master=callbacks_master) - assert torch.cuda.is_available(), "Distributed Trainer requires cuda to be enabled." 
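
The revised DistTrainer.__init__ above drops the hard CUDA requirement: device='auto' selects cuda with the nccl backend when a GPU is visible and falls back to cpu with gloo otherwise, and callbacks are now split into callbacks_all (run on every process) and callbacks_master (run on rank 0 only). A minimal usage sketch of this constructor, reusing the toy data and NaiveClassifier from test/core/test_dist_trainer.py and assuming the script is started with "python -m torch.distributed.launch --nproc_per_node=2 train_script.py" (train_script.py being the file holding this code; the callback names 'all' and 'master' are illustrative only):

import numpy as np
from fastNLP import DataSet, Instance, CrossEntropyLoss, SGD
from fastNLP.core.dist_trainer import DistTrainer
from fastNLP.core.callback import EchoCallback
from fastNLP.models.base_model import NaiveClassifier

# toy two-class dataset, mirroring prepare_fake_dataset() in the test file
class_A = np.random.multivariate_normal([-3, -3], np.eye(2), size=(100,))
class_B = np.random.multivariate_normal([3, 3], np.eye(2), size=(100,))
data_set = DataSet([Instance(x=[float(p[0]), float(p[1])], y=0) for p in class_A] +
                   [Instance(x=[float(p[0]), float(p[1])], y=1) for p in class_B])
data_set.set_input("x")
data_set.set_target("y")

trainer = DistTrainer(
    train_data=data_set, model=NaiveClassifier(2, 2), optimizer=SGD(lr=0.1),
    loss=CrossEntropyLoss(pred="predict", target="y"),
    callbacks_all=[EchoCallback('all')],        # echoes hooks on every process
    callbacks_master=[EchoCallback('master')],  # echoes hooks on rank 0 only
    batch_size_per_gpu=32, n_epochs=3, device='auto')
trainer.train()
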
- # init distributed - torch.cuda.set_device(self.local_rank) - self.device = torch.device("cuda", self.local_rank) - dist.init_process_group(backend=self.backend, init_method=self.init_method) model.to(self.device) - optimizer = self.get_optimizer(optimizer) + optimizer = self._get_optimizer(optimizer) # init fp16, must before DataParallel init if len(self.fp16): @@ -81,51 +96,48 @@ class DistTrainer(): except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled." + assert device == 'cuda', "Amp requires cuda device" model, optimizer = amp.initialize(model, optimizer, opt_level=self.fp16) # init DataParallel self.model = DDP(model, device_ids=[self.local_rank], output_device=self.local_rank) self.optimizer = optimizer - self.world_size = dist.get_world_size() - self.rank = dist.get_rank() # unique id for each process self.sampler = DistributedSampler(self.train_data) - self.data_iterator = self.get_data_iter(self.train_data) - self.n_steps = self.get_n_steps() + self.data_iterator = self._get_data_iter(self.train_data) + self.n_steps = self._get_n_steps() # Setup logging + dist.barrier() + self.start_time = datetime.now().strftime('%m_%d_%Y-%H_%M') + if self.save_path: + self.cp_save_path = os.path.join(self.save_path, 'checkpoints', self.start_time) + else: + self.cp_save_path = None + + # use INFO in the master, WARN for others logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', - level=logging_level) + level=logging.INFO if self.is_master else logging.WARN) self.logger = logging.getLogger(__name__) - self.logger.info("Process pid: {}, rank: {}, local rank: {}, device: {}, fp16: {}".format( + self.logger.info("Setup Distributed Trainer") + self.logger.warning("Process pid: {}, rank: {}, local rank: {}, device: {}, fp16: {}".format( os.getpid(), self.rank, self.local_rank, self.device, self.fp16 if self.fp16 else False)) - if self.is_master: - self.logger.info('Total epochs: %d'% self.n_epochs) - self.logger.info('Total steps: %d'% self.n_steps) - self.logger.info('Num instances per GPU %d'% self.batch_size_per_gpu) - self.logger.info('Total batch_size: %d'% self.batch_size_per_gpu * dist.get_world_size()) - self.logger.info('Total num of samples: %d'% len(self.train_data)) - self.logger.info("Num of callbacks: {}".format(len(self.callback_manager.callbacks))) - self.logger.info( - "Use callbacks: {}".format([repr(cb) for cb in self.callback_manager.callbacks])) - - # only master process save model - if self.save_path: - self.save_path = os.path.join( - self.save_path, - datetime.now().strftime('%m_%d_%y-%H_%M_%S')+'-'+str(os.getpid())) + self.logger.info("Num of processes: {}".format(self.world_size)) + self.logger.info("Use device: {}".format(device)) + self.logger.info("Training with fp16: {}, optimization level: {}".format( + len(self.fp16) > 0, self.fp16 if self.fp16 else None)) - def get_n_steps(self): + def _get_n_steps(self): batch_size = self.world_size * self.batch_size_per_gpu return (len(self.train_data) // batch_size + int( len(self.train_data) % batch_size != 0)) * int(self.drop_last == 0) * self.n_epochs - def get_data_iter(self, dataset): + def _get_data_iter(self, dataset): if isinstance(dataset, DataSet): return DataSetIter( dataset=dataset, batch_size=self.batch_size_per_gpu, - num_workers=self.num_workers, sampler=self.sampler, + num_workers=self.num_data_workers, 
sampler=self.sampler, drop_last=self.drop_last ) elif isinstance(dataset, BatchIter): @@ -133,7 +145,7 @@ class DistTrainer(): else: raise TypeError("train_data type {} not support".format(type(dataset))) - def get_optimizer(self, optimizer): + def _get_optimizer(self, optimizer): if isinstance(optimizer, torch.optim.Optimizer): return optimizer elif isinstance(optimizer, Optimizer): @@ -148,37 +160,50 @@ class DistTrainer(): return self.rank == 0 def train(self, on_exception='auto'): - start_time = time.time() - results = {} - if self.n_epochs <= 0: - if self.is_master: - self.logger.info("Training epoch is {}, nothing was done.".format(self.n_epochs)) - results['seconds'] = 0. - return results - - if self.is_master: + try: self.logger.info("###### Training epochs started ######") + self.logger.info('Total epochs: %d'% self.n_epochs) + self.logger.info('Total steps: %d'% self.n_steps) + self.logger.info('Num instances per GPU %d'% self.batch_size_per_gpu) + self.logger.info('Total batch_size: %d'% self.batch_size_per_gpu * dist.get_world_size()) + self.logger.info('Total num of samples: %d'% len(self.train_data)) + self.logger.info("Num of callbacks for all workers: {}".format( + len(self.callback_manager.callbacks_all))) + self.logger.info("Num of callbacks for master workers: {}".format( + len(self.callback_manager.callbacks_master))) + self.logger.info("Callbacks for all workers: {}".format( + [repr(cb) for cb in self.callback_manager.callbacks_all])) + self.logger.info("Callbacks for master workers: {}".format( + [repr(cb) for cb in self.callback_manager.callbacks_master])) + + start_time = time.time() + results = {} + if self.n_epochs <= 0: + self.logger.info("Training epoch is {}, nothing was done.".format(self.n_epochs)) + results['seconds'] = 0. + return results - try: - self.callback_manager.on_train_begin() - self._train() - self.callback_manager.on_train_end() - - except BaseException as e: - self.callback_manager.on_exception(e) - if on_exception == 'auto': - if not isinstance(e, (CallbackException, KeyboardInterrupt)): + try: + self.callback_manager.on_train_begin() + self._train() + self.callback_manager.on_train_end() + + except BaseException as e: + self.callback_manager.on_exception(e) + if on_exception == 'auto': + if not isinstance(e, (CallbackException, KeyboardInterrupt)): + raise e + else: + self.logger.info('Catch {}, ignored.'.format(e.__class__.__name__)) + elif on_exception == 'raise': raise e - else: - self.logger.info('Catch {}, ignored.'.format(e.__class__.__name__)) - elif on_exception == 'raise': - raise e - results['seconds'] = round(time.time() - start_time, 2) - if self.is_master: + results['seconds'] = round(time.time() - start_time, 2) self.logger.info("###### Train finished ######") self.logger.info('Total train time: {} seconds.'. format(results['seconds'])) - return results + return results + finally: + self.close() def _train(self): if self.fp16: @@ -187,7 +212,7 @@ class DistTrainer(): self.step = 0 self.epoch = 0 self.pbar = tqdm(total=self.n_steps, postfix='loss:{0:<6.5f}', - leave=False, dynamic_ncols=True, disable=not self.is_master) + leave=False, dynamic_ncols=True, disable=not self.is_master) pbar = self.pbar avg_loss = 0 data_iterator = self.data_iterator @@ -238,18 +263,17 @@ class DistTrainer(): (self.validate_every < 0 and self.step % len(data_iterator) == 0)): eval_str = "Evaluation at Epoch {}/{}. Step:{}/{}. 
".format(epoch, self.n_epochs, self.step, self.n_steps) - if self.is_master: - self.logger.info(eval_str) + self.logger.info(eval_str) self.callback_manager.on_validation() dist.barrier() - if self.save_path and \ + if self.cp_save_path and \ self.save_every > 0 and \ self.step % self.save_every == 0: self.save_check_point() # ================= mini-batch end ==================== # - if self.save_path and self.save_every < 0: + if self.save_every < 0 and self.cp_save_path: self.save_check_point() # lr decay; early stopping self.callback_manager.on_epoch_end() @@ -287,16 +311,15 @@ class DistTrainer(): return loss.mean() def save_check_point(self, only_params=False): + # only master save models if self.is_master: - if not os.path.exists(self.save_path): - os.makedirs(self.save_path) - path = os.path.join(self.save_path, 'checkpoint-{}.bin'.format(self.step)) + os.makedirs(self.cp_save_path, exist_ok=True) + path = os.path.join(self.cp_save_path, 'checkpoint-{}.bin'.format(self.step)) self.logger.info("Save checkpoint to {}".format(path)) model_to_save = self.model.module if only_params: model_to_save = model_to_save.state_dict() torch.save(model_to_save, path) - dist.barrier() def close(self): dist.destroy_process_group() diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 4ec3d0f4..83bdb4b0 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -431,13 +431,13 @@ class Trainer(object): super(Trainer, self).__init__() if not isinstance(model, nn.Module): raise TypeError(f"The type of model must be torch.nn.Module, got {type(model)}.") - + # check metrics and dev_data if (not metrics) and dev_data is not None: raise ValueError("No metric for dev_data evaluation.") if metrics and (dev_data is None): raise ValueError("No dev_data for evaluations, pass dev_data or set metrics to None. ") - + # check update every assert update_every >= 1, "update_every must be no less than 1." self.update_every = int(update_every) @@ -447,7 +447,7 @@ class Trainer(object): raise ValueError("save_path can only be None or `str`.") # prepare evaluate metrics = _prepare_metrics(metrics) - + # parse metric_key # increase_better is True. It means the exp result gets better if the indicator increases. # It is true by default. 
@@ -546,7 +546,7 @@ class Trainer(object): self.optimizer = torch.optim.Adam(self.model.parameters(), lr=4e-3) else: raise TypeError("optimizer can only be torch.optim.Optimizer type, not {}.".format(type(optimizer))) - + self.use_tqdm = use_tqdm self.pbar = None self.print_every = abs(self.print_every) @@ -558,10 +558,10 @@ class Trainer(object): batch_size=self.batch_size, device=None, # 由上面的部分处理device verbose=0) - + self.step = 0 self.start_time = None # start timestamp - + self.callback_manager = CallbackManager(env={"trainer": self}, callbacks=callbacks) @@ -597,7 +597,7 @@ class Trainer(object): self.start_time = str(datetime.now().strftime('%Y-%m-%d-%H-%M-%S')) start_time = time.time() print("training epochs started " + self.start_time, flush=True) - + try: self.callback_manager.on_train_begin() self._train() @@ -610,7 +610,7 @@ class Trainer(object): raise e elif on_exception == 'raise': raise e - + if self.dev_data is not None and self.best_dev_perf is not None: print( "\nIn Epoch:{}/Step:{}, got best dev performance:".format(self.best_dev_epoch, self.best_dev_step) + @@ -628,9 +628,9 @@ class Trainer(object): finally: pass results['seconds'] = round(time.time() - start_time, 2) - + return results - + def _train(self): if not self.use_tqdm: from fastNLP.core.utils import _pseudo_tqdm as inner_tqdm @@ -656,21 +656,21 @@ class Trainer(object): # negative sampling; replace unknown; re-weight batch_y self.callback_manager.on_batch_begin(batch_x, batch_y, indices) prediction = self._data_forward(self.model, batch_x) - + # edit prediction self.callback_manager.on_loss_begin(batch_y, prediction) loss = self._compute_loss(prediction, batch_y).mean() avg_loss += loss.item() loss = loss / self.update_every - + # Is loss NaN or inf? requires_grad = False self.callback_manager.on_backward_begin(loss) self._grad_backward(loss) self.callback_manager.on_backward_end() - + self._update() self.callback_manager.on_step_end() - + if self.step % self.print_every == 0: avg_loss = float(avg_loss) / self.print_every if self.use_tqdm: @@ -684,7 +684,7 @@ class Trainer(object): pbar.set_postfix_str(print_output) avg_loss = 0 self.callback_manager.on_batch_end() - + if ((self.validate_every > 0 and self.step % self.validate_every == 0) or (self.validate_every < 0 and self.step % len(data_iterator) == 0)) \ and self.dev_data is not None: @@ -693,20 +693,20 @@ class Trainer(object): self.n_steps) + \ self.tester._format_eval_results(eval_res) pbar.write(eval_str + '\n') - + # ================= mini-batch end ==================== # - + # lr decay; early stopping self.callback_manager.on_epoch_end() # =============== epochs end =================== # pbar.close() self.pbar = None # ============ tqdm end ============== # - + def _do_validation(self, epoch, step): self.callback_manager.on_valid_begin() res = self.tester.test() - + is_better_eval = False if self._better_eval_result(res): if self.save_path is not None: @@ -721,7 +721,7 @@ class Trainer(object): # get validation results; adjust optimizer self.callback_manager.on_valid_end(res, self.metric_key, self.optimizer, is_better_eval) return res - + def _mode(self, model, is_test=False): """Train mode or Test mode. This is for PyTorch currently. @@ -733,14 +733,14 @@ class Trainer(object): model.eval() else: model.train() - + def _update(self): """Perform weight update on a model. 
""" if self.step % self.update_every == 0: self.optimizer.step() - + def _data_forward(self, network, x): x = _build_args(self._forward_func, **x) y = network(**x) @@ -748,7 +748,7 @@ class Trainer(object): raise TypeError( f"The return value of {_get_func_signature(self._forward_func)} should be dict, got {type(y)}.") return y - + def _grad_backward(self, loss): """Compute gradient with link rules. @@ -759,7 +759,7 @@ class Trainer(object): if (self.step-1) % self.update_every == 0: self.model.zero_grad() loss.backward() - + def _compute_loss(self, predict, truth): """Compute loss given prediction and ground truth. @@ -768,7 +768,7 @@ class Trainer(object): :return: a scalar """ return self.losser(predict, truth) - + def _save_model(self, model, model_name, only_param=False): """ 存储不含有显卡信息的state_dict或model :param model: @@ -791,7 +791,7 @@ class Trainer(object): model.cpu() torch.save(model, model_path) model.to(self._model_device) - + def _load_model(self, model, model_name, only_param=False): # 返回bool值指示是否成功reload模型 if self.save_path is not None: @@ -809,7 +809,7 @@ class Trainer(object): else: return False return True - + def _better_eval_result(self, metrics): """Check if the current epoch yields better validation results. @@ -835,6 +835,9 @@ class Trainer(object): is_better = False return is_better + @property + def is_master(self): + return True DEFAULT_CHECK_BATCH_SIZE = 2 DEFAULT_CHECK_NUM_BATCH = 2 diff --git a/test/core/test_dist_trainer.py b/test/core/test_dist_trainer.py index 59be35c6..e36615dd 100644 --- a/test/core/test_dist_trainer.py +++ b/test/core/test_dist_trainer.py @@ -4,7 +4,7 @@ import numpy as np import torch.cuda from fastNLP import DataSet from fastNLP import Instance -from fastNLP import CrossEntropyLoss +from fastNLP import CrossEntropyLoss, BCELoss from fastNLP import SGD from fastNLP.core.dist_trainer import DistTrainer, get_local_rank from fastNLP.models.base_model import NaiveClassifier @@ -12,6 +12,7 @@ import shutil import os import subprocess from argparse import ArgumentParser +from fastNLP.core.callback import EchoCallback def prepare_fake_dataset(): mean = np.array([-3, -3]) @@ -36,6 +37,26 @@ def prepare_fake_dataset2(*args, size=100): def set_rng_seed(seed): np.random.seed(seed) +def prepare_env(): + def prepare_fake_dataset(): + mean = np.array([-3, -3]) + cov = np.array([[1, 0], [0, 1]]) + class_A = np.random.multivariate_normal(mean, cov, size=(1000,)) + + mean = np.array([3, 3]) + cov = np.array([[1, 0], [0, 1]]) + class_B = np.random.multivariate_normal(mean, cov, size=(1000,)) + + data_set = DataSet([Instance(x=[float(item[0]), float(item[1])], y=[0.0]) for item in class_A] + + [Instance(x=[float(item[0]), float(item[1])], y=[1.0]) for item in class_B]) + return data_set + + data_set = prepare_fake_dataset() + data_set.set_input("x") + data_set.set_target("y") + model = NaiveClassifier(2, 1) + return data_set, model + class TestDistTrainer(unittest.TestCase): save_path = './save_cp' @@ -84,23 +105,35 @@ class TestDistTrainer(unittest.TestCase): if trainer.is_master and os.path.exists(self.save_path): shutil.rmtree(self.save_path) + def run3(self): + data_set, model = prepare_env() + trainer = DistTrainer( + data_set, model, optimizer=None, loss=BCELoss(pred="predict", target="y"), + n_epochs=3, print_every=50, + callbacks_all=[EchoCallback('callbacks_all')], + callbacks_master=[EchoCallback('callbacks_master')] + ) + trainer.train() + def run_dist(self, run_id): if torch.cuda.is_available(): - ngpu = min(4, torch.cuda.device_count()) + ngpu = 
min(2, torch.cuda.device_count()) path = __file__ cmd = ['python', '-m', 'torch.distributed.launch', '--nproc_per_node', str(ngpu), path, '--test', str(run_id)] print(' '.join(cmd)) - retcode = subprocess.call(cmd) - if retcode: - raise RuntimeError('subprocess got non-zero exit status %d' % retcode) + subprocess.check_call(cmd, timeout=60.0) - def test1(self): + def test_normal_run(self): self.run_dist(1) - def test2(self): + def test_fp16(self): self.run_dist(2) + def test_callback(self): + self.run_dist(3) + + if __name__ == '__main__': runner = TestDistTrainer() parser = ArgumentParser() From 329a18976ff3cf0d669cde6ba7571c7b3b20bcb0 Mon Sep 17 00:00:00 2001 From: yunfan Date: Sat, 20 Jul 2019 17:00:50 +0800 Subject: [PATCH 009/153] [update] distributed trainer, add evaluation part --- fastNLP/core/callback.py | 62 ++++++++++++++++++++++++---------- fastNLP/core/dist_trainer.py | 16 ++++++--- test/core/test_dist_trainer.py | 26 +++++++++++++- 3 files changed, 82 insertions(+), 22 deletions(-) diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py index dd493567..14803e56 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -79,6 +79,7 @@ except: from ..io.model_io import ModelSaver, ModelLoader from .dataset import DataSet from .tester import Tester +import logging try: import fitlog @@ -167,7 +168,11 @@ class Callback(object): @property def disabled(self): return self._disabled - + + @property + def logger(self): + return getattr(self._trainer, 'logger', logging) + def on_train_begin(self): """ 在Train过程开始之前调用。 @@ -316,21 +321,27 @@ class CallbackManager(Callback): """ super(CallbackManager, self).__init__() # set attribute of trainer environment - + self._env = env self.callbacks = [] - if callbacks is not None: - if isinstance(callbacks, list): - if all([isinstance(cb, Callback) for cb in callbacks]) is True: - self.callbacks.extend(callbacks) - else: - obj = [not isinstance(cb, Callback) for cb in callbacks][0] - raise TypeError(f"Expect sub-classes of Callback. Got {type(obj)}") + if callbacks: + self.callbacks += self.prepare_callbacks(callbacks) + + def prepare_callbacks(self, callbacks): + if not callbacks: + return [] + if isinstance(callbacks, list): + if all([isinstance(cb, Callback) for cb in callbacks]) is True: + self.callbacks.extend(callbacks) else: - raise TypeError(f"Expect callbacks in CallbackManager(callbacks) to be list. Got {type(callbacks)}.") - - for env_name, env_val in env.items(): - for callback in self.callbacks: + obj = [not isinstance(cb, Callback) for cb in callbacks][0] + raise TypeError(f"Expect sub-classes of Callback. Got {type(obj)}") + else: + raise TypeError(f"Expect callbacks in CallbackManager(callbacks) to be list. 
Got {type(callbacks)}.") + + for env_name, env_val in self._env.items(): + for callback in callbacks: setattr(callback, '_' + env_name, env_val) # Callback.trainer + return callbacks @_transfer def on_train_begin(self): @@ -391,11 +402,12 @@ class CallbackManager(Callback): class DistCallbackManager(CallbackManager): def __init__(self, env, callbacks_all=None, callbacks_master=None): + super(DistCallbackManager, self).__init__(env) assert 'trainer' in env is_master = env['trainer'].is_master self.patch_callback(callbacks_master, disabled=not is_master) - self.callbacks_all = CallbackManager(env, callbacks_all).callbacks - self.callbacks_master = CallbackManager(env, callbacks_master).callbacks + self.callbacks_all = self.prepare_callbacks(callbacks_all) + self.callbacks_master = self.prepare_callbacks(callbacks_master) self.callbacks = self.callbacks_all + self.callbacks_master def patch_callback(self, callbacks, disabled): @@ -944,5 +956,21 @@ class EchoCallback(Callback): class TesterCallback(Callback): - def __init__(self, data, model, metrics, batch_size=16, num_workers=None): - self.tester = Tester(data, model) + def __init__(self, data, model, metrics, batch_size=16, num_workers=None):\ + #TODO add compare & save best + super(TesterCallback, self).__init__() + self.tester = Tester(data, model, + metrics=metrics, batch_size=batch_size, + num_workers=num_workers, verbose=0) + self.score = None + + def on_validation(self): + cur_socre = self.tester.test() + eval_str = "Evaluation at Epoch {}/{}. Step:{}/{}. - {}".format( + self.epoch, self.n_epochs, self.step, self.n_steps, + self.tester._format_eval_results(cur_socre)) + self.logger.info(eval_str) + + def on_train_end(self): + self.logger.info('Evaluate on training ends.') + self.on_validation() diff --git a/fastNLP/core/dist_trainer.py b/fastNLP/core/dist_trainer.py index 700dcf38..260b93b0 100644 --- a/fastNLP/core/dist_trainer.py +++ b/fastNLP/core/dist_trainer.py @@ -11,7 +11,7 @@ import time from datetime import datetime, timedelta from .batch import DataSetIter, BatchIter -from .callback import DistCallbackManager, CallbackException +from .callback import DistCallbackManager, CallbackException, TesterCallback from .dataset import DataSet from .losses import _prepare_losser from .optimizer import Optimizer @@ -39,10 +39,13 @@ def get_local_rank(): class DistTrainer(): + """Distributed Trainer that support distributed and mixed precision training + """ def __init__(self, train_data, model, optimizer=None, loss=None, callbacks_all=None, callbacks_master=None, batch_size_per_gpu=8, n_epochs=1, num_data_workers=1, drop_last=False, + dev_data=None, metrics=None, update_every=1, print_every=10, validate_every=-1, save_every=-1, save_path=None, device='auto', fp16='', backend=None, init_method=None): @@ -107,6 +110,14 @@ class DistTrainer(): self.data_iterator = self._get_data_iter(self.train_data) self.n_steps = self._get_n_steps() + # for evaluation, only run eval on master proc + if dev_data and metrics: + cb = TesterCallback( + dev_data, model, metrics, + batch_size=batch_size_per_gpu, num_workers=num_data_workers) + self.callback_manager.callbacks_master += \ + self.callback_manager.prepare_callbacks([cb]) + # Setup logging dist.barrier() self.start_time = datetime.now().strftime('%m_%d_%Y-%H_%M') @@ -261,9 +272,6 @@ class DistTrainer(): if ((self.validate_every > 0 and self.step % self.validate_every == 0) or (self.validate_every < 0 and self.step % len(data_iterator) == 0)): - eval_str = "Evaluation at Epoch {}/{}. Step:{}/{}. 
".format(epoch, self.n_epochs, self.step, - self.n_steps) - self.logger.info(eval_str) self.callback_manager.on_validation() dist.barrier() diff --git a/test/core/test_dist_trainer.py b/test/core/test_dist_trainer.py index e36615dd..93d87407 100644 --- a/test/core/test_dist_trainer.py +++ b/test/core/test_dist_trainer.py @@ -13,6 +13,7 @@ import os import subprocess from argparse import ArgumentParser from fastNLP.core.callback import EchoCallback +from fastNLP import AccuracyMetric def prepare_fake_dataset(): mean = np.array([-3, -3]) @@ -106,15 +107,36 @@ class TestDistTrainer(unittest.TestCase): shutil.rmtree(self.save_path) def run3(self): + set_rng_seed(100) data_set, model = prepare_env() trainer = DistTrainer( - data_set, model, optimizer=None, loss=BCELoss(pred="predict", target="y"), + data_set, model, optimizer=None, + loss=BCELoss(pred="predict", target="y"), n_epochs=3, print_every=50, callbacks_all=[EchoCallback('callbacks_all')], callbacks_master=[EchoCallback('callbacks_master')] ) trainer.train() + def run4(self): + set_rng_seed(100) + data_set, model = prepare_env() + + train_set, dev_set = data_set.split(0.3) + + model = NaiveClassifier(2, 1) + + trainer = DistTrainer( + train_set, model, optimizer=SGD(lr=0.1), + loss=BCELoss(pred="predict", target="y"), + batch_size_per_gpu=32, n_epochs=3, print_every=50, dev_data=dev_set, + metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None, + ) + trainer.train() + """ + # 应该正确运行 + """ + def run_dist(self, run_id): if torch.cuda.is_available(): ngpu = min(2, torch.cuda.device_count()) @@ -133,6 +155,8 @@ class TestDistTrainer(unittest.TestCase): def test_callback(self): self.run_dist(3) + def test_dev_data(self): + self.run_dist(4) if __name__ == '__main__': runner = TestDistTrainer() From 210af73c6ee95de88997df3de5111ec0748106e6 Mon Sep 17 00:00:00 2001 From: yh Date: Thu, 25 Jul 2019 16:01:17 +0800 Subject: [PATCH 010/153] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dcallback=E8=BF=9E?= =?UTF-8?q?=E7=BB=AD=E4=B8=A4=E6=AC=A1=E5=8A=A0=E5=85=A5=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/callback.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py index 14803e56..07ca70dc 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -324,7 +324,7 @@ class CallbackManager(Callback): self._env = env self.callbacks = [] if callbacks: - self.callbacks += self.prepare_callbacks(callbacks) + self.prepare_callbacks(callbacks) def prepare_callbacks(self, callbacks): if not callbacks: From a8111087a3d6b07684da762cf5191b2a3ef64d17 Mon Sep 17 00:00:00 2001 From: yh Date: Thu, 25 Jul 2019 16:36:28 +0800 Subject: [PATCH 011/153] =?UTF-8?q?=E5=88=A0=E9=99=A4getattr=E7=9A=84defau?= =?UTF-8?q?lt=20keyword?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/callback.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py index 07ca70dc..09ff860b 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -456,7 +456,7 @@ class GradientClipCallback(Callback): def on_backward_end(self): if self.step%self.update_every==0: if self.parameters is None: - if getattr(self.trainer, 'fp16', default=''): + if getattr(self.trainer, 'fp16', ''): from apex import amp self.clip_fun(amp.master_params(self.optimizer), self.clip_value) 
self.clip_fun(self.model.parameters(), self.clip_value) From cb3cb8bc5cf5c2d0f083a6e5f1608f091682f405 Mon Sep 17 00:00:00 2001 From: yh Date: Thu, 25 Jul 2019 16:51:59 +0800 Subject: [PATCH 012/153] =?UTF-8?q?=E4=BF=AE=E5=A4=8DBertEmbedding?= =?UTF-8?q?=E7=9A=84weight?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/embeddings/embedding.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fastNLP/embeddings/embedding.py b/fastNLP/embeddings/embedding.py index a9f228fb..9447c6ad 100644 --- a/fastNLP/embeddings/embedding.py +++ b/fastNLP/embeddings/embedding.py @@ -41,9 +41,9 @@ class Embedding(nn.Module): self.dropout = nn.Dropout(dropout) if not isinstance(self.embed, TokenEmbedding): - if hasattr(self, 'embed_size'): + if hasattr(self.embed, 'embed_size'): self._embed_size = self.embed.embed_size - elif hasattr(self, 'embedding_dim'): + elif hasattr(self.embed, 'embedding_dim'): self._embed_size = self.embed.embedding_dim else: self._embed_size = self.embed.weight.size(1) From d401bd2208d2536ca22a6c20b169768e2065df14 Mon Sep 17 00:00:00 2001 From: yh Date: Thu, 25 Jul 2019 16:56:54 +0800 Subject: [PATCH 013/153] =?UTF-8?q?sst=20loader=E4=B8=AD=E5=A2=9E=E5=8A=A0?= =?UTF-8?q?=E4=B8=80=E5=88=97raw=5Fwords?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/io/data_loader/sst.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fastNLP/io/data_loader/sst.py b/fastNLP/io/data_loader/sst.py index 0d881e65..6c06a9ce 100644 --- a/fastNLP/io/data_loader/sst.py +++ b/fastNLP/io/data_loader/sst.py @@ -134,6 +134,7 @@ class SST2Loader(CSVLoader): info = DataBundle() for name, path in paths.items(): dataset = self.load(path) + dataset.apply_field(lambda words:words.copy(), field_name='words', new_field_name='raw_words') datasets[name] = dataset def wordtochar(words): From cacf40366c794e337cbe9d39b21306cada58ef7e Mon Sep 17 00:00:00 2001 From: yunfan Date: Fri, 26 Jul 2019 16:26:43 +0800 Subject: [PATCH 014/153] [fix] distributed trainer --- fastNLP/core/callback.py | 36 ++++++++++++++++++++++++++-------- fastNLP/core/dist_trainer.py | 30 ++++++++++++++++++++++------ test/core/test_dist_trainer.py | 4 ++-- 3 files changed, 54 insertions(+), 16 deletions(-) diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py index 09ff860b..acd39e98 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -324,15 +324,13 @@ class CallbackManager(Callback): self._env = env self.callbacks = [] if callbacks: - self.prepare_callbacks(callbacks) + self.callbacks += self.prepare_callbacks(callbacks) def prepare_callbacks(self, callbacks): if not callbacks: return [] if isinstance(callbacks, list): - if all([isinstance(cb, Callback) for cb in callbacks]) is True: - self.callbacks.extend(callbacks) - else: + if not all([isinstance(cb, Callback) for cb in callbacks]): obj = [not isinstance(cb, Callback) for cb in callbacks][0] raise TypeError(f"Expect sub-classes of Callback. Got {type(obj)}") else: @@ -956,20 +954,42 @@ class EchoCallback(Callback): class TesterCallback(Callback): - def __init__(self, data, model, metrics, batch_size=16, num_workers=None):\ - #TODO add compare & save best + def __init__(self, data, model, metrics, metric_key=None, batch_size=16, num_workers=None): super(TesterCallback, self).__init__() self.tester = Tester(data, model, metrics=metrics, batch_size=batch_size, num_workers=num_workers, verbose=0) + # parse metric_key + # increase_better is True. 
It means the exp result gets better if the indicator increases. + # It is true by default. + self.increase_better = True + if metric_key is not None: + self.increase_better = False if metric_key[0] == "-" else True + self.metric_key = metric_key[1:] if metric_key[0] == "+" or metric_key[0] == "-" else metric_key + else: + self.metric_key = None self.score = None def on_validation(self): - cur_socre = self.tester.test() + cur_score = self.tester.test() eval_str = "Evaluation at Epoch {}/{}. Step:{}/{}. - {}".format( self.epoch, self.n_epochs, self.step, self.n_steps, - self.tester._format_eval_results(cur_socre)) + self.tester._format_eval_results(cur_score)) self.logger.info(eval_str) + is_better = self.compare_better(cur_score) + if is_better: + self.score = cur_score + return cur_score, is_better + + def compare_better(self, a): + if self.score is None: + return True + k = self.metric_key + is_increase = self.score[k] <= a[k] # if equal, prefer more recent results + if self.increase_better: + return is_increase + else: + return not is_increase def on_train_end(self): self.logger.info('Evaluate on training ends.') diff --git a/fastNLP/core/dist_trainer.py b/fastNLP/core/dist_trainer.py index 260b93b0..bbe4f62a 100644 --- a/fastNLP/core/dist_trainer.py +++ b/fastNLP/core/dist_trainer.py @@ -9,6 +9,7 @@ from tqdm import tqdm import logging import time from datetime import datetime, timedelta +from functools import partial from .batch import DataSetIter, BatchIter from .callback import DistCallbackManager, CallbackException, TesterCallback @@ -45,10 +46,12 @@ class DistTrainer(): callbacks_all=None, callbacks_master=None, batch_size_per_gpu=8, n_epochs=1, num_data_workers=1, drop_last=False, - dev_data=None, metrics=None, + dev_data=None, metrics=None, metric_key=None, update_every=1, print_every=10, validate_every=-1, + log_path=None, save_every=-1, save_path=None, device='auto', - fp16='', backend=None, init_method=None): + fp16='', backend=None, init_method=None, + find_unused_parameters=True): assert device in ['auto', 'cuda', 'cpu'], "Please set correct device in [auto', 'cuda', 'cpu']" if device == 'auto': @@ -87,6 +90,7 @@ class DistTrainer(): self.callback_manager = DistCallbackManager( env={"trainer": self}, callbacks_all=callbacks_all, callbacks_master=callbacks_master) + self.metric_key = metric_key model.to(self.device) optimizer = self._get_optimizer(optimizer) @@ -103,8 +107,13 @@ class DistTrainer(): model, optimizer = amp.initialize(model, optimizer, opt_level=self.fp16) # init DataParallel - self.model = DDP(model, device_ids=[self.local_rank], - output_device=self.local_rank) + if find_unused_parameters: + # to support old version + self.model = DDP(model, device_ids=[self.local_rank], + output_device=self.local_rank, find_unused_parameters=find_unused_parameters) + else: + self.model = DDP(model, device_ids=[self.local_rank], + output_device=self.local_rank) self.optimizer = optimizer self.sampler = DistributedSampler(self.train_data) self.data_iterator = self._get_data_iter(self.train_data) @@ -127,7 +136,8 @@ class DistTrainer(): self.cp_save_path = None # use INFO in the master, WARN for others - logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + logging.basicConfig(filename=log_path, + format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO if self.is_master else logging.WARN) self.logger = logging.getLogger(__name__) @@ -272,7 +282,15 @@ class DistTrainer(): if 
((self.validate_every > 0 and self.step % self.validate_every == 0) or (self.validate_every < 0 and self.step % len(data_iterator) == 0)): - self.callback_manager.on_validation() + self.callback_manager.on_valid_begin() + eval_res = self.callback_manager.on_validation() + eval_res = list(filter(lambda x: x is not None, eval_res)) + if len(eval_res): + eval_res, is_better = list(zip(*eval_res)) + else: + eval_res, is_better = None, None + self.callback_manager.on_valid_end( + eval_res, self.metric_key, self.optimizer, is_better) dist.barrier() if self.cp_save_path and \ diff --git a/test/core/test_dist_trainer.py b/test/core/test_dist_trainer.py index 93d87407..c6879634 100644 --- a/test/core/test_dist_trainer.py +++ b/test/core/test_dist_trainer.py @@ -144,12 +144,12 @@ class TestDistTrainer(unittest.TestCase): cmd = ['python', '-m', 'torch.distributed.launch', '--nproc_per_node', str(ngpu), path, '--test', str(run_id)] print(' '.join(cmd)) - subprocess.check_call(cmd, timeout=60.0) + subprocess.check_call(cmd) def test_normal_run(self): self.run_dist(1) - def test_fp16(self): + def no_test_fp16(self): self.run_dist(2) def test_callback(self): From db8c6a0b8a0606516a0cbdb61b633a28f1d3aa29 Mon Sep 17 00:00:00 2001 From: yh Date: Fri, 26 Jul 2019 19:44:20 +0800 Subject: [PATCH 015/153] =?UTF-8?q?=E8=A7=A3=E5=86=B3=E8=AF=BB=E5=8F=96ber?= =?UTF-8?q?t=E6=9D=83=E9=87=8D=E6=BD=9C=E5=9C=A8=E7=9A=84bug=E9=97=AE?= =?UTF-8?q?=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/embeddings/bert_embedding.py | 2 +- fastNLP/modules/utils.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index 21944570..adc205c2 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -63,7 +63,7 @@ class BertEmbedding(ContextualEmbedding): model_dir = cached_path(model_url) # 检查是否存在 elif os.path.isdir(os.path.expanduser(os.path.abspath(model_dir_or_name))): - model_dir = model_dir_or_name + model_dir = os.path.expanduser(os.path.abspath(model_dir_or_name)) else: raise ValueError(f"Cannot recognize {model_dir_or_name}.") diff --git a/fastNLP/modules/utils.py b/fastNLP/modules/utils.py index 700e9620..21608c5d 100644 --- a/fastNLP/modules/utils.py +++ b/fastNLP/modules/utils.py @@ -128,9 +128,9 @@ def _get_file_name_base_on_postfix(dir_path, postfix): :param postfix: 形如".bin", ".json"等 :return: str,文件的路径 """ - files = glob.glob(os.path.join(dir_path, '*' + postfix)) + files = list(filter(lambda filename:filename.endswith(postfix), os.listdir(os.path.join(dir_path)))) if len(files) == 0: - raise FileNotFoundError(f"There is no file endswith *.{postfix} file in {dir_path}") + raise FileNotFoundError(f"There is no file endswith *{postfix} file in {dir_path}") elif len(files) > 1: - raise FileExistsError(f"There are multiple *.{postfix} files in {dir_path}") + raise FileExistsError(f"There are multiple *{postfix} files in {dir_path}") return os.path.join(dir_path, files[0]) \ No newline at end of file From 71c9e0c30ec53fd825cbdbda265cfea005f04f9a Mon Sep 17 00:00:00 2001 From: yh Date: Mon, 29 Jul 2019 23:54:57 +0800 Subject: [PATCH 016/153] =?UTF-8?q?=E4=BB=8E=E8=BF=9C=E7=A8=8B=E4=B8=8B?= =?UTF-8?q?=E8=BD=BD=E6=9D=83=E9=87=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/embeddings/elmo_embedding.py | 4 +- fastNLP/io/file_utils.py | 98 ++++++++++------------------ 
fastNLP/modules/encoder/bert.py | 2 +- 3 files changed, 38 insertions(+), 66 deletions(-) diff --git a/fastNLP/embeddings/elmo_embedding.py b/fastNLP/embeddings/elmo_embedding.py index bd14cf58..53adfd62 100644 --- a/fastNLP/embeddings/elmo_embedding.py +++ b/fastNLP/embeddings/elmo_embedding.py @@ -182,8 +182,8 @@ class _ElmoModel(nn.Module): raise Exception(f"Multiple config files(*.json) or weight files(*.hdf5) detected in {model_dir}.") elif config_count == 0 or weight_count == 0: raise Exception(f"No config file or weight file found in {model_dir}") - - config = json.load(open(os.path.join(model_dir, config_file), 'r')) + with open(os.path.join(model_dir, config_file), 'r') as config_f: + config = json.load(config_f) self.weight_file = os.path.join(model_dir, weight_file) self.config = config diff --git a/fastNLP/io/file_utils.py b/fastNLP/io/file_utils.py index cb762eb7..4be1360b 100644 --- a/fastNLP/io/file_utils.py +++ b/fastNLP/io/file_utils.py @@ -11,7 +11,7 @@ import hashlib PRETRAINED_BERT_MODEL_DIR = { - 'en': 'bert-base-cased-f89bfe08.zip', + 'en': 'bert-large-cased-wwm.zip', 'en-base-uncased': 'bert-base-uncased-3413b23c.zip', 'en-base-cased': 'bert-base-cased-f89bfe08.zip', 'en-large-uncased': 'bert-large-uncased-20939f45.zip', @@ -24,14 +24,14 @@ PRETRAINED_BERT_MODEL_DIR = { 'cn': 'bert-base-chinese-29d0a84a.zip', 'cn-base': 'bert-base-chinese-29d0a84a.zip', - 'multilingual': 'bert-base-multilingual-cased-1bd364ee.zip', - 'multilingual-base-uncased': 'bert-base-multilingual-uncased-f8730fe4.zip', - 'multilingual-base-cased': 'bert-base-multilingual-cased-1bd364ee.zip', + 'multilingual': 'bert-base-multilingual-cased.zip', + 'multilingual-base-uncased': 'bert-base-multilingual-uncased.zip', + 'multilingual-base-cased': 'bert-base-multilingual-cased.zip', } PRETRAINED_ELMO_MODEL_DIR = { 'en': 'elmo_en-d39843fe.tar.gz', - 'cn': 'elmo_cn-5e9b34e2.tar.gz' + 'en-small': "elmo_en_Small.zip" } PRETRAIN_STATIC_FILES = { @@ -39,7 +39,7 @@ PRETRAIN_STATIC_FILES = { 'en-glove-840b-300': 'glove.840B.300d-cc1ad5e1.tar.gz', 'en-glove-6b-50': "glove.6B.50d-a6028c70.tar.gz", 'en-word2vec-300': "GoogleNews-vectors-negative300-be166d9d.tar.gz", - 'en-fasttext': "cc.en.300.vec-d53187b2.gz", + 'en-fasttext-wiki': "wiki-news-300d-1M.vec.zip", 'cn': "tencent_cn-dab24577.tar.gz", 'cn-fasttext': "cc.zh.300.vec-d68a9bcf.gz", } @@ -47,11 +47,15 @@ PRETRAIN_STATIC_FILES = { def cached_path(url_or_filename: str, cache_dir: Path=None) -> Path: """ - 给定一个url或者文件名(可以是具体的文件名,也可以是文件),先在cache_dir下寻找该文件是否存在,如果不存在则去下载, 并 - 将文件放入到cache_dir中 + 给定一个url或者文件名(可以是具体的文件名,也可以是文件),先在cache_dir下寻找该文件是否存在,如果不存在则去下载, 并 + 将文件放入到cache_dir中. + + :param url_or_filename: 文件的下载url或者文件路径 + :param cache_dir: 文件的缓存文件夹 + :return: """ if cache_dir is None: - dataset_cache = Path(get_defalt_path()) + dataset_cache = Path(get_default_cache_path()) else: dataset_cache = cache_dir @@ -75,7 +79,7 @@ def cached_path(url_or_filename: str, cache_dir: Path=None) -> Path: def get_filepath(filepath): """ - 如果filepath中只有一个文件,则直接返回对应的全路径 + 如果filepath中只有一个文件,则直接返回对应的全路径. 
:param filepath: :return: """ @@ -88,7 +92,7 @@ def get_filepath(filepath): return filepath -def get_defalt_path(): +def get_default_cache_path(): """ 获取默认的fastNLP存放路径, 如果将FASTNLP_CACHE_PATH设置在了环境变量中,将使用环境变量的值,使得不用每个用户都去下载。 @@ -96,11 +100,10 @@ def get_defalt_path(): """ if 'FASTNLP_CACHE_DIR' in os.environ: fastnlp_cache_dir = os.environ.get('FASTNLP_CACHE_DIR') - if os.path.exists(fastnlp_cache_dir): + if os.path.isdir(fastnlp_cache_dir): return fastnlp_cache_dir - raise RuntimeError("Some errors happens on cache directory.") - else: - raise RuntimeError("There function is not available right now.") + else: + raise NotADirectoryError(f"{os.environ['FASTNLP_CACHE_DIR']} is not a directory.") fastnlp_cache_dir = os.path.expanduser(os.path.join("~", ".fastNLP")) return fastnlp_cache_dir @@ -109,13 +112,19 @@ def _get_base_url(name): # 返回的URL结尾必须是/ if 'FASTNLP_BASE_URL' in os.environ: fastnlp_base_url = os.environ['FASTNLP_BASE_URL'] - return fastnlp_base_url - raise RuntimeError("There function is not available right now.") + if fastnlp_base_url.endswith('/'): + return fastnlp_base_url + else: + return fastnlp_base_url + '/' + else: + # TODO 替换 + dbbrain_url = "http://dbcloud.irocn.cn:8989/api/public/dl/" + return dbbrain_url def split_filename_suffix(filepath): """ - 给定filepath返回对应的name和suffix + 给定filepath返回对应的name和suffix. 如果后缀是多个点,仅支持.tar.gz类型 :param filepath: :return: filename, suffix """ @@ -135,13 +144,6 @@ def get_from_cache(url: str, cache_dir: Path = None) -> Path: filename = re.sub(r".+/", "", url) dir_name, suffix = split_filename_suffix(filename) - sep_index = dir_name[::-1].index('-') - if sep_index<0: - check_sum = None - else: - check_sum = dir_name[-sep_index+1:] - sep_index = len(dir_name) if sep_index==-1 else -sep_index-1 - dir_name = dir_name[:sep_index] # 寻找与它名字匹配的内容, 而不关心后缀 match_dir_name = match_file(dir_name, cache_dir) @@ -154,11 +156,11 @@ def get_from_cache(url: str, cache_dir: Path = None) -> Path: return get_filepath(cache_path) # make HEAD request to check ETag TODO ETag可以用来判断资源是否已经更新了,之后需要加上 - response = requests.head(url, headers={"User-Agent": "fastNLP"}) - if response.status_code != 200: - raise IOError( - f"HEAD request failed for url {url} with status code {response.status_code}." - ) + # response = requests.head(url, headers={"User-Agent": "fastNLP"}) + # if response.status_code != 200: + # raise IOError( + # f"HEAD request failed for url {url} with status code {response.status_code}." + # ) # add ETag to filename if it exists # etag = response.headers.get("ETag") @@ -174,17 +176,11 @@ def get_from_cache(url: str, cache_dir: Path = None) -> Path: content_length = req.headers.get("Content-Length") total = int(content_length) if content_length is not None else None progress = tqdm(unit="B", total=total) - sha256 = hashlib.sha256() with open(temp_filename, "wb") as temp_file: for chunk in req.iter_content(chunk_size=1024): if chunk: # filter out keep-alive new chunks progress.update(len(chunk)) temp_file.write(chunk) - sha256.update(chunk) - # check sum - digit = sha256.hexdigest()[:8] - if not check_sum: - assert digit == check_sum, "File corrupted when download." 
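A minimal standalone sketch of the cache-directory lookup that the renamed `get_default_cache_path` performs above: honour `FASTNLP_CACHE_DIR` when it points at an existing directory, raise if it does not, and otherwise fall back to `~/.fastNLP`. This mirrors the hunk rather than adding behaviour. Example::

    import os

    def default_cache_dir():
        # prefer the FASTNLP_CACHE_DIR environment variable so weights can be shared between users
        cache_dir = os.environ.get('FASTNLP_CACHE_DIR')
        if cache_dir is not None:
            if os.path.isdir(cache_dir):
                return cache_dir
            raise NotADirectoryError(f"{cache_dir} is not a directory.")
        # fall back to a per-user directory under the home folder
        return os.path.expanduser(os.path.join("~", ".fastNLP"))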
progress.close() print(f"Finish download from {url}.") @@ -193,7 +189,7 @@ def get_from_cache(url: str, cache_dir: Path = None) -> Path: if suffix in ('.zip', '.tar.gz'): uncompress_temp_dir = tempfile.mkdtemp() delete_temp_dir = uncompress_temp_dir - print(f"Start to uncompress file to {uncompress_temp_dir}.") + print(f"Start to uncompress file to {uncompress_temp_dir}") if suffix == '.zip': unzip_file(Path(temp_filename), Path(uncompress_temp_dir)) else: @@ -211,7 +207,7 @@ def get_from_cache(url: str, cache_dir: Path = None) -> Path: success = False try: # 复制到指定的位置 - print(f"Copy file to {cache_path}.") + print(f"Copy file to {cache_path}") if os.path.isdir(uncompress_temp_dir): for filename in os.listdir(uncompress_temp_dir): shutil.copyfile(os.path.join(uncompress_temp_dir, filename), cache_path/filename) @@ -252,7 +248,7 @@ def untar_gz_file(file:Path, to:Path): tar.extractall(to) -def match_file(dir_name: str, cache_dir: str) -> str: +def match_file(dir_name: str, cache_dir: Path) -> str: """ 匹配的原则是,在cache_dir下的文件: (1) 与dir_name完全一致; (2) 除了后缀以外和dir_name完全一致。 如果找到了两个匹配的结果将报错. 如果找到了则返回匹配的文件的名称; 没有找到返回空字符串 @@ -273,27 +269,3 @@ def match_file(dir_name: str, cache_dir: str) -> str: else: raise RuntimeError(f"Duplicate matched files:{matched_filenames}, this should be caused by a bug.") - -if __name__ == '__main__': - cache_dir = Path('caches') - cache_dir = None - # 需要对cache_dir进行测试 - base_url = 'http://0.0.0.0:8888/file/download' - # if True: - # for filename in os.listdir(cache_dir): - # if os.path.isdir(os.path.join(cache_dir, filename)): - # shutil.rmtree(os.path.join(cache_dir, filename)) - # else: - # os.remove(os.path.join(cache_dir, filename)) - # 1. 测试.txt文件 - print(cached_path(base_url + '/{}'.format('txt_test-bcb4fe65.txt'), cache_dir)) - # 2. 测试.zip文件(只有一个文件) - print(cached_path(base_url + '/{}'.format('zip_test-40966d39.zip'), cache_dir)) - # 3. 测试.zip文件(有多个文件) - print(cached_path(base_url + '/{}'.format('zip_pack_test-70c0b20d.zip'), cache_dir)) - # 4. 测试.tar.gz文件 - print(cached_path(base_url + '/{}'.format('tar_gz_test-3e2679cf.tar.gz'), cache_dir)) - # 5. 测试.tar.gz多个文件 - print(cached_path(base_url + '/{}'.format('tar_gz_pack_test-08dfdccd.tar.gz'), cache_dir)) - - # 6. 
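The download loop inside `get_from_cache` above streams the file in chunks and drives a tqdm bar sized from the Content-Length header. A self-contained sketch of just that loop, writing to a temporary file and leaving the uncompress and copy steps aside. Example::

    import tempfile

    import requests
    from tqdm import tqdm

    def download_to_temp(url):
        # stream the response so large pretrained weights never have to fit in memory
        req = requests.get(url, stream=True, headers={"User-Agent": "fastNLP"})
        content_length = req.headers.get("Content-Length")
        progress = tqdm(unit="B", total=int(content_length) if content_length is not None else None)
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive chunks
                    progress.update(len(chunk))
                    temp_file.write(chunk)
            temp_filename = temp_file.name
        progress.close()
        return temp_filename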
测试.pkl文件 diff --git a/fastNLP/modules/encoder/bert.py b/fastNLP/modules/encoder/bert.py index 9a990d9d..e73b2c40 100644 --- a/fastNLP/modules/encoder/bert.py +++ b/fastNLP/modules/encoder/bert.py @@ -563,7 +563,7 @@ class WordpieceTokenizer(object): output_tokens.append(self.unk_token) else: output_tokens.extend(sub_tokens) - if len(output_tokens)==0: + if len(output_tokens)==0: #防止里面全是空格或者回车符号 return [self.unk_token] return output_tokens From af55db201990d66b9e43a95e36e96b7a340e43e7 Mon Sep 17 00:00:00 2001 From: yh Date: Mon, 29 Jul 2019 23:56:53 +0800 Subject: [PATCH 017/153] =?UTF-8?q?1.=E4=BF=AE=E6=94=B9callback=20prepare?= =?UTF-8?q?=5Fcallbacks,=202.=E8=AE=A9Dist=5FTrainer=E6=94=AF=E6=8C=81find?= =?UTF-8?q?=5Funused=5Fparameters,=20=E4=BD=86=E4=BB=85=E5=9C=A81.1?= =?UTF-8?q?=E4=BB=A5=E4=B8=8A=E7=89=88=E6=9C=AC=E6=9C=89=E6=95=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/callback.py | 4 ++-- fastNLP/core/dist_trainer.py | 10 ++++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py index 09ff860b..85903315 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -324,14 +324,14 @@ class CallbackManager(Callback): self._env = env self.callbacks = [] if callbacks: - self.prepare_callbacks(callbacks) + self.callbacks = self.prepare_callbacks(callbacks) def prepare_callbacks(self, callbacks): if not callbacks: return [] if isinstance(callbacks, list): if all([isinstance(cb, Callback) for cb in callbacks]) is True: - self.callbacks.extend(callbacks) + pass else: obj = [not isinstance(cb, Callback) for cb in callbacks][0] raise TypeError(f"Expect sub-classes of Callback. Got {type(obj)}") diff --git a/fastNLP/core/dist_trainer.py b/fastNLP/core/dist_trainer.py index 260b93b0..57c5f56b 100644 --- a/fastNLP/core/dist_trainer.py +++ b/fastNLP/core/dist_trainer.py @@ -18,6 +18,7 @@ from .optimizer import Optimizer from .utils import _build_args from .utils import _move_dict_value_to_device from .utils import _get_func_signature +from pkg_resources import parse_version __all__ = [ 'get_local_rank', @@ -103,8 +104,13 @@ class DistTrainer(): model, optimizer = amp.initialize(model, optimizer, opt_level=self.fp16) # init DataParallel - self.model = DDP(model, device_ids=[self.local_rank], - output_device=self.local_rank) + if parse_version(torch.__version__)>=parse_version('1.1'): + self.model = DDP(model, device_ids=[self.local_rank], + output_device=self.local_rank, find_unused_parameters=True) + else: + self.model = DDP(model, device_ids=[self.local_rank], + output_device=self.local_rank) + self.optimizer = optimizer self.sampler = DistributedSampler(self.train_data) self.data_iterator = self._get_data_iter(self.train_data) From e166c119f58d52ca08973f59772702c62bb39d7a Mon Sep 17 00:00:00 2001 From: yh_cc Date: Tue, 6 Aug 2019 02:01:02 +0800 Subject: [PATCH 018/153] =?UTF-8?q?bert=5Fembedding=E5=A2=9E=E5=8A=A0?= =?UTF-8?q?=E4=B8=80=E4=B8=AAauto=5Ftruncate=E7=9A=84=E5=8F=82=E6=95=B0?= =?UTF-8?q?=EF=BC=8C=E5=9C=A8word=20pieces=E9=95=BF=E5=BA=A6=E8=B6=85?= =?UTF-8?q?=E8=BF=87512=E7=9A=84=E6=83=85=E5=86=B5=E8=87=AA=E5=8A=A8?= =?UTF-8?q?=E4=BD=BF=E7=94=A80=E5=8E=BBpadding?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/embeddings/bert_embedding.py | 29 +++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/fastNLP/embeddings/bert_embedding.py 
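The DistTrainer hunk above only passes `find_unused_parameters` when torch is at least 1.1, because older `DistributedDataParallel` versions do not accept the argument. A condensed sketch of that guard, assuming `model` is already on the right device and `local_rank` has been set by `torch.distributed.launch`. Example::

    import torch
    from torch.nn.parallel import DistributedDataParallel as DDP
    from pkg_resources import parse_version

    def wrap_ddp(model, local_rank):
        # find_unused_parameters only exists from torch 1.1 onwards
        if parse_version(torch.__version__) >= parse_version('1.1'):
            return DDP(model, device_ids=[local_rank], output_device=local_rank,
                       find_unused_parameters=True)
        return DDP(model, device_ids=[local_rank], output_device=local_rank)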
b/fastNLP/embeddings/bert_embedding.py index adc205c2..38b8daf2 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -49,10 +49,13 @@ class BertEmbedding(ContextualEmbedding): :param bool pooled_cls: 返回的[CLS]是否使用预训练中的BertPool映射一下,仅在include_cls_sep时有效。如果下游任务只取[CLS]做预测, 一般该值为True。 :param bool requires_grad: 是否需要gradient以更新Bert的权重。 + :param bool auto_truncate: 当句子words拆分为word pieces长度超过bert最大允许长度(一般为512), 自动截掉拆分后的超过510个 + word pieces后的内容,并将第512个word piece置为[SEP]。超过长度的部分的encode结果直接全部置零。一般仅有只使用[CLS] + 来进行分类的任务将auto_truncate置为True。 """ def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en-base-uncased', layers: str='-1', pool_method: str='first', word_dropout=0, dropout=0, include_cls_sep: bool=False, - pooled_cls=True, requires_grad: bool=False): + pooled_cls=True, requires_grad: bool=False, auto_truncate:bool=False): super(BertEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) # 根据model_dir_or_name检查是否存在并下载 @@ -69,7 +72,7 @@ class BertEmbedding(ContextualEmbedding): self.model = _WordBertModel(model_dir=model_dir, vocab=vocab, layers=layers, pool_method=pool_method, include_cls_sep=include_cls_sep, - pooled_cls=pooled_cls) + pooled_cls=pooled_cls, auto_truncate=auto_truncate) self.requires_grad = requires_grad self._embed_size = len(self.model.layers)*self.model.encoder.hidden_size @@ -202,11 +205,12 @@ class BertWordPieceEncoder(nn.Module): class _WordBertModel(nn.Module): def __init__(self, model_dir:str, vocab:Vocabulary, layers:str='-1', pool_method:str='first', - include_cls_sep:bool=False, pooled_cls:bool=False): + include_cls_sep:bool=False, pooled_cls:bool=False, auto_truncate:bool=False): super().__init__() self.tokenzier = BertTokenizer.from_pretrained(model_dir) self.encoder = BertModel.from_pretrained(model_dir) + self._max_position_embeddings = self.encoder.config.max_position_embeddings # 检查encoder_layer_number是否合理 encoder_layer_number = len(self.encoder.encoder.layer) self.layers = list(map(int, layers.split(','))) @@ -222,6 +226,7 @@ class _WordBertModel(nn.Module): self.pool_method = pool_method self.include_cls_sep = include_cls_sep self.pooled_cls = pooled_cls + self.auto_truncate = auto_truncate # 将所有vocab中word的wordpiece计算出来, 需要额外考虑[CLS]和[SEP] print("Start to generating word pieces for word.") @@ -290,6 +295,17 @@ class _WordBertModel(nn.Module): batch_word_pieces_length = self.word_pieces_lengths[words] # batch_size x max_len word_pieces_lengths = batch_word_pieces_length.sum(dim=-1) max_word_piece_length = word_pieces_lengths.max().item() + real_max_word_piece_length = max_word_piece_length # 表示没有截断的word piece的长度 + if max_word_piece_length+2>self._max_position_embeddings: + if self.auto_truncate: + word_pieces_lengths = word_pieces_lengths.masked_fill(word_pieces_lengths+2>self._max_position_embeddings, + self._max_position_embeddings-2) + max_word_piece_length = self._max_position_embeddings-2 + else: + raise RuntimeError("After split words into word pieces, the lengths of word pieces are longer than the " + f"maximum allowed sequence length:{self._max_position_embeddings} of bert.") + + # +2是由于需要加入[CLS]与[SEP] word_pieces = words.new_full((batch_size, max_word_piece_length+2), fill_value=self._wordpiece_pad_index) word_pieces[:, 0].fill_(self._cls_index) @@ -300,6 +316,8 @@ class _WordBertModel(nn.Module): word_indexes = words.tolist() for i in range(batch_size): word_pieces_i = list(chain(*self.word_to_wordpieces[word_indexes[i]])) + if self.auto_truncate and 
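A hedged usage sketch of the `auto_truncate` and `pooled_cls` options documented in this hunk; the vocabulary content is a placeholder and the keyword names follow the signature above. Example::

    from fastNLP import Vocabulary
    from fastNLP.embeddings import BertEmbedding

    vocab = Vocabulary()
    vocab.add_word_lst("this is a very long sentence".split())
    # word pieces beyond BERT's positional limit are cut off instead of raising a RuntimeError,
    # and the [CLS] vector comes from the pretrained BertPool head
    embed = BertEmbedding(vocab, model_dir_or_name='en-base-uncased',
                          include_cls_sep=True, pooled_cls=True, auto_truncate=True)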
len(word_pieces_i)>self._max_position_embeddings-2: + word_pieces_i = word_pieces_i[:self._max_position_embeddings-2] word_pieces[i, 1:len(word_pieces_i)+1] = torch.LongTensor(word_pieces_i) attn_masks[i, :len(word_pieces_i)+2].fill_(1) # TODO 截掉长度超过的部分。 @@ -321,6 +339,11 @@ class _WordBertModel(nn.Module): batch_word_pieces_cum_length[:, 1:] = batch_word_pieces_length.cumsum(dim=-1) # batch_size x max_len for l_index, l in enumerate(self.layers): output_layer = bert_outputs[l] + if real_max_word_piece_length > max_word_piece_length: # 如果实际上是截取出来的 + paddings = output_layer.new_zeros(batch_size, + real_max_word_piece_length-max_word_piece_length, + output_layer.size(2)) + output_layer = torch.cat((output_layer, paddings), dim=1).contiguous() # 从word_piece collapse到word的表示 truncate_output_layer = output_layer[:, 1:-1] # 删除[CLS]与[SEP] batch_size x len x hidden_size outputs_seq_len = seq_len + s_shift From e7d027cdb4e15340c25b4a9e08f5b7c6ab88b49c Mon Sep 17 00:00:00 2001 From: xuyige Date: Wed, 7 Aug 2019 03:19:32 +0800 Subject: [PATCH 019/153] add tqdm bars in tester and fix some import statements --- fastNLP/core/tester.py | 59 ++++++++++++++++++++++++++++++----------- fastNLP/core/trainer.py | 10 ++++--- 2 files changed, 50 insertions(+), 19 deletions(-) diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index 067ff30c..691bf2ae 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -32,9 +32,16 @@ Tester在验证进行之前会调用model.eval()提示当前进入了evaluation """ +import time + import torch import torch.nn as nn +try: + from tqdm.auto import tqdm +except: + from .utils import _pseudo_tqdm as tqdm + from .batch import BatchIter, DataSetIter from .dataset import DataSet from .metrics import _prepare_metrics @@ -47,7 +54,7 @@ from .utils import _get_func_signature from .utils import _get_model_device from .utils import _move_model_to_device from ._parallel_utils import _data_parallel_wrapper -from fastNLP.core._parallel_utils import _model_contains_inner_module +from ._parallel_utils import _model_contains_inner_module from functools import partial __all__ = [ @@ -80,9 +87,10 @@ class Tester(object): 如果模型是通过predict()进行预测的话,那么将不能使用多卡(DataParallel)进行验证,只会使用第一张卡上的模型。 :param int verbose: 如果为0不输出任何信息; 如果为1,打印出验证结果。 + :param bool use_tqdm: 是否使用tqdm来显示测试进度; 如果为False,则不会显示任何内容。 """ - def __init__(self, data, model, metrics, batch_size=16, num_workers=0, device=None, verbose=1): + def __init__(self, data, model, metrics, batch_size=16, num_workers=0, device=None, verbose=1, use_tqdm=True): super(Tester, self).__init__() if not isinstance(model, nn.Module): @@ -94,6 +102,7 @@ class Tester(object): self._model = _move_model_to_device(model, device=device) self.batch_size = batch_size self.verbose = verbose + self.use_tqdm = use_tqdm if isinstance(data, DataSet): self.data_iterator = DataSetIter( @@ -141,21 +150,39 @@ class Tester(object): eval_results = {} try: with torch.no_grad(): - for batch_x, batch_y in data_iterator: - _move_dict_value_to_device(batch_x, batch_y, device=self._model_device) - pred_dict = self._data_forward(self._predict_func, batch_x) - if not isinstance(pred_dict, dict): - raise TypeError(f"The return value of {_get_func_signature(self._predict_func)} " - f"must be `dict`, got {type(pred_dict)}.") + if not self.use_tqdm: + from .utils import _pseudo_tqdm as inner_tqdm + else: + inner_tqdm = tqdm + with inner_tqdm(total=len(data_iterator), leave=False, dynamic_ncols=True) as pbar: + pbar.set_description_str(desc="Test") + + start_time = time.time() + + for batch_x, batch_y in 
data_iterator: + _move_dict_value_to_device(batch_x, batch_y, device=self._model_device) + pred_dict = self._data_forward(self._predict_func, batch_x) + if not isinstance(pred_dict, dict): + raise TypeError(f"The return value of {_get_func_signature(self._predict_func)} " + f"must be `dict`, got {type(pred_dict)}.") + for metric in self.metrics: + metric(pred_dict, batch_y) + + if self.use_tqdm: + pbar.update() + for metric in self.metrics: - metric(pred_dict, batch_y) - for metric in self.metrics: - eval_result = metric.get_metric() - if not isinstance(eval_result, dict): - raise TypeError(f"The return value of {_get_func_signature(metric.get_metric)} must be " - f"`dict`, got {type(eval_result)}") - metric_name = metric.__class__.__name__ - eval_results[metric_name] = eval_result + eval_result = metric.get_metric() + if not isinstance(eval_result, dict): + raise TypeError(f"The return value of {_get_func_signature(metric.get_metric)} must be " + f"`dict`, got {type(eval_result)}") + metric_name = metric.__class__.__name__ + eval_results[metric_name] = eval_result + + end_time = time.time() + test_str = f'Evaluate data in {round(end_time - start_time, 2)} seconds!' + pbar.write(test_str) + pbar.close() except _CheckError as e: prev_func_signature = _get_func_signature(self._predict_func) _check_loss_evaluate(prev_func_signature=prev_func_signature, func_signature=e.func_signature, diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 83bdb4b0..a85b7fee 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -352,7 +352,7 @@ from .utils import _move_dict_value_to_device from .utils import _get_func_signature from .utils import _get_model_device from .utils import _move_model_to_device -from fastNLP.core._parallel_utils import _model_contains_inner_module +from ._parallel_utils import _model_contains_inner_module class Trainer(object): @@ -557,7 +557,8 @@ class Trainer(object): metrics=self.metrics, batch_size=self.batch_size, device=None, # 由上面的部分处理device - verbose=0) + verbose=0, + use_tqdm=self.use_tqdm) self.step = 0 self.start_time = None # start timestamp @@ -633,7 +634,7 @@ class Trainer(object): def _train(self): if not self.use_tqdm: - from fastNLP.core.utils import _pseudo_tqdm as inner_tqdm + from .utils import _pseudo_tqdm as inner_tqdm else: inner_tqdm = tqdm self.step = 0 @@ -859,8 +860,11 @@ def _get_value_info(_dict): strs.append(_str) return strs + from numbers import Number from .batch import _to_tensor + + def _check_code(dataset, model, losser, metrics, forward_func, batch_size=DEFAULT_CHECK_BATCH_SIZE, dev_data=None, metric_key=None, check_level=0): # check get_loss 方法 From 216efb446f150a6e6d9e8e6687e363e69af9e90b Mon Sep 17 00:00:00 2001 From: wyg <1505116161@qq.com> Date: Thu, 8 Aug 2019 14:56:03 +0800 Subject: [PATCH 020/153] [verify] add data source in readme --- reproduction/text_classification/README.md | 7 +++++++ reproduction/text_classification/train_char_cnn.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/reproduction/text_classification/README.md b/reproduction/text_classification/README.md index 8bdfb9fe..96ea7a10 100644 --- a/reproduction/text_classification/README.md +++ b/reproduction/text_classification/README.md @@ -11,6 +11,13 @@ LSTM+self_attention:论文链接[A Structured Self-attentive Sentence Embedding] AWD-LSTM:论文链接[Regularizing and Optimizing LSTM Language Models](https://arxiv.org/pdf/1708.02182.pdf) +#数据集来源 +IMDB:http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz 
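A brief usage sketch for the `use_tqdm` switch added to `Tester` above; `dev_data` and `model` are placeholders prepared elsewhere. Example::

    from fastNLP import Tester, AccuracyMetric

    # dev_data is a DataSet and model a trained module, both assumed to exist already
    tester = Tester(dev_data, model, metrics=AccuracyMetric(target='target'),
                    batch_size=32, use_tqdm=True)  # set use_tqdm=False to silence the progress bar
    eval_results = tester.test()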
+SST-2:https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8 +SST:https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip +yelp_full:https://drive.google.com/drive/folders/0Bz8a_Dbh9Qhbfll6bVpmNUtUcFdjYmF2SEpmZUZUcVNiMUw1TWN6RDV3a0JHT3kxLVhVR2M +yelp_polarity:https://drive.google.com/drive/folders/0Bz8a_Dbh9Qhbfll6bVpmNUtUcFdjYmF2SEpmZUZUcVNiMUw1TWN6RDV3a0JHT3kxLVhVR2M + # 数据集及复现结果汇总 使用fastNLP复现的结果vs论文汇报结果(/前为fastNLP实现,后面为论文报道,-表示论文没有在该数据集上列出结果) diff --git a/reproduction/text_classification/train_char_cnn.py b/reproduction/text_classification/train_char_cnn.py index 0b8fc535..3482de70 100644 --- a/reproduction/text_classification/train_char_cnn.py +++ b/reproduction/text_classification/train_char_cnn.py @@ -203,7 +203,7 @@ callbacks.append( def train(model,datainfo,loss,metrics,optimizer,num_epochs=100): trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss(target='target'),batch_size=ops.batch_size, metrics=[metrics(target='target')], dev_data=datainfo.datasets['test'], device=[0,1,2], check_code_level=-1, - n_epochs=num_epochs) + n_epochs=num_epochs,callbacks=callbacks) print(trainer.train()) From 2098a81f2fad4a11c53f6347f41670c71f06bdb9 Mon Sep 17 00:00:00 2001 From: yh Date: Mon, 12 Aug 2019 00:58:57 +0800 Subject: [PATCH 021/153] =?UTF-8?q?=E4=BF=AE=E5=A4=8DBertEmbedding?= =?UTF-8?q?=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/embeddings/bert_embedding.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index 38b8daf2..80a5b45f 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -291,9 +291,10 @@ class _WordBertModel(nn.Module): :return: num_layers x batch_size x max_len x hidden_size或者num_layers x batch_size x (max_len+2) x hidden_size """ batch_size, max_word_len = words.size() - seq_len = words.ne(self._pad_index).sum(dim=-1) + word_mask = words.ne(self._pad_index) + seq_len = word_mask.sum(dim=-1) batch_word_pieces_length = self.word_pieces_lengths[words] # batch_size x max_len - word_pieces_lengths = batch_word_pieces_length.sum(dim=-1) + word_pieces_lengths = batch_word_pieces_length.masked_fill(word_mask, 0).sum(dim=-1) max_word_piece_length = word_pieces_lengths.max().item() real_max_word_piece_length = max_word_piece_length # 表示没有截断的word piece的长度 if max_word_piece_length+2>self._max_position_embeddings: @@ -319,8 +320,7 @@ class _WordBertModel(nn.Module): if self.auto_truncate and len(word_pieces_i)>self._max_position_embeddings-2: word_pieces_i = word_pieces_i[:self._max_position_embeddings-2] word_pieces[i, 1:len(word_pieces_i)+1] = torch.LongTensor(word_pieces_i) - attn_masks[i, :len(word_pieces_i)+2].fill_(1) - # TODO 截掉长度超过的部分。 + attn_masks[i, :word_pieces_lengths[i]+2].fill_(1) # 2. 获取hidden的结果,根据word_pieces进行对应的pool计算 # all_outputs: [batch_size x max_len x hidden_size, batch_size x max_len x hidden_size, ...] 
bert_outputs, pooled_cls = self.encoder(word_pieces, token_type_ids=None, attention_mask=attn_masks, From 1b661e907aa768f63f8ba60c66a9a27c45d686e2 Mon Sep 17 00:00:00 2001 From: yh Date: Mon, 12 Aug 2019 01:12:08 +0800 Subject: [PATCH 022/153] =?UTF-8?q?=E4=BF=AE=E5=A4=8DBertEmbedding?= =?UTF-8?q?=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/embeddings/bert_embedding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index 80a5b45f..9bedd983 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -294,7 +294,7 @@ class _WordBertModel(nn.Module): word_mask = words.ne(self._pad_index) seq_len = word_mask.sum(dim=-1) batch_word_pieces_length = self.word_pieces_lengths[words] # batch_size x max_len - word_pieces_lengths = batch_word_pieces_length.masked_fill(word_mask, 0).sum(dim=-1) + word_pieces_lengths = batch_word_pieces_length.masked_fill(word_mask.eq(0), 0).sum(dim=-1) max_word_piece_length = word_pieces_lengths.max().item() real_max_word_piece_length = max_word_piece_length # 表示没有截断的word piece的长度 if max_word_piece_length+2>self._max_position_embeddings: From b0c50f7299f4439f1b015ad74c2aa291e0dd798f Mon Sep 17 00:00:00 2001 From: yh Date: Mon, 12 Aug 2019 01:18:24 +0800 Subject: [PATCH 023/153] =?UTF-8?q?=E4=BF=AE=E5=A4=8DBertEmbedding?= =?UTF-8?q?=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/embeddings/bert_embedding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index 9bedd983..963ba04c 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -311,7 +311,6 @@ class _WordBertModel(nn.Module): word_pieces = words.new_full((batch_size, max_word_piece_length+2), fill_value=self._wordpiece_pad_index) word_pieces[:, 0].fill_(self._cls_index) batch_indexes = torch.arange(batch_size).to(words) - word_pieces[batch_indexes, word_pieces_lengths+1] = self._sep_index attn_masks = torch.zeros_like(word_pieces) # 1. 获取words的word_pieces的id,以及对应的span范围 word_indexes = words.tolist() @@ -320,6 +319,7 @@ class _WordBertModel(nn.Module): if self.auto_truncate and len(word_pieces_i)>self._max_position_embeddings-2: word_pieces_i = word_pieces_i[:self._max_position_embeddings-2] word_pieces[i, 1:len(word_pieces_i)+1] = torch.LongTensor(word_pieces_i) + word_pieces[i, len(word_pieces_i)+1] = self._sep_index # 补上sep attn_masks[i, :word_pieces_lengths[i]+2].fill_(1) # 2. 获取hidden的结果,根据word_pieces进行对应的pool计算 # all_outputs: [batch_size x max_len x hidden_size, batch_size x max_len x hidden_size, ...] 
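Patches 021 through 025 all revolve around the same detail: when words are padded, the padding positions must be masked out before word-piece lengths are summed, otherwise the [SEP] index and the attention mask end up at the wrong offsets. A small standalone sketch of the corrected length computation, with a toy lookup table standing in for `self.word_pieces_lengths`. Example::

    import torch

    words = torch.tensor([[3, 5, 2, 0, 0],          # batch of word ids, 0 is the pad index
                          [4, 6, 7, 8, 1]])
    word_pieces_per_word = torch.tensor([1, 1, 2, 1, 1, 1, 3, 1, 2])  # toy table indexed by word id

    word_mask = words.ne(0)                                  # True where there is a real word
    batch_word_pieces_length = word_pieces_per_word[words]   # batch_size x max_len
    # zero the entries coming from padding before summing per sentence
    word_pieces_lengths = batch_word_pieces_length.masked_fill(word_mask.eq(0), 0).sum(dim=-1)
    print(word_pieces_lengths)  # tensor([4, 8]): real word-piece length of each sentence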
From 88dafd7f9a9ce25605fd27363e58fe5ea7b066f7 Mon Sep 17 00:00:00 2001 From: yh Date: Mon, 12 Aug 2019 01:20:04 +0800 Subject: [PATCH 024/153] =?UTF-8?q?=E4=BF=AE=E5=A4=8DBertEmbedding?= =?UTF-8?q?=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/embeddings/bert_embedding.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index 963ba04c..afba9d13 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -306,11 +306,8 @@ class _WordBertModel(nn.Module): raise RuntimeError("After split words into word pieces, the lengths of word pieces are longer than the " f"maximum allowed sequence length:{self._max_position_embeddings} of bert.") - # +2是由于需要加入[CLS]与[SEP] word_pieces = words.new_full((batch_size, max_word_piece_length+2), fill_value=self._wordpiece_pad_index) - word_pieces[:, 0].fill_(self._cls_index) - batch_indexes = torch.arange(batch_size).to(words) attn_masks = torch.zeros_like(word_pieces) # 1. 获取words的word_pieces的id,以及对应的span范围 word_indexes = words.tolist() @@ -319,8 +316,11 @@ class _WordBertModel(nn.Module): if self.auto_truncate and len(word_pieces_i)>self._max_position_embeddings-2: word_pieces_i = word_pieces_i[:self._max_position_embeddings-2] word_pieces[i, 1:len(word_pieces_i)+1] = torch.LongTensor(word_pieces_i) - word_pieces[i, len(word_pieces_i)+1] = self._sep_index # 补上sep attn_masks[i, :word_pieces_lengths[i]+2].fill_(1) + # 添加[cls]和[sep] + word_pieces[:, 0].fill_(self._cls_index) + batch_indexes = torch.arange(batch_size).to(words) + word_pieces[batch_indexes, word_pieces_lengths+1] = self._sep_index # 2. 获取hidden的结果,根据word_pieces进行对应的pool计算 # all_outputs: [batch_size x max_len x hidden_size, batch_size x max_len x hidden_size, ...] 
bert_outputs, pooled_cls = self.encoder(word_pieces, token_type_ids=None, attention_mask=attn_masks, From 6b22d6d3be5211ed5f9399db7cf213f6fa5a838a Mon Sep 17 00:00:00 2001 From: yh Date: Mon, 12 Aug 2019 11:14:03 +0800 Subject: [PATCH 025/153] =?UTF-8?q?bert=20embedding=E4=BF=AE=E5=A4=8Dbug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/embeddings/bert_embedding.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index afba9d13..1fadd491 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -278,7 +278,7 @@ class _WordBertModel(nn.Module): print("Found(Or seg into word pieces) {} words out of {}.".format(found_count, len(vocab))) self._cls_index = self.tokenzier.vocab['[CLS]'] self._sep_index = self.tokenzier.vocab['[SEP]'] - self._pad_index = vocab.padding_idx + self._word_pad_index = vocab.padding_idx self._wordpiece_pad_index = self.tokenzier.vocab['[PAD]'] # 需要用于生成word_piece self.word_to_wordpieces = np.array(word_to_wordpieces) self.word_pieces_lengths = nn.Parameter(torch.LongTensor(word_pieces_lengths), requires_grad=False) @@ -291,23 +291,22 @@ class _WordBertModel(nn.Module): :return: num_layers x batch_size x max_len x hidden_size或者num_layers x batch_size x (max_len+2) x hidden_size """ batch_size, max_word_len = words.size() - word_mask = words.ne(self._pad_index) + word_mask = words.ne(self._word_pad_index) # 为1的地方有word seq_len = word_mask.sum(dim=-1) batch_word_pieces_length = self.word_pieces_lengths[words] # batch_size x max_len - word_pieces_lengths = batch_word_pieces_length.masked_fill(word_mask.eq(0), 0).sum(dim=-1) - max_word_piece_length = word_pieces_lengths.max().item() - real_max_word_piece_length = max_word_piece_length # 表示没有截断的word piece的长度 - if max_word_piece_length+2>self._max_position_embeddings: + word_pieces_lengths = batch_word_pieces_length.masked_fill(word_mask.eq(0), 0).sum(dim=-1) # batch_size + word_piece_length = batch_word_pieces_length.sum(dim=-1).max().item() # 表示word piece的长度(包括padding) + if word_piece_length+2>self._max_position_embeddings: if self.auto_truncate: word_pieces_lengths = word_pieces_lengths.masked_fill(word_pieces_lengths+2>self._max_position_embeddings, self._max_position_embeddings-2) - max_word_piece_length = self._max_position_embeddings-2 else: raise RuntimeError("After split words into word pieces, the lengths of word pieces are longer than the " f"maximum allowed sequence length:{self._max_position_embeddings} of bert.") # +2是由于需要加入[CLS]与[SEP] - word_pieces = words.new_full((batch_size, max_word_piece_length+2), fill_value=self._wordpiece_pad_index) + word_pieces = words.new_full((batch_size, min(word_piece_length+2, self._max_position_embeddings)), + fill_value=self._wordpiece_pad_index) attn_masks = torch.zeros_like(word_pieces) # 1. 获取words的word_pieces的id,以及对应的span范围 word_indexes = words.tolist() @@ -325,7 +324,7 @@ class _WordBertModel(nn.Module): # all_outputs: [batch_size x max_len x hidden_size, batch_size x max_len x hidden_size, ...] 
bert_outputs, pooled_cls = self.encoder(word_pieces, token_type_ids=None, attention_mask=attn_masks, output_all_encoded_layers=True) - # output_layers = [self.layers] # len(self.layers) x batch_size x max_word_piece_length x hidden_size + # output_layers = [self.layers] # len(self.layers) x batch_size x real_word_piece_length x hidden_size if self.include_cls_sep: outputs = bert_outputs[-1].new_zeros(len(self.layers), batch_size, max_word_len + 2, @@ -339,9 +338,10 @@ class _WordBertModel(nn.Module): batch_word_pieces_cum_length[:, 1:] = batch_word_pieces_length.cumsum(dim=-1) # batch_size x max_len for l_index, l in enumerate(self.layers): output_layer = bert_outputs[l] - if real_max_word_piece_length > max_word_piece_length: # 如果实际上是截取出来的 + real_word_piece_length = output_layer.size(1) - 2 + if word_piece_length > real_word_piece_length: # 如果实际上是截取出来的 paddings = output_layer.new_zeros(batch_size, - real_max_word_piece_length-max_word_piece_length, + word_piece_length-real_word_piece_length, output_layer.size(2)) output_layer = torch.cat((output_layer, paddings), dim=1).contiguous() # 从word_piece collapse到word的表示 From efb909d19191a2e4e95e0981df2069d1d1daf6ae Mon Sep 17 00:00:00 2001 From: yh Date: Mon, 12 Aug 2019 13:36:26 +0800 Subject: [PATCH 026/153] =?UTF-8?q?=E4=BF=AE=E5=A4=8DSpanFMetric=E4=B8=AD?= =?UTF-8?q?=E7=9A=84micro=E8=AE=A1=E7=AE=97bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index 94f50253..8dd51eb6 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -624,7 +624,7 @@ class SpanFPreRecMetric(MetricBase): f, pre, rec = self._compute_f_pre_rec(tp, fn, fp) f_sum += f pre_sum += pre - rec_sum + rec + rec_sum += rec if not self.only_gross and tag != '': # tag!=''防止无tag的情况 f_key = 'f-{}'.format(tag) pre_key = 'pre-{}'.format(tag) From 014e9786c7abbbb3c043c3a1db19e703ad338659 Mon Sep 17 00:00:00 2001 From: yh Date: Wed, 14 Aug 2019 15:35:05 +0800 Subject: [PATCH 027/153] =?UTF-8?q?1.=20=E5=88=86=E7=B1=BBDataSetLoader?= =?UTF-8?q?=E4=B8=AD=E7=9A=84Loader=E5=8A=9F=E8=83=BDPipe=E5=8A=9F?= =?UTF-8?q?=E8=83=BD;=202.=20=E5=A2=9E=E5=8A=A0=E6=95=B0=E6=8D=AE=E9=9B=86?= =?UTF-8?q?=E8=87=AA=E5=8A=A8=E4=B8=8B=E8=BD=BD;=203.=E4=BF=AE=E5=A4=8Dvoc?= =?UTF-8?q?abulary=E4=B8=AD=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .travis.yml | 3 + fastNLP/core/batch.py | 12 + fastNLP/core/const.py | 26 +- fastNLP/core/dataset.py | 33 +- fastNLP/core/field.py | 10 +- fastNLP/core/instance.py | 7 + fastNLP/core/utils.py | 21 + fastNLP/core/vocabulary.py | 63 +-- fastNLP/embeddings/__init__.py | 3 +- fastNLP/embeddings/bert_embedding.py | 16 +- fastNLP/embeddings/elmo_embedding.py | 8 +- fastNLP/embeddings/static_embedding.py | 12 +- fastNLP/io/base_loader.py | 89 +++- fastNLP/io/data_loader/conll.py | 116 +++-- fastNLP/io/data_loader/matching.py | 2 +- fastNLP/io/data_loader/mtl.py | 4 +- fastNLP/io/data_loader/sst.py | 10 +- fastNLP/io/data_loader/yelp.py | 4 +- fastNLP/io/dataset_loader.py | 22 - fastNLP/io/file_reader.py | 10 +- fastNLP/io/file_utils.py | 277 +++++++---- fastNLP/io/loader/__init__.py | 30 ++ fastNLP/io/loader/classification.py | 369 +++++++++++++++ fastNLP/io/loader/conll.py | 264 +++++++++++ fastNLP/io/loader/csv.py | 32 ++ fastNLP/io/loader/cws.py | 41 ++ fastNLP/io/loader/json.py | 40 ++ fastNLP/io/loader/loader.py 
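The one-character fix above (`rec_sum + rec` becomes `rec_sum += rec`) matters because SpanFPreRecMetric sums the per-tag scores and later averages them; without the in-place addition the aggregated recall is silently dropped. A toy sketch of that accumulation, with a simplified helper standing in for `self._compute_f_pre_rec`. Example::

    def f_pre_rec(tp, fn, fp, beta_square=1.0):
        # precision, recall and F score from true-positive, false-negative and false-positive counts
        pre = tp / (tp + fp + 1e-13)
        rec = tp / (tp + fn + 1e-13)
        f = (1 + beta_square) * pre * rec / (beta_square * pre + rec + 1e-13)
        return f, pre, rec

    counts = {'PER': (10, 2, 3), 'LOC': (7, 1, 4)}   # toy (tp, fn, fp) counts per tag
    f_sum = pre_sum = rec_sum = 0.0
    for tag, (tp, fn, fp) in counts.items():
        f, pre, rec = f_pre_rec(tp, fn, fp)
        f_sum += f
        pre_sum += pre
        rec_sum += rec        # the line the patch repairs: must be +=, not a bare +
    n_tags = len(counts)
    print(f_sum / n_tags, pre_sum / n_tags, rec_sum / n_tags)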
| 75 +++ fastNLP/io/loader/matching.py | 309 ++++++++++++ fastNLP/io/pipe/__init__.py | 8 + fastNLP/io/pipe/classification.py | 444 ++++++++++++++++++ fastNLP/io/pipe/conll.py | 149 ++++++ fastNLP/io/pipe/matching.py | 254 ++++++++++ fastNLP/io/pipe/pipe.py | 9 + fastNLP/io/pipe/utils.py | 142 ++++++ fastNLP/io/utils.py | 14 +- test/embeddings/__init__.py | 0 .../encoder => embeddings}/test_bert.py | 0 test/embeddings/test_elmo_embedding.py | 21 + test/io/loader/test_classification_loader.py | 19 + test/io/loader/test_matching_loader.py | 22 + test/io/pipe/test_classification.py | 13 + test/io/pipe/test_matching.py | 26 + 43 files changed, 2802 insertions(+), 227 deletions(-) create mode 100644 fastNLP/io/loader/__init__.py create mode 100644 fastNLP/io/loader/classification.py create mode 100644 fastNLP/io/loader/conll.py create mode 100644 fastNLP/io/loader/csv.py create mode 100644 fastNLP/io/loader/cws.py create mode 100644 fastNLP/io/loader/json.py create mode 100644 fastNLP/io/loader/loader.py create mode 100644 fastNLP/io/loader/matching.py create mode 100644 fastNLP/io/pipe/__init__.py create mode 100644 fastNLP/io/pipe/classification.py create mode 100644 fastNLP/io/pipe/conll.py create mode 100644 fastNLP/io/pipe/matching.py create mode 100644 fastNLP/io/pipe/pipe.py create mode 100644 fastNLP/io/pipe/utils.py create mode 100644 test/embeddings/__init__.py rename test/{modules/encoder => embeddings}/test_bert.py (100%) create mode 100644 test/embeddings/test_elmo_embedding.py create mode 100644 test/io/loader/test_classification_loader.py create mode 100644 test/io/loader/test_matching_loader.py create mode 100644 test/io/pipe/test_classification.py create mode 100644 test/io/pipe/test_matching.py diff --git a/.travis.yml b/.travis.yml index 210d158a..856ec9c8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,9 @@ language: python python: - "3.6" + +env + - TRAVIS=1 # command to install dependencies install: - pip install --quiet -r requirements.txt diff --git a/fastNLP/core/batch.py b/fastNLP/core/batch.py index 538f583a..8d97783e 100644 --- a/fastNLP/core/batch.py +++ b/fastNLP/core/batch.py @@ -48,6 +48,11 @@ class DataSetGetter: return len(self.dataset) def collate_fn(self, batch: list): + """ + + :param batch: [[idx1, x_dict1, y_dict1], [idx2, x_dict2, y_dict2], [xx, xx, xx]] + :return: + """ # TODO 支持在DataSet中定义collate_fn,因为有时候可能需要不同的field之间融合,比如BERT的场景 batch_x = {n:[] for n in self.inputs.keys()} batch_y = {n:[] for n in self.targets.keys()} @@ -208,6 +213,13 @@ class OnlineDataIter(BatchIter): def _to_tensor(batch, field_dtype): + """ + + :param batch: np.array() + :param field_dtype: 数据类型 + :return: batch, flag. 
如果传入的数据支持转为tensor,返回的batch就是tensor,且flag为True;如果传入的数据不支持转为tensor, + 返回的batch就是原来的数据,且flag为False + """ try: if field_dtype is not None and isinstance(field_dtype, type)\ and issubclass(field_dtype, Number) \ diff --git a/fastNLP/core/const.py b/fastNLP/core/const.py index 89ff51a2..27e8d1cb 100644 --- a/fastNLP/core/const.py +++ b/fastNLP/core/const.py @@ -7,12 +7,14 @@ class Const: 具体列表:: - INPUT 模型的序列输入 words(复数words1, words2) - CHAR_INPUT 模型character输入 chars(复数chars1, chars2) - INPUT_LEN 序列长度 seq_len(复数seq_len1,seq_len2) - OUTPUT 模型输出 pred(复数pred1, pred2) - TARGET 真实目标 target(复数target1,target2) - LOSS 损失函数 loss (复数loss1,loss2) + INPUT 模型的序列输入 words(具有多列words时,依次使用words1, words2, ) + CHAR_INPUT 模型character输入 chars(具有多列chars时,依次使用chars1, chars2) + INPUT_LEN 序列长度 seq_len(具有多列seq_len时,依次使用seq_len1,seq_len2) + OUTPUT 模型输出 pred(具有多列pred时,依次使用pred1, pred2) + TARGET 真实目标 target(具有多列target时,依次使用target1,target2) + LOSS 损失函数 loss (具有多列loss时,依次使用loss1,loss2) + RAW_WORD 原文的词 raw_words (具有多列raw_words时,依次使用raw_words1, raw_words2) + RAW_CHAR 原文的字 raw_chars (具有多列raw_chars时,依次使用raw_chars1, raw_chars2) """ INPUT = 'words' @@ -21,6 +23,8 @@ class Const: OUTPUT = 'pred' TARGET = 'target' LOSS = 'loss' + RAW_WORD = 'raw_words' + RAW_CHAR = 'raw_chars' @staticmethod def INPUTS(i): @@ -34,6 +38,16 @@ class Const: i = int(i) + 1 return Const.CHAR_INPUT + str(i) + @staticmethod + def RAW_WORDS(i): + i = int(i) + 1 + return Const.RAW_WORD + str(i) + + @staticmethod + def RAW_CHARS(i): + i = int(i) + 1 + return Const.RAW_CHAR + str(i) + @staticmethod def INPUT_LENS(i): """得到第 i 个 ``INPUT_LEN`` 的命名""" diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 2955eff6..0f98ed1f 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -291,6 +291,7 @@ import _pickle as pickle import warnings import numpy as np +from copy import deepcopy from .field import AutoPadder from .field import FieldArray @@ -298,6 +299,7 @@ from .instance import Instance from .utils import _get_func_signature from .field import AppendToTargetOrInputException from .field import SetInputOrTargetException +from .const import Const class DataSet(object): """ @@ -349,7 +351,11 @@ class DataSet(object): self.idx]) assert self.idx < len(self.dataset.field_arrays[item]), "index:{} out of range".format(self.idx) return self.dataset.field_arrays[item][self.idx] - + + def items(self): + ins = self.dataset[self.idx] + return ins.items() + def __repr__(self): return self.dataset[self.idx].__repr__() @@ -497,6 +503,7 @@ class DataSet(object): else: for field in self.field_arrays.values(): field.pop(index) + return self def delete_field(self, field_name): """ @@ -505,7 +512,22 @@ class DataSet(object): :param str field_name: 需要删除的field的名称. 
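The new `RAW_WORD`/`RAW_CHAR` constants above follow the same numbering convention as the existing `INPUTS`/`TARGETS` helpers: generated names are 1-based. A quick check of what they produce. Example::

    from fastNLP.core.const import Const

    print(Const.RAW_WORD)       # 'raw_words'
    print(Const.RAW_WORDS(0))   # 'raw_words1' -- index 0 maps to the first numbered field
    print(Const.RAW_CHARS(1))   # 'raw_chars2'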
""" self.field_arrays.pop(field_name) - + return self + + def copy_field(self, field_name, new_field_name): + """ + 深度copy名为field_name的field到new_field_name + + :param str field_name: 需要copy的field。 + :param str new_field_name: copy生成的field名称 + :return: self + """ + if not self.has_field(field_name): + raise KeyError(f"Field:{field_name} not found in DataSet.") + fieldarray = deepcopy(self.get_field(field_name)) + self.add_fieldarray(field_name=new_field_name, fieldarray=fieldarray) + return self + def has_field(self, field_name): """ 判断DataSet中是否有名为field_name这个field @@ -701,7 +723,7 @@ class DataSet(object): results.append(func(ins[field_name])) except Exception as e: if idx != -1: - print("Exception happens at the `{}`th instance.".format(idx)) + print("Exception happens at the `{}`th(from 1) instance.".format(idx+1)) raise e if not (new_field_name is None) and len(list(filter(lambda x: x is not None, results))) == 0: # all None raise ValueError("{} always return None.".format(_get_func_signature(func=func))) @@ -766,10 +788,11 @@ class DataSet(object): results = [] for idx, ins in enumerate(self._inner_iter()): results.append(func(ins)) - except Exception as e: + except BaseException as e: if idx != -1: print("Exception happens at the `{}`th instance.".format(idx)) raise e + # results = [func(ins) for ins in self._inner_iter()] if not (new_field_name is None) and len(list(filter(lambda x: x is not None, results))) == 0: # all None raise ValueError("{} always return None.".format(_get_func_signature(func=func))) @@ -779,7 +802,7 @@ class DataSet(object): return results - def add_seq_len(self, field_name:str, new_field_name='seq_len'): + def add_seq_len(self, field_name:str, new_field_name=Const.INPUT_LEN): """ 将使用len()直接对field_name中每个元素作用,将其结果作为seqence length, 并放入seq_len这个field。 diff --git a/fastNLP/core/field.py b/fastNLP/core/field.py index d7d3bb8b..65bd9be4 100644 --- a/fastNLP/core/field.py +++ b/fastNLP/core/field.py @@ -7,6 +7,7 @@ from typing import Any from abc import abstractmethod from copy import deepcopy from collections import Counter +from .utils import _is_iterable class SetInputOrTargetException(Exception): def __init__(self, msg, index=None, field_name=None): @@ -443,15 +444,6 @@ def _get_ele_type_and_dim(cell:Any, dim=0): raise SetInputOrTargetException(f"Cannot process type:{type(cell)}.") -def _is_iterable(value): - # 检查是否是iterable的, duck typing - try: - iter(value) - return True - except BaseException as e: - return False - - class Padder: """ 别名::class:`fastNLP.Padder` :class:`fastNLP.core.field.Padder` diff --git a/fastNLP/core/instance.py b/fastNLP/core/instance.py index 5408522e..9a5d9edf 100644 --- a/fastNLP/core/instance.py +++ b/fastNLP/core/instance.py @@ -35,6 +35,13 @@ class Instance(object): :param Any field: 新增field的内容 """ self.fields[field_name] = field + + def items(self): + """ + 返回一个迭代器,迭代器返回两个内容,第一个内容是field_name, 第二个内容是field_value + :return: + """ + return self.fields.items() def __getitem__(self, name): if name in self.fields: diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index 8483f9f2..4ce382f3 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -4,6 +4,7 @@ utils模块实现了 fastNLP 内部和外部所需的很多工具。其中用户 __all__ = [ "cache_results", "seq_len_to_mask", + "get_seq_len" ] import _pickle @@ -730,3 +731,23 @@ def iob2bioes(tags: List[str]) -> List[str]: else: raise TypeError("Invalid IOB format.") return new_tags + + +def _is_iterable(value): + # 检查是否是iterable的, duck typing + try: + iter(value) + return True + except BaseException as e: 
+ return False + + +def get_seq_len(words, pad_value=0): + """ + 给定batch_size x max_len的words矩阵,返回句子长度 + + :param words: batch_size x max_len + :return: (batch_size,) + """ + mask = words.ne(pad_value) + return mask.sum(dim=-1) diff --git a/fastNLP/core/vocabulary.py b/fastNLP/core/vocabulary.py index 9ce59a8c..a51c3f92 100644 --- a/fastNLP/core/vocabulary.py +++ b/fastNLP/core/vocabulary.py @@ -4,12 +4,12 @@ __all__ = [ ] from functools import wraps -from collections import Counter, defaultdict +from collections import Counter from .dataset import DataSet from .utils import Option from functools import partial import numpy as np - +from .utils import _is_iterable class VocabularyOption(Option): def __init__(self, @@ -131,11 +131,11 @@ class Vocabulary(object): """ 在新加入word时,检查_no_create_word的设置。 - :param str, List[str] word: + :param str List[str] word: :param bool no_create_entry: :return: """ - if isinstance(word, str): + if isinstance(word, str) or not _is_iterable(word): word = [word] for w in word: if no_create_entry and self.word_count.get(w, 0) == self._no_create_word.get(w, 0): @@ -257,35 +257,45 @@ class Vocabulary(object): vocab.index_dataset(train_data, dev_data, test_data, field_name='words') :param ~fastNLP.DataSet,List[~fastNLP.DataSet] datasets: 需要转index的一个或多个数据集 - :param str field_name: 需要转index的field, 若有多个 DataSet, 每个DataSet都必须有此 field. - 目前仅支持 ``str`` , ``List[str]`` , ``List[List[str]]`` - :param str new_field_name: 保存结果的field_name. 若为 ``None`` , 将覆盖原field. - Default: ``None`` + :param list,str field_name: 需要转index的field, 若有多个 DataSet, 每个DataSet都必须有此 field. + 目前支持 ``str`` , ``List[str]`` + :param list,str new_field_name: 保存结果的field_name. 若为 ``None`` , 将覆盖原field. + Default: ``None``. """ - def index_instance(ins): + def index_instance(field): """ 有几种情况, str, 1d-list, 2d-list :param ins: :return: """ - field = ins[field_name] - if isinstance(field, str): + if isinstance(field, str) or not _is_iterable(field): return self.to_index(field) - elif isinstance(field, list): - if not isinstance(field[0], list): + else: + if isinstance(field[0], str) or not _is_iterable(field[0]): return [self.to_index(w) for w in field] else: - if isinstance(field[0][0], list): + if not isinstance(field[0][0], str) and _is_iterable(field[0][0]): raise RuntimeError("Only support field with 2 dimensions.") return [[self.to_index(c) for c in w] for w in field] - - if new_field_name is None: - new_field_name = field_name + + new_field_name = new_field_name or field_name + + if type(new_field_name) == type(field_name): + if isinstance(new_field_name, list): + assert len(new_field_name) == len(field_name), "new_field_name should have same number elements with " \ + "field_name." + elif isinstance(new_field_name, str): + field_name = [field_name] + new_field_name = [new_field_name] + else: + raise TypeError("field_name and new_field_name can only be str or List[str].") + for idx, dataset in enumerate(datasets): if isinstance(dataset, DataSet): try: - dataset.apply(index_instance, new_field_name=new_field_name) + for f_n, n_f_n in zip(field_name, new_field_name): + dataset.apply_field(index_instance, field_name=f_n, new_field_name=n_f_n) except Exception as e: print("When processing the `{}` dataset, the following error occurred.".format(idx)) raise e @@ -306,9 +316,8 @@ class Vocabulary(object): :param ~fastNLP.DataSet,List[~fastNLP.DataSet] datasets: 需要转index的一个或多个数据集 :param str,List[str] field_name: 可为 ``str`` 或 ``List[str]`` . - 构建词典所使用的 field(s), 支持一个或多个field - 若有多个 DataSet, 每个DataSet都必须有这些field. 
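A usage sketch of the new `get_seq_len` helper defined just above: it counts the non-padding positions per row of a padded id matrix. Example::

    import torch
    from fastNLP.core.utils import get_seq_len

    words = torch.tensor([[2, 3, 4, 0, 0],
                          [5, 6, 0, 0, 0]])   # 0 is the padding value
    print(get_seq_len(words))                 # tensor([3, 2])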
- 目前仅支持的field结构: ``str`` , ``List[str]`` , ``list[List[str]]`` + 构建词典所使用的 field(s), 支持一个或多个field,若有多个 DataSet, 每个DataSet都必须有这些field. 目前支持的field结构 + : ``str`` , ``List[str]`` :param no_create_entry_dataset: 可以传入DataSet, List[DataSet]或者None(默认),该选项用在接下来的模型会使用pretrain 的embedding(包括glove, word2vec, elmo与bert)且会finetune的情况。如果仅使用来自于train的数据建立vocabulary,会导致test与dev 中的数据无法充分利用到来自于预训练embedding的信息,所以在建立词表的时候将test与dev考虑进来会使得最终的结果更好。 @@ -326,14 +335,14 @@ class Vocabulary(object): def construct_vocab(ins, no_create_entry=False): for fn in field_name: field = ins[fn] - if isinstance(field, str): + if isinstance(field, str) or not _is_iterable(field): self.add_word(field, no_create_entry=no_create_entry) - elif isinstance(field, (list, np.ndarray)): - if not isinstance(field[0], (list, np.ndarray)): + else: + if isinstance(field[0], str) or not _is_iterable(field[0]): for word in field: self.add_word(word, no_create_entry=no_create_entry) else: - if isinstance(field[0][0], (list, np.ndarray)): + if not isinstance(field[0][0], str) and _is_iterable(field[0][0]): raise RuntimeError("Only support field with 2 dimensions.") for words in field: for word in words: @@ -343,8 +352,8 @@ class Vocabulary(object): if isinstance(dataset, DataSet): try: dataset.apply(construct_vocab) - except Exception as e: - print("When processing the `{}` dataset, the following error occurred.".format(idx)) + except BaseException as e: + print("When processing the `{}` dataset, the following error occurred:".format(idx)) raise e else: raise TypeError("Only DataSet type is allowed.") diff --git a/fastNLP/embeddings/__init__.py b/fastNLP/embeddings/__init__.py index 2bfb2960..4f90ac63 100644 --- a/fastNLP/embeddings/__init__.py +++ b/fastNLP/embeddings/__init__.py @@ -10,6 +10,7 @@ __all__ = [ "StaticEmbedding", "ElmoEmbedding", "BertEmbedding", + "BertWordPieceEncoder", "StackEmbedding", "LSTMCharEmbedding", "CNNCharEmbedding", @@ -20,7 +21,7 @@ __all__ = [ from .embedding import Embedding from .static_embedding import StaticEmbedding from .elmo_embedding import ElmoEmbedding -from .bert_embedding import BertEmbedding +from .bert_embedding import BertEmbedding, BertWordPieceEncoder from .char_embedding import CNNCharEmbedding, LSTMCharEmbedding from .stack_embedding import StackEmbedding from .utils import get_embeddings \ No newline at end of file diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index 1fadd491..261007ae 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -8,7 +8,7 @@ import numpy as np from itertools import chain from ..core.vocabulary import Vocabulary -from ..io.file_utils import _get_base_url, cached_path, PRETRAINED_BERT_MODEL_DIR +from ..io.file_utils import _get_embedding_url, cached_path, PRETRAINED_BERT_MODEL_DIR from ..modules.encoder.bert import _WordPieceBertModel, BertModel, BertTokenizer from .contextual_embedding import ContextualEmbedding @@ -60,10 +60,8 @@ class BertEmbedding(ContextualEmbedding): # 根据model_dir_or_name检查是否存在并下载 if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR: - PRETRAIN_URL = _get_base_url('bert') - model_name = PRETRAINED_BERT_MODEL_DIR[model_dir_or_name] - model_url = PRETRAIN_URL + model_name - model_dir = cached_path(model_url) + model_url = _get_embedding_url('bert', model_dir_or_name.lower()) + model_dir = cached_path(model_url, name='embedding') # 检查是否存在 elif os.path.isdir(os.path.expanduser(os.path.abspath(model_dir_or_name))): model_dir = 
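With the reworked `from_dataset` and `index_dataset` above, a vocabulary can be built from and applied to several columns in one call by passing parallel lists of field names; the toy data below is a placeholder. Example::

    from fastNLP import DataSet, Vocabulary

    ds = DataSet({'words1': [['a', 'b'], ['b', 'c']],
                  'words2': [['c'], ['a', 'd']]})
    vocab = Vocabulary()
    vocab.from_dataset(ds, field_name=['words1', 'words2'])
    # index both columns, writing the ids into new fields of matching names
    vocab.index_dataset(ds, field_name=['words1', 'words2'],
                        new_field_name=['words1_idx', 'words2_idx'])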
os.path.expanduser(os.path.abspath(model_dir_or_name)) @@ -133,11 +131,9 @@ class BertWordPieceEncoder(nn.Module): pooled_cls: bool = False, requires_grad: bool=False): super().__init__() - if model_dir_or_name in PRETRAINED_BERT_MODEL_DIR: - PRETRAIN_URL = _get_base_url('bert') - model_name = PRETRAINED_BERT_MODEL_DIR[model_dir_or_name] - model_url = PRETRAIN_URL + model_name - model_dir = cached_path(model_url) + if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR: + model_url = _get_embedding_url('bert', model_dir_or_name.lower()) + model_dir = cached_path(model_url, name='embedding') # 检查是否存在 elif os.path.isdir(os.path.expanduser(os.path.abspath(model_dir_or_name))): model_dir = model_dir_or_name diff --git a/fastNLP/embeddings/elmo_embedding.py b/fastNLP/embeddings/elmo_embedding.py index 53adfd62..590aba74 100644 --- a/fastNLP/embeddings/elmo_embedding.py +++ b/fastNLP/embeddings/elmo_embedding.py @@ -8,7 +8,7 @@ import json import codecs from ..core.vocabulary import Vocabulary -from ..io.file_utils import cached_path, _get_base_url, PRETRAINED_ELMO_MODEL_DIR +from ..io.file_utils import cached_path, _get_embedding_url, PRETRAINED_ELMO_MODEL_DIR from ..modules.encoder._elmo import ElmobiLm, ConvTokenEmbedder from .contextual_embedding import ContextualEmbedding @@ -53,10 +53,8 @@ class ElmoEmbedding(ContextualEmbedding): # 根据model_dir_or_name检查是否存在并下载 if model_dir_or_name.lower() in PRETRAINED_ELMO_MODEL_DIR: - PRETRAIN_URL = _get_base_url('elmo') - model_name = PRETRAINED_ELMO_MODEL_DIR[model_dir_or_name] - model_url = PRETRAIN_URL + model_name - model_dir = cached_path(model_url) + model_url = _get_embedding_url('elmo', model_dir_or_name.lower()) + model_dir = cached_path(model_url, name='embedding') # 检查是否存在 elif os.path.isdir(os.path.expanduser(os.path.abspath(model_dir_or_name))): model_dir = model_dir_or_name diff --git a/fastNLP/embeddings/static_embedding.py b/fastNLP/embeddings/static_embedding.py index b78e63e8..d44d7087 100644 --- a/fastNLP/embeddings/static_embedding.py +++ b/fastNLP/embeddings/static_embedding.py @@ -7,7 +7,7 @@ import numpy as np import warnings from ..core.vocabulary import Vocabulary -from ..io.file_utils import PRETRAIN_STATIC_FILES, _get_base_url, cached_path +from ..io.file_utils import PRETRAIN_STATIC_FILES, _get_embedding_url, cached_path from .embedding import TokenEmbedding from ..modules.utils import _get_file_name_base_on_postfix @@ -60,10 +60,8 @@ class StaticEmbedding(TokenEmbedding): embedding_dim = int(embedding_dim) model_path = None elif model_dir_or_name.lower() in PRETRAIN_STATIC_FILES: - PRETRAIN_URL = _get_base_url('static') - model_name = PRETRAIN_STATIC_FILES[model_dir_or_name] - model_url = PRETRAIN_URL + model_name - model_path = cached_path(model_url) + model_url = _get_embedding_url('static', model_dir_or_name.lower()) + model_path = cached_path(model_url, name='embedding') # 检查是否存在 elif os.path.isfile(os.path.expanduser(os.path.abspath(model_dir_or_name))): model_path = model_dir_or_name @@ -84,8 +82,8 @@ class StaticEmbedding(TokenEmbedding): if lowered_word not in lowered_vocab.word_count: lowered_vocab.add_word(lowered_word) lowered_vocab._no_create_word[lowered_word] += 1 - print(f"All word in the vocab have been lowered. There are {len(vocab)} words, {len(lowered_vocab)} unique lowered " - f"words.") + print(f"All word in the vocab have been lowered before finding pretrained vectors. 
There are {len(vocab)} " + f"words, {len(lowered_vocab)} unique lowered words.") if model_path: embedding = self._load_with_vocab(model_path, vocab=lowered_vocab, init_method=init_method) else: diff --git a/fastNLP/io/base_loader.py b/fastNLP/io/base_loader.py index 5d61c16a..01232627 100644 --- a/fastNLP/io/base_loader.py +++ b/fastNLP/io/base_loader.py @@ -5,10 +5,10 @@ __all__ = [ ] import _pickle as pickle -import os from typing import Union, Dict import os from ..core.dataset import DataSet +from ..core.vocabulary import Vocabulary class BaseLoader(object): @@ -111,7 +111,10 @@ def _uncompress(src, dst): class DataBundle: """ - 经过处理的数据信息,包括一系列数据集(比如:分开的训练集、验证集和测试集)以及各个field对应的vocabulary。 + 经过处理的数据信息,包括一系列数据集(比如:分开的训练集、验证集和测试集)以及各个field对应的vocabulary。该对象一般由fastNLP中各种 + DataSetLoader的load函数生成,可以通过以下的方法获取里面的内容 + + Example:: :param vocabs: 从名称(字符串)到 :class:`~fastNLP.Vocabulary` 类型的dict :param datasets: 从名称(字符串)到 :class:`~fastNLP.DataSet` 类型的dict @@ -121,6 +124,88 @@ class DataBundle: self.vocabs = vocabs or {} self.datasets = datasets or {} + def set_vocab(self, vocab, field_name): + """ + 向DataBunlde中增加vocab + + :param Vocabulary vocab: 词表 + :param str field_name: 这个vocab对应的field名称 + :return: + """ + assert isinstance(vocab, Vocabulary), "Only fastNLP.Vocabulary supports." + self.vocabs[field_name] = vocab + + def set_dataset(self, dataset, name): + """ + + :param DataSet dataset: 传递给DataBundle的DataSet + :param str name: dataset的名称 + :return: + """ + self.datasets[name] = dataset + + def get_dataset(self, name:str): + """ + 获取名为name的dataset + + :param str name: dataset的名称,一般为'train', 'dev', 'test' + :return: DataSet + """ + return self.datasets[name] + + def get_vocab(self, field_name:str): + """ + 获取field名为field_name对应的vocab + + :param str field_name: 名称 + :return: Vocabulary + """ + return self.vocabs[field_name] + + def set_input(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True, ignore_miss_field=True): + """ + 将field_names中的field设置为input, 对data_bundle中所有的dataset执行该操作:: + + data_bundle.set_input('words', 'seq_len') # 将words和seq_len这两个field的input属性设置为True + data_bundle.set_input('words', flag=False) # 将words这个field的input属性设置为False + + :param str field_names: field的名称 + :param bool flag: 将field_name的input状态设置为flag + :param bool use_1st_ins_infer_dim_type: 如果为True,将不会check该列是否所有数据都是同样的维度,同样的类型。将直接使用第一 + 行的数据进行类型和维度推断本列的数据的类型和维度。 + :param bool ignore_miss_field: 当某个field名称在某个dataset不存在时,如果为True,则直接忽略; 如果为False,则报错 + """ + for field_name in field_names: + for name, dataset in self.datasets.items(): + if not ignore_miss_field and not dataset.has_field(field_name): + raise KeyError(f"Field:{field_name} was not found in DataSet:{name}") + if not dataset.has_field(field_name): + continue + else: + dataset.set_input(field_name, flag=flag, use_1st_ins_infer_dim_type=use_1st_ins_infer_dim_type) + + def set_target(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True, ignore_miss_field=True): + """ + 将field_names中的field设置为target, 对data_bundle中所有的dataset执行该操作:: + + data_bundle.set_target('target', 'seq_len') # 将words和target这两个field的input属性设置为True + data_bundle.set_target('target', flag=False) # 将target这个field的input属性设置为False + + :param str field_names: field的名称 + :param bool flag: 将field_name的target状态设置为flag + :param bool use_1st_ins_infer_dim_type: 如果为True,将不会check该列是否所有数据都是同样的维度,同样的类型。将直接使用第一 + 行的数据进行类型和维度推断本列的数据的类型和维度。 + :param bool ignore_miss_field: 当某个field名称在某个dataset不存在时,如果为True,则直接忽略; 如果为False,则报错 + """ + for field_name in field_names: + for name, dataset in 
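A brief usage sketch of the DataBundle helpers introduced above, assuming `data_bundle` was produced by one of the loaders; the field names are the conventional Const names and serve as placeholders here. Example::

    # data_bundle is assumed to come from a loader, e.g. ConllLoader(...).load('/path/to/dir')
    train_ds = data_bundle.get_dataset('train')     # same as data_bundle.datasets['train']
    data_bundle.set_input('words', 'seq_len')       # mark input fields on every dataset in the bundle
    data_bundle.set_target('target')                # mark target fields likewise
    word_vocab = data_bundle.get_vocab('words')     # vocabulary registered under 'words', if any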
self.datasets.items(): + if not ignore_miss_field and not dataset.has_field(field_name): + raise KeyError(f"Field:{field_name} was not found in DataSet:{name}") + if not dataset.has_field(field_name): + continue + else: + dataset.set_target(field_name, flag=flag, use_1st_ins_infer_dim_type=use_1st_ins_infer_dim_type) + def __repr__(self): _str = 'In total {} datasets:\n'.format(len(self.datasets)) for name, dataset in self.datasets.items(): diff --git a/fastNLP/io/data_loader/conll.py b/fastNLP/io/data_loader/conll.py index 9b2402a2..0285173c 100644 --- a/fastNLP/io/data_loader/conll.py +++ b/fastNLP/io/data_loader/conll.py @@ -3,38 +3,47 @@ from ...core.dataset import DataSet from ...core.instance import Instance from ..base_loader import DataSetLoader from ..file_reader import _read_conll - +from typing import Union, Dict +from ..utils import check_loader_paths +from ..base_loader import DataBundle class ConllLoader(DataSetLoader): """ 别名::class:`fastNLP.io.ConllLoader` :class:`fastNLP.io.data_loader.ConllLoader` - 读取Conll格式的数据. 数据格式详见 http://conll.cemantix.org/2012/data.html. 数据中以"-DOCSTART-"开头的行将被忽略,因为 - 该符号在conll 2003中被用为文档分割符。 - - 列号从0开始, 每列对应内容为:: - - Column Type - 0 Document ID - 1 Part number - 2 Word number - 3 Word itself - 4 Part-of-Speech - 5 Parse bit - 6 Predicate lemma - 7 Predicate Frameset ID - 8 Word sense - 9 Speaker/Author - 10 Named Entities - 11:N Predicate Arguments - N Coreference - - :param headers: 每一列数据的名称,需为List or Tuple of str。``header`` 与 ``indexes`` 一一对应 - :param indexes: 需要保留的数据列下标,从0开始。若为 ``None`` ,则所有列都保留。Default: ``None`` - :param dropna: 是否忽略非法数据,若 ``False`` ,遇到非法数据时抛出 ``ValueError`` 。Default: ``False`` + 该ConllLoader支持读取的数据格式: 以空行隔开两个sample,除了分割行,每一行用空格或者制表符隔开不同的元素。如下例所示: + + Example:: + + # 文件中的内容 + Nadim NNP B-NP B-PER + Ladki NNP I-NP I-PER + + AL-AIN NNP B-NP B-LOC + United NNP B-NP B-LOC + Arab NNP I-NP I-LOC + Emirates NNPS I-NP I-LOC + 1996-12-06 CD I-NP O + ... 
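Before continuing with the loader examples below, here is a minimal usage sketch for the DataBundle helpers added above (set_dataset/set_vocab/get_dataset/get_vocab/set_input/set_target). The names train_data and word_vocab are hypothetical: a DataSet that already has 'words', 'seq_len' and 'target' fields, and a Vocabulary built over it::

    from fastNLP.io.base_loader import DataBundle

    data_bundle = DataBundle(datasets={'train': train_data})   # train_data: hypothetical DataSet
    data_bundle.set_vocab(word_vocab, 'words')                 # word_vocab: hypothetical Vocabulary
    data_bundle.set_input('words', 'seq_len')                  # flag these fields as input on every DataSet
    data_bundle.set_target('target')                           # flag this field as target on every DataSet
    print(data_bundle.get_dataset('train'))
    print(data_bundle.get_vocab('words'))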
+ + # 如果用以下的参数读取,返回的DataSet将包含raw_words和pos两个field, 这两个field的值分别取自于第0列与第1列 + dataset = ConllLoader(headers=['raw_words', 'pos'], indexes=[0, 1])._load('/path/to/train.conll') + # 如果用以下的参数读取,返回的DataSet将包含raw_words和ner两个field, 这两个field的值分别取自于第0列与第2列 + dataset = ConllLoader(headers=['raw_words', 'ner'], indexes=[0, 3])._load('/path/to/train.conll') + # 如果用以下的参数读取,返回的DataSet将包含raw_words, pos和ner三个field + dataset = ConllLoader(headers=['raw_words', 'pos', 'ner'], indexes=[0, 1, 3])._load('/path/to/train.conll') + + dataset = ConllLoader(headers=['raw_words', 'pos'], indexes=[0, 1])._load('/path/to/train.conll')中DataSet的raw_words + 列与pos列的内容都是List[str] + + 数据中以"-DOCSTART-"开头的行将被忽略,因为该符号在conll 2003中被用为文档分割符。 + + :param list headers: 每一列数据的名称,需为List or Tuple of str。``header`` 与 ``indexes`` 一一对应 + :param list indexes: 需要保留的数据列下标,从0开始。若为 ``None`` ,则所有列都保留。Default: ``None`` + :param bool dropna: 是否忽略非法数据,若 ``False`` ,遇到非法数据时抛出 ``ValueError`` 。Default: ``True`` """ - def __init__(self, headers, indexes=None, dropna=False): + def __init__(self, headers, indexes=None, dropna=True): super(ConllLoader, self).__init__() if not isinstance(headers, (list, tuple)): raise TypeError( @@ -49,25 +58,74 @@ class ConllLoader(DataSetLoader): self.indexes = indexes def _load(self, path): + """ + 传入的一个文件路径,将该文件读入DataSet中,field由Loader初始化时指定的headers决定。 + + :param str path: 文件的路径 + :return: DataSet + """ ds = DataSet() for idx, data in _read_conll(path, indexes=self.indexes, dropna=self.dropna): ins = {h: data[i] for i, h in enumerate(self.headers)} ds.append(Instance(**ins)) return ds + def load(self, paths: Union[str, Dict[str, str]]) -> DataBundle: + """ + 从指定一个或多个路径中的文件中读取数据,返回:class:`~fastNLP.io.DataBundle` 。 + + 读取的field根据ConllLoader初始化时传入的headers决定。 + + :param Union[str, Dict[str, str]] paths: 支持以下的几种输入方式 + (1) 传入一个目录, 该目录下名称包含train的被认为是train,包含test的被认为是test,包含dev的被认为是dev,如果检测到多个文件 + 名包含'train'、 'dev'、 'test'则会报错 + + Example:: + data_bundle = ConllLoader().load('/path/to/dir') # 返回的DataBundle中datasets根据目录下是否检测到train, dev, test等有所变化 + # 可以通过以下的方式取出DataSet + tr_data = data_bundle.datasets['train'] + te_data = data_bundle.datasets['test'] # 如果目录下有文件包含test这个字段 + + (2) 传入文件path + + Example:: + data_bundle = ConllLoader().load("/path/to/a/train.conll") # 返回DataBundle对象, datasets中仅包含'train' + tr_data = data_bundle.datasets['train'] # 可以通过以下的方式取出DataSet + + (3) 传入一个dict,比如train,dev,test不在同一个目录下,或者名称中不包含train, dev, test + + Example:: + paths = {'train':"/path/to/tr.conll", 'dev':"/to/validate.conll", "test":"/to/te.conll"} + data_bundle = ConllLoader().load(paths) # 返回的DataBundle中的dataset中包含"train", "dev", "test" + dev_data = data_bundle.datasets['dev'] + + :return: :class:`~fastNLP.DataSet` 类的对象或 :class:`~fastNLP.io.DataBundle` 的字典 + """ + paths = check_loader_paths(paths) + datasets = {name: self._load(path) for name, path in paths.items()} + data_bundle = DataBundle(datasets=datasets) + return data_bundle + class Conll2003Loader(ConllLoader): """ 别名::class:`fastNLP.io.Conll2003Loader` :class:`fastNLP.io.data_loader.Conll2003Loader` - 读取Conll2003数据 + 该Loader用以读取Conll2003数据,conll2003的数据可以在https://github.com/davidsbatista/NER-datasets/tree/master/CONLL2003 + 找到。数据中以"-DOCSTART-"开头的行将被忽略,因为该符号在conll 2003中被用为文档分割符。 + + 返回的DataSet将具有以下['raw_words', 'pos', 'chunks', 'ner']四个field, 每个field中的内容都是List[str]。 + + .. csv-table:: Conll2003Loader处理之 :header: "raw_words", "words", "target", "seq_len" + + "[Nadim, Ladki]", "[1, 2]", "[1, 2]", 2 + "[AL-AIN, United, Arab, ...]", "[3, 4, 5,...]", "[3, 4]", 5 + "[...]", "[...]", "[...]", . 
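A usage sketch for this loader; the directory path is hypothetical and only needs to contain files whose names include 'train', 'dev' or 'test'::

    from fastNLP.io.data_loader.conll import Conll2003Loader

    data_bundle = Conll2003Loader().load('/path/to/conll2003')   # hypothetical directory
    train_data = data_bundle.get_dataset('train')                # fields: raw_words, pos, chunks, ner
    print(train_data[:2])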
- 关于数据集的更多信息,参考: - https://sites.google.com/site/ermasoftware/getting-started/ne-tagging-conll2003-data """ def __init__(self): headers = [ - 'tokens', 'pos', 'chunks', 'ner', + 'raw_words', 'pos', 'chunks', 'ner', ] super(Conll2003Loader, self).__init__(headers=headers) diff --git a/fastNLP/io/data_loader/matching.py b/fastNLP/io/data_loader/matching.py index 481b5056..1242b432 100644 --- a/fastNLP/io/data_loader/matching.py +++ b/fastNLP/io/data_loader/matching.py @@ -121,7 +121,7 @@ class MatchingLoader(DataSetLoader): PRETRAIN_URL = _get_base_url('bert') model_name = PRETRAINED_BERT_MODEL_DIR[bert_tokenizer] model_url = PRETRAIN_URL + model_name - model_dir = cached_path(model_url) + model_dir = cached_path(model_url, name='embedding') # 检查是否存在 elif os.path.isdir(bert_tokenizer): model_dir = bert_tokenizer diff --git a/fastNLP/io/data_loader/mtl.py b/fastNLP/io/data_loader/mtl.py index cbca413d..20824958 100644 --- a/fastNLP/io/data_loader/mtl.py +++ b/fastNLP/io/data_loader/mtl.py @@ -5,7 +5,7 @@ from ..base_loader import DataBundle from ..dataset_loader import CSVLoader from ...core.vocabulary import Vocabulary, VocabularyOption from ...core.const import Const -from ..utils import check_dataloader_paths +from ..utils import check_loader_paths class MTL16Loader(CSVLoader): @@ -38,7 +38,7 @@ class MTL16Loader(CSVLoader): src_vocab_opt: VocabularyOption = None, tgt_vocab_opt: VocabularyOption = None,): - paths = check_dataloader_paths(paths) + paths = check_loader_paths(paths) datasets = {} info = DataBundle() for name, path in paths.items(): diff --git a/fastNLP/io/data_loader/sst.py b/fastNLP/io/data_loader/sst.py index 6c06a9ce..c2e0eca1 100644 --- a/fastNLP/io/data_loader/sst.py +++ b/fastNLP/io/data_loader/sst.py @@ -8,7 +8,7 @@ from ...core.vocabulary import VocabularyOption, Vocabulary from ...core.dataset import DataSet from ...core.const import Const from ...core.instance import Instance -from ..utils import check_dataloader_paths, get_tokenizer +from ..utils import check_loader_paths, get_tokenizer class SSTLoader(DataSetLoader): @@ -67,7 +67,7 @@ class SSTLoader(DataSetLoader): paths, train_subtree=True, src_vocab_op: VocabularyOption = None, tgt_vocab_op: VocabularyOption = None,): - paths = check_dataloader_paths(paths) + paths = check_loader_paths(paths) input_name, target_name = 'words', 'target' src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op) tgt_vocab = Vocabulary(unknown=None, padding=None) \ @@ -129,7 +129,7 @@ class SST2Loader(CSVLoader): tgt_vocab_opt: VocabularyOption = None, char_level_op=False): - paths = check_dataloader_paths(paths) + paths = check_loader_paths(paths) datasets = {} info = DataBundle() for name, path in paths.items(): @@ -155,7 +155,9 @@ class SST2Loader(CSVLoader): for dataset in datasets.values(): dataset.apply_field(wordtochar, field_name=Const.INPUT, new_field_name=Const.CHAR_INPUT) src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt) - src_vocab.from_dataset(datasets['train'], field_name=Const.INPUT) + src_vocab.from_dataset(datasets['train'], field_name=Const.INPUT, no_create_entry_dataset=[ + dataset for name, dataset in datasets.items() if name!='train' + ]) src_vocab.index_dataset(*datasets.values(), field_name=Const.INPUT) tgt_vocab = Vocabulary(unknown=None, padding=None) \ diff --git a/fastNLP/io/data_loader/yelp.py b/fastNLP/io/data_loader/yelp.py index 333fcab0..15533b04 100644 --- a/fastNLP/io/data_loader/yelp.py +++ b/fastNLP/io/data_loader/yelp.py @@ -8,7 +8,7 @@ 
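As a side note on the SST2Loader change above, a hedged sketch of how no_create_entry_dataset is intended to be used when building the source vocabulary; datasets is assumed to be a dict of DataSets keyed by split name, each with a 'words' field::

    from fastNLP.core.vocabulary import Vocabulary
    from fastNLP.core.const import Const

    src_vocab = Vocabulary()
    # words seen only in dev/test are added to the vocab but marked as "no create entry",
    # so downstream embeddings can avoid allocating trainable rows for them
    src_vocab.from_dataset(datasets['train'], field_name=Const.INPUT,
                           no_create_entry_dataset=[ds for name, ds in datasets.items()
                                                    if name != 'train'])
    src_vocab.index_dataset(*datasets.values(), field_name=Const.INPUT)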
from ...core.instance import Instance from ...core.vocabulary import VocabularyOption, Vocabulary from ..base_loader import DataBundle, DataSetLoader from typing import Union, Dict -from ..utils import check_dataloader_paths, get_tokenizer +from ..utils import check_loader_paths, get_tokenizer class YelpLoader(DataSetLoader): @@ -62,7 +62,7 @@ class YelpLoader(DataSetLoader): src_vocab_op: VocabularyOption = None, tgt_vocab_op: VocabularyOption = None, char_level_op=False): - paths = check_dataloader_paths(paths) + paths = check_loader_paths(paths) info = DataBundle(datasets=self.load(paths)) src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op) tgt_vocab = Vocabulary(unknown=None, padding=None) \ diff --git a/fastNLP/io/dataset_loader.py b/fastNLP/io/dataset_loader.py index ad6bbdc1..3e3ac575 100644 --- a/fastNLP/io/dataset_loader.py +++ b/fastNLP/io/dataset_loader.py @@ -114,25 +114,3 @@ def _cut_long_sentence(sent, max_sample_length=200): else: cutted_sentence.append(sent) return cutted_sentence - - -def _add_seg_tag(data): - """ - - :param data: list of ([word], [pos], [heads], [head_tags]) - :return: list of ([word], [pos]) - """ - - _processed = [] - for word_list, pos_list, _, _ in data: - new_sample = [] - for word, pos in zip(word_list, pos_list): - if len(word) == 1: - new_sample.append((word, 'S-' + pos)) - else: - new_sample.append((word[0], 'B-' + pos)) - for c in word[1:-1]: - new_sample.append((c, 'M-' + pos)) - new_sample.append((word[-1], 'E-' + pos)) - _processed.append(list(map(list, zip(*new_sample)))) - return _processed diff --git a/fastNLP/io/file_reader.py b/fastNLP/io/file_reader.py index 0ae0a319..6aa89b80 100644 --- a/fastNLP/io/file_reader.py +++ b/fastNLP/io/file_reader.py @@ -2,7 +2,7 @@ 此模块用于给其它模块提供读取文件的函数,没有为用户提供 API """ import json - +import warnings def _read_csv(path, encoding='utf-8', headers=None, sep=',', dropna=True): """ @@ -91,7 +91,7 @@ def _read_conll(path, encoding='utf-8', indexes=None, dropna=True): with open(path, 'r', encoding=encoding) as f: sample = [] start = next(f).strip() - if '-DOCSTART-' not in start and start!='': + if start!='': sample.append(start.split()) for line_idx, line in enumerate(f, 1): line = line.strip() @@ -103,13 +103,13 @@ def _read_conll(path, encoding='utf-8', indexes=None, dropna=True): yield line_idx, res except Exception as e: if dropna: + warnings.warn('Invalid instance ends at line: {} has been dropped.'.format(line_idx)) continue - raise ValueError('invalid instance ends at line: {}'.format(line_idx)) + raise ValueError('Invalid instance ends at line: {}'.format(line_idx)) elif line.startswith('#'): continue else: - if not line.startswith('-DOCSTART-'): - sample.append(line.split()) + sample.append(line.split()) if len(sample) > 0: try: res = parse_conll(sample) diff --git a/fastNLP/io/file_utils.py b/fastNLP/io/file_utils.py index 4be1360b..b465ed9b 100644 --- a/fastNLP/io/file_utils.py +++ b/fastNLP/io/file_utils.py @@ -7,7 +7,7 @@ import requests import tempfile from tqdm import tqdm import shutil -import hashlib +from requests import HTTPError PRETRAINED_BERT_MODEL_DIR = { @@ -23,15 +23,25 @@ PRETRAINED_BERT_MODEL_DIR = { 'cn': 'bert-base-chinese-29d0a84a.zip', 'cn-base': 'bert-base-chinese-29d0a84a.zip', - - 'multilingual': 'bert-base-multilingual-cased.zip', - 'multilingual-base-uncased': 'bert-base-multilingual-uncased.zip', - 'multilingual-base-cased': 'bert-base-multilingual-cased.zip', + 'bert-base-chinese': 'bert-base-chinese.zip', + 'bert-base-cased': 
'bert-base-cased.zip', + 'bert-base-cased-finetuned-mrpc': 'bert-base-cased-finetuned-mrpc.zip', + 'bert-large-cased-wwm': 'bert-large-cased-wwm.zip', + 'bert-large-uncased': 'bert-large-uncased.zip', + 'bert-large-cased': 'bert-large-cased.zip', + 'bert-base-uncased': 'bert-base-uncased.zip', + 'bert-large-uncased-wwm': 'bert-large-uncased-wwm.zip', + 'bert-chinese-wwm': 'bert-chinese-wwm.zip', + 'bert-base-multilingual-cased': 'bert-base-multilingual-cased.zip', + 'bert-base-multilingual-uncased': 'bert-base-multilingual-uncased.zip', } PRETRAINED_ELMO_MODEL_DIR = { 'en': 'elmo_en-d39843fe.tar.gz', - 'en-small': "elmo_en_Small.zip" + 'en-small': "elmo_en_Small.zip", + 'en-original-5.5b': 'elmo_en_Original_5.5B.zip', + 'en-original': 'elmo_en_Original.zip', + 'en-medium': 'elmo_en_Medium.zip' } PRETRAIN_STATIC_FILES = { @@ -42,34 +52,68 @@ PRETRAIN_STATIC_FILES = { 'en-fasttext-wiki': "wiki-news-300d-1M.vec.zip", 'cn': "tencent_cn-dab24577.tar.gz", 'cn-fasttext': "cc.zh.300.vec-d68a9bcf.gz", + 'sgns-literature-word':'sgns.literature.word.txt.zip', + 'glove-42b-300d': 'glove.42B.300d.zip', + 'glove-6b-50d': 'glove.6B.50d.zip', + 'glove-6b-100d': 'glove.6B.100d.zip', + 'glove-6b-200d': 'glove.6B.200d.zip', + 'glove-6b-300d': 'glove.6B.300d.zip', + 'glove-840b-300d': 'glove.840B.300d.zip', + 'glove-twitter-27b-25d': 'glove.twitter.27B.25d.zip', + 'glove-twitter-27b-50d': 'glove.twitter.27B.50d.zip', + 'glove-twitter-27b-100d': 'glove.twitter.27B.100d.zip', + 'glove-twitter-27b-200d': 'glove.twitter.27B.200d.zip' +} + + +DATASET_DIR = { + 'aclImdb': "imdb.zip", + "yelp-review-full":"yelp_review_full.tar.gz", + "yelp-review-polarity": "yelp_review_polarity.tar.gz", + "mnli": "MNLI.zip", + "snli": "SNLI.zip", + "qnli": "QNLI.zip", + "sst-2": "SST-2.zip", + "sst": "SST.zip", + "rte": "RTE.zip" } -def cached_path(url_or_filename: str, cache_dir: Path=None) -> Path: +def cached_path(url_or_filename:str, cache_dir:str=None, name=None) -> Path: """ - 给定一个url或者文件名(可以是具体的文件名,也可以是文件),先在cache_dir下寻找该文件是否存在,如果不存在则去下载, 并 + 给定一个url,尝试通过url中的解析出来的文件名字filename到{cache_dir}/{name}/{filename}下寻找这个文件, + (1)如果cache_dir=None, 则cache_dir=~/.fastNLP/; 否则cache_dir=cache_dir + (2)如果name=None, 则没有中间的{name}这一层结构;否者中间结构就为{name} + 如果有该文件,就直接返回路径 + 如果没有该文件,则尝试用传入的url下载 + + 或者文件名(可以是具体的文件名,也可以是文件夹),先在cache_dir下寻找该文件是否存在,如果不存在则去下载, 并 将文件放入到cache_dir中. - :param url_or_filename: 文件的下载url或者文件路径 - :param cache_dir: 文件的缓存文件夹 + :param str url_or_filename: 文件的下载url或者文件名称。 + :param str cache_dir: 文件的缓存文件夹。如果为None,将使用"~/.fastNLP"这个默认路径 + :param str name: 中间一层的名称。如embedding, dataset :return: """ if cache_dir is None: - dataset_cache = Path(get_default_cache_path()) + data_cache = Path(get_default_cache_path()) else: - dataset_cache = cache_dir + data_cache = cache_dir + + if name: + data_cache = os.path.join(data_cache, name) parsed = urlparse(url_or_filename) if parsed.scheme in ("http", "https"): # URL, so get it from the cache (downloading if necessary) - return get_from_cache(url_or_filename, dataset_cache) - elif parsed.scheme == "" and Path(os.path.join(dataset_cache, url_or_filename)).exists(): + return get_from_cache(url_or_filename, Path(data_cache)) + elif parsed.scheme == "" and Path(os.path.join(data_cache, url_or_filename)).exists(): # File, and it exists. - return Path(url_or_filename) + return Path(os.path.join(data_cache, url_or_filename)) elif parsed.scheme == "": # File, but it doesn't exist. 
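        # Usage sketch (hedged; the concrete names are only examples): with the new 'name'
        # argument, a download is cached under {cache_dir}/{name}/, e.g.
        #     model_dir = cached_path(_get_embedding_url('bert', 'en-base-uncased'), name='embedding')
        #     data_dir = cached_path(_get_dataset_url('sst-2'), name='dataset')
        # Both default to ~/.fastNLP/ unless the FASTNLP_CACHE_DIR environment variable is set.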
- raise FileNotFoundError("file {} not found".format(url_or_filename)) + raise FileNotFoundError("file {} not found in {}.".format(url_or_filename, data_cache)) else: # Something unknown raise ValueError( @@ -79,8 +123,12 @@ def cached_path(url_or_filename: str, cache_dir: Path=None) -> Path: def get_filepath(filepath): """ - 如果filepath中只有一个文件,则直接返回对应的全路径. - :param filepath: + 如果filepath为文件夹, + 如果内含多个文件, 返回filepath + 如果只有一个文件, 返回filepath + filename + 如果filepath为文件 + 返回filepath + :param str filepath: 路径 :return: """ if os.path.isdir(filepath): @@ -89,14 +137,17 @@ def get_filepath(filepath): return os.path.join(filepath, files[0]) else: return filepath - return filepath + elif os.path.isfile(filepath): + return filepath + else: + raise FileNotFoundError(f"{filepath} is not a valid file or directory.") def get_default_cache_path(): """ 获取默认的fastNLP存放路径, 如果将FASTNLP_CACHE_PATH设置在了环境变量中,将使用环境变量的值,使得不用每个用户都去下载。 - :return: + :return: str """ if 'FASTNLP_CACHE_DIR' in os.environ: fastnlp_cache_dir = os.environ.get('FASTNLP_CACHE_DIR') @@ -109,17 +160,66 @@ def get_default_cache_path(): def _get_base_url(name): + """ + 根据name返回下载的url地址。 + + :param str name: 支持dataset和embedding两种 + :return: + """ # 返回的URL结尾必须是/ - if 'FASTNLP_BASE_URL' in os.environ: - fastnlp_base_url = os.environ['FASTNLP_BASE_URL'] - if fastnlp_base_url.endswith('/'): - return fastnlp_base_url + environ_name = "FASTNLP_{}_URL".format(name.upper()) + + if environ_name in os.environ: + url = os.environ[environ_name] + if url.endswith('/'): + return url else: - return fastnlp_base_url + '/' + return url + '/' else: - # TODO 替换 - dbbrain_url = "http://dbcloud.irocn.cn:8989/api/public/dl/" - return dbbrain_url + URLS = { + 'embedding': "http://dbcloud.irocn.cn:8989/api/public/dl/", + "dataset": "http://dbcloud.irocn.cn:8989/api/public/dl/dataset/" + } + if name.lower() not in URLS: + raise KeyError(f"{name} is not recognized.") + return URLS[name.lower()] + + +def _get_embedding_url(type, name): + """ + 给定embedding类似和名称,返回下载url + + :param str type: 支持static, bert, elmo。即embedding的类型 + :param str name: embedding的名称, 例如en, cn, based等 + :return: str, 下载的url地址 + """ + PRETRAIN_MAP = {'elmo': PRETRAINED_ELMO_MODEL_DIR, + "bert": PRETRAINED_BERT_MODEL_DIR, + "static":PRETRAIN_STATIC_FILES} + map = PRETRAIN_MAP.get(type, None) + if map: + filename = map.get(name, None) + if filename: + url = _get_base_url('embedding') + filename + return url + raise KeyError("There is no {}. Only supports {}.".format(name, list(map.keys()))) + else: + raise KeyError(f"There is no {type}. 
Only supports bert, elmo, static") + + +def _get_dataset_url(name): + """ + 给定dataset的名称,返回下载url + + :param str name: 给定dataset的名称,比如imdb, sst-2等 + :return: str + """ + filename = DATASET_DIR.get(name, None) + if filename: + url = _get_base_url('dataset') + filename + return url + else: + raise KeyError(f"There is no {name}.") def split_filename_suffix(filepath): @@ -136,9 +236,9 @@ def split_filename_suffix(filepath): def get_from_cache(url: str, cache_dir: Path = None) -> Path: """ - 尝试在cache_dir中寻找url定义的资源; 如果没有找到。则从url下载并将结果放在cache_dir下,缓存的名称由url的结果推断而来。 - 如果从url中下载的资源解压后有多个文件,则返回directory的路径; 如果只有一个资源,则返回具体的路径。 - + 尝试在cache_dir中寻找url定义的资源; 如果没有找到; 则从url下载并将结果放在cache_dir下,缓存的名称由url的结果推断而来。会将下载的 + 文件解压,将解压后的文件全部放在cache_dir文件夹中。 + 如果从url中下载的资源解压后有多个文件,则返回目录的路径; 如果只有一个资源文件,则返回具体的路径。 """ cache_dir.mkdir(parents=True, exist_ok=True) @@ -173,63 +273,68 @@ def get_from_cache(url: str, cache_dir: Path = None) -> Path: # GET file object req = requests.get(url, stream=True, headers={"User-Agent": "fastNLP"}) - content_length = req.headers.get("Content-Length") - total = int(content_length) if content_length is not None else None - progress = tqdm(unit="B", total=total) - with open(temp_filename, "wb") as temp_file: - for chunk in req.iter_content(chunk_size=1024): - if chunk: # filter out keep-alive new chunks - progress.update(len(chunk)) - temp_file.write(chunk) - progress.close() - print(f"Finish download from {url}.") - - # 开始解压 - delete_temp_dir = None - if suffix in ('.zip', '.tar.gz'): - uncompress_temp_dir = tempfile.mkdtemp() - delete_temp_dir = uncompress_temp_dir - print(f"Start to uncompress file to {uncompress_temp_dir}") - if suffix == '.zip': - unzip_file(Path(temp_filename), Path(uncompress_temp_dir)) + if req.status_code==200: + content_length = req.headers.get("Content-Length") + total = int(content_length) if content_length is not None else None + progress = tqdm(unit="B", total=total, unit_scale=1) + with open(temp_filename, "wb") as temp_file: + for chunk in req.iter_content(chunk_size=1024*16): + if chunk: # filter out keep-alive new chunks + progress.update(len(chunk)) + temp_file.write(chunk) + progress.close() + print(f"Finish download from {url}.") + + # 开始解压 + delete_temp_dir = None + if suffix in ('.zip', '.tar.gz'): + uncompress_temp_dir = tempfile.mkdtemp() + delete_temp_dir = uncompress_temp_dir + print(f"Start to uncompress file to {uncompress_temp_dir}") + if suffix == '.zip': + unzip_file(Path(temp_filename), Path(uncompress_temp_dir)) + else: + untar_gz_file(Path(temp_filename), Path(uncompress_temp_dir)) + filenames = os.listdir(uncompress_temp_dir) + if len(filenames)==1: + if os.path.isdir(os.path.join(uncompress_temp_dir, filenames[0])): + uncompress_temp_dir = os.path.join(uncompress_temp_dir, filenames[0]) + + cache_path.mkdir(parents=True, exist_ok=True) + print("Finish un-compressing file.") else: - untar_gz_file(Path(temp_filename), Path(uncompress_temp_dir)) - filenames = os.listdir(uncompress_temp_dir) - if len(filenames)==1: - if os.path.isdir(os.path.join(uncompress_temp_dir, filenames[0])): - uncompress_temp_dir = os.path.join(uncompress_temp_dir, filenames[0]) - - cache_path.mkdir(parents=True, exist_ok=True) - print("Finish un-compressing file.") + uncompress_temp_dir = temp_filename + cache_path = str(cache_path) + suffix + success = False + try: + # 复制到指定的位置 + print(f"Copy file to {cache_path}") + if os.path.isdir(uncompress_temp_dir): + for filename in os.listdir(uncompress_temp_dir): + if os.path.isdir(os.path.join(uncompress_temp_dir, 
filename)): + shutil.copytree(os.path.join(uncompress_temp_dir, filename), cache_path/filename) + else: + shutil.copyfile(os.path.join(uncompress_temp_dir, filename), cache_path/filename) + else: + shutil.copyfile(uncompress_temp_dir, cache_path) + success = True + except Exception as e: + print(e) + raise e + finally: + if not success: + if cache_path.exists(): + if cache_path.is_file(): + os.remove(cache_path) + else: + shutil.rmtree(cache_path) + if delete_temp_dir: + shutil.rmtree(delete_temp_dir) + os.close(fd) + os.remove(temp_filename) + return get_filepath(cache_path) else: - uncompress_temp_dir = temp_filename - cache_path = str(cache_path) + suffix - success = False - try: - # 复制到指定的位置 - print(f"Copy file to {cache_path}") - if os.path.isdir(uncompress_temp_dir): - for filename in os.listdir(uncompress_temp_dir): - shutil.copyfile(os.path.join(uncompress_temp_dir, filename), cache_path/filename) - else: - shutil.copyfile(uncompress_temp_dir, cache_path) - success = True - except Exception as e: - print(e) - raise e - finally: - if not success: - if cache_path.exists(): - if cache_path.is_file(): - os.remove(cache_path) - else: - shutil.rmtree(cache_path) - if delete_temp_dir: - shutil.rmtree(delete_temp_dir) - os.close(fd) - os.remove(temp_filename) - - return get_filepath(cache_path) + raise HTTPError(f"Fail to download from {url}.") def unzip_file(file: Path, to: Path): diff --git a/fastNLP/io/loader/__init__.py b/fastNLP/io/loader/__init__.py new file mode 100644 index 00000000..8e436532 --- /dev/null +++ b/fastNLP/io/loader/__init__.py @@ -0,0 +1,30 @@ + +""" +Loader用于读取数据,并将内容读取到 :class:`~fastNLP.DataSet` 或者 :class:`~fastNLP.io.DataBundle`中。所有的Loader都支持以下的 + 三个方法: __init__(),_load(), loads(). 其中__init__()用于申明读取参数,以及说明该Loader支持的数据格式,读取后Dataset中field + ; _load(path)方法传入一个文件路径读取单个文件,并返回DataSet; load(paths)用于读取文件夹下的文件,并返回DataBundle, load()方法 + 支持以下三种类型的参数 + + Example:: + (0) 如果传入None,将尝试自动下载数据集并缓存。但不是所有的数据都可以直接下载。 + (1) 如果传入的是一个文件path,则返回的DataBundle包含一个名为train的DataSet可以通过data_bundle.datasets['train']获取 + (2) 传入的是一个文件夹目录,将读取的是这个文件夹下文件名中包含'train', 'test', 'dev'的文件,其它文件会被忽略。 + 假设某个目录下的文件为 + -train.txt + -dev.txt + -test.txt + -other.txt + Loader().load('/path/to/dir')读取,返回的data_bundle中可以用data_bundle.datasets['train'], data_bundle.datasets['dev'], + data_bundle.datasets['test']获取对应的DataSet,其中other.txt的内容会被忽略。 + 假设某个目录下的文件为 + -train.txt + -dev.txt + Loader().load('/path/to/dir')读取,返回的data_bundle中可以用data_bundle.datasets['train'], data_bundle.datasets['dev']获取 + 对应的DataSet。 + (3) 传入一个dict,key为dataset的名称,value是该dataset的文件路径。 + paths = {'train':'/path/to/train', 'dev': '/path/to/dev', 'test':'/path/to/test'} + Loader().load(paths) # 返回的data_bundle可以通过以下的方式获取相应的DataSet, data_bundle.datasets['train'], data_bundle.datasets['dev'], + data_bundle.datasets['test'] + +""" + diff --git a/fastNLP/io/loader/classification.py b/fastNLP/io/loader/classification.py new file mode 100644 index 00000000..dd85b4fe --- /dev/null +++ b/fastNLP/io/loader/classification.py @@ -0,0 +1,369 @@ +from ...core.dataset import DataSet +from ...core.instance import Instance +from .loader import Loader +import warnings +import os +import random +import shutil +import numpy as np + +class YelpLoader(Loader): + """ + 别名::class:`fastNLP.io.YelpLoader` :class:`fastNLP.io.loader.YelpLoader` + + 原始数据中内容应该为, 每一行为一个sample,第一个逗号之前为target,第一个逗号之后为文本内容。 + + Example:: + "1","I got 'new' tires from the..." + "1","Don't waste your time..." + + 读取YelpFull, YelpPolarity的数据。可以通过xxx下载并预处理数据。 + 读取的DataSet将具备以下的数据结构 + + .. 
csv-table:: + :header: "raw_words", "target" + + "I got 'new' tires from them and... ", "1" + "Don't waste your time. We had two...", "1" + "...", "..." + + """ + + def __init__(self): + super(YelpLoader, self).__init__() + + def _load(self, path: str=None): + ds = DataSet() + with open(path, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + sep_index = line.index(',') + target = line[:sep_index] + raw_words = line[sep_index + 1:] + if target.startswith("\""): + target = target[1:] + if target.endswith("\""): + target = target[:-1] + if raw_words.endswith("\""): + raw_words = raw_words[:-1] + if raw_words.startswith('"'): + raw_words = raw_words[1:] + raw_words = raw_words.replace('""', '"') # 替换双引号 + if raw_words: + ds.append(Instance(raw_words=raw_words, target=target)) + return ds + + +class YelpFullLoader(YelpLoader): + def download(self, dev_ratio: float = 0.1, seed: int = 0): + """ + 自动下载数据集,如果你使用了这个数据集,请引用以下的文章 + + Xiang Zhang, Junbo Zhao, Yann LeCun. Character-level Convolutional Networks for Text Classification. Advances + in Neural Information Processing Systems 28 (NIPS 2015) + + 根据dev_ratio的值随机将train中的数据取出一部分作为dev数据。下载完成后在output_dir中有train.csv, test.csv, + dev.csv三个文件。 + + :param float dev_ratio: 如果路径中没有dev集,从train划分多少作为dev的数据. 如果为0,则不划分dev。 + :param int seed: 划分dev时的随机数种子 + :return: str, 数据集的目录地址 + """ + + dataset_name = 'yelp-review-full' + data_dir = self._get_dataset_path(dataset_name=dataset_name) + if os.path.exists(os.path.join(data_dir, 'dev.csv')): # 存在dev的话,check是否需要重新下载 + re_download = True + if dev_ratio>0: + dev_line_count = 0 + tr_line_count = 0 + with open(os.path.join(data_dir, 'train.csv'), 'r', encoding='utf-8') as f1, \ + open(os.path.join(data_dir, 'dev.csv'), 'r', encoding='utf-8') as f2: + for line in f1: + tr_line_count += 1 + for line in f2: + dev_line_count += 1 + if not np.isclose(dev_line_count, dev_ratio*(tr_line_count + dev_line_count), rtol=0.005): + re_download = True + else: + re_download = False + if re_download: + shutil.rmtree(data_dir) + data_dir = self._get_dataset_path(dataset_name=dataset_name) + + if not os.path.exists(os.path.join(data_dir, 'dev.csv')): + if dev_ratio > 0: + assert 0 < dev_ratio < 1, "dev_ratio should be in range (0,1)." + random.seed(int(seed)) + try: + with open(os.path.join(data_dir, 'train.csv'), 'r', encoding='utf-8') as f, \ + open(os.path.join(data_dir, 'middle_file.csv'), 'w', encoding='utf-8') as f1, \ + open(os.path.join(data_dir, 'dev.csv'), 'w', encoding='utf-8') as f2: + for line in f: + if random.random() < dev_ratio: + f2.write(line) + else: + f1.write(line) + os.remove(os.path.join(data_dir, 'train.csv')) + os.renames(os.path.join(data_dir, 'middle_file.csv'), os.path.join(data_dir, 'train.csv')) + finally: + if os.path.exists(os.path.join(data_dir, 'middle_file.csv')): + os.remove(os.path.join(data_dir, 'middle_file.csv')) + + return data_dir + + +class YelpPolarityLoader(YelpLoader): + def download(self, dev_ratio: float = 0.1, seed: int = 0): + """ + 自动下载数据集,如果你使用了这个数据集,请引用以下的文章 + + Xiang Zhang, Junbo Zhao, Yann LeCun. Character-level Convolutional Networks for Text Classification. Advances + in Neural Information Processing Systems 28 (NIPS 2015) + + 根据dev_ratio的值随机将train中的数据取出一部分作为dev数据。下载完成后从train中切分0.1作为dev + + :param float dev_ratio: 如果路径中不存在dev.csv, 从train划分多少作为dev的数据. 
如果为0,则不划分dev + :param int seed: 划分dev时的随机数种子 + :return: str, 数据集的目录地址 + """ + dataset_name = 'yelp-review-polarity' + data_dir = self._get_dataset_path(dataset_name=dataset_name) + if os.path.exists(os.path.join(data_dir, 'dev.csv')): # 存在dev的话,check是否符合比例要求 + re_download = True + if dev_ratio>0: + dev_line_count = 0 + tr_line_count = 0 + with open(os.path.join(data_dir, 'train.csv'), 'r', encoding='utf-8') as f1, \ + open(os.path.join(data_dir, 'dev.csv'), 'r', encoding='utf-8') as f2: + for line in f1: + tr_line_count += 1 + for line in f2: + dev_line_count += 1 + if not np.isclose(dev_line_count, dev_ratio*(tr_line_count + dev_line_count), rtol=0.005): + re_download = True + else: + re_download = False + if re_download: + shutil.rmtree(data_dir) + data_dir = self._get_dataset_path(dataset_name=dataset_name) + + if not os.path.exists(os.path.join(data_dir, 'dev.csv')): + if dev_ratio > 0: + assert 0 < dev_ratio < 1, "dev_ratio should be in range (0,1)." + random.seed(int(seed)) + try: + with open(os.path.join(data_dir, 'train.csv'), 'r', encoding='utf-8') as f, \ + open(os.path.join(data_dir, 'middle_file.csv'), 'w', encoding='utf-8') as f1, \ + open(os.path.join(data_dir, 'dev.csv'), 'w', encoding='utf-8') as f2: + for line in f: + if random.random() < dev_ratio: + f2.write(line) + else: + f1.write(line) + os.remove(os.path.join(data_dir, 'train.csv')) + os.renames(os.path.join(data_dir, 'middle_file.csv'), os.path.join(data_dir, 'train.csv')) + finally: + if os.path.exists(os.path.join(data_dir, 'middle_file.csv')): + os.remove(os.path.join(data_dir, 'middle_file.csv')) + + return data_dir + + +class IMDBLoader(Loader): + """ + 别名::class:`fastNLP.io.IMDBLoader` :class:`fastNLP.io.loader.IMDBLoader` + + IMDBLoader读取后的数据将具有以下两列内容: raw_words: str, 需要分类的文本; target: str, 文本的标签 + DataSet具备以下的结构: + + .. csv-table:: + :header: "raw_words", "target" + + "Bromwell High is a cartoon ... ", "pos" + "Story of a man who has ...", "neg" + "...", "..." + + """ + + def __init__(self): + super(IMDBLoader, self).__init__() + + def _load(self, path: str): + dataset = DataSet() + with open(path, 'r', encoding="utf-8") as f: + for line in f: + line = line.strip() + if not line: + continue + parts = line.split('\t') + target = parts[0] + words = parts[1] + if words: + dataset.append(Instance(raw_words=words, target=target)) + + if len(dataset) == 0: + raise RuntimeError(f"{path} has no valid data.") + + return dataset + + def download(self, dev_ratio: float = 0.1, seed: int = 0): + """ + 自动下载数据集,如果你使用了这个数据集,请引用以下的文章 + + http://www.aclweb.org/anthology/P11-1015 + + 根据dev_ratio的值随机将train中的数据取出一部分作为dev数据。下载完成后从train中切分0.1作为dev + + :param float dev_ratio: 如果路径中没有dev.txt。从train划分多少作为dev的数据. 
如果为0,则不划分dev + :param int seed: 划分dev时的随机数种子 + :return: str, 数据集的目录地址 + """ + dataset_name = 'aclImdb' + data_dir = self._get_dataset_path(dataset_name=dataset_name) + if os.path.exists(os.path.join(data_dir, 'dev.txt')): # 存在dev的话,check是否符合比例要求 + re_download = True + if dev_ratio>0: + dev_line_count = 0 + tr_line_count = 0 + with open(os.path.join(data_dir, 'train.txt'), 'r', encoding='utf-8') as f1, \ + open(os.path.join(data_dir, 'dev.txt'), 'r', encoding='utf-8') as f2: + for line in f1: + tr_line_count += 1 + for line in f2: + dev_line_count += 1 + if not np.isclose(dev_line_count, dev_ratio*(tr_line_count + dev_line_count), rtol=0.005): + re_download = True + else: + re_download = False + if re_download: + shutil.rmtree(data_dir) + data_dir = self._get_dataset_path(dataset_name=dataset_name) + + if not os.path.exists(os.path.join(data_dir, 'dev.csv')): + if dev_ratio > 0: + assert 0 < dev_ratio < 1, "dev_ratio should be in range (0,1)." + random.seed(int(seed)) + try: + with open(os.path.join(data_dir, 'train.txt'), 'r', encoding='utf-8') as f, \ + open(os.path.join(data_dir, 'middle_file.txt'), 'w', encoding='utf-8') as f1, \ + open(os.path.join(data_dir, 'dev.txt'), 'w', encoding='utf-8') as f2: + for line in f: + if random.random() < dev_ratio: + f2.write(line) + else: + f1.write(line) + os.remove(os.path.join(data_dir, 'train.txt')) + os.renames(os.path.join(data_dir, 'middle_file.txt'), os.path.join(data_dir, 'train.txt')) + finally: + if os.path.exists(os.path.join(data_dir, 'middle_file.txt')): + os.remove(os.path.join(data_dir, 'middle_file.txt')) + + return data_dir + + +class SSTLoader(Loader): + """ + 别名::class:`fastNLP.io.SSTLoader` :class:`fastNLP.io.loader.SSTLoader` + + 读取之后的DataSet具有以下的结构 + + .. csv-table:: 下面是使用SSTLoader读取的DataSet所具备的field + :header: "raw_words" + + "(3 (2 It) (4 (4 (2 's) (4 (3 (2 a)..." + "(4 (4 (2 Offers) (3 (3 (2 that) (3 (3 rare)..." + "..." + + raw_words列是str。 + + """ + + def __init__(self): + super().__init__() + + def _load(self, path: str): + """ + 从path读取SST文件 + + :param str path: 文件路径 + :return: DataSet + """ + ds = DataSet() + with open(path, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + if line: + ds.append(Instance(raw_words=line)) + return ds + + def download(self): + """ + 自动下载数据集,如果你使用了这个数据集,请引用以下的文章 + + https://nlp.stanford.edu/~socherr/EMNLP2013_RNTN.pdf + + :return: str, 数据集的目录地址 + """ + output_dir = self._get_dataset_path(dataset_name='sst') + return output_dir + + +class SST2Loader(Loader): + """ + 数据SST2的Loader + 读取之后DataSet将如下所示 + + .. csv-table:: 下面是使用SSTLoader读取的DataSet所具备的field + :header: "raw_words", "target" + + "it 's a charming and often affecting...", "1" + "unflinchingly bleak and...", "0" + "..." 
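A usage sketch; calling load() with no argument relies on the automatic download below and assumes the cached directory contains the usual train/dev/test tsv files::

    from fastNLP.io.loader.classification import SST2Loader

    data_bundle = SST2Loader().load()    # downloads and caches 'sst-2' on first use
    print(data_bundle)                   # lists the DataSets that were found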
+ + test的DataSet没有target列。 + """ + + def __init__(self): + super().__init__() + + def _load(self, path: str): + """ + 从path读取SST2文件 + + :param str path: 数据路径 + :return: DataSet + """ + ds = DataSet() + + with open(path, 'r', encoding='utf-8') as f: + f.readline() # 跳过header + if 'test' in os.path.split(path)[1]: + warnings.warn("SST2's test file has no target.") + for line in f: + line = line.strip() + if line: + sep_index = line.index('\t') + raw_words = line[sep_index + 1:] + if raw_words: + ds.append(Instance(raw_words=raw_words)) + else: + for line in f: + line = line.strip() + if line: + raw_words = line[:-2] + target = line[-1] + if raw_words: + ds.append(Instance(raw_words=raw_words, target=target)) + return ds + + def download(self): + """ + 自动下载数据集,如果你使用了该数据集,请引用以下的文章 + + https://nlp.stanford.edu/pubs/SocherBauerManningNg_ACL2013.pdf + + :return: + """ + output_dir = self._get_dataset_path(dataset_name='sst-2') + return output_dir diff --git a/fastNLP/io/loader/conll.py b/fastNLP/io/loader/conll.py new file mode 100644 index 00000000..43790c15 --- /dev/null +++ b/fastNLP/io/loader/conll.py @@ -0,0 +1,264 @@ +from typing import Dict, Union + +from .loader import Loader +from ... import DataSet +from ..file_reader import _read_conll +from ... import Instance +from .. import DataBundle +from ..utils import check_loader_paths +from ... import Const + + +class ConllLoader(Loader): + """ + 别名::class:`fastNLP.io.ConllLoader` :class:`fastNLP.io.data_loader.ConllLoader` + + ConllLoader支持读取的数据格式: 以空行隔开两个sample,除了分割行,每一行用空格或者制表符隔开不同的元素。如下例所示: + + Example:: + + # 文件中的内容 + Nadim NNP B-NP B-PER + Ladki NNP I-NP I-PER + + AL-AIN NNP B-NP B-LOC + United NNP B-NP B-LOC + Arab NNP I-NP I-LOC + Emirates NNPS I-NP I-LOC + 1996-12-06 CD I-NP O + ... + + # 如果用以下的参数读取,返回的DataSet将包含raw_words和pos两个field, 这两个field的值分别取自于第0列与第1列 + dataset = ConllLoader(headers=['raw_words', 'pos'], indexes=[0, 1])._load('/path/to/train.conll') + # 如果用以下的参数读取,返回的DataSet将包含raw_words和ner两个field, 这两个field的值分别取自于第0列与第2列 + dataset = ConllLoader(headers=['raw_words', 'ner'], indexes=[0, 3])._load('/path/to/train.conll') + # 如果用以下的参数读取,返回的DataSet将包含raw_words, pos和ner三个field + dataset = ConllLoader(headers=['raw_words', 'pos', 'ner'], indexes=[0, 1, 3])._load('/path/to/train.conll') + + ConllLoader返回的DataSet的field由传入的headers确定。 + + 数据中以"-DOCSTART-"开头的行将被忽略,因为该符号在conll 2003中被用为文档分割符。 + + :param list headers: 每一列数据的名称,需为List or Tuple of str。``header`` 与 ``indexes`` 一一对应 + :param list indexes: 需要保留的数据列下标,从0开始。若为 ``None`` ,则所有列都保留。Default: ``None`` + :param bool dropna: 是否忽略非法数据,若 ``False`` ,遇到非法数据时抛出 ``ValueError`` 。Default: ``True`` + + """ + def __init__(self, headers, indexes=None, dropna=True): + super(ConllLoader, self).__init__() + if not isinstance(headers, (list, tuple)): + raise TypeError( + 'invalid headers: {}, should be list of strings'.format(headers)) + self.headers = headers + self.dropna = dropna + if indexes is None: + self.indexes = list(range(len(self.headers))) + else: + if len(indexes) != len(headers): + raise ValueError + self.indexes = indexes + + def _load(self, path): + """ + 传入的一个文件路径,将该文件读入DataSet中,field由ConllLoader初始化时指定的headers决定。 + + :param str path: 文件的路径 + :return: DataSet + """ + ds = DataSet() + for idx, data in _read_conll(path, indexes=self.indexes, dropna=self.dropna): + ins = {h: data[i] for i, h in enumerate(self.headers)} + ds.append(Instance(**ins)) + return ds + + +class Conll2003Loader(ConllLoader): + """ + 用于读取conll2003任务的数据。数据的内容应该类似与以下的内容, 第一列为raw_words, 第二列为pos, 第三列为chunking,第四列为ner。 + + 
Example:: + + Nadim NNP B-NP B-PER + Ladki NNP I-NP I-PER + + AL-AIN NNP B-NP B-LOC + United NNP B-NP B-LOC + Arab NNP I-NP I-LOC + Emirates NNPS I-NP I-LOC + 1996-12-06 CD I-NP O + ... + + 返回的DataSet的内容为 + + .. csv-table:: 下面是Conll2003Loader加载后数据具备的结构。 + :header: "raw_words", "pos", "chunk", "ner" + + "[Nadim, Ladki]", "[NNP, NNP]", "[B-NP, I-NP]", "[B-PER, I-PER]" + "[AL-AIN, United, Arab, ...]", "[NNP, NNP, NNP, ...]", "[B-NP, B-NP, I-NP, ...]", "[B-LOC, B-LOC, I-LOC, ...]" + "[...]", "[...]", "[...]", "[...]" + + """ + def __init__(self): + headers = [ + 'raw_words', 'pos', 'chunk', 'ner', + ] + super(Conll2003Loader, self).__init__(headers=headers) + + def _load(self, path): + """ + 传入的一个文件路径,将该文件读入DataSet中,field由ConllLoader初始化时指定的headers决定。 + + :param str path: 文件的路径 + :return: DataSet + """ + ds = DataSet() + for idx, data in _read_conll(path, indexes=self.indexes, dropna=self.dropna): + doc_start = False + for i, h in enumerate(self.headers): + field = data[i] + if str(field[0]).startswith('-DOCSTART-'): + doc_start = True + break + if doc_start: + continue + ins = {h: data[i] for i, h in enumerate(self.headers)} + ds.append(Instance(**ins)) + return ds + + def download(self, output_dir=None): + raise RuntimeError("conll2003 cannot be downloaded automatically.") + + +class Conll2003NERLoader(ConllLoader): + """ + 用于读取conll2003任务的NER数据。 + + Example:: + + Nadim NNP B-NP B-PER + Ladki NNP I-NP I-PER + + AL-AIN NNP B-NP B-LOC + United NNP B-NP B-LOC + Arab NNP I-NP I-LOC + Emirates NNPS I-NP I-LOC + 1996-12-06 CD I-NP O + ... + + 返回的DataSet的内容为 + + .. csv-table:: 下面是Conll2003Loader加载后数据具备的结构, target是BIO2编码 + :header: "raw_words", "target" + + "[Nadim, Ladki]", "[B-PER, I-PER]" + "[AL-AIN, United, Arab, ...]", "[B-LOC, B-LOC, I-LOC, ...]" + "[...]", "[...]" + + """ + def __init__(self): + headers = [ + 'raw_words', 'target', + ] + super().__init__(headers=headers, indexes=[0, 3]) + + def _load(self, path): + """ + 传入的一个文件路径,将该文件读入DataSet中,field由ConllLoader初始化时指定的headers决定。 + + :param str path: 文件的路径 + :return: DataSet + """ + ds = DataSet() + for idx, data in _read_conll(path, indexes=self.indexes, dropna=self.dropna): + doc_start = False + for i, h in enumerate(self.headers): + field = data[i] + if str(field[0]).startswith('-DOCSTART-'): + doc_start = True + break + if doc_start: + continue + ins = {h: data[i] for i, h in enumerate(self.headers)} + ds.append(Instance(**ins)) + return ds + + def download(self): + raise RuntimeError("conll2003 cannot be downloaded automatically.") + + +class OntoNotesNERLoader(ConllLoader): + """ + 用以读取OntoNotes的NER数据,同时也是Conll2012的NER任务数据。将OntoNote数据处理为conll格式的过程可以参考 + https://github.com/yhcc/OntoNotes-5.0-NER。OntoNoteNERLoader将取第4列和第11列的内容。 + + 返回的DataSet的内容为 + + .. 
csv-table:: 下面是使用OntoNoteNERLoader读取的DataSet所具备的结构, target列是BIO编码 + :header: "raw_words", "target" + + "[Nadim, Ladki]", "[B-PER, I-PER]" + "[AL-AIN, United, Arab, ...]", "[B-LOC, B-LOC, I-LOC, ...]" + "[...]", "[...]" + + """ + + def __init__(self): + super().__init__(headers=[Const.RAW_WORD, Const.TARGET], indexes=[3, 10]) + + def _load(self, path:str): + dataset = super()._load(path) + + def convert_to_bio(tags): + bio_tags = [] + flag = None + for tag in tags: + label = tag.strip("()*") + if '(' in tag: + bio_label = 'B-' + label + flag = label + elif flag: + bio_label = 'I-' + flag + else: + bio_label = 'O' + if ')' in tag: + flag = None + bio_tags.append(bio_label) + return bio_tags + + def convert_word(words): + converted_words = [] + for word in words: + word = word.replace('/.', '.') # 有些结尾的.是/.形式的 + if not word.startswith('-'): + converted_words.append(word) + continue + # 以下是由于这些符号被转义了,再转回来 + tfrs = {'-LRB-':'(', + '-RRB-': ')', + '-LSB-': '[', + '-RSB-': ']', + '-LCB-': '{', + '-RCB-': '}' + } + if word in tfrs: + converted_words.append(tfrs[word]) + else: + converted_words.append(word) + return converted_words + + dataset.apply_field(convert_word, field_name=Const.RAW_WORD, new_field_name=Const.RAW_WORD) + dataset.apply_field(convert_to_bio, field_name=Const.TARGET, new_field_name=Const.TARGET) + + return dataset + + def download(self): + raise RuntimeError("Ontonotes cannot be downloaded automatically, you can refer " + "https://github.com/yhcc/OntoNotes-5.0-NER to download and preprocess.") + + +class CTBLoader(Loader): + def __init__(self): + super().__init__() + + def _load(self, path:str): + pass diff --git a/fastNLP/io/loader/csv.py b/fastNLP/io/loader/csv.py new file mode 100644 index 00000000..166f912b --- /dev/null +++ b/fastNLP/io/loader/csv.py @@ -0,0 +1,32 @@ +from ...core.dataset import DataSet +from ...core.instance import Instance +from ..file_reader import _read_csv +from .loader import Loader + + +class CSVLoader(Loader): + """ + 别名::class:`fastNLP.io.CSVLoader` :class:`fastNLP.io.dataset_loader.CSVLoader` + + 读取CSV格式的数据集, 返回 ``DataSet`` 。 + + :param List[str] headers: CSV文件的文件头.定义每一列的属性名称,即返回的DataSet中`field`的名称 + 若为 ``None`` ,则将读入文件的第一行视作 ``headers`` . Default: ``None`` + :param str sep: CSV文件中列与列之间的分隔符. Default: "," + :param bool dropna: 是否忽略非法数据,若 ``True`` 则忽略,若 ``False`` ,在遇到非法数据时,抛出 ``ValueError`` . + Default: ``False`` + """ + + def __init__(self, headers=None, sep=",", dropna=False): + super().__init__() + self.headers = headers + self.sep = sep + self.dropna = dropna + + def _load(self, path): + ds = DataSet() + for idx, data in _read_csv(path, headers=self.headers, + sep=self.sep, dropna=self.dropna): + ds.append(Instance(**data)) + return ds + diff --git a/fastNLP/io/loader/cws.py b/fastNLP/io/loader/cws.py new file mode 100644 index 00000000..46c07f28 --- /dev/null +++ b/fastNLP/io/loader/cws.py @@ -0,0 +1,41 @@ + +from .loader import Loader +from ...core import DataSet, Instance + + +class CWSLoader(Loader): + """ + 分词任务数据加载器, + SigHan2005的数据可以用xxx下载并预处理 + + CWSLoader支持的数据格式为,一行一句话,不同词之间用空格隔开, 例如: + + Example:: + + 上海 浦东 开发 与 法制 建设 同步 + 新华社 上海 二月 十日 电 ( 记者 谢金虎 、 张持坚 ) + ... + + 该Loader读取后的DataSet具有如下的结构 + + .. csv-table:: + :header: "raw_words" + + "上海 浦东 开发 与 法制 建设 同步" + "新华社 上海 二月 十日 电 ( 记者 谢金虎 、 张持坚 )" + "..." 
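A usage sketch with a hypothetical path to a whitespace-segmented training file::

    from fastNLP.io.loader.cws import CWSLoader

    data_bundle = CWSLoader().load({'train': '/path/to/train.txt'})   # hypothetical path
    print(data_bundle.get_dataset('train'))    # one 'raw_words' string per non-empty line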
+ """ + def __init__(self): + super().__init__() + + def _load(self, path:str): + ds = DataSet() + with open(path, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + if line: + ds.append(Instance(raw_words=line)) + return ds + + def download(self, output_dir=None): + raise RuntimeError("You can refer {} for sighan2005's data downloading.") diff --git a/fastNLP/io/loader/json.py b/fastNLP/io/loader/json.py new file mode 100644 index 00000000..8856b73a --- /dev/null +++ b/fastNLP/io/loader/json.py @@ -0,0 +1,40 @@ +from ...core.dataset import DataSet +from ...core.instance import Instance +from ..file_reader import _read_json +from .loader import Loader + + +class JsonLoader(Loader): + """ + 别名::class:`fastNLP.io.JsonLoader` :class:`fastNLP.io.loader.JsonLoader` + + 读取json格式数据.数据必须按行存储,每行是一个包含各类属性的json对象 + + :param dict fields: 需要读入的json属性名称, 和读入后在DataSet中存储的field_name + ``fields`` 的 `key` 必须是json对象的属性名. ``fields`` 的 `value` 为读入后在DataSet存储的 `field_name` , + `value` 也可为 ``None`` , 这时读入后的 `field_name` 与json对象对应属性同名 + ``fields`` 可为 ``None`` , 这时,json对象所有属性都保存在DataSet中. Default: ``None`` + :param bool dropna: 是否忽略非法数据,若 ``True`` 则忽略,若 ``False`` ,在遇到非法数据时,抛出 ``ValueError`` . + Default: ``False`` + """ + + def __init__(self, fields=None, dropna=False): + super(JsonLoader, self).__init__() + self.dropna = dropna + self.fields = None + self.fields_list = None + if fields: + self.fields = {} + for k, v in fields.items(): + self.fields[k] = k if v is None else v + self.fields_list = list(self.fields.keys()) + + def _load(self, path): + ds = DataSet() + for idx, d in _read_json(path, fields=self.fields_list, dropna=self.dropna): + if self.fields: + ins = {self.fields[k]: v for k, v in d.items()} + else: + ins = d + ds.append(Instance(**ins)) + return ds diff --git a/fastNLP/io/loader/loader.py b/fastNLP/io/loader/loader.py new file mode 100644 index 00000000..4cf5bcf3 --- /dev/null +++ b/fastNLP/io/loader/loader.py @@ -0,0 +1,75 @@ +from ... import DataSet +from .. 
import DataBundle +from ..utils import check_loader_paths +from typing import Union, Dict +import os +from ..file_utils import _get_dataset_url, get_default_cache_path, cached_path + +class Loader: + def __init__(self): + pass + + def _load(self, path:str) -> DataSet: + raise NotImplementedError + + def load(self, paths: Union[str, Dict[str, str]]=None) -> DataBundle: + """ + 从指定一个或多个路径中的文件中读取数据,返回:class:`~fastNLP.io.DataBundle` 。 + + 读取的field根据ConllLoader初始化时传入的headers决定。 + + :param Union[str, Dict[str, str]] paths: 支持以下的几种输入方式 + (0) 如果为None,则先查看本地是否有缓存,如果没有则自动下载并缓存。 + + (1) 传入一个目录, 该目录下名称包含train的被认为是train,包含test的被认为是test,包含dev的被认为是dev,如果检测到多个文件 + 名包含'train'、 'dev'、 'test'则会报错 + + Example:: + + data_bundle = ConllLoader().load('/path/to/dir') # 返回的DataBundle中datasets根据目录下是否检测到train、 + # dev、 test等有所变化,可以通过以下的方式取出DataSet + tr_data = data_bundle.datasets['train'] + te_data = data_bundle.datasets['test'] # 如果目录下有文件包含test这个字段 + + (2) 传入文件路径 + + Example:: + + data_bundle = ConllLoader().load("/path/to/a/train.conll") # 返回DataBundle对象, datasets中仅包含'train' + tr_data = data_bundle.datasets['train'] # 可以通过以下的方式取出DataSet + + (3) 传入一个dict,比如train,dev,test不在同一个目录下,或者名称中不包含train, dev, test + + Example:: + + paths = {'train':"/path/to/tr.conll", 'dev':"/to/validate.conll", "test":"/to/te.conll"} + data_bundle = ConllLoader().load(paths) # 返回的DataBundle中的dataset中包含"train", "dev", "test" + dev_data = data_bundle.datasets['dev'] + + :return: 返回的:class:`~fastNLP.io.DataBundle` + """ + if paths is None: + paths = self.download() + paths = check_loader_paths(paths) + datasets = {name: self._load(path) for name, path in paths.items()} + data_bundle = DataBundle(datasets=datasets) + return data_bundle + + def download(self): + raise NotImplementedError(f"{self.__class__} cannot download data automatically.") + + def _get_dataset_path(self, dataset_name): + """ + 传入dataset的名称,获取读取数据的目录。如果数据不存在,会尝试自动下载并缓存 + + :param str dataset_name: 数据集的名称 + :return: str, 数据集的目录地址。直接到该目录下读取相应的数据即可。 + """ + + default_cache_path = get_default_cache_path() + url = _get_dataset_url(dataset_name) + output_dir = cached_path(url_or_filename=url, cache_dir=default_cache_path, name='dataset') + + return output_dir + + diff --git a/fastNLP/io/loader/matching.py b/fastNLP/io/loader/matching.py new file mode 100644 index 00000000..eff98ba3 --- /dev/null +++ b/fastNLP/io/loader/matching.py @@ -0,0 +1,309 @@ + +import warnings +from .loader import Loader +from .json import JsonLoader +from ...core import Const +from .. import DataBundle +import os +from typing import Union, Dict +from ...core import DataSet +from ...core import Instance + +__all__ = ['MNLILoader', + "QuoraLoader", + "SNLILoader", + "QNLILoader", + "RTELoader"] + + +class MNLILoader(Loader): + """ + 读取MNLI任务的数据,读取之后的DataSet中包含以下的内容,words0是sentence1, words1是sentence2, target是gold_label, 测试集中没 + 有target列。 + + .. csv-table:: + :header: "raw_words1", "raw_words2", "target" + + "The new rights are...", "Everyone really likes..", "neutral" + "This site includes a...", "The Government Executive...", "contradiction" + "...", "...","." 
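A usage sketch; with no argument, load() falls back to the automatic download below, assuming the downloaded archive provides the train/dev/test tsv files this loader expects::

    from fastNLP.io.loader.matching import MNLILoader

    data_bundle = MNLILoader().load()    # downloads MNLI on first use
    dev_matched = data_bundle.get_dataset('dev_matched')
    print(data_bundle)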
+ + """ + def __init__(self): + super().__init__() + + def _load(self, path:str): + ds = DataSet() + with open(path, 'r', encoding='utf-8') as f: + f.readline() # 跳过header + if path.endswith("test.tsv"): + warnings.warn("RTE's test file has no target.") + for line in f: + line = line.strip() + if line: + parts = line.split('\t') + raw_words1 = parts[8] + raw_words2 = parts[9] + if raw_words1 and raw_words2: + ds.append(Instance(raw_words1=raw_words1, raw_words2=raw_words2)) + else: + for line in f: + line = line.strip() + if line: + parts = line.split('\t') + raw_words1 = parts[8] + raw_words2 = parts[9] + target = parts[-1] + if raw_words1 and raw_words2 and target: + ds.append(Instance(raw_words1=raw_words1, raw_words2=raw_words2, target=target)) + return ds + + def load(self, paths:str=None): + """ + + :param str paths: 传入数据所在目录,会在该目录下寻找dev_matched.tsv, dev_mismatched.tsv, test_matched.tsv, + test_mismatched.tsv, train.tsv文件夹 + :return: DataBundle + """ + if paths: + paths = os.path.abspath(os.path.expanduser(paths)) + else: + paths = self.download() + if not os.path.isdir(paths): + raise NotADirectoryError(f"{paths} is not a valid directory.") + + files = {'dev_matched':"dev_matched.tsv", + "dev_mismatched":"dev_mismatched.tsv", + "test_matched":"test_matched.tsv", + "test_mismatched":"test_mismatched.tsv", + "train":'train.tsv'} + + datasets = {} + for name, filename in files.items(): + filepath = os.path.join(paths, filename) + if not os.path.isfile(filepath): + if 'test' not in name: + raise FileNotFoundError(f"{name} not found in directory {filepath}.") + datasets[name] = self._load(filepath) + + data_bundle = DataBundle(datasets=datasets) + + return data_bundle + + def download(self): + """ + 如果你使用了这个数据,请引用 + + https://www.nyu.edu/projects/bowman/multinli/paper.pdf + :return: + """ + output_dir = self._get_dataset_path('mnli') + return output_dir + + +class SNLILoader(JsonLoader): + """ + 读取之后的DataSet中的field情况为 + + .. csv-table:: 下面是使用SNLILoader加载的DataSet所具备的field + :header: "raw_words1", "raw_words2", "target" + + "The new rights are...", "Everyone really likes..", "neutral" + "This site includes a...", "The Government Executive...", "entailment" + "...", "...", "." 
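A usage sketch; load() without an argument downloads the corpus and expects the snli_1.0_train/dev/test.jsonl files::

    from fastNLP.io.loader.matching import SNLILoader

    data_bundle = SNLILoader().load()    # downloads SNLI on first use
    print(data_bundle.get_dataset('train'))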
+ + """ + def __init__(self): + super().__init__(fields={ + 'sentence1': Const.RAW_WORDS(0), + 'sentence2': Const.RAW_WORDS(1), + 'gold_label': Const.TARGET, + }) + + def load(self, paths: Union[str, Dict[str, str]]=None) -> DataBundle: + """ + 从指定一个或多个路径中的文件中读取数据,返回:class:`~fastNLP.io.DataBundle` 。 + + 读取的field根据ConllLoader初始化时传入的headers决定。 + + :param str paths: 传入一个目录, 将在该目录下寻找snli_1.0_train.jsonl, snli_1.0_dev.jsonl + 和snli_1.0_test.jsonl三个文件。 + + :return: 返回的:class:`~fastNLP.io.DataBundle` + """ + _paths = {} + if paths is None: + paths = self.download() + if paths: + if os.path.isdir(paths): + if not os.path.isfile(os.path.join(paths, 'snli_1.0_train.jsonl')): + raise FileNotFoundError(f"snli_1.0_train.jsonl is not found in {paths}") + _paths['train'] = os.path.join(paths, 'snli_1.0_train.jsonl') + for filename in ['snli_1.0_dev.jsonl', 'snli_1.0_test.jsonl']: + filepath = os.path.join(paths, filename) + _paths[filename.split('_')[-1].split('.')[0]] = filepath + paths = _paths + else: + raise NotADirectoryError(f"{paths} is not a valid directory.") + + datasets = {name: self._load(path) for name, path in paths.items()} + data_bundle = DataBundle(datasets=datasets) + return data_bundle + + def download(self): + """ + 如果您的文章使用了这份数据,请引用 + + http://nlp.stanford.edu/pubs/snli_paper.pdf + + :return: str + """ + return self._get_dataset_path('snli') + + +class QNLILoader(JsonLoader): + """ + QNLI数据集的Loader, + 加载的DataSet将具备以下的field, raw_words1是question, raw_words2是sentence, target是label + + .. csv-table:: + :header: "raw_words1", "raw_words2", "target" + + "What came into force after the new...", "As of that day...", "entailment" + "What is the first major...", "The most important tributaries", "not_entailment" + "...","." + + test数据集没有target列 + + """ + def __init__(self): + super().__init__() + + def _load(self, path): + ds = DataSet() + + with open(path, 'r', encoding='utf-8') as f: + f.readline() # 跳过header + if path.endswith("test.tsv"): + warnings.warn("QNLI's test file has no target.") + for line in f: + line = line.strip() + if line: + parts = line.split('\t') + raw_words1 = parts[1] + raw_words2 = parts[2] + if raw_words1 and raw_words2: + ds.append(Instance(raw_words1=raw_words1, raw_words2=raw_words2)) + else: + for line in f: + line = line.strip() + if line: + parts = line.split('\t') + raw_words1 = parts[1] + raw_words2 = parts[2] + target = parts[-1] + if raw_words1 and raw_words2 and target: + ds.append(Instance(raw_words1=raw_words1, raw_words2=raw_words2, target=target)) + return ds + + def download(self): + """ + 如果您的实验使用到了该数据,请引用 + + TODO 补充 + + :return: + """ + return self._get_dataset_path('qnli') + + +class RTELoader(Loader): + """ + RTE数据的loader + 加载的DataSet将具备以下的field, raw_words1是sentence0,raw_words2是sentence1, target是label + + .. csv-table:: + :header: "raw_words1", "raw_words2", "target" + + "Dana Reeve, the widow of the actor...", "Christopher Reeve had an...", "not_entailment" + "Yet, we now are discovering that...", "Bacteria is winning...", "entailment" + "...","." 
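A usage sketch (the automatic download is used when no path is given)::

    from fastNLP.io.loader.matching import RTELoader

    data_bundle = RTELoader().load()     # downloads RTE on first use
    print(data_bundle)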
+ + test数据集没有target列 + """ + def __init__(self): + super().__init__() + + def _load(self, path:str): + ds = DataSet() + + with open(path, 'r', encoding='utf-8') as f: + f.readline() # 跳过header + if path.endswith("test.tsv"): + warnings.warn("RTE's test file has no target.") + for line in f: + line = line.strip() + if line: + parts = line.split('\t') + raw_words1 = parts[1] + raw_words2 = parts[2] + if raw_words1 and raw_words2: + ds.append(Instance(raw_words1=raw_words1, raw_words2=raw_words2)) + else: + for line in f: + line = line.strip() + if line: + parts = line.split('\t') + raw_words1 = parts[1] + raw_words2 = parts[2] + target = parts[-1] + if raw_words1 and raw_words2 and target: + ds.append(Instance(raw_words1=raw_words1, raw_words2=raw_words2, target=target)) + return ds + + def download(self): + return self._get_dataset_path('rte') + + +class QuoraLoader(Loader): + """ + Quora matching任务的数据集Loader + + 支持读取的文件中的内容,应该有以下的形式, 以制表符分隔,且前三列的内容必须是:第一列是label,第二列和第三列是句子 + + Example:: + + 1 How do I get funding for my web based startup idea ? How do I get seed funding pre product ? 327970 + 1 How can I stop my depression ? What can I do to stop being depressed ? 339556 + ... + + 加载的DataSet将具备以下的field + + .. csv-table:: + :header: "raw_words1", "raw_words2", "target" + + "What should I do to avoid...", "1" + "How do I not sleep in a boring class...", "0" + "...","." + + """ + def __init__(self): + super().__init__() + + def _load(self, path:str): + ds = DataSet() + + with open(path, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + if line: + parts = line.split('\t') + raw_words1 = parts[1] + raw_words2 = parts[2] + target = parts[0] + if raw_words1 and raw_words2 and target: + ds.append(Instance(raw_words1=raw_words1, raw_words2=raw_words2, target=target)) + return ds + + def download(self): + raise RuntimeError("Quora cannot be downloaded automatically.") diff --git a/fastNLP/io/pipe/__init__.py b/fastNLP/io/pipe/__init__.py new file mode 100644 index 00000000..0cf8d949 --- /dev/null +++ b/fastNLP/io/pipe/__init__.py @@ -0,0 +1,8 @@ + + +""" +Pipe用于处理数据,所有的Pipe都包含一个process(DataBundle)方法,传入一个DataBundle对象, 在传入DataBundle上进行原位修改,并将其返回; + process_from_file(paths)传入的文件路径,返回一个DataBundle。process(DataBundle)或者process_from_file(paths)的返回DataBundle + 中的DataSet一般都包含原文与转换为index的输入,以及转换为index的target;除了DataSet之外,还会包含将field转为index时所建立的词表。 + +""" \ No newline at end of file diff --git a/fastNLP/io/pipe/classification.py b/fastNLP/io/pipe/classification.py new file mode 100644 index 00000000..a64e5328 --- /dev/null +++ b/fastNLP/io/pipe/classification.py @@ -0,0 +1,444 @@ + +from nltk import Tree + +from ..base_loader import DataBundle +from ...core.vocabulary import Vocabulary +from ...core.const import Const +from ..loader.classification import IMDBLoader, YelpFullLoader, SSTLoader, SST2Loader, YelpPolarityLoader +from ...core import DataSet, Instance + +from .utils import get_tokenizer, _indexize, _add_words_field, _drop_empty_instance +from .pipe import Pipe +import re +nonalpnum = re.compile('[^0-9a-zA-Z?!\']+') +from ...core import cache_results + +class _CLSPipe(Pipe): + """ + 分类问题的基类,负责对classification的数据进行tokenize操作。默认是对raw_words列操作,然后生成words列 + + """ + def __init__(self, tokenizer:str='spacy', lang='en'): + self.tokenizer = get_tokenizer(tokenizer, lang=lang) + + def _tokenize(self, data_bundle, field_name=Const.INPUT, new_field_name=None): + """ + 将DataBundle中的数据进行tokenize + + :param DataBundle data_bundle: + :param str field_name: + :param str new_field_name: + :return: 
传入的DataBundle对象 + """ + new_field_name = new_field_name or field_name + for name, dataset in data_bundle.datasets.items(): + dataset.apply_field(self.tokenizer, field_name=field_name, new_field_name=new_field_name) + + return data_bundle + + def _granularize(self, data_bundle, tag_map): + """ + 该函数对data_bundle中'target'列中的内容进行转换。 + + :param data_bundle: + :param dict tag_map: 将target列中的tag做以下的映射,比如{"0":0, "1":0, "3":1, "4":1}, 则会删除target为"2"的instance, + 且将"1"认为是第0类。 + :return: 传入的data_bundle + """ + for name in list(data_bundle.datasets.keys()): + dataset = data_bundle.get_dataset(name) + dataset.apply_field(lambda target:tag_map.get(target, -100), field_name=Const.TARGET, + new_field_name=Const.TARGET) + dataset.drop(lambda ins:ins[Const.TARGET] == -100) + data_bundle.set_dataset(dataset, name) + return data_bundle + + +def _clean_str(words): + """ + heavily borrowed from github + https://github.com/LukeZhuang/Hierarchical-Attention-Network/blob/master/yelp-preprocess.ipynb + :param sentence: is a str + :return: + """ + words_collection = [] + for word in words: + if word in ['-lrb-', '-rrb-', '', '-r', '-l', 'b-']: + continue + tt = nonalpnum.split(word) + t = ''.join(tt) + if t != '': + words_collection.append(t) + + return words_collection + + +class YelpFullPipe(_CLSPipe): + """ + 处理YelpFull的数据, 处理之后DataSet中的内容如下 + + .. csv-table:: 下面是使用YelpFullPipe处理后的DataSet所具备的field + :header: "raw_words", "words", "target", "seq_len" + + "It 's a ...", "[4, 2, 10, ...]", 0, 10 + "Offers that ...", "[20, 40, ...]", 1, 21 + "...", "[...]", ., . + + :param bool lower: 是否对输入进行小写化。 + :param int granularity: 支持2, 3, 5。若为2, 则认为是2分类问题,将1、2归为1类,4、5归为一类,丢掉2;若为3, 则有3分类问题,将 + 1、2归为1类,3归为1类,4、5归为1类;若为5, 则有5分类问题。 + :param str tokenizer: 使用哪种tokenize方式将数据切成单词。支持'spacy'和'raw'。raw使用空格作为切分。 + """ + def __init__(self, lower:bool=False, granularity=5, tokenizer:str='spacy'): + super().__init__(tokenizer=tokenizer, lang='en') + self.lower = lower + assert granularity in (2, 3, 5), "granularity can only be 2,3,5." + self.granularity = granularity + + if granularity==2: + self.tag_map = {"1": 0, "2": 0, "4": 1, "5": 1} + elif granularity==3: + self.tag_map = {"1": 0, "2": 0, "3":1, "4": 2, "5": 2} + else: + self.tag_map = {"1": 0, "2": 1, "3": 2, "4": 3, "5": 4} + + def _tokenize(self, data_bundle, field_name=Const.INPUT, new_field_name=None): + """ + 将DataBundle中的数据进行tokenize + + :param DataBundle data_bundle: + :param str field_name: + :param str new_field_name: + :return: 传入的DataBundle对象 + """ + new_field_name = new_field_name or field_name + for name, dataset in data_bundle.datasets.items(): + dataset.apply_field(self.tokenizer, field_name=field_name, new_field_name=new_field_name) + dataset.apply_field(_clean_str, field_name=field_name, new_field_name=new_field_name) + return data_bundle + + def process(self, data_bundle): + """ + 传入的DataSet应该具备如下的结构 + + .. csv-table:: + :header: "raw_words", "target" + + "I got 'new' tires from them and... ", "1" + "Don't waste your time. We had two...", "1" + "...", "..." 
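
The granularity handling above can be pictured with a small standalone sketch (plain Python that mirrors what _granularize does to the target column; it is not the library code itself)::

    tag_map = {"1": 0, "2": 0, "4": 1, "5": 1}          # YelpFullPipe(granularity=2)
    targets = ["1", "3", "5", "2"]
    mapped = [tag_map.get(t, -100) for t in targets]    # [0, -100, 1, 0]
    kept = [t for t in mapped if t != -100]             # reviews labelled "3" are dropped
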
+ + :param data_bundle: + :return: + """ + + # 复制一列words + data_bundle = _add_words_field(data_bundle, lower=self.lower) + + # 进行tokenize + data_bundle = self._tokenize(data_bundle=data_bundle, field_name=Const.INPUT) + + # 根据granularity设置tag + data_bundle = self._granularize(data_bundle, tag_map=self.tag_map) + + # 删除空行 + data_bundle = _drop_empty_instance(data_bundle, field_name=Const.INPUT) + + # index + data_bundle = _indexize(data_bundle=data_bundle) + + for name, dataset in data_bundle.datasets.items(): + dataset.add_seq_len(Const.INPUT) + + data_bundle.set_input(Const.INPUT, Const.INPUT_LEN) + data_bundle.set_target(Const.TARGET) + + return data_bundle + + def process_from_file(self, paths=None): + """ + + :param paths: + :return: DataBundle + """ + data_bundle = YelpFullLoader().load(paths) + return self.process(data_bundle=data_bundle) + + +class YelpPolarityPipe(_CLSPipe): + """ + 处理YelpPolarity的数据, 处理之后DataSet中的内容如下 + + .. csv-table:: 下面是使用YelpFullPipe处理后的DataSet所具备的field + :header: "raw_words", "words", "target", "seq_len" + + "It 's a ...", "[4, 2, 10, ...]", 0, 10 + "Offers that ...", "[20, 40, ...]", 1, 21 + "...", "[...]", ., . + + :param bool lower: 是否对输入进行小写化。 + :param str tokenizer: 使用哪种tokenize方式将数据切成单词。支持'spacy'和'raw'。raw使用空格作为切分。 + """ + def __init__(self, lower:bool=False, tokenizer:str='spacy'): + super().__init__(tokenizer=tokenizer, lang='en') + self.lower = lower + + def process(self, data_bundle): + # 复制一列words + data_bundle = _add_words_field(data_bundle, lower=self.lower) + + # 进行tokenize + data_bundle = self._tokenize(data_bundle=data_bundle, field_name=Const.INPUT) + # index + data_bundle = _indexize(data_bundle=data_bundle) + + for name, dataset in data_bundle.datasets.items(): + dataset.add_seq_len(Const.INPUT) + + data_bundle.set_input(Const.INPUT, Const.INPUT_LEN) + data_bundle.set_target(Const.TARGET) + + return data_bundle + + def process_from_file(self, paths=None): + """ + + :param str paths: + :return: DataBundle + """ + data_bundle = YelpPolarityLoader().load(paths) + return self.process(data_bundle=data_bundle) + + +class SSTPipe(_CLSPipe): + """ + 别名::class:`fastNLP.io.SSTPipe` :class:`fastNLP.io.pipe.SSTPipe` + + 经过该Pipe之后,DataSet中具备的field如下所示 + + .. csv-table:: 下面是使用SSTPipe处理后的DataSet所具备的field + :header: "raw_words", "words", "target", "seq_len" + + "It 's a ...", "[4, 2, 10, ...]", 0, 16 + "Offers that ...", "[20, 40, ...]", 1, 18 + "...", "[...]", ., . + + :param bool subtree: 是否将train, test, dev数据展开为子树,扩充数据量。 Default: ``False`` + :param bool train_subtree: 是否将train集通过子树扩展数据。 + :param bool lower: 是否对输入进行小写化。 + :param int granularity: 支持2, 3, 5。若为2, 则认为是2分类问题,将0、1归为1类,3、4归为一类,丢掉2;若为3, 则有3分类问题,将 + 0、1归为1类,2归为1类,3、4归为1类;若为5, 则有5分类问题。 + :param str tokenizer: 使用哪种tokenize方式将数据切成单词。支持'spacy'和'raw'。raw使用空格作为切分。 + """ + + def __init__(self, subtree=False, train_subtree=True, lower=False, granularity=5, tokenizer='spacy'): + super().__init__(tokenizer=tokenizer, lang='en') + self.subtree = subtree + self.train_tree = train_subtree + self.lower = lower + assert granularity in (2, 3, 5), "granularity can only be 2,3,5." + self.granularity = granularity + + if granularity==2: + self.tag_map = {"0": 0, "1": 0, "3": 1, "4": 1} + elif granularity==3: + self.tag_map = {"0": 0, "1": 0, "2":1, "3": 2, "4": 2} + else: + self.tag_map = {"0": 0, "1": 1, "2": 2, "3": 3, "4": 4} + + def process(self, data_bundle:DataBundle): + """ + 对DataBundle中的数据进行预处理。输入的DataSet应该至少拥有raw_words这一列,且内容类似与 + + .. 
csv-table:: + :header: "raw_words" + + "(3 (2 It) (4 (4 (2 's) (4 (3 (2 a)..." + "(4 (4 (2 Offers) (3 (3 (2 that) (3 (3 rare)..." + "..." + + :param DataBundle data_bundle: 需要处理的DataBundle对象 + :return: + """ + # 先取出subtree + for name in list(data_bundle.datasets.keys()): + dataset = data_bundle.get_dataset(name) + ds = DataSet() + use_subtree = self.subtree or (name == 'train' and self.train_tree) + for ins in dataset: + raw_words = ins['raw_words'] + tree = Tree.fromstring(raw_words) + if use_subtree: + for t in tree.subtrees(): + raw_words = " ".join(t.leaves()) + instance = Instance(raw_words=raw_words, target=t.label()) + ds.append(instance) + else: + instance = Instance(raw_words=' '.join(tree.leaves()), target=tree.label()) + ds.append(instance) + data_bundle.set_dataset(ds, name) + + _add_words_field(data_bundle, lower=self.lower) + + # 进行tokenize + data_bundle = self._tokenize(data_bundle=data_bundle, field_name=Const.INPUT) + + # 根据granularity设置tag + data_bundle = self._granularize(data_bundle, tag_map=self.tag_map) + + # index + data_bundle = _indexize(data_bundle=data_bundle) + + for name, dataset in data_bundle.datasets.items(): + dataset.add_seq_len(Const.INPUT) + + data_bundle.set_input(Const.INPUT, Const.INPUT_LEN) + data_bundle.set_target(Const.TARGET) + + return data_bundle + + def process_from_file(self, paths=None): + data_bundle = SSTLoader().load(paths) + return self.process(data_bundle=data_bundle) + + +class SST2Pipe(_CLSPipe): + """ + 加载SST2的数据, 处理完成之后DataSet将拥有以下的field + + .. csv-table:: + :header: "raw_words", "words", "target", "seq_len" + + "it 's a charming and... ", "[3, 4, 5, 6, 7,...]", 1, 43 + "unflinchingly bleak and...", "[10, 11, 7,...]", 1, 21 + "...", "...", ., . + + :param bool lower: 是否对输入进行小写化。 + :param str tokenizer: 使用哪种tokenize方式将数据切成单词。支持'spacy'和'raw'。raw使用空格作为切分。 + """ + def __init__(self, lower=False, tokenizer='spacy'): + super().__init__(tokenizer=tokenizer, lang='en') + self.lower = lower + + def process(self, data_bundle:DataBundle): + """ + 可以处理的DataSet应该具备如下的结构 + + .. csv-table:: + :header: "raw_words", "target" + + "it 's a charming and... ", 1 + "unflinchingly bleak and...", 1 + "...", "..." + + :param data_bundle: + :return: + """ + _add_words_field(data_bundle, self.lower) + + data_bundle = self._tokenize(data_bundle=data_bundle) + + src_vocab = Vocabulary() + src_vocab.from_dataset(data_bundle.datasets['train'], field_name=Const.INPUT, + no_create_entry_dataset=[dataset for name,dataset in data_bundle.datasets.items() if + name != 'train']) + src_vocab.index_dataset(*data_bundle.datasets.values(), field_name=Const.INPUT) + + tgt_vocab = Vocabulary(unknown=None, padding=None) + tgt_vocab.from_dataset(data_bundle.datasets['train'], field_name=Const.TARGET) + datasets = [] + for name, dataset in data_bundle.datasets.items(): + if dataset.has_field(Const.TARGET): + datasets.append(dataset) + tgt_vocab.index_dataset(*datasets, field_name=Const.TARGET) + + data_bundle.set_vocab(src_vocab, Const.INPUT) + data_bundle.set_vocab(tgt_vocab, Const.TARGET) + + for name, dataset in data_bundle.datasets.items(): + dataset.add_seq_len(Const.INPUT) + + data_bundle.set_input(Const.INPUT, Const.INPUT_LEN) + data_bundle.set_target(Const.TARGET) + + return data_bundle + + def process_from_file(self, paths=None): + """ + + :param str paths: 如果为None,则自动下载并缓存到fastNLP的缓存地址。 + :return: DataBundle + """ + data_bundle = SST2Loader().load(paths) + return self.process(data_bundle) + + +class IMDBPipe(_CLSPipe): + """ + 经过本Pipe处理后DataSet将如下 + + .. 
csv-table:: 输出DataSet的field + :header: "raw_words", "words", "target", "seq_len" + + "Bromwell High is a cartoon ... ", "[3, 5, 6, 9, ...]", 0, 20 + "Story of a man who has ...", "[20, 43, 9, 10, ...]", 1, 31 + "...", "[...]", ., . + + 其中raw_words为str类型,是原文; words是转换为index的输入; target是转换为index的目标值; + words列被设置为input; target列被设置为target。 + + :param bool lower: 是否将words列的数据小写。 + :param str tokenizer: 使用什么tokenizer来将句子切分为words. 支持spacy, raw两种。raw即使用空格拆分。 + """ + def __init__(self, lower:bool=False, tokenizer:str='spacy'): + super().__init__(tokenizer=tokenizer, lang='en') + self.lower = lower + + def process(self, data_bundle:DataBundle): + """ + 期待的DataBunlde中输入的DataSet应该类似于如下,有两个field,raw_words和target,且均为str类型 + + .. csv-table:: 输入DataSet的field + :header: "raw_words", "target" + + "Bromwell High is a cartoon ... ", "pos" + "Story of a man who has ...", "neg" + "...", "..." + + :param DataBunlde data_bundle: 传入的DataBundle中的DataSet必须包含raw_words和target两个field,且raw_words列应该为str, + target列应该为str。 + :return:DataBundle + """ + # 替换
+        def replace_br(raw_words):
+            raw_words = raw_words.replace("<br />
", ' ') + return raw_words + + for name, dataset in data_bundle.datasets.items(): + dataset.apply_field(replace_br, field_name=Const.RAW_WORD, new_field_name=Const.RAW_WORD) + + _add_words_field(data_bundle, lower=self.lower) + self._tokenize(data_bundle, field_name=Const.INPUT, new_field_name=Const.INPUT) + _indexize(data_bundle) + + for name, dataset in data_bundle.datasets.items(): + dataset.add_seq_len(Const.INPUT) + dataset.set_input(Const.INPUT, Const.INPUT_LEN) + dataset.set_target(Const.TARGET) + + return data_bundle + + def process_from_file(self, paths=None): + """ + + :param paths: 支持路径类型参见 :class:`fastNLP.io.loader.Loader` 的load函数。 + :return: DataBundle + """ + # 读取数据 + data_bundle = IMDBLoader().load(paths) + data_bundle = self.process(data_bundle) + + return data_bundle + + + diff --git a/fastNLP/io/pipe/conll.py b/fastNLP/io/pipe/conll.py new file mode 100644 index 00000000..4f780614 --- /dev/null +++ b/fastNLP/io/pipe/conll.py @@ -0,0 +1,149 @@ +from .pipe import Pipe +from .. import DataBundle +from .utils import iob2, iob2bioes +from ... import Const +from ..loader.conll import Conll2003NERLoader, OntoNotesNERLoader +from .utils import _indexize, _add_words_field + + +class _NERPipe(Pipe): + """ + NER任务的处理Pipe, 该Pipe会(1)复制raw_words列,并命名为words; (2)在words, target列建立词表 + (创建 :class:`fastNLP.Vocabulary` 对象,所以在返回的DataBundle中将有两个Vocabulary); (3)将words,target列根据相应的 + Vocabulary转换为index。 + + raw_words列为List[str], 是未转换的原始数据; words列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 + target。返回的DataSet中被设置为input有words, target, seq_len; 设置为target有target, seq_len。 + + :param: str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。 + :param bool lower: 是否将words小写化后再建立词表,绝大多数情况都不需要设置为True。 + :param int target_pad_val: target的padding值,target这一列pad的位置值为target_pad_val。默认为-100。 + """ + def __init__(self, encoding_type:str='bio', lower:bool=False, target_pad_val=0): + if encoding_type == 'bio': + self.convert_tag = iob2 + else: + self.convert_tag = iob2bioes + self.lower = lower + self.target_pad_val = int(target_pad_val) + + def process(self, data_bundle:DataBundle)->DataBundle: + """ + 支持的DataSet的field为 + + .. 
csv-table:: Following is a demo layout of DataSet returned by Conll2003Loader + :header: "raw_words", "target" + + "[Nadim, Ladki]", "[B-PER, I-PER]" + "[AL-AIN, United, Arab, ...]", "[B-LOC, B-LOC, I-LOC, ...]" + "[...]", "[...]" + + + :param DataBundle data_bundle: 传入的DataBundle中的DataSet必须包含raw_words和ner两个field,且两个field的内容均为List[str]。 + 在传入DataBundle基础上原位修改。 + :return: DataBundle + + Example:: + + data_bundle = Conll2003Loader().load('/path/to/conll2003/') + data_bundle = Conll2003NERPipe().process(data_bundle) + + # 获取train + tr_data = data_bundle.get_dataset('train') + + # 获取target这个field的词表 + target_vocab = data_bundle.get_vocab('target') + # 获取words这个field的词表 + word_vocab = data_bundle.get_vocab('words') + + """ + # 转换tag + for name, dataset in data_bundle.datasets.items(): + dataset.apply_field(self.convert_tag, field_name=Const.TARGET, new_field_name=Const.TARGET) + + _add_words_field(data_bundle, lower=self.lower) + + # index + _indexize(data_bundle) + + input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN] + target_fields = [Const.TARGET, Const.INPUT_LEN] + + for name, dataset in data_bundle.datasets.items(): + dataset.set_pad_val(Const.TARGET, self.target_pad_val) + dataset.add_seq_len(Const.INPUT) + + data_bundle.set_input(*input_fields) + data_bundle.set_target(*target_fields) + + return data_bundle + + def process_from_file(self, paths) -> DataBundle: + """ + + :param paths: 支持路径类型参见 :class:`fastNLP.io.loader.ConllLoader` 的load函数。 + :return: DataBundle + """ + # 读取数据 + data_bundle = Conll2003NERLoader().load(paths) + data_bundle = self.process(data_bundle) + + return data_bundle + + +class Conll2003NERPipe(_NERPipe): + """ + Conll2003的NER任务的处理Pipe, 该Pipe会(1)复制raw_words列,并命名为words; (2)在words, target列建立词表 + (创建 :class:`fastNLP.Vocabulary` 对象,所以在返回的DataBundle中将有两个Vocabulary); (3)将words,target列根据相应的 + Vocabulary转换为index。 + 经过该Pipe过后,DataSet中的内容如下所示 + + .. csv-table:: Following is a demo layout of DataSet returned by Conll2003Loader + :header: "raw_words", "words", "target", "seq_len" + + "[Nadim, Ladki]", "[1, 2]", "[1, 2]", 2 + "[AL-AIN, United, Arab, ...]", "[3, 4, 5,...]", "[3, 4]", 10 + "[...]", "[...]", "[...]", . + + raw_words列为List[str], 是未转换的原始数据; words列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 + target。返回的DataSet中被设置为input有words, target, seq_len; 设置为target有target。 + + :param: str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。 + :param bool lower: 是否将words小写化后再建立词表,绝大多数情况都不需要设置为True。 + :param int target_pad_val: target的padding值,target这一列pad的位置值为target_pad_val。默认为-100。 + """ + + def process_from_file(self, paths) -> DataBundle: + """ + + :param paths: 支持路径类型参见 :class:`fastNLP.io.loader.ConllLoader` 的load函数。 + :return: DataBundle + """ + # 读取数据 + data_bundle = Conll2003NERLoader().load(paths) + data_bundle = self.process(data_bundle) + + return data_bundle + + +class OntoNotesNERPipe(_NERPipe): + """ + 处理OntoNotes的NER数据,处理之后DataSet中的field情况为 + + .. csv-table:: Following is a demo layout of DataSet returned by Conll2003Loader + :header: "raw_words", "words", "target", "seq_len" + + "[Nadim, Ladki]", "[1, 2]", "[1, 2]", 2 + "[AL-AIN, United, Arab, ...]", "[3, 4, 5,...]", "[3, 4]", 6 + "[...]", "[...]", "[...]", . 
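
For reference, the encoding_type option of these NER pipes switches between the two tag schemes roughly as follows (hand-worked example, not produced by running iob2bioes)::

    bio   = ['B-PER', 'I-PER', 'O', 'B-ORG', 'I-ORG', 'I-ORG', 'B-LOC']
    bioes = ['B-PER', 'E-PER', 'O', 'B-ORG', 'I-ORG', 'E-ORG', 'S-LOC']

Single-token entities become S-, entity-final tokens become E-, and all other tags keep their BIO form.
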
+ + + :param bool lower: 是否将words小写化后再建立词表,绝大多数情况都不需要设置为True。 + :param bool delete_unused_fields: 是否删除NER任务中用不到的field。 + :param int target_pad_val: target的padding值,target这一列pad的位置值为target_pad_val。默认为-100。 + """ + + def process_from_file(self, paths): + data_bundle = OntoNotesNERLoader().load(paths) + return self.process(data_bundle) + diff --git a/fastNLP/io/pipe/matching.py b/fastNLP/io/pipe/matching.py new file mode 100644 index 00000000..76a0eaf7 --- /dev/null +++ b/fastNLP/io/pipe/matching.py @@ -0,0 +1,254 @@ +import math + +from .pipe import Pipe +from .utils import get_tokenizer +from ...core import Const +from ...core import Vocabulary +from ..loader.matching import SNLILoader, MNLILoader, QNLILoader, RTELoader, QuoraLoader + +class MatchingBertPipe(Pipe): + """ + Matching任务的Bert pipe,输出的DataSet将包含以下的field + + .. csv-table:: + :header: "raw_words1", "raw_words2", "words", "target", "seq_len" + + "The new rights are...", "Everyone really likes..", "[2, 3, 4, 5, ...]", 1, 10 + "This site includes a...", "The Government Executive...", "[11, 12, 13,...]", 0, 5 + "...", "...", "[...]", ., . + + words列是将raw_words1(即premise), raw_words2(即hypothesis)使用"[SEP]"链接起来转换为index的。 + words列被设置为input,target列被设置为target. + + :param bool lower: 是否将word小写化。 + :param str tokenizer: 使用什么tokenizer来将句子切分为words. 支持spacy, raw两种。raw即使用空格拆分。 + :param int max_concat_sent_length: 如果concat后的句子长度超过了该值,则合并后的句子将被截断到这个长度,截断时同时对premise + 和hypothesis按比例截断。 + """ + def __init__(self, lower=False, tokenizer:str='raw', max_concat_sent_length:int=480): + super().__init__() + + self.lower = bool(lower) + self.tokenizer = get_tokenizer(tokenizer=tokenizer) + self.max_concat_sent_length = int(max_concat_sent_length) + + def _tokenize(self, data_bundle, field_names, new_field_names): + """ + + :param DataBundle data_bundle: DataBundle. 
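
The proportional truncation controlled by max_concat_sent_length (described above) amounts to the following arithmetic (illustrative numbers only, not library output)::

    import math

    max_concat_sent_length = 10
    len0, len1 = 12, 8                                   # premise / hypothesis token counts
    ratio = max_concat_sent_length / (len0 + len1)       # 0.5
    len0, len1 = math.floor(ratio * len0), math.floor(ratio * len1)   # 6, 4
    # concatenated input becomes words0[:6] + ['[SEP]'] + words1[:4]
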
+ :param list field_names: List[str], 需要tokenize的field名称 + :param list new_field_names: List[str], tokenize之后field的名称,与field_names一一对应。 + :return: 输入的DataBundle对象 + """ + for name, dataset in data_bundle.datasets.items(): + for field_name, new_field_name in zip(field_names, new_field_names): + dataset.apply_field(lambda words:self.tokenizer(words), field_name=field_name, + new_field_name=new_field_name) + return data_bundle + + def process(self, data_bundle): + for name, dataset in data_bundle.datasets.items(): + dataset.copy_field(Const.RAW_WORDS(0), Const.INPUTS(0)) + dataset.copy_field(Const.RAW_WORDS(1), Const.INPUTS(1)) + + if self.lower: + for name, dataset in data_bundle.datasets.items(): + dataset[Const.INPUTS(0)].lower() + dataset[Const.INPUTS(1)].lower() + + data_bundle = self._tokenize(data_bundle, [Const.INPUTS(0), Const.INPUT(1)], + [Const.INPUTS(0), Const.INPUTS(1)]) + + # concat两个words + def concat(ins): + words0 = ins[Const.INPUTS(0)] + words1 = ins[Const.INPUTS(1)] + len0 = len(words0) + len1 = len(words1) + if len0 + len1 > self.max_concat_sent_length: + ratio = self.max_concat_sent_length / (len0 + len1) + len0 = math.floor(ratio * len0) + len1 = math.floor(ratio * len1) + words0 = words0[:len0] + words1 = words1[:len1] + + words = words0 + ['[SEP]'] + words1 + return words + for name, dataset in data_bundle.datasets.items(): + dataset.apply(concat, new_field_name=Const.INPUT) + dataset.delete_field(Const.INPUTS(0)) + dataset.delete_field(Const.INPUTS(1)) + + word_vocab = Vocabulary() + word_vocab.from_dataset(data_bundle.datasets['train'], field_name=Const.INPUT, + no_create_entry_dataset=[dataset for name, dataset in data_bundle.datasets.items() if + name != 'train']) + word_vocab.index_dataset(*data_bundle.datasets.values(), field_name=Const.INPUT) + + target_vocab = Vocabulary(padding=None, unknown=None) + target_vocab.from_dataset(data_bundle.datasets['train'], field_name=Const.TARGET) + has_target_datasets = [] + for name, dataset in data_bundle.datasets.items(): + if dataset.has_field(Const.TARGET): + has_target_datasets.append(dataset) + target_vocab.index_dataset(*has_target_datasets, field_name=Const.TARGET) + + data_bundle.set_vocab(word_vocab, Const.INPUT) + data_bundle.set_vocab(target_vocab, Const.TARGET) + + input_fields = [Const.INPUT, Const.INPUT_LEN] + target_fields = [Const.TARGET] + + for name, dataset in data_bundle.datasets.items(): + dataset.add_seq_len(Const.INPUT) + dataset.set_input(*input_fields, flag=True) + dataset.set_target(*target_fields, flag=True) + + return data_bundle + + +class RTEBertPipe(MatchingBertPipe): + def process_from_file(self, paths=None): + data_bundle = RTELoader().load(paths) + return self.process(data_bundle) + + +class SNLIBertPipe(MatchingBertPipe): + def process_from_file(self, paths=None): + data_bundle = SNLILoader().load(paths) + return self.process(data_bundle) + + +class QuoraBertPipe(MatchingBertPipe): + def process_from_file(self, paths): + data_bundle = QuoraLoader().load(paths) + return self.process(data_bundle) + + +class QNLIBertPipe(MatchingBertPipe): + def process_from_file(self, paths=None): + data_bundle = QNLILoader().load(paths) + return self.process(data_bundle) + + +class MNLIBertPipe(MatchingBertPipe): + def process_from_file(self, paths=None): + data_bundle = MNLILoader().load(paths) + return self.process(data_bundle) + + +class MatchingPipe(Pipe): + """ + Matching任务的Pipe。输出的DataSet将包含以下的field + + .. 
csv-table:: + :header: "raw_words1", "raw_words2", "words1", "words2", "target", "seq_len1", "seq_len2" + + "The new rights are...", "Everyone really likes..", "[2, 3, 4, 5, ...]", "[10, 20, 6]", 1, 10, 13 + "This site includes a...", "The Government Executive...", "[11, 12, 13,...]", "[2, 7, ...]", 0, 6, 7 + "...", "...", "[...]", "[...]", ., ., . + + words1是premise,words2是hypothesis。其中words1,words2,seq_len1,seq_len2被设置为input;target被设置为target。 + + :param bool lower: 是否将所有raw_words转为小写。 + :param str tokenizer: 将原始数据tokenize的方式。支持spacy, raw. spacy是使用spacy切分,raw就是用空格切分。 + """ + def __init__(self, lower=False, tokenizer:str='raw'): + super().__init__() + + self.lower = bool(lower) + self.tokenizer = get_tokenizer(tokenizer=tokenizer) + + def _tokenize(self, data_bundle, field_names, new_field_names): + """ + + :param DataBundle data_bundle: DataBundle. + :param list field_names: List[str], 需要tokenize的field名称 + :param list new_field_names: List[str], tokenize之后field的名称,与field_names一一对应。 + :return: 输入的DataBundle对象 + """ + for name, dataset in data_bundle.datasets.items(): + for field_name, new_field_name in zip(field_names, new_field_names): + dataset.apply_field(lambda words:self.tokenizer(words), field_name=field_name, + new_field_name=new_field_name) + return data_bundle + + def process(self, data_bundle): + """ + 接受的DataBundle中的DataSet应该具有以下的field, target列可以没有 + + .. csv-table:: + :header: "raw_words1", "raw_words2", "target" + + "The new rights are...", "Everyone really likes..", "entailment" + "This site includes a...", "The Government Executive...", "not_entailment" + "...", "..." + + :param data_bundle: + :return: + """ + data_bundle = self._tokenize(data_bundle, [Const.RAW_WORDS(0), Const.RAW_WORDS(1)], + [Const.INPUTS(0), Const.INPUTS(1)]) + + if self.lower: + for name, dataset in data_bundle.datasets.items(): + dataset[Const.INPUTS(0)].lower() + dataset[Const.INPUTS(1)].lower() + + word_vocab = Vocabulary() + word_vocab.from_dataset(data_bundle.datasets['train'], field_name=[Const.INPUTS(0), Const.INPUTS(1)], + no_create_entry_dataset=[dataset for name, dataset in data_bundle.datasets.items() if + name != 'train']) + word_vocab.index_dataset(*data_bundle.datasets.values(), field_name=[Const.INPUTS(0), Const.INPUTS(1)]) + + target_vocab = Vocabulary(padding=None, unknown=None) + target_vocab.from_dataset(data_bundle.datasets['train'], field_name=Const.TARGET) + has_target_datasets = [] + for name, dataset in data_bundle.datasets.items(): + if dataset.has_field(Const.TARGET): + has_target_datasets.append(dataset) + target_vocab.index_dataset(*has_target_datasets, field_name=Const.TARGET) + + data_bundle.set_vocab(word_vocab, Const.INPUTS(0)) + data_bundle.set_vocab(target_vocab, Const.TARGET) + + input_fields = [Const.INPUTS(0), Const.INPUTS(1), Const.INPUT_LEN(0), Const.INPUT_LEN(1)] + target_fields = [Const.TARGET] + + for name, dataset in data_bundle.datasets.items(): + dataset.add_seq_len(Const.INPUTS(0), Const.INPUT_LEN(0)) + dataset.add_seq_len(Const.INPUTS(1), Const.INPUT_LEN(1)) + dataset.set_input(*input_fields, flag=True) + dataset.set_target(*target_fields, flag=True) + + return data_bundle + + +class RTEPipe(MatchingPipe): + def process_from_file(self, paths=None): + data_bundle = RTELoader().load(paths) + return self.process(data_bundle) + + +class SNLIPipe(MatchingPipe): + def process_from_file(self, paths=None): + data_bundle = SNLILoader().load(paths) + return self.process(data_bundle) + + +class QuoraPipe(MatchingPipe): + def process_from_file(self, paths): + 
data_bundle = QuoraLoader().load(paths) + return self.process(data_bundle) + +class QNLIPipe(MatchingPipe): + def process_from_file(self, paths=None): + data_bundle = QNLILoader().load(paths) + return self.process(data_bundle) + + +class MNLIPipe(MatchingPipe): + def process_from_file(self, paths=None): + data_bundle = MNLILoader().load(paths) + return self.process(data_bundle) + diff --git a/fastNLP/io/pipe/pipe.py b/fastNLP/io/pipe/pipe.py new file mode 100644 index 00000000..14c3866a --- /dev/null +++ b/fastNLP/io/pipe/pipe.py @@ -0,0 +1,9 @@ + +from .. import DataBundle + +class Pipe: + def process(self, data_bundle:DataBundle)->DataBundle: + raise NotImplementedError + + def process_from_file(self, paths)->DataBundle: + raise NotImplementedError diff --git a/fastNLP/io/pipe/utils.py b/fastNLP/io/pipe/utils.py new file mode 100644 index 00000000..59bee96e --- /dev/null +++ b/fastNLP/io/pipe/utils.py @@ -0,0 +1,142 @@ +from typing import List +from ...core import Vocabulary +from ...core import Const + +def iob2(tags:List[str])->List[str]: + """ + 检查数据是否是合法的IOB数据,如果是IOB1会被自动转换为IOB2。两种格式的区别见https://datascience.stackexchange.com/questions/37824/difference-between-iob-and-iob2-format + + :param tags: 需要转换的tags + """ + for i, tag in enumerate(tags): + if tag == "O": + continue + split = tag.split("-") + if len(split) != 2 or split[0] not in ["I", "B"]: + raise TypeError("The encoding schema is not a valid IOB type.") + if split[0] == "B": + continue + elif i == 0 or tags[i - 1] == "O": # conversion IOB1 to IOB2 + tags[i] = "B" + tag[1:] + elif tags[i - 1][1:] == tag[1:]: + continue + else: # conversion IOB1 to IOB2 + tags[i] = "B" + tag[1:] + return tags + +def iob2bioes(tags:List[str])->List[str]: + """ + 将iob的tag转换为bioes编码 + :param tags: + :return: + """ + new_tags = [] + for i, tag in enumerate(tags): + if tag == 'O': + new_tags.append(tag) + else: + split = tag.split('-')[0] + if split == 'B': + if i+1!=len(tags) and tags[i+1].split('-')[0] == 'I': + new_tags.append(tag) + else: + new_tags.append(tag.replace('B-', 'S-')) + elif split == 'I': + if i + 1Dict[str, str]: +def check_loader_paths(paths:Union[str, Dict[str, str]])->Dict[str, str]: """ 检查传入dataloader的文件的合法性。如果为合法路径,将返回至少包含'train'这个key的dict。类似于下面的结果 { @@ -11,13 +12,14 @@ def check_dataloader_paths(paths:Union[str, Dict[str, str]])->Dict[str, str]: 'test': 'xxx' # 可能有,也可能没有 ... } - 如果paths为不合法的,将直接进行raise相应的错误 + 如果paths为不合法的,将直接进行raise相应的错误. 如果paths内不包含train也会报错。 - :param paths: 路径. 可以为一个文件路径(则认为该文件就是train的文件); 可以为一个文件目录,将在该目录下寻找train(文件名 + :param str paths: 路径. 
可以为一个文件路径(则认为该文件就是train的文件); 可以为一个文件目录,将在该目录下寻找train(文件名 中包含train这个字段), test.txt, dev.txt; 可以为一个dict, 则key是用户自定义的某个文件的名称,value是这个文件的路径。 :return: """ - if isinstance(paths, str): + if isinstance(paths, (str, Path)): + paths = os.path.abspath(os.path.expanduser(paths)) if os.path.isfile(paths): return {'train': paths} elif os.path.isdir(paths): @@ -37,6 +39,8 @@ def check_dataloader_paths(paths:Union[str, Dict[str, str]])->Dict[str, str]: path_pair = ('test', filename) if path_pair: files[path_pair[0]] = os.path.join(paths, path_pair[1]) + if 'train' not in files: + raise KeyError(f"There is no train file in {paths}.") return files else: raise FileNotFoundError(f"{paths} is not a valid file path.") @@ -47,8 +51,10 @@ def check_dataloader_paths(paths:Union[str, Dict[str, str]])->Dict[str, str]: raise KeyError("You have to include `train` in your dict.") for key, value in paths.items(): if isinstance(key, str) and isinstance(value, str): + value = os.path.abspath(os.path.expanduser(value)) if not os.path.isfile(value): raise TypeError(f"{value} is not a valid file.") + paths[key] = value else: raise TypeError("All keys and values in paths should be str.") return paths diff --git a/test/embeddings/__init__.py b/test/embeddings/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/test/modules/encoder/test_bert.py b/test/embeddings/test_bert.py similarity index 100% rename from test/modules/encoder/test_bert.py rename to test/embeddings/test_bert.py diff --git a/test/embeddings/test_elmo_embedding.py b/test/embeddings/test_elmo_embedding.py new file mode 100644 index 00000000..a087f0a4 --- /dev/null +++ b/test/embeddings/test_elmo_embedding.py @@ -0,0 +1,21 @@ + +import unittest +from fastNLP import Vocabulary +from fastNLP.embeddings import ElmoEmbedding +import torch +import os + +@unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") +class TestDownload(unittest.TestCase): + def test_download_small(self): + # import os + vocab = Vocabulary().add_word_lst("This is a test .".split()) + elmo_embed = ElmoEmbedding(vocab, model_dir_or_name='en-small') + words = torch.LongTensor([[0, 1, 2]]) + print(elmo_embed(words).size()) + + +# 首先保证所有权重可以加载;上传权重;验证可以下载 + + + diff --git a/test/io/loader/test_classification_loader.py b/test/io/loader/test_classification_loader.py new file mode 100644 index 00000000..28f08921 --- /dev/null +++ b/test/io/loader/test_classification_loader.py @@ -0,0 +1,19 @@ + +import unittest +from fastNLP.io.loader.classification import YelpFullLoader +from fastNLP.io.loader.classification import YelpPolarityLoader +from fastNLP.io.loader.classification import IMDBLoader +from fastNLP.io.loader.classification import SST2Loader +from fastNLP.io.loader.classification import SSTLoader +import os + +@unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") +class TestDownload(unittest.TestCase): + def test_download(self): + for loader in [YelpFullLoader, YelpPolarityLoader, IMDBLoader, SST2Loader, SSTLoader]: + loader().download() + + def test_load(self): + for loader in [YelpFullLoader, YelpPolarityLoader, IMDBLoader, SST2Loader, SSTLoader]: + data_bundle = loader().load() + print(data_bundle) diff --git a/test/io/loader/test_matching_loader.py b/test/io/loader/test_matching_loader.py new file mode 100644 index 00000000..5c1a91f1 --- /dev/null +++ b/test/io/loader/test_matching_loader.py @@ -0,0 +1,22 @@ + +import unittest +from fastNLP.io.loader.matching import RTELoader +from fastNLP.io.loader.matching import QNLILoader +from fastNLP.io.loader.matching import 
SNLILoader +from fastNLP.io.loader.matching import QuoraLoader +from fastNLP.io.loader.matching import MNLILoader +import os + +@unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") +class TestDownload(unittest.TestCase): + def test_download(self): + for loader in [RTELoader, QNLILoader, SNLILoader, MNLILoader]: + loader().download() + with self.assertRaises(Exception): + QuoraLoader().load() + + def test_load(self): + for loader in [RTELoader, QNLILoader, SNLILoader, MNLILoader]: + data_bundle = loader().load() + print(data_bundle) + diff --git a/test/io/pipe/test_classification.py b/test/io/pipe/test_classification.py new file mode 100644 index 00000000..39dc71e0 --- /dev/null +++ b/test/io/pipe/test_classification.py @@ -0,0 +1,13 @@ +import unittest +import os + +from fastNLP.io.pipe.classification import SSTPipe, SST2Pipe, IMDBPipe, YelpFullPipe, YelpPolarityPipe + +@unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") +class TestPipe(unittest.TestCase): + def test_process_from_file(self): + for pipe in [YelpPolarityPipe, SST2Pipe, IMDBPipe, YelpFullPipe, SSTPipe]: + with self.subTest(pipe=pipe): + print(pipe) + data_bundle = pipe(tokenizer='raw').process_from_file() + print(data_bundle) diff --git a/test/io/pipe/test_matching.py b/test/io/pipe/test_matching.py new file mode 100644 index 00000000..c057bb0c --- /dev/null +++ b/test/io/pipe/test_matching.py @@ -0,0 +1,26 @@ + +import unittest +import os + +from fastNLP.io.pipe.matching import SNLIPipe, RTEPipe, QNLIPipe, MNLIPipe +from fastNLP.io.pipe.matching import SNLIBertPipe, RTEBertPipe, QNLIBertPipe, MNLIBertPipe + + +@unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") +class TestPipe(unittest.TestCase): + def test_process_from_file(self): + for pipe in [SNLIPipe, RTEPipe, QNLIPipe, MNLIPipe]: + with self.subTest(pipe=pipe): + print(pipe) + data_bundle = pipe(tokenizer='raw').process_from_file() + print(data_bundle) + + +@unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") +class TestBertPipe(unittest.TestCase): + def test_process_from_file(self): + for pipe in [SNLIBertPipe, RTEBertPipe, QNLIBertPipe, MNLIBertPipe]: + with self.subTest(pipe=pipe): + print(pipe) + data_bundle = pipe(tokenizer='raw').process_from_file() + print(data_bundle) From afa73bf5c88720890923318974be3ef44047a0e5 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Wed, 14 Aug 2019 18:03:42 +0800 Subject: [PATCH 028/153] format some docs --- fastNLP/io/__init__.py | 62 +++++++++++++++++------- fastNLP/io/loader/__init__.py | 79 +++++++++++++++++++++---------- fastNLP/io/loader/matching.py | 7 --- fastNLP/io/pipe/__init__.py | 36 ++++++++++++-- fastNLP/io/pipe/classification.py | 1 - fastNLP/io/pipe/conll.py | 8 ++-- fastNLP/io/pipe/matching.py | 1 + fastNLP/io/pipe/pipe.py | 6 +-- 8 files changed, 140 insertions(+), 60 deletions(-) diff --git a/fastNLP/io/__init__.py b/fastNLP/io/__init__.py index cd0d3527..bf5c2c36 100644 --- a/fastNLP/io/__init__.py +++ b/fastNLP/io/__init__.py @@ -14,27 +14,56 @@ __all__ = [ 'EmbedLoader', - 'CSVLoader', - 'JsonLoader', - 'DataBundle', 'DataSetLoader', - 'ConllLoader', - 'Conll2003Loader', + 'YelpLoader', + 'YelpFullLoader', + 'YelpPolarityLoader', 'IMDBLoader', - 'MatchingLoader', - 'SNLILoader', - 'MNLILoader', - 'MTL16Loader', - 'PeopleDailyCorpusLoader', - 'QNLILoader', - 'QuoraLoader', - 'RTELoader', 'SSTLoader', 'SST2Loader', - 'YelpLoader', - + + 'ConllLoader', + 'Conll2003Loader', + 'Conll2003NERLoader', + 'OntoNotesNERLoader', + 'CTBLoader', + + 'Loader', + 'CSVLoader', + 'JsonLoader', + + 
'CWSLoader', + + 'MNLILoader', + "QuoraLoader", + "SNLILoader", + "QNLILoader", + "RTELoader", + + "YelpFullPipe", + "YelpPolarityPipe", + "SSTPipe", + "SST2Pipe", + "IMDBPipe", + + "Conll2003NERPipe", + "OntoNotesNERPipe", + + "MatchingBertPipe", + "RTEBertPipe", + "SNLIBertPipe", + "QuoraBertPipe", + "QNLIBertPipe", + "MNLIBertPipe", + "MatchingPipe", + "RTEPipe", + "SNLIPipe", + "QuoraPipe", + "QNLIPipe", + "MNLIPipe", + 'ModelLoader', 'ModelSaver', ] @@ -44,4 +73,5 @@ from .base_loader import DataBundle, DataSetLoader from .dataset_loader import CSVLoader, JsonLoader from .model_io import ModelLoader, ModelSaver -from .data_loader import * +from .loader import * +from .pipe import * diff --git a/fastNLP/io/loader/__init__.py b/fastNLP/io/loader/__init__.py index 8e436532..4905a34f 100644 --- a/fastNLP/io/loader/__init__.py +++ b/fastNLP/io/loader/__init__.py @@ -1,30 +1,61 @@ - """ Loader用于读取数据,并将内容读取到 :class:`~fastNLP.DataSet` 或者 :class:`~fastNLP.io.DataBundle`中。所有的Loader都支持以下的 - 三个方法: __init__(),_load(), loads(). 其中__init__()用于申明读取参数,以及说明该Loader支持的数据格式,读取后Dataset中field - ; _load(path)方法传入一个文件路径读取单个文件,并返回DataSet; load(paths)用于读取文件夹下的文件,并返回DataBundle, load()方法 - 支持以下三种类型的参数 +三个方法: __init__(),_load(), loads(). 其中__init__()用于申明读取参数,以及说明该Loader支持的数据格式,读取后Dataset中field +; _load(path)方法传入一个文件路径读取单个文件,并返回DataSet; load(paths)用于读取文件夹下的文件,并返回DataBundle, load()方法 +支持以下三种类型的参数:: - Example:: - (0) 如果传入None,将尝试自动下载数据集并缓存。但不是所有的数据都可以直接下载。 - (1) 如果传入的是一个文件path,则返回的DataBundle包含一个名为train的DataSet可以通过data_bundle.datasets['train']获取 - (2) 传入的是一个文件夹目录,将读取的是这个文件夹下文件名中包含'train', 'test', 'dev'的文件,其它文件会被忽略。 - 假设某个目录下的文件为 - -train.txt - -dev.txt - -test.txt - -other.txt - Loader().load('/path/to/dir')读取,返回的data_bundle中可以用data_bundle.datasets['train'], data_bundle.datasets['dev'], - data_bundle.datasets['test']获取对应的DataSet,其中other.txt的内容会被忽略。 - 假设某个目录下的文件为 - -train.txt - -dev.txt - Loader().load('/path/to/dir')读取,返回的data_bundle中可以用data_bundle.datasets['train'], data_bundle.datasets['dev']获取 - 对应的DataSet。 - (3) 传入一个dict,key为dataset的名称,value是该dataset的文件路径。 - paths = {'train':'/path/to/train', 'dev': '/path/to/dev', 'test':'/path/to/test'} - Loader().load(paths) # 返回的data_bundle可以通过以下的方式获取相应的DataSet, data_bundle.datasets['train'], data_bundle.datasets['dev'], - data_bundle.datasets['test'] + (0) 如果传入None,将尝试自动下载数据集并缓存。但不是所有的数据都可以直接下载。 + (1) 如果传入的是一个文件path,则返回的DataBundle包含一个名为train的DataSet可以通过data_bundle.datasets['train']获取 + (2) 传入的是一个文件夹目录,将读取的是这个文件夹下文件名中包含'train', 'test', 'dev'的文件,其它文件会被忽略。 + 假设某个目录下的文件为 + -train.txt + -dev.txt + -test.txt + -other.txt + Loader().load('/path/to/dir')读取,返回的data_bundle中可以用data_bundle.datasets['train'], data_bundle.datasets['dev'], + data_bundle.datasets['test']获取对应的DataSet,其中other.txt的内容会被忽略。 + 假设某个目录下的文件为 + -train.txt + -dev.txt + Loader().load('/path/to/dir')读取,返回的data_bundle中可以用data_bundle.datasets['train'], data_bundle.datasets['dev']获取 + 对应的DataSet。 + (3) 传入一个dict,key为dataset的名称,value是该dataset的文件路径。 + paths = {'train':'/path/to/train', 'dev': '/path/to/dev', 'test':'/path/to/test'} + Loader().load(paths) # 返回的data_bundle可以通过以下的方式获取相应的DataSet, data_bundle.datasets['train'], data_bundle.datasets['dev'], + data_bundle.datasets['test'] """ +__all__ = [ + 'YelpLoader', + 'YelpFullLoader', + 'YelpPolarityLoader', + 'IMDBLoader', + 'SSTLoader', + 'SST2Loader', + + 'ConllLoader', + 'Conll2003Loader', + 'Conll2003NERLoader', + 'OntoNotesNERLoader', + 'CTBLoader', + + 'Loader', + 'CSVLoader', + 'JsonLoader', + + 'CWSLoader', + + 'MNLILoader', + "QuoraLoader", + "SNLILoader", + 
"QNLILoader", + "RTELoader" +] +from .classification import YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader +from .conll import ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader +from .csv import CSVLoader +from .cws import CWSLoader +from .json import JsonLoader +from .loader import Loader +from .matching import MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader diff --git a/fastNLP/io/loader/matching.py b/fastNLP/io/loader/matching.py index eff98ba3..05f113c1 100644 --- a/fastNLP/io/loader/matching.py +++ b/fastNLP/io/loader/matching.py @@ -1,4 +1,3 @@ - import warnings from .loader import Loader from .json import JsonLoader @@ -9,12 +8,6 @@ from typing import Union, Dict from ...core import DataSet from ...core import Instance -__all__ = ['MNLILoader', - "QuoraLoader", - "SNLILoader", - "QNLILoader", - "RTELoader"] - class MNLILoader(Loader): """ diff --git a/fastNLP/io/pipe/__init__.py b/fastNLP/io/pipe/__init__.py index 0cf8d949..4cec3ad5 100644 --- a/fastNLP/io/pipe/__init__.py +++ b/fastNLP/io/pipe/__init__.py @@ -1,8 +1,34 @@ - - """ Pipe用于处理数据,所有的Pipe都包含一个process(DataBundle)方法,传入一个DataBundle对象, 在传入DataBundle上进行原位修改,并将其返回; - process_from_file(paths)传入的文件路径,返回一个DataBundle。process(DataBundle)或者process_from_file(paths)的返回DataBundle - 中的DataSet一般都包含原文与转换为index的输入,以及转换为index的target;除了DataSet之外,还会包含将field转为index时所建立的词表。 +process_from_file(paths)传入的文件路径,返回一个DataBundle。process(DataBundle)或者process_from_file(paths)的返回DataBundle +中的DataSet一般都包含原文与转换为index的输入,以及转换为index的target;除了DataSet之外,还会包含将field转为index时所建立的词表。 + +""" +__all__ = [ + "YelpFullPipe", + "YelpPolarityPipe", + "SSTPipe", + "SST2Pipe", + "IMDBPipe", + + "Conll2003NERPipe", + "OntoNotesNERPipe", + + "MatchingBertPipe", + "RTEBertPipe", + "SNLIBertPipe", + "QuoraBertPipe", + "QNLIBertPipe", + "MNLIBertPipe", + "MatchingPipe", + "RTEPipe", + "SNLIPipe", + "QuoraPipe", + "QNLIPipe", + "MNLIPipe", +] -""" \ No newline at end of file +from .classification import YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe +from .conll import Conll2003NERPipe, OntoNotesNERPipe +from .matching import MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, \ + MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe diff --git a/fastNLP/io/pipe/classification.py b/fastNLP/io/pipe/classification.py index a64e5328..d370a28a 100644 --- a/fastNLP/io/pipe/classification.py +++ b/fastNLP/io/pipe/classification.py @@ -1,4 +1,3 @@ - from nltk import Tree from ..base_loader import DataBundle diff --git a/fastNLP/io/pipe/conll.py b/fastNLP/io/pipe/conll.py index 4f780614..e62d1a05 100644 --- a/fastNLP/io/pipe/conll.py +++ b/fastNLP/io/pipe/conll.py @@ -19,15 +19,16 @@ class _NERPipe(Pipe): :param bool lower: 是否将words小写化后再建立词表,绝大多数情况都不需要设置为True。 :param int target_pad_val: target的padding值,target这一列pad的位置值为target_pad_val。默认为-100。 """ - def __init__(self, encoding_type:str='bio', lower:bool=False, target_pad_val=0): - if encoding_type == 'bio': + + def __init__(self, encoding_type: str = 'bio', lower: bool = False, target_pad_val=0): + if encoding_type == 'bio': self.convert_tag = iob2 else: self.convert_tag = iob2bioes self.lower = lower self.target_pad_val = int(target_pad_val) - def process(self, data_bundle:DataBundle)->DataBundle: + def process(self, data_bundle: DataBundle) -> DataBundle: """ 支持的DataSet的field为 @@ -146,4 +147,3 @@ class OntoNotesNERPipe(_NERPipe): def process_from_file(self, paths): data_bundle = 
OntoNotesNERLoader().load(paths) return self.process(data_bundle) - diff --git a/fastNLP/io/pipe/matching.py b/fastNLP/io/pipe/matching.py index 76a0eaf7..1e551f1d 100644 --- a/fastNLP/io/pipe/matching.py +++ b/fastNLP/io/pipe/matching.py @@ -6,6 +6,7 @@ from ...core import Const from ...core import Vocabulary from ..loader.matching import SNLILoader, MNLILoader, QNLILoader, RTELoader, QuoraLoader + class MatchingBertPipe(Pipe): """ Matching任务的Bert pipe,输出的DataSet将包含以下的field diff --git a/fastNLP/io/pipe/pipe.py b/fastNLP/io/pipe/pipe.py index 14c3866a..76cc00ec 100644 --- a/fastNLP/io/pipe/pipe.py +++ b/fastNLP/io/pipe/pipe.py @@ -1,9 +1,9 @@ - from .. import DataBundle + class Pipe: - def process(self, data_bundle:DataBundle)->DataBundle: + def process(self, data_bundle: DataBundle) -> DataBundle: raise NotImplementedError - def process_from_file(self, paths)->DataBundle: + def process_from_file(self, paths) -> DataBundle: raise NotImplementedError From a8a21b169a38c5172105c0d3b05a78326d11e1eb Mon Sep 17 00:00:00 2001 From: ChenXin Date: Wed, 14 Aug 2019 20:18:54 +0800 Subject: [PATCH 029/153] fix a serial bugs on importing --- fastNLP/core/dist_trainer.py | 6 +++++- fastNLP/embeddings/contextual_embedding.py | 5 ++++- fastNLP/io/config_io.py | 4 +++- fastNLP/io/loader/conll.py | 6 +++--- fastNLP/io/loader/cws.py | 4 ++-- fastNLP/io/loader/loader.py | 2 +- fastNLP/io/loader/matching.py | 6 +++--- fastNLP/io/pipe/classification.py | 5 +++-- fastNLP/io/pipe/conll.py | 2 +- fastNLP/io/pipe/matching.py | 4 ++-- fastNLP/io/pipe/utils.py | 4 ++-- .../seqence_labelling/chinese_ner/data/ChineseNER.py | 2 +- 12 files changed, 30 insertions(+), 20 deletions(-) diff --git a/fastNLP/core/dist_trainer.py b/fastNLP/core/dist_trainer.py index 4a423933..00db6361 100644 --- a/fastNLP/core/dist_trainer.py +++ b/fastNLP/core/dist_trainer.py @@ -1,3 +1,6 @@ +""" +正在开发中的分布式训练代码 +""" import torch import torch.cuda import torch.optim @@ -41,7 +44,8 @@ def get_local_rank(): class DistTrainer(): - """Distributed Trainer that support distributed and mixed precision training + """ + Distributed Trainer that support distributed and mixed precision training """ def __init__(self, train_data, model, optimizer=None, loss=None, callbacks_all=None, callbacks_master=None, diff --git a/fastNLP/embeddings/contextual_embedding.py b/fastNLP/embeddings/contextual_embedding.py index 1831af4e..152b0ab9 100644 --- a/fastNLP/embeddings/contextual_embedding.py +++ b/fastNLP/embeddings/contextual_embedding.py @@ -1,4 +1,3 @@ - from abc import abstractmethod import torch @@ -9,6 +8,10 @@ from ..core.sampler import SequentialSampler from ..core.utils import _move_model_to_device, _get_model_device from .embedding import TokenEmbedding +__all__ = [ + "ContextualEmbedding" +] + class ContextualEmbedding(TokenEmbedding): def __init__(self, vocab: Vocabulary, word_dropout:float=0.0, dropout:float=0.0): diff --git a/fastNLP/io/config_io.py b/fastNLP/io/config_io.py index 4acdbb96..ac349080 100644 --- a/fastNLP/io/config_io.py +++ b/fastNLP/io/config_io.py @@ -1,7 +1,9 @@ """ 用于读入和处理和保存 config 文件 - .. todo:: + +.. todo:: 这个模块中的类可能被抛弃? + """ __all__ = [ "ConfigLoader", diff --git a/fastNLP/io/loader/conll.py b/fastNLP/io/loader/conll.py index 43790c15..b2c89ecc 100644 --- a/fastNLP/io/loader/conll.py +++ b/fastNLP/io/loader/conll.py @@ -1,12 +1,12 @@ from typing import Dict, Union from .loader import Loader -from ... import DataSet +from ...core.dataset import DataSet from ..file_reader import _read_conll -from ... 
import Instance +from ...core.instance import Instance from .. import DataBundle from ..utils import check_loader_paths -from ... import Const +from ...core.const import Const class ConllLoader(Loader): diff --git a/fastNLP/io/loader/cws.py b/fastNLP/io/loader/cws.py index 46c07f28..3af28116 100644 --- a/fastNLP/io/loader/cws.py +++ b/fastNLP/io/loader/cws.py @@ -1,6 +1,6 @@ - from .loader import Loader -from ...core import DataSet, Instance +from ...core.dataset import DataSet +from ...core.instance import Instance class CWSLoader(Loader): diff --git a/fastNLP/io/loader/loader.py b/fastNLP/io/loader/loader.py index 4cf5bcf3..c59de29f 100644 --- a/fastNLP/io/loader/loader.py +++ b/fastNLP/io/loader/loader.py @@ -1,4 +1,4 @@ -from ... import DataSet +from ...core.dataset import DataSet from .. import DataBundle from ..utils import check_loader_paths from typing import Union, Dict diff --git a/fastNLP/io/loader/matching.py b/fastNLP/io/loader/matching.py index 05f113c1..58fa0d6f 100644 --- a/fastNLP/io/loader/matching.py +++ b/fastNLP/io/loader/matching.py @@ -1,12 +1,12 @@ import warnings from .loader import Loader from .json import JsonLoader -from ...core import Const +from ...core.const import Const from .. import DataBundle import os from typing import Union, Dict -from ...core import DataSet -from ...core import Instance +from ...core.dataset import DataSet +from ...core.instance import Instance class MNLILoader(Loader): diff --git a/fastNLP/io/pipe/classification.py b/fastNLP/io/pipe/classification.py index d370a28a..1b111e40 100644 --- a/fastNLP/io/pipe/classification.py +++ b/fastNLP/io/pipe/classification.py @@ -4,13 +4,14 @@ from ..base_loader import DataBundle from ...core.vocabulary import Vocabulary from ...core.const import Const from ..loader.classification import IMDBLoader, YelpFullLoader, SSTLoader, SST2Loader, YelpPolarityLoader -from ...core import DataSet, Instance +from ...core.dataset import DataSet +from ...core.instance import Instance from .utils import get_tokenizer, _indexize, _add_words_field, _drop_empty_instance from .pipe import Pipe import re nonalpnum = re.compile('[^0-9a-zA-Z?!\']+') -from ...core import cache_results +from ...core.utils import cache_results class _CLSPipe(Pipe): """ diff --git a/fastNLP/io/pipe/conll.py b/fastNLP/io/pipe/conll.py index e62d1a05..b9007344 100644 --- a/fastNLP/io/pipe/conll.py +++ b/fastNLP/io/pipe/conll.py @@ -1,7 +1,7 @@ from .pipe import Pipe from .. import DataBundle from .utils import iob2, iob2bioes -from ... 
import Const +from ...core.const import Const from ..loader.conll import Conll2003NERLoader, OntoNotesNERLoader from .utils import _indexize, _add_words_field diff --git a/fastNLP/io/pipe/matching.py b/fastNLP/io/pipe/matching.py index 1e551f1d..93e854b1 100644 --- a/fastNLP/io/pipe/matching.py +++ b/fastNLP/io/pipe/matching.py @@ -2,8 +2,8 @@ import math from .pipe import Pipe from .utils import get_tokenizer -from ...core import Const -from ...core import Vocabulary +from ...core.const import Const +from ...core.vocabulary import Vocabulary from ..loader.matching import SNLILoader, MNLILoader, QNLILoader, RTELoader, QuoraLoader diff --git a/fastNLP/io/pipe/utils.py b/fastNLP/io/pipe/utils.py index 59bee96e..5e9ff8dc 100644 --- a/fastNLP/io/pipe/utils.py +++ b/fastNLP/io/pipe/utils.py @@ -1,6 +1,6 @@ from typing import List -from ...core import Vocabulary -from ...core import Const +from ...core.vocabulary import Vocabulary +from ...core.const import Const def iob2(tags:List[str])->List[str]: """ diff --git a/reproduction/seqence_labelling/chinese_ner/data/ChineseNER.py b/reproduction/seqence_labelling/chinese_ner/data/ChineseNER.py index cec5ab76..0d292bdc 100644 --- a/reproduction/seqence_labelling/chinese_ner/data/ChineseNER.py +++ b/reproduction/seqence_labelling/chinese_ner/data/ChineseNER.py @@ -51,7 +51,7 @@ class ChineseNERLoader(DataSetLoader): :param paths: :param bool, bigrams: 是否包含生成bigram feature, [a, b, c, d] -> [ab, bc, cd, d] :param bool, trigrams: 是否包含trigram feature,[a, b, c, d] -> [abc, bcd, cd, d] - :return: DataBundle + :return: ~fastNLP.io.DataBundle 包含以下的fields raw_chars: List[str] chars: List[int] From b6bad76415fee9ac9123a36680e810b7b55b918d Mon Sep 17 00:00:00 2001 From: ChenXin Date: Wed, 14 Aug 2019 20:28:49 +0800 Subject: [PATCH 030/153] update the Makefile to make api-extractor work better --- docs/Makefile | 2 +- docs/format.py | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 1 deletion(-) create mode 100644 docs/format.py diff --git a/docs/Makefile b/docs/Makefile index 2b4de2d8..b9f1cf95 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -14,7 +14,7 @@ help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) apidoc: - $(SPHINXAPIDOC) -efM -o source ../$(SPHINXPROJ) + $(SPHINXAPIDOC) -efM -o source ../$(SPHINXPROJ) && python3 format.py server: cd build/html && python -m http.server diff --git a/docs/format.py b/docs/format.py new file mode 100644 index 00000000..7cc341c2 --- /dev/null +++ b/docs/format.py @@ -0,0 +1,65 @@ +import os + + +def shorten(file, to_delete, cut=False): + if file.endswith("index.rst") or file.endswith("conf.py"): + return + res = [] + with open(file, "r") as fin: + lines = fin.readlines() + for line in lines: + if cut and line.rstrip() == "Submodules": + break + else: + res.append(line.rstrip()) + for i, line in enumerate(res): + if line.endswith(" package"): + res[i] = res[i][:-len(" package")] + res[i + 1] = res[i + 1][:-len(" package")] + elif line.endswith(" module"): + res[i] = res[i][:-len(" module")] + res[i + 1] = res[i + 1][:-len(" module")] + else: + for name in to_delete: + if line.endswith(name): + res[i] = "del" + + with open(file, "w") as fout: + for line in res: + if line != "del": + print(line, file=fout) + + +def clear(path='./source/'): + files = os.listdir(path) + to_delete = [ + "fastNLP.core.dist_trainer", + "fastNLP.core.predictor", + + "fastNLP.io.file_reader", + "fastNLP.io.config_io", + + "fastNLP.embeddings.contextual_embedding", + + 
"fastNLP.modules.dropout", + "fastNLP.models.base_model", + "fastNLP.models.bert", + "fastNLP.models.enas_utils", + "fastNLP.models.enas_controller", + "fastNLP.models.enas_model", + "fastNLP.models.enas_trainer", + ] + for file in files: + if not os.path.isdir(path + file): + res = file.split('.') + if len(res) > 4: + to_delete.append(file[:-4]) + elif len(res) == 4: + shorten(path + file, to_delete, True) + else: + shorten(path + file, to_delete) + for file in to_delete: + os.remove(path + file + ".rst") + + +clear() From cdf8406ec1c8f2b48a739d9430b6e328a0bbd745 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Wed, 14 Aug 2019 20:29:14 +0800 Subject: [PATCH 031/153] updated docs --- docs/source/fastNLP.core.rst | 3 +-- docs/source/fastNLP.embeddings.rst | 3 +-- docs/source/fastNLP.io.data_loader.rst | 5 +++-- docs/source/fastNLP.io.file_utils.rst | 7 +++++++ docs/source/fastNLP.io.loader.rst | 8 ++++++++ docs/source/fastNLP.io.pipe.rst | 8 ++++++++ docs/source/fastNLP.io.rst | 17 +++++++++++++---- docs/source/fastNLP.io.utils.rst | 7 +++++++ docs/source/fastNLP.models.rst | 3 +-- docs/source/fastNLP.modules.encoder.rst | 1 + docs/source/fastNLP.modules.rst | 13 +++++++++---- docs/source/fastNLP.modules.utils.rst | 7 +++++++ docs/source/fastNLP.rst | 7 +++---- docs/source/modules.rst | 1 - 14 files changed, 69 insertions(+), 21 deletions(-) create mode 100644 docs/source/fastNLP.io.file_utils.rst create mode 100644 docs/source/fastNLP.io.loader.rst create mode 100644 docs/source/fastNLP.io.pipe.rst create mode 100644 docs/source/fastNLP.io.utils.rst create mode 100644 docs/source/fastNLP.modules.utils.rst diff --git a/docs/source/fastNLP.core.rst b/docs/source/fastNLP.core.rst index cacc6622..08d161b7 100644 --- a/docs/source/fastNLP.core.rst +++ b/docs/source/fastNLP.core.rst @@ -6,11 +6,10 @@ fastNLP.core :undoc-members: :show-inheritance: -子模块 +Submodules ---------- .. toctree:: - :maxdepth: 1 fastNLP.core.batch fastNLP.core.callback diff --git a/docs/source/fastNLP.embeddings.rst b/docs/source/fastNLP.embeddings.rst index 6b168906..6872e91d 100644 --- a/docs/source/fastNLP.embeddings.rst +++ b/docs/source/fastNLP.embeddings.rst @@ -6,11 +6,10 @@ fastNLP.embeddings :undoc-members: :show-inheritance: -子模块 +Submodules ---------- .. toctree:: - :maxdepth: 1 fastNLP.embeddings.bert_embedding fastNLP.embeddings.char_embedding diff --git a/docs/source/fastNLP.io.data_loader.rst b/docs/source/fastNLP.io.data_loader.rst index 8f990102..0b4f5d0b 100644 --- a/docs/source/fastNLP.io.data_loader.rst +++ b/docs/source/fastNLP.io.data_loader.rst @@ -1,7 +1,8 @@ fastNLP.io.data\_loader -========================== +======================= .. automodule:: fastNLP.io.data_loader :members: :undoc-members: - :show-inheritance: \ No newline at end of file + :show-inheritance: + diff --git a/docs/source/fastNLP.io.file_utils.rst b/docs/source/fastNLP.io.file_utils.rst new file mode 100644 index 00000000..944550d7 --- /dev/null +++ b/docs/source/fastNLP.io.file_utils.rst @@ -0,0 +1,7 @@ +fastNLP.io.file\_utils +====================== + +.. automodule:: fastNLP.io.file_utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/fastNLP.io.loader.rst b/docs/source/fastNLP.io.loader.rst new file mode 100644 index 00000000..bbdc1d7a --- /dev/null +++ b/docs/source/fastNLP.io.loader.rst @@ -0,0 +1,8 @@ +fastNLP.io.loader +================= + +.. 
automodule:: fastNLP.io.loader + :members: + :undoc-members: + :show-inheritance: + diff --git a/docs/source/fastNLP.io.pipe.rst b/docs/source/fastNLP.io.pipe.rst new file mode 100644 index 00000000..bf126585 --- /dev/null +++ b/docs/source/fastNLP.io.pipe.rst @@ -0,0 +1,8 @@ +fastNLP.io.pipe +=============== + +.. automodule:: fastNLP.io.pipe + :members: + :undoc-members: + :show-inheritance: + diff --git a/docs/source/fastNLP.io.rst b/docs/source/fastNLP.io.rst index a97ed67d..0a006709 100644 --- a/docs/source/fastNLP.io.rst +++ b/docs/source/fastNLP.io.rst @@ -6,14 +6,23 @@ fastNLP.io :undoc-members: :show-inheritance: -子模块 +Subpackages +----------- + +.. toctree:: + + fastNLP.io.data_loader + fastNLP.io.loader + fastNLP.io.pipe + +Submodules ---------- .. toctree:: - :maxdepth: 1 fastNLP.io.base_loader - fastNLP.io.embed_loader fastNLP.io.dataset_loader - fastNLP.io.data_loader + fastNLP.io.embed_loader + fastNLP.io.file_utils fastNLP.io.model_io + fastNLP.io.utils diff --git a/docs/source/fastNLP.io.utils.rst b/docs/source/fastNLP.io.utils.rst new file mode 100644 index 00000000..0b3f3938 --- /dev/null +++ b/docs/source/fastNLP.io.utils.rst @@ -0,0 +1,7 @@ +fastNLP.io.utils +================ + +.. automodule:: fastNLP.io.utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/fastNLP.models.rst b/docs/source/fastNLP.models.rst index 2ea546e2..36875b85 100644 --- a/docs/source/fastNLP.models.rst +++ b/docs/source/fastNLP.models.rst @@ -6,11 +6,10 @@ fastNLP.models :undoc-members: :show-inheritance: -子模块 +Submodules ---------- .. toctree:: - :maxdepth: 1 fastNLP.models.biaffine_parser fastNLP.models.cnn_text_classification diff --git a/docs/source/fastNLP.modules.encoder.rst b/docs/source/fastNLP.modules.encoder.rst index 0562f12d..e60f9fa4 100644 --- a/docs/source/fastNLP.modules.encoder.rst +++ b/docs/source/fastNLP.modules.encoder.rst @@ -5,3 +5,4 @@ fastNLP.modules.encoder :members: :undoc-members: :show-inheritance: + diff --git a/docs/source/fastNLP.modules.rst b/docs/source/fastNLP.modules.rst index 646ef2d3..06494b53 100644 --- a/docs/source/fastNLP.modules.rst +++ b/docs/source/fastNLP.modules.rst @@ -6,12 +6,17 @@ fastNLP.modules :undoc-members: :show-inheritance: -子模块 +Subpackages ----------- .. toctree:: - :titlesonly: - :maxdepth: 1 fastNLP.modules.decoder - fastNLP.modules.encoder \ No newline at end of file + fastNLP.modules.encoder + +Submodules +---------- + +.. toctree:: + + fastNLP.modules.utils diff --git a/docs/source/fastNLP.modules.utils.rst b/docs/source/fastNLP.modules.utils.rst new file mode 100644 index 00000000..c0219435 --- /dev/null +++ b/docs/source/fastNLP.modules.utils.rst @@ -0,0 +1,7 @@ +fastNLP.modules.utils +===================== + +.. automodule:: fastNLP.modules.utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/fastNLP.rst b/docs/source/fastNLP.rst index 0057a184..e3ba429d 100644 --- a/docs/source/fastNLP.rst +++ b/docs/source/fastNLP.rst @@ -1,16 +1,15 @@ -API 文档 -=============== +fastNLP +======= .. automodule:: fastNLP :members: :undoc-members: :show-inheritance: -内部模块 +Subpackages ----------- .. toctree:: - :maxdepth: 1 fastNLP.core fastNLP.embeddings diff --git a/docs/source/modules.rst b/docs/source/modules.rst index 9ca3c7f3..e9a92cb7 100644 --- a/docs/source/modules.rst +++ b/docs/source/modules.rst @@ -2,7 +2,6 @@ fastNLP ======= .. 
toctree:: - :titlesonly: :maxdepth: 4 fastNLP From ad9d5eba3a437fe5d8a8d517cc583434bd28fbd0 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Wed, 14 Aug 2019 20:45:37 +0800 Subject: [PATCH 032/153] fix some mistakes --- fastNLP/embeddings/bert_embedding.py | 4 ++-- fastNLP/io/base_loader.py | 4 ++-- fastNLP/io/file_utils.py | 6 +++++- fastNLP/io/loader/__init__.py | 2 +- fastNLP/io/pipe/classification.py | 4 ++-- fastNLP/io/utils.py | 14 ++++++++------ fastNLP/modules/utils.py | 2 +- 7 files changed, 21 insertions(+), 15 deletions(-) diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index 261007ae..b079f69f 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -176,9 +176,9 @@ class BertWordPieceEncoder(nn.Module): def index_datasets(self, *datasets, field_name, add_cls_sep=True): """ 使用bert的tokenizer新生成word_pieces列加入到datasets中,并将他们设置为input,且将word_pieces这一列的pad value设置为了 - bert的pad value。 + bert的pad value。 - :param DataSet datasets: DataSet对象 + :param ~fastNLP.DataSet datasets: DataSet对象 :param str field_name: 基于哪一列的内容生成word_pieces列。这一列中每个数据应该是List[str]的形式。 :param bool add_cls_sep: 如果首尾不是[CLS]与[SEP]会在首尾额外加入[CLS]与[SEP]。 :return: diff --git a/fastNLP/io/base_loader.py b/fastNLP/io/base_loader.py index 01232627..429a8406 100644 --- a/fastNLP/io/base_loader.py +++ b/fastNLP/io/base_loader.py @@ -128,7 +128,7 @@ class DataBundle: """ 向DataBunlde中增加vocab - :param Vocabulary vocab: 词表 + :param ~fastNLP.Vocabulary vocab: 词表 :param str field_name: 这个vocab对应的field名称 :return: """ @@ -138,7 +138,7 @@ class DataBundle: def set_dataset(self, dataset, name): """ - :param DataSet dataset: 传递给DataBundle的DataSet + :param ~fastNLP.DataSet dataset: 传递给DataBundle的DataSet :param str name: dataset的名称 :return: """ diff --git a/fastNLP/io/file_utils.py b/fastNLP/io/file_utils.py index b465ed9b..43fe2ab1 100644 --- a/fastNLP/io/file_utils.py +++ b/fastNLP/io/file_utils.py @@ -84,6 +84,7 @@ def cached_path(url_or_filename:str, cache_dir:str=None, name=None) -> Path: 给定一个url,尝试通过url中的解析出来的文件名字filename到{cache_dir}/{name}/{filename}下寻找这个文件, (1)如果cache_dir=None, 则cache_dir=~/.fastNLP/; 否则cache_dir=cache_dir (2)如果name=None, 则没有中间的{name}这一层结构;否者中间结构就为{name} + 如果有该文件,就直接返回路径 如果没有该文件,则尝试用传入的url下载 @@ -126,8 +127,10 @@ def get_filepath(filepath): 如果filepath为文件夹, 如果内含多个文件, 返回filepath 如果只有一个文件, 返回filepath + filename + 如果filepath为文件 返回filepath + :param str filepath: 路径 :return: """ @@ -237,7 +240,8 @@ def split_filename_suffix(filepath): def get_from_cache(url: str, cache_dir: Path = None) -> Path: """ 尝试在cache_dir中寻找url定义的资源; 如果没有找到; 则从url下载并将结果放在cache_dir下,缓存的名称由url的结果推断而来。会将下载的 - 文件解压,将解压后的文件全部放在cache_dir文件夹中。 + 文件解压,将解压后的文件全部放在cache_dir文件夹中。 + 如果从url中下载的资源解压后有多个文件,则返回目录的路径; 如果只有一个资源文件,则返回具体的路径。 """ cache_dir.mkdir(parents=True, exist_ok=True) diff --git a/fastNLP/io/loader/__init__.py b/fastNLP/io/loader/__init__.py index 4905a34f..8c0d391c 100644 --- a/fastNLP/io/loader/__init__.py +++ b/fastNLP/io/loader/__init__.py @@ -1,5 +1,5 @@ """ -Loader用于读取数据,并将内容读取到 :class:`~fastNLP.DataSet` 或者 :class:`~fastNLP.io.DataBundle`中。所有的Loader都支持以下的 +Loader用于读取数据,并将内容读取到 :class:`~fastNLP.DataSet` 或者 :class:`~fastNLP.io.DataBundle` 中。所有的Loader都支持以下的 三个方法: __init__(),_load(), loads(). 
其中__init__()用于申明读取参数,以及说明该Loader支持的数据格式,读取后Dataset中field ; _load(path)方法传入一个文件路径读取单个文件,并返回DataSet; load(paths)用于读取文件夹下的文件,并返回DataBundle, load()方法 支持以下三种类型的参数:: diff --git a/fastNLP/io/pipe/classification.py b/fastNLP/io/pipe/classification.py index 1b111e40..429b6552 100644 --- a/fastNLP/io/pipe/classification.py +++ b/fastNLP/io/pipe/classification.py @@ -257,7 +257,7 @@ class SSTPipe(_CLSPipe): "(4 (4 (2 Offers) (3 (3 (2 that) (3 (3 rare)..." "..." - :param DataBundle data_bundle: 需要处理的DataBundle对象 + :param ~fastNLP.io.DataBundle data_bundle: 需要处理的DataBundle对象 :return: """ # 先取出subtree @@ -407,7 +407,7 @@ class IMDBPipe(_CLSPipe): :param DataBunlde data_bundle: 传入的DataBundle中的DataSet必须包含raw_words和target两个field,且raw_words列应该为str, target列应该为str。 - :return:DataBundle + :return: DataBundle """ # 替换
def replace_br(raw_words): diff --git a/fastNLP/io/utils.py b/fastNLP/io/utils.py index a4ca2954..76b32b0a 100644 --- a/fastNLP/io/utils.py +++ b/fastNLP/io/utils.py @@ -6,12 +6,14 @@ from pathlib import Path def check_loader_paths(paths:Union[str, Dict[str, str]])->Dict[str, str]: """ - 检查传入dataloader的文件的合法性。如果为合法路径,将返回至少包含'train'这个key的dict。类似于下面的结果 - { - 'train': '/some/path/to/', # 一定包含,建词表应该在这上面建立,剩下的其它文件应该只需要处理并index。 - 'test': 'xxx' # 可能有,也可能没有 - ... - } + 检查传入dataloader的文件的合法性。如果为合法路径,将返回至少包含'train'这个key的dict。类似于下面的结果:: + + { + 'train': '/some/path/to/', # 一定包含,建词表应该在这上面建立,剩下的其它文件应该只需要处理并index。 + 'test': 'xxx' # 可能有,也可能没有 + ... + } + 如果paths为不合法的,将直接进行raise相应的错误. 如果paths内不包含train也会报错。 :param str paths: 路径. 可以为一个文件路径(则认为该文件就是train的文件); 可以为一个文件目录,将在该目录下寻找train(文件名 diff --git a/fastNLP/modules/utils.py b/fastNLP/modules/utils.py index 21608c5d..ead75711 100644 --- a/fastNLP/modules/utils.py +++ b/fastNLP/modules/utils.py @@ -112,7 +112,7 @@ def get_dropout_mask(drop_p: float, tensor: torch.Tensor): 根据tensor的形状,生成一个mask :param drop_p: float, 以多大的概率置为0。 - :param tensor:torch.Tensor + :param tensor: torch.Tensor :return: torch.FloatTensor. 与tensor一样的shape """ mask_x = torch.ones_like(tensor) From 7a21c2a5879685625f7395fdc9ac8f4af0edd564 Mon Sep 17 00:00:00 2001 From: yh Date: Wed, 14 Aug 2019 23:14:59 +0800 Subject: [PATCH 033/153] =?UTF-8?q?1.=20=E5=A2=9E=E5=BC=BABertEmbedding?= =?UTF-8?q?=E4=BD=BF=E5=85=B6=E5=8F=AF=E4=BB=A5=E8=87=AA=E5=8A=A8=E5=88=A4?= =?UTF-8?q?=E6=96=ADtoken=5Ftype=5Fids;=202.=E5=A2=9E=E5=8A=A0CrossEntropy?= =?UTF-8?q?Loss=E4=B8=AD=E5=AF=B9label=20dimension=E7=9A=84=E6=8A=A5?= =?UTF-8?q?=E9=94=99=E5=A4=84=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/losses.py | 4 +- fastNLP/core/trainer.py | 2 +- fastNLP/embeddings/bert_embedding.py | 17 +++- .../joint_cws_parse/models/CharParser.py | 8 +- reproduction/joint_cws_parse/train.py | 85 +++++++++++-------- 5 files changed, 72 insertions(+), 44 deletions(-) diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index 21c024f0..05e5b440 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -28,6 +28,7 @@ from .utils import _check_arg_dict_list from .utils import _check_function_or_method from .utils import _get_func_signature from .utils import seq_len_to_mask +import warnings class LossBase(object): @@ -226,7 +227,8 @@ class CrossEntropyLoss(LossBase): def get_loss(self, pred, target, seq_len=None): if pred.dim() > 2: if pred.size(1) != target.size(1): # 有可能顺序替换了 - pred = pred.transpose(1, 2) + raise RuntimeError("It seems like that your prediction's shape is (batch_size, num_labels, max_len)." 
+ " It should be (batch_size, max_len, num_labels).") pred = pred.reshape(-1, pred.size(-1)) target = target.reshape(-1) if seq_len is not None: diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index a85b7fee..6d18fd48 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -942,7 +942,7 @@ def _check_code(dataset, model, losser, metrics, forward_func, batch_size=DEFAUL if dev_data is not None: tester = Tester(data=dev_data[:batch_size * DEFAULT_CHECK_NUM_BATCH], model=model, metrics=metrics, - batch_size=batch_size, verbose=-1) + batch_size=batch_size, verbose=-1, use_tqdm=False) evaluate_results = tester.test() _check_eval_results(metrics=evaluate_results, metric_key=metric_key, metric_list=metrics) diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index 261007ae..ea5e84ac 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -11,7 +11,7 @@ from ..core.vocabulary import Vocabulary from ..io.file_utils import _get_embedding_url, cached_path, PRETRAINED_BERT_MODEL_DIR from ..modules.encoder.bert import _WordPieceBertModel, BertModel, BertTokenizer from .contextual_embedding import ContextualEmbedding - +import warnings class BertEmbedding(ContextualEmbedding): """ @@ -229,6 +229,10 @@ class _WordBertModel(nn.Module): # 第一步统计出需要的word_piece, 然后创建新的embed和word_piece_vocab, 然后填入值 word_piece_dict = {'[CLS]':1, '[SEP]':1} # 用到的word_piece以及新增的 found_count = 0 + self._has_sep_in_vocab = '[SEP]' in vocab # 用来判断传入的数据是否需要生成token_ids + if "[CLS]" in vocab: + warnings.warn("[CLS] detected in your vocabulary. BertEmbedding will add [CSL] and [SEP] to the begin " + "and end of the sentence automatically.") for word, index in vocab: if index == vocab.padding_idx: # pad是个特殊的符号 word = '[PAD]' @@ -316,9 +320,18 @@ class _WordBertModel(nn.Module): word_pieces[:, 0].fill_(self._cls_index) batch_indexes = torch.arange(batch_size).to(words) word_pieces[batch_indexes, word_pieces_lengths+1] = self._sep_index + if self._has_sep_in_vocab: #但[SEP]在vocab中出现应该才会需要token_ids + with torch.no_grad(): + sep_mask = word_pieces.eq(self._sep_index) # batch_size x max_len + sep_mask_cumsum = sep_mask.flip(dim=-1).cumsum(dim=-1).flip(dim=-1) + token_type_ids = sep_mask_cumsum.fmod(2) + if token_type_ids[0, 0].item(): # 如果开头是奇数,则需要flip一下结果,因为需要保证开头为0 + token_type_ids = token_type_ids.eq(0).float() + else: + token_type_ids = torch.zeros_like(word_pieces) # 2. 获取hidden的结果,根据word_pieces进行对应的pool计算 # all_outputs: [batch_size x max_len x hidden_size, batch_size x max_len x hidden_size, ...] 
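# --- A minimal, self-contained sketch (not part of the patch series) of the [SEP]-parity trick
# --- the hunk above uses to derive token_type_ids. The ids 101/102 for [CLS]/[SEP] are toy
# --- assumptions, and the sketch uses torch.flip's `dims=[-1]` spelling, which the follow-up
# --- patch later in this series also settles on.
import torch

word_pieces = torch.tensor([[101, 7592, 102, 2088, 102]])   # [CLS] hello [SEP] world [SEP]
sep_mask = word_pieces.eq(102).long()                       # 1 at every [SEP] position (cast for portability)
sep_mask_cumsum = sep_mask.flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1])
token_type_ids = sep_mask_cumsum.fmod(2)                    # parity of the number of trailing [SEP]s
if token_type_ids[0, 0].item():                             # keep the first segment labelled 0
    token_type_ids = token_type_ids.eq(0).long()
print(token_type_ids)                                       # tensor([[0, 0, 0, 1, 1]])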
- bert_outputs, pooled_cls = self.encoder(word_pieces, token_type_ids=None, attention_mask=attn_masks, + bert_outputs, pooled_cls = self.encoder(word_pieces, token_type_ids=token_type_ids, attention_mask=attn_masks, output_all_encoded_layers=True) # output_layers = [self.layers] # len(self.layers) x batch_size x real_word_piece_length x hidden_size diff --git a/reproduction/joint_cws_parse/models/CharParser.py b/reproduction/joint_cws_parse/models/CharParser.py index c07c070e..7d89cacb 100644 --- a/reproduction/joint_cws_parse/models/CharParser.py +++ b/reproduction/joint_cws_parse/models/CharParser.py @@ -224,11 +224,11 @@ class CharBiaffineParser(BiaffineParser): batch_size, seq_len, _ = arc_pred.shape flip_mask = (mask == 0) - _arc_pred = arc_pred.clone() - _arc_pred.masked_fill_(flip_mask.unsqueeze(1), -float('inf')) + # _arc_pred = arc_pred.clone() + _arc_pred = arc_pred.masked_fill(flip_mask.unsqueeze(1), -float('inf')) - arc_true[:, 0].fill_(-1) - label_true[:, 0].fill_(-1) + arc_true.data[:, 0].fill_(-1) + label_true.data[:, 0].fill_(-1) arc_nll = F.cross_entropy(_arc_pred.view(-1, seq_len), arc_true.view(-1), ignore_index=-1) label_nll = F.cross_entropy(label_pred.view(-1, label_pred.size(-1)), label_true.view(-1), ignore_index=-1) diff --git a/reproduction/joint_cws_parse/train.py b/reproduction/joint_cws_parse/train.py index 0c34614b..ed4b07f0 100644 --- a/reproduction/joint_cws_parse/train.py +++ b/reproduction/joint_cws_parse/train.py @@ -14,6 +14,7 @@ from torch.optim.lr_scheduler import StepLR from fastNLP import Tester from fastNLP import GradientClipCallback, LRScheduler import os +from fastNLP import cache_results def set_random_seed(random_seed=666): import random, numpy, torch @@ -39,43 +40,42 @@ label_mlp_size = 100 batch_size = 32 update_every = 4 n_epochs = 100 -data_folder = '' # 填写在数据所在文件夹, 文件夹下应该有train, dev, test等三个文件 -vector_folder = '' # 预训练的vector,下面应该包含三个文件: 1grams_t3_m50_corpus.txt, 2grams_t3_m50_corpus.txt, 3grams_t3_m50_corpus.txt +data_name = 'new_ctb7' #################################################### +data_folder = f'/remote-home/hyan01/exps/JointCwsPosParser/data/{data_name}/output' # 填写在数据所在文件夹, 文件夹下应该有train, dev, test等三个文件 +vector_folder = '/remote-home/hyan01/exps/CWS/pretrain/vectors' # 预训练的vector,下面应该包含三个文件: 1grams_t3_m50_corpus.txt, 2grams_t3_m50_corpus.txt, 3grams_t3_m50_corpus.txt set_random_seed(1234) device = 0 -# @cache_results('caches/{}.pkl'.format(data_name)) -# def get_data(): -data = CTBxJointLoader().process(data_folder) - -char_labels_vocab = data.vocabs['char_labels'] - -pre_chars_vocab = data.vocabs['pre_chars'] -pre_bigrams_vocab = data.vocabs['pre_bigrams'] -pre_trigrams_vocab = data.vocabs['pre_trigrams'] - -chars_vocab = data.vocabs['chars'] -bigrams_vocab = data.vocabs['bigrams'] -trigrams_vocab = data.vocabs['trigrams'] - -pre_chars_embed = StaticEmbedding(pre_chars_vocab, - model_dir_or_name=os.path.join(vector_folder, '1grams_t3_m50_corpus.txt'), - init_method=uniform_init, normalize=False) -pre_chars_embed.embedding.weight.data = pre_chars_embed.embedding.weight.data/pre_chars_embed.embedding.weight.data.std() -pre_bigrams_embed = StaticEmbedding(pre_bigrams_vocab, - model_dir_or_name=os.path.join(vector_folder, '2grams_t3_m50_corpus.txt'), - init_method=uniform_init, normalize=False) -pre_bigrams_embed.embedding.weight.data = pre_bigrams_embed.embedding.weight.data/pre_bigrams_embed.embedding.weight.data.std() -pre_trigrams_embed = StaticEmbedding(pre_trigrams_vocab, - model_dir_or_name=os.path.join(vector_folder, 
'3grams_t3_m50_corpus.txt'), - init_method=uniform_init, normalize=False) -pre_trigrams_embed.embedding.weight.data = pre_trigrams_embed.embedding.weight.data/pre_trigrams_embed.embedding.weight.data.std() - - # return chars_vocab, bigrams_vocab, trigrams_vocab, char_labels_vocab, pre_chars_embed, pre_bigrams_embed, pre_trigrams_embed, data - -# chars_vocab, bigrams_vocab, trigrams_vocab, char_labels_vocab, pre_chars_embed, pre_bigrams_embed, pre_trigrams_embed, data = get_data() +@cache_results('caches/{}.pkl'.format(data_name)) +def get_data(): + data = CTBxJointLoader().process(data_folder) + char_labels_vocab = data.vocabs['char_labels'] + + pre_chars_vocab = data.vocabs['pre_chars'] + pre_bigrams_vocab = data.vocabs['pre_bigrams'] + pre_trigrams_vocab = data.vocabs['pre_trigrams'] + + chars_vocab = data.vocabs['chars'] + bigrams_vocab = data.vocabs['bigrams'] + trigrams_vocab = data.vocabs['trigrams'] + pre_chars_embed = StaticEmbedding(pre_chars_vocab, + model_dir_or_name=os.path.join(vector_folder, '1grams_t3_m50_corpus.txt'), + init_method=uniform_init, normalize=False) + pre_chars_embed.embedding.weight.data = pre_chars_embed.embedding.weight.data / pre_chars_embed.embedding.weight.data.std() + pre_bigrams_embed = StaticEmbedding(pre_bigrams_vocab, + model_dir_or_name=os.path.join(vector_folder, '2grams_t3_m50_corpus.txt'), + init_method=uniform_init, normalize=False) + pre_bigrams_embed.embedding.weight.data = pre_bigrams_embed.embedding.weight.data / pre_bigrams_embed.embedding.weight.data.std() + pre_trigrams_embed = StaticEmbedding(pre_trigrams_vocab, + model_dir_or_name=os.path.join(vector_folder, '3grams_t3_m50_corpus.txt'), + init_method=uniform_init, normalize=False) + pre_trigrams_embed.embedding.weight.data = pre_trigrams_embed.embedding.weight.data / pre_trigrams_embed.embedding.weight.data.std() + + return chars_vocab, bigrams_vocab, trigrams_vocab, char_labels_vocab, pre_chars_embed, pre_bigrams_embed, pre_trigrams_embed, data + +chars_vocab, bigrams_vocab, trigrams_vocab, char_labels_vocab, pre_chars_embed, pre_bigrams_embed, pre_trigrams_embed, data = get_data() print(data) model = CharParser(char_vocab_size=len(chars_vocab), @@ -104,11 +104,24 @@ optimizer = optim.Adam([param for param in model.parameters() if param.requires_ sampler = BucketSampler(seq_len_field_name='seq_lens') callbacks = [] + +from fastNLP.core.callback import Callback +from torch.optim.lr_scheduler import LambdaLR +class SchedulerCallback(Callback): + def __init__(self, scheduler): + super().__init__() + self.scheduler = scheduler + + def on_backward_end(self): + if self.step % self.update_every==0: + self.scheduler.step() + +scheduler = LambdaLR(optimizer, lr_lambda=lambda step:(0.75)**(step//5000)) # scheduler = LambdaLR(optimizer, lr_lambda=lambda step:(0.75)**(step//5000)) -scheduler = StepLR(optimizer, step_size=18, gamma=0.75) -# optim_callback = OptimizerCallback(optimizer, scheduler, update_every) +# scheduler = StepLR(optimizer, step_size=18, gamma=0.75) +scheduler_callback = SchedulerCallback(scheduler) # callbacks.append(optim_callback) -scheduler_callback = LRScheduler(scheduler) +# scheduler_callback = LRScheduler(scheduler) callbacks.append(scheduler_callback) callbacks.append(GradientClipCallback(clip_type='value', clip_value=5)) @@ -119,6 +132,6 @@ callbacks.append(dev_callback) trainer = Trainer(data.datasets['train'], model, loss=None, metrics=metrics, n_epochs=n_epochs, batch_size=batch_size, print_every=3, validate_every=-1, dev_data=data.datasets['dev'], save_path=None, 
optimizer=optimizer, - check_code_level=0, metric_key='u_f1', sampler=sampler, prefetch=True, use_tqdm=True, + check_code_level=0, metric_key='u_f1', sampler=sampler, num_workers=2, use_tqdm=True, device=device, callbacks=callbacks, update_every=update_every) trainer.train() \ No newline at end of file From 3ae383efc321f46c27652ed3ed2c9ede31a7a2a8 Mon Sep 17 00:00:00 2001 From: yh Date: Thu, 15 Aug 2019 01:20:35 +0800 Subject: [PATCH 034/153] =?UTF-8?q?=E4=BF=AE=E5=A4=8DBertEmbedding?= =?UTF-8?q?=E4=B8=AD=E9=95=BF=E5=BA=A6=E4=BC=9A=E9=A2=9D=E5=A4=96=E5=8A=A0?= =?UTF-8?q?=E9=95=BF=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/embeddings/bert_embedding.py | 76 +++++++++---------- test/modules/__init__.py | 0 test/modules/decoder/__init__.py | 0 .../decoder}/test_bert.py | 0 4 files changed, 38 insertions(+), 38 deletions(-) create mode 100644 test/modules/__init__.py create mode 100644 test/modules/decoder/__init__.py rename test/{embeddings => modules/decoder}/test_bert.py (100%) diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index 723cd2d5..db50f9f4 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -290,45 +290,45 @@ class _WordBertModel(nn.Module): :param words: torch.LongTensor, batch_size x max_len :return: num_layers x batch_size x max_len x hidden_size或者num_layers x batch_size x (max_len+2) x hidden_size """ - batch_size, max_word_len = words.size() - word_mask = words.ne(self._word_pad_index) # 为1的地方有word - seq_len = word_mask.sum(dim=-1) - batch_word_pieces_length = self.word_pieces_lengths[words] # batch_size x max_len - word_pieces_lengths = batch_word_pieces_length.masked_fill(word_mask.eq(0), 0).sum(dim=-1) # batch_size - word_piece_length = batch_word_pieces_length.sum(dim=-1).max().item() # 表示word piece的长度(包括padding) - if word_piece_length+2>self._max_position_embeddings: - if self.auto_truncate: - word_pieces_lengths = word_pieces_lengths.masked_fill(word_pieces_lengths+2>self._max_position_embeddings, - self._max_position_embeddings-2) - else: - raise RuntimeError("After split words into word pieces, the lengths of word pieces are longer than the " - f"maximum allowed sequence length:{self._max_position_embeddings} of bert.") - - # +2是由于需要加入[CLS]与[SEP] - word_pieces = words.new_full((batch_size, min(word_piece_length+2, self._max_position_embeddings)), - fill_value=self._wordpiece_pad_index) - attn_masks = torch.zeros_like(word_pieces) - # 1. 
获取words的word_pieces的id,以及对应的span范围 - word_indexes = words.tolist() - for i in range(batch_size): - word_pieces_i = list(chain(*self.word_to_wordpieces[word_indexes[i]])) - if self.auto_truncate and len(word_pieces_i)>self._max_position_embeddings-2: - word_pieces_i = word_pieces_i[:self._max_position_embeddings-2] - word_pieces[i, 1:len(word_pieces_i)+1] = torch.LongTensor(word_pieces_i) - attn_masks[i, :word_pieces_lengths[i]+2].fill_(1) - # 添加[cls]和[sep] - word_pieces[:, 0].fill_(self._cls_index) - batch_indexes = torch.arange(batch_size).to(words) - word_pieces[batch_indexes, word_pieces_lengths+1] = self._sep_index - if self._has_sep_in_vocab: #但[SEP]在vocab中出现应该才会需要token_ids - with torch.no_grad(): + with torch.no_grad(): + batch_size, max_word_len = words.size() + word_mask = words.ne(self._word_pad_index) # 为1的地方有word + seq_len = word_mask.sum(dim=-1) + batch_word_pieces_length = self.word_pieces_lengths[words].masked_fill(word_mask.eq(0), 0) # batch_size x max_len + word_pieces_lengths = batch_word_pieces_length.sum(dim=-1) # batch_size + word_piece_length = batch_word_pieces_length.sum(dim=-1).max().item() # 表示word piece的长度(包括padding) + if word_piece_length+2>self._max_position_embeddings: + if self.auto_truncate: + word_pieces_lengths = word_pieces_lengths.masked_fill(word_pieces_lengths+2>self._max_position_embeddings, + self._max_position_embeddings-2) + else: + raise RuntimeError("After split words into word pieces, the lengths of word pieces are longer than the " + f"maximum allowed sequence length:{self._max_position_embeddings} of bert.") + + # +2是由于需要加入[CLS]与[SEP] + word_pieces = words.new_full((batch_size, min(word_piece_length+2, self._max_position_embeddings)), + fill_value=self._wordpiece_pad_index) + attn_masks = torch.zeros_like(word_pieces) + # 1. 获取words的word_pieces的id,以及对应的span范围 + word_indexes = words.cpu().numpy() + for i in range(batch_size): + word_pieces_i = list(chain(*self.word_to_wordpieces[word_indexes[i, :seq_len[i]]])) + if self.auto_truncate and len(word_pieces_i)>self._max_position_embeddings-2: + word_pieces_i = word_pieces_i[:self._max_position_embeddings-2] + word_pieces[i, 1:word_pieces_lengths[i]+1] = torch.LongTensor(word_pieces_i) + attn_masks[i, :word_pieces_lengths[i]+2].fill_(1) + # 添加[cls]和[sep] + word_pieces[:, 0].fill_(self._cls_index) + batch_indexes = torch.arange(batch_size).to(words) + word_pieces[batch_indexes, word_pieces_lengths+1] = self._sep_index + if self._has_sep_in_vocab: #但[SEP]在vocab中出现应该才会需要token_ids sep_mask = word_pieces.eq(self._sep_index) # batch_size x max_len - sep_mask_cumsum = sep_mask.flip(dim=-1).cumsum(dim=-1).flip(dim=-1) - token_type_ids = sep_mask_cumsum.fmod(2) - if token_type_ids[0, 0].item(): # 如果开头是奇数,则需要flip一下结果,因为需要保证开头为0 - token_type_ids = token_type_ids.eq(0).float() - else: - token_type_ids = torch.zeros_like(word_pieces) + sep_mask_cumsum = sep_mask.flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1]) + token_type_ids = sep_mask_cumsum.fmod(2) + if token_type_ids[0, 0].item(): # 如果开头是奇数,则需要flip一下结果,因为需要保证开头为0 + token_type_ids = token_type_ids.eq(0).float() + else: + token_type_ids = torch.zeros_like(word_pieces) # 2. 获取hidden的结果,根据word_pieces进行对应的pool计算 # all_outputs: [batch_size x max_len x hidden_size, batch_size x max_len x hidden_size, ...] 
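# --- A hedged usage sketch (not taken from the patches) of the BertEmbedding options this series
# --- touches: include_cls_sep, pooled_cls and auto_truncate. The tiny vocabulary and the
# --- 'en-base-uncased' shorthand are illustrative only; the constructor is assumed to forward
# --- these flags to _WordBertModel as the surrounding hunks show.
import torch
from fastNLP import Vocabulary
from fastNLP.embeddings import BertEmbedding

vocab = Vocabulary()
vocab.add_word_lst("this is a test .".split())
embed = BertEmbedding(vocab, model_dir_or_name='en-base-uncased', layers='-1',
                      include_cls_sep=True, pooled_cls=True, auto_truncate=True)
words = torch.LongTensor([[vocab.to_index(w) for w in "this is a test .".split()]])
reps = embed(words)   # shape [1, 5 + 2, 768]: the two extra positions hold [CLS]/[SEP]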
bert_outputs, pooled_cls = self.encoder(word_pieces, token_type_ids=token_type_ids, attention_mask=attn_masks, diff --git a/test/modules/__init__.py b/test/modules/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/test/modules/decoder/__init__.py b/test/modules/decoder/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/test/embeddings/test_bert.py b/test/modules/decoder/test_bert.py similarity index 100% rename from test/embeddings/test_bert.py rename to test/modules/decoder/test_bert.py From 02c8fc0de71680847f03481dcde4c575ccb15dd5 Mon Sep 17 00:00:00 2001 From: xuyige Date: Thu, 15 Aug 2019 01:47:16 +0800 Subject: [PATCH 035/153] rename param names in model/bert.py to adjust fastNLP.Const --- fastNLP/models/bert.py | 71 ++++++++++++++++++++++-------------------- 1 file changed, 37 insertions(+), 34 deletions(-) diff --git a/fastNLP/models/bert.py b/fastNLP/models/bert.py index adecab60..ad7750ec 100644 --- a/fastNLP/models/bert.py +++ b/fastNLP/models/bert.py @@ -2,13 +2,14 @@ bert.py is modified from huggingface/pytorch-pretrained-BERT, which is licensed under the Apache License 2.0. """ +import os import torch from torch import nn from .base_model import BaseModel from ..core.const import Const from ..modules.encoder import BertModel -from ..modules.encoder.bert import BertConfig +from ..modules.encoder.bert import BertConfig, CONFIG_FILE class BertForSequenceClassification(BaseModel): @@ -54,6 +55,7 @@ class BertForSequenceClassification(BaseModel): self.num_labels = num_labels if bert_dir is not None: self.bert = BertModel.from_pretrained(bert_dir) + config = BertConfig(os.path.join(bert_dir, CONFIG_FILE)) else: if config is None: config = BertConfig(30522) @@ -67,20 +69,20 @@ class BertForSequenceClassification(BaseModel): model = cls(num_labels=num_labels, config=config, bert_dir=pretrained_model_dir) return model - def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): - _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) + def forward(self, words, seq_len=None, target=None): + _, pooled_output = self.bert(words, attention_mask=seq_len, output_all_encoded_layers=False) pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) - if labels is not None: + if target is not None: loss_fct = nn.CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + loss = loss_fct(logits, target) return {Const.OUTPUT: logits, Const.LOSS: loss} else: return {Const.OUTPUT: logits} - def predict(self, input_ids, token_type_ids=None, attention_mask=None): - logits = self.forward(input_ids, token_type_ids, attention_mask) + def predict(self, words, seq_len=None): + logits = self.forward(words, seq_len=seq_len)[Const.OUTPUT] return {Const.OUTPUT: torch.argmax(logits, dim=-1)} @@ -140,7 +142,8 @@ class BertForMultipleChoice(BaseModel): model = cls(num_choices=num_choices, config=config, bert_dir=pretrained_model_dir) return model - def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): + def forward(self, words, seq_len1=None, seq_len2=None, target=None): + input_ids, token_type_ids, attention_mask = words, seq_len1, seq_len2 flat_input_ids = input_ids.view(-1, input_ids.size(-1)) flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) @@ -149,15 +152,15 @@ class BertForMultipleChoice(BaseModel): logits = 
self.classifier(pooled_output) reshaped_logits = logits.view(-1, self.num_choices) - if labels is not None: + if target is not None: loss_fct = nn.CrossEntropyLoss() - loss = loss_fct(reshaped_logits, labels) + loss = loss_fct(reshaped_logits, target) return {Const.OUTPUT: reshaped_logits, Const.LOSS: loss} else: return {Const.OUTPUT: reshaped_logits} - def predict(self, input_ids, token_type_ids=None, attention_mask=None): - logits = self.forward(input_ids, token_type_ids, attention_mask)[Const.OUTPUT] + def predict(self, words, seq_len1=None, seq_len2=None,): + logits = self.forward(words, seq_len1=seq_len1, seq_len2=seq_len2)[Const.OUTPUT] return {Const.OUTPUT: torch.argmax(logits, dim=-1)} @@ -219,27 +222,27 @@ class BertForTokenClassification(BaseModel): model = cls(num_labels=num_labels, config=config, bert_dir=pretrained_model_dir) return model - def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): - sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) + def forward(self, words, seq_len1=None, seq_len2=None, target=None): + sequence_output, _ = self.bert(words, seq_len1, seq_len2, output_all_encoded_layers=False) sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) - if labels is not None: + if target is not None: loss_fct = nn.CrossEntropyLoss() # Only keep active parts of the loss - if attention_mask is not None: - active_loss = attention_mask.view(-1) == 1 + if seq_len2 is not None: + active_loss = seq_len2.view(-1) == 1 active_logits = logits.view(-1, self.num_labels)[active_loss] - active_labels = labels.view(-1)[active_loss] + active_labels = target.view(-1)[active_loss] loss = loss_fct(active_logits, active_labels) else: - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + loss = loss_fct(logits.view(-1, self.num_labels), target.view(-1)) return {Const.OUTPUT: logits, Const.LOSS: loss} else: return {Const.OUTPUT: logits} - def predict(self, input_ids, token_type_ids=None, attention_mask=None): - logits = self.forward(input_ids, token_type_ids, attention_mask)[Const.OUTPUT] + def predict(self, words, seq_len1=None, seq_len2=None): + logits = self.forward(words, seq_len1, seq_len2)[Const.OUTPUT] return {Const.OUTPUT: torch.argmax(logits, dim=-1)} @@ -304,34 +307,34 @@ class BertForQuestionAnswering(BaseModel): model = cls(config=config, bert_dir=pretrained_model_dir) return model - def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, end_positions=None): - sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) + def forward(self, words, seq_len1=None, seq_len2=None, target1=None, target2=None): + sequence_output, _ = self.bert(words, seq_len1, seq_len2, output_all_encoded_layers=False) logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) - if start_positions is not None and end_positions is not None: + if target1 is not None and target2 is not None: # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) + if len(target1.size()) > 1: + target1 = target1.squeeze(-1) + if len(target2.size()) > 1: + target2 = target2.squeeze(-1) # sometimes the start/end positions are outside our 
model inputs, we ignore these terms ignored_index = start_logits.size(1) - start_positions.clamp_(0, ignored_index) - end_positions.clamp_(0, ignored_index) + target1.clamp_(0, ignored_index) + target2.clamp_(0, ignored_index) loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) + start_loss = loss_fct(start_logits, target1) + end_loss = loss_fct(end_logits, target2) total_loss = (start_loss + end_loss) / 2 return {Const.OUTPUTS(0): start_logits, Const.OUTPUTS(1): end_logits, Const.LOSS: total_loss} else: return {Const.OUTPUTS(0): start_logits, Const.OUTPUTS(1): end_logits} - def predict(self, input_ids, token_type_ids=None, attention_mask=None, **kwargs): - logits = self.forward(input_ids, token_type_ids, attention_mask) + def predict(self, words, seq_len1=None, seq_len2=None): + logits = self.forward(words, seq_len1, seq_len2) start_logits = logits[Const.OUTPUTS(0)] end_logits = logits[Const.OUTPUTS(1)] return {Const.OUTPUTS(0): torch.argmax(start_logits, dim=-1), From 09e24b3bd7ba05f2dfe31c941a35cb977ac3b6a5 Mon Sep 17 00:00:00 2001 From: xuyige Date: Thu, 15 Aug 2019 02:01:16 +0800 Subject: [PATCH 036/153] update matching pipe. --- fastNLP/io/pipe/matching.py | 69 ++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 36 deletions(-) diff --git a/fastNLP/io/pipe/matching.py b/fastNLP/io/pipe/matching.py index 93e854b1..9f7c7d68 100644 --- a/fastNLP/io/pipe/matching.py +++ b/fastNLP/io/pipe/matching.py @@ -1,4 +1,3 @@ -import math from .pipe import Pipe from .utils import get_tokenizer @@ -19,19 +18,17 @@ class MatchingBertPipe(Pipe): "...", "...", "[...]", ., . words列是将raw_words1(即premise), raw_words2(即hypothesis)使用"[SEP]"链接起来转换为index的。 - words列被设置为input,target列被设置为target. + words列被设置为input,target列被设置为target和input(设置为input以方便在forward函数中计算loss, + 如果不在forward函数中计算loss也不影响,fastNLP将根据forward函数的形参名进行传参). :param bool lower: 是否将word小写化。 :param str tokenizer: 使用什么tokenizer来将句子切分为words. 
支持spacy, raw两种。raw即使用空格拆分。 - :param int max_concat_sent_length: 如果concat后的句子长度超过了该值,则合并后的句子将被截断到这个长度,截断时同时对premise - 和hypothesis按比例截断。 """ - def __init__(self, lower=False, tokenizer:str='raw', max_concat_sent_length:int=480): + def __init__(self, lower=False, tokenizer: str='raw'): super().__init__() self.lower = bool(lower) self.tokenizer = get_tokenizer(tokenizer=tokenizer) - self.max_concat_sent_length = int(max_concat_sent_length) def _tokenize(self, data_bundle, field_names, new_field_names): """ @@ -43,11 +40,15 @@ class MatchingBertPipe(Pipe): """ for name, dataset in data_bundle.datasets.items(): for field_name, new_field_name in zip(field_names, new_field_names): - dataset.apply_field(lambda words:self.tokenizer(words), field_name=field_name, + dataset.apply_field(lambda words: self.tokenizer(words), field_name=field_name, new_field_name=new_field_name) return data_bundle def process(self, data_bundle): + for dataset in data_bundle.datasets.values(): + if dataset.has_field(Const.TARGET): + dataset.drop(lambda x: x[Const.TARGET] == '-') + for name, dataset in data_bundle.datasets.items(): dataset.copy_field(Const.RAW_WORDS(0), Const.INPUTS(0)) dataset.copy_field(Const.RAW_WORDS(1), Const.INPUTS(1)) @@ -57,47 +58,38 @@ class MatchingBertPipe(Pipe): dataset[Const.INPUTS(0)].lower() dataset[Const.INPUTS(1)].lower() - data_bundle = self._tokenize(data_bundle, [Const.INPUTS(0), Const.INPUT(1)], + data_bundle = self._tokenize(data_bundle, [Const.INPUTS(0), Const.INPUTS(1)], [Const.INPUTS(0), Const.INPUTS(1)]) # concat两个words def concat(ins): words0 = ins[Const.INPUTS(0)] words1 = ins[Const.INPUTS(1)] - len0 = len(words0) - len1 = len(words1) - if len0 + len1 > self.max_concat_sent_length: - ratio = self.max_concat_sent_length / (len0 + len1) - len0 = math.floor(ratio * len0) - len1 = math.floor(ratio * len1) - words0 = words0[:len0] - words1 = words1[:len1] - words = words0 + ['[SEP]'] + words1 return words + for name, dataset in data_bundle.datasets.items(): dataset.apply(concat, new_field_name=Const.INPUT) dataset.delete_field(Const.INPUTS(0)) dataset.delete_field(Const.INPUTS(1)) word_vocab = Vocabulary() - word_vocab.from_dataset(data_bundle.datasets['train'], field_name=Const.INPUT, + word_vocab.from_dataset(*[dataset for name, dataset in data_bundle.datasets.items() if 'train' in name], + field_name=Const.INPUT, no_create_entry_dataset=[dataset for name, dataset in data_bundle.datasets.items() if - name != 'train']) + 'train' not in name]) word_vocab.index_dataset(*data_bundle.datasets.values(), field_name=Const.INPUT) target_vocab = Vocabulary(padding=None, unknown=None) target_vocab.from_dataset(data_bundle.datasets['train'], field_name=Const.TARGET) - has_target_datasets = [] - for name, dataset in data_bundle.datasets.items(): - if dataset.has_field(Const.TARGET): - has_target_datasets.append(dataset) + has_target_datasets = [dataset for name, dataset in data_bundle.datasets.items() if + dataset.has_field(Const.TARGET)] target_vocab.index_dataset(*has_target_datasets, field_name=Const.TARGET) data_bundle.set_vocab(word_vocab, Const.INPUT) data_bundle.set_vocab(target_vocab, Const.TARGET) - input_fields = [Const.INPUT, Const.INPUT_LEN] + input_fields = [Const.INPUT, Const.INPUT_LEN, Const.TARGET] target_fields = [Const.TARGET] for name, dataset in data_bundle.datasets.items(): @@ -149,12 +141,14 @@ class MatchingPipe(Pipe): "This site includes a...", "The Government Executive...", "[11, 12, 13,...]", "[2, 7, ...]", 0, 6, 7 "...", "...", "[...]", "[...]", ., ., . 
- words1是premise,words2是hypothesis。其中words1,words2,seq_len1,seq_len2被设置为input;target被设置为target。 + words1是premise,words2是hypothesis。其中words1,words2,seq_len1,seq_len2被设置为input;target被设置为target + 和input(设置为input以方便在forward函数中计算loss,如果不在forward函数中计算loss也不影响,fastNLP将根据forward函数 + 的形参名进行传参)。 :param bool lower: 是否将所有raw_words转为小写。 :param str tokenizer: 将原始数据tokenize的方式。支持spacy, raw. spacy是使用spacy切分,raw就是用空格切分。 """ - def __init__(self, lower=False, tokenizer:str='raw'): + def __init__(self, lower=False, tokenizer: str='raw'): super().__init__() self.lower = bool(lower) @@ -170,7 +164,7 @@ class MatchingPipe(Pipe): """ for name, dataset in data_bundle.datasets.items(): for field_name, new_field_name in zip(field_names, new_field_names): - dataset.apply_field(lambda words:self.tokenizer(words), field_name=field_name, + dataset.apply_field(lambda words: self.tokenizer(words), field_name=field_name, new_field_name=new_field_name) return data_bundle @@ -191,34 +185,37 @@ class MatchingPipe(Pipe): data_bundle = self._tokenize(data_bundle, [Const.RAW_WORDS(0), Const.RAW_WORDS(1)], [Const.INPUTS(0), Const.INPUTS(1)]) + for dataset in data_bundle.datasets.values(): + if dataset.has_field(Const.TARGET): + dataset.drop(lambda x: x[Const.TARGET] == '-') + if self.lower: for name, dataset in data_bundle.datasets.items(): dataset[Const.INPUTS(0)].lower() dataset[Const.INPUTS(1)].lower() word_vocab = Vocabulary() - word_vocab.from_dataset(data_bundle.datasets['train'], field_name=[Const.INPUTS(0), Const.INPUTS(1)], + word_vocab.from_dataset(*[dataset for name, dataset in data_bundle.datasets.items() if 'train' in name], + field_name=[Const.INPUTS(0), Const.INPUTS(1)], no_create_entry_dataset=[dataset for name, dataset in data_bundle.datasets.items() if - name != 'train']) + 'train' not in name]) word_vocab.index_dataset(*data_bundle.datasets.values(), field_name=[Const.INPUTS(0), Const.INPUTS(1)]) target_vocab = Vocabulary(padding=None, unknown=None) target_vocab.from_dataset(data_bundle.datasets['train'], field_name=Const.TARGET) - has_target_datasets = [] - for name, dataset in data_bundle.datasets.items(): - if dataset.has_field(Const.TARGET): - has_target_datasets.append(dataset) + has_target_datasets = [dataset for name, dataset in data_bundle.datasets.items() if + dataset.has_field(Const.TARGET)] target_vocab.index_dataset(*has_target_datasets, field_name=Const.TARGET) data_bundle.set_vocab(word_vocab, Const.INPUTS(0)) data_bundle.set_vocab(target_vocab, Const.TARGET) - input_fields = [Const.INPUTS(0), Const.INPUTS(1), Const.INPUT_LEN(0), Const.INPUT_LEN(1)] + input_fields = [Const.INPUTS(0), Const.INPUTS(1), Const.INPUT_LENS(0), Const.INPUT_LENS(1), Const.TARGET] target_fields = [Const.TARGET] for name, dataset in data_bundle.datasets.items(): - dataset.add_seq_len(Const.INPUTS(0), Const.INPUT_LEN(0)) - dataset.add_seq_len(Const.INPUTS(1), Const.INPUT_LEN(1)) + dataset.add_seq_len(Const.INPUTS(0), Const.INPUT_LENS(0)) + dataset.add_seq_len(Const.INPUTS(1), Const.INPUT_LENS(1)) dataset.set_input(*input_fields, flag=True) dataset.set_target(*target_fields, flag=True) From 392badabf93501c94e1df3ce61defaea2fc31c2a Mon Sep 17 00:00:00 2001 From: ChenXin Date: Thu, 15 Aug 2019 14:00:42 +0800 Subject: [PATCH 037/153] tidy up the BERT download list --- fastNLP/io/file_utils.py | 117 +++++++++++++++++++-------------------- 1 file changed, 56 insertions(+), 61 deletions(-) diff --git a/fastNLP/io/file_utils.py b/fastNLP/io/file_utils.py index 43fe2ab1..43f8be62 100644 --- a/fastNLP/io/file_utils.py +++ 
b/fastNLP/io/file_utils.py @@ -1,4 +1,3 @@ - import os from pathlib import Path from urllib.parse import urlparse @@ -9,35 +8,29 @@ from tqdm import tqdm import shutil from requests import HTTPError - PRETRAINED_BERT_MODEL_DIR = { 'en': 'bert-large-cased-wwm.zip', - 'en-base-uncased': 'bert-base-uncased-3413b23c.zip', - 'en-base-cased': 'bert-base-cased-f89bfe08.zip', - 'en-large-uncased': 'bert-large-uncased-20939f45.zip', - 'en-large-cased': 'bert-large-cased-e0cf90fc.zip', - - 'en-large-cased-wwm': 'bert-large-cased-wwm-a457f118.zip', - 'en-large-uncased-wwm': 'bert-large-uncased-wwm-92a50aeb.zip', - 'en-base-cased-mrpc': 'bert-base-cased-finetuned-mrpc-c7099855.zip', - - 'cn': 'bert-base-chinese-29d0a84a.zip', - 'cn-base': 'bert-base-chinese-29d0a84a.zip', - 'bert-base-chinese': 'bert-base-chinese.zip', - 'bert-base-cased': 'bert-base-cased.zip', - 'bert-base-cased-finetuned-mrpc': 'bert-base-cased-finetuned-mrpc.zip', - 'bert-large-cased-wwm': 'bert-large-cased-wwm.zip', - 'bert-large-uncased': 'bert-large-uncased.zip', - 'bert-large-cased': 'bert-large-cased.zip', - 'bert-base-uncased': 'bert-base-uncased.zip', - 'bert-large-uncased-wwm': 'bert-large-uncased-wwm.zip', - 'bert-chinese-wwm': 'bert-chinese-wwm.zip', - 'bert-base-multilingual-cased': 'bert-base-multilingual-cased.zip', - 'bert-base-multilingual-uncased': 'bert-base-multilingual-uncased.zip', + 'en-large-cased-wwm': 'bert-large-cased-wwm.zip', + 'en-large-uncased-wwm': 'bert-large-uncased-wwm.zip', + + 'en-large-uncased': 'bert-large-uncased.zip', + 'en-large-cased': 'bert-large-cased.zip', + + 'en-base-uncased': 'bert-base-uncased.zip', + 'en-base-cased': 'bert-base-cased.zip', + + 'en-base-cased-mrpc': 'bert-base-cased-finetuned-mrpc.zip', + + 'en-base-multi-cased': 'bert-base-multilingual-cased.zip', + 'en-base-multi-uncased': 'bert-base-multilingual-uncased.zip', + + 'cn': 'bert-chinese-wwm.zip', + 'cn-base': 'bert-base-chinese.zip', + 'cn-wwm': 'bert-chinese-wwm.zip', } PRETRAINED_ELMO_MODEL_DIR = { - 'en': 'elmo_en-d39843fe.tar.gz', + 'en': 'elmo_en_Medium.tar.gz', 'en-small': "elmo_en_Small.zip", 'en-original-5.5b': 'elmo_en_Original_5.5B.zip', 'en-original': 'elmo_en_Original.zip', @@ -45,30 +38,33 @@ PRETRAINED_ELMO_MODEL_DIR = { } PRETRAIN_STATIC_FILES = { - 'en': 'glove.840B.300d-cc1ad5e1.tar.gz', - 'en-glove-840b-300': 'glove.840B.300d-cc1ad5e1.tar.gz', - 'en-glove-6b-50': "glove.6B.50d-a6028c70.tar.gz", - 'en-word2vec-300': "GoogleNews-vectors-negative300-be166d9d.tar.gz", + 'en': 'glove.840B.300d.tar.gz', + + 'en-glove-6b-50d': 'glove.6B.50d.zip', + 'en-glove-6b-100d': 'glove.6B.100d.zip', + 'en-glove-6b-200d': 'glove.6B.200d.zip', + 'en-glove-6b-300d': 'glove.6B.300d.zip', + 'en-glove-42b-300d': 'glove.42B.300d.zip', + 'en-glove-840b-300d': 'glove.840B.300d.zip', + 'en-glove-twitter-27b-25d': 'glove.twitter.27B.25d.zip', + 'en-glove-twitter-27b-50d': 'glove.twitter.27B.50d.zip', + 'en-glove-twitter-27b-100d': 'glove.twitter.27B.100d.zip', + 'en-glove-twitter-27b-200d': 'glove.twitter.27B.200d.zip', + + 'en-word2vec-300': "GoogleNews-vectors-negative300.zip", + 'en-fasttext-wiki': "wiki-news-300d-1M.vec.zip", - 'cn': "tencent_cn-dab24577.tar.gz", - 'cn-fasttext': "cc.zh.300.vec-d68a9bcf.gz", - 'sgns-literature-word':'sgns.literature.word.txt.zip', - 'glove-42b-300d': 'glove.42B.300d.zip', - 'glove-6b-50d': 'glove.6B.50d.zip', - 'glove-6b-100d': 'glove.6B.100d.zip', - 'glove-6b-200d': 'glove.6B.200d.zip', - 'glove-6b-300d': 'glove.6B.300d.zip', - 'glove-840b-300d': 'glove.840B.300d.zip', - 
'glove-twitter-27b-25d': 'glove.twitter.27B.25d.zip', - 'glove-twitter-27b-50d': 'glove.twitter.27B.50d.zip', - 'glove-twitter-27b-100d': 'glove.twitter.27B.100d.zip', - 'glove-twitter-27b-200d': 'glove.twitter.27B.200d.zip' -} + 'en-fasttext-crawl': "crawl-300d-2M.vec.zip", + 'cn': "tencent_cn.txt.zip", + 'cn-tencent': "tencent_cn.txt.zip", + 'cn-fasttext': "cc.zh.300.vec.gz", + 'cn-sgns-literature-word': 'sgns.literature.word.txt.zip', +} DATASET_DIR = { 'aclImdb': "imdb.zip", - "yelp-review-full":"yelp_review_full.tar.gz", + "yelp-review-full": "yelp_review_full.tar.gz", "yelp-review-polarity": "yelp_review_polarity.tar.gz", "mnli": "MNLI.zip", "snli": "SNLI.zip", @@ -79,7 +75,7 @@ DATASET_DIR = { } -def cached_path(url_or_filename:str, cache_dir:str=None, name=None) -> Path: +def cached_path(url_or_filename: str, cache_dir: str = None, name=None) -> Path: """ 给定一个url,尝试通过url中的解析出来的文件名字filename到{cache_dir}/{name}/{filename}下寻找这个文件, (1)如果cache_dir=None, 则cache_dir=~/.fastNLP/; 否则cache_dir=cache_dir @@ -136,7 +132,7 @@ def get_filepath(filepath): """ if os.path.isdir(filepath): files = os.listdir(filepath) - if len(files)==1: + if len(files) == 1: return os.path.join(filepath, files[0]) else: return filepath @@ -180,9 +176,9 @@ def _get_base_url(name): return url + '/' else: URLS = { - 'embedding': "http://dbcloud.irocn.cn:8989/api/public/dl/", - "dataset": "http://dbcloud.irocn.cn:8989/api/public/dl/dataset/" - } + 'embedding': "http://dbcloud.irocn.cn:8989/api/public/dl/", + "dataset": "http://dbcloud.irocn.cn:8989/api/public/dl/dataset/" + } if name.lower() not in URLS: raise KeyError(f"{name} is not recognized.") return URLS[name.lower()] @@ -198,7 +194,7 @@ def _get_embedding_url(type, name): """ PRETRAIN_MAP = {'elmo': PRETRAINED_ELMO_MODEL_DIR, "bert": PRETRAINED_BERT_MODEL_DIR, - "static":PRETRAIN_STATIC_FILES} + "static": PRETRAIN_STATIC_FILES} map = PRETRAIN_MAP.get(type, None) if map: filename = map.get(name, None) @@ -273,16 +269,16 @@ def get_from_cache(url: str, cache_dir: Path = None) -> Path: # Download to temporary file, then copy to cache dir once finished. # Otherwise you get corrupt cache entries if the download gets interrupted. 
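# --- A rough sketch (assumptions noted) of how the shorthand tables and helpers reorganised in
# --- this patch combine when an embedding is requested by name; the embedding classes are
# --- assumed to follow roughly this chain, which is not quoted verbatim from the diff.
from fastNLP.io.file_utils import PRETRAINED_BERT_MODEL_DIR, cached_path, _get_base_url

name = 'en-base-uncased'                         # one of the shorthands listed above
url = _get_base_url('embedding') + PRETRAINED_BERT_MODEL_DIR[name]
model_dir = cached_path(url, name='embedding')   # download once, unpack and cache under ~/.fastNLP
print(model_dir)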
fd, temp_filename = tempfile.mkstemp() - print("%s not found in cache, downloading to %s"%(url, temp_filename)) + print("%s not found in cache, downloading to %s" % (url, temp_filename)) # GET file object req = requests.get(url, stream=True, headers={"User-Agent": "fastNLP"}) - if req.status_code==200: + if req.status_code == 200: content_length = req.headers.get("Content-Length") total = int(content_length) if content_length is not None else None progress = tqdm(unit="B", total=total, unit_scale=1) with open(temp_filename, "wb") as temp_file: - for chunk in req.iter_content(chunk_size=1024*16): + for chunk in req.iter_content(chunk_size=1024 * 16): if chunk: # filter out keep-alive new chunks progress.update(len(chunk)) temp_file.write(chunk) @@ -300,7 +296,7 @@ def get_from_cache(url: str, cache_dir: Path = None) -> Path: else: untar_gz_file(Path(temp_filename), Path(uncompress_temp_dir)) filenames = os.listdir(uncompress_temp_dir) - if len(filenames)==1: + if len(filenames) == 1: if os.path.isdir(os.path.join(uncompress_temp_dir, filenames[0])): uncompress_temp_dir = os.path.join(uncompress_temp_dir, filenames[0]) @@ -316,9 +312,9 @@ def get_from_cache(url: str, cache_dir: Path = None) -> Path: if os.path.isdir(uncompress_temp_dir): for filename in os.listdir(uncompress_temp_dir): if os.path.isdir(os.path.join(uncompress_temp_dir, filename)): - shutil.copytree(os.path.join(uncompress_temp_dir, filename), cache_path/filename) + shutil.copytree(os.path.join(uncompress_temp_dir, filename), cache_path / filename) else: - shutil.copyfile(os.path.join(uncompress_temp_dir, filename), cache_path/filename) + shutil.copyfile(os.path.join(uncompress_temp_dir, filename), cache_path / filename) else: shutil.copyfile(uncompress_temp_dir, cache_path) success = True @@ -350,7 +346,7 @@ def unzip_file(file: Path, to: Path): zipObj.extractall(to) -def untar_gz_file(file:Path, to:Path): +def untar_gz_file(file: Path, to: Path): import tarfile with tarfile.open(file, 'r:gz') as tar: @@ -369,12 +365,11 @@ def match_file(dir_name: str, cache_dir: Path) -> str: files = os.listdir(cache_dir) matched_filenames = [] for file_name in files: - if re.match(dir_name+'$', file_name) or re.match(dir_name+'\\..*', file_name): + if re.match(dir_name + '$', file_name) or re.match(dir_name + '\\..*', file_name): matched_filenames.append(file_name) - if len(matched_filenames)==0: + if len(matched_filenames) == 0: return '' - elif len(matched_filenames)==1: + elif len(matched_filenames) == 1: return matched_filenames[-1] else: raise RuntimeError(f"Duplicate matched files:{matched_filenames}, this should be caused by a bug.") - From aaabcd6bab034a7f1da70f694eb9fc21795f29ed Mon Sep 17 00:00:00 2001 From: xuyige Date: Thu, 15 Aug 2019 15:59:10 +0800 Subject: [PATCH 038/153] update io/file_utils.py --- fastNLP/io/file_utils.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fastNLP/io/file_utils.py b/fastNLP/io/file_utils.py index 43f8be62..14766fba 100644 --- a/fastNLP/io/file_utils.py +++ b/fastNLP/io/file_utils.py @@ -21,8 +21,8 @@ PRETRAINED_BERT_MODEL_DIR = { 'en-base-cased-mrpc': 'bert-base-cased-finetuned-mrpc.zip', - 'en-base-multi-cased': 'bert-base-multilingual-cased.zip', - 'en-base-multi-uncased': 'bert-base-multilingual-uncased.zip', + 'multi-base-cased': 'bert-base-multilingual-cased.zip', + 'multi-base-uncased': 'bert-base-multilingual-uncased.zip', 'cn': 'bert-chinese-wwm.zip', 'cn-base': 'bert-base-chinese.zip', @@ -38,7 +38,7 @@ PRETRAINED_ELMO_MODEL_DIR = { } 
PRETRAIN_STATIC_FILES = { - 'en': 'glove.840B.300d.tar.gz', + 'en': 'glove.840B.300d.zip', 'en-glove-6b-50d': 'glove.6B.50d.zip', 'en-glove-6b-100d': 'glove.6B.100d.zip', @@ -184,26 +184,26 @@ def _get_base_url(name): return URLS[name.lower()] -def _get_embedding_url(type, name): +def _get_embedding_url(embed_type, name): """ 给定embedding类似和名称,返回下载url - :param str type: 支持static, bert, elmo。即embedding的类型 + :param str embed_type: 支持static, bert, elmo。即embedding的类型 :param str name: embedding的名称, 例如en, cn, based等 :return: str, 下载的url地址 """ PRETRAIN_MAP = {'elmo': PRETRAINED_ELMO_MODEL_DIR, "bert": PRETRAINED_BERT_MODEL_DIR, "static": PRETRAIN_STATIC_FILES} - map = PRETRAIN_MAP.get(type, None) - if map: - filename = map.get(name, None) + embed_map = PRETRAIN_MAP.get(embed_type, None) + if embed_map: + filename = embed_map.get(name, None) if filename: url = _get_base_url('embedding') + filename return url - raise KeyError("There is no {}. Only supports {}.".format(name, list(map.keys()))) + raise KeyError("There is no {}. Only supports {}.".format(name, list(embed_map.keys()))) else: - raise KeyError(f"There is no {type}. Only supports bert, elmo, static") + raise KeyError(f"There is no {embed_type}. Only supports bert, elmo, static") def _get_dataset_url(name): From a88fe24c34d06e313c04455aaa44b4f933dadb66 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Thu, 15 Aug 2019 16:46:43 +0800 Subject: [PATCH 039/153] update the word2vec download link --- fastNLP/io/file_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastNLP/io/file_utils.py b/fastNLP/io/file_utils.py index 14766fba..fdec5b1f 100644 --- a/fastNLP/io/file_utils.py +++ b/fastNLP/io/file_utils.py @@ -51,7 +51,7 @@ PRETRAIN_STATIC_FILES = { 'en-glove-twitter-27b-100d': 'glove.twitter.27B.100d.zip', 'en-glove-twitter-27b-200d': 'glove.twitter.27B.200d.zip', - 'en-word2vec-300': "GoogleNews-vectors-negative300.zip", + 'en-word2vec-300': "GoogleNews-vectors-negative300.txt.gz", 'en-fasttext-wiki': "wiki-news-300d-1M.vec.zip", 'en-fasttext-crawl': "crawl-300d-2M.vec.zip", From fb436e8239c0be281b833c775df43f47409a69c8 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Thu, 15 Aug 2019 17:06:38 +0800 Subject: [PATCH 040/153] update some docs of io modules --- fastNLP/io/data_loader/__init__.py | 4 +++ fastNLP/io/dataset_loader.py | 5 +++ fastNLP/io/loader/__init__.py | 50 ++++++++++++++++++------------ fastNLP/io/pipe/__init__.py | 7 +++-- 4 files changed, 43 insertions(+), 23 deletions(-) diff --git a/fastNLP/io/data_loader/__init__.py b/fastNLP/io/data_loader/__init__.py index 5d6b08b0..b3ca9021 100644 --- a/fastNLP/io/data_loader/__init__.py +++ b/fastNLP/io/data_loader/__init__.py @@ -1,4 +1,8 @@ """ +.. warning:: + + 本模块在 `0.5.0版本` 中被废弃,由 :mod:`~fastNLP.io.loader` 和 :mod:`~fastNLP.io.pipe` 模块替代。 + 用于读数据集的模块, 可以读取文本分类、序列标注、Matching任务的数据集 这些模块的具体介绍如下,您可以通过阅读 :doc:`教程` 来进行了解。 diff --git a/fastNLP/io/dataset_loader.py b/fastNLP/io/dataset_loader.py index 3e3ac575..e1e06ec9 100644 --- a/fastNLP/io/dataset_loader.py +++ b/fastNLP/io/dataset_loader.py @@ -1,4 +1,8 @@ """ +.. warning:: + + 本模块将在 `0.5.0版本` 中被废弃,由 :mod:`~fastNLP.io.loader` 和 :mod:`~fastNLP.io.pipe` 模块替代。 + dataset_loader模块实现了许多 DataSetLoader, 用于读取不同格式的数据, 并返回 `DataSet` , 得到的 :class:`~fastNLP.DataSet` 对象可以直接传入 :class:`~fastNLP.Trainer` 和 :class:`~fastNLP.Tester`, 用于模型的训练和测试。 以SNLI数据集为例:: @@ -11,6 +15,7 @@ dataset_loader模块实现了许多 DataSetLoader, 用于读取不同格式的 # ... 
do stuff 为 fastNLP 提供 DataSetLoader 的开发者请参考 :class:`~fastNLP.io.DataSetLoader` 的介绍。 + """ __all__ = [ 'CSVLoader', diff --git a/fastNLP/io/loader/__init__.py b/fastNLP/io/loader/__init__.py index 8c0d391c..5abef0eb 100644 --- a/fastNLP/io/loader/__init__.py +++ b/fastNLP/io/loader/__init__.py @@ -1,25 +1,35 @@ """ Loader用于读取数据,并将内容读取到 :class:`~fastNLP.DataSet` 或者 :class:`~fastNLP.io.DataBundle` 中。所有的Loader都支持以下的 -三个方法: __init__(),_load(), loads(). 其中__init__()用于申明读取参数,以及说明该Loader支持的数据格式,读取后Dataset中field -; _load(path)方法传入一个文件路径读取单个文件,并返回DataSet; load(paths)用于读取文件夹下的文件,并返回DataBundle, load()方法 -支持以下三种类型的参数:: - - (0) 如果传入None,将尝试自动下载数据集并缓存。但不是所有的数据都可以直接下载。 - (1) 如果传入的是一个文件path,则返回的DataBundle包含一个名为train的DataSet可以通过data_bundle.datasets['train']获取 - (2) 传入的是一个文件夹目录,将读取的是这个文件夹下文件名中包含'train', 'test', 'dev'的文件,其它文件会被忽略。 - 假设某个目录下的文件为 - -train.txt - -dev.txt - -test.txt - -other.txt - Loader().load('/path/to/dir')读取,返回的data_bundle中可以用data_bundle.datasets['train'], data_bundle.datasets['dev'], - data_bundle.datasets['test']获取对应的DataSet,其中other.txt的内容会被忽略。 - 假设某个目录下的文件为 - -train.txt - -dev.txt - Loader().load('/path/to/dir')读取,返回的data_bundle中可以用data_bundle.datasets['train'], data_bundle.datasets['dev']获取 - 对应的DataSet。 - (3) 传入一个dict,key为dataset的名称,value是该dataset的文件路径。 +三个方法: ``__init__`` , ``_load`` , ``loads`` . 其中 ``__init__(...)`` 用于申明读取参数,以及说明该Loader支持的数据格式, +读取后 :class:`~fastNLP.Dataset` 中的 `field` ; ``_load(path)`` 方法传入文件路径读取单个文件,并返回 :class:`~fastNLP.Dataset` ; +``load(paths)`` 用于读取文件夹下的文件,并返回 :class:`~fastNLP.io.DataBundle` 类型的对象 , load()方法支持以下几种类型的参数: + +0.传入None + 将尝试自动下载数据集并缓存。但不是所有的数据都可以直接下载。 + +1.传入一个文件path + 返回的 data_bundle 包含一个名为 `train` 的 dataset ,可以通过 data_bundle.datasets['train']获取 + +2.传入一个文件夹目录 + 将读取的是这个文件夹下文件名中包含'train', 'test', 'dev'的文件,其它文件会被忽略。假设某个目录下的文件为:: + + -train.txt + -dev.txt + -test.txt + -other.txt + + Loader().load('/path/to/dir')读取,返回的 data_bundle 中可以用 data_bundle.datasets['train'], data_bundle.datasets['dev'], + data_bundle.datasets['test'] 获取对应的DataSet,其中other.txt的内容会被忽略。假设某个目录下的文件为:: + + -train.txt + -dev.txt + + Loader().load('/path/to/dir')读取,返回的 data_bundle 中可以用 data_bundle.datasets['train'], + data_bundle.datasets['dev'] 获取对应的DataSet。 + +3.传入一个dict + key为 dataset 的名称,value 是该 dataset 的文件路径:: + paths = {'train':'/path/to/train', 'dev': '/path/to/dev', 'test':'/path/to/test'} Loader().load(paths) # 返回的data_bundle可以通过以下的方式获取相应的DataSet, data_bundle.datasets['train'], data_bundle.datasets['dev'], data_bundle.datasets['test'] diff --git a/fastNLP/io/pipe/__init__.py b/fastNLP/io/pipe/__init__.py index 4cec3ad5..6a5e6948 100644 --- a/fastNLP/io/pipe/__init__.py +++ b/fastNLP/io/pipe/__init__.py @@ -1,7 +1,8 @@ """ -Pipe用于处理数据,所有的Pipe都包含一个process(DataBundle)方法,传入一个DataBundle对象, 在传入DataBundle上进行原位修改,并将其返回; -process_from_file(paths)传入的文件路径,返回一个DataBundle。process(DataBundle)或者process_from_file(paths)的返回DataBundle -中的DataSet一般都包含原文与转换为index的输入,以及转换为index的target;除了DataSet之外,还会包含将field转为index时所建立的词表。 +Pipe用于处理数据,所有的Pipe都包含一个 process(data_bundle) 方法,传入一个 :class:`~fastNLP.io.DataBundle` 类型的对象, +在传入 data_bundle 上进行原位修改,并将其返回; process_from_file(paths) 传入的文件路径,返回一个 :class:`~fastNLP.io.DataBundle` 。 +process(data_bundle) 或者 process_from_file(paths)的返回 :class:`~fastNLP.io.DataBundle` 中的 :class:`~fastNLP.DataSet` + 一般都包含原文与转换为index的输入以及转换为index的target;除了 :class:`~fastNLP.DataSet` 之外,还会包含将field转为index时所建立的词表。 """ __all__ = [ From 015376d235074dfda67b38723454072bcb4f7102 Mon Sep 17 00:00:00 2001 From: yh Date: Thu, 15 Aug 2019 18:43:24 +0800 Subject: [PATCH 041/153] 
=?UTF-8?q?1.git=20add=20fastNLP/io/loader/loader.?= =?UTF-8?q?pygit=20add=20fastNLP/io/loader/loader.py=E9=87=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/__init__.py | 2 +- fastNLP/core/vocabulary.py | 2 +- fastNLP/embeddings/bert_embedding.py | 8 ++++ fastNLP/embeddings/static_embedding.py | 32 +++++++------- fastNLP/io/base_loader.py | 4 +- fastNLP/io/file_utils.py | 58 ++++++++++++++++++++++---- fastNLP/io/loader/loader.py | 4 +- fastNLP/io/pipe/conll.py | 2 +- fastNLP/io/pipe/matching.py | 2 +- fastNLP/io/pipe/utils.py | 2 +- 10 files changed, 83 insertions(+), 33 deletions(-) diff --git a/fastNLP/core/__init__.py b/fastNLP/core/__init__.py index b246c6a0..d92e8f62 100644 --- a/fastNLP/core/__init__.py +++ b/fastNLP/core/__init__.py @@ -24,5 +24,5 @@ from .optimizer import Optimizer, SGD, Adam from .sampler import SequentialSampler, BucketSampler, RandomSampler, Sampler from .tester import Tester from .trainer import Trainer -from .utils import cache_results, seq_len_to_mask +from .utils import cache_results, seq_len_to_mask, get_seq_len from .vocabulary import Vocabulary diff --git a/fastNLP/core/vocabulary.py b/fastNLP/core/vocabulary.py index a51c3f92..330d73dd 100644 --- a/fastNLP/core/vocabulary.py +++ b/fastNLP/core/vocabulary.py @@ -376,7 +376,7 @@ class Vocabulary(object): :return: bool """ return word in self._no_create_word - + def to_index(self, w): """ 将词转为数字. 若词不再词典中被记录, 将视为 unknown, 若 ``unknown=None`` , 将抛出``ValueError``:: diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index db50f9f4..fa56419b 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -68,6 +68,10 @@ class BertEmbedding(ContextualEmbedding): else: raise ValueError(f"Cannot recognize {model_dir_or_name}.") + self._word_sep_index = None + if '[SEP]' in vocab: + self._word_sep_index = vocab['[SEP]'] + self.model = _WordBertModel(model_dir=model_dir, vocab=vocab, layers=layers, pool_method=pool_method, include_cls_sep=include_cls_sep, pooled_cls=pooled_cls, auto_truncate=auto_truncate) @@ -86,7 +90,11 @@ class BertEmbedding(ContextualEmbedding): :param torch.LongTensor words: [batch_size, max_len] :return: torch.FloatTensor. batch_size x max_len x (768*len(self.layers)) """ + if self._word_sep_index: # 不能drop sep + sep_mask = words.eq(self._word_sep_index) words = self.drop_word(words) + if self._word_sep_index: + words.masked_fill_(sep_mask, self._word_sep_index) outputs = self._get_sent_reprs(words) if outputs is not None: return self.dropout(words) diff --git a/fastNLP/embeddings/static_embedding.py b/fastNLP/embeddings/static_embedding.py index d44d7087..78f615f6 100644 --- a/fastNLP/embeddings/static_embedding.py +++ b/fastNLP/embeddings/static_embedding.py @@ -74,14 +74,10 @@ class StaticEmbedding(TokenEmbedding): if lower: lowered_vocab = Vocabulary(padding=vocab.padding, unknown=vocab.unknown) for word, index in vocab: - if not vocab._is_word_no_create_entry(word): + if vocab._is_word_no_create_entry(word): + lowered_vocab.add_word(word.lower(), no_create_entry=True) + else: lowered_vocab.add_word(word.lower()) # 先加入需要创建entry的 - for word in vocab._no_create_word.keys(): # 不需要创建entry的 - if word in vocab: - lowered_word = word.lower() - if lowered_word not in lowered_vocab.word_count: - lowered_vocab.add_word(lowered_word) - lowered_vocab._no_create_word[lowered_word] += 1 print(f"All word in the vocab have been lowered before finding pretrained vectors. 
There are {len(vocab)} " f"words, {len(lowered_vocab)} unique lowered words.") if model_path: @@ -90,7 +86,7 @@ class StaticEmbedding(TokenEmbedding): embedding = self._randomly_init_embed(len(vocab), embedding_dim, init_method) # 需要适配一下 if not hasattr(self, 'words_to_words'): - self.words_to_words = torch.arange(len(lowered_vocab, )).long() + self.words_to_words = torch.arange(len(lowered_vocab)).long() if lowered_vocab.unknown: unknown_idx = lowered_vocab.unknown_idx else: @@ -100,10 +96,11 @@ class StaticEmbedding(TokenEmbedding): for word, index in vocab: if word not in lowered_vocab: word = word.lower() - if lowered_vocab._is_word_no_create_entry(word): # 如果不需要创建entry,已经默认unknown了 - continue + if word not in lowered_vocab and lowered_vocab._is_word_no_create_entry(word): + continue # 如果不需要创建entry,已经默认unknown了 words_to_words[index] = self.words_to_words[lowered_vocab.to_index(word)] self.words_to_words = words_to_words + self._word_unk_index = lowered_vocab.unknown_idx # 替换一下unknown的index else: if model_path: embedding = self._load_with_vocab(model_path, vocab=vocab, init_method=init_method) @@ -211,12 +208,14 @@ class StaticEmbedding(TokenEmbedding): print("Found {} out of {} words in the pre-training embedding.".format(found_count, len(vocab))) for word, index in vocab: if index not in matrix and not vocab._is_word_no_create_entry(word): - if vocab.unknown_idx in matrix: # 如果有unkonwn,用unknown初始化 + if vocab.padding_idx == index: + matrix[index] = torch.zeros(dim) + elif vocab.unknown_idx in matrix: # 如果有unkonwn,用unknown初始化 matrix[index] = matrix[vocab.unknown_idx] else: matrix[index] = None - vectors = self._randomly_init_embed(len(matrix), dim, init_method) + vectors = self._randomly_init_embed(len(vocab), dim, init_method) if vocab._no_create_word_length>0: if vocab.unknown is None: # 创建一个专门的unknown @@ -226,10 +225,13 @@ class StaticEmbedding(TokenEmbedding): unknown_idx = vocab.unknown_idx words_to_words = nn.Parameter(torch.full((len(vocab),), fill_value=unknown_idx).long(), requires_grad=False) - for order, (index, vec) in enumerate(matrix.items()): + for word, index in vocab: + vec = matrix.get(index, None) if vec is not None: - vectors[order] = vec - words_to_words[index] = order + vectors[index] = vec + words_to_words[index] = index + else: + vectors[index] = vectors[unknown_idx] self.words_to_words = words_to_words else: for index, vec in matrix.items(): diff --git a/fastNLP/io/base_loader.py b/fastNLP/io/base_loader.py index 429a8406..5cbd5bb1 100644 --- a/fastNLP/io/base_loader.py +++ b/fastNLP/io/base_loader.py @@ -144,7 +144,7 @@ class DataBundle: """ self.datasets[name] = dataset - def get_dataset(self, name:str): + def get_dataset(self, name:str)->DataSet: """ 获取名为name的dataset @@ -153,7 +153,7 @@ class DataBundle: """ return self.datasets[name] - def get_vocab(self, field_name:str): + def get_vocab(self, field_name:str)->Vocabulary: """ 获取field名为field_name对应的vocab diff --git a/fastNLP/io/file_utils.py b/fastNLP/io/file_utils.py index 43fe2ab1..eb6dea1d 100644 --- a/fastNLP/io/file_utils.py +++ b/fastNLP/io/file_utils.py @@ -78,6 +78,17 @@ DATASET_DIR = { "rte": "RTE.zip" } +PRETRAIN_MAP = {'elmo': PRETRAINED_ELMO_MODEL_DIR, + "bert": PRETRAINED_BERT_MODEL_DIR, + "static": PRETRAIN_STATIC_FILES} + +# 用于扩展fastNLP的下载 +FASTNLP_EXTEND_DATASET_URL = 'fastnlp_dataset_url.txt' +FASTNLP_EXTEND_EMBEDDING_URL = {'elmo': 'fastnlp_elmo_url.txt', + 'bert':'fastnlp_bert_url.txt', + 'static': 'fastnlp_static_url.txt' +} + def cached_path(url_or_filename:str, cache_dir:str=None, 
name=None) -> Path: """ @@ -97,7 +108,7 @@ def cached_path(url_or_filename:str, cache_dir:str=None, name=None) -> Path: :return: """ if cache_dir is None: - data_cache = Path(get_default_cache_path()) + data_cache = Path(get_cache_path()) else: data_cache = cache_dir @@ -146,7 +157,7 @@ def get_filepath(filepath): raise FileNotFoundError(f"{filepath} is not a valid file or directory.") -def get_default_cache_path(): +def get_cache_path(): """ 获取默认的fastNLP存放路径, 如果将FASTNLP_CACHE_PATH设置在了环境变量中,将使用环境变量的值,使得不用每个用户都去下载。 @@ -188,27 +199,51 @@ def _get_base_url(name): return URLS[name.lower()] -def _get_embedding_url(type, name): +def _get_embedding_url(embed_type, name): """ 给定embedding类似和名称,返回下载url - :param str type: 支持static, bert, elmo。即embedding的类型 + :param str embed_type: 支持static, bert, elmo。即embedding的类型 :param str name: embedding的名称, 例如en, cn, based等 :return: str, 下载的url地址 """ - PRETRAIN_MAP = {'elmo': PRETRAINED_ELMO_MODEL_DIR, - "bert": PRETRAINED_BERT_MODEL_DIR, - "static":PRETRAIN_STATIC_FILES} - map = PRETRAIN_MAP.get(type, None) + # 从扩展中寻找下载的url + _filename = FASTNLP_EXTEND_EMBEDDING_URL.get(embed_type, None) + if _filename: + url = _read_extend_url_file(_filename, name) + if url: + return url + map = PRETRAIN_MAP.get(embed_type, None) if map: + filename = map.get(name, None) if filename: url = _get_base_url('embedding') + filename return url raise KeyError("There is no {}. Only supports {}.".format(name, list(map.keys()))) else: - raise KeyError(f"There is no {type}. Only supports bert, elmo, static") + raise KeyError(f"There is no {embed_type}. Only supports bert, elmo, static") +def _read_extend_url_file(filename, name)->str: + """ + filename中的内容使用制表符隔开,第一列是名称,第二列是下载的url地址 + + :param str filename: 在默认的路径下寻找file这个文件 + :param str name: 需要寻找的资源的名称 + :return: str or None + """ + cache_dir = get_cache_path() + filepath = os.path.join(cache_dir, filename) + if os.path.exists(filepath): + with open(filepath, 'r', encoding='utf-8') as f: + for line in f: + line = line.strip() + if line: + parts = line.split('\t') + if len(parts) == 2: + if name == parts[0]: + return parts[1] + return None def _get_dataset_url(name): """ @@ -217,6 +252,11 @@ def _get_dataset_url(name): :param str name: 给定dataset的名称,比如imdb, sst-2等 :return: str """ + # 从扩展中寻找下载的url + url = _read_extend_url_file(FASTNLP_EXTEND_DATASET_URL, name) + if url: + return url + filename = DATASET_DIR.get(name, None) if filename: url = _get_base_url('dataset') + filename diff --git a/fastNLP/io/loader/loader.py b/fastNLP/io/loader/loader.py index c59de29f..607d6920 100644 --- a/fastNLP/io/loader/loader.py +++ b/fastNLP/io/loader/loader.py @@ -3,7 +3,7 @@ from .. 
import DataBundle from ..utils import check_loader_paths from typing import Union, Dict import os -from ..file_utils import _get_dataset_url, get_default_cache_path, cached_path +from ..file_utils import _get_dataset_url, get_cache_path, cached_path class Loader: def __init__(self): @@ -66,7 +66,7 @@ class Loader: :return: str, 数据集的目录地址。直接到该目录下读取相应的数据即可。 """ - default_cache_path = get_default_cache_path() + default_cache_path = get_cache_path() url = _get_dataset_url(dataset_name) output_dir = cached_path(url_or_filename=url, cache_dir=default_cache_path, name='dataset') diff --git a/fastNLP/io/pipe/conll.py b/fastNLP/io/pipe/conll.py index b9007344..a49e68b1 100644 --- a/fastNLP/io/pipe/conll.py +++ b/fastNLP/io/pipe/conll.py @@ -24,7 +24,7 @@ class _NERPipe(Pipe): if encoding_type == 'bio': self.convert_tag = iob2 else: - self.convert_tag = iob2bioes + self.convert_tag = lambda words: iob2bioes(iob2(words)) self.lower = lower self.target_pad_val = int(target_pad_val) diff --git a/fastNLP/io/pipe/matching.py b/fastNLP/io/pipe/matching.py index 93e854b1..76116345 100644 --- a/fastNLP/io/pipe/matching.py +++ b/fastNLP/io/pipe/matching.py @@ -57,7 +57,7 @@ class MatchingBertPipe(Pipe): dataset[Const.INPUTS(0)].lower() dataset[Const.INPUTS(1)].lower() - data_bundle = self._tokenize(data_bundle, [Const.INPUTS(0), Const.INPUT(1)], + data_bundle = self._tokenize(data_bundle, [Const.INPUTS(0), Const.INPUTS(1)], [Const.INPUTS(0), Const.INPUTS(1)]) # concat两个words diff --git a/fastNLP/io/pipe/utils.py b/fastNLP/io/pipe/utils.py index 5e9ff8dc..48454b67 100644 --- a/fastNLP/io/pipe/utils.py +++ b/fastNLP/io/pipe/utils.py @@ -61,7 +61,7 @@ def get_tokenizer(tokenizer:str, lang='en'): if tokenizer == 'spacy': import spacy spacy.prefer_gpu() - if lang!='en': + if lang != 'en': raise RuntimeError("Spacy only supports en right right.") en = spacy.load(lang) tokenizer = lambda x: [w.text for w in en.tokenizer(x)] From c9fba2ae96c370fff4f7a6633f66173bc71bd2e9 Mon Sep 17 00:00:00 2001 From: yh Date: Thu, 15 Aug 2019 18:46:42 +0800 Subject: [PATCH 042/153] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=AF=B9static=5Femb?= =?UTF-8?q?ed=E7=9A=84=E6=B5=8B=E8=AF=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/embeddings/test_static_embedding.py | 83 +++++++++++++++++++++++- 1 file changed, 80 insertions(+), 3 deletions(-) diff --git a/test/embeddings/test_static_embedding.py b/test/embeddings/test_static_embedding.py index 0c8fc739..6fd33072 100644 --- a/test/embeddings/test_static_embedding.py +++ b/test/embeddings/test_static_embedding.py @@ -3,13 +3,90 @@ import unittest from fastNLP.embeddings import StaticEmbedding from fastNLP import Vocabulary import torch +import os class TestRandomSameEntry(unittest.TestCase): def test_same_vector(self): - vocab = Vocabulary().add_word_lst(["The", "the", "THE"]) + vocab = Vocabulary().add_word_lst(["The", "the", "THE", 'a', "A"]) embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5, lower=True) - words = torch.LongTensor([[vocab.to_index(word) for word in ["The", "the", "THE"]]]) + words = torch.LongTensor([[vocab.to_index(word) for word in ["The", "the", "THE", 'a', 'A']]]) words = embed(words) embed_0 = words[0, 0] - for i in range(1, words.size(1)): + for i in range(1, 3): assert torch.sum(embed_0==words[0, i]).eq(len(embed_0)) + embed_0 = words[0, 3] + for i in range(3, 5): + assert torch.sum(embed_0 == words[0, i]).eq(len(embed_0)) + + @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") + def 
test_same_vector2(self): + vocab = Vocabulary().add_word_lst(["The", 'a', 'b', "the", "THE", "B", 'a', "A"]) + embed = StaticEmbedding(vocab, model_dir_or_name='/remote-home/source/fastnlp_caches/glove.6B.100d/glove.6B.100d.txt', + lower=True) + words = torch.LongTensor([[vocab.to_index(word) for word in ["The", "the", "THE", 'b', "B", 'a', 'A']]]) + words = embed(words) + embed_0 = words[0, 0] + for i in range(1, 3): + assert torch.sum(embed_0==words[0, i]).eq(len(embed_0)) + embed_0 = words[0, 3] + for i in range(3, 5): + assert torch.sum(embed_0 == words[0, i]).eq(len(embed_0)) + + @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") + def test_same_vector3(self): + word_lst = ["The", "the"] + no_create_word_lst = ['of', 'Of', 'With', 'with'] + vocab = Vocabulary().add_word_lst(word_lst) + vocab.add_word_lst(no_create_word_lst, no_create_entry=True) + embed = StaticEmbedding(vocab, model_dir_or_name='/remote-home/source/fastnlp_caches/glove.6B.100d/glove.demo.txt', + lower=True) + words = torch.LongTensor([[vocab.to_index(word) for word in word_lst+no_create_word_lst]]) + words = embed(words) + + lowered_word_lst = [word.lower() for word in word_lst] + lowered_no_create_word_lst = [word.lower() for word in no_create_word_lst] + lowered_vocab = Vocabulary().add_word_lst(lowered_word_lst) + lowered_vocab.add_word_lst(lowered_no_create_word_lst, no_create_entry=True) + lowered_embed = StaticEmbedding(lowered_vocab, model_dir_or_name='/remote-home/source/fastnlp_caches/glove.6B.100d/glove.demo.txt', + lower=False) + lowered_words = torch.LongTensor([[lowered_vocab.to_index(word) for word in lowered_word_lst+lowered_no_create_word_lst]]) + lowered_words = lowered_embed(lowered_words) + + all_words = word_lst + no_create_word_lst + + for idx, (word_i, word_j) in enumerate(zip(words[0], lowered_words[0])): + with self.subTest(idx=idx, word=all_words[idx]): + assert torch.sum(word_i == word_j).eq(lowered_embed.embed_size) + + @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") + def test_same_vector4(self): + # words = [] + # create_word_lst = [] # 需要创建 + # no_create_word_lst = [] + # ignore_word_lst = [] + # with open('/remote-home/source/fastnlp_caches/glove.6B.100d/glove.demo.txt', 'r', encoding='utf-8') as f: + # for line in f: + # words + word_lst = ["The", "the", "the", "The", "a", "A"] + no_create_word_lst = ['of', 'Of', "Of", "of", 'With', 'with'] + all_words = word_lst[:-2] + no_create_word_lst[:-2] + vocab = Vocabulary(min_freq=2).add_word_lst(word_lst) + vocab.add_word_lst(no_create_word_lst, no_create_entry=True) + embed = StaticEmbedding(vocab, model_dir_or_name='/remote-home/source/fastnlp_caches/glove.6B.100d/glove.demo.txt', + lower=True) + words = torch.LongTensor([[vocab.to_index(word) for word in all_words]]) + words = embed(words) + + lowered_word_lst = [word.lower() for word in word_lst] + lowered_no_create_word_lst = [word.lower() for word in no_create_word_lst] + lowered_vocab = Vocabulary().add_word_lst(lowered_word_lst) + lowered_vocab.add_word_lst(lowered_no_create_word_lst, no_create_entry=True) + lowered_embed = StaticEmbedding(lowered_vocab, model_dir_or_name='/remote-home/source/fastnlp_caches/glove.6B.100d/glove.demo.txt', + lower=False) + lowered_words = torch.LongTensor([[lowered_vocab.to_index(word.lower()) for word in all_words]]) + lowered_words = lowered_embed(lowered_words) + + for idx in range(len(all_words)): + word_i, word_j = words[0, idx], lowered_words[0, idx] + with self.subTest(idx=idx, word=all_words[idx]): + assert torch.sum(word_i == 
word_j).eq(lowered_embed.embed_size) \ No newline at end of file From f8441787a581652b15b996c6d4b8d456a24f03eb Mon Sep 17 00:00:00 2001 From: yh Date: Fri, 16 Aug 2019 00:58:56 +0800 Subject: [PATCH 043/153] =?UTF-8?q?travis.yml=E6=A0=BC=E5=BC=8F=E9=94=99?= =?UTF-8?q?=E8=AF=AF=EF=BC=8C=E4=BF=AE=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 856ec9c8..0d63417a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,7 +2,7 @@ language: python python: - "3.6" -env +env: - TRAVIS=1 # command to install dependencies install: From fd37ed60a715e1154f8a642f701f9da042cc90f3 Mon Sep 17 00:00:00 2001 From: yh Date: Fri, 16 Aug 2019 02:19:23 +0800 Subject: [PATCH 044/153] =?UTF-8?q?1.=20Trainer=E5=A2=9E=E5=8A=A0=E4=B8=80?= =?UTF-8?q?=E4=B8=AAdev=5Fbatch=5Fsize=E5=8F=82=E6=95=B0;2.StaticEmbedding?= =?UTF-8?q?=E4=B8=AD=E5=A2=9E=E5=8A=A0min=5Ffreq;?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/trainer.py | 6 +- fastNLP/embeddings/static_embedding.py | 82 +++++++++++++++--------- fastNLP/io/pipe/conll.py | 3 +- test/embeddings/test_static_embedding.py | 36 ++++++++--- 4 files changed, 85 insertions(+), 42 deletions(-) diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 6d18fd48..a6f4f823 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -422,7 +422,7 @@ class Trainer(object): num_workers=0, n_epochs=10, print_every=5, dev_data=None, metrics=None, metric_key=None, validate_every=-1, save_path=None, use_tqdm=True, device=None, prefetch=False, - callbacks=None, check_code_level=0): + callbacks=None, check_code_level=0, **kwargs): if prefetch and num_workers==0: num_workers = 1 if prefetch: @@ -550,12 +550,12 @@ class Trainer(object): self.use_tqdm = use_tqdm self.pbar = None self.print_every = abs(self.print_every) - + self.kwargs = kwargs if self.dev_data is not None: self.tester = Tester(model=self.model, data=self.dev_data, metrics=self.metrics, - batch_size=self.batch_size, + batch_size=kwargs.get("dev_batch_size", self.batch_size), device=None, # 由上面的部分处理device verbose=0, use_tqdm=self.use_tqdm) diff --git a/fastNLP/embeddings/static_embedding.py b/fastNLP/embeddings/static_embedding.py index 78f615f6..12011128 100644 --- a/fastNLP/embeddings/static_embedding.py +++ b/fastNLP/embeddings/static_embedding.py @@ -10,6 +10,8 @@ from ..core.vocabulary import Vocabulary from ..io.file_utils import PRETRAIN_STATIC_FILES, _get_embedding_url, cached_path from .embedding import TokenEmbedding from ..modules.utils import _get_file_name_base_on_postfix +from copy import deepcopy +from collections import defaultdict class StaticEmbedding(TokenEmbedding): """ @@ -46,12 +48,13 @@ class StaticEmbedding(TokenEmbedding): :param callable init_method: 如何初始化没有找到的值。可以使用torch.nn.init.*中各种方法。调用该方法时传入一个tensor对 :param bool lower: 是否将vocab中的词语小写后再和预训练的词表进行匹配。如果你的词表中包含大写的词语,或者就是需要单独 为大写的词语开辟一个vector表示,则将lower设置为False。 - :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 :param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 + :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 :param bool normalize: 是否对vector进行normalize,使得每个vector的norm为1。 + :param int min_freq: Vocabulary词频数小于这个数量的word将被指向unk。 """ def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', embedding_dim=100, requires_grad: bool=True, - 
init_method=None, lower=False, dropout=0, word_dropout=0, normalize=False): + init_method=None, lower=False, dropout=0, word_dropout=0, normalize=False, min_freq=1): super(StaticEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) # 得到cache_path @@ -70,6 +73,28 @@ class StaticEmbedding(TokenEmbedding): else: raise ValueError(f"Cannot recognize {model_dir_or_name}.") + # 缩小vocab + truncate_vocab = (vocab.min_freq is None and min_freq>1) or (vocab.min_freq and vocab.min_freq=min_freq and word_count0: - if vocab.unknown is None: # 创建一个专门的unknown - unknown_idx = len(matrix) - vectors = torch.cat((vectors, torch.zeros(1, dim)), dim=0).contiguous() - else: - unknown_idx = vocab.unknown_idx - words_to_words = nn.Parameter(torch.full((len(vocab),), fill_value=unknown_idx).long(), - requires_grad=False) - for word, index in vocab: - vec = matrix.get(index, None) - if vec is not None: - vectors[index] = vec - words_to_words[index] = index - else: - vectors[index] = vectors[unknown_idx] - self.words_to_words = words_to_words + if vocab.unknown is None: # 创建一个专门的unknown + unknown_idx = len(matrix) + vectors = torch.cat((vectors, torch.zeros(1, dim)), dim=0).contiguous() else: - for index, vec in matrix.items(): - if vec is not None: - vectors[index] = vec + unknown_idx = vocab.unknown_idx + self.words_to_words = nn.Parameter(torch.full((len(vocab), ), fill_value=unknown_idx).long(), + requires_grad=False) + + for index, (index_in_vocab, vec) in enumerate(matrix.items()): + if vec is not None: + vectors[index] = vec + self.words_to_words[index_in_vocab] = index return vectors diff --git a/fastNLP/io/pipe/conll.py b/fastNLP/io/pipe/conll.py index a49e68b1..0379a45b 100644 --- a/fastNLP/io/pipe/conll.py +++ b/fastNLP/io/pipe/conll.py @@ -138,9 +138,8 @@ class OntoNotesNERPipe(_NERPipe): "[AL-AIN, United, Arab, ...]", "[3, 4, 5,...]", "[3, 4]", 6 "[...]", "[...]", "[...]", . 
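The ``min_freq`` option introduced above maps words whose count in the Vocabulary falls below the threshold onto the unknown vector. A minimal sketch follows (random-initialized vectors, so no pretrained file is assumed; it mirrors the min_freq test added in this patch):

    >>> import torch
    >>> from fastNLP import Vocabulary
    >>> from fastNLP.embeddings import StaticEmbedding
    >>> vocab = Vocabulary().add_word_lst(["they", "the", "they", "the", "he", "he", "a", "A"])
    >>> # "a" and "A" each appear only once, so with min_freq=2 they fall back to the unknown vector
    >>> embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5, min_freq=2)
    >>> words = torch.LongTensor([[vocab.to_index(w) for w in ["they", "he", "a", "A"]]])
    >>> embed(words).size()
    torch.Size([1, 4, 5])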
- + :param: str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。 :param bool lower: 是否将words小写化后再建立词表,绝大多数情况都不需要设置为True。 - :param bool delete_unused_fields: 是否删除NER任务中用不到的field。 :param int target_pad_val: target的padding值,target这一列pad的位置值为target_pad_val。默认为-100。 """ diff --git a/test/embeddings/test_static_embedding.py b/test/embeddings/test_static_embedding.py index 6fd33072..ca97dd75 100644 --- a/test/embeddings/test_static_embedding.py +++ b/test/embeddings/test_static_embedding.py @@ -34,6 +34,7 @@ class TestRandomSameEntry(unittest.TestCase): @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") def test_same_vector3(self): + # 验证lower word_lst = ["The", "the"] no_create_word_lst = ['of', 'Of', 'With', 'with'] vocab = Vocabulary().add_word_lst(word_lst) @@ -60,13 +61,7 @@ class TestRandomSameEntry(unittest.TestCase): @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") def test_same_vector4(self): - # words = [] - # create_word_lst = [] # 需要创建 - # no_create_word_lst = [] - # ignore_word_lst = [] - # with open('/remote-home/source/fastnlp_caches/glove.6B.100d/glove.demo.txt', 'r', encoding='utf-8') as f: - # for line in f: - # words + # 验证在有min_freq下的lower word_lst = ["The", "the", "the", "The", "a", "A"] no_create_word_lst = ['of', 'Of', "Of", "of", 'With', 'with'] all_words = word_lst[:-2] + no_create_word_lst[:-2] @@ -89,4 +84,29 @@ class TestRandomSameEntry(unittest.TestCase): for idx in range(len(all_words)): word_i, word_j = words[0, idx], lowered_words[0, idx] with self.subTest(idx=idx, word=all_words[idx]): - assert torch.sum(word_i == word_j).eq(lowered_embed.embed_size) \ No newline at end of file + assert torch.sum(word_i == word_j).eq(lowered_embed.embed_size) + + @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") + def test_same_vector5(self): + # 检查通过使用min_freq后的word是否内容一致 + word_lst = ["they", "the", "they", "the", 'he', 'he', "a", "A"] + no_create_word_lst = ['of', "of", "she", "she", 'With', 'with'] + all_words = word_lst[:-2] + no_create_word_lst[:-2] + vocab = Vocabulary().add_word_lst(word_lst) + vocab.add_word_lst(no_create_word_lst, no_create_entry=True) + embed = StaticEmbedding(vocab, model_dir_or_name='/remote-home/source/fastnlp_caches/glove.6B.100d/glove.demo.txt', + lower=False, min_freq=2) + words = torch.LongTensor([[vocab.to_index(word) for word in all_words]]) + words = embed(words) + + min_freq_vocab = Vocabulary(min_freq=2).add_word_lst(word_lst) + min_freq_vocab.add_word_lst(no_create_word_lst, no_create_entry=True) + min_freq_embed = StaticEmbedding(min_freq_vocab, model_dir_or_name='/remote-home/source/fastnlp_caches/glove.6B.100d/glove.demo.txt', + lower=False) + min_freq_words = torch.LongTensor([[min_freq_vocab.to_index(word.lower()) for word in all_words]]) + min_freq_words = min_freq_embed(min_freq_words) + + for idx in range(len(all_words)): + word_i, word_j = words[0, idx], min_freq_words[0, idx] + with self.subTest(idx=idx, word=all_words[idx]): + assert torch.sum(word_i == word_j).eq(min_freq_embed.embed_size) \ No newline at end of file From e0493053a5118c7f90f1ed381ccf877c046b633e Mon Sep 17 00:00:00 2001 From: ChenXin Date: Fri, 16 Aug 2019 10:26:00 +0800 Subject: [PATCH 045/153] update docs of io --- fastNLP/io/__init__.py | 4 ++-- fastNLP/io/loader/__init__.py | 43 ++++++++++++++++++++--------------- fastNLP/io/pipe/__init__.py | 10 ++++---- 3 files changed, 33 insertions(+), 24 deletions(-) diff --git a/fastNLP/io/__init__.py b/fastNLP/io/__init__.py index bf5c2c36..5234b209 100644 --- a/fastNLP/io/__init__.py 
+++ b/fastNLP/io/__init__.py @@ -3,9 +3,9 @@ 1. 用于读入 embedding 的 :doc:`EmbedLoader ` 类, -2. 用于读入不同格式数据的 :doc:`DataSetLoader ` 类 +2. 用于读入不同格式数据的 :doc:`Loader ` 类 -3. 用于读入不同数据集并进行预处理的 :doc:`DataLoader ` 类 +3. 用于处理读入数据的 :doc:`Pipe ` 类 4. 用于保存和载入模型的类, 参考 :doc:`model_io文档` diff --git a/fastNLP/io/loader/__init__.py b/fastNLP/io/loader/__init__.py index 5abef0eb..a4e6a6f5 100644 --- a/fastNLP/io/loader/__init__.py +++ b/fastNLP/io/loader/__init__.py @@ -1,38 +1,45 @@ """ Loader用于读取数据,并将内容读取到 :class:`~fastNLP.DataSet` 或者 :class:`~fastNLP.io.DataBundle` 中。所有的Loader都支持以下的 三个方法: ``__init__`` , ``_load`` , ``loads`` . 其中 ``__init__(...)`` 用于申明读取参数,以及说明该Loader支持的数据格式, -读取后 :class:`~fastNLP.Dataset` 中的 `field` ; ``_load(path)`` 方法传入文件路径读取单个文件,并返回 :class:`~fastNLP.Dataset` ; +读取后 :class:`~fastNLP.DataSet` 中的 `field` ; ``_load(path)`` 方法传入文件路径读取单个文件,并返回 :class:`~fastNLP.DataSet` ; ``load(paths)`` 用于读取文件夹下的文件,并返回 :class:`~fastNLP.io.DataBundle` 类型的对象 , load()方法支持以下几种类型的参数: 0.传入None 将尝试自动下载数据集并缓存。但不是所有的数据都可以直接下载。 -1.传入一个文件path - 返回的 data_bundle 包含一个名为 `train` 的 dataset ,可以通过 data_bundle.datasets['train']获取 +1.传入一个文件的 path + 返回的 `data_bundle` 包含一个名为 `train` 的 dataset ,可以通过 ``data_bundle.datasets['train']`` 获取 2.传入一个文件夹目录 - 将读取的是这个文件夹下文件名中包含'train', 'test', 'dev'的文件,其它文件会被忽略。假设某个目录下的文件为:: + 将读取的是这个文件夹下文件名中包含 `train` , `test` , `dev` 的文件,其它文件会被忽略。假设某个目录下的文件为:: - -train.txt - -dev.txt - -test.txt - -other.txt + | + +-train.txt + +-dev.txt + +-test.txt + +-other.txt - Loader().load('/path/to/dir')读取,返回的 data_bundle 中可以用 data_bundle.datasets['train'], data_bundle.datasets['dev'], - data_bundle.datasets['test'] 获取对应的DataSet,其中other.txt的内容会被忽略。假设某个目录下的文件为:: + 在 Loader().load('/path/to/dir') 返回的 `data_bundle` 中可以用 ``data_bundle.datasets['train']`` , ``data_bundle.datasets['dev']`` , + ``data_bundle.datasets['test']`` 获取对应的 `dataset` ,其中 `other.txt` 的内容会被忽略。假设某个目录下的文件为:: - -train.txt - -dev.txt + | + +-train.txt + +-dev.txt - Loader().load('/path/to/dir')读取,返回的 data_bundle 中可以用 data_bundle.datasets['train'], - data_bundle.datasets['dev'] 获取对应的DataSet。 + 在 Loader().load('/path/to/dir') 返回的 `data_bundle` 中可以用 ``data_bundle.datasets['train']`` , + ``data_bundle.datasets['dev']`` 获取对应的 dataset。 -3.传入一个dict - key为 dataset 的名称,value 是该 dataset 的文件路径:: +3.传入一个字典 + 字典的的 key 为 `dataset` 的名称,value 是该 `dataset` 的文件路径:: paths = {'train':'/path/to/train', 'dev': '/path/to/dev', 'test':'/path/to/test'} - Loader().load(paths) # 返回的data_bundle可以通过以下的方式获取相应的DataSet, data_bundle.datasets['train'], data_bundle.datasets['dev'], - data_bundle.datasets['test'] + + 在 Loader().load(paths) 返回的 `data_bundle` 中可以用 ``data_bundle.datasets['train']`` , ``data_bundle.datasets['dev']`` , + ``data_bundle.datasets['test']`` 来获取对应的 `dataset` + +fastNLP 目前提供了如下的 Loader + + """ diff --git a/fastNLP/io/pipe/__init__.py b/fastNLP/io/pipe/__init__.py index 6a5e6948..ad68f486 100644 --- a/fastNLP/io/pipe/__init__.py +++ b/fastNLP/io/pipe/__init__.py @@ -1,8 +1,10 @@ """ -Pipe用于处理数据,所有的Pipe都包含一个 process(data_bundle) 方法,传入一个 :class:`~fastNLP.io.DataBundle` 类型的对象, -在传入 data_bundle 上进行原位修改,并将其返回; process_from_file(paths) 传入的文件路径,返回一个 :class:`~fastNLP.io.DataBundle` 。 -process(data_bundle) 或者 process_from_file(paths)的返回 :class:`~fastNLP.io.DataBundle` 中的 :class:`~fastNLP.DataSet` - 一般都包含原文与转换为index的输入以及转换为index的target;除了 :class:`~fastNLP.DataSet` 之外,还会包含将field转为index时所建立的词表。 +Pipe用于处理通过 Loader 读取的数据,所有的 Pipe 都包含 ``process`` 和 ``process_from_file`` 两种方法。 +``process(data_bundle)`` 传入一个 :class:`~fastNLP.io.DataBundle` 类型的对象, 在传入的 `data_bundle` 上进行原位修改,并将其返回; 
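As a concrete sketch of this workflow (the OntoNotes path and the default field names are assumptions; any Pipe in this package follows the same pattern):

    from fastNLP.io.pipe.conll import OntoNotesNERPipe

    # read the raw files and run the full preprocessing in one step
    data_bundle = OntoNotesNERPipe(encoding_type='bioes').process_from_file('/path/to/ontonotes')
    print(data_bundle.datasets['train'])   # DataSets now hold the raw text plus indexed words/target
    print(data_bundle.get_vocab('words'))  # vocabularies built while converting fields to indices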
+``process_from_file(paths)`` 传入的文件路径,返回一个 :class:`~fastNLP.io.DataBundle` 类型的对象。 +``process(data_bundle)`` 或者 ``process_from_file(paths)`` 的返回 `data_bundle` 中的 :class:`~fastNLP.DataSet` +一般都包含原文与转换为index的输入以及转换为index的target;除了 :class:`~fastNLP.DataSet` 之外, +`data_bundle` 还会包含将field转为index时所建立的词表。 """ __all__ = [ From 620ad161e0b4dc61fa239b9d053808885409a109 Mon Sep 17 00:00:00 2001 From: zide05 <845465009@qq.com> Date: Fri, 16 Aug 2019 13:16:45 +0800 Subject: [PATCH 046/153] Update tutorial_4_loss_optimizer.rst --- docs/source/tutorials/tutorial_4_loss_optimizer.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/tutorials/tutorial_4_loss_optimizer.rst b/docs/source/tutorials/tutorial_4_loss_optimizer.rst index a6e1730a..f863a7a8 100644 --- a/docs/source/tutorials/tutorial_4_loss_optimizer.rst +++ b/docs/source/tutorials/tutorial_4_loss_optimizer.rst @@ -158,6 +158,7 @@ Vocabulary 的使用 损失函数 训练模型需要提供一个损失函数 ,fastNLP中提供了直接可以导入使用的四种loss,分别为: + * :class:`~fastNLP.CrossEntropyLoss`:包装了torch.nn.functional.cross_entropy()函数,返回交叉熵损失(可以运用于多分类场景) * :class:`~fastNLP.BCELoss`:包装了torch.nn.functional.binary_cross_entropy()函数,返回二分类的交叉熵 * :class:`~fastNLP.L1Loss`:包装了torch.nn.functional.l1_loss()函数,返回L1 损失 @@ -209,7 +210,7 @@ Vocabulary 的使用 #使用CNNText的时候第一个参数输入一个tuple,作为模型定义embedding的参数 #还可以传入 kernel_nums, kernel_sizes, padding, dropout的自定义值 - model_cnn = CNNText((len(vocab),EMBED_DIM), num_classes=3, padding=2, dropout=0.1) + model_cnn = CNNText((len(vocab),EMBED_DIM), num_classes=3, dropout=0.1) #如果在定义trainer的时候没有传入optimizer参数,模型默认的优化器为torch.optim.Adam且learning rate为lr=4e-3 #这里只使用了optimizer_1作为优化器输入,感兴趣可以尝试optimizer_2或者其他优化器作为输入 From cd395a7cdf9461d6e0b5866f6c29e6d6b598c8f8 Mon Sep 17 00:00:00 2001 From: zide05 <845465009@qq.com> Date: Fri, 16 Aug 2019 13:18:35 +0800 Subject: [PATCH 047/153] Update tutorial_5_datasetiter.rst --- docs/source/tutorials/tutorial_5_datasetiter.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/tutorials/tutorial_5_datasetiter.rst b/docs/source/tutorials/tutorial_5_datasetiter.rst index 23d26deb..e81b18dd 100644 --- a/docs/source/tutorials/tutorial_5_datasetiter.rst +++ b/docs/source/tutorials/tutorial_5_datasetiter.rst @@ -192,7 +192,7 @@ sampler import time embed_dim = 100 - model = CNNText((len(vocab),embed_dim), num_classes=3, padding=2, dropout=0.1) + model = CNNText((len(vocab),embed_dim), num_classes=3, dropout=0.1) def train(epoch, data, devdata): optimizer = torch.optim.Adam(model.parameters(), lr=0.001) From 31f35ad61736432923706c13ecfc123eab03e130 Mon Sep 17 00:00:00 2001 From: zide05 <845465009@qq.com> Date: Fri, 16 Aug 2019 13:57:24 +0800 Subject: [PATCH 048/153] Update bert_embedding.py --- fastNLP/embeddings/bert_embedding.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index aa72898a..5d46d98c 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -27,6 +27,7 @@ class BertEmbedding(ContextualEmbedding): >>> import torch >>> from fastNLP import Vocabulary + >>> from fastNLP.embeddings import BertEmbedding >>> vocab = Vocabulary().add_word_lst("The whether is good .".split()) >>> embed = BertEmbedding(vocab, model_dir_or_name='en-base-uncased', requires_grad=False, layers='4,-2,-1') >>> words = torch.LongTensor([[vocab.to_index(word) for word in "The whether is good .".split()]]) From d631d136dc69cb0c23fc999a175c0296a505d7af Mon Sep 17 00:00:00 2001 From: zide05 <845465009@qq.com> 
Date: Fri, 16 Aug 2019 14:00:38 +0800 Subject: [PATCH 049/153] Update char_embedding.py --- fastNLP/embeddings/char_embedding.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fastNLP/embeddings/char_embedding.py b/fastNLP/embeddings/char_embedding.py index b9e6659e..b0bd6796 100644 --- a/fastNLP/embeddings/char_embedding.py +++ b/fastNLP/embeddings/char_embedding.py @@ -24,6 +24,9 @@ class CNNCharEmbedding(TokenEmbedding): Example:: + >>> import torch + >>> from fastNLP import Vocabulary + >>> from fastNLP.embeddings import CNNCharEmbedding >>> vocab = Vocabulary().add_word_lst("The whether is good .".split()) >>> embed = CNNCharEmbedding(vocab, embed_size=50) >>> words = torch.LongTensor([[vocab.to_index(word) for word in "The whether is good .".split()]]) @@ -167,6 +170,9 @@ class LSTMCharEmbedding(TokenEmbedding): Example:: + >>> import torch + >>> from fastNLP import Vocabulary + >>> from fastNLP.embeddings import LSTMCharEmbedding >>> vocab = Vocabulary().add_word_lst("The whether is good .".split()) >>> embed = LSTMCharEmbedding(vocab, embed_size=50) >>> words = torch.LongTensor([[vocab.to_index(word) for word in "The whether is good .".split()]]) From 7fe4223d10934ab4f15ffeec1b9399a1415731ea Mon Sep 17 00:00:00 2001 From: zide05 <845465009@qq.com> Date: Fri, 16 Aug 2019 14:14:30 +0800 Subject: [PATCH 050/153] Update embedding.py --- fastNLP/embeddings/embedding.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fastNLP/embeddings/embedding.py b/fastNLP/embeddings/embedding.py index 111bacd0..a02e7a20 100644 --- a/fastNLP/embeddings/embedding.py +++ b/fastNLP/embeddings/embedding.py @@ -20,6 +20,7 @@ class Embedding(nn.Module): Example:: >>> import numpy as np + >>> from fastNLP.embeddings import Embedding >>> init_embed = (2000, 100) >>> embed = Embedding(init_embed) # 随机初始化一个具有2000个词,每个词表示为100维的词向量 >>> init_embed = np.zeros((2000, 100)) From 764123031778d881e9a15dae78ccbceb0f393a07 Mon Sep 17 00:00:00 2001 From: yhcc Date: Fri, 16 Aug 2019 14:46:44 +0800 Subject: [PATCH 051/153] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E8=87=AA=E5=8A=A8?= =?UTF-8?q?=E4=B8=8B=E8=BD=BD=E6=96=87=E4=BB=B6=E5=90=8E=E7=BC=80=E5=90=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/io/file_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fastNLP/io/file_utils.py b/fastNLP/io/file_utils.py index 8d04c8be..a4724818 100644 --- a/fastNLP/io/file_utils.py +++ b/fastNLP/io/file_utils.py @@ -9,7 +9,7 @@ import shutil from requests import HTTPError PRETRAINED_BERT_MODEL_DIR = { - 'en': 'bert-large-cased-wwm.zip', + 'en': 'bert-base-cased.zip', 'en-large-cased-wwm': 'bert-large-cased-wwm.zip', 'en-large-uncased-wwm': 'bert-large-uncased-wwm.zip', @@ -30,7 +30,7 @@ PRETRAINED_BERT_MODEL_DIR = { } PRETRAINED_ELMO_MODEL_DIR = { - 'en': 'elmo_en_Medium.tar.gz', + 'en': 'elmo_en_Medium.zip', 'en-small': "elmo_en_Small.zip", 'en-original-5.5b': 'elmo_en_Original_5.5B.zip', 'en-original': 'elmo_en_Original.zip', From 58d7742b6626f4c6a5850f4b8277c1c5cdb1c150 Mon Sep 17 00:00:00 2001 From: yh Date: Fri, 16 Aug 2019 16:12:00 +0800 Subject: [PATCH 052/153] =?UTF-8?q?1.=E5=A2=9E=E5=8A=A0EvaluateCallback?= =?UTF-8?q?=E5=AE=9E=E7=8E=B0=E5=9C=A8=E9=99=A4dev=E4=BB=A5=E5=A4=96?= =?UTF-8?q?=E7=9A=84=E6=95=B0=E6=8D=AE=E9=9B=86=E9=AA=8C=E8=AF=81=E7=9A=84?= =?UTF-8?q?=E9=9C=80=E6=B1=82;=202.StaticEmbedding=E5=A2=9E=E5=8A=A0?= =?UTF-8?q?=E4=B8=80=E4=B8=AAonly=5Ftrian=5Fmin=5Ffreq=E9=80=89=E9=A1=B9?= MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/__init__.py | 1 + fastNLP/core/callback.py | 95 ++++++++++++++++++++++---- fastNLP/core/trainer.py | 2 +- fastNLP/embeddings/bert_embedding.py | 12 ++-- fastNLP/embeddings/static_embedding.py | 11 ++- fastNLP/io/file_utils.py | 8 +-- 6 files changed, 103 insertions(+), 26 deletions(-) diff --git a/fastNLP/core/__init__.py b/fastNLP/core/__init__.py index d92e8f62..eeabda35 100644 --- a/fastNLP/core/__init__.py +++ b/fastNLP/core/__init__.py @@ -14,6 +14,7 @@ core 模块里实现了 fastNLP 的核心框架,常用的功能都可以从 fa """ from .batch import DataSetIter, BatchIter, TorchLoaderIter from .callback import Callback, GradientClipCallback, EarlyStopCallback, TensorboardCallback, LRScheduler, ControlC +from .callback import EvaluateCallback, FitlogCallback, SaveModelCallback from .const import Const from .dataset import DataSet from .field import FieldArray, Padder, AutoPadder, EngChar2DPadder diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py index 1cc5d53b..633c6f45 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -57,6 +57,7 @@ __all__ = [ "FitlogCallback", "LRScheduler", "ControlC", + "EvaluateCallback", "CallbackException", "EarlyStopError" @@ -504,10 +505,9 @@ class FitlogCallback(Callback): 并将验证结果写入到fitlog中。这些数据集的结果是根据dev上最好的结果报道的,即如果dev在第3个epoch取得了最佳,则 fitlog中记录的关于这些数据集的结果就是来自第三个epoch的结果。 - :param ~fastNLP.DataSet,Dict[~fastNLP.DataSet] data: 传入DataSet对象,会使用多个Trainer中的metric对数据进行验证。如果需要传入多个 - DataSet请通过dict的方式传入,dict的key将作为对应dataset的name传递给fitlog。若tester不为None时,data需要通过 - dict的方式传入。如果仅传入DataSet, 则被命名为test - :param ~fastNLP.Tester tester: Tester对象,将在on_valid_end时调用。tester中的DataSet会被称为为`test` + :param ~fastNLP.DataSet,Dict[~fastNLP.DataSet] data: 传入DataSet对象,会使用多个Trainer中的metric对数据进行验证。如果需要 + 传入多个DataSet请通过dict的方式传入,dict的key将作为对应dataset的name传递给fitlog。data的结果的名称以'data'开头。 + :param ~fastNLP.Tester,Dict[~fastNLP.Tester] tester: Tester对象,将在on_valid_end时调用。tester的结果的名称以'tester'开头 :param int log_loss_every: 多少个step记录一次loss(记录的是这几个batch的loss平均值),如果数据集较大建议将该值设置得 大一些,不然会导致log文件巨大。默认为0, 即不要记录loss。 :param int verbose: 是否在终端打印evaluation的结果,0不打印。 @@ -521,20 +521,23 @@ class FitlogCallback(Callback): self._log_exception = log_exception assert isinstance(log_loss_every, int) and log_loss_every>=0 if tester is not None: - assert isinstance(tester, Tester), "Only fastNLP.Tester allowed." - assert isinstance(data, dict) or data is None, "If tester is not None, only dict[DataSet] allowed for data." - if data is not None: - assert 'test' not in data, "Cannot use `test` as DataSet key, when tester is passed." - setattr(tester, 'verbose', 0) - self.testers['test'] = tester - + if isinstance(tester, dict): + for name, test in tester.items(): + if not isinstance(test, Tester): + raise TypeError(f"{name} in tester is not a valid fastNLP.Tester.") + self.testers['tester-' + name] = test + if isinstance(tester, Tester): + self.testers['tester-test'] = tester + for tester in self.testers.values(): + setattr(tester, 'verbose', 0) + if isinstance(data, dict): for key, value in data.items(): assert isinstance(value, DataSet), f"Only DataSet object is allowed, not {type(value)}." 
for key, value in data.items(): - self.datasets[key] = value + self.datasets['data-' + key] = value elif isinstance(data, DataSet): - self.datasets['test'] = data + self.datasets['data-test'] = data elif data is not None: raise TypeError("data receives dict[DataSet] or DataSet object.") @@ -548,8 +551,11 @@ class FitlogCallback(Callback): if len(self.datasets) > 0: for key, data in self.datasets.items(): - tester = Tester(data=data, model=self.model, batch_size=self.batch_size, metrics=self.trainer.metrics, - verbose=0) + tester = Tester(data=data, model=self.model, + batch_size=self.trainer.kwargs.get('dev_batch_size', self.batch_size), + metrics=self.trainer.metrics, + verbose=0, + use_tqdm=self.trainer.use_tqdm) self.testers[key] = tester fitlog.add_progress(total_steps=self.n_steps) @@ -589,6 +595,65 @@ class FitlogCallback(Callback): fitlog.add_other(repr(exception), name='except_info') +class EvaluateCallback(Callback): + """ + 别名: :class:`fastNLP.EvaluateCallback` :class:`fastNLP.core.callback.EvaluateCallback` + + 该callback用于扩展Trainer训练过程中只能对dev数据进行验证的问题。 + + :param ~fastNLP.DataSet,Dict[~fastNLP.DataSet] data: 传入DataSet对象,会使用多个Trainer中的metric对数据进行验证。如果需要传入多个 + DataSet请通过dict的方式传入。 + :param ~fastNLP.Tester,Dict[~fastNLP.DataSet] tester: Tester对象,将在on_valid_end时调用。 + """ + + def __init__(self, data=None, tester=None): + super().__init__() + self.datasets = {} + self.testers = {} + if tester is not None: + if isinstance(tester, dict): + for name, test in tester.items(): + if not isinstance(test, Tester): + raise TypeError(f"{name} in tester is not a valid fastNLP.Tester.") + self.testers['tester-' + name] = test + if isinstance(tester, Tester): + self.testers['tester-test'] = tester + for tester in self.testers.values(): + setattr(tester, 'verbose', 0) + + if isinstance(data, dict): + for key, value in data.items(): + assert isinstance(value, DataSet), f"Only DataSet object is allowed, not {type(value)}." 
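# Usage sketch (the names below are illustrative, not part of this patch): EvaluateCallback lets a
# Trainer report metrics on extra DataSets every time it validates on dev, e.g.
#     trainer = Trainer(train_data, model, metrics=AccuracyMetric(), dev_data=dev_data,
#                       callbacks=[EvaluateCallback(data={'ood': extra_test_set})],
#                       dev_batch_size=64)   # dev_batch_size is picked up from Trainer's **kwargs
# A fastNLP.Tester (or a dict of Testers) can be passed via the `tester` argument instead of raw DataSets.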
+ for key, value in data.items(): + self.datasets['data-' + key] = value + elif isinstance(data, DataSet): + self.datasets['data-test'] = data + elif data is not None: + raise TypeError("data receives dict[DataSet] or DataSet object.") + + def on_train_begin(self): + if len(self.datasets) > 0and self.trainer.dev_data is None: + raise RuntimeError("Trainer has no dev data, you cannot pass extra DataSet to do evaluation.") + + if len(self.datasets) > 0: + for key, data in self.datasets.items(): + tester = Tester(data=data, model=self.model, + batch_size=self.trainer.kwargs.get('dev_batch_size', self.batch_size), + metrics=self.trainer.metrics, verbose=0, + use_tqdm=self.trainer.use_tqdm) + self.testers[key] = tester + + def on_valid_end(self, eval_result, metric_key, optimizer, better_result): + if len(self.testers) > 0: + for key, tester in self.testers.items(): + try: + eval_result = tester.test() + self.pbar.write("Evaluation on {}:".format(key)) + self.pbar.write(tester._format_eval_results(eval_result)) + except Exception: + self.pbar.write("Exception happens when evaluate on DataSet named `{}`.".format(key)) + + class LRScheduler(Callback): """ 别名::class:`fastNLP.LRScheduler` :class:`fastNLP.core.callback.LRScheduler` diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index a6f4f823..0d239048 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -690,7 +690,7 @@ class Trainer(object): (self.validate_every < 0 and self.step % len(data_iterator) == 0)) \ and self.dev_data is not None: eval_res = self._do_validation(epoch=epoch, step=self.step) - eval_str = "Evaluation at Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step, + eval_str = "Evaluation on dev at Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step, self.n_steps) + \ self.tester._format_eval_results(eval_res) pbar.write(eval_str + '\n') diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index fa56419b..8ec5fd50 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -74,7 +74,7 @@ class BertEmbedding(ContextualEmbedding): self.model = _WordBertModel(model_dir=model_dir, vocab=vocab, layers=layers, pool_method=pool_method, include_cls_sep=include_cls_sep, - pooled_cls=pooled_cls, auto_truncate=auto_truncate) + pooled_cls=pooled_cls, auto_truncate=auto_truncate, min_freq=2) self.requires_grad = requires_grad self._embed_size = len(self.model.layers)*self.model.encoder.hidden_size @@ -209,7 +209,7 @@ class BertWordPieceEncoder(nn.Module): class _WordBertModel(nn.Module): def __init__(self, model_dir:str, vocab:Vocabulary, layers:str='-1', pool_method:str='first', - include_cls_sep:bool=False, pooled_cls:bool=False, auto_truncate:bool=False): + include_cls_sep:bool=False, pooled_cls:bool=False, auto_truncate:bool=False, min_freq=2): super().__init__() self.tokenzier = BertTokenizer.from_pretrained(model_dir) @@ -238,9 +238,12 @@ class _WordBertModel(nn.Module): word_piece_dict = {'[CLS]':1, '[SEP]':1} # 用到的word_piece以及新增的 found_count = 0 self._has_sep_in_vocab = '[SEP]' in vocab # 用来判断传入的数据是否需要生成token_ids + if '[sep]' in vocab: + warnings.warn("Lower cased [sep] detected, it cannot be correctly recognized as [SEP] by BertEmbedding.") if "[CLS]" in vocab: warnings.warn("[CLS] detected in your vocabulary. 
BertEmbedding will add [CSL] and [SEP] to the begin " - "and end of the sentence automatically.") + "and end of the input automatically, make sure you don't add [CLS] and [SEP] at the begin" + " and end.") for word, index in vocab: if index == vocab.padding_idx: # pad是个特殊的符号 word = '[PAD]' @@ -250,7 +253,8 @@ class _WordBertModel(nn.Module): if len(word_pieces)==1: if not vocab._is_word_no_create_entry(word): # 如果是train中的值, 但是却没有找到 if index!=vocab.unknown_idx and word_pieces[0]=='[UNK]': # 说明这个词不在原始的word里面 - word_piece_dict[word] = 1 # 新增一个值 + if vocab.word_count[word]>=min_freq and not vocab._is_word_no_create_entry(word): #出现次数大于这个次数才新增 + word_piece_dict[word] = 1 # 新增一个值 continue for word_piece in word_pieces: word_piece_dict[word_piece] = 1 diff --git a/fastNLP/embeddings/static_embedding.py b/fastNLP/embeddings/static_embedding.py index 12011128..4b25ea8d 100644 --- a/fastNLP/embeddings/static_embedding.py +++ b/fastNLP/embeddings/static_embedding.py @@ -54,7 +54,7 @@ class StaticEmbedding(TokenEmbedding): :param int min_freq: Vocabulary词频数小于这个数量的word将被指向unk。 """ def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', embedding_dim=100, requires_grad: bool=True, - init_method=None, lower=False, dropout=0, word_dropout=0, normalize=False, min_freq=1): + init_method=None, lower=False, dropout=0, word_dropout=0, normalize=False, min_freq=1, **kwargs): super(StaticEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) # 得到cache_path @@ -73,7 +73,7 @@ class StaticEmbedding(TokenEmbedding): else: raise ValueError(f"Cannot recognize {model_dir_or_name}.") - # 缩小vocab + # 根据min_freq缩小vocab truncate_vocab = (vocab.min_freq is None and min_freq>1) or (vocab.min_freq and vocab.min_freq=min_freq and word_count Path: if not cache_path.exists(): # Download to temporary file, then copy to cache dir once finished. # Otherwise you get corrupt cache entries if the download gets interrupted. - fd, temp_filename = tempfile.mkstemp() - print("%s not found in cache, downloading to %s" % (url, temp_filename)) - # GET file object req = requests.get(url, stream=True, headers={"User-Agent": "fastNLP"}) if req.status_code == 200: content_length = req.headers.get("Content-Length") total = int(content_length) if content_length is not None else None progress = tqdm(unit="B", total=total, unit_scale=1) + fd, temp_filename = tempfile.mkstemp() + print("%s not found in cache, downloading to %s" % (url, temp_filename)) + with open(temp_filename, "wb") as temp_file: for chunk in req.iter_content(chunk_size=1024 * 16): if chunk: # filter out keep-alive new chunks @@ -373,7 +373,7 @@ def get_from_cache(url: str, cache_dir: Path = None) -> Path: os.remove(temp_filename) return get_filepath(cache_path) else: - raise HTTPError(f"Fail to download from {url}.") + raise HTTPError(f"Status code:{req.status_code}. 
Fail to download from {url}.") def unzip_file(file: Path, to: Path): From 3eb986f86fa806a7577779e1e9849baffcb701a1 Mon Sep 17 00:00:00 2001 From: zide05 <845465009@qq.com> Date: Fri, 16 Aug 2019 16:16:55 +0800 Subject: [PATCH 053/153] Update elmo_embedding.py --- fastNLP/embeddings/elmo_embedding.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fastNLP/embeddings/elmo_embedding.py b/fastNLP/embeddings/elmo_embedding.py index af94e8ec..73def086 100644 --- a/fastNLP/embeddings/elmo_embedding.py +++ b/fastNLP/embeddings/elmo_embedding.py @@ -21,6 +21,9 @@ class ElmoEmbedding(ContextualEmbedding): Example:: + >>> import torch + >>> from fastNLP import Vocabulary + >>> from fastNLP.embeddings import ElmoEmbedding >>> vocab = Vocabulary().add_word_lst("The whether is good .".split()) >>> # 使用不同层的concat的结果 >>> embed = ElmoEmbedding(vocab, model_dir_or_name='en', layers='1,2', requires_grad=False) From 5fac9867ae8290cb337584ca04d7bd22d96ded9e Mon Sep 17 00:00:00 2001 From: zide05 <845465009@qq.com> Date: Fri, 16 Aug 2019 16:45:21 +0800 Subject: [PATCH 054/153] Update stack_embedding.py --- fastNLP/embeddings/stack_embedding.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fastNLP/embeddings/stack_embedding.py b/fastNLP/embeddings/stack_embedding.py index 8091d598..d3ce462b 100644 --- a/fastNLP/embeddings/stack_embedding.py +++ b/fastNLP/embeddings/stack_embedding.py @@ -17,7 +17,7 @@ class StackEmbedding(TokenEmbedding): >>> from fastNLP import Vocabulary >>> from fastNLP.embeddings import StaticEmbedding >>> vocab = Vocabulary().add_word_lst("The whether is good .".split()) - >>> embed_1 = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50', requires_grad=True) + >>> embed_1 = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d', requires_grad=True) >>> embed_2 = StaticEmbedding(vocab, model_dir_or_name='en-word2vec-300', requires_grad=True) :param embeds: 一个由若干个TokenEmbedding组成的list,要求每一个TokenEmbedding的词表都保持一致 @@ -91,4 +91,4 @@ class StackEmbedding(TokenEmbedding): for embed in self.embeds: outputs.append(embed(words)) outputs = self.dropout(torch.cat(outputs, dim=-1)) - return outputs \ No newline at end of file + return outputs From e22a94f9f08346dca9132768b7dba455af246b3e Mon Sep 17 00:00:00 2001 From: zide05 <845465009@qq.com> Date: Fri, 16 Aug 2019 16:46:53 +0800 Subject: [PATCH 055/153] Update static_embedding.py --- fastNLP/embeddings/static_embedding.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fastNLP/embeddings/static_embedding.py b/fastNLP/embeddings/static_embedding.py index 94f7adb5..c2aa1c49 100644 --- a/fastNLP/embeddings/static_embedding.py +++ b/fastNLP/embeddings/static_embedding.py @@ -20,12 +20,14 @@ class StaticEmbedding(TokenEmbedding): 当前支持自动下载的预训练vector有以下的几种(待补充); Example:: - + + >>> from fastNLP import Vocabulary + >>> from fastNLP.embeddings import StaticEmbedding >>> vocab = Vocabulary().add_word_lst("The whether is good .".split()) - >>> embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-50') + >>> embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-50d') >>> vocab = Vocabulary().add_word_lst(["The", 'the', "THE"]) - >>> embed = StaticEmbedding(vocab, model_dir_or_name="en-glove-50", lower=True) + >>> embed = StaticEmbedding(vocab, model_dir_or_name="en-glove-50d", lower=True) >>> # "the", "The", "THE"它们共用一个vector,且将使用"the"在预训练词表中寻找它们的初始化表示。 >>> vocab = Vocabulary().add_word_lst(["The", "the", "THE"]) From e92408c543425cbdc14a137c596fb777589501be Mon Sep 17 00:00:00 2001 From: 
ChenXin Date: Fri, 16 Aug 2019 16:53:02 +0800 Subject: [PATCH 056/153] update docs of io.file_utils --- fastNLP/io/file_utils.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/fastNLP/io/file_utils.py b/fastNLP/io/file_utils.py index 8b2d1c79..9febfe4a 100644 --- a/fastNLP/io/file_utils.py +++ b/fastNLP/io/file_utils.py @@ -89,14 +89,16 @@ FASTNLP_EXTEND_EMBEDDING_URL = {'elmo': 'fastnlp_elmo_url.txt', def cached_path(url_or_filename: str, cache_dir: str = None, name=None) -> Path: """ 给定一个url,尝试通过url中的解析出来的文件名字filename到{cache_dir}/{name}/{filename}下寻找这个文件, - (1)如果cache_dir=None, 则cache_dir=~/.fastNLP/; 否则cache_dir=cache_dir - (2)如果name=None, 则没有中间的{name}这一层结构;否者中间结构就为{name} + + 1. 如果cache_dir=None, 则cache_dir=~/.fastNLP/; 否则cache_dir=cache_dir + 2. 如果name=None, 则没有中间的{name}这一层结构;否者中间结构就为{name} 如果有该文件,就直接返回路径 + 如果没有该文件,则尝试用传入的url下载 或者文件名(可以是具体的文件名,也可以是文件夹),先在cache_dir下寻找该文件是否存在,如果不存在则去下载, 并 - 将文件放入到cache_dir中. + 将文件放入到cache_dir中. :param str url_or_filename: 文件的下载url或者文件名称。 :param str cache_dir: 文件的缓存文件夹。如果为None,将使用"~/.fastNLP"这个默认路径 @@ -132,10 +134,13 @@ def cached_path(url_or_filename: str, cache_dir: str = None, name=None) -> Path: def get_filepath(filepath): """ 如果filepath为文件夹, + 如果内含多个文件, 返回filepath + 如果只有一个文件, 返回filepath + filename 如果filepath为文件 + 返回filepath :param str filepath: 路径 @@ -155,9 +160,9 @@ def get_filepath(filepath): def get_cache_path(): """ - 获取默认的fastNLP存放路径, 如果将FASTNLP_CACHE_PATH设置在了环境变量中,将使用环境变量的值,使得不用每个用户都去下载。 + 获取fastNLP默认cache的存放路径, 如果将FASTNLP_CACHE_PATH设置在了环境变量中,将使用环境变量的值,使得不用每个用户都去下载。 - :return: str + :return str: 存放路径 """ if 'FASTNLP_CACHE_DIR' in os.environ: fastnlp_cache_dir = os.environ.get('FASTNLP_CACHE_DIR') @@ -262,8 +267,9 @@ def _get_dataset_url(name): def split_filename_suffix(filepath): """ - 给定filepath返回对应的name和suffix. 如果后缀是多个点,仅支持.tar.gz类型 - :param filepath: + 给定filepath 返回对应的name和suffix. 如果后缀是多个点,仅支持.tar.gz类型 + + :param filepath: 文件路径 :return: filename, suffix """ filename = os.path.basename(filepath) @@ -278,6 +284,10 @@ def get_from_cache(url: str, cache_dir: Path = None) -> Path: 文件解压,将解压后的文件全部放在cache_dir文件夹中。 如果从url中下载的资源解压后有多个文件,则返回目录的路径; 如果只有一个资源文件,则返回具体的路径。 + + :param url: 资源的 url + :param cache_dir: cache 目录 + :return: 路径 """ cache_dir.mkdir(parents=True, exist_ok=True) @@ -394,12 +404,12 @@ def untar_gz_file(file: Path, to: Path): def match_file(dir_name: str, cache_dir: Path) -> str: """ - 匹配的原则是,在cache_dir下的文件: (1) 与dir_name完全一致; (2) 除了后缀以外和dir_name完全一致。 + 匹配的原则是: 在cache_dir下的文件与dir_name完全一致, 或除了后缀以外和dir_name完全一致。 如果找到了两个匹配的结果将报错. 
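Putting these helpers together, a user-extended download entry might be registered as in the sketch below; the dataset name and URL are hypothetical, and the file name comes from ``FASTNLP_EXTEND_DATASET_URL`` above.

    import os
    from fastNLP.io.file_utils import get_cache_path

    cache_root = get_cache_path()      # ~/.fastNLP unless overridden through the environment variable
    os.makedirs(cache_root, exist_ok=True)
    # one "name<TAB>url" line per extra dataset in fastnlp_dataset_url.txt
    with open(os.path.join(cache_root, 'fastnlp_dataset_url.txt'), 'a', encoding='utf-8') as f:
        f.write('my-corpus\thttp://example.com/my-corpus.zip\n')
    # _get_dataset_url('my-corpus') now resolves to this URL before falling back to DATASET_DIR,
    # and cached_path() will download and cache it under {cache_root}/dataset/ on first use.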
如果找到了则返回匹配的文件的名称; 没有找到返回空字符串 :param dir_name: 需要匹配的名称 :param cache_dir: 在该目录下找匹配dir_name是否存在 - :return: str + :return str: 做为匹配结果的字符串 """ files = os.listdir(cache_dir) matched_filenames = [] From 0032f7788af60177609e47174c1d3e9244168dc5 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Fri, 16 Aug 2019 17:40:16 +0800 Subject: [PATCH 057/153] update docs-tools --- docs/Makefile | 2 +- docs/format.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/Makefile b/docs/Makefile index b9f1cf95..b41beb44 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -20,7 +20,7 @@ server: cd build/html && python -m http.server dev: - rm -rf build/html && make html && make server + rm -rf build && make html && make server .PHONY: help Makefile diff --git a/docs/format.py b/docs/format.py index 7cc341c2..67671ae7 100644 --- a/docs/format.py +++ b/docs/format.py @@ -59,7 +59,10 @@ def clear(path='./source/'): else: shorten(path + file, to_delete) for file in to_delete: - os.remove(path + file + ".rst") + try: + os.remove(path + file + ".rst") + except: + pass clear() From de17c9a7d346129d6a0c5fee97210fe4f19bb593 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Fri, 16 Aug 2019 17:40:43 +0800 Subject: [PATCH 058/153] rename base_loader file as data_bundle --- docs/source/fastNLP.io.base_loader.rst | 7 - docs/source/fastNLP.io.data_bundle.rst | 7 + docs/source/fastNLP.io.rst | 2 +- fastNLP/io/__init__.py | 7 +- fastNLP/io/config_io.py | 313 --------------- fastNLP/io/{base_loader.py => data_bundle.py} | 2 - fastNLP/io/data_loader/conll.py | 4 +- fastNLP/io/data_loader/imdb.py | 2 +- fastNLP/io/data_loader/matching.py | 2 +- fastNLP/io/data_loader/mtl.py | 2 +- fastNLP/io/data_loader/people_daily.py | 2 +- fastNLP/io/data_loader/sst.py | 2 +- fastNLP/io/data_loader/yelp.py | 2 +- fastNLP/io/dataset_loader.py | 2 +- fastNLP/io/embed_loader.py | 2 +- fastNLP/io/loader/__init__.py | 3 +- fastNLP/io/loader/classification.py | 60 +-- fastNLP/io/loader/loader.py | 38 +- fastNLP/io/loader/matching.py | 3 +- fastNLP/io/model_io.py | 2 +- fastNLP/io/pipe/classification.py | 2 +- .../Summarization/Baseline/data/dataloader.py | 376 +++++++++--------- .../Summarization/BertSum/dataloader.py | 2 +- .../data_load/cr_loader.py | 2 +- .../joint_cws_parse/data/data_loader.py | 2 +- .../matching/data/MatchingDataLoader.py | 2 +- .../chinese_ner/data/ChineseNER.py | 2 +- .../cws/data/CWSDataLoader.py | 2 +- .../ner/data/Conll2003Loader.py | 2 +- .../ner/data/OntoNoteLoader.py | 2 +- .../text_classification/data/IMDBLoader.py | 2 +- .../text_classification/data/MTL16Loader.py | 2 +- .../text_classification/data/sstloader.py | 2 +- .../text_classification/data/yelpLoader.py | 2 +- 34 files changed, 275 insertions(+), 591 deletions(-) delete mode 100644 docs/source/fastNLP.io.base_loader.rst create mode 100644 docs/source/fastNLP.io.data_bundle.rst delete mode 100644 fastNLP/io/config_io.py rename fastNLP/io/{base_loader.py => data_bundle.py} (99%) diff --git a/docs/source/fastNLP.io.base_loader.rst b/docs/source/fastNLP.io.base_loader.rst deleted file mode 100644 index 057867f4..00000000 --- a/docs/source/fastNLP.io.base_loader.rst +++ /dev/null @@ -1,7 +0,0 @@ -fastNLP.io.base\_loader -======================= - -.. 
automodule:: fastNLP.io.base_loader - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/source/fastNLP.io.data_bundle.rst b/docs/source/fastNLP.io.data_bundle.rst new file mode 100644 index 00000000..a6273956 --- /dev/null +++ b/docs/source/fastNLP.io.data_bundle.rst @@ -0,0 +1,7 @@ +fastNLP.io.data\_bundle +======================= + +.. automodule:: fastNLP.io.data_bundle + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/fastNLP.io.rst b/docs/source/fastNLP.io.rst index 0a006709..0cd5d3f2 100644 --- a/docs/source/fastNLP.io.rst +++ b/docs/source/fastNLP.io.rst @@ -20,7 +20,7 @@ Submodules .. toctree:: - fastNLP.io.base_loader + fastNLP.io.data_bundle fastNLP.io.dataset_loader fastNLP.io.embed_loader fastNLP.io.file_utils diff --git a/fastNLP/io/__init__.py b/fastNLP/io/__init__.py index 5234b209..90d4d12c 100644 --- a/fastNLP/io/__init__.py +++ b/fastNLP/io/__init__.py @@ -12,10 +12,9 @@ 这些类的使用方法如下: """ __all__ = [ - 'EmbedLoader', - 'DataBundle', - 'DataSetLoader', + + 'EmbedLoader', 'YelpLoader', 'YelpFullLoader', @@ -69,7 +68,7 @@ __all__ = [ ] from .embed_loader import EmbedLoader -from .base_loader import DataBundle, DataSetLoader +from .data_bundle import DataBundle from .dataset_loader import CSVLoader, JsonLoader from .model_io import ModelLoader, ModelSaver diff --git a/fastNLP/io/config_io.py b/fastNLP/io/config_io.py deleted file mode 100644 index ac349080..00000000 --- a/fastNLP/io/config_io.py +++ /dev/null @@ -1,313 +0,0 @@ -""" -用于读入和处理和保存 config 文件 - -.. todo:: - 这个模块中的类可能被抛弃? - -""" -__all__ = [ - "ConfigLoader", - "ConfigSection", - "ConfigSaver" -] - -import configparser -import json -import os - -from .base_loader import BaseLoader - - -class ConfigLoader(BaseLoader): - """ - 别名::class:`fastNLP.io.ConfigLoader` :class:`fastNLP.io.config_io.ConfigLoader` - - 读取配置文件的Loader - - :param str data_path: 配置文件的路径 - - """ - - def __init__(self, data_path=None): - super(ConfigLoader, self).__init__() - if data_path is not None: - self.config = self.parse(super(ConfigLoader, self).load(data_path)) - - @staticmethod - def parse(string): - raise NotImplementedError - - @staticmethod - def load_config(file_path, sections): - """ - 把配置文件的section 存入提供的 ``sections`` 中 - - :param str file_path: 配置文件的路径 - :param dict sections: 符合如下键值对组成的字典 `section_name(string)` : :class:`~fastNLP.io.ConfigSection` - - Example:: - - test_args = ConfigSection() - ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args}) - - """ - assert isinstance(sections, dict) - cfg = configparser.ConfigParser() - if not os.path.exists(file_path): - raise FileNotFoundError("config file {} not found. ".format(file_path)) - cfg.read(file_path) - for s in sections: - attr_list = [i for i in sections[s].__dict__.keys() if - not callable(getattr(sections[s], i)) and not i.startswith("__")] - if s not in cfg: - print('section %s not found in config file' % (s)) - continue - gen_sec = cfg[s] - for attr in gen_sec.keys(): - try: - val = json.loads(gen_sec[attr]) - # print(s, attr, val, type(val)) - if attr in attr_list: - assert type(val) == type(getattr(sections[s], attr)), \ - 'type not match, except %s but got %s' % \ - (type(getattr(sections[s], attr)), type(val)) - """ - if attr in attr_list then check its type and - update its value. 
- else add a new attr in sections[s] - """ - setattr(sections[s], attr, val) - except Exception as e: - print("cannot load attribute %s in section %s" - % (attr, s)) - pass - - -class ConfigSection(object): - """ - 别名::class:`fastNLP.io.ConfigSection` :class:`fastNLP.io.config_io.ConfigSection` - - ConfigSection是一个存储了一个section中所有键值对的数据结构,推荐使用此类的实例来配合 :meth:`ConfigLoader.load_config` 使用 - - """ - - def __init__(self): - super(ConfigSection, self).__init__() - - def __getitem__(self, key): - """ - :param key: str, the name of the attribute - :return attr: the value of this attribute - if key not in self.__dict__.keys(): - return self[key] - else: - raise AttributeError - """ - if key in self.__dict__.keys(): - return getattr(self, key) - raise AttributeError("do NOT have attribute %s" % key) - - def __setitem__(self, key, value): - """ - :param key: str, the name of the attribute - :param value: the value of this attribute - if key not in self.__dict__.keys(): - self[key] will be added - else: - self[key] will be updated - """ - if key in self.__dict__.keys(): - if not isinstance(value, type(getattr(self, key))): - raise AttributeError("attr %s except %s but got %s" % - (key, str(type(getattr(self, key))), str(type(value)))) - setattr(self, key, value) - - def __contains__(self, item): - """ - :param item: The key of item. - :return: True if the key in self.__dict__.keys() else False. - """ - return item in self.__dict__.keys() - - def __eq__(self, other): - """Overwrite the == operator - - :param other: Another ConfigSection() object which to be compared. - :return: True if value of each key in each ConfigSection() object are equal to the other, else False. - """ - for k in self.__dict__.keys(): - if k not in other.__dict__.keys(): - return False - if getattr(self, k) != getattr(self, k): - return False - - for k in other.__dict__.keys(): - if k not in self.__dict__.keys(): - return False - if getattr(self, k) != getattr(self, k): - return False - - return True - - def __ne__(self, other): - """Overwrite the != operator - - :param other: - :return: - """ - return not self.__eq__(other) - - @property - def data(self): - return self.__dict__ - - -class ConfigSaver(object): - """ - 别名::class:`fastNLP.io.ConfigSaver` :class:`fastNLP.io.config_io.ConfigSaver` - - ConfigSaver 是用来存储配置文件并解决相关冲突的类 - - :param str file_path: 配置文件的路径 - - """ - - def __init__(self, file_path): - self.file_path = file_path - if not os.path.exists(self.file_path): - raise FileNotFoundError("file {} NOT found!".__format__(self.file_path)) - - def _get_section(self, sect_name): - """ - This is the function to get the section with the section name. - - :param sect_name: The name of section what wants to load. - :return: The section. - """ - sect = ConfigSection() - ConfigLoader().load_config(self.file_path, {sect_name: sect}) - return sect - - def _read_section(self): - """ - This is the function to read sections from the config file. - - :return: sect_list, sect_key_list - sect_list: A list of ConfigSection(). - sect_key_list: A list of names in sect_list. 
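`config_io.py` is removed outright rather than renamed. For readers who relied on it: the essence of `ConfigLoader.load_config` was a thin wrapper over the standard library, so an equivalent can be written directly with `configparser` plus `json.loads` for typed values; a rough sketch, with a placeholder file path and section name::

    import configparser
    import json

    cfg = configparser.ConfigParser()
    cfg.read("config.cfg")              # placeholder path
    section = cfg["POS_test"]           # placeholder section name
    # values are stored as JSON literals, exactly as the deleted loader assumed
    values = {key: json.loads(section[key]) for key in section}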
- """ - sect_name = None - - sect_list = {} - sect_key_list = [] - - single_section = {} - single_section_key = [] - - with open(self.file_path, 'r') as f: - lines = f.readlines() - - for line in lines: - if line.startswith('[') and line.endswith(']\n'): - if sect_name is None: - pass - else: - sect_list[sect_name] = single_section, single_section_key - single_section = {} - single_section_key = [] - sect_key_list.append(sect_name) - sect_name = line[1: -2] - continue - - if line.startswith('#'): - single_section[line] = '#' - single_section_key.append(line) - continue - - if line.startswith('\n'): - single_section_key.append('\n') - continue - - if '=' not in line: - raise RuntimeError("can NOT load config file {}".__format__(self.file_path)) - - key = line.split('=', maxsplit=1)[0].strip() - value = line.split('=', maxsplit=1)[1].strip() + '\n' - single_section[key] = value - single_section_key.append(key) - - if sect_name is not None: - sect_list[sect_name] = single_section, single_section_key - sect_key_list.append(sect_name) - return sect_list, sect_key_list - - def _write_section(self, sect_list, sect_key_list): - """ - This is the function to write config file with section list and name list. - - :param sect_list: A list of ConfigSection() need to be writen into file. - :param sect_key_list: A list of name of sect_list. - :return: - """ - with open(self.file_path, 'w') as f: - for sect_key in sect_key_list: - single_section, single_section_key = sect_list[sect_key] - f.write('[' + sect_key + ']\n') - for key in single_section_key: - if key == '\n': - f.write('\n') - continue - if single_section[key] == '#': - f.write(key) - continue - f.write(key + ' = ' + single_section[key]) - f.write('\n') - - def save_config_file(self, section_name, section): - """ - 这个方法可以用来修改并保存配置文件中单独的一个 section - - :param str section_name: 需要保存的 section 的名字. 
- :param section: 你需要修改并保存的 section, :class:`~fastNLP.io.ConfigSaver` 类型 - """ - section_file = self._get_section(section_name) - if len(section_file.__dict__.keys()) == 0: # the section not in the file before - # append this section to config file - with open(self.file_path, 'a') as f: - f.write('[' + section_name + ']\n') - for k in section.__dict__.keys(): - f.write(k + ' = ') - if isinstance(section[k], str): - f.write('\"' + str(section[k]) + '\"\n\n') - else: - f.write(str(section[k]) + '\n\n') - else: - # the section exists - change_file = False - for k in section.__dict__.keys(): - if k not in section_file: - # find a new key in this section - change_file = True - break - if section_file[k] != section[k]: - change_file = True - break - if not change_file: - return - - sect_list, sect_key_list = self._read_section() - if section_name not in sect_key_list: - raise AttributeError() - - sect, sect_key = sect_list[section_name] - for k in section.__dict__.keys(): - if k not in sect_key: - if sect_key[-1] != '\n': - sect_key.append('\n') - sect_key.append(k) - sect[k] = str(section[k]) - if isinstance(section[k], str): - sect[k] = "\"" + sect[k] + "\"" - sect[k] = sect[k] + "\n" - sect_list[section_name] = sect, sect_key - self._write_section(sect_list, sect_key_list) diff --git a/fastNLP/io/base_loader.py b/fastNLP/io/data_bundle.py similarity index 99% rename from fastNLP/io/base_loader.py rename to fastNLP/io/data_bundle.py index 5cbd5bb1..4203294b 100644 --- a/fastNLP/io/base_loader.py +++ b/fastNLP/io/data_bundle.py @@ -1,7 +1,5 @@ __all__ = [ - "BaseLoader", 'DataBundle', - 'DataSetLoader', ] import _pickle as pickle diff --git a/fastNLP/io/data_loader/conll.py b/fastNLP/io/data_loader/conll.py index 0285173c..7083b98d 100644 --- a/fastNLP/io/data_loader/conll.py +++ b/fastNLP/io/data_loader/conll.py @@ -1,11 +1,11 @@ from ...core.dataset import DataSet from ...core.instance import Instance -from ..base_loader import DataSetLoader +from ..data_bundle import DataSetLoader from ..file_reader import _read_conll from typing import Union, Dict from ..utils import check_loader_paths -from ..base_loader import DataBundle +from ..data_bundle import DataBundle class ConllLoader(DataSetLoader): """ diff --git a/fastNLP/io/data_loader/imdb.py b/fastNLP/io/data_loader/imdb.py index d3636cde..c9dda76e 100644 --- a/fastNLP/io/data_loader/imdb.py +++ b/fastNLP/io/data_loader/imdb.py @@ -2,7 +2,7 @@ from typing import Union, Dict from ..embed_loader import EmbeddingOption, EmbedLoader -from ..base_loader import DataSetLoader, DataBundle +from ..data_bundle import DataSetLoader, DataBundle from ...core.vocabulary import VocabularyOption, Vocabulary from ...core.dataset import DataSet from ...core.instance import Instance diff --git a/fastNLP/io/data_loader/matching.py b/fastNLP/io/data_loader/matching.py index 1242b432..41c9a98d 100644 --- a/fastNLP/io/data_loader/matching.py +++ b/fastNLP/io/data_loader/matching.py @@ -4,7 +4,7 @@ from typing import Union, Dict, List from ...core.const import Const from ...core.vocabulary import Vocabulary -from ..base_loader import DataBundle, DataSetLoader +from ..data_bundle import DataBundle, DataSetLoader from ..file_utils import _get_base_url, cached_path, PRETRAINED_BERT_MODEL_DIR from ...modules.encoder.bert import BertTokenizer diff --git a/fastNLP/io/data_loader/mtl.py b/fastNLP/io/data_loader/mtl.py index 20824958..923aadfb 100644 --- a/fastNLP/io/data_loader/mtl.py +++ b/fastNLP/io/data_loader/mtl.py @@ -1,7 +1,7 @@ from typing import Union, Dict -from 
..base_loader import DataBundle +from ..data_bundle import DataBundle from ..dataset_loader import CSVLoader from ...core.vocabulary import Vocabulary, VocabularyOption from ...core.const import Const diff --git a/fastNLP/io/data_loader/people_daily.py b/fastNLP/io/data_loader/people_daily.py index 5efadb7d..afd66744 100644 --- a/fastNLP/io/data_loader/people_daily.py +++ b/fastNLP/io/data_loader/people_daily.py @@ -1,5 +1,5 @@ -from ..base_loader import DataSetLoader +from ..data_bundle import DataSetLoader from ...core.dataset import DataSet from ...core.instance import Instance from ...core.const import Const diff --git a/fastNLP/io/data_loader/sst.py b/fastNLP/io/data_loader/sst.py index c2e0eca1..2034fc2b 100644 --- a/fastNLP/io/data_loader/sst.py +++ b/fastNLP/io/data_loader/sst.py @@ -2,7 +2,7 @@ from typing import Union, Dict from nltk import Tree -from ..base_loader import DataBundle, DataSetLoader +from ..data_bundle import DataBundle, DataSetLoader from ..dataset_loader import CSVLoader from ...core.vocabulary import VocabularyOption, Vocabulary from ...core.dataset import DataSet diff --git a/fastNLP/io/data_loader/yelp.py b/fastNLP/io/data_loader/yelp.py index 15533b04..f2bc60c8 100644 --- a/fastNLP/io/data_loader/yelp.py +++ b/fastNLP/io/data_loader/yelp.py @@ -6,7 +6,7 @@ from ...core.const import Const from ...core.dataset import DataSet from ...core.instance import Instance from ...core.vocabulary import VocabularyOption, Vocabulary -from ..base_loader import DataBundle, DataSetLoader +from ..data_bundle import DataBundle, DataSetLoader from typing import Union, Dict from ..utils import check_loader_paths, get_tokenizer diff --git a/fastNLP/io/dataset_loader.py b/fastNLP/io/dataset_loader.py index e1e06ec9..82e96597 100644 --- a/fastNLP/io/dataset_loader.py +++ b/fastNLP/io/dataset_loader.py @@ -26,7 +26,7 @@ __all__ = [ from ..core.dataset import DataSet from ..core.instance import Instance from .file_reader import _read_csv, _read_json -from .base_loader import DataSetLoader +from .data_bundle import DataSetLoader class JsonLoader(DataSetLoader): diff --git a/fastNLP/io/embed_loader.py b/fastNLP/io/embed_loader.py index 91a0919c..48048983 100644 --- a/fastNLP/io/embed_loader.py +++ b/fastNLP/io/embed_loader.py @@ -9,7 +9,7 @@ import warnings import numpy as np from ..core.vocabulary import Vocabulary -from .base_loader import BaseLoader +from .data_bundle import BaseLoader from ..core.utils import Option diff --git a/fastNLP/io/loader/__init__.py b/fastNLP/io/loader/__init__.py index a4e6a6f5..bcb3b730 100644 --- a/fastNLP/io/loader/__init__.py +++ b/fastNLP/io/loader/__init__.py @@ -44,6 +44,8 @@ fastNLP 目前提供了如下的 Loader """ __all__ = [ + 'Loader', + 'YelpLoader', 'YelpFullLoader', 'YelpPolarityLoader', @@ -57,7 +59,6 @@ __all__ = [ 'OntoNotesNERLoader', 'CTBLoader', - 'Loader', 'CSVLoader', 'JsonLoader', diff --git a/fastNLP/io/loader/classification.py b/fastNLP/io/loader/classification.py index dd85b4fe..ad56101d 100644 --- a/fastNLP/io/loader/classification.py +++ b/fastNLP/io/loader/classification.py @@ -7,6 +7,7 @@ import random import shutil import numpy as np + class YelpLoader(Loader): """ 别名::class:`fastNLP.io.YelpLoader` :class:`fastNLP.io.loader.YelpLoader` @@ -14,6 +15,7 @@ class YelpLoader(Loader): 原始数据中内容应该为, 每一行为一个sample,第一个逗号之前为target,第一个逗号之后为文本内容。 Example:: + "1","I got 'new' tires from the..." "1","Don't waste your time..." @@ -28,11 +30,11 @@ class YelpLoader(Loader): "...", "..." 
""" - + def __init__(self): super(YelpLoader, self).__init__() - - def _load(self, path: str=None): + + def _load(self, path: str = None): ds = DataSet() with open(path, 'r', encoding='utf-8') as f: for line in f: @@ -69,12 +71,12 @@ class YelpFullLoader(YelpLoader): :param int seed: 划分dev时的随机数种子 :return: str, 数据集的目录地址 """ - + dataset_name = 'yelp-review-full' data_dir = self._get_dataset_path(dataset_name=dataset_name) if os.path.exists(os.path.join(data_dir, 'dev.csv')): # 存在dev的话,check是否需要重新下载 re_download = True - if dev_ratio>0: + if dev_ratio > 0: dev_line_count = 0 tr_line_count = 0 with open(os.path.join(data_dir, 'train.csv'), 'r', encoding='utf-8') as f1, \ @@ -83,14 +85,14 @@ class YelpFullLoader(YelpLoader): tr_line_count += 1 for line in f2: dev_line_count += 1 - if not np.isclose(dev_line_count, dev_ratio*(tr_line_count + dev_line_count), rtol=0.005): + if not np.isclose(dev_line_count, dev_ratio * (tr_line_count + dev_line_count), rtol=0.005): re_download = True else: re_download = False if re_download: shutil.rmtree(data_dir) data_dir = self._get_dataset_path(dataset_name=dataset_name) - + if not os.path.exists(os.path.join(data_dir, 'dev.csv')): if dev_ratio > 0: assert 0 < dev_ratio < 1, "dev_ratio should be in range (0,1)." @@ -109,7 +111,7 @@ class YelpFullLoader(YelpLoader): finally: if os.path.exists(os.path.join(data_dir, 'middle_file.csv')): os.remove(os.path.join(data_dir, 'middle_file.csv')) - + return data_dir @@ -131,7 +133,7 @@ class YelpPolarityLoader(YelpLoader): data_dir = self._get_dataset_path(dataset_name=dataset_name) if os.path.exists(os.path.join(data_dir, 'dev.csv')): # 存在dev的话,check是否符合比例要求 re_download = True - if dev_ratio>0: + if dev_ratio > 0: dev_line_count = 0 tr_line_count = 0 with open(os.path.join(data_dir, 'train.csv'), 'r', encoding='utf-8') as f1, \ @@ -140,14 +142,14 @@ class YelpPolarityLoader(YelpLoader): tr_line_count += 1 for line in f2: dev_line_count += 1 - if not np.isclose(dev_line_count, dev_ratio*(tr_line_count + dev_line_count), rtol=0.005): + if not np.isclose(dev_line_count, dev_ratio * (tr_line_count + dev_line_count), rtol=0.005): re_download = True else: re_download = False if re_download: shutil.rmtree(data_dir) data_dir = self._get_dataset_path(dataset_name=dataset_name) - + if not os.path.exists(os.path.join(data_dir, 'dev.csv')): if dev_ratio > 0: assert 0 < dev_ratio < 1, "dev_ratio should be in range (0,1)." @@ -166,7 +168,7 @@ class YelpPolarityLoader(YelpLoader): finally: if os.path.exists(os.path.join(data_dir, 'middle_file.csv')): os.remove(os.path.join(data_dir, 'middle_file.csv')) - + return data_dir @@ -185,10 +187,10 @@ class IMDBLoader(Loader): "...", "..." 
""" - + def __init__(self): super(IMDBLoader, self).__init__() - + def _load(self, path: str): dataset = DataSet() with open(path, 'r', encoding="utf-8") as f: @@ -201,12 +203,12 @@ class IMDBLoader(Loader): words = parts[1] if words: dataset.append(Instance(raw_words=words, target=target)) - + if len(dataset) == 0: raise RuntimeError(f"{path} has no valid data.") - + return dataset - + def download(self, dev_ratio: float = 0.1, seed: int = 0): """ 自动下载数据集,如果你使用了这个数据集,请引用以下的文章 @@ -221,9 +223,9 @@ class IMDBLoader(Loader): """ dataset_name = 'aclImdb' data_dir = self._get_dataset_path(dataset_name=dataset_name) - if os.path.exists(os.path.join(data_dir, 'dev.txt')): # 存在dev的话,check是否符合比例要求 + if os.path.exists(os.path.join(data_dir, 'dev.txt')): # 存在dev的话,check是否符合比例要求 re_download = True - if dev_ratio>0: + if dev_ratio > 0: dev_line_count = 0 tr_line_count = 0 with open(os.path.join(data_dir, 'train.txt'), 'r', encoding='utf-8') as f1, \ @@ -232,14 +234,14 @@ class IMDBLoader(Loader): tr_line_count += 1 for line in f2: dev_line_count += 1 - if not np.isclose(dev_line_count, dev_ratio*(tr_line_count + dev_line_count), rtol=0.005): + if not np.isclose(dev_line_count, dev_ratio * (tr_line_count + dev_line_count), rtol=0.005): re_download = True else: re_download = False if re_download: shutil.rmtree(data_dir) data_dir = self._get_dataset_path(dataset_name=dataset_name) - + if not os.path.exists(os.path.join(data_dir, 'dev.csv')): if dev_ratio > 0: assert 0 < dev_ratio < 1, "dev_ratio should be in range (0,1)." @@ -258,7 +260,7 @@ class IMDBLoader(Loader): finally: if os.path.exists(os.path.join(data_dir, 'middle_file.txt')): os.remove(os.path.join(data_dir, 'middle_file.txt')) - + return data_dir @@ -278,10 +280,10 @@ class SSTLoader(Loader): raw_words列是str。 """ - + def __init__(self): super().__init__() - + def _load(self, path: str): """ 从path读取SST文件 @@ -296,7 +298,7 @@ class SSTLoader(Loader): if line: ds.append(Instance(raw_words=line)) return ds - + def download(self): """ 自动下载数据集,如果你使用了这个数据集,请引用以下的文章 @@ -323,10 +325,10 @@ class SST2Loader(Loader): test的DataSet没有target列。 """ - + def __init__(self): super().__init__() - + def _load(self, path: str): """ 从path读取SST2文件 @@ -335,7 +337,7 @@ class SST2Loader(Loader): :return: DataSet """ ds = DataSet() - + with open(path, 'r', encoding='utf-8') as f: f.readline() # 跳过header if 'test' in os.path.split(path)[1]: @@ -356,7 +358,7 @@ class SST2Loader(Loader): if raw_words: ds.append(Instance(raw_words=raw_words, target=target)) return ds - + def download(self): """ 自动下载数据集,如果你使用了该数据集,请引用以下的文章 diff --git a/fastNLP/io/loader/loader.py b/fastNLP/io/loader/loader.py index 607d6920..296714bf 100644 --- a/fastNLP/io/loader/loader.py +++ b/fastNLP/io/loader/loader.py @@ -2,17 +2,21 @@ from ...core.dataset import DataSet from .. import DataBundle from ..utils import check_loader_paths from typing import Union, Dict -import os from ..file_utils import _get_dataset_url, get_cache_path, cached_path + class Loader: + """ + 各种数据 Loader 的基类,提供了 API 的参考. 
+ + """ def __init__(self): pass - - def _load(self, path:str) -> DataSet: + + def _load(self, path: str) -> DataSet: raise NotImplementedError - - def load(self, paths: Union[str, Dict[str, str]]=None) -> DataBundle: + + def load(self, paths: Union[str, Dict[str, str]] = None) -> DataBundle: """ 从指定一个或多个路径中的文件中读取数据,返回:class:`~fastNLP.io.DataBundle` 。 @@ -22,31 +26,25 @@ class Loader: (0) 如果为None,则先查看本地是否有缓存,如果没有则自动下载并缓存。 (1) 传入一个目录, 该目录下名称包含train的被认为是train,包含test的被认为是test,包含dev的被认为是dev,如果检测到多个文件 - 名包含'train'、 'dev'、 'test'则会报错 - - Example:: + 名包含'train'、 'dev'、 'test'则会报错:: data_bundle = ConllLoader().load('/path/to/dir') # 返回的DataBundle中datasets根据目录下是否检测到train、 # dev、 test等有所变化,可以通过以下的方式取出DataSet tr_data = data_bundle.datasets['train'] te_data = data_bundle.datasets['test'] # 如果目录下有文件包含test这个字段 - (2) 传入文件路径 - - Example:: + (2) 传入文件路径:: data_bundle = ConllLoader().load("/path/to/a/train.conll") # 返回DataBundle对象, datasets中仅包含'train' tr_data = data_bundle.datasets['train'] # 可以通过以下的方式取出DataSet - (3) 传入一个dict,比如train,dev,test不在同一个目录下,或者名称中不包含train, dev, test - - Example:: + (3) 传入一个dict,比如train,dev,test不在同一个目录下,或者名称中不包含train, dev, test:: paths = {'train':"/path/to/tr.conll", 'dev':"/to/validate.conll", "test":"/to/te.conll"} data_bundle = ConllLoader().load(paths) # 返回的DataBundle中的dataset中包含"train", "dev", "test" dev_data = data_bundle.datasets['dev'] - :return: 返回的:class:`~fastNLP.io.DataBundle` + :return: 返回的 :class:`~fastNLP.io.DataBundle` """ if paths is None: paths = self.download() @@ -54,10 +52,10 @@ class Loader: datasets = {name: self._load(path) for name, path in paths.items()} data_bundle = DataBundle(datasets=datasets) return data_bundle - + def download(self): raise NotImplementedError(f"{self.__class__} cannot download data automatically.") - + def _get_dataset_path(self, dataset_name): """ 传入dataset的名称,获取读取数据的目录。如果数据不存在,会尝试自动下载并缓存 @@ -65,11 +63,9 @@ class Loader: :param str dataset_name: 数据集的名称 :return: str, 数据集的目录地址。直接到该目录下读取相应的数据即可。 """ - + default_cache_path = get_cache_path() url = _get_dataset_url(dataset_name) output_dir = cached_path(url_or_filename=url, cache_dir=default_cache_path, name='dataset') - + return output_dir - - diff --git a/fastNLP/io/loader/matching.py b/fastNLP/io/loader/matching.py index 58fa0d6f..26455914 100644 --- a/fastNLP/io/loader/matching.py +++ b/fastNLP/io/loader/matching.py @@ -203,7 +203,8 @@ class QNLILoader(JsonLoader): """ 如果您的实验使用到了该数据,请引用 - TODO 补充 + .. 
todo:: + 补充 :return: """ diff --git a/fastNLP/io/model_io.py b/fastNLP/io/model_io.py index ffaa4ef5..22ced1ce 100644 --- a/fastNLP/io/model_io.py +++ b/fastNLP/io/model_io.py @@ -8,7 +8,7 @@ __all__ = [ import torch -from .base_loader import BaseLoader +from .data_bundle import BaseLoader class ModelLoader(BaseLoader): diff --git a/fastNLP/io/pipe/classification.py b/fastNLP/io/pipe/classification.py index 429b6552..daa17da9 100644 --- a/fastNLP/io/pipe/classification.py +++ b/fastNLP/io/pipe/classification.py @@ -1,6 +1,6 @@ from nltk import Tree -from ..base_loader import DataBundle +from ..data_bundle import DataBundle from ...core.vocabulary import Vocabulary from ...core.const import Const from ..loader.classification import IMDBLoader, YelpFullLoader, SSTLoader, SST2Loader, YelpPolarityLoader diff --git a/reproduction/Summarization/Baseline/data/dataloader.py b/reproduction/Summarization/Baseline/data/dataloader.py index 47cd0856..dcb294b0 100644 --- a/reproduction/Summarization/Baseline/data/dataloader.py +++ b/reproduction/Summarization/Baseline/data/dataloader.py @@ -1,188 +1,188 @@ -import pickle -import numpy as np - -from fastNLP.core.vocabulary import Vocabulary -from fastNLP.io.base_loader import DataBundle -from fastNLP.io.dataset_loader import JsonLoader -from fastNLP.core.const import Const - -from tools.logger import * - -WORD_PAD = "[PAD]" -WORD_UNK = "[UNK]" -DOMAIN_UNK = "X" -TAG_UNK = "X" - - -class SummarizationLoader(JsonLoader): - """ - 读取summarization数据集,读取的DataSet包含fields:: - - text: list(str),document - summary: list(str), summary - text_wd: list(list(str)),tokenized document - summary_wd: list(list(str)), tokenized summary - labels: list(int), - flatten_label: list(int), 0 or 1, flatten labels - domain: str, optional - tag: list(str), optional - - 数据来源: CNN_DailyMail Newsroom DUC - """ - - def __init__(self): - super(SummarizationLoader, self).__init__() - - def _load(self, path): - ds = super(SummarizationLoader, self)._load(path) - - def _lower_text(text_list): - return [text.lower() for text in text_list] - - def _split_list(text_list): - return [text.split() for text in text_list] - - def _convert_label(label, sent_len): - np_label = np.zeros(sent_len, dtype=int) - if label != []: - np_label[np.array(label)] = 1 - return np_label.tolist() - - ds.apply(lambda x: _lower_text(x['text']), new_field_name='text') - ds.apply(lambda x: _lower_text(x['summary']), new_field_name='summary') - ds.apply(lambda x:_split_list(x['text']), new_field_name='text_wd') - ds.apply(lambda x:_split_list(x['summary']), new_field_name='summary_wd') - ds.apply(lambda x:_convert_label(x["label"], len(x["text"])), new_field_name="flatten_label") - - return ds - - def process(self, paths, vocab_size, vocab_path, sent_max_len, doc_max_timesteps, domain=False, tag=False, load_vocab_file=True): - """ - :param paths: dict path for each dataset - :param vocab_size: int max_size for vocab - :param vocab_path: str vocab path - :param sent_max_len: int max token number of the sentence - :param doc_max_timesteps: int max sentence number of the document - :param domain: bool build vocab for publication, use 'X' for unknown - :param tag: bool build vocab for tag, use 'X' for unknown - :param load_vocab_file: bool build vocab (False) or load vocab (True) - :return: DataBundle - datasets: dict keys correspond to the paths dict - vocabs: dict key: vocab(if "train" in paths), domain(if domain=True), tag(if tag=True) - embeddings: optional - """ - - def _pad_sent(text_wd): - pad_text_wd = [] - for 
sent_wd in text_wd: - if len(sent_wd) < sent_max_len: - pad_num = sent_max_len - len(sent_wd) - sent_wd.extend([WORD_PAD] * pad_num) - else: - sent_wd = sent_wd[:sent_max_len] - pad_text_wd.append(sent_wd) - return pad_text_wd - - def _token_mask(text_wd): - token_mask_list = [] - for sent_wd in text_wd: - token_num = len(sent_wd) - if token_num < sent_max_len: - mask = [1] * token_num + [0] * (sent_max_len - token_num) - else: - mask = [1] * sent_max_len - token_mask_list.append(mask) - return token_mask_list - - def _pad_label(label): - text_len = len(label) - if text_len < doc_max_timesteps: - pad_label = label + [0] * (doc_max_timesteps - text_len) - else: - pad_label = label[:doc_max_timesteps] - return pad_label - - def _pad_doc(text_wd): - text_len = len(text_wd) - if text_len < doc_max_timesteps: - padding = [WORD_PAD] * sent_max_len - pad_text = text_wd + [padding] * (doc_max_timesteps - text_len) - else: - pad_text = text_wd[:doc_max_timesteps] - return pad_text - - def _sent_mask(text_wd): - text_len = len(text_wd) - if text_len < doc_max_timesteps: - sent_mask = [1] * text_len + [0] * (doc_max_timesteps - text_len) - else: - sent_mask = [1] * doc_max_timesteps - return sent_mask - - - datasets = {} - train_ds = None - for key, value in paths.items(): - ds = self.load(value) - # pad sent - ds.apply(lambda x:_pad_sent(x["text_wd"]), new_field_name="pad_text_wd") - ds.apply(lambda x:_token_mask(x["text_wd"]), new_field_name="pad_token_mask") - # pad document - ds.apply(lambda x:_pad_doc(x["pad_text_wd"]), new_field_name="pad_text") - ds.apply(lambda x:_sent_mask(x["pad_text_wd"]), new_field_name="seq_len") - ds.apply(lambda x:_pad_label(x["flatten_label"]), new_field_name="pad_label") - - # rename field - ds.rename_field("pad_text", Const.INPUT) - ds.rename_field("seq_len", Const.INPUT_LEN) - ds.rename_field("pad_label", Const.TARGET) - - # set input and target - ds.set_input(Const.INPUT, Const.INPUT_LEN) - ds.set_target(Const.TARGET, Const.INPUT_LEN) - - datasets[key] = ds - if "train" in key: - train_ds = datasets[key] - - vocab_dict = {} - if load_vocab_file == False: - logger.info("[INFO] Build new vocab from training dataset!") - if train_ds == None: - raise ValueError("Lack train file to build vocabulary!") - - vocabs = Vocabulary(max_size=vocab_size, padding=WORD_PAD, unknown=WORD_UNK) - vocabs.from_dataset(train_ds, field_name=["text_wd","summary_wd"]) - vocab_dict["vocab"] = vocabs - else: - logger.info("[INFO] Load existing vocab from %s!" 
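The helpers above (`_pad_sent`, `_pad_doc`, `_token_mask`, `_sent_mask`, `_pad_label`) turn a variable-length tokenized document into a fixed doc_max_timesteps x sent_max_len grid plus 0/1 masks. A tiny worked illustration of what they produce, with arbitrarily chosen values::

    sent_max_len, doc_max_timesteps = 4, 3
    text_wd = [["we", "like", "it"], ["good"]]

    # _pad_sent then _pad_doc (WORD_PAD == "[PAD]") yield:
    #   [["we",    "like",  "it",    "[PAD]"],
    #    ["good",  "[PAD]", "[PAD]", "[PAD]"],
    #    ["[PAD]", "[PAD]", "[PAD]", "[PAD]"]]
    # _token_mask(text_wd)            -> [[1, 1, 1, 0], [1, 0, 0, 0]]
    # _sent_mask(sentence-padded doc) -> [1, 1, 0]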
% vocab_path) - word_list = [] - with open(vocab_path, 'r', encoding='utf8') as vocab_f: - cnt = 2 # pad and unk - for line in vocab_f: - pieces = line.split("\t") - word_list.append(pieces[0]) - cnt += 1 - if cnt > vocab_size: - break - vocabs = Vocabulary(max_size=vocab_size, padding=WORD_PAD, unknown=WORD_UNK) - vocabs.add_word_lst(word_list) - vocabs.build_vocab() - vocab_dict["vocab"] = vocabs - - if domain == True: - domaindict = Vocabulary(padding=None, unknown=DOMAIN_UNK) - domaindict.from_dataset(train_ds, field_name="publication") - vocab_dict["domain"] = domaindict - if tag == True: - tagdict = Vocabulary(padding=None, unknown=TAG_UNK) - tagdict.from_dataset(train_ds, field_name="tag") - vocab_dict["tag"] = tagdict - - for ds in datasets.values(): - vocab_dict["vocab"].index_dataset(ds, field_name=Const.INPUT, new_field_name=Const.INPUT) - - return DataBundle(vocabs=vocab_dict, datasets=datasets) - - - +import pickle +import numpy as np + +from fastNLP.core.vocabulary import Vocabulary +from fastNLP.io.data_bundle import DataBundle +from fastNLP.io.dataset_loader import JsonLoader +from fastNLP.core.const import Const + +from tools.logger import * + +WORD_PAD = "[PAD]" +WORD_UNK = "[UNK]" +DOMAIN_UNK = "X" +TAG_UNK = "X" + + +class SummarizationLoader(JsonLoader): + """ + 读取summarization数据集,读取的DataSet包含fields:: + + text: list(str),document + summary: list(str), summary + text_wd: list(list(str)),tokenized document + summary_wd: list(list(str)), tokenized summary + labels: list(int), + flatten_label: list(int), 0 or 1, flatten labels + domain: str, optional + tag: list(str), optional + + 数据来源: CNN_DailyMail Newsroom DUC + """ + + def __init__(self): + super(SummarizationLoader, self).__init__() + + def _load(self, path): + ds = super(SummarizationLoader, self)._load(path) + + def _lower_text(text_list): + return [text.lower() for text in text_list] + + def _split_list(text_list): + return [text.split() for text in text_list] + + def _convert_label(label, sent_len): + np_label = np.zeros(sent_len, dtype=int) + if label != []: + np_label[np.array(label)] = 1 + return np_label.tolist() + + ds.apply(lambda x: _lower_text(x['text']), new_field_name='text') + ds.apply(lambda x: _lower_text(x['summary']), new_field_name='summary') + ds.apply(lambda x:_split_list(x['text']), new_field_name='text_wd') + ds.apply(lambda x:_split_list(x['summary']), new_field_name='summary_wd') + ds.apply(lambda x:_convert_label(x["label"], len(x["text"])), new_field_name="flatten_label") + + return ds + + def process(self, paths, vocab_size, vocab_path, sent_max_len, doc_max_timesteps, domain=False, tag=False, load_vocab_file=True): + """ + :param paths: dict path for each dataset + :param vocab_size: int max_size for vocab + :param vocab_path: str vocab path + :param sent_max_len: int max token number of the sentence + :param doc_max_timesteps: int max sentence number of the document + :param domain: bool build vocab for publication, use 'X' for unknown + :param tag: bool build vocab for tag, use 'X' for unknown + :param load_vocab_file: bool build vocab (False) or load vocab (True) + :return: DataBundle + datasets: dict keys correspond to the paths dict + vocabs: dict key: vocab(if "train" in paths), domain(if domain=True), tag(if tag=True) + embeddings: optional + """ + + def _pad_sent(text_wd): + pad_text_wd = [] + for sent_wd in text_wd: + if len(sent_wd) < sent_max_len: + pad_num = sent_max_len - len(sent_wd) + sent_wd.extend([WORD_PAD] * pad_num) + else: + sent_wd = sent_wd[:sent_max_len] + 
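`_convert_label`, kept unchanged in the rewritten file, maps a list of selected-sentence indices to a dense 0/1 vector over the document's sentences, which becomes the `flatten_label` field. A standalone version to show the behaviour::

    import numpy as np

    def convert_label(label, sent_len):
        # same logic as _convert_label above
        np_label = np.zeros(sent_len, dtype=int)
        if label != []:
            np_label[np.array(label)] = 1
        return np_label.tolist()

    print(convert_label([0, 2], sent_len=4))   # -> [1, 0, 1, 0]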
pad_text_wd.append(sent_wd) + return pad_text_wd + + def _token_mask(text_wd): + token_mask_list = [] + for sent_wd in text_wd: + token_num = len(sent_wd) + if token_num < sent_max_len: + mask = [1] * token_num + [0] * (sent_max_len - token_num) + else: + mask = [1] * sent_max_len + token_mask_list.append(mask) + return token_mask_list + + def _pad_label(label): + text_len = len(label) + if text_len < doc_max_timesteps: + pad_label = label + [0] * (doc_max_timesteps - text_len) + else: + pad_label = label[:doc_max_timesteps] + return pad_label + + def _pad_doc(text_wd): + text_len = len(text_wd) + if text_len < doc_max_timesteps: + padding = [WORD_PAD] * sent_max_len + pad_text = text_wd + [padding] * (doc_max_timesteps - text_len) + else: + pad_text = text_wd[:doc_max_timesteps] + return pad_text + + def _sent_mask(text_wd): + text_len = len(text_wd) + if text_len < doc_max_timesteps: + sent_mask = [1] * text_len + [0] * (doc_max_timesteps - text_len) + else: + sent_mask = [1] * doc_max_timesteps + return sent_mask + + + datasets = {} + train_ds = None + for key, value in paths.items(): + ds = self.load(value) + # pad sent + ds.apply(lambda x:_pad_sent(x["text_wd"]), new_field_name="pad_text_wd") + ds.apply(lambda x:_token_mask(x["text_wd"]), new_field_name="pad_token_mask") + # pad document + ds.apply(lambda x:_pad_doc(x["pad_text_wd"]), new_field_name="pad_text") + ds.apply(lambda x:_sent_mask(x["pad_text_wd"]), new_field_name="seq_len") + ds.apply(lambda x:_pad_label(x["flatten_label"]), new_field_name="pad_label") + + # rename field + ds.rename_field("pad_text", Const.INPUT) + ds.rename_field("seq_len", Const.INPUT_LEN) + ds.rename_field("pad_label", Const.TARGET) + + # set input and target + ds.set_input(Const.INPUT, Const.INPUT_LEN) + ds.set_target(Const.TARGET, Const.INPUT_LEN) + + datasets[key] = ds + if "train" in key: + train_ds = datasets[key] + + vocab_dict = {} + if load_vocab_file == False: + logger.info("[INFO] Build new vocab from training dataset!") + if train_ds == None: + raise ValueError("Lack train file to build vocabulary!") + + vocabs = Vocabulary(max_size=vocab_size, padding=WORD_PAD, unknown=WORD_UNK) + vocabs.from_dataset(train_ds, field_name=["text_wd","summary_wd"]) + vocab_dict["vocab"] = vocabs + else: + logger.info("[INFO] Load existing vocab from %s!" 
% vocab_path) + word_list = [] + with open(vocab_path, 'r', encoding='utf8') as vocab_f: + cnt = 2 # pad and unk + for line in vocab_f: + pieces = line.split("\t") + word_list.append(pieces[0]) + cnt += 1 + if cnt > vocab_size: + break + vocabs = Vocabulary(max_size=vocab_size, padding=WORD_PAD, unknown=WORD_UNK) + vocabs.add_word_lst(word_list) + vocabs.build_vocab() + vocab_dict["vocab"] = vocabs + + if domain == True: + domaindict = Vocabulary(padding=None, unknown=DOMAIN_UNK) + domaindict.from_dataset(train_ds, field_name="publication") + vocab_dict["domain"] = domaindict + if tag == True: + tagdict = Vocabulary(padding=None, unknown=TAG_UNK) + tagdict.from_dataset(train_ds, field_name="tag") + vocab_dict["tag"] = tagdict + + for ds in datasets.values(): + vocab_dict["vocab"].index_dataset(ds, field_name=Const.INPUT, new_field_name=Const.INPUT) + + return DataBundle(vocabs=vocab_dict, datasets=datasets) + + + diff --git a/reproduction/Summarization/BertSum/dataloader.py b/reproduction/Summarization/BertSum/dataloader.py index c5201261..6af797e4 100644 --- a/reproduction/Summarization/BertSum/dataloader.py +++ b/reproduction/Summarization/BertSum/dataloader.py @@ -3,7 +3,7 @@ from datetime import timedelta from fastNLP.io.dataset_loader import JsonLoader from fastNLP.modules.encoder._bert import BertTokenizer -from fastNLP.io.base_loader import DataBundle +from fastNLP.io.data_bundle import DataBundle from fastNLP.core.const import Const class BertData(JsonLoader): diff --git a/reproduction/coreference_resolution/data_load/cr_loader.py b/reproduction/coreference_resolution/data_load/cr_loader.py index a424b0d1..5ed73473 100644 --- a/reproduction/coreference_resolution/data_load/cr_loader.py +++ b/reproduction/coreference_resolution/data_load/cr_loader.py @@ -1,7 +1,7 @@ from fastNLP.io.dataset_loader import JsonLoader,DataSet,Instance from fastNLP.io.file_reader import _read_json from fastNLP.core.vocabulary import Vocabulary -from fastNLP.io.base_loader import DataBundle +from fastNLP.io.data_bundle import DataBundle from reproduction.coreference_resolution.model.config import Config import reproduction.coreference_resolution.model.preprocess as preprocess diff --git a/reproduction/joint_cws_parse/data/data_loader.py b/reproduction/joint_cws_parse/data/data_loader.py index 3e6fec4b..4df46b04 100644 --- a/reproduction/joint_cws_parse/data/data_loader.py +++ b/reproduction/joint_cws_parse/data/data_loader.py @@ -1,6 +1,6 @@ -from fastNLP.io.base_loader import DataSetLoader, DataBundle +from fastNLP.io.data_bundle import DataSetLoader, DataBundle from fastNLP.io.data_loader import ConllLoader import numpy as np diff --git a/reproduction/matching/data/MatchingDataLoader.py b/reproduction/matching/data/MatchingDataLoader.py index bba26a8a..f13618aa 100644 --- a/reproduction/matching/data/MatchingDataLoader.py +++ b/reproduction/matching/data/MatchingDataLoader.py @@ -9,7 +9,7 @@ from typing import Union, Dict from fastNLP.core.const import Const from fastNLP.core.vocabulary import Vocabulary -from fastNLP.io.base_loader import DataBundle, DataSetLoader +from fastNLP.io.data_bundle import DataBundle, DataSetLoader from fastNLP.io.dataset_loader import JsonLoader, CSVLoader from fastNLP.io.file_utils import _get_base_url, cached_path, PRETRAINED_BERT_MODEL_DIR from fastNLP.modules.encoder._bert import BertTokenizer diff --git a/reproduction/seqence_labelling/chinese_ner/data/ChineseNER.py b/reproduction/seqence_labelling/chinese_ner/data/ChineseNER.py index 0d292bdc..a2ee4663 100644 --- 
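The `load_vocab_file` branch above expects a vocab file with one word<TAB>count entry per line, keeps only the first column, reserves two ids for [PAD]/[UNK] (hence `cnt = 2`), and stops once `vocab_size` is reached. Building the same `Vocabulary` from an already extracted word list, with toy values::

    from fastNLP.core.vocabulary import Vocabulary

    word_list = ["the", "movie", "is"]          # first column of each vocab-file line
    vocabs = Vocabulary(max_size=50000, padding="[PAD]", unknown="[UNK]")
    vocabs.add_word_lst(word_list)
    vocabs.build_vocab()
    print(vocabs.to_index("movie"))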
a/reproduction/seqence_labelling/chinese_ner/data/ChineseNER.py +++ b/reproduction/seqence_labelling/chinese_ner/data/ChineseNER.py @@ -1,6 +1,6 @@ -from fastNLP.io.base_loader import DataSetLoader, DataBundle +from fastNLP.io.data_bundle import DataSetLoader, DataBundle from fastNLP.io import ConllLoader from reproduction.seqence_labelling.ner.data.utils import iob2bioes, iob2 from fastNLP import Const diff --git a/reproduction/seqence_labelling/cws/data/CWSDataLoader.py b/reproduction/seqence_labelling/cws/data/CWSDataLoader.py index 3c82d814..5f69c0ad 100644 --- a/reproduction/seqence_labelling/cws/data/CWSDataLoader.py +++ b/reproduction/seqence_labelling/cws/data/CWSDataLoader.py @@ -1,7 +1,7 @@ from fastNLP.io.embed_loader import EmbeddingOption, EmbedLoader from fastNLP.core.vocabulary import VocabularyOption -from fastNLP.io.base_loader import DataSetLoader, DataBundle +from fastNLP.io.data_bundle import DataSetLoader, DataBundle from typing import Union, Dict, List, Iterator from fastNLP import DataSet from fastNLP import Instance diff --git a/reproduction/seqence_labelling/ner/data/Conll2003Loader.py b/reproduction/seqence_labelling/ner/data/Conll2003Loader.py index 1aeddcf8..0af4681e 100644 --- a/reproduction/seqence_labelling/ner/data/Conll2003Loader.py +++ b/reproduction/seqence_labelling/ner/data/Conll2003Loader.py @@ -1,6 +1,6 @@ from fastNLP.core.vocabulary import VocabularyOption -from fastNLP.io.base_loader import DataSetLoader, DataBundle +from fastNLP.io.data_bundle import DataSetLoader, DataBundle from typing import Union, Dict from fastNLP import Vocabulary from fastNLP import Const diff --git a/reproduction/seqence_labelling/ner/data/OntoNoteLoader.py b/reproduction/seqence_labelling/ner/data/OntoNoteLoader.py index a6070f39..25c6f29b 100644 --- a/reproduction/seqence_labelling/ner/data/OntoNoteLoader.py +++ b/reproduction/seqence_labelling/ner/data/OntoNoteLoader.py @@ -1,5 +1,5 @@ from fastNLP.core.vocabulary import VocabularyOption -from fastNLP.io.base_loader import DataSetLoader, DataBundle +from fastNLP.io.data_bundle import DataSetLoader, DataBundle from typing import Union, Dict from fastNLP import DataSet from fastNLP import Vocabulary diff --git a/reproduction/text_classification/data/IMDBLoader.py b/reproduction/text_classification/data/IMDBLoader.py index 94244431..1585fe44 100644 --- a/reproduction/text_classification/data/IMDBLoader.py +++ b/reproduction/text_classification/data/IMDBLoader.py @@ -1,6 +1,6 @@ from fastNLP.io.embed_loader import EmbeddingOption, EmbedLoader from fastNLP.core.vocabulary import VocabularyOption -from fastNLP.io.base_loader import DataSetLoader, DataBundle +from fastNLP.io.data_bundle import DataSetLoader, DataBundle from typing import Union, Dict, List, Iterator from fastNLP import DataSet from fastNLP import Instance diff --git a/reproduction/text_classification/data/MTL16Loader.py b/reproduction/text_classification/data/MTL16Loader.py index 68969069..225fffe6 100644 --- a/reproduction/text_classification/data/MTL16Loader.py +++ b/reproduction/text_classification/data/MTL16Loader.py @@ -1,6 +1,6 @@ from fastNLP.io.embed_loader import EmbeddingOption, EmbedLoader from fastNLP.core.vocabulary import VocabularyOption -from fastNLP.io.base_loader import DataSetLoader, DataBundle +from fastNLP.io.data_bundle import DataSetLoader, DataBundle from typing import Union, Dict, List, Iterator from fastNLP import DataSet from fastNLP import Instance diff --git a/reproduction/text_classification/data/sstloader.py 
b/reproduction/text_classification/data/sstloader.py index fa4d1837..b635a14a 100644 --- a/reproduction/text_classification/data/sstloader.py +++ b/reproduction/text_classification/data/sstloader.py @@ -1,6 +1,6 @@ from typing import Iterable from nltk import Tree -from fastNLP.io.base_loader import DataBundle, DataSetLoader +from fastNLP.io.data_bundle import DataBundle, DataSetLoader from fastNLP.core.vocabulary import VocabularyOption, Vocabulary from fastNLP import DataSet from fastNLP import Instance diff --git a/reproduction/text_classification/data/yelpLoader.py b/reproduction/text_classification/data/yelpLoader.py index d2272a88..1f7634fc 100644 --- a/reproduction/text_classification/data/yelpLoader.py +++ b/reproduction/text_classification/data/yelpLoader.py @@ -4,7 +4,7 @@ from typing import Iterable from fastNLP import DataSet, Instance, Vocabulary from fastNLP.core.vocabulary import VocabularyOption from fastNLP.io import JsonLoader -from fastNLP.io.base_loader import DataBundle,DataSetLoader +from fastNLP.io.data_bundle import DataBundle,DataSetLoader from fastNLP.io.embed_loader import EmbeddingOption from fastNLP.io.file_reader import _read_json from typing import Union, Dict From fb82c66b4c8d2521816b7648d9e93eeef31a82fa Mon Sep 17 00:00:00 2001 From: YanqunJiang Date: Fri, 16 Aug 2019 17:51:07 +0800 Subject: [PATCH 059/153] =?UTF-8?q?=E5=A2=9E=E5=8A=A0char=5Fembedding?= =?UTF-8?q?=E5=8F=AF=E4=BD=BF=E7=94=A8=E9=A2=84=E8=AE=AD=E7=BB=83=E7=9A=84?= =?UTF-8?q?character=20embedding=E7=9A=84=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/embeddings/char_embedding.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/fastNLP/embeddings/char_embedding.py b/fastNLP/embeddings/char_embedding.py index b9e6659e..8243e148 100644 --- a/fastNLP/embeddings/char_embedding.py +++ b/fastNLP/embeddings/char_embedding.py @@ -9,6 +9,7 @@ import torch.nn as nn import torch.nn.functional as F from typing import List +from .static_embedding import StaticEmbedding from ..modules.encoder.lstm import LSTM from ..core.vocabulary import Vocabulary from .embedding import TokenEmbedding @@ -41,10 +42,13 @@ class CNNCharEmbedding(TokenEmbedding): :param pool_method: character的表示在合成一个表示时所使用的pool方法,支持'avg', 'max'. :param activation: CNN之后使用的激活方法,支持'relu', 'sigmoid', 'tanh' 或者自定义函数. :param min_char_freq: character的最少出现次数。默认值为2. + :param pre_train_char_embed:可以有两种方式调用预训练好的static embedding:第一种是传入embedding文件夹(文件夹下应该只有一个 + 以.txt作为后缀的文件)或文件路径;第二种是传入embedding的名称,第二种情况将自动查看缓存中是否存在该模型,没有的话将自动下载。 + 如果输入为None则使用embedding_dim的维度随机初始化一个embedding. 
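Patch 059 lets `CNNCharEmbedding`/`LSTMCharEmbedding` initialise their internal character embedding from a pre-trained static embedding via `pre_train_char_embed`, falling back to a randomly initialised `nn.Embedding` when it is empty. A usage sketch; the embedding path below is a placeholder, not a verified fastNLP identifier::

    from fastNLP import Vocabulary
    from fastNLP.embeddings.char_embedding import CNNCharEmbedding

    vocab = Vocabulary()
    vocab.add_word_lst("this is a demo .".split())

    # previous behaviour: characters get a randomly initialised embedding
    embed_random = CNNCharEmbedding(vocab, embed_size=50, char_emb_size=50)
    # new option: characters are initialised from a pre-trained file (placeholder path)
    embed_pretrained = CNNCharEmbedding(vocab, embed_size=50, char_emb_size=50,
                                        pre_train_char_embed='/path/to/char_embed.txt')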
""" def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, word_dropout:float=0, dropout:float=0.5, filter_nums: List[int]=(40, 30, 20), kernel_sizes: List[int]=(5, 3, 1), - pool_method: str='max', activation='relu', min_char_freq: int=2): + pool_method: str='max', activation='relu', min_char_freq: int=2, pre_train_char_embed: str=''): super(CNNCharEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) for kernel in kernel_sizes: @@ -85,7 +89,11 @@ class CNNCharEmbedding(TokenEmbedding): self.words_to_chars_embedding[index, :len(word)] = \ torch.LongTensor([self.char_vocab.to_index(c) for c in word]) self.word_lengths[index] = len(word) - self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size) + # self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size) + if len(pre_train_char_embed): + self.char_embedding = StaticEmbedding(self.char_vocab, pre_train_char_embed) + else: + self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size) self.convs = nn.ModuleList([nn.Conv1d( char_emb_size, filter_nums[i], kernel_size=kernel_sizes[i], bias=True, padding=kernel_sizes[i] // 2) @@ -184,10 +192,13 @@ class LSTMCharEmbedding(TokenEmbedding): :param activation: 激活函数,支持'relu', 'sigmoid', 'tanh', 或者自定义函数. :param min_char_freq: character的最小出现次数。默认值为2. :param bidirectional: 是否使用双向的LSTM进行encode。默认值为True。 + :param pre_train_char_embed:可以有两种方式调用预训练好的static embedding:第一种是传入embedding文件夹(文件夹下应该只有一个 + 以.txt作为后缀的文件)或文件路径;第二种是传入embedding的名称,第二种情况将自动查看缓存中是否存在该模型,没有的话将自动下载。 + 如果输入为None则使用embedding_dim的维度随机初始化一个embedding. """ def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, word_dropout:float=0, dropout:float=0.5, hidden_size=50,pool_method: str='max', activation='relu', min_char_freq: int=2, - bidirectional=True): + bidirectional=True, pre_train_char_embed: str=''): super(LSTMCharEmbedding, self).__init__(vocab) assert hidden_size % 2 == 0, "Only even kernel is allowed." @@ -227,7 +238,11 @@ class LSTMCharEmbedding(TokenEmbedding): self.words_to_chars_embedding[index, :len(word)] = \ torch.LongTensor([self.char_vocab.to_index(c) for c in word]) self.word_lengths[index] = len(word) - self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size) + # self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size) + if len(pre_train_char_embed): + self.char_embedding = StaticEmbedding(self.char_vocab, pre_train_char_embed) + else: + self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size) self.fc = nn.Linear(hidden_size, embed_size) hidden_size = hidden_size // 2 if bidirectional else hidden_size From 4da6239ace6cd1ad022789b3e8d8af40e4d7ab1c Mon Sep 17 00:00:00 2001 From: ChenXin Date: Fri, 16 Aug 2019 18:38:13 +0800 Subject: [PATCH 060/153] Still some bugs on CSVLoader and JsonLoader. 
These should be solved more clear --- fastNLP/io/loader/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fastNLP/io/loader/__init__.py b/fastNLP/io/loader/__init__.py index bcb3b730..1da3e125 100644 --- a/fastNLP/io/loader/__init__.py +++ b/fastNLP/io/loader/__init__.py @@ -59,8 +59,8 @@ __all__ = [ 'OntoNotesNERLoader', 'CTBLoader', - 'CSVLoader', - 'JsonLoader', + # 'CSVLoader', + # 'JsonLoader', 'CWSLoader', From 1faf4ba2fad759f9d0652853f0bf31a447563c16 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Fri, 16 Aug 2019 18:48:38 +0800 Subject: [PATCH 061/153] delete a out-date test case --- test/io/test_dataset_loader.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/test/io/test_dataset_loader.py b/test/io/test_dataset_loader.py index 492545f6..6fb8e4f7 100644 --- a/test/io/test_dataset_loader.py +++ b/test/io/test_dataset_loader.py @@ -61,17 +61,17 @@ class TestDatasetLoader(unittest.TestCase): print(info.datasets) os.remove(train), os.remove(test) - def test_import(self): - import fastNLP - from fastNLP.io import SNLILoader - ds = SNLILoader().process('test/data_for_tests/sample_snli.jsonl', to_lower=True, - get_index=True, seq_len_type='seq_len', extra_split=['-']) - assert 'train' in ds.datasets - assert len(ds.datasets) == 1 - assert len(ds.datasets['train']) == 3 - - ds = SNLILoader().process('test/data_for_tests/sample_snli.jsonl', to_lower=True, - get_index=True, seq_len_type='seq_len') - assert 'train' in ds.datasets - assert len(ds.datasets) == 1 - assert len(ds.datasets['train']) == 3 + # def test_import(self): + # import fastNLP + # from fastNLP.io import SNLILoader + # ds = SNLILoader().process('test/data_for_tests/sample_snli.jsonl', to_lower=True, + # get_index=True, seq_len_type='seq_len', extra_split=['-']) + # assert 'train' in ds.datasets + # assert len(ds.datasets) == 1 + # assert len(ds.datasets['train']) == 3 + # + # ds = SNLILoader().process('test/data_for_tests/sample_snli.jsonl', to_lower=True, + # get_index=True, seq_len_type='seq_len') + # assert 'train' in ds.datasets + # assert len(ds.datasets) == 1 + # assert len(ds.datasets['train']) == 3 From 4bee5a78f4fe0c7a761a67ef0d92e5294994a6d6 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Fri, 16 Aug 2019 18:58:38 +0800 Subject: [PATCH 062/153] fix static_embedding --- fastNLP/embeddings/static_embedding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastNLP/embeddings/static_embedding.py b/fastNLP/embeddings/static_embedding.py index 050a7fe1..c3fe7966 100644 --- a/fastNLP/embeddings/static_embedding.py +++ b/fastNLP/embeddings/static_embedding.py @@ -129,7 +129,7 @@ class StaticEmbedding(TokenEmbedding): word = word.lower() if word not in lowered_vocab and lowered_vocab._is_word_no_create_entry(word): continue # 如果不需要创建entry,已经默认unknown了 - words_to_words[index] = self.words_to_words[lowered_vocab.to_index(word)] + words_to_words[index] = words_to_words[lowered_vocab.to_index(word)] self.words_to_words = words_to_words self._word_unk_index = lowered_vocab.unknown_idx # 替换一下unknown的index else: From 23e283c45950d2c19eff14af956c28dffb9b7094 Mon Sep 17 00:00:00 2001 From: yh Date: Fri, 16 Aug 2019 22:04:38 +0800 Subject: [PATCH 063/153] =?UTF-8?q?=E4=BF=AE=E5=A4=8DStaticEmbedding?= =?UTF-8?q?=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/embeddings/static_embedding.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git 
a/fastNLP/embeddings/static_embedding.py b/fastNLP/embeddings/static_embedding.py index c3fe7966..15cb05f6 100644 --- a/fastNLP/embeddings/static_embedding.py +++ b/fastNLP/embeddings/static_embedding.py @@ -122,6 +122,7 @@ class StaticEmbedding(TokenEmbedding): unknown_idx = lowered_vocab.unknown_idx else: unknown_idx = embedding.size(0) - 1 # 否则是最后一个为unknow + self.words_to_words = nn.Parameter(torch.arange(len(vocab)).long(), requires_grad=False) words_to_words = nn.Parameter(torch.full((len(vocab),), fill_value=unknown_idx).long(), requires_grad=False) for word, index in vocab: @@ -129,7 +130,7 @@ class StaticEmbedding(TokenEmbedding): word = word.lower() if word not in lowered_vocab and lowered_vocab._is_word_no_create_entry(word): continue # 如果不需要创建entry,已经默认unknown了 - words_to_words[index] = words_to_words[lowered_vocab.to_index(word)] + words_to_words[index] = self.words_to_words[lowered_vocab.to_index(word)] self.words_to_words = words_to_words self._word_unk_index = lowered_vocab.unknown_idx # 替换一下unknown的index else: @@ -137,6 +138,7 @@ class StaticEmbedding(TokenEmbedding): embedding = self._load_with_vocab(model_path, vocab=vocab, init_method=init_method) else: embedding = self._randomly_init_embed(len(vocab), embedding_dim, init_method) + self.words_to_words = nn.Parameter(torch.arange(len(vocab)).long(), requires_grad=False) if normalize: embedding /= (torch.norm(embedding, dim=1, keepdim=True) + 1e-12) From f5571f17698013299a189e71e05ccfb0c413a6b2 Mon Sep 17 00:00:00 2001 From: yh Date: Fri, 16 Aug 2019 22:21:19 +0800 Subject: [PATCH 064/153] =?UTF-8?q?1.=E6=9B=B4=E6=96=B0=E4=BA=86loader?= =?UTF-8?q?=E5=92=8Cpipe=E7=9A=84=E6=96=87=E4=BB=B6=E8=AF=B4=E6=98=8E;=202?= =?UTF-8?q?.=E4=BF=AE=E6=AD=A3conll.py=E4=B8=AD=E7=9A=84typo;=203.?= =?UTF-8?q?=E4=BF=AE=E6=94=B9char=5Fembedding=E7=9A=84pretrain=5Fchar=5Fpa?= =?UTF-8?q?th=E7=9A=84=E5=88=9D=E5=A7=8B=E5=8C=96=E8=B7=AF=E5=BE=84?= =?UTF-8?q?=E4=B8=BANone?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/embeddings/char_embedding.py | 22 +++++++++++----------- fastNLP/io/loader/loader.py | 13 ++++++++++++- fastNLP/io/pipe/conll.py | 4 ++-- fastNLP/io/pipe/pipe.py | 12 ++++++++++++ 4 files changed, 37 insertions(+), 14 deletions(-) diff --git a/fastNLP/embeddings/char_embedding.py b/fastNLP/embeddings/char_embedding.py index 955e08c9..1f3a9234 100644 --- a/fastNLP/embeddings/char_embedding.py +++ b/fastNLP/embeddings/char_embedding.py @@ -36,8 +36,8 @@ class CNNCharEmbedding(TokenEmbedding): >>> # torch.Size([1, 5,50]) :param vocab: 词表 - :param embed_size: 该word embedding的大小,默认值为50. - :param char_emb_size: character的embed的大小。character是从vocab中生成的。默认值为50. + :param embed_size: 该CNNCharEmbedding的输出维度大小,默认值为50. + :param char_emb_size: character的embed的维度。character是从vocab中生成的。默认值为50. :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 :param float dropout: 以多大的概率drop分布式表示与char embedding的输出。 :param filter_nums: filter的数量. 长度需要和kernels一致。默认值为[40, 30, 20]. @@ -45,13 +45,13 @@ class CNNCharEmbedding(TokenEmbedding): :param pool_method: character的表示在合成一个表示时所使用的pool方法,支持'avg', 'max'. :param activation: CNN之后使用的激活方法,支持'relu', 'sigmoid', 'tanh' 或者自定义函数. :param min_char_freq: character的最少出现次数。默认值为2. 
- :param pre_train_char_embed:可以有两种方式调用预训练好的static embedding:第一种是传入embedding文件夹(文件夹下应该只有一个 + :param pre_train_char_embed:可以有两种方式调用预训练好的character embedding:第一种是传入embedding文件夹(文件夹下应该只有一个 以.txt作为后缀的文件)或文件路径;第二种是传入embedding的名称,第二种情况将自动查看缓存中是否存在该模型,没有的话将自动下载。 如果输入为None则使用embedding_dim的维度随机初始化一个embedding. """ def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, word_dropout:float=0, dropout:float=0.5, filter_nums: List[int]=(40, 30, 20), kernel_sizes: List[int]=(5, 3, 1), - pool_method: str='max', activation='relu', min_char_freq: int=2, pre_train_char_embed: str=''): + pool_method: str='max', activation='relu', min_char_freq: int=2, pre_train_char_embed: str=None): super(CNNCharEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) for kernel in kernel_sizes: @@ -93,8 +93,8 @@ class CNNCharEmbedding(TokenEmbedding): torch.LongTensor([self.char_vocab.to_index(c) for c in word]) self.word_lengths[index] = len(word) # self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size) - if len(pre_train_char_embed): - self.char_embedding = StaticEmbedding(self.char_vocab, pre_train_char_embed) + if pre_train_char_embed: + self.char_embedding = StaticEmbedding(self.char_vocab, model_dir_or_name=pre_train_char_embed) else: self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size) @@ -189,8 +189,8 @@ class LSTMCharEmbedding(TokenEmbedding): >>> # torch.Size([1, 5,50]) :param vocab: 词表 - :param embed_size: embedding的大小。默认值为50. - :param char_emb_size: character的embedding的大小。默认值为50. + :param embed_size: LSTMCharEmbedding的输出维度。默认值为50. + :param char_emb_size: character的embedding的维度。默认值为50. :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 :param dropout: 以多大概率drop character embedding的输出以及最终的word的输出。 :param hidden_size: LSTM的中间hidden的大小,如果为bidirectional的,hidden会除二,默认为50. @@ -198,13 +198,13 @@ class LSTMCharEmbedding(TokenEmbedding): :param activation: 激活函数,支持'relu', 'sigmoid', 'tanh', 或者自定义函数. :param min_char_freq: character的最小出现次数。默认值为2. :param bidirectional: 是否使用双向的LSTM进行encode。默认值为True。 - :param pre_train_char_embed:可以有两种方式调用预训练好的static embedding:第一种是传入embedding文件夹(文件夹下应该只有一个 + :param pre_train_char_embed:可以有两种方式调用预训练好的character embedding:第一种是传入embedding文件夹(文件夹下应该只有一个 以.txt作为后缀的文件)或文件路径;第二种是传入embedding的名称,第二种情况将自动查看缓存中是否存在该模型,没有的话将自动下载。 如果输入为None则使用embedding_dim的维度随机初始化一个embedding. """ def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, word_dropout:float=0, dropout:float=0.5, hidden_size=50,pool_method: str='max', activation='relu', min_char_freq: int=2, - bidirectional=True, pre_train_char_embed: str=''): + bidirectional=True, pre_train_char_embed: str=None): super(LSTMCharEmbedding, self).__init__(vocab) assert hidden_size % 2 == 0, "Only even kernel is allowed." 
@@ -245,7 +245,7 @@ class LSTMCharEmbedding(TokenEmbedding): torch.LongTensor([self.char_vocab.to_index(c) for c in word]) self.word_lengths[index] = len(word) # self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size) - if len(pre_train_char_embed): + if pre_train_char_embed: self.char_embedding = StaticEmbedding(self.char_vocab, pre_train_char_embed) else: self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size) diff --git a/fastNLP/io/loader/loader.py b/fastNLP/io/loader/loader.py index 296714bf..89628196 100644 --- a/fastNLP/io/loader/loader.py +++ b/fastNLP/io/loader/loader.py @@ -14,6 +14,12 @@ class Loader: pass def _load(self, path: str) -> DataSet: + """ + 给定一个路径,返回读取的DataSet。 + + :param str path: 路径 + :return: DataSet + """ raise NotImplementedError def load(self, paths: Union[str, Dict[str, str]] = None) -> DataBundle: @@ -53,7 +59,12 @@ class Loader: data_bundle = DataBundle(datasets=datasets) return data_bundle - def download(self): + def download(self)->str: + """ + 自动下载该数据集 + + :return: 下载后解压目录 + """ raise NotImplementedError(f"{self.__class__} cannot download data automatically.") def _get_dataset_path(self, dataset_name): diff --git a/fastNLP/io/pipe/conll.py b/fastNLP/io/pipe/conll.py index 0379a45b..7d55dd29 100644 --- a/fastNLP/io/pipe/conll.py +++ b/fastNLP/io/pipe/conll.py @@ -111,7 +111,7 @@ class Conll2003NERPipe(_NERPipe): :param: str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。 :param bool lower: 是否将words小写化后再建立词表,绝大多数情况都不需要设置为True。 - :param int target_pad_val: target的padding值,target这一列pad的位置值为target_pad_val。默认为-100。 + :param int target_pad_val: target的padding值,target这一列pad的位置值为target_pad_val。默认为0。 """ def process_from_file(self, paths) -> DataBundle: @@ -140,7 +140,7 @@ class OntoNotesNERPipe(_NERPipe): :param: str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。 :param bool lower: 是否将words小写化后再建立词表,绝大多数情况都不需要设置为True。 - :param int target_pad_val: target的padding值,target这一列pad的位置值为target_pad_val。默认为-100。 + :param int target_pad_val: target的padding值,target这一列pad的位置值为target_pad_val。默认为0。 """ def process_from_file(self, paths): diff --git a/fastNLP/io/pipe/pipe.py b/fastNLP/io/pipe/pipe.py index 76cc00ec..a2b74301 100644 --- a/fastNLP/io/pipe/pipe.py +++ b/fastNLP/io/pipe/pipe.py @@ -3,7 +3,19 @@ from .. 
import DataBundle class Pipe: def process(self, data_bundle: DataBundle) -> DataBundle: + """ + 对输入的DataBundle进行处理,然后返回该DataBundle。 + + :param data_bundle: 需要处理的DataBundle对象 + :return: + """ raise NotImplementedError def process_from_file(self, paths) -> DataBundle: + """ + 传入文件路径,生成处理好的DataBundle对象。paths支持的路径形式可以参考 `fastNLP.io.loader.Loader.load()` + + :param paths: + :return: DataBundle + """ raise NotImplementedError From 6aac447e5b830c3856ec7e6dc2db4f9ff273da7c Mon Sep 17 00:00:00 2001 From: ChenXin Date: Fri, 16 Aug 2019 23:38:16 +0800 Subject: [PATCH 065/153] fix some bugs on docs' format --- fastNLP/embeddings/char_embedding.py | 12 ++++++------ fastNLP/embeddings/utils.py | 2 +- fastNLP/io/__init__.py | 5 +++-- fastNLP/io/data_bundle.py | 7 ++++++- fastNLP/io/data_loader/conll.py | 24 +----------------------- fastNLP/io/loader/loader.py | 5 +++-- 6 files changed, 20 insertions(+), 35 deletions(-) diff --git a/fastNLP/embeddings/char_embedding.py b/fastNLP/embeddings/char_embedding.py index 1f3a9234..e772703a 100644 --- a/fastNLP/embeddings/char_embedding.py +++ b/fastNLP/embeddings/char_embedding.py @@ -45,9 +45,9 @@ class CNNCharEmbedding(TokenEmbedding): :param pool_method: character的表示在合成一个表示时所使用的pool方法,支持'avg', 'max'. :param activation: CNN之后使用的激活方法,支持'relu', 'sigmoid', 'tanh' 或者自定义函数. :param min_char_freq: character的最少出现次数。默认值为2. - :param pre_train_char_embed:可以有两种方式调用预训练好的character embedding:第一种是传入embedding文件夹(文件夹下应该只有一个 - 以.txt作为后缀的文件)或文件路径;第二种是传入embedding的名称,第二种情况将自动查看缓存中是否存在该模型,没有的话将自动下载。 - 如果输入为None则使用embedding_dim的维度随机初始化一个embedding. + :param pre_train_char_embed: 可以有两种方式调用预训练好的character embedding:第一种是传入embedding文件夹 + (文件夹下应该只有一个以.txt作为后缀的文件)或文件路径;第二种是传入embedding的名称,第二种情况将自动查看缓存中是否存在该模型, + 没有的话将自动下载。如果输入为None则使用embedding_dim的维度随机初始化一个embedding. """ def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, word_dropout:float=0, dropout:float=0.5, filter_nums: List[int]=(40, 30, 20), kernel_sizes: List[int]=(5, 3, 1), @@ -198,9 +198,9 @@ class LSTMCharEmbedding(TokenEmbedding): :param activation: 激活函数,支持'relu', 'sigmoid', 'tanh', 或者自定义函数. :param min_char_freq: character的最小出现次数。默认值为2. :param bidirectional: 是否使用双向的LSTM进行encode。默认值为True。 - :param pre_train_char_embed:可以有两种方式调用预训练好的character embedding:第一种是传入embedding文件夹(文件夹下应该只有一个 - 以.txt作为后缀的文件)或文件路径;第二种是传入embedding的名称,第二种情况将自动查看缓存中是否存在该模型,没有的话将自动下载。 - 如果输入为None则使用embedding_dim的维度随机初始化一个embedding. + :param pre_train_char_embed: 可以有两种方式调用预训练好的character embedding:第一种是传入embedding文件夹 + (文件夹下应该只有一个以.txt作为后缀的文件)或文件路径;第二种是传入embedding的名称,第二种情况将自动查看缓存中是否存在该模型, + 没有的话将自动下载。如果输入为None则使用embedding_dim的维度随机初始化一个embedding. 
""" def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, word_dropout:float=0, dropout:float=0.5, hidden_size=50,pool_method: str='max', activation='relu', min_char_freq: int=2, diff --git a/fastNLP/embeddings/utils.py b/fastNLP/embeddings/utils.py index b79f563c..1e83219a 100644 --- a/fastNLP/embeddings/utils.py +++ b/fastNLP/embeddings/utils.py @@ -31,7 +31,7 @@ def get_embeddings(init_embed): :param init_embed: 可以是 tuple:(num_embedings, embedding_dim), 即embedding的大小和每个词的维度;也可以传入 nn.Embedding 对象, 此时就以传入的对象作为embedding; 传入np.ndarray也行,将使用传入的ndarray作为作为Embedding初始化; 传入torch.Tensor, 将使用传入的值作为Embedding初始化。 - :return nn.Embedding embeddings: + :return nn.Embedding: embeddings """ if isinstance(init_embed, tuple): res = nn.Embedding( diff --git a/fastNLP/io/__init__.py b/fastNLP/io/__init__.py index 90d4d12c..f4b9c0cb 100644 --- a/fastNLP/io/__init__.py +++ b/fastNLP/io/__init__.py @@ -15,7 +15,9 @@ __all__ = [ 'DataBundle', 'EmbedLoader', - + + 'Loader', + 'YelpLoader', 'YelpFullLoader', 'YelpPolarityLoader', @@ -29,7 +31,6 @@ __all__ = [ 'OntoNotesNERLoader', 'CTBLoader', - 'Loader', 'CSVLoader', 'JsonLoader', diff --git a/fastNLP/io/data_bundle.py b/fastNLP/io/data_bundle.py index 4203294b..6f845511 100644 --- a/fastNLP/io/data_bundle.py +++ b/fastNLP/io/data_bundle.py @@ -110,9 +110,14 @@ def _uncompress(src, dst): class DataBundle: """ 经过处理的数据信息,包括一系列数据集(比如:分开的训练集、验证集和测试集)以及各个field对应的vocabulary。该对象一般由fastNLP中各种 - DataSetLoader的load函数生成,可以通过以下的方法获取里面的内容 + Loader的load函数生成,可以通过以下的方法获取里面的内容 Example:: + + data_bundle = YelpLoader().load({'train':'/path/to/train', 'dev': '/path/to/dev'}) + train_vocabs = data_bundle.vocabs['train'] + train_data = data_bundle.datasets['train'] + dev_data = data_bundle.datasets['train'] :param vocabs: 从名称(字符串)到 :class:`~fastNLP.Vocabulary` 类型的dict :param datasets: 从名称(字符串)到 :class:`~fastNLP.DataSet` 类型的dict diff --git a/fastNLP/io/data_loader/conll.py b/fastNLP/io/data_loader/conll.py index 7083b98d..31a90881 100644 --- a/fastNLP/io/data_loader/conll.py +++ b/fastNLP/io/data_loader/conll.py @@ -76,29 +76,7 @@ class ConllLoader(DataSetLoader): 读取的field根据ConllLoader初始化时传入的headers决定。 - :param Union[str, Dict[str, str]] paths: 支持以下的几种输入方式 - (1) 传入一个目录, 该目录下名称包含train的被认为是train,包含test的被认为是test,包含dev的被认为是dev,如果检测到多个文件 - 名包含'train'、 'dev'、 'test'则会报错 - - Example:: - data_bundle = ConllLoader().load('/path/to/dir') # 返回的DataBundle中datasets根据目录下是否检测到train, dev, test等有所变化 - # 可以通过以下的方式取出DataSet - tr_data = data_bundle.datasets['train'] - te_data = data_bundle.datasets['test'] # 如果目录下有文件包含test这个字段 - - (2) 传入文件path - - Example:: - data_bundle = ConllLoader().load("/path/to/a/train.conll") # 返回DataBundle对象, datasets中仅包含'train' - tr_data = data_bundle.datasets['train'] # 可以通过以下的方式取出DataSet - - (3) 传入一个dict,比如train,dev,test不在同一个目录下,或者名称中不包含train, dev, test - - Example:: - paths = {'train':"/path/to/tr.conll", 'dev':"/to/validate.conll", "test":"/to/te.conll"} - data_bundle = ConllLoader().load(paths) # 返回的DataBundle中的dataset中包含"train", "dev", "test" - dev_data = data_bundle.datasets['dev'] - + :param Union[str, Dict[str, str]] paths: :return: :class:`~fastNLP.DataSet` 类的对象或 :class:`~fastNLP.io.DataBundle` 的字典 """ paths = check_loader_paths(paths) diff --git a/fastNLP/io/loader/loader.py b/fastNLP/io/loader/loader.py index 89628196..02f24097 100644 --- a/fastNLP/io/loader/loader.py +++ b/fastNLP/io/loader/loader.py @@ -10,6 +10,7 @@ class Loader: 各种数据 Loader 的基类,提供了 API 的参考. 
""" + def __init__(self): pass @@ -24,7 +25,7 @@ class Loader: def load(self, paths: Union[str, Dict[str, str]] = None) -> DataBundle: """ - 从指定一个或多个路径中的文件中读取数据,返回:class:`~fastNLP.io.DataBundle` 。 + 从指定一个或多个路径中的文件中读取数据,返回 :class:`~fastNLP.io.DataBundle` 。 读取的field根据ConllLoader初始化时传入的headers决定。 @@ -59,7 +60,7 @@ class Loader: data_bundle = DataBundle(datasets=datasets) return data_bundle - def download(self)->str: + def download(self) -> str: """ 自动下载该数据集 From d576d3999fb15507bb157bcc865d83dbc7465698 Mon Sep 17 00:00:00 2001 From: yh Date: Sat, 17 Aug 2019 00:02:19 +0800 Subject: [PATCH 066/153] =?UTF-8?q?=E6=9B=B4=E6=96=B0StaticEmbedding?= =?UTF-8?q?=E4=B8=AD=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/embeddings/static_embedding.py | 1 + 1 file changed, 1 insertion(+) diff --git a/fastNLP/embeddings/static_embedding.py b/fastNLP/embeddings/static_embedding.py index 15cb05f6..c3d4ede6 100644 --- a/fastNLP/embeddings/static_embedding.py +++ b/fastNLP/embeddings/static_embedding.py @@ -118,6 +118,7 @@ class StaticEmbedding(TokenEmbedding): embedding = self._load_with_vocab(model_path, vocab=lowered_vocab, init_method=init_method) else: embedding = self._randomly_init_embed(len(vocab), embedding_dim, init_method) + self.words_to_words = nn.Parameter(torch.arange(len(vocab)).long(), requires_grad=False) if lowered_vocab.unknown: unknown_idx = lowered_vocab.unknown_idx else: From 9560a4d367439c9df4f574869e6310c34a108204 Mon Sep 17 00:00:00 2001 From: xuyige Date: Sat, 17 Aug 2019 00:10:40 +0800 Subject: [PATCH 067/153] update test codes in models/bert.py --- fastNLP/models/bert.py | 5 +++++ test/models/test_bert.py | 12 +++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/fastNLP/models/bert.py b/fastNLP/models/bert.py index ad7750ec..3afccc14 100644 --- a/fastNLP/models/bert.py +++ b/fastNLP/models/bert.py @@ -10,6 +10,7 @@ from .base_model import BaseModel from ..core.const import Const from ..modules.encoder import BertModel from ..modules.encoder.bert import BertConfig, CONFIG_FILE +from ..core.utils import seq_len_to_mask class BertForSequenceClassification(BaseModel): @@ -70,6 +71,10 @@ class BertForSequenceClassification(BaseModel): return model def forward(self, words, seq_len=None, target=None): + if seq_len is None: + seq_len = torch.ones_like(words, dtype=words.dtype, device=words.device) + if len(seq_len.size()) + 1 == len(words.size()): + seq_len = seq_len_to_mask(seq_len, max_len=words.size(-1)) _, pooled_output = self.bert(words, attention_mask=seq_len, output_all_encoded_layers=False) pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) diff --git a/test/models/test_bert.py b/test/models/test_bert.py index 05ee6d5a..40b98c81 100644 --- a/test/models/test_bert.py +++ b/test/models/test_bert.py @@ -2,7 +2,8 @@ import unittest import torch -from fastNLP.models.bert import * +from fastNLP.models.bert import BertForSequenceClassification, BertForQuestionAnswering, \ + BertForTokenClassification, BertForMultipleChoice class TestBert(unittest.TestCase): @@ -14,9 +15,14 @@ class TestBert(unittest.TestCase): input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - pred = model(input_ids, token_type_ids, input_mask) + pred = model(input_ids, input_mask) + self.assertTrue(isinstance(pred, dict)) + self.assertTrue(Const.OUTPUT in pred) + 
self.assertEqual(tuple(pred[Const.OUTPUT].shape), (2, 2)) + + input_mask = torch.LongTensor([3, 2]) + pred = model(input_ids, input_mask) self.assertTrue(isinstance(pred, dict)) self.assertTrue(Const.OUTPUT in pred) self.assertEqual(tuple(pred[Const.OUTPUT].shape), (2, 2)) From 89142d9dc5ad34b98a1d8d0db47bed4bab562fd9 Mon Sep 17 00:00:00 2001 From: yh Date: Sat, 17 Aug 2019 11:36:39 +0800 Subject: [PATCH 068/153] =?UTF-8?q?CrossEntropyLoss=E5=A2=9E=E5=8A=A0class?= =?UTF-8?q?=5Fin=5Fdim=E9=80=89=E9=A1=B9=E6=8E=A7=E5=88=B6target=E7=9A=84?= =?UTF-8?q?=E7=BB=B4=E5=BA=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/losses.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index 05e5b440..d5549cec 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -206,7 +206,11 @@ class CrossEntropyLoss(LossBase): :param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred` :param target: 参数映射表中 `target` 的映射关系,None表示映射关系为 `target` -> `target` - :param seq_len: 句子的长度, 长度之外的token不会计算loss。。 + :param seq_len: 句子的长度, 长度之外的token不会计算loss。 + :param int class_in_dim: 在序列标注的场景中,pred可能的shape为(batch_size, max_len, num_classes) + 或(batch_size, num_classes, max_len), CrossEntropyLoss需要知道哪一维是class的维度以计算loss。如果为-1,就根据pred的第 + 二维是否等于target的第二维来判断是否需要交换pred的第二维和第三维,因为target的第二维是length的维度,如果这一维度上和pred相等, + 那么pred可能第二维也是长度维(存在误判的可能,如果有误判的情况,请显示设置该值)。其它大于0的值则认为该维度是class的维度。 :param padding_idx: padding的index,在计算loss时将忽略target中标号为padding_idx的内容, 可以通过该值代替 传入seq_len. :param str reduction: 支持 `mean` ,`sum` 和 `none` . @@ -217,18 +221,21 @@ class CrossEntropyLoss(LossBase): """ - def __init__(self, pred=None, target=None, seq_len=None, padding_idx=-100, reduction='mean'): + def __init__(self, pred=None, target=None, seq_len=None, class_in_dim=-1, padding_idx=-100, reduction='mean'): super(CrossEntropyLoss, self).__init__() self._init_param_map(pred=pred, target=target, seq_len=seq_len) self.padding_idx = padding_idx assert reduction in ('mean', 'sum', 'none') self.reduction = reduction + self.class_in_dim = class_in_dim def get_loss(self, pred, target, seq_len=None): if pred.dim() > 2: - if pred.size(1) != target.size(1): # 有可能顺序替换了 - raise RuntimeError("It seems like that your prediction's shape is (batch_size, num_labels, max_len)." 
- " It should be (batch_size, max_len, num_labels).") + if self.class_in_dim == -1: + if pred.size(1) != target.size(1): # 有可能顺序替换了 + pred = pred.transpose(1, 2) + else: + pred = pred.tranpose(-1, pred) pred = pred.reshape(-1, pred.size(-1)) target = target.reshape(-1) if seq_len is not None: From 287019450e4883ba8030f661cf3c82d1960fb4ac Mon Sep 17 00:00:00 2001 From: yunfan Date: Sat, 17 Aug 2019 14:38:12 +0800 Subject: [PATCH 069/153] [add] logger in trainer --- fastNLP/core/callback.py | 39 ++++++++++++---- fastNLP/core/dist_trainer.py | 46 ++++++++++--------- fastNLP/core/trainer.py | 22 ++++++--- fastNLP/core/utils.py | 8 ++-- .../text_classification/model/dpcnn.py | 4 +- .../text_classification/train_dpcnn.py | 42 +++++++++-------- 6 files changed, 98 insertions(+), 63 deletions(-) diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py index 633c6f45..1a20f861 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -164,7 +164,7 @@ class Callback(object): @property def is_master(self): - return self._trainer.is_master() + return self._trainer.is_master @property def disabled(self): @@ -172,7 +172,7 @@ class Callback(object): @property def logger(self): - return getattr(self._trainer, 'logger', logging) + return getattr(self._trainer, 'logger', logging.getLogger(__name__)) def on_train_begin(self): """ @@ -405,11 +405,11 @@ class DistCallbackManager(CallbackManager): def __init__(self, env, callbacks_all=None, callbacks_master=None): super(DistCallbackManager, self).__init__(env) assert 'trainer' in env - is_master = env['trainer'].is_master - self.patch_callback(callbacks_master, disabled=not is_master) - self.callbacks_all = self.prepare_callbacks(callbacks_all) - self.callbacks_master = self.prepare_callbacks(callbacks_master) - self.callbacks = self.callbacks_all + self.callbacks_master + self._trainer = env['trainer'] + self.callbacks_master = [] + self.callbacks_all = [] + self.add_callback(callbacks_all, master=False) + self.add_callback(callbacks_master, master=True) def patch_callback(self, callbacks, disabled): if not callbacks: @@ -419,6 +419,14 @@ class DistCallbackManager(CallbackManager): for cb in callbacks: cb._disabled = disabled + def add_callback(self, cb, master=False): + if master: + self.patch_callback(cb, not self.is_master) + self.callbacks_master += self.prepare_callbacks(cb) + else: + self.callbacks_all += self.prepare_callbacks(cb) + self.callbacks = self.callbacks_all + self.callbacks_master + class GradientClipCallback(Callback): """ @@ -1048,15 +1056,26 @@ class TesterCallback(Callback): self.score = cur_score return cur_score, is_better + def _get_score(self, metric_dict, key): + for metric in metric_dict.items(): + if key in metric: + return metric[key] + return None + def compare_better(self, a): if self.score is None: return True + if self.metric_key is None: + self.metric_key = list(list(self.score.values())[0].keys())[0] k = self.metric_key - is_increase = self.score[k] <= a[k] # if equal, prefer more recent results + score = self._get_score(self.score, k) + new_score = self._get_score(a, k) + if score is None or new_score is None: + return False if self.increase_better: - return is_increase + return score <= new_score else: - return not is_increase + return score >= new_score def on_train_end(self): self.logger.info('Evaluate on training ends.') diff --git a/fastNLP/core/dist_trainer.py b/fastNLP/core/dist_trainer.py index 00db6361..bfd0e70b 100644 --- a/fastNLP/core/dist_trainer.py +++ b/fastNLP/core/dist_trainer.py @@ 
-22,6 +22,7 @@ from .optimizer import Optimizer from .utils import _build_args from .utils import _move_dict_value_to_device from .utils import _get_func_signature +from ..io.logger import initLogger from pkg_resources import parse_version __all__ = [ @@ -40,7 +41,7 @@ def get_local_rank(): if 'local_rank' in args and args.local_rank: os.environ['LOCAL_RANK'] = str(args.local_rank) # for multiple calls for this function return args.local_rank - raise RuntimeError('Please use "python -m torch.distributed.launch train_script.py') + raise RuntimeError('Please use "python -m torch.distributed.launch --nproc_per_node=N train_script.py') class DistTrainer(): @@ -50,7 +51,7 @@ class DistTrainer(): def __init__(self, train_data, model, optimizer=None, loss=None, callbacks_all=None, callbacks_master=None, batch_size_per_gpu=8, n_epochs=1, - num_data_workers=1, drop_last=False, + num_workers=1, drop_last=False, dev_data=None, metrics=None, metric_key=None, update_every=1, print_every=10, validate_every=-1, log_path=None, @@ -78,7 +79,7 @@ class DistTrainer(): self.train_data = train_data self.batch_size_per_gpu = int(batch_size_per_gpu) self.n_epochs = int(n_epochs) - self.num_data_workers = int(num_data_workers) + self.num_data_workers = int(num_workers) self.drop_last = drop_last self.update_every = int(update_every) self.print_every = int(print_every) @@ -127,9 +128,8 @@ class DistTrainer(): if dev_data and metrics: cb = TesterCallback( dev_data, model, metrics, - batch_size=batch_size_per_gpu, num_workers=num_data_workers) - self.callback_manager.callbacks_master += \ - self.callback_manager.prepare_callbacks([cb]) + batch_size=batch_size_per_gpu, num_workers=num_workers) + self.callback_manager.add_callback([cb], master=True) # Setup logging dist.barrier() @@ -140,10 +140,7 @@ class DistTrainer(): self.cp_save_path = None # use INFO in the master, WARN for others - logging.basicConfig(filename=log_path, - format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt='%m/%d/%Y %H:%M:%S', - level=logging.INFO if self.is_master else logging.WARN) + initLogger(log_path, level=logging.INFO if self.is_master else logging.WARNING) self.logger = logging.getLogger(__name__) self.logger.info("Setup Distributed Trainer") self.logger.warning("Process pid: {}, rank: {}, local rank: {}, device: {}, fp16: {}".format( @@ -284,18 +281,8 @@ class DistTrainer(): self.callback_manager.on_batch_end() - if ((self.validate_every > 0 and self.step % self.validate_every == 0) or - (self.validate_every < 0 and self.step % len(data_iterator) == 0)): - self.callback_manager.on_valid_begin() - eval_res = self.callback_manager.on_validation() - eval_res = list(filter(lambda x: x is not None, eval_res)) - if len(eval_res): - eval_res, is_better = list(zip(*eval_res)) - else: - eval_res, is_better = None, None - self.callback_manager.on_valid_end( - eval_res, self.metric_key, self.optimizer, is_better) - dist.barrier() + if (self.validate_every > 0 and self.step % self.validate_every == 0): + self._do_validation() if self.cp_save_path and \ self.save_every > 0 and \ @@ -303,6 +290,9 @@ class DistTrainer(): self.save_check_point() # ================= mini-batch end ==================== # + if self.validate_every < 0: + self._do_validation() + if self.save_every < 0 and self.cp_save_path: self.save_check_point() # lr decay; early stopping @@ -351,5 +341,17 @@ class DistTrainer(): model_to_save = model_to_save.state_dict() torch.save(model_to_save, path) + def _do_validation(self): + 
self.callback_manager.on_valid_begin() + eval_res = self.callback_manager.on_validation() + eval_res = list(filter(lambda x: x is not None, eval_res)) + if len(eval_res): + eval_res, is_better = list(zip(*eval_res)) + else: + eval_res, is_better = None, None + self.callback_manager.on_valid_end( + eval_res, self.metric_key, self.optimizer, is_better) + dist.barrier() + def close(self): dist.destroy_process_group() diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 0d239048..83882df0 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -353,6 +353,8 @@ from .utils import _get_func_signature from .utils import _get_model_device from .utils import _move_model_to_device from ._parallel_utils import _model_contains_inner_module +from ..io.logger import initLogger +import logging class Trainer(object): @@ -547,6 +549,12 @@ class Trainer(object): else: raise TypeError("optimizer can only be torch.optim.Optimizer type, not {}.".format(type(optimizer))) + log_path = None + if save_path is not None: + log_path = os.path.join(os.path.dirname(save_path), 'log') + initLogger(log_path) + self.logger = logging.getLogger(__name__) + self.use_tqdm = use_tqdm self.pbar = None self.print_every = abs(self.print_every) @@ -588,7 +596,7 @@ class Trainer(object): """ results = {} if self.n_epochs <= 0: - print(f"training epoch is {self.n_epochs}, nothing was done.") + self.logger.info(f"training epoch is {self.n_epochs}, nothing was done.") results['seconds'] = 0. return results try: @@ -597,7 +605,7 @@ class Trainer(object): self._load_best_model = load_best_model self.start_time = str(datetime.now().strftime('%Y-%m-%d-%H-%M-%S')) start_time = time.time() - print("training epochs started " + self.start_time, flush=True) + self.logger.info("training epochs started " + self.start_time) try: self.callback_manager.on_train_begin() @@ -613,7 +621,7 @@ class Trainer(object): raise e if self.dev_data is not None and self.best_dev_perf is not None: - print( + self.logger.info( "\nIn Epoch:{}/Step:{}, got best dev performance:".format(self.best_dev_epoch, self.best_dev_step) + self.tester._format_eval_results(self.best_dev_perf), ) results['best_eval'] = self.best_dev_perf @@ -623,9 +631,9 @@ class Trainer(object): model_name = "best_" + "_".join([self.model.__class__.__name__, self.metric_key, self.start_time]) load_succeed = self._load_model(self.model, model_name) if load_succeed: - print("Reloaded the best model.") + self.logger.info("Reloaded the best model.") else: - print("Fail to reload best model.") + self.logger.info("Fail to reload best model.") finally: pass results['seconds'] = round(time.time() - start_time, 2) @@ -825,12 +833,12 @@ class Trainer(object): self.best_metric_indicator = indicator_val else: if self.increase_better is True: - if indicator_val > self.best_metric_indicator: + if indicator_val >= self.best_metric_indicator: self.best_metric_indicator = indicator_val else: is_better = False else: - if indicator_val < self.best_metric_indicator: + if indicator_val <= self.best_metric_indicator: self.best_metric_indicator = indicator_val else: is_better = False diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index 4ce382f3..f2826421 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -17,6 +17,7 @@ import numpy as np import torch import torch.nn as nn from typing import List +import logging _CheckRes = namedtuple('_CheckRes', ['missing', 'unused', 'duplicated', 'required', 'all_needed', 'varargs']) @@ -659,15 +660,14 @@ class _pseudo_tqdm: 
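
A rough sketch of how the revised DistTrainer above is meant to be driven (assumptions: `train_data`, `dev_data` and `model` are placeholders built elsewhere; parameter names follow the constructor shown in the dist_trainer.py hunks; the script is launched with `python -m torch.distributed.launch --nproc_per_node=N train_script.py`):

from fastNLP import CrossEntropyLoss, AccuracyMetric, GradientClipCallback
from fastNLP.core.dist_trainer import DistTrainer

trainer = DistTrainer(
    train_data=train_data, model=model, loss=CrossEntropyLoss(),   # placeholders: train_data, model
    dev_data=dev_data, metrics=[AccuracyMetric()],                 # placeholder: dev_data
    batch_size_per_gpu=32, n_epochs=3, num_workers=2,
    callbacks_all=[GradientClipCallback()],   # run in every process
    callbacks_master=[],                      # run only in the rank-0 (master) process
)
trainer.train()
trainer.close()
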
""" 当无法引入tqdm,或者Trainer中设置use_tqdm为false的时候,用该方法打印数据 """ - def __init__(self, **kwargs): - pass + self.logger = logging.getLogger() def write(self, info): - print(info) + self.logger.info(info) def set_postfix_str(self, info): - print(info) + self.logger.info(info) def __getattr__(self, item): def pass_func(*args, **kwargs): diff --git a/reproduction/text_classification/model/dpcnn.py b/reproduction/text_classification/model/dpcnn.py index ae2d46bd..b63c6d38 100644 --- a/reproduction/text_classification/model/dpcnn.py +++ b/reproduction/text_classification/model/dpcnn.py @@ -1,6 +1,5 @@ import torch import torch.nn as nn -from fastNLP.embeddings.utils import get_embeddings from fastNLP.core import Const as C @@ -64,7 +63,8 @@ class RegionEmbedding(nn.Module): kernel_sizes = [5, 9] assert isinstance( kernel_sizes, list), 'kernel_sizes should be List(int)' - self.embed = get_embeddings(init_embed) + # self.embed = nn.Embedding.from_pretrained(torch.tensor(init_embed).float(), freeze=False) + self.embed = init_embed try: embed_dim = self.embed.embedding_dim except Exception: diff --git a/reproduction/text_classification/train_dpcnn.py b/reproduction/text_classification/train_dpcnn.py index 6cce453b..e4df00bf 100644 --- a/reproduction/text_classification/train_dpcnn.py +++ b/reproduction/text_classification/train_dpcnn.py @@ -13,10 +13,11 @@ from fastNLP.core.sampler import BucketSampler from fastNLP.core import LRScheduler from fastNLP.core.const import Const as C from fastNLP.core.vocabulary import VocabularyOption +from fastNLP.core.dist_trainer import DistTrainer from utils.util_init import set_rng_seeds import os -os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/' -os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches' +# os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/' +# os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches' os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" @@ -64,27 +65,28 @@ def load_data(): ds.apply_field(len, C.INPUT, C.INPUT_LEN) ds.set_input(C.INPUT, C.INPUT_LEN) ds.set_target(C.TARGET) - embedding = StaticEmbedding( - datainfo.vocabs['words'], model_dir_or_name='en-glove-840b-300', requires_grad=ops.embedding_grad, - normalize=False - ) - return datainfo, embedding + return datainfo -datainfo, embedding = load_data() + +datainfo = load_data() +embedding = StaticEmbedding( + datainfo.vocabs['words'], model_dir_or_name='en-glove-6b-100d', requires_grad=ops.embedding_grad, + normalize=False) embedding.embedding.weight.data /= embedding.embedding.weight.data.std() -print(embedding.embedding.weight.mean(), embedding.embedding.weight.std()) +print(embedding.embedding.weight.data.mean(), embedding.embedding.weight.data.std()) # 2.或直接复用fastNLP的模型 # embedding = StackEmbedding([StaticEmbedding(vocab), CNNCharEmbedding(vocab, 100)]) - +datainfo.datasets['train'] = datainfo.datasets['train'][:1000] +datainfo.datasets['test'] = datainfo.datasets['test'][:1000] print(datainfo) print(datainfo.datasets['train'][0]) model = DPCNN(init_embed=embedding, num_cls=len(datainfo.vocabs[C.TARGET]), embed_dropout=ops.embed_dropout, cls_dropout=ops.cls_dropout) -print(model) +# print(model) # 3. 
声明loss,metric,optimizer loss = CrossEntropyLoss(pred=C.OUTPUT, target=C.TARGET) @@ -109,13 +111,17 @@ device = 'cuda:0' if torch.cuda.is_available() else 'cpu' print(device) # 4.定义train方法 -trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss, - sampler=BucketSampler(num_buckets=50, batch_size=ops.batch_size), - metrics=[metric], - dev_data=datainfo.datasets['test'], device=device, - check_code_level=-1, batch_size=ops.batch_size, callbacks=callbacks, - n_epochs=ops.train_epoch, num_workers=4) - +# trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss, +# sampler=BucketSampler(num_buckets=50, batch_size=ops.batch_size), +# metrics=[metric], +# dev_data=datainfo.datasets['test'], device=device, +# check_code_level=-1, batch_size=ops.batch_size, callbacks=callbacks, +# n_epochs=ops.train_epoch, num_workers=4) +trainer = DistTrainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss, + metrics=[metric], + dev_data=datainfo.datasets['test'], device='cuda', + batch_size_per_gpu=ops.batch_size, callbacks_all=callbacks, + n_epochs=ops.train_epoch, num_workers=4) if __name__ == "__main__": From 007c047ae7cb0cdc80857ce9ebded3143af231a1 Mon Sep 17 00:00:00 2001 From: yunfan Date: Sun, 18 Aug 2019 13:39:56 +0800 Subject: [PATCH 070/153] [update] logger in trainer & tester --- fastNLP/core/callback.py | 9 +- fastNLP/core/dist_trainer.py | 4 +- fastNLP/core/tester.py | 6 +- fastNLP/core/trainer.py | 11 ++- fastNLP/core/utils.py | 2 +- fastNLP/io/logger.py | 88 +++++++++++++++++++ .../text_classification/train_dpcnn.py | 22 ++--- 7 files changed, 118 insertions(+), 24 deletions(-) create mode 100644 fastNLP/io/logger.py diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py index 1a20f861..447186ca 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -656,10 +656,13 @@ class EvaluateCallback(Callback): for key, tester in self.testers.items(): try: eval_result = tester.test() - self.pbar.write("Evaluation on {}:".format(key)) - self.pbar.write(tester._format_eval_results(eval_result)) + # self.pbar.write("Evaluation on {}:".format(key)) + self.logger.info("Evaluation on {}:".format(key)) + # self.pbar.write(tester._format_eval_results(eval_result)) + self.logger.info(tester._format_eval_results(eval_result)) except Exception: - self.pbar.write("Exception happens when evaluate on DataSet named `{}`.".format(key)) + # self.pbar.write("Exception happens when evaluate on DataSet named `{}`.".format(key)) + self.logger.info("Exception happens when evaluate on DataSet named `{}`.".format(key)) class LRScheduler(Callback): diff --git a/fastNLP/core/dist_trainer.py b/fastNLP/core/dist_trainer.py index bfd0e70b..e14e17c8 100644 --- a/fastNLP/core/dist_trainer.py +++ b/fastNLP/core/dist_trainer.py @@ -22,7 +22,7 @@ from .optimizer import Optimizer from .utils import _build_args from .utils import _move_dict_value_to_device from .utils import _get_func_signature -from ..io.logger import initLogger +from ..io.logger import init_logger from pkg_resources import parse_version __all__ = [ @@ -140,7 +140,7 @@ class DistTrainer(): self.cp_save_path = None # use INFO in the master, WARN for others - initLogger(log_path, level=logging.INFO if self.is_master else logging.WARNING) + init_logger(log_path, level=logging.INFO if self.is_master else logging.WARNING) self.logger = logging.getLogger(__name__) self.logger.info("Setup Distributed Trainer") self.logger.warning("Process pid: {}, rank: {}, local rank: {}, device: {}, 
fp16: {}".format( diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index 691bf2ae..10696240 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -56,6 +56,7 @@ from .utils import _move_model_to_device from ._parallel_utils import _data_parallel_wrapper from ._parallel_utils import _model_contains_inner_module from functools import partial +from ..io.logger import init_logger, get_logger __all__ = [ "Tester" @@ -103,6 +104,8 @@ class Tester(object): self.batch_size = batch_size self.verbose = verbose self.use_tqdm = use_tqdm + init_logger(stdout='tqdm' if use_tqdm else 'plain') + self.logger = get_logger(__name__) if isinstance(data, DataSet): self.data_iterator = DataSetIter( @@ -181,7 +184,8 @@ class Tester(object): end_time = time.time() test_str = f'Evaluate data in {round(end_time - start_time, 2)} seconds!' - pbar.write(test_str) + # pbar.write(test_str) + self.logger.info(test_str) pbar.close() except _CheckError as e: prev_func_signature = _get_func_signature(self._predict_func) diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 83882df0..d71e23f5 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -353,8 +353,7 @@ from .utils import _get_func_signature from .utils import _get_model_device from .utils import _move_model_to_device from ._parallel_utils import _model_contains_inner_module -from ..io.logger import initLogger -import logging +from ..io.logger import init_logger, get_logger class Trainer(object): @@ -552,8 +551,8 @@ class Trainer(object): log_path = None if save_path is not None: log_path = os.path.join(os.path.dirname(save_path), 'log') - initLogger(log_path) - self.logger = logging.getLogger(__name__) + init_logger(path=log_path, stdout='tqdm' if use_tqdm else 'plain') + self.logger = get_logger(__name__) self.use_tqdm = use_tqdm self.pbar = None @@ -701,8 +700,8 @@ class Trainer(object): eval_str = "Evaluation on dev at Epoch {}/{}. Step:{}/{}. 
".format(epoch, self.n_epochs, self.step, self.n_steps) + \ self.tester._format_eval_results(eval_res) - pbar.write(eval_str + '\n') - + # pbar.write(eval_str + '\n') + self.logger.info(eval_str) # ================= mini-batch end ==================== # # lr decay; early stopping diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index f2826421..a49d203d 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -661,7 +661,7 @@ class _pseudo_tqdm: 当无法引入tqdm,或者Trainer中设置use_tqdm为false的时候,用该方法打印数据 """ def __init__(self, **kwargs): - self.logger = logging.getLogger() + self.logger = logging.getLogger(__name__) def write(self, info): self.logger.info(info) diff --git a/fastNLP/io/logger.py b/fastNLP/io/logger.py new file mode 100644 index 00000000..287bdbc9 --- /dev/null +++ b/fastNLP/io/logger.py @@ -0,0 +1,88 @@ +import logging +import logging.config +import torch +import _pickle as pickle +import os +import sys +import warnings + +try: + import fitlog +except ImportError: + fitlog = None +try: + from tqdm.auto import tqdm +except ImportError: + tqdm = None + +if tqdm is not None: + class TqdmLoggingHandler(logging.Handler): + def __init__(self, level=logging.INFO): + super().__init__(level) + + def emit(self, record): + try: + msg = self.format(record) + tqdm.write(msg) + self.flush() + except (KeyboardInterrupt, SystemExit): + raise + except: + self.handleError(record) +else: + class TqdmLoggingHandler(logging.StreamHandler): + def __init__(self, level=logging.INFO): + super().__init__(sys.stdout) + self.setLevel(level) + + +def init_logger(path=None, stdout='tqdm', level='INFO'): + """initialize logger""" + if stdout not in ['none', 'plain', 'tqdm']: + raise ValueError('stdout must in one of {}'.format(['none', 'plain', 'tqdm'])) + + if isinstance(level, int): + pass + else: + level = level.lower() + level = {'info': logging.INFO, 'debug': logging.DEBUG, + 'warn': logging.WARN, 'warning': logging.WARN, + 'error': logging.ERROR}[level] + + logger = logging.getLogger('fastNLP') + logger.setLevel(level) + handlers_type = set([type(h) for h in logger.handlers]) + + # make sure to initialize logger only once + # Stream Handler + if stdout == 'plain' and (logging.StreamHandler not in handlers_type): + stream_handler = logging.StreamHandler(sys.stdout) + elif stdout == 'tqdm' and (TqdmLoggingHandler not in handlers_type): + stream_handler = TqdmLoggingHandler(level) + else: + stream_handler = None + + if stream_handler is not None: + stream_formatter = logging.Formatter('[%(levelname)s] %(message)s') + stream_handler.setLevel(level) + stream_handler.setFormatter(stream_formatter) + logger.addHandler(stream_handler) + + # File Handler + if path is not None and (logging.FileHandler not in handlers_type): + if os.path.exists(path): + assert os.path.isfile(path) + warnings.warn('log already exists in {}'.format(path)) + dirname = os.path.abspath(os.path.dirname(path)) + os.makedirs(dirname, exist_ok=True) + + file_handler = logging.FileHandler(path, mode='a') + file_handler.setLevel(level) + file_formatter = logging.Formatter(fmt='%(asctime)s - [%(levelname)s] - %(name)s - %(message)s', + datefmt='%Y/%m/%d %H:%M:%S') + file_handler.setFormatter(file_formatter) + logger.addHandler(file_handler) + + return logger + +get_logger = logging.getLogger diff --git a/reproduction/text_classification/train_dpcnn.py b/reproduction/text_classification/train_dpcnn.py index e4df00bf..99e27640 100644 --- a/reproduction/text_classification/train_dpcnn.py +++ 
b/reproduction/text_classification/train_dpcnn.py @@ -111,17 +111,17 @@ device = 'cuda:0' if torch.cuda.is_available() else 'cpu' print(device) # 4.定义train方法 -# trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss, -# sampler=BucketSampler(num_buckets=50, batch_size=ops.batch_size), -# metrics=[metric], -# dev_data=datainfo.datasets['test'], device=device, -# check_code_level=-1, batch_size=ops.batch_size, callbacks=callbacks, -# n_epochs=ops.train_epoch, num_workers=4) -trainer = DistTrainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss, - metrics=[metric], - dev_data=datainfo.datasets['test'], device='cuda', - batch_size_per_gpu=ops.batch_size, callbacks_all=callbacks, - n_epochs=ops.train_epoch, num_workers=4) +trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss, + sampler=BucketSampler(num_buckets=50, batch_size=ops.batch_size), + metrics=[metric], use_tqdm=False, + dev_data=datainfo.datasets['test'], device=device, + check_code_level=-1, batch_size=ops.batch_size, callbacks=callbacks, + n_epochs=ops.train_epoch, num_workers=4) +# trainer = DistTrainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss, +# metrics=[metric], +# dev_data=datainfo.datasets['test'], device='cuda', +# batch_size_per_gpu=ops.batch_size, callbacks_all=callbacks, +# n_epochs=ops.train_epoch, num_workers=4) if __name__ == "__main__": From 9971861d86058098eb4834fade67de04a48bc2e5 Mon Sep 17 00:00:00 2001 From: xuyige Date: Mon, 19 Aug 2019 02:57:12 +0800 Subject: [PATCH 071/153] 1. update import statements in callback.py; 2. fix some code style --- fastNLP/core/__init__.py | 5 +++-- fastNLP/core/callback.py | 12 +++++++++--- fastNLP/io/loader/loader.py | 3 ++- fastNLP/io/pipe/matching.py | 1 + 4 files changed, 15 insertions(+), 6 deletions(-) diff --git a/fastNLP/core/__init__.py b/fastNLP/core/__init__.py index eeabda35..acf0efc4 100644 --- a/fastNLP/core/__init__.py +++ b/fastNLP/core/__init__.py @@ -13,8 +13,9 @@ core 模块里实现了 fastNLP 的核心框架,常用的功能都可以从 fa """ from .batch import DataSetIter, BatchIter, TorchLoaderIter -from .callback import Callback, GradientClipCallback, EarlyStopCallback, TensorboardCallback, LRScheduler, ControlC -from .callback import EvaluateCallback, FitlogCallback, SaveModelCallback +from .callback import Callback, GradientClipCallback, EarlyStopCallback, FitlogCallback, EvaluateCallback, \ + LRScheduler, ControlC, LRFinder, TensorboardCallback, WarmupCallback, SaveModelCallback, EchoCallback, \ + TesterCallback, CallbackException, EarlyStopError from .const import Const from .dataset import DataSet from .field import FieldArray, Padder, AutoPadder, EngChar2DPadder diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py index 447186ca..17ded171 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -51,13 +51,19 @@ callback模块实现了 fastNLP 中的许多 callback 类,用于增强 :class: """ __all__ = [ "Callback", + "GradientClipCallback", "EarlyStopCallback", - "TensorboardCallback", "FitlogCallback", + "EvaluateCallback", "LRScheduler", "ControlC", - "EvaluateCallback", + "LRFinder", + "TensorboardCallback", + "WarmupCallback", + "SaveModelCallback", + "EchoCallback", + "TesterCallback", "CallbackException", "EarlyStopError" @@ -718,7 +724,7 @@ class SmoothValue(object): self.smooth = None def add_value(self, val: float) -> None: - "Add `val` to calculate updated smoothed value." 
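
With the widened imports above, the newer callbacks can be pulled in from fastNLP.core directly. A hedged sketch (the constructor arguments shown for WarmupCallback and SaveModelCallback are recalled from the fastNLP API and may differ; `train_data`, `dev_data`, `model` and `metric` are placeholders):

from fastNLP.core import Trainer, WarmupCallback, SaveModelCallback, EarlyStopCallback

callbacks = [
    WarmupCallback(warmup=0.1),                # warm the learning rate up over the first 10% of steps
    SaveModelCallback('checkpoints/', top=3),  # keep the 3 best checkpoints on dev
    EarlyStopCallback(5),                      # stop after 5 epochs without improvement
]
trainer = Trainer(train_data, model, dev_data=dev_data, metrics=metric,
                  callbacks=callbacks, use_tqdm=False)
trainer.train()
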
+ """Add `val` to calculate updated smoothed value.""" self.n += 1 self.mov_avg = self.beta * self.mov_avg + (1 - self.beta) * val self.smooth = self.mov_avg / (1 - self.beta ** self.n) diff --git a/fastNLP/io/loader/loader.py b/fastNLP/io/loader/loader.py index 02f24097..e7b419ac 100644 --- a/fastNLP/io/loader/loader.py +++ b/fastNLP/io/loader/loader.py @@ -68,7 +68,8 @@ class Loader: """ raise NotImplementedError(f"{self.__class__} cannot download data automatically.") - def _get_dataset_path(self, dataset_name): + @staticmethod + def _get_dataset_path(dataset_name): """ 传入dataset的名称,获取读取数据的目录。如果数据不存在,会尝试自动下载并缓存 diff --git a/fastNLP/io/pipe/matching.py b/fastNLP/io/pipe/matching.py index 9f7c7d68..b0209b72 100644 --- a/fastNLP/io/pipe/matching.py +++ b/fastNLP/io/pipe/matching.py @@ -239,6 +239,7 @@ class QuoraPipe(MatchingPipe): data_bundle = QuoraLoader().load(paths) return self.process(data_bundle) + class QNLIPipe(MatchingPipe): def process_from_file(self, paths=None): data_bundle = QNLILoader().load(paths) From 511f41dda1ee67954f0133e2d70f1378e0d4d728 Mon Sep 17 00:00:00 2001 From: yh Date: Mon, 19 Aug 2019 10:52:12 +0800 Subject: [PATCH 072/153] =?UTF-8?q?1.=20=E5=A2=9E=E5=8A=A0=E4=B8=AD?= =?UTF-8?q?=E6=96=87NER=E7=9B=B8=E5=85=B3=E7=9A=84loader=E5=92=8Cpipe;=202?= =?UTF-8?q?.=20=E5=AF=B9=E5=BA=94=E4=BF=AE=E6=94=B9sequence=5Flabeling?= =?UTF-8?q?=E7=9A=84=E4=BB=A3=E7=A0=81;=203.=E5=A2=9E=E5=8A=A0=E9=83=A8?= =?UTF-8?q?=E5=88=86=E6=B5=8B=E8=AF=95=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/dataset.py | 47 +---- fastNLP/embeddings/bert_embedding.py | 73 +++++-- fastNLP/embeddings/static_embedding.py | 6 +- fastNLP/io/__init__.py | 6 + fastNLP/io/data_bundle.py | 39 +++- fastNLP/io/file_utils.py | 93 +++++---- fastNLP/io/loader/__init__.py | 4 + fastNLP/io/loader/classification.py | 98 ++++------ fastNLP/io/loader/conll.py | 178 +++++++++++++++++- fastNLP/io/pipe/__init__.py | 8 +- fastNLP/io/pipe/conll.py | 165 ++++++++++++---- fastNLP/io/pipe/matching.py | 4 +- fastNLP/io/pipe/utils.py | 38 +++- fastNLP/modules/encoder/bert.py | 3 +- .../chinese_ner/data/ChineseNER.py | 115 ----------- .../chinese_ner/data/__init__.py | 0 .../chinese_ner/train_bert.py | 33 ++-- .../chinese_ner/train_cn_ner.py | 70 +++++-- .../ner/model/lstm_cnn_crf.py | 1 - .../ner/train_cnn_lstm_crf_conll2003.py | 65 ++----- .../seqence_labelling/ner/train_ontonote.py | 51 ++--- test/embeddings/test_bert_embedding.py | 14 ++ test/io/loader/test_conll_loader.py | 21 +++ test/io/pipe/test_conll.py | 12 ++ 24 files changed, 704 insertions(+), 440 deletions(-) delete mode 100644 reproduction/seqence_labelling/chinese_ner/data/ChineseNER.py delete mode 100644 reproduction/seqence_labelling/chinese_ner/data/__init__.py create mode 100644 test/embeddings/test_bert_embedding.py create mode 100644 test/io/loader/test_conll_loader.py create mode 100644 test/io/pipe/test_conll.py diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 0f98ed1f..4c689842 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -613,6 +613,7 @@ class DataSet(object): raise e else: raise KeyError("{} is not a valid field name.".format(name)) + return self def set_input(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True): """ @@ -636,6 +637,7 @@ class DataSet(object): raise e else: raise KeyError("{} is not a valid field name.".format(name)) + return self def set_ignore_type(self, *field_names, flag=True): """ @@ -652,6 +654,7 @@ 
class DataSet(object): self.field_arrays[name].ignore_type = flag else: raise KeyError("{} is not a valid field name.".format(name)) + return self def set_padder(self, field_name, padder): """ @@ -667,6 +670,7 @@ class DataSet(object): if field_name not in self.field_arrays: raise KeyError("There is no field named {}.".format(field_name)) self.field_arrays[field_name].set_padder(padder) + return self def set_pad_val(self, field_name, pad_val): """ @@ -678,6 +682,7 @@ class DataSet(object): if field_name not in self.field_arrays: raise KeyError("There is no field named {}.".format(field_name)) self.field_arrays[field_name].set_pad_val(pad_val) + return self def get_input_name(self): """ @@ -868,48 +873,6 @@ class DataSet(object): return train_set, dev_set - @classmethod - def read_csv(cls, csv_path, headers=None, sep=",", dropna=True): - r""" - .. warning:: - 此方法会在下个版本移除,请使用 :class:`fastNLP.io.CSVLoader` - - 从csv_path路径下以csv的格式读取数据。 - - :param str csv_path: 从哪里读取csv文件 - :param list[str] headers: 如果为None,则使用csv文件的第一行作为header; 如果传入list(str), 则元素的个数必须 - 与csv文件中每行的元素个数相同。 - :param str sep: 分割符 - :param bool dropna: 是否忽略与header数量不一致行。 - :return: 读取后的 :class:`~fastNLP.读取后的DataSet`。 - """ - warnings.warn('DataSet.read_csv is deprecated, use CSVLoader instead', - category=DeprecationWarning) - with open(csv_path, "r", encoding='utf-8') as f: - start_idx = 0 - if headers is None: - headers = f.readline().rstrip('\r\n') - headers = headers.split(sep) - start_idx += 1 - else: - assert isinstance(headers, (list, tuple)), "headers should be list or tuple, not {}.".format( - type(headers)) - _dict = {} - for col in headers: - _dict[col] = [] - for line_idx, line in enumerate(f, start_idx): - contents = line.rstrip('\r\n').split(sep) - if len(contents) != len(headers): - if dropna: - continue - else: - # TODO change error type - raise ValueError("Line {} has {} parts, while header has {} parts." \ - .format(line_idx, len(contents), len(headers))) - for header, content in zip(headers, contents): - _dict[header].append(content) - return cls(_dict) - def save(self, path): """ 保存DataSet. diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index 7a9738fe..cf0b57b0 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -61,6 +61,9 @@ class BertEmbedding(ContextualEmbedding): # 根据model_dir_or_name检查是否存在并下载 if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR: + if 'cn' in model_dir_or_name.lower() and pool_method not in ('first', 'last'): + warnings.warn("For Chinese bert, pooled_method should choose from 'first', 'last' in order to achieve" + " faster speed.") model_url = _get_embedding_url('bert', model_dir_or_name.lower()) model_dir = cached_path(model_url, name='embedding') # 检查是否存在 @@ -91,19 +94,33 @@ class BertEmbedding(ContextualEmbedding): :param torch.LongTensor words: [batch_size, max_len] :return: torch.FloatTensor. 
batch_size x max_len x (768*len(self.layers)) """ - if self._word_sep_index: # 不能drop sep - sep_mask = words.eq(self._word_sep_index) words = self.drop_word(words) - if self._word_sep_index: - words.masked_fill_(sep_mask, self._word_sep_index) outputs = self._get_sent_reprs(words) if outputs is not None: - return self.dropout(words) + return self.dropout(outputs) outputs = self.model(words) outputs = torch.cat([*outputs], dim=-1) return self.dropout(outputs) + def drop_word(self, words): + """ + 按照设定随机将words设置为unknown_index。 + + :param torch.LongTensor words: batch_size x max_len + :return: + """ + if self.word_dropout > 0 and self.training: + with torch.no_grad(): + if self._word_sep_index: # 不能drop sep + sep_mask = words.eq(self._word_sep_index) + mask = torch.ones_like(words).float() * self.word_dropout + mask = torch.bernoulli(mask).byte() # dropout_word越大,越多位置为1 + words = words.masked_fill(mask, self._word_unk_index) + if self._word_sep_index: + words.masked_fill_(sep_mask, self._word_sep_index) + return words + @property def requires_grad(self): """ @@ -134,10 +151,12 @@ class BertWordPieceEncoder(nn.Module): :param str layers: 最终结果中的表示。以','隔开层数,可以以负数去索引倒数几层 :param bool pooled_cls: 返回的句子开头的[CLS]是否使用预训练中的BertPool映射一下,仅在include_cls_sep时有效。如果下游任务只取 [CLS]做预测,一般该值为True。 + :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 + :param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 :param bool requires_grad: 是否需要gradient。 """ - def __init__(self, model_dir_or_name: str='en-base-uncased', layers: str='-1', - pooled_cls: bool = False, requires_grad: bool=False): + def __init__(self, model_dir_or_name: str='en-base-uncased', layers: str='-1', pooled_cls: bool = False, + word_dropout=0, dropout=0, requires_grad: bool=False): super().__init__() if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR: @@ -150,8 +169,12 @@ class BertWordPieceEncoder(nn.Module): raise ValueError(f"Cannot recognize {model_dir_or_name}.") self.model = _WordPieceBertModel(model_dir=model_dir, layers=layers, pooled_cls=pooled_cls) + self._sep_index = self.model._sep_index + self._wordpiece_unk_index = self.model._wordpiece_unknown_index self._embed_size = len(self.model.layers) * self.model.encoder.hidden_size self.requires_grad = requires_grad + self.word_dropout = word_dropout + self.dropout_layer = nn.Dropout(dropout) @property def requires_grad(self): @@ -199,13 +222,41 @@ class BertWordPieceEncoder(nn.Module): 计算words的bert embedding表示。传入的words中应该自行包含[CLS]与[SEP]的tag。 :param words: batch_size x max_len - :param token_type_ids: batch_size x max_len, 用于区分前一句和后一句话 + :param token_type_ids: batch_size x max_len, 用于区分前一句和后一句话. 如果不传入,则自动生成(大部分情况,都不需要输入), + 第一个[SEP]及之前为0, 第二个[SEP]及到第一个[SEP]之间为1; 第三个[SEP]及到第二个[SEP]之间为0,依次往后推。 :return: torch.FloatTensor. 
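
A sketch of the word_dropout behaviour added above: during training a fraction of word indices is replaced with the unknown index as a regulariser, while [SEP] positions are protected by drop_word. This is illustrative only; `vocab` is a placeholder Vocabulary built elsewhere, and the model name assumes the pre-trained weights are available locally or downloadable:

from fastNLP.embeddings import BertEmbedding

embed = BertEmbedding(vocab, model_dir_or_name='en-base-uncased',
                      word_dropout=0.1,   # replace roughly 10% of words with the unknown token while training
                      dropout=0.1)        # ordinary dropout on the output representations
embed.eval()                              # word dropout is only applied in training mode
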
batch_size x max_len x (768*len(self.layers)) """ + with torch.no_grad(): + sep_mask = word_pieces.eq(self._sep_index) # batch_size x max_len + if token_type_ids is None: + sep_mask_cumsum = sep_mask.flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1]) + token_type_ids = sep_mask_cumsum.fmod(2) + if token_type_ids[0, 0].item(): # 如果开头是奇数,则需要flip一下结果,因为需要保证开头为0 + token_type_ids = token_type_ids.eq(0).long() + + word_pieces = self.drop_word(word_pieces) outputs = self.model(word_pieces, token_type_ids) outputs = torch.cat([*outputs], dim=-1) - return outputs + return self.dropout_layer(outputs) + + def drop_word(self, words): + """ + 按照设定随机将words设置为unknown_index。 + + :param torch.LongTensor words: batch_size x max_len + :return: + """ + if self.word_dropout > 0 and self.training: + with torch.no_grad(): + if self._word_sep_index: # 不能drop sep + sep_mask = words.eq(self._wordpiece_unk_index) + mask = torch.ones_like(words).float() * self.word_dropout + mask = torch.bernoulli(mask).byte() # dropout_word越大,越多位置为1 + words = words.masked_fill(mask, self._word_unk_index) + if self._word_sep_index: + words.masked_fill_(sep_mask, self._wordpiece_unk_index) + return words class _WordBertModel(nn.Module): @@ -288,11 +339,11 @@ class _WordBertModel(nn.Module): word_pieces = self.tokenzier.convert_tokens_to_ids(word_pieces) word_to_wordpieces.append(word_pieces) word_pieces_lengths.append(len(word_pieces)) - print("Found(Or seg into word pieces) {} words out of {}.".format(found_count, len(vocab))) self._cls_index = self.tokenzier.vocab['[CLS]'] self._sep_index = self.tokenzier.vocab['[SEP]'] self._word_pad_index = vocab.padding_idx self._wordpiece_pad_index = self.tokenzier.vocab['[PAD]'] # 需要用于生成word_piece + print("Found(Or segment into word pieces) {} words out of {}.".format(found_count, len(vocab))) self.word_to_wordpieces = np.array(word_to_wordpieces) self.word_pieces_lengths = nn.Parameter(torch.LongTensor(word_pieces_lengths), requires_grad=False) print("Successfully generate word pieces.") @@ -339,7 +390,7 @@ class _WordBertModel(nn.Module): sep_mask_cumsum = sep_mask.flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1]) token_type_ids = sep_mask_cumsum.fmod(2) if token_type_ids[0, 0].item(): # 如果开头是奇数,则需要flip一下结果,因为需要保证开头为0 - token_type_ids = token_type_ids.eq(0).float() + token_type_ids = token_type_ids.eq(0).long() else: token_type_ids = torch.zeros_like(word_pieces) # 2. 获取hidden的结果,根据word_pieces进行对应的pool计算 diff --git a/fastNLP/embeddings/static_embedding.py b/fastNLP/embeddings/static_embedding.py index c3d4ede6..ac9611fe 100644 --- a/fastNLP/embeddings/static_embedding.py +++ b/fastNLP/embeddings/static_embedding.py @@ -45,7 +45,7 @@ class StaticEmbedding(TokenEmbedding): :param model_dir_or_name: 可以有两种方式调用预训练好的static embedding:第一种是传入embedding文件夹(文件夹下应该只有一个 以.txt作为后缀的文件)或文件路径;第二种是传入embedding的名称,第二种情况将自动查看缓存中是否存在该模型,没有的话将自动下载。 如果输入为None则使用embedding_dim的维度随机初始化一个embedding。 - :param int embedding_dim: 随机初始化的embedding的维度,仅在model_dir_or_name为None时有效。 + :param int embedding_dim: 随机初始化的embedding的维度,当该值为大于0的值时,将忽略model_dir_or_name。 :param bool requires_grad: 是否需要gradient. 
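
The token_type_ids construction in the bert_embedding.py hunks above can be checked in isolation; below is a self-contained demo of the same flip/cumsum/fmod trick (102 is used here only as a placeholder id for [SEP]):

import torch

sep_index = 102                                                    # placeholder [SEP] id
word_pieces = torch.LongTensor([[101, 7, 8, 102, 9, 10, 102]])     # [CLS] a b [SEP] c d [SEP]
sep_mask = word_pieces.eq(sep_index)
# reversed cumulative count of [SEP]s, taken modulo 2: first segment -> 0, second segment -> 1
sep_mask_cumsum = sep_mask.flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1])
token_type_ids = sep_mask_cumsum.fmod(2)
if token_type_ids[0, 0].item():       # if the sequence would start with 1, swap the 0/1 labels
    token_type_ids = token_type_ids.eq(0).long()
print(token_type_ids)                 # tensor([[0, 0, 0, 0, 1, 1, 1]])
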
默认为True :param callable init_method: 如何初始化没有找到的值。可以使用torch.nn.init.*中各种方法。调用该方法时传入一个tensor对 :param bool lower: 是否将vocab中的词语小写后再和预训练的词表进行匹配。如果你的词表中包含大写的词语,或者就是需要单独 @@ -55,9 +55,11 @@ class StaticEmbedding(TokenEmbedding): :param bool normalize: 是否对vector进行normalize,使得每个vector的norm为1。 :param int min_freq: Vocabulary词频数小于这个数量的word将被指向unk。 """ - def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', embedding_dim=100, requires_grad: bool=True, + def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', embedding_dim=-1, requires_grad: bool=True, init_method=None, lower=False, dropout=0, word_dropout=0, normalize=False, min_freq=1, **kwargs): super(StaticEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) + if embedding_dim>0: + model_dir_or_name = None # 得到cache_path if model_dir_or_name is None: diff --git a/fastNLP/io/__init__.py b/fastNLP/io/__init__.py index f4b9c0cb..f8c55bf5 100644 --- a/fastNLP/io/__init__.py +++ b/fastNLP/io/__init__.py @@ -30,6 +30,9 @@ __all__ = [ 'Conll2003NERLoader', 'OntoNotesNERLoader', 'CTBLoader', + "MsraNERLoader", + "WeiboNERLoader", + "PeopleDailyNERLoader", 'CSVLoader', 'JsonLoader', @@ -50,6 +53,9 @@ __all__ = [ "Conll2003NERPipe", "OntoNotesNERPipe", + "MsraNERPipe", + "PeopleDailyPipe", + "WeiboNERPipe", "MatchingBertPipe", "RTEBertPipe", diff --git a/fastNLP/io/data_bundle.py b/fastNLP/io/data_bundle.py index 6f845511..6bb53914 100644 --- a/fastNLP/io/data_bundle.py +++ b/fastNLP/io/data_bundle.py @@ -133,19 +133,21 @@ class DataBundle: :param ~fastNLP.Vocabulary vocab: 词表 :param str field_name: 这个vocab对应的field名称 - :return: + :return: self """ assert isinstance(vocab, Vocabulary), "Only fastNLP.Vocabulary supports." self.vocabs[field_name] = vocab + return self def set_dataset(self, dataset, name): """ :param ~fastNLP.DataSet dataset: 传递给DataBundle的DataSet :param str name: dataset的名称 - :return: + :return: self """ self.datasets[name] = dataset + return self def get_dataset(self, name:str)->DataSet: """ @@ -165,7 +167,7 @@ class DataBundle: """ return self.vocabs[field_name] - def set_input(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True, ignore_miss_field=True): + def set_input(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True, ignore_miss_dataset=True): """ 将field_names中的field设置为input, 对data_bundle中所有的dataset执行该操作:: @@ -176,18 +178,21 @@ class DataBundle: :param bool flag: 将field_name的input状态设置为flag :param bool use_1st_ins_infer_dim_type: 如果为True,将不会check该列是否所有数据都是同样的维度,同样的类型。将直接使用第一 行的数据进行类型和维度推断本列的数据的类型和维度。 - :param bool ignore_miss_field: 当某个field名称在某个dataset不存在时,如果为True,则直接忽略; 如果为False,则报错 + :param bool ignore_miss_dataset: 当某个field名称在某个dataset不存在时,如果为True,则直接忽略该DataSet; + 如果为False,则报错 + :return self """ for field_name in field_names: for name, dataset in self.datasets.items(): - if not ignore_miss_field and not dataset.has_field(field_name): + if not ignore_miss_dataset and not dataset.has_field(field_name): raise KeyError(f"Field:{field_name} was not found in DataSet:{name}") if not dataset.has_field(field_name): continue else: dataset.set_input(field_name, flag=flag, use_1st_ins_infer_dim_type=use_1st_ins_infer_dim_type) + return self - def set_target(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True, ignore_miss_field=True): + def set_target(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True, ignore_miss_dataset=True): """ 将field_names中的field设置为target, 对data_bundle中所有的dataset执行该操作:: @@ -198,16 +203,34 @@ class DataBundle: :param bool flag: 
将field_name的target状态设置为flag :param bool use_1st_ins_infer_dim_type: 如果为True,将不会check该列是否所有数据都是同样的维度,同样的类型。将直接使用第一 行的数据进行类型和维度推断本列的数据的类型和维度。 - :param bool ignore_miss_field: 当某个field名称在某个dataset不存在时,如果为True,则直接忽略; 如果为False,则报错 + :param bool ignore_miss_dataset: 当某个field名称在某个dataset不存在时,如果为True,则直接忽略; 如果为False,则报错 + :return self """ for field_name in field_names: for name, dataset in self.datasets.items(): - if not ignore_miss_field and not dataset.has_field(field_name): + if not ignore_miss_dataset and not dataset.has_field(field_name): raise KeyError(f"Field:{field_name} was not found in DataSet:{name}") if not dataset.has_field(field_name): continue else: dataset.set_target(field_name, flag=flag, use_1st_ins_infer_dim_type=use_1st_ins_infer_dim_type) + return self + + def copy_field(self, field_name, new_field_name, ignore_miss_dataset=True): + """ + 将DataBundle中所有的field_name复制一份叫new_field_name. + + :param str field_name: + :param str new_field_name: + :param bool ignore_miss_dataset: 若DataBundle中的DataSet的 + :return: self + """ + for name, dataset in self.datasets.items(): + if dataset.has_field(field_name=field_name): + dataset.copy_field(field_name=field_name, new_field_name=new_field_name) + elif ignore_miss_dataset: + raise KeyError(f"{field_name} not found DataSet:{name}.") + return self def __repr__(self): _str = 'In total {} datasets:\n'.format(len(self.datasets)) diff --git a/fastNLP/io/file_utils.py b/fastNLP/io/file_utils.py index 9febfe4a..dbe94633 100644 --- a/fastNLP/io/file_utils.py +++ b/fastNLP/io/file_utils.py @@ -27,6 +27,7 @@ PRETRAINED_BERT_MODEL_DIR = { 'cn': 'bert-chinese-wwm.zip', 'cn-base': 'bert-base-chinese.zip', 'cn-wwm': 'bert-chinese-wwm.zip', + 'cn-wwm-ext': "bert-chinese-wwm-ext.zip" } PRETRAINED_ELMO_MODEL_DIR = { @@ -56,7 +57,7 @@ PRETRAIN_STATIC_FILES = { 'en-fasttext-wiki': "wiki-news-300d-1M.vec.zip", 'en-fasttext-crawl': "crawl-300d-2M.vec.zip", - 'cn': "tencent_cn.txt.zip", + 'cn': "tencent_cn.zip", 'cn-tencent': "tencent_cn.txt.zip", 'cn-fasttext': "cc.zh.300.vec.gz", 'cn-sgns-literature-word': 'sgns.literature.word.txt.zip', @@ -71,7 +72,10 @@ DATASET_DIR = { "qnli": "QNLI.zip", "sst-2": "SST-2.zip", "sst": "SST.zip", - "rte": "RTE.zip" + "rte": "RTE.zip", + "msra-ner": "MSRA_NER.zip", + "peopledaily": "peopledaily.zip", + "weibo-ner": "weibo_NER.zip" } PRETRAIN_MAP = {'elmo': PRETRAINED_ELMO_MODEL_DIR, @@ -320,42 +324,44 @@ def get_from_cache(url: str, cache_dir: Path = None) -> Path: # GET file object req = requests.get(url, stream=True, headers={"User-Agent": "fastNLP"}) if req.status_code == 200: - content_length = req.headers.get("Content-Length") - total = int(content_length) if content_length is not None else None - progress = tqdm(unit="B", total=total, unit_scale=1) - fd, temp_filename = tempfile.mkstemp() - print("%s not found in cache, downloading to %s" % (url, temp_filename)) - - with open(temp_filename, "wb") as temp_file: - for chunk in req.iter_content(chunk_size=1024 * 16): - if chunk: # filter out keep-alive new chunks - progress.update(len(chunk)) - temp_file.write(chunk) - progress.close() - print(f"Finish download from {url}.") - - # 开始解压 - delete_temp_dir = None - if suffix in ('.zip', '.tar.gz'): - uncompress_temp_dir = tempfile.mkdtemp() - delete_temp_dir = uncompress_temp_dir - print(f"Start to uncompress file to {uncompress_temp_dir}") - if suffix == '.zip': - unzip_file(Path(temp_filename), Path(uncompress_temp_dir)) - else: - untar_gz_file(Path(temp_filename), Path(uncompress_temp_dir)) - filenames = 
os.listdir(uncompress_temp_dir) - if len(filenames) == 1: - if os.path.isdir(os.path.join(uncompress_temp_dir, filenames[0])): - uncompress_temp_dir = os.path.join(uncompress_temp_dir, filenames[0]) - - cache_path.mkdir(parents=True, exist_ok=True) - print("Finish un-compressing file.") - else: - uncompress_temp_dir = temp_filename - cache_path = str(cache_path) + suffix success = False + fd, temp_filename = tempfile.mkstemp() + uncompress_temp_dir = None try: + content_length = req.headers.get("Content-Length") + total = int(content_length) if content_length is not None else None + progress = tqdm(unit="B", total=total, unit_scale=1) + print("%s not found in cache, downloading to %s" % (url, temp_filename)) + + with open(temp_filename, "wb") as temp_file: + for chunk in req.iter_content(chunk_size=1024 * 16): + if chunk: # filter out keep-alive new chunks + progress.update(len(chunk)) + temp_file.write(chunk) + progress.close() + print(f"Finish download from {url}") + + # 开始解压 + if suffix in ('.zip', '.tar.gz', '.gz'): + uncompress_temp_dir = tempfile.mkdtemp() + print(f"Start to uncompress file to {uncompress_temp_dir}") + if suffix == '.zip': + unzip_file(Path(temp_filename), Path(uncompress_temp_dir)) + elif suffix == '.gz': + ungzip_file(temp_filename, uncompress_temp_dir, dir_name) + else: + untar_gz_file(Path(temp_filename), Path(uncompress_temp_dir)) + filenames = os.listdir(uncompress_temp_dir) + if len(filenames) == 1: + if os.path.isdir(os.path.join(uncompress_temp_dir, filenames[0])): + uncompress_temp_dir = os.path.join(uncompress_temp_dir, filenames[0]) + + cache_path.mkdir(parents=True, exist_ok=True) + print("Finish un-compressing file.") + else: + uncompress_temp_dir = temp_filename + cache_path = str(cache_path) + suffix + # 复制到指定的位置 print(f"Copy file to {cache_path}") if os.path.isdir(uncompress_temp_dir): @@ -377,10 +383,12 @@ def get_from_cache(url: str, cache_dir: Path = None) -> Path: os.remove(cache_path) else: shutil.rmtree(cache_path) - if delete_temp_dir: - shutil.rmtree(delete_temp_dir) os.close(fd) os.remove(temp_filename) + if os.path.isdir(uncompress_temp_dir): + shutil.rmtree(uncompress_temp_dir) + elif os.path.isfile(uncompress_temp_dir): + os.remove(uncompress_temp_dir) return get_filepath(cache_path) else: raise HTTPError(f"Status code:{req.status_code}. 
Fail to download from {url}.") @@ -402,6 +410,15 @@ def untar_gz_file(file: Path, to: Path): tar.extractall(to) +def ungzip_file(file: str, to: str, filename:str): + import gzip + + g_file = gzip.GzipFile(file) + with open(os.path.join(to, filename), 'wb+') as f: + f.write(g_file.read()) + g_file.close() + + def match_file(dir_name: str, cache_dir: Path) -> str: """ 匹配的原则是: 在cache_dir下的文件与dir_name完全一致, 或除了后缀以外和dir_name完全一致。 diff --git a/fastNLP/io/loader/__init__.py b/fastNLP/io/loader/__init__.py index 1da3e125..820c33be 100644 --- a/fastNLP/io/loader/__init__.py +++ b/fastNLP/io/loader/__init__.py @@ -58,6 +58,9 @@ __all__ = [ 'Conll2003NERLoader', 'OntoNotesNERLoader', 'CTBLoader', + "MsraNERLoader", + "PeopleDailyNERLoader", + "WeiboNERLoader", # 'CSVLoader', # 'JsonLoader', @@ -77,3 +80,4 @@ from .cws import CWSLoader from .json import JsonLoader from .loader import Loader from .matching import MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader +from .conll import MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader diff --git a/fastNLP/io/loader/classification.py b/fastNLP/io/loader/classification.py index ad56101d..67e19773 100644 --- a/fastNLP/io/loader/classification.py +++ b/fastNLP/io/loader/classification.py @@ -6,6 +6,8 @@ import os import random import shutil import numpy as np +import glob +import time class YelpLoader(Loader): @@ -57,7 +59,7 @@ class YelpLoader(Loader): class YelpFullLoader(YelpLoader): - def download(self, dev_ratio: float = 0.1, seed: int = 0): + def download(self, dev_ratio: float = 0.1, re_download:bool=False): """ 自动下载数据集,如果你使用了这个数据集,请引用以下的文章 @@ -68,35 +70,23 @@ class YelpFullLoader(YelpLoader): dev.csv三个文件。 :param float dev_ratio: 如果路径中没有dev集,从train划分多少作为dev的数据. 如果为0,则不划分dev。 - :param int seed: 划分dev时的随机数种子 + :param bool re_download: 是否重新下载数据,以重新切分数据。 :return: str, 数据集的目录地址 """ dataset_name = 'yelp-review-full' data_dir = self._get_dataset_path(dataset_name=dataset_name) - if os.path.exists(os.path.join(data_dir, 'dev.csv')): # 存在dev的话,check是否需要重新下载 - re_download = True - if dev_ratio > 0: - dev_line_count = 0 - tr_line_count = 0 - with open(os.path.join(data_dir, 'train.csv'), 'r', encoding='utf-8') as f1, \ - open(os.path.join(data_dir, 'dev.csv'), 'r', encoding='utf-8') as f2: - for line in f1: - tr_line_count += 1 - for line in f2: - dev_line_count += 1 - if not np.isclose(dev_line_count, dev_ratio * (tr_line_count + dev_line_count), rtol=0.005): - re_download = True - else: - re_download = False - if re_download: - shutil.rmtree(data_dir) - data_dir = self._get_dataset_path(dataset_name=dataset_name) + modify_time = 0 + for filepath in glob.glob(os.path.join(data_dir, '*')): + modify_time = os.stat(filepath).st_mtime + break + if time.time() - modify_time > 1 and re_download: # 通过这种比较丑陋的方式判断一下文件是否是才下载的 + shutil.rmtree(data_dir) + data_dir = self._get_dataset_path(dataset_name=dataset_name) if not os.path.exists(os.path.join(data_dir, 'dev.csv')): if dev_ratio > 0: assert 0 < dev_ratio < 1, "dev_ratio should be in range (0,1)." - random.seed(int(seed)) try: with open(os.path.join(data_dir, 'train.csv'), 'r', encoding='utf-8') as f, \ open(os.path.join(data_dir, 'middle_file.csv'), 'w', encoding='utf-8') as f1, \ @@ -116,44 +106,32 @@ class YelpFullLoader(YelpLoader): class YelpPolarityLoader(YelpLoader): - def download(self, dev_ratio: float = 0.1, seed: int = 0): + def download(self, dev_ratio: float = 0.1, re_download=False): """ 自动下载数据集,如果你使用了这个数据集,请引用以下的文章 Xiang Zhang, Junbo Zhao, Yann LeCun. 
Character-level Convolutional Networks for Text Classification. Advances in Neural Information Processing Systems 28 (NIPS 2015) - 根据dev_ratio的值随机将train中的数据取出一部分作为dev数据。下载完成后从train中切分0.1作为dev + 根据dev_ratio的值随机将train中的数据取出一部分作为dev数据。下载完成后从train中切分dev_ratio这么多作为dev - :param float dev_ratio: 如果路径中不存在dev.csv, 从train划分多少作为dev的数据. 如果为0,则不划分dev - :param int seed: 划分dev时的随机数种子 + :param float dev_ratio: 如果路径中不存在dev.csv, 从train划分多少作为dev的数据。 如果为0,则不划分dev。 + :param bool re_download: 是否重新下载数据,以重新切分数据。 :return: str, 数据集的目录地址 """ dataset_name = 'yelp-review-polarity' data_dir = self._get_dataset_path(dataset_name=dataset_name) - if os.path.exists(os.path.join(data_dir, 'dev.csv')): # 存在dev的话,check是否符合比例要求 - re_download = True - if dev_ratio > 0: - dev_line_count = 0 - tr_line_count = 0 - with open(os.path.join(data_dir, 'train.csv'), 'r', encoding='utf-8') as f1, \ - open(os.path.join(data_dir, 'dev.csv'), 'r', encoding='utf-8') as f2: - for line in f1: - tr_line_count += 1 - for line in f2: - dev_line_count += 1 - if not np.isclose(dev_line_count, dev_ratio * (tr_line_count + dev_line_count), rtol=0.005): - re_download = True - else: - re_download = False - if re_download: - shutil.rmtree(data_dir) - data_dir = self._get_dataset_path(dataset_name=dataset_name) - + modify_time = 0 + for filepath in glob.glob(os.path.join(data_dir, '*')): + modify_time = os.stat(filepath).st_mtime + break + if time.time() - modify_time > 1 and re_download: # 通过这种比较丑陋的方式判断一下文件是否是才下载的 + shutil.rmtree(data_dir) + data_dir = self._get_dataset_path(dataset_name=dataset_name) + if not os.path.exists(os.path.join(data_dir, 'dev.csv')): if dev_ratio > 0: assert 0 < dev_ratio < 1, "dev_ratio should be in range (0,1)." - random.seed(int(seed)) try: with open(os.path.join(data_dir, 'train.csv'), 'r', encoding='utf-8') as f, \ open(os.path.join(data_dir, 'middle_file.csv'), 'w', encoding='utf-8') as f1, \ @@ -209,7 +187,7 @@ class IMDBLoader(Loader): return dataset - def download(self, dev_ratio: float = 0.1, seed: int = 0): + def download(self, dev_ratio: float = 0.1, re_download=False): """ 自动下载数据集,如果你使用了这个数据集,请引用以下的文章 @@ -218,34 +196,22 @@ class IMDBLoader(Loader): 根据dev_ratio的值随机将train中的数据取出一部分作为dev数据。下载完成后从train中切分0.1作为dev :param float dev_ratio: 如果路径中没有dev.txt。从train划分多少作为dev的数据. 
如果为0,则不划分dev - :param int seed: 划分dev时的随机数种子 + :param bool re_download: 是否重新下载数据,以重新切分数据。 :return: str, 数据集的目录地址 """ dataset_name = 'aclImdb' data_dir = self._get_dataset_path(dataset_name=dataset_name) - if os.path.exists(os.path.join(data_dir, 'dev.txt')): # 存在dev的话,check是否符合比例要求 - re_download = True - if dev_ratio > 0: - dev_line_count = 0 - tr_line_count = 0 - with open(os.path.join(data_dir, 'train.txt'), 'r', encoding='utf-8') as f1, \ - open(os.path.join(data_dir, 'dev.txt'), 'r', encoding='utf-8') as f2: - for line in f1: - tr_line_count += 1 - for line in f2: - dev_line_count += 1 - if not np.isclose(dev_line_count, dev_ratio * (tr_line_count + dev_line_count), rtol=0.005): - re_download = True - else: - re_download = False - if re_download: - shutil.rmtree(data_dir) - data_dir = self._get_dataset_path(dataset_name=dataset_name) + modify_time = 0 + for filepath in glob.glob(os.path.join(data_dir, '*')): + modify_time = os.stat(filepath).st_mtime + break + if time.time() - modify_time > 1 and re_download: # 通过这种比较丑陋的方式判断一下文件是否是才下载的 + shutil.rmtree(data_dir) + data_dir = self._get_dataset_path(dataset_name=dataset_name) if not os.path.exists(os.path.join(data_dir, 'dev.csv')): if dev_ratio > 0: assert 0 < dev_ratio < 1, "dev_ratio should be in range (0,1)." - random.seed(int(seed)) try: with open(os.path.join(data_dir, 'train.txt'), 'r', encoding='utf-8') as f, \ open(os.path.join(data_dir, 'middle_file.txt'), 'w', encoding='utf-8') as f1, \ diff --git a/fastNLP/io/loader/conll.py b/fastNLP/io/loader/conll.py index b2c89ecc..5dc4c6d7 100644 --- a/fastNLP/io/loader/conll.py +++ b/fastNLP/io/loader/conll.py @@ -4,10 +4,12 @@ from .loader import Loader from ...core.dataset import DataSet from ..file_reader import _read_conll from ...core.instance import Instance -from .. import DataBundle -from ..utils import check_loader_paths from ...core.const import Const - +import glob +import os +import shutil +import time +import random class ConllLoader(Loader): """ @@ -262,3 +264,173 @@ class CTBLoader(Loader): def _load(self, path:str): pass + + +class CNNERLoader(Loader): + def _load(self, path:str): + """ + 支持加载形如以下格式的内容,一行两列,以空格隔开两个sample + + Example:: + + 我 O + 们 O + 变 O + 而 O + 以 O + 书 O + 会 O + ... + + :param str path: 文件路径 + :return: DataSet,包含raw_words列和target列 + """ + ds = DataSet() + with open(path, 'r', encoding='utf-8') as f: + raw_chars = [] + target = [] + for line in f: + line = line.strip() + if line: + parts = line.split() + if len(parts) == 1: # 网上下载的数据有一些列少tag,默认补充O + parts.append('O') + raw_chars.append(parts[0]) + target.append(parts[1]) + else: + if raw_chars: + ds.append(Instance(raw_chars=raw_chars, target=target)) + raw_chars = [] + target = [] + return ds + + +class MsraNERLoader(CNNERLoader): + """ + 读取MSRA-NER数据,数据中的格式应该类似与下列的内容 + + Example:: + + 我 O + 们 O + 变 O + 而 O + 以 O + 书 O + 会 O + ... + + 读取后的DataSet包含以下的field + + .. csv-table:: target列是基于BIO的编码方式 + :header: "raw_chars", "target" + + "[我, 们, 变...]", "[O, O, ...]" + "[中, 共, 中, ...]", "[B-ORG, I-ORG, I-ORG, ...]" + "[...]", "[...]" + + """ + def __init__(self): + super().__init__() + + def download(self, dev_ratio:float=0.1, re_download:bool=False)->str: + """ + 自动下载MSAR-NER的数据,如果你使用该数据,请引用 Gina-Anne Levow, 2006, The Third International Chinese Language + Processing Bakeoff: Word Segmentation and Named Entity Recognition. + + 根据dev_ratio的值随机将train中的数据取出一部分作为dev数据。下载完成后在output_dir中有train.conll, test.conll, + dev.conll三个文件。 + + :param float dev_ratio: 如果路径中没有dev集,从train划分多少作为dev的数据. 
如果为0,则不划分dev。 + :param bool re_download: 是否重新下载数据,以重新切分数据。 + :return: str, 数据集的目录地址 + :return: + """ + dataset_name = 'msra-ner' + data_dir = self._get_dataset_path(dataset_name=dataset_name) + modify_time = 0 + for filepath in glob.glob(os.path.join(data_dir, '*')): + modify_time = os.stat(filepath).st_mtime + break + if time.time() - modify_time > 1 and re_download: # 通过这种比较丑陋的方式判断一下文件是否是才下载的 + shutil.rmtree(data_dir) + data_dir = self._get_dataset_path(dataset_name=dataset_name) + + if not os.path.exists(os.path.join(data_dir, 'dev.conll')): + if dev_ratio > 0: + assert 0 < dev_ratio < 1, "dev_ratio should be in range (0,1)." + try: + with open(os.path.join(data_dir, 'train.conll'), 'r', encoding='utf-8') as f, \ + open(os.path.join(data_dir, 'middle_file.conll'), 'w', encoding='utf-8') as f1, \ + open(os.path.join(data_dir, 'dev.conll'), 'w', encoding='utf-8') as f2: + lines = [] # 一个sample包含很多行 + for line in f: + line = line.strip() + if line: + lines.append(line) + else: + if random.random() < dev_ratio: + f2.write('\n'.join(lines) + '\n\n') + else: + f1.write('\n'.join(lines) + '\n\n') + lines.clear() + os.remove(os.path.join(data_dir, 'train.conll')) + os.renames(os.path.join(data_dir, 'middle_file.conll'), os.path.join(data_dir, 'train.conll')) + finally: + if os.path.exists(os.path.join(data_dir, 'middle_file.conll')): + os.remove(os.path.join(data_dir, 'middle_file.conll')) + + return data_dir + + +class WeiboNERLoader(CNNERLoader): + def __init__(self): + super().__init__() + + def download(self)->str: + """ + 自动下载Weibo-NER的数据,如果你使用了该数据,请引用 Nanyun Peng and Mark Dredze, 2015, Named Entity Recognition for + Chinese Social Media with Jointly Trained Embeddings. + + :return: str + """ + dataset_name = 'weibo-ner' + data_dir = self._get_dataset_path(dataset_name=dataset_name) + + return data_dir + + +class PeopleDailyNERLoader(CNNERLoader): + """ + 支持加载的数据格式如下 + + Example:: + + 当 O + 希 O + 望 O + 工 O + 程 O + 救 O + 助 O + 的 O + 百 O + + 读取后的DataSet包含以下的field + + .. 
csv-table:: target列是基于BIO的编码方式 + :header: "raw_chars", "target" + + "[我, 们, 变...]", "[O, O, ...]" + "[中, 共, 中, ...]", "[B-ORG, I-ORG, I-ORG, ...]" + "[...]", "[...]" + + """ + def __init__(self): + super().__init__() + + def download(self) -> str: + dataset_name = 'peopledaily' + data_dir = self._get_dataset_path(dataset_name=dataset_name) + + return data_dir diff --git a/fastNLP/io/pipe/__init__.py b/fastNLP/io/pipe/__init__.py index ad68f486..9ffb9ed6 100644 --- a/fastNLP/io/pipe/__init__.py +++ b/fastNLP/io/pipe/__init__.py @@ -8,6 +8,8 @@ Pipe用于处理通过 Loader 读取的数据,所有的 Pipe 都包含 ``proce """ __all__ = [ + "Pipe", + "YelpFullPipe", "YelpPolarityPipe", "SSTPipe", @@ -16,6 +18,9 @@ __all__ = [ "Conll2003NERPipe", "OntoNotesNERPipe", + "MsraNERPipe", + "WeiboNERPipe", + "PeopleDailyPipe", "MatchingBertPipe", "RTEBertPipe", @@ -32,6 +37,7 @@ __all__ = [ ] from .classification import YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe -from .conll import Conll2003NERPipe, OntoNotesNERPipe +from .conll import Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, WeiboNERPipe, PeopleDailyPipe from .matching import MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, \ MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe +from .pipe import Pipe diff --git a/fastNLP/io/pipe/conll.py b/fastNLP/io/pipe/conll.py index 7d55dd29..fb599340 100644 --- a/fastNLP/io/pipe/conll.py +++ b/fastNLP/io/pipe/conll.py @@ -4,6 +4,8 @@ from .utils import iob2, iob2bioes from ...core.const import Const from ..loader.conll import Conll2003NERLoader, OntoNotesNERLoader from .utils import _indexize, _add_words_field +from .utils import _add_chars_field +from ..loader.conll import PeopleDailyNERLoader, WeiboNERLoader, MsraNERLoader class _NERPipe(Pipe): @@ -17,7 +19,7 @@ class _NERPipe(Pipe): :param: str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。 :param bool lower: 是否将words小写化后再建立词表,绝大多数情况都不需要设置为True。 - :param int target_pad_val: target的padding值,target这一列pad的位置值为target_pad_val。默认为-100。 + :param int target_pad_val: target的padding值,target这一列pad的位置值为target_pad_val。默认为0。 """ def __init__(self, encoding_type: str = 'bio', lower: bool = False, target_pad_val=0): @@ -32,31 +34,16 @@ class _NERPipe(Pipe): """ 支持的DataSet的field为 - .. csv-table:: Following is a demo layout of DataSet returned by Conll2003Loader + .. csv-table:: :header: "raw_words", "target" "[Nadim, Ladki]", "[B-PER, I-PER]" "[AL-AIN, United, Arab, ...]", "[B-LOC, B-LOC, I-LOC, ...]" "[...]", "[...]" - :param DataBundle data_bundle: 传入的DataBundle中的DataSet必须包含raw_words和ner两个field,且两个field的内容均为List[str]。 在传入DataBundle基础上原位修改。 :return: DataBundle - - Example:: - - data_bundle = Conll2003Loader().load('/path/to/conll2003/') - data_bundle = Conll2003NERPipe().process(data_bundle) - - # 获取train - tr_data = data_bundle.get_dataset('train') - - # 获取target这个field的词表 - target_vocab = data_bundle.get_vocab('target') - # 获取words这个field的词表 - word_vocab = data_bundle.get_vocab('words') - """ # 转换tag for name, dataset in data_bundle.datasets.items(): @@ -79,18 +66,6 @@ class _NERPipe(Pipe): return data_bundle - def process_from_file(self, paths) -> DataBundle: - """ - - :param paths: 支持路径类型参见 :class:`fastNLP.io.loader.ConllLoader` 的load函数。 - :return: DataBundle - """ - # 读取数据 - data_bundle = Conll2003NERLoader().load(paths) - data_bundle = self.process(data_bundle) - - return data_bundle - class Conll2003NERPipe(_NERPipe): """ @@ -102,8 +77,8 @@ class Conll2003NERPipe(_NERPipe): .. 
csv-table:: Following is a demo layout of DataSet returned by Conll2003Loader :header: "raw_words", "words", "target", "seq_len" - "[Nadim, Ladki]", "[1, 2]", "[1, 2]", 2 - "[AL-AIN, United, Arab, ...]", "[3, 4, 5,...]", "[3, 4]", 10 + "[Nadim, Ladki]", "[2, 3]", "[1, 2]", 2 + "[AL-AIN, United, Arab, ...]", "[4, 5, 6,...]", "[3, 4]", 6 "[...]", "[...]", "[...]", . raw_words列为List[str], 是未转换的原始数据; words列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 @@ -134,10 +109,13 @@ class OntoNotesNERPipe(_NERPipe): .. csv-table:: Following is a demo layout of DataSet returned by Conll2003Loader :header: "raw_words", "words", "target", "seq_len" - "[Nadim, Ladki]", "[1, 2]", "[1, 2]", 2 - "[AL-AIN, United, Arab, ...]", "[3, 4, 5,...]", "[3, 4]", 6 + "[Nadim, Ladki]", "[2, 3]", "[1, 2]", 2 + "[AL-AIN, United, Arab, ...]", "[4, 5, 6,...]", "[3, 4]", 6 "[...]", "[...]", "[...]", . + raw_words列为List[str], 是未转换的原始数据; words列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 + target。返回的DataSet中被设置为input有words, target, seq_len; 设置为target有target。 + :param: str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。 :param bool lower: 是否将words小写化后再建立词表,绝大多数情况都不需要设置为True。 :param int target_pad_val: target的padding值,target这一列pad的位置值为target_pad_val。默认为0。 @@ -146,3 +124,124 @@ class OntoNotesNERPipe(_NERPipe): def process_from_file(self, paths): data_bundle = OntoNotesNERLoader().load(paths) return self.process(data_bundle) + + +class _CNNERPipe(Pipe): + """ + 中文NER任务的处理Pipe, 该Pipe会(1)复制raw_chars列,并命名为chars; (2)在chars, target列建立词表 + (创建 :class:`fastNLP.Vocabulary` 对象,所以在返回的DataBundle中将有两个Vocabulary); (3)将chars,target列根据相应的 + Vocabulary转换为index。 + + raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 + target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target, seq_len。 + + :param: str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。 + :param int target_pad_val: target的padding值,target这一列pad的位置值为target_pad_val。默认为0。 + """ + + def __init__(self, encoding_type: str = 'bio', target_pad_val=0): + if encoding_type == 'bio': + self.convert_tag = iob2 + else: + self.convert_tag = lambda words: iob2bioes(iob2(words)) + self.target_pad_val = int(target_pad_val) + + def process(self, data_bundle: DataBundle) -> DataBundle: + """ + 支持的DataSet的field为 + + .. 
csv-table:: + :header: "raw_chars", "target" + + "[相, 比, 之, 下,...]", "[O, O, O, O, ...]" + "[青, 岛, 海, 牛, 队, 和, ...]", "[B-ORG, I-ORG, I-ORG, ...]" + "[...]", "[...]" + + raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 + target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。 + + :param DataBundle data_bundle: 传入的DataBundle中的DataSet必须包含raw_words和ner两个field,且两个field的内容均为List[str]。 + 在传入DataBundle基础上原位修改。 + :return: DataBundle + """ + # 转换tag + for name, dataset in data_bundle.datasets.items(): + dataset.apply_field(self.convert_tag, field_name=Const.TARGET, new_field_name=Const.TARGET) + + _add_chars_field(data_bundle, lower=False) + + # index + _indexize(data_bundle, input_field_name=Const.CHAR_INPUT, target_field_name=Const.TARGET) + + input_fields = [Const.TARGET, Const.CHAR_INPUT, Const.INPUT_LEN] + target_fields = [Const.TARGET, Const.INPUT_LEN] + + for name, dataset in data_bundle.datasets.items(): + dataset.set_pad_val(Const.TARGET, self.target_pad_val) + dataset.add_seq_len(Const.CHAR_INPUT) + + data_bundle.set_input(*input_fields) + data_bundle.set_target(*target_fields) + + return data_bundle + + +class MsraNERPipe(_CNNERPipe): + """ + 处理MSRA-NER的数据,处理之后的DataSet的field情况为 + + .. csv-table:: + :header: "raw_chars", "chars", "target", "seq_len" + + "[相, 比, 之, 下,...]", "[2, 3, 4, 5, ...]", "[0, 0, 0, 0, ...]", 11 + "[青, 岛, 海, 牛, 队, 和, ...]", "[10, 21, ....]", "[1, 2, 3, ...]", 21 + "[...]", "[...]", "[...]", . + + raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 + target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。 + + """ + def process_from_file(self, paths=None) -> DataBundle: + data_bundle = MsraNERLoader().load(paths) + return self.process(data_bundle) + + +class PeopleDailyPipe(_CNNERPipe): + """ + 处理people daily的ner的数据,处理之后的DataSet的field情况为 + + .. csv-table:: + :header: "raw_chars", "chars", "target", "seq_len" + + "[相, 比, 之, 下,...]", "[2, 3, 4, 5, ...]", "[0, 0, 0, 0, ...]", 11 + "[青, 岛, 海, 牛, 队, 和, ...]", "[10, 21, ....]", "[1, 2, 3, ...]", 21 + "[...]", "[...]", "[...]", . + + raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 + target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。 + """ + def process_from_file(self, paths=None) -> DataBundle: + data_bundle = PeopleDailyNERLoader().load(paths) + return self.process(data_bundle) + + +class WeiboNERPipe(_CNNERPipe): + """ + 处理weibo的ner的数据,处理之后的DataSet的field情况为 + + .. csv-table:: + :header: "raw_chars", "chars", "target", "seq_len" + + "[相, 比, 之, 下,...]", "[2, 3, 4, 5, ...]", "[0, 0, 0, 0, ...]", 11 + "[青, 岛, 海, 牛, 队, 和, ...]", "[10, 21, ....]", "[1, 2, 3, ...]", 21 + "[...]", "[...]", "[...]", . 
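    A minimal usage sketch that produces the layout above (calling process_from_file()
    with no path is assumed to fall back to WeiboNERLoader's automatic download, as the
    tests added later in this patch series do):

    Example::

        from fastNLP.io import WeiboNERPipe

        data_bundle = WeiboNERPipe(encoding_type='bio').process_from_file()
        print(data_bundle.get_dataset('train'))   # chars/target are already indexed
        print(data_bundle.get_vocab('target'))    # tag vocabulary built on the train split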
+ + raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 + target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。 + + :param: str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。 + :param int target_pad_val: target的padding值,target这一列pad的位置值为target_pad_val。默认为0。 + """ + def process_from_file(self, paths=None) -> DataBundle: + data_bundle = WeiboNERLoader().load(paths) + return self.process(data_bundle) diff --git a/fastNLP/io/pipe/matching.py b/fastNLP/io/pipe/matching.py index 9f7c7d68..474865c6 100644 --- a/fastNLP/io/pipe/matching.py +++ b/fastNLP/io/pipe/matching.py @@ -50,8 +50,8 @@ class MatchingBertPipe(Pipe): dataset.drop(lambda x: x[Const.TARGET] == '-') for name, dataset in data_bundle.datasets.items(): - dataset.copy_field(Const.RAW_WORDS(0), Const.INPUTS(0)) - dataset.copy_field(Const.RAW_WORDS(1), Const.INPUTS(1)) + dataset.copy_field(Const.RAW_WORDS(0), Const.INPUTS(0), ) + dataset.copy_field(Const.RAW_WORDS(1), Const.INPUTS(1), ) if self.lower: for name, dataset in data_bundle.datasets.items(): diff --git a/fastNLP/io/pipe/utils.py b/fastNLP/io/pipe/utils.py index 48454b67..7d011446 100644 --- a/fastNLP/io/pipe/utils.py +++ b/fastNLP/io/pipe/utils.py @@ -76,25 +76,27 @@ def _raw_split(sent): return sent.split() -def _indexize(data_bundle): +def _indexize(data_bundle, input_field_name=Const.INPUT, target_field_name=Const.TARGET): """ - 在dataset中的"words"列建立词表,"target"列建立词表,并把词表加入到data_bundle中。 + 在dataset中的field_name列建立词表,Const.TARGET列建立词表,并把词表加入到data_bundle中。 :param data_bundle: + :param: str input_field_name: + :param: str target_field_name: 这一列的vocabulary没有unknown和padding :return: """ src_vocab = Vocabulary() - src_vocab.from_dataset(data_bundle.datasets['train'], field_name=Const.INPUT, + src_vocab.from_dataset(data_bundle.datasets['train'], field_name=input_field_name, no_create_entry_dataset=[dataset for name, dataset in data_bundle.datasets.items() if name != 'train']) - src_vocab.index_dataset(*data_bundle.datasets.values(), field_name=Const.INPUT) + src_vocab.index_dataset(*data_bundle.datasets.values(), field_name=input_field_name) tgt_vocab = Vocabulary(unknown=None, padding=None) - tgt_vocab.from_dataset(data_bundle.datasets['train'], field_name=Const.TARGET) - tgt_vocab.index_dataset(*data_bundle.datasets.values(), field_name=Const.TARGET) + tgt_vocab.from_dataset(data_bundle.datasets['train'], field_name=target_field_name) + tgt_vocab.index_dataset(*data_bundle.datasets.values(), field_name=target_field_name) - data_bundle.set_vocab(src_vocab, Const.INPUT) - data_bundle.set_vocab(tgt_vocab, Const.TARGET) + data_bundle.set_vocab(src_vocab, input_field_name) + data_bundle.set_vocab(tgt_vocab, target_field_name) return data_bundle @@ -107,14 +109,30 @@ def _add_words_field(data_bundle, lower=False): :param bool lower:是否要小写化 :return: 传入的DataBundle """ - for name, dataset in data_bundle.datasets.items(): - dataset.copy_field(field_name=Const.RAW_WORD, new_field_name=Const.INPUT) + data_bundle.copy_field(field_name=Const.RAW_WORD, new_field_name=Const.INPUT, ignore_miss_dataset=True) if lower: for name, dataset in data_bundle.datasets.items(): dataset[Const.INPUT].lower() return data_bundle + +def _add_chars_field(data_bundle, lower=False): + """ + 给data_bundle中的dataset中复制一列chars. 
并根据lower参数判断是否需要小写化 + + :param data_bundle: + :param bool lower:是否要小写化 + :return: 传入的DataBundle + """ + data_bundle.copy_field(field_name=Const.RAW_CHAR, new_field_name=Const.CHAR_INPUT, ignore_miss_dataset=True) + + if lower: + for name, dataset in data_bundle.datasets.items(): + dataset[Const.CHAR_INPUT].lower() + return data_bundle + + def _drop_empty_instance(data_bundle, field_name): """ 删除data_bundle的DataSet中存在的某个field为空的情况 diff --git a/fastNLP/modules/encoder/bert.py b/fastNLP/modules/encoder/bert.py index e73b2c40..ffc43863 100644 --- a/fastNLP/modules/encoder/bert.py +++ b/fastNLP/modules/encoder/bert.py @@ -868,6 +868,7 @@ class _WordPieceBertModel(nn.Module): self._cls_index = self.tokenzier.vocab['[CLS]'] self._sep_index = self.tokenzier.vocab['[SEP]'] + self._wordpiece_unknown_index = self.tokenzier.vocab['[UNK]'] self._wordpiece_pad_index = self.tokenzier.vocab['[PAD]'] # 需要用于生成word_piece self.pooled_cls = pooled_cls @@ -919,7 +920,7 @@ class _WordPieceBertModel(nn.Module): outputs = bert_outputs[0].new_zeros((len(self.layers), batch_size, max_len, bert_outputs[0].size(-1))) for l_index, l in enumerate(self.layers): bert_output = bert_outputs[l] - if l==len(bert_outputs) and self.pooled_cls: + if l in (len(bert_outputs)-1, -1) and self.pooled_cls: bert_output[:, 0] = pooled_cls outputs[l_index] = bert_output return outputs diff --git a/reproduction/seqence_labelling/chinese_ner/data/ChineseNER.py b/reproduction/seqence_labelling/chinese_ner/data/ChineseNER.py deleted file mode 100644 index a2ee4663..00000000 --- a/reproduction/seqence_labelling/chinese_ner/data/ChineseNER.py +++ /dev/null @@ -1,115 +0,0 @@ - - -from fastNLP.io.data_bundle import DataSetLoader, DataBundle -from fastNLP.io import ConllLoader -from reproduction.seqence_labelling.ner.data.utils import iob2bioes, iob2 -from fastNLP import Const -from reproduction.utils import check_dataloader_paths -from fastNLP import Vocabulary - -class ChineseNERLoader(DataSetLoader): - """ - 读取中文命名实体数据集,包括PeopleDaily, MSRA-NER, Weibo。数据在这里可以找到https://github.com/OYE93/Chinese-NLP-Corpus/tree/master/NER - 请确保输入数据的格式如下, 共两列,第一列为字,第二列为标签,不同句子以空行隔开 - 我 O - 们 O - 变 O - 而 O - 以 O - 书 O - 会 O - ... 
- - """ - def __init__(self, encoding_type:str='bioes'): - """ - - :param str encoding_type: 支持bio和bioes格式 - """ - super().__init__() - self._loader = ConllLoader(headers=['raw_chars', 'target'], indexes=[0, 1]) - - assert encoding_type in ('bio', 'bioes') - - self._tag_converters = [iob2] - if encoding_type == 'bioes': - self._tag_converters.append(iob2bioes) - - def load(self, path:str): - dataset = self._loader.load(path) - def convert_tag_schema(tags): - for converter in self._tag_converters: - tags = converter(tags) - return tags - if self._tag_converters: - dataset.apply_field(convert_tag_schema, field_name=Const.TARGET, new_field_name=Const.TARGET) - return dataset - - def process(self, paths, bigrams=False, trigrams=False): - """ - - :param paths: - :param bool, bigrams: 是否包含生成bigram feature, [a, b, c, d] -> [ab, bc, cd, d] - :param bool, trigrams: 是否包含trigram feature,[a, b, c, d] -> [abc, bcd, cd, d] - :return: ~fastNLP.io.DataBundle - 包含以下的fields - raw_chars: List[str] - chars: List[int] - seq_len: int, 字的长度 - bigrams: List[int], optional - trigrams: List[int], optional - target: List[int] - """ - paths = check_dataloader_paths(paths) - data = DataBundle() - input_fields = [Const.CHAR_INPUT, Const.INPUT_LEN, Const.TARGET] - target_fields = [Const.TARGET, Const.INPUT_LEN] - - for name, path in paths.items(): - dataset = self.load(path) - if bigrams: - dataset.apply_field(lambda raw_chars: [c1+c2 for c1, c2 in zip(raw_chars, raw_chars[1:]+[''])], - field_name='raw_chars', new_field_name='bigrams') - - if trigrams: - dataset.apply_field(lambda raw_chars: [c1+c2+c3 for c1, c2, c3 in zip(raw_chars, - raw_chars[1:]+[''], - raw_chars[2:]+['']*2)], - field_name='raw_chars', new_field_name='trigrams') - data.datasets[name] = dataset - - char_vocab = Vocabulary().from_dataset(data.datasets['train'], field_name='raw_chars', - no_create_entry_dataset=[dataset for name, dataset in data.datasets.items() if name!='train']) - char_vocab.index_dataset(*data.datasets.values(), field_name='raw_chars', new_field_name=Const.CHAR_INPUT) - data.vocabs[Const.CHAR_INPUT] = char_vocab - - target_vocab = Vocabulary(unknown=None, padding=None).from_dataset(data.datasets['train'], field_name=Const.TARGET) - target_vocab.index_dataset(*data.datasets.values(), field_name=Const.TARGET) - data.vocabs[Const.TARGET] = target_vocab - - if bigrams: - bigram_vocab = Vocabulary().from_dataset(data.datasets['train'], field_name='bigrams', - no_create_entry_dataset=[dataset for name, dataset in - data.datasets.items() if name != 'train']) - bigram_vocab.index_dataset(*data.datasets.values(), field_name='bigrams', new_field_name='bigrams') - data.vocabs['bigrams'] = bigram_vocab - input_fields.append('bigrams') - - if trigrams: - trigram_vocab = Vocabulary().from_dataset(data.datasets['train'], field_name='trigrams', - no_create_entry_dataset=[dataset for name, dataset in - data.datasets.items() if name != 'train']) - trigram_vocab.index_dataset(*data.datasets.values(), field_name='trigrams', new_field_name='trigrams') - data.vocabs['trigrams'] = trigram_vocab - input_fields.append('trigrams') - - for name, dataset in data.datasets.items(): - dataset.add_seq_len(Const.CHAR_INPUT) - dataset.set_input(*input_fields) - dataset.set_target(*target_fields) - - return data - - - - diff --git a/reproduction/seqence_labelling/chinese_ner/data/__init__.py b/reproduction/seqence_labelling/chinese_ner/data/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git 
a/reproduction/seqence_labelling/chinese_ner/train_bert.py b/reproduction/seqence_labelling/chinese_ner/train_bert.py index a34b7d01..b12c8f75 100644 --- a/reproduction/seqence_labelling/chinese_ner/train_bert.py +++ b/reproduction/seqence_labelling/chinese_ner/train_bert.py @@ -12,22 +12,23 @@ sys.path.append('../../../') from torch import nn from fastNLP.embeddings import BertEmbedding, Embedding -from reproduction.seqence_labelling.chinese_ner.data.ChineseNER import ChineseNERLoader from fastNLP import Trainer, Const from fastNLP import BucketSampler, SpanFPreRecMetric, GradientClipCallback from fastNLP.modules import MLP from fastNLP.core.callback import WarmupCallback from fastNLP import CrossEntropyLoss from fastNLP.core.optimizer import AdamW -import os +from fastNLP.io import MsraNERPipe, MsraNERLoader, WeiboNERPipe from fastNLP import cache_results encoding_type = 'bio' -@cache_results('caches/msra.pkl') +@cache_results('caches/weibo.pkl', _refresh=False) def get_data(): - data = ChineseNERLoader(encoding_type=encoding_type).process("MSRA/") + # data_dir = MsraNERLoader().download(dev_ratio=0) + # data = MsraNERPipe(encoding_type=encoding_type, target_pad_val=-100).process_from_file(data_dir) + data = WeiboNERPipe(encoding_type=encoding_type).process_from_file() return data data = get_data() print(data) @@ -35,10 +36,10 @@ print(data) class BertCNNER(nn.Module): def __init__(self, embed, tag_size): super().__init__() - - self.embedding = Embedding(embed, dropout=0.1) + self.embedding = embed self.tag_size = tag_size self.mlp = MLP(size_layer=[self.embedding.embedding_dim, tag_size]) + def forward(self, chars): # batch_size, max_len = words.size() chars = self.embedding(chars) @@ -46,11 +47,15 @@ class BertCNNER(nn.Module): return {Const.OUTPUT: outputs} -embed = BertEmbedding(data.vocabs[Const.CHAR_INPUT], model_dir_or_name='en-base', - pool_method='max', requires_grad=True, layers='11') + def predict(self, chars): + # batch_size, max_len = words.size() + chars = self.embedding(chars) + outputs = self.mlp(chars) -for name, dataset in data.datasets.items(): - dataset.set_pad_val(Const.TARGET, -100) + return {Const.OUTPUT: outputs} + +embed = BertEmbedding(data.get_vocab(Const.CHAR_INPUT), model_dir_or_name='cn-wwm-ext', + pool_method='first', requires_grad=True, layers='11', include_cls_sep=False, dropout=0.5) callbacks = [ GradientClipCallback(clip_type='norm', clip_value=1), @@ -58,7 +63,7 @@ callbacks = [ ] model = BertCNNER(embed, len(data.vocabs[Const.TARGET])) -optimizer = AdamW(model.parameters(), lr=1e-4) +optimizer = AdamW(model.parameters(), lr=3e-5) for name, dataset in data.datasets.items(): original_len = len(dataset) @@ -66,13 +71,11 @@ for name, dataset in data.datasets.items(): clipped_len = len(dataset) print("Delete {} instances in {}.".format(original_len-clipped_len, name)) -os.environ['CUDA_VISIBLE_DEVICES'] = '0,1' - trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(), - device=[0, 1], dev_data=data.datasets['test'], batch_size=20, + device=0, dev_data=data.datasets['test'], batch_size=6, metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type), loss=CrossEntropyLoss(reduction='sum'), callbacks=callbacks, num_workers=2, n_epochs=5, - check_code_level=-1, update_every=3) + check_code_level=0, update_every=3) trainer.train() diff --git a/reproduction/seqence_labelling/chinese_ner/train_cn_ner.py b/reproduction/seqence_labelling/chinese_ner/train_cn_ner.py index 
53a85186..1005ea23 100644 --- a/reproduction/seqence_labelling/chinese_ner/train_cn_ner.py +++ b/reproduction/seqence_labelling/chinese_ner/train_cn_ner.py @@ -1,7 +1,6 @@ +import sys +sys.path.append('../../..') - - -from reproduction.seqence_labelling.chinese_ner.data.ChineseNER import ChineseNERLoader from fastNLP.embeddings import StaticEmbedding from torch import nn @@ -14,7 +13,51 @@ import torch.nn.functional as F from fastNLP import seq_len_to_mask from fastNLP.core.const import Const as C from fastNLP import SpanFPreRecMetric, Trainer -from fastNLP import cache_results +from fastNLP import cache_results, Vocabulary +from fastNLP.io.pipe.utils import _add_chars_field, _indexize + +from fastNLP.io.pipe import Pipe +from fastNLP.core.utils import iob2bioes, iob2 +from fastNLP.io import MsraNERLoader, WeiboNERLoader + +class ChineseNERPipe(Pipe): + def __init__(self, encoding_type: str = 'bio', target_pad_val=0, bigram=False): + if encoding_type == 'bio': + self.convert_tag = iob2 + else: + self.convert_tag = lambda words: iob2bioes(iob2(words)) + self.target_pad_val = int(target_pad_val) + self.bigram = bigram + + def process(self, data_bundle): + data_bundle.copy_field(C.RAW_CHAR, C.CHAR_INPUT) + input_fields = [C.TARGET, C.CHAR_INPUT, C.INPUT_LEN] + target_fields = [C.TARGET, C.INPUT_LEN] + if self.bigram: + for dataset in data_bundle.datasets.values(): + dataset.apply_field(lambda chars:[c1+c2 for c1, c2 in zip(chars, chars[1:]+[''])], + field_name=C.CHAR_INPUT, new_field_name='bigrams') + bigram_vocab = Vocabulary() + bigram_vocab.from_dataset(data_bundle.get_dataset('train'),field_name='bigrams', + no_create_entry_dataset=[ds for name, ds in data_bundle.datasets.items() if name!='train']) + bigram_vocab.index_dataset(*data_bundle.datasets.values(), field_name='bigrams') + data_bundle.set_vocab(bigram_vocab, field_name='bigrams') + input_fields.append('bigrams') + + _add_chars_field(data_bundle, lower=False) + + # index + _indexize(data_bundle, input_field_name=C.CHAR_INPUT, target_field_name=C.TARGET) + + for name, dataset in data_bundle.datasets.items(): + dataset.set_pad_val(C.TARGET, self.target_pad_val) + dataset.add_seq_len(C.CHAR_INPUT) + + data_bundle.set_input(*input_fields) + data_bundle.set_target(*target_fields) + + return data_bundle + class CNBiLSTMCRFNER(nn.Module): def __init__(self, char_embed, num_classes, bigram_embed=None, trigram_embed=None, num_layers=1, hidden_size=100, @@ -73,22 +116,21 @@ class CNBiLSTMCRFNER(nn.Module): return self._forward(chars, bigrams, trigrams, seq_len) # data_bundle = pickle.load(open('caches/msra.pkl', 'rb')) -@cache_results('caches/msra.pkl', _refresh=True) +@cache_results('caches/weibo-lstm.pkl', _refresh=False) def get_data(): - data_bundle = ChineseNERLoader().process('MSRA-NER/', bigrams=True) - char_embed = StaticEmbedding(data_bundle.vocabs['chars'], - model_dir_or_name='cn-char') - bigram_embed = StaticEmbedding(data_bundle.vocabs['bigrams'], - model_dir_or_name='cn-bigram') + data_bundle = WeiboNERLoader().load() + data_bundle = ChineseNERPipe(encoding_type='bioes', bigram=True).process(data_bundle) + char_embed = StaticEmbedding(data_bundle.get_vocab(C.CHAR_INPUT), model_dir_or_name='cn-fasttext') + bigram_embed = StaticEmbedding(data_bundle.get_vocab('bigrams'), embedding_dim=100, min_freq=3) return data_bundle, char_embed, bigram_embed data_bundle, char_embed, bigram_embed = get_data() +# data_bundle = get_data() print(data_bundle) + # exit(0) -data_bundle.datasets['train'].set_input('target') 
-data_bundle.datasets['dev'].set_input('target') model = CNBiLSTMCRFNER(char_embed, num_classes=len(data_bundle.vocabs['target']), bigram_embed=bigram_embed) -Trainer(data_bundle.datasets['train'], model, batch_size=640, +Trainer(data_bundle.datasets['train'], model, batch_size=20, metrics=SpanFPreRecMetric(data_bundle.vocabs['target'], encoding_type='bioes'), - num_workers=2, dev_data=data_bundle. datasets['dev'], device=3).train() + num_workers=2, dev_data=data_bundle. datasets['dev'], device=0).train() diff --git a/reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py b/reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py index 79d704ba..249e2851 100644 --- a/reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py +++ b/reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py @@ -2,7 +2,6 @@ import torch from torch import nn from fastNLP import seq_len_to_mask -from fastNLP.modules import Embedding from fastNLP.modules import LSTM from fastNLP.modules import ConditionalRandomField, allowed_transitions import torch.nn.functional as F diff --git a/reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py b/reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py index caa0247a..10c5bdea 100644 --- a/reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py +++ b/reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py @@ -1,8 +1,7 @@ import sys sys.path.append('../../..') -from fastNLP.embeddings.embedding import CNNCharEmbedding, StaticEmbedding -from fastNLP.core.vocabulary import VocabularyOption +from fastNLP.embeddings import CNNCharEmbedding, StaticEmbedding from reproduction.seqence_labelling.ner.model.lstm_cnn_crf import CNNBiLSTMCRF from fastNLP import Trainer @@ -11,68 +10,44 @@ from fastNLP import BucketSampler from fastNLP import Const from torch.optim import SGD from fastNLP import GradientClipCallback -from fastNLP.core.callback import FitlogCallback, LRScheduler +from fastNLP.core.callback import EvaluateCallback, LRScheduler from torch.optim.lr_scheduler import LambdaLR -# from reproduction.seqence_labelling.ner.model.swats import SWATS from fastNLP import cache_results -import fitlog -fitlog.debug() - -from reproduction.seqence_labelling.ner.data.Conll2003Loader import Conll2003DataLoader - +from fastNLP.io.pipe.conll import Conll2003NERPipe encoding_type = 'bioes' -@cache_results('caches/upper_conll2003.pkl') +@cache_results('caches/conll2003_new.pkl', _refresh=True) def load_data(): - data = Conll2003DataLoader(encoding_type=encoding_type).process('../../../../others/data/conll2003', - word_vocab_opt=VocabularyOption(min_freq=1), - lower=False) + # 替换路径 + paths = {'test':"NER/corpus/CoNLL-2003/eng.testb", + 'train':"NER/corpus/CoNLL-2003/eng.train", + 'dev':"NER/corpus/CoNLL-2003/eng.testa"} + data = Conll2003NERPipe(encoding_type=encoding_type, target_pad_val=0).process_from_file(paths) return data data = load_data() print(data) -char_embed = CNNCharEmbedding(vocab=data.vocabs['words'], embed_size=30, char_emb_size=30, filter_nums=[30], - kernel_sizes=[3], word_dropout=0.01, dropout=0.5) -# char_embed = LSTMCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30 ,char_emb_size=30) -word_embed = StaticEmbedding(vocab=data.vocabs['words'], - model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/glove.6B.100d.txt', + +char_embed = CNNCharEmbedding(vocab=data.get_vocab('words'), embed_size=30, char_emb_size=30, filter_nums=[30], + kernel_sizes=[3], word_dropout=0, dropout=0.5) +word_embed = 
StaticEmbedding(vocab=data.get_vocab('words'), + model_dir_or_name='en-glove-6b-100d', requires_grad=True, lower=True, word_dropout=0.01, dropout=0.5) word_embed.embedding.weight.data = word_embed.embedding.weight.data/word_embed.embedding.weight.data.std() -# import joblib -# raw_data = joblib.load('/hdd/fudanNLP/fastNLP/others/NER-with-LS/data/conll_with_data.joblib') -# def convert_to_ids(raw_words): -# ids = [] -# for word in raw_words: -# id = raw_data['word_to_id'][word] -# id = raw_data['id_to_emb_map'][id] -# ids.append(id) -# return ids -# word_embed = raw_data['emb_matrix'] -# for name, dataset in data.datasets.items(): -# dataset.apply_field(convert_to_ids, field_name='raw_words', new_field_name=Const.INPUT) - -# elmo_embed = ElmoEmbedding(vocab=data.vocabs['cap_words'], -# model_dir_or_name='.', -# requires_grad=True, layers='mix') -# char_embed = StackEmbedding([elmo_embed, char_embed]) - model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type) callbacks = [ GradientClipCallback(clip_type='value', clip_value=5), - FitlogCallback({'test':data.datasets['test']}, verbose=1), - # SaveModelCallback('save_models/', top=3, only_param=False, save_on_exception=True) + EvaluateCallback(data=data.get_dataset('test')) # 额外对test上的数据进行性能评测 ] -# optimizer = Adam(model.parameters(), lr=0.001) -# optimizer = SWATS(model.parameters(), verbose=True) -optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9) + +optimizer = SGD(model.parameters(), lr=0.008, momentum=0.9) scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch))) callbacks.append(scheduler) - -trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(batch_size=20), - device=1, dev_data=data.datasets['dev'], batch_size=20, +trainer = Trainer(train_data=data.get_dataset('train'), model=model, optimizer=optimizer, sampler=BucketSampler(), + device=0, dev_data=data.get_dataset('dev'), batch_size=20, metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type), - callbacks=callbacks, num_workers=2, n_epochs=100) + callbacks=callbacks, num_workers=2, n_epochs=100, dev_batch_size=512) trainer.train() \ No newline at end of file diff --git a/reproduction/seqence_labelling/ner/train_ontonote.py b/reproduction/seqence_labelling/ner/train_ontonote.py index 894d42ce..7b465d77 100644 --- a/reproduction/seqence_labelling/ner/train_ontonote.py +++ b/reproduction/seqence_labelling/ner/train_ontonote.py @@ -11,52 +11,37 @@ from fastNLP import Const from torch.optim import SGD from torch.optim.lr_scheduler import LambdaLR from fastNLP import GradientClipCallback -from fastNLP.core.vocabulary import VocabularyOption -from fastNLP.core.callback import FitlogCallback, LRScheduler -from functools import partial -from torch import nn +from fastNLP import BucketSampler +from fastNLP.core.callback import EvaluateCallback, LRScheduler from fastNLP import cache_results +from fastNLP.io.pipe.conll import OntoNotesNERPipe -import fitlog -fitlog.debug() -fitlog.set_log_dir('logs/') - -fitlog.add_hyper_in_file(__file__) #######hyper normalize = False -divide_std = True lower = False -lr = 0.015 +lr = 0.01 dropout = 0.5 -batch_size = 20 -init_method = 'default' +batch_size = 32 job_embed = False data_name = 'ontonote' #######hyper -init_method = {'default': None, - 'xavier': partial(nn.init.xavier_normal_, gain=0.02), - 'normal': partial(nn.init.normal_, std=0.02) 
- }[init_method] - - -from reproduction.seqence_labelling.ner.data.OntoNoteLoader import OntoNoteNERDataLoader - encoding_type = 'bioes' -@cache_results('caches/ontonotes.pkl') +@cache_results('caches/ontonotes.pkl', _refresh=True) def cache(): - data = OntoNoteNERDataLoader(encoding_type=encoding_type).process('../../../../others/data/v4/english', - lower=lower, - word_vocab_opt=VocabularyOption(min_freq=1)) - char_embed = CNNCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30, filter_nums=[30], - kernel_sizes=[3]) + data = OntoNotesNERPipe(encoding_type=encoding_type).process_from_file('../../../../others/data/v4/english') + char_embed = CNNCharEmbedding(vocab=data.vocabs['words'], embed_size=30, char_emb_size=30, filter_nums=[30], + kernel_sizes=[3], dropout=dropout) word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT], - model_dir_or_name='/remote-home/hyan01/fastnlp_caches/glove.6B.100d/glove.6B.100d.txt', + model_dir_or_name='en-glove-100d', requires_grad=True, normalize=normalize, - init_method=init_method) + word_dropout=0.01, + dropout=dropout, + lower=True, + min_freq=2) return data, char_embed, word_embed data, char_embed, word_embed = cache() @@ -67,7 +52,7 @@ model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=1200, num_layers=1, tag callbacks = [ GradientClipCallback(clip_value=5, clip_type='value'), - FitlogCallback(data.datasets['test'], verbose=1) + EvaluateCallback(data.datasets['test']) ] optimizer = SGD(model.parameters(), lr=lr, momentum=0.9) @@ -75,8 +60,8 @@ scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.0 callbacks.append(scheduler) -trainer = Trainer(train_data=data.datasets['dev'][:100], model=model, optimizer=optimizer, sampler=None, - device=0, dev_data=data.datasets['dev'][:100], batch_size=batch_size, +trainer = Trainer(train_data=data.get_dataset('train'), model=model, optimizer=optimizer, sampler=BucketSampler(num_buckets=100), + device=0, dev_data=data.get_dataset('dev'), batch_size=batch_size, metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type), - callbacks=callbacks, num_workers=1, n_epochs=100) + callbacks=callbacks, num_workers=1, n_epochs=100, dev_batch_size=256) trainer.train() \ No newline at end of file diff --git a/test/embeddings/test_bert_embedding.py b/test/embeddings/test_bert_embedding.py new file mode 100644 index 00000000..c27ebd40 --- /dev/null +++ b/test/embeddings/test_bert_embedding.py @@ -0,0 +1,14 @@ +import unittest +from fastNLP import Vocabulary +from fastNLP.embeddings import BertEmbedding +import torch +import os + +@unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") +class TestDownload(unittest.TestCase): + def test_download(self): + # import os + vocab = Vocabulary().add_word_lst("This is a test .".split()) + embed = BertEmbedding(vocab, model_dir_or_name='/remote-home/source/fastnlp_caches/embedding/bert-base-cased') + words = torch.LongTensor([[0, 1, 2]]) + print(embed(words).size()) diff --git a/test/io/loader/test_conll_loader.py b/test/io/loader/test_conll_loader.py new file mode 100644 index 00000000..e44b8a2a --- /dev/null +++ b/test/io/loader/test_conll_loader.py @@ -0,0 +1,21 @@ + +import unittest +import os +from fastNLP.io.loader.conll import MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader + +class MSRANERTest(unittest.TestCase): + @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") + def test_download(self): + MsraNERLoader().download(re_download=False) + data_bundle = MsraNERLoader().load() 
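        # A short sketch of the download()/pipe hand-off added in this patch series
        # (the values below are only illustrative): a non-zero dev_ratio carves
        # dev.conll out of train.conll after the first download, and re_download=True
        # removes the cached copy so that the split is redone.
        from fastNLP.io import MsraNERPipe
        data_dir = MsraNERLoader().download(dev_ratio=0.2, re_download=True)
        indexed_bundle = MsraNERPipe(encoding_type='bio').process_from_file(data_dir)
        print(indexed_bundle.get_vocab('target'))  # BIO tag vocabulary built from train.conll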
+ print(data_bundle) + +class PeopleDailyTest(unittest.TestCase): + @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") + def test_download(self): + PeopleDailyNERLoader().download() + +class WeiboNERTest(unittest.TestCase): + @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") + def test_download(self): + WeiboNERLoader().download() \ No newline at end of file diff --git a/test/io/pipe/test_conll.py b/test/io/pipe/test_conll.py new file mode 100644 index 00000000..e8879d71 --- /dev/null +++ b/test/io/pipe/test_conll.py @@ -0,0 +1,12 @@ +import unittest +import os +from fastNLP.io import MsraNERPipe, PeopleDailyPipe, WeiboNERPipe + +@unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") +class TestPipe(unittest.TestCase): + def test_process_from_file(self): + for pipe in [MsraNERPipe, PeopleDailyPipe, WeiboNERPipe]: + with self.subTest(pipe=pipe): + print(pipe) + data_bundle = pipe().process_from_file() + print(data_bundle) \ No newline at end of file From 1168b9dc243619232963eef11a16068099e9c0e4 Mon Sep 17 00:00:00 2001 From: yunfan Date: Sun, 18 Aug 2019 17:55:28 +0800 Subject: [PATCH 073/153] [update] logger in trainer & tester --- fastNLP/io/logger.py | 51 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 45 insertions(+), 6 deletions(-) diff --git a/fastNLP/io/logger.py b/fastNLP/io/logger.py index 287bdbc9..6bdf693d 100644 --- a/fastNLP/io/logger.py +++ b/fastNLP/io/logger.py @@ -6,6 +6,9 @@ import os import sys import warnings + +__all__ = ['logger'] + try: import fitlog except ImportError: @@ -36,11 +39,7 @@ else: self.setLevel(level) -def init_logger(path=None, stdout='tqdm', level='INFO'): - """initialize logger""" - if stdout not in ['none', 'plain', 'tqdm']: - raise ValueError('stdout must in one of {}'.format(['none', 'plain', 'tqdm'])) - +def get_level(level): if isinstance(level, int): pass else: @@ -48,6 +47,15 @@ def init_logger(path=None, stdout='tqdm', level='INFO'): level = {'info': logging.INFO, 'debug': logging.DEBUG, 'warn': logging.WARN, 'warning': logging.WARN, 'error': logging.ERROR}[level] + return level + + +def init_logger(path=None, stdout='tqdm', level='INFO'): + """initialize logger""" + if stdout not in ['none', 'plain', 'tqdm']: + raise ValueError('stdout must in one of {}'.format(['none', 'plain', 'tqdm'])) + + level = get_level(level) logger = logging.getLogger('fastNLP') logger.setLevel(level) @@ -85,4 +93,35 @@ def init_logger(path=None, stdout='tqdm', level='INFO'): return logger -get_logger = logging.getLogger + +# init logger when import +logger = init_logger() + + +def get_logger(name=None): + if name is None: + return logging.getLogger('fastNLP') + return logging.getLogger(name) + + +def set_file(path, level='INFO'): + for h in logger.handlers: + if isinstance(h, logging.FileHandler): + if os.path.abspath(path) == h.baseFilename: + # file path already added + return + + # File Handler + if os.path.exists(path): + assert os.path.isfile(path) + warnings.warn('log already exists in {}'.format(path)) + dirname = os.path.abspath(os.path.dirname(path)) + os.makedirs(dirname, exist_ok=True) + + file_handler = logging.FileHandler(path, mode='a') + file_handler.setLevel(get_level(level)) + file_formatter = logging.Formatter(fmt='%(asctime)s - [%(levelname)s] - %(name)s - %(message)s', + datefmt='%Y/%m/%d %H:%M:%S') + file_handler.setFormatter(file_formatter) + logger.addHandler(file_handler) + From 3b8bc469ba752873333c9fe15bc6b144efe3251d Mon Sep 17 00:00:00 2001 From: yunfan Date: Mon, 19 Aug 2019 14:22:58 +0800 Subject: 
[PATCH 074/153] [update] logger, support straightly import logger to use --- fastNLP/core/callback.py | 4 +- fastNLP/core/dist_trainer.py | 8 +- fastNLP/core/tester.py | 5 +- fastNLP/core/trainer.py | 10 +- fastNLP/io/__init__.py | 3 + fastNLP/io/{logger.py => _logger.py} | 120 ++++++++++-------- .../text_classification/train_dpcnn.py | 17 ++- 7 files changed, 93 insertions(+), 74 deletions(-) rename fastNLP/io/{logger.py => _logger.py} (62%) diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py index 17ded171..53767011 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -86,7 +86,7 @@ except: from ..io.model_io import ModelSaver, ModelLoader from .dataset import DataSet from .tester import Tester -import logging +from ..io import logger try: import fitlog @@ -178,7 +178,7 @@ class Callback(object): @property def logger(self): - return getattr(self._trainer, 'logger', logging.getLogger(__name__)) + return getattr(self._trainer, 'logger', logger) def on_train_begin(self): """ diff --git a/fastNLP/core/dist_trainer.py b/fastNLP/core/dist_trainer.py index e14e17c8..8ad282c9 100644 --- a/fastNLP/core/dist_trainer.py +++ b/fastNLP/core/dist_trainer.py @@ -9,7 +9,6 @@ from torch.utils.data.distributed import DistributedSampler from torch.nn.parallel import DistributedDataParallel as DDP import os from tqdm import tqdm -import logging import time from datetime import datetime, timedelta from functools import partial @@ -22,7 +21,8 @@ from .optimizer import Optimizer from .utils import _build_args from .utils import _move_dict_value_to_device from .utils import _get_func_signature -from ..io.logger import init_logger +from ..io import logger +import logging from pkg_resources import parse_version __all__ = [ @@ -140,8 +140,8 @@ class DistTrainer(): self.cp_save_path = None # use INFO in the master, WARN for others - init_logger(log_path, level=logging.INFO if self.is_master else logging.WARNING) - self.logger = logging.getLogger(__name__) + logger.setLevel(logging.INFO if self.is_master else logging.WARNING) + self.logger = logger self.logger.info("Setup Distributed Trainer") self.logger.warning("Process pid: {}, rank: {}, local rank: {}, device: {}, fp16: {}".format( os.getpid(), self.rank, self.local_rank, self.device, self.fp16 if self.fp16 else False)) diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index 10696240..ab86fb62 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -56,7 +56,7 @@ from .utils import _move_model_to_device from ._parallel_utils import _data_parallel_wrapper from ._parallel_utils import _model_contains_inner_module from functools import partial -from ..io.logger import init_logger, get_logger +from ..io import logger __all__ = [ "Tester" @@ -104,8 +104,7 @@ class Tester(object): self.batch_size = batch_size self.verbose = verbose self.use_tqdm = use_tqdm - init_logger(stdout='tqdm' if use_tqdm else 'plain') - self.logger = get_logger(__name__) + self.logger = logger if isinstance(data, DataSet): self.data_iterator = DataSetIter( diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index d71e23f5..783997a7 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -353,7 +353,7 @@ from .utils import _get_func_signature from .utils import _get_model_device from .utils import _move_model_to_device from ._parallel_utils import _model_contains_inner_module -from ..io.logger import init_logger, get_logger +from ..io import logger class Trainer(object): @@ -548,11 +548,7 @@ class Trainer(object): 
else: raise TypeError("optimizer can only be torch.optim.Optimizer type, not {}.".format(type(optimizer))) - log_path = None - if save_path is not None: - log_path = os.path.join(os.path.dirname(save_path), 'log') - init_logger(path=log_path, stdout='tqdm' if use_tqdm else 'plain') - self.logger = get_logger(__name__) + self.logger = logger self.use_tqdm = use_tqdm self.pbar = None @@ -701,7 +697,7 @@ class Trainer(object): self.n_steps) + \ self.tester._format_eval_results(eval_res) # pbar.write(eval_str + '\n') - self.logger.info(eval_str) + self.logger.info(eval_str + '\n') # ================= mini-batch end ==================== # # lr decay; early stopping diff --git a/fastNLP/io/__init__.py b/fastNLP/io/__init__.py index f8c55bf5..a19428d3 100644 --- a/fastNLP/io/__init__.py +++ b/fastNLP/io/__init__.py @@ -72,6 +72,8 @@ __all__ = [ 'ModelLoader', 'ModelSaver', + + 'logger', ] from .embed_loader import EmbedLoader @@ -81,3 +83,4 @@ from .model_io import ModelLoader, ModelSaver from .loader import * from .pipe import * +from ._logger import * diff --git a/fastNLP/io/logger.py b/fastNLP/io/_logger.py similarity index 62% rename from fastNLP/io/logger.py rename to fastNLP/io/_logger.py index 6bdf693d..73c47d42 100644 --- a/fastNLP/io/logger.py +++ b/fastNLP/io/_logger.py @@ -6,8 +6,11 @@ import os import sys import warnings +__all__ = [ + 'logger', +] -__all__ = ['logger'] +ROOT_NAME = 'fastNLP' try: import fitlog @@ -39,7 +42,7 @@ else: self.setLevel(level) -def get_level(level): +def _get_level(level): if isinstance(level, int): pass else: @@ -50,22 +53,45 @@ def get_level(level): return level -def init_logger(path=None, stdout='tqdm', level='INFO'): - """initialize logger""" - if stdout not in ['none', 'plain', 'tqdm']: - raise ValueError('stdout must in one of {}'.format(['none', 'plain', 'tqdm'])) +def _add_file_handler(logger, path, level='INFO'): + for h in logger.handlers: + if isinstance(h, logging.FileHandler): + if os.path.abspath(path) == h.baseFilename: + # file path already added + return - level = get_level(level) + # File Handler + if os.path.exists(path): + assert os.path.isfile(path) + warnings.warn('log already exists in {}'.format(path)) + dirname = os.path.abspath(os.path.dirname(path)) + os.makedirs(dirname, exist_ok=True) - logger = logging.getLogger('fastNLP') - logger.setLevel(level) - handlers_type = set([type(h) for h in logger.handlers]) + file_handler = logging.FileHandler(path, mode='a') + file_handler.setLevel(_get_level(level)) + file_formatter = logging.Formatter(fmt='%(asctime)s - [%(levelname)s] - %(message)s', + datefmt='%Y/%m/%d %H:%M:%S') + file_handler.setFormatter(file_formatter) + logger.addHandler(file_handler) + +def _set_stdout_handler(logger, stdout='tqdm', level='INFO'): + level = _get_level(level) + if stdout not in ['none', 'plain', 'tqdm']: + raise ValueError('stdout must in one of {}'.format(['none', 'plain', 'tqdm'])) # make sure to initialize logger only once + stream_handler = None + for i, h in enumerate(logger.handlers): + if isinstance(h, (logging.StreamHandler, TqdmLoggingHandler)): + stream_handler = h + break + if stream_handler is not None: + logger.removeHandler(stream_handler) + # Stream Handler - if stdout == 'plain' and (logging.StreamHandler not in handlers_type): + if stdout == 'plain': stream_handler = logging.StreamHandler(sys.stdout) - elif stdout == 'tqdm' and (TqdmLoggingHandler not in handlers_type): + elif stdout == 'tqdm': stream_handler = TqdmLoggingHandler(level) else: stream_handler = None @@ -76,52 +102,44 @@ 
def init_logger(path=None, stdout='tqdm', level='INFO'): stream_handler.setFormatter(stream_formatter) logger.addHandler(stream_handler) - # File Handler - if path is not None and (logging.FileHandler not in handlers_type): - if os.path.exists(path): - assert os.path.isfile(path) - warnings.warn('log already exists in {}'.format(path)) - dirname = os.path.abspath(os.path.dirname(path)) - os.makedirs(dirname, exist_ok=True) - - file_handler = logging.FileHandler(path, mode='a') - file_handler.setLevel(level) - file_formatter = logging.Formatter(fmt='%(asctime)s - [%(levelname)s] - %(name)s - %(message)s', - datefmt='%Y/%m/%d %H:%M:%S') - file_handler.setFormatter(file_formatter) - logger.addHandler(file_handler) - return logger +def _init_logger(path=None, stdout='tqdm', level='INFO'): + """initialize logger""" + level = _get_level(level) + # logger = logging.getLogger(ROOT_NAME) + logger = logging.getLogger() + logger.setLevel(level) -# init logger when import -logger = init_logger() + _set_stdout_handler(logger, stdout, level) + # File Handler + if path is not None: + _add_file_handler(logger, path, level) -def get_logger(name=None): - if name is None: - return logging.getLogger('fastNLP') - return logging.getLogger(name) + return logger -def set_file(path, level='INFO'): - for h in logger.handlers: - if isinstance(h, logging.FileHandler): - if os.path.abspath(path) == h.baseFilename: - # file path already added - return +def _get_logger(name=None, level='INFO'): + level = _get_level(level) + if name is None: + name = ROOT_NAME + assert isinstance(name, str) + if not name.startswith(ROOT_NAME): + name = '{}.{}'.format(ROOT_NAME, name) + logger = logging.getLogger(name) + logger.setLevel(level) + return logger - # File Handler - if os.path.exists(path): - assert os.path.isfile(path) - warnings.warn('log already exists in {}'.format(path)) - dirname = os.path.abspath(os.path.dirname(path)) - os.makedirs(dirname, exist_ok=True) - file_handler = logging.FileHandler(path, mode='a') - file_handler.setLevel(get_level(level)) - file_formatter = logging.Formatter(fmt='%(asctime)s - [%(levelname)s] - %(name)s - %(message)s', - datefmt='%Y/%m/%d %H:%M:%S') - file_handler.setFormatter(file_formatter) - logger.addHandler(file_handler) +class FastNLPLogger(logging.Logger): + def add_file(self, path, level): + _add_file_handler(self, path, level) + + def set_stdout(self, stdout, level): + _set_stdout_handler(self, stdout, level) +_logger = _init_logger(path=None) +logger = FastNLPLogger(ROOT_NAME) +logger.__dict__.update(_logger.__dict__) +del _logger diff --git a/reproduction/text_classification/train_dpcnn.py b/reproduction/text_classification/train_dpcnn.py index 99e27640..704b9f43 100644 --- a/reproduction/text_classification/train_dpcnn.py +++ b/reproduction/text_classification/train_dpcnn.py @@ -15,13 +15,14 @@ from fastNLP.core.const import Const as C from fastNLP.core.vocabulary import VocabularyOption from fastNLP.core.dist_trainer import DistTrainer from utils.util_init import set_rng_seeds +from fastNLP.io import logger import os # os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/' # os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches' os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - # hyper +logger.add_file('log', 'INFO') class Config(): seed = 12345 @@ -46,11 +47,11 @@ class Config(): self.datapath = {k: os.path.join(self.datadir, v) for k, v in self.datafile.items()} - ops = Config() set_rng_seeds(ops.seed) -print('RNG SEED: {}'.format(ops.seed)) 
+# print('RNG SEED: {}'.format(ops.seed)) +logger.info('RNG SEED %d'%ops.seed) # 1.task相关信息:利用dataloader载入dataInfo @@ -81,8 +82,9 @@ print(embedding.embedding.weight.data.mean(), embedding.embedding.weight.data.st # embedding = StackEmbedding([StaticEmbedding(vocab), CNNCharEmbedding(vocab, 100)]) datainfo.datasets['train'] = datainfo.datasets['train'][:1000] datainfo.datasets['test'] = datainfo.datasets['test'][:1000] -print(datainfo) -print(datainfo.datasets['train'][0]) +# print(datainfo) +# print(datainfo.datasets['train'][0]) +logger.info(datainfo) model = DPCNN(init_embed=embedding, num_cls=len(datainfo.vocabs[C.TARGET]), embed_dropout=ops.embed_dropout, cls_dropout=ops.cls_dropout) @@ -108,12 +110,13 @@ callbacks.append(LRScheduler(CosineAnnealingLR(optimizer, 5))) device = 'cuda:0' if torch.cuda.is_available() else 'cpu' -print(device) +# print(device) +logger.info(device) # 4.定义train方法 trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss, sampler=BucketSampler(num_buckets=50, batch_size=ops.batch_size), - metrics=[metric], use_tqdm=False, + metrics=[metric], use_tqdm=False, save_path='save', dev_data=datainfo.datasets['test'], device=device, check_code_level=-1, batch_size=ops.batch_size, callbacks=callbacks, n_epochs=ops.train_epoch, num_workers=4) From ea0f2f7e00188ab44bad21d8a6e53aa55601a3b6 Mon Sep 17 00:00:00 2001 From: xuyige Date: Mon, 19 Aug 2019 20:48:08 +0800 Subject: [PATCH 075/153] update reproduction/matching to adapt version 0.5.0: 1) move loader codes from DataLoader to PiPe; 2) fix some bugs in matching pipe; 3) delete some expire codes. --- fastNLP/io/pipe/matching.py | 12 +- .../matching/data/MatchingDataLoader.py | 435 ------------------ reproduction/matching/matching_bert.py | 76 +-- reproduction/matching/matching_cntn.py | 42 +- reproduction/matching/matching_esim.py | 69 ++- reproduction/matching/matching_mwan.py | 60 +-- reproduction/matching/model/bert.py | 35 +- reproduction/matching/model/cntn.py | 20 +- reproduction/matching/model/esim.py | 21 +- .../matching/test/test_snlidataloader.py | 10 - 10 files changed, 142 insertions(+), 638 deletions(-) delete mode 100644 reproduction/matching/data/MatchingDataLoader.py delete mode 100644 reproduction/matching/test/test_snlidataloader.py diff --git a/fastNLP/io/pipe/matching.py b/fastNLP/io/pipe/matching.py index 2eaeef58..0d1b4e82 100644 --- a/fastNLP/io/pipe/matching.py +++ b/fastNLP/io/pipe/matching.py @@ -89,13 +89,15 @@ class MatchingBertPipe(Pipe): data_bundle.set_vocab(word_vocab, Const.INPUT) data_bundle.set_vocab(target_vocab, Const.TARGET) - input_fields = [Const.INPUT, Const.INPUT_LEN, Const.TARGET] + input_fields = [Const.INPUT, Const.INPUT_LEN] target_fields = [Const.TARGET] for name, dataset in data_bundle.datasets.items(): dataset.add_seq_len(Const.INPUT) dataset.set_input(*input_fields, flag=True) - dataset.set_target(*target_fields, flag=True) + for fields in target_fields: + if dataset.has_field(fields): + dataset.set_target(fields, flag=True) return data_bundle @@ -210,14 +212,16 @@ class MatchingPipe(Pipe): data_bundle.set_vocab(word_vocab, Const.INPUTS(0)) data_bundle.set_vocab(target_vocab, Const.TARGET) - input_fields = [Const.INPUTS(0), Const.INPUTS(1), Const.INPUT_LENS(0), Const.INPUT_LENS(1), Const.TARGET] + input_fields = [Const.INPUTS(0), Const.INPUTS(1), Const.INPUT_LENS(0), Const.INPUT_LENS(1)] target_fields = [Const.TARGET] for name, dataset in data_bundle.datasets.items(): dataset.add_seq_len(Const.INPUTS(0), Const.INPUT_LENS(0)) 
dataset.add_seq_len(Const.INPUTS(1), Const.INPUT_LENS(1)) dataset.set_input(*input_fields, flag=True) - dataset.set_target(*target_fields, flag=True) + for fields in target_fields: + if dataset.has_field(fields): + dataset.set_target(fields, flag=True) return data_bundle diff --git a/reproduction/matching/data/MatchingDataLoader.py b/reproduction/matching/data/MatchingDataLoader.py deleted file mode 100644 index f13618aa..00000000 --- a/reproduction/matching/data/MatchingDataLoader.py +++ /dev/null @@ -1,435 +0,0 @@ -""" -这个文件的内容已合并到fastNLP.io.data_loader里,这个文件的内容不再更新 -""" - - -import os - -from typing import Union, Dict - -from fastNLP.core.const import Const -from fastNLP.core.vocabulary import Vocabulary -from fastNLP.io.data_bundle import DataBundle, DataSetLoader -from fastNLP.io.dataset_loader import JsonLoader, CSVLoader -from fastNLP.io.file_utils import _get_base_url, cached_path, PRETRAINED_BERT_MODEL_DIR -from fastNLP.modules.encoder._bert import BertTokenizer - - -class MatchingLoader(DataSetLoader): - """ - 别名::class:`fastNLP.io.MatchingLoader` :class:`fastNLP.io.dataset_loader.MatchingLoader` - - 读取Matching任务的数据集 - - :param dict paths: key是数据集名称(如train、dev、test),value是对应的文件名 - """ - - def __init__(self, paths: dict=None): - self.paths = paths - - def _load(self, path): - """ - :param str path: 待读取数据集的路径名 - :return: fastNLP.DataSet ds: 返回一个DataSet对象,里面必须包含3个field:其中两个分别为两个句子 - 的原始字符串文本,第三个为标签 - """ - raise NotImplementedError - - def process(self, paths: Union[str, Dict[str, str]], dataset_name: str=None, - to_lower=False, seq_len_type: str=None, bert_tokenizer: str=None, - cut_text: int = None, get_index=True, auto_pad_length: int=None, - auto_pad_token: str='', set_input: Union[list, str, bool]=True, - set_target: Union[list, str, bool] = True, concat: Union[str, list, bool]=None, ) -> DataBundle: - """ - :param paths: str或者Dict[str, str]。如果是str,则为数据集所在的文件夹或者是全路径文件名:如果是文件夹, - 则会从self.paths里面找对应的数据集名称与文件名。如果是Dict,则为数据集名称(如train、dev、test)和 - 对应的全路径文件名。 - :param str dataset_name: 如果在paths里传入的是一个数据集的全路径文件名,那么可以用dataset_name来定义 - 这个数据集的名字,如果不定义则默认为train。 - :param bool to_lower: 是否将文本自动转为小写。默认值为False。 - :param str seq_len_type: 提供的seq_len类型,支持 ``seq_len`` :提供一个数字作为句子长度; ``mask`` : - 提供一个0/1的mask矩阵作为句子长度; ``bert`` :提供segment_type_id(第一个句子为0,第二个句子为1)和 - attention mask矩阵(0/1的mask矩阵)。默认值为None,即不提供seq_len - :param str bert_tokenizer: bert tokenizer所使用的词表所在的文件夹路径 - :param int cut_text: 将长于cut_text的内容截掉。默认为None,即不截。 - :param bool get_index: 是否需要根据词表将文本转为index - :param int auto_pad_length: 是否需要将文本自动pad到一定长度(超过这个长度的文本将会被截掉),默认为不会自动pad - :param str auto_pad_token: 自动pad的内容 - :param set_input: 如果为True,则会自动将相关的field(名字里含有Const.INPUT的)设置为input,如果为False - 则不会将任何field设置为input。如果传入str或者List[str],则会根据传入的内容将相对应的field设置为input, - 于此同时其他field不会被设置为input。默认值为True。 - :param set_target: set_target将控制哪些field可以被设置为target,用法与set_input一致。默认值为True。 - :param concat: 是否需要将两个句子拼接起来。如果为False则不会拼接。如果为True则会在两个句子之间插入一个。 - 如果传入一个长度为4的list,则分别表示插在第一句开始前、第一句结束后、第二句开始前、第二句结束后的标识符。如果 - 传入字符串 ``bert`` ,则会采用bert的拼接方式,等价于['[CLS]', '[SEP]', '', '[SEP]']. 
- :return: - """ - if isinstance(set_input, str): - set_input = [set_input] - if isinstance(set_target, str): - set_target = [set_target] - if isinstance(set_input, bool): - auto_set_input = set_input - else: - auto_set_input = False - if isinstance(set_target, bool): - auto_set_target = set_target - else: - auto_set_target = False - if isinstance(paths, str): - if os.path.isdir(paths): - path = {n: os.path.join(paths, self.paths[n]) for n in self.paths.keys()} - else: - path = {dataset_name if dataset_name is not None else 'train': paths} - else: - path = paths - - data_info = DataBundle() - for data_name in path.keys(): - data_info.datasets[data_name] = self._load(path[data_name]) - - for data_name, data_set in data_info.datasets.items(): - if auto_set_input: - data_set.set_input(Const.INPUTS(0), Const.INPUTS(1)) - if auto_set_target: - if Const.TARGET in data_set.get_field_names(): - data_set.set_target(Const.TARGET) - - if to_lower: - for data_name, data_set in data_info.datasets.items(): - data_set.apply(lambda x: [w.lower() for w in x[Const.INPUTS(0)]], new_field_name=Const.INPUTS(0), - is_input=auto_set_input) - data_set.apply(lambda x: [w.lower() for w in x[Const.INPUTS(1)]], new_field_name=Const.INPUTS(1), - is_input=auto_set_input) - - if bert_tokenizer is not None: - if bert_tokenizer.lower() in PRETRAINED_BERT_MODEL_DIR: - PRETRAIN_URL = _get_base_url('bert') - model_name = PRETRAINED_BERT_MODEL_DIR[bert_tokenizer] - model_url = PRETRAIN_URL + model_name - model_dir = cached_path(model_url) - # 检查是否存在 - elif os.path.isdir(bert_tokenizer): - model_dir = bert_tokenizer - else: - raise ValueError(f"Cannot recognize BERT tokenizer from {bert_tokenizer}.") - - words_vocab = Vocabulary(padding='[PAD]', unknown='[UNK]') - with open(os.path.join(model_dir, 'vocab.txt'), 'r') as f: - lines = f.readlines() - lines = [line.strip() for line in lines] - words_vocab.add_word_lst(lines) - words_vocab.build_vocab() - - tokenizer = BertTokenizer.from_pretrained(model_dir) - - for data_name, data_set in data_info.datasets.items(): - for fields in data_set.get_field_names(): - if Const.INPUT in fields: - data_set.apply(lambda x: tokenizer.tokenize(' '.join(x[fields])), new_field_name=fields, - is_input=auto_set_input) - - if isinstance(concat, bool): - concat = 'default' if concat else None - if concat is not None: - if isinstance(concat, str): - CONCAT_MAP = {'bert': ['[CLS]', '[SEP]', '', '[SEP]'], - 'default': ['', '', '', '']} - if concat.lower() in CONCAT_MAP: - concat = CONCAT_MAP[concat] - else: - concat = 4 * [concat] - assert len(concat) == 4, \ - f'Please choose a list with 4 symbols which at the beginning of first sentence ' \ - f'the end of first sentence, the begin of second sentence, and the end of second' \ - f'sentence. 
Your input is {concat}' - - for data_name, data_set in data_info.datasets.items(): - data_set.apply(lambda x: [concat[0]] + x[Const.INPUTS(0)] + [concat[1]] + [concat[2]] + - x[Const.INPUTS(1)] + [concat[3]], new_field_name=Const.INPUT) - data_set.apply(lambda x: [w for w in x[Const.INPUT] if len(w) > 0], new_field_name=Const.INPUT, - is_input=auto_set_input) - - if seq_len_type is not None: - if seq_len_type == 'seq_len': # - for data_name, data_set in data_info.datasets.items(): - for fields in data_set.get_field_names(): - if Const.INPUT in fields: - data_set.apply(lambda x: len(x[fields]), - new_field_name=fields.replace(Const.INPUT, Const.INPUT_LEN), - is_input=auto_set_input) - elif seq_len_type == 'mask': - for data_name, data_set in data_info.datasets.items(): - for fields in data_set.get_field_names(): - if Const.INPUT in fields: - data_set.apply(lambda x: [1] * len(x[fields]), - new_field_name=fields.replace(Const.INPUT, Const.INPUT_LEN), - is_input=auto_set_input) - elif seq_len_type == 'bert': - for data_name, data_set in data_info.datasets.items(): - if Const.INPUT not in data_set.get_field_names(): - raise KeyError(f'Field ``{Const.INPUT}`` not in {data_name} data set: ' - f'got {data_set.get_field_names()}') - data_set.apply(lambda x: [0] * (len(x[Const.INPUTS(0)]) + 2) + [1] * (len(x[Const.INPUTS(1)]) + 1), - new_field_name=Const.INPUT_LENS(0), is_input=auto_set_input) - data_set.apply(lambda x: [1] * len(x[Const.INPUT_LENS(0)]), - new_field_name=Const.INPUT_LENS(1), is_input=auto_set_input) - - if auto_pad_length is not None: - cut_text = min(auto_pad_length, cut_text if cut_text is not None else auto_pad_length) - - if cut_text is not None: - for data_name, data_set in data_info.datasets.items(): - for fields in data_set.get_field_names(): - if (Const.INPUT in fields) or ((Const.INPUT_LEN in fields) and (seq_len_type != 'seq_len')): - data_set.apply(lambda x: x[fields][: cut_text], new_field_name=fields, - is_input=auto_set_input) - - data_set_list = [d for n, d in data_info.datasets.items()] - assert len(data_set_list) > 0, f'There are NO data sets in data info!' 
- - if bert_tokenizer is None: - words_vocab = Vocabulary(padding=auto_pad_token) - words_vocab = words_vocab.from_dataset(*[d for n, d in data_info.datasets.items() if 'train' in n], - field_name=[n for n in data_set_list[0].get_field_names() - if (Const.INPUT in n)], - no_create_entry_dataset=[d for n, d in data_info.datasets.items() - if 'train' not in n]) - target_vocab = Vocabulary(padding=None, unknown=None) - target_vocab = target_vocab.from_dataset(*[d for n, d in data_info.datasets.items() if 'train' in n], - field_name=Const.TARGET) - data_info.vocabs = {Const.INPUT: words_vocab, Const.TARGET: target_vocab} - - if get_index: - for data_name, data_set in data_info.datasets.items(): - for fields in data_set.get_field_names(): - if Const.INPUT in fields: - data_set.apply(lambda x: [words_vocab.to_index(w) for w in x[fields]], new_field_name=fields, - is_input=auto_set_input) - - if Const.TARGET in data_set.get_field_names(): - data_set.apply(lambda x: target_vocab.to_index(x[Const.TARGET]), new_field_name=Const.TARGET, - is_input=auto_set_input, is_target=auto_set_target) - - if auto_pad_length is not None: - if seq_len_type == 'seq_len': - raise RuntimeError(f'the sequence will be padded with the length {auto_pad_length}, ' - f'so the seq_len_type cannot be `{seq_len_type}`!') - for data_name, data_set in data_info.datasets.items(): - for fields in data_set.get_field_names(): - if Const.INPUT in fields: - data_set.apply(lambda x: x[fields] + [words_vocab.to_index(words_vocab.padding)] * - (auto_pad_length - len(x[fields])), new_field_name=fields, - is_input=auto_set_input) - elif (Const.INPUT_LEN in fields) and (seq_len_type != 'seq_len'): - data_set.apply(lambda x: x[fields] + [0] * (auto_pad_length - len(x[fields])), - new_field_name=fields, is_input=auto_set_input) - - for data_name, data_set in data_info.datasets.items(): - if isinstance(set_input, list): - data_set.set_input(*[inputs for inputs in set_input if inputs in data_set.get_field_names()]) - if isinstance(set_target, list): - data_set.set_target(*[target for target in set_target if target in data_set.get_field_names()]) - - return data_info - - -class SNLILoader(MatchingLoader, JsonLoader): - """ - 别名::class:`fastNLP.io.SNLILoader` :class:`fastNLP.io.dataset_loader.SNLILoader` - - 读取SNLI数据集,读取的DataSet包含fields:: - - words1: list(str),第一句文本, premise - words2: list(str), 第二句文本, hypothesis - target: str, 真实标签 - - 数据来源: https://nlp.stanford.edu/projects/snli/snli_1.0.zip - """ - - def __init__(self, paths: dict=None): - fields = { - 'sentence1_binary_parse': Const.INPUTS(0), - 'sentence2_binary_parse': Const.INPUTS(1), - 'gold_label': Const.TARGET, - } - paths = paths if paths is not None else { - 'train': 'snli_1.0_train.jsonl', - 'dev': 'snli_1.0_dev.jsonl', - 'test': 'snli_1.0_test.jsonl'} - MatchingLoader.__init__(self, paths=paths) - JsonLoader.__init__(self, fields=fields) - - def _load(self, path): - ds = JsonLoader._load(self, path) - - parentheses_table = str.maketrans({'(': None, ')': None}) - - ds.apply(lambda ins: ins[Const.INPUTS(0)].translate(parentheses_table).strip().split(), - new_field_name=Const.INPUTS(0)) - ds.apply(lambda ins: ins[Const.INPUTS(1)].translate(parentheses_table).strip().split(), - new_field_name=Const.INPUTS(1)) - ds.drop(lambda x: x[Const.TARGET] == '-') - return ds - - -class RTELoader(MatchingLoader, CSVLoader): - """ - 别名::class:`fastNLP.io.RTELoader` :class:`fastNLP.io.dataset_loader.RTELoader` - - 读取RTE数据集,读取的DataSet包含fields:: - - words1: list(str),第一句文本, premise - words2: 
list(str), 第二句文本, hypothesis - target: str, 真实标签 - - 数据来源: - """ - - def __init__(self, paths: dict=None): - paths = paths if paths is not None else { - 'train': 'train.tsv', - 'dev': 'dev.tsv', - 'test': 'test.tsv' # test set has not label - } - MatchingLoader.__init__(self, paths=paths) - self.fields = { - 'sentence1': Const.INPUTS(0), - 'sentence2': Const.INPUTS(1), - 'label': Const.TARGET, - } - CSVLoader.__init__(self, sep='\t') - - def _load(self, path): - ds = CSVLoader._load(self, path) - - for k, v in self.fields.items(): - if v in ds.get_field_names(): - ds.rename_field(k, v) - for fields in ds.get_all_fields(): - if Const.INPUT in fields: - ds.apply(lambda x: x[fields].strip().split(), new_field_name=fields) - - return ds - - -class QNLILoader(MatchingLoader, CSVLoader): - """ - 别名::class:`fastNLP.io.QNLILoader` :class:`fastNLP.io.dataset_loader.QNLILoader` - - 读取QNLI数据集,读取的DataSet包含fields:: - - words1: list(str),第一句文本, premise - words2: list(str), 第二句文本, hypothesis - target: str, 真实标签 - - 数据来源: - """ - - def __init__(self, paths: dict=None): - paths = paths if paths is not None else { - 'train': 'train.tsv', - 'dev': 'dev.tsv', - 'test': 'test.tsv' # test set has not label - } - MatchingLoader.__init__(self, paths=paths) - self.fields = { - 'question': Const.INPUTS(0), - 'sentence': Const.INPUTS(1), - 'label': Const.TARGET, - } - CSVLoader.__init__(self, sep='\t') - - def _load(self, path): - ds = CSVLoader._load(self, path) - - for k, v in self.fields.items(): - if v in ds.get_field_names(): - ds.rename_field(k, v) - for fields in ds.get_all_fields(): - if Const.INPUT in fields: - ds.apply(lambda x: x[fields].strip().split(), new_field_name=fields) - - return ds - - -class MNLILoader(MatchingLoader, CSVLoader): - """ - 别名::class:`fastNLP.io.MNLILoader` :class:`fastNLP.io.dataset_loader.MNLILoader` - - 读取MNLI数据集,读取的DataSet包含fields:: - - words1: list(str),第一句文本, premise - words2: list(str), 第二句文本, hypothesis - target: str, 真实标签 - - 数据来源: - """ - - def __init__(self, paths: dict=None): - paths = paths if paths is not None else { - 'train': 'train.tsv', - 'dev_matched': 'dev_matched.tsv', - 'dev_mismatched': 'dev_mismatched.tsv', - 'test_matched': 'test_matched.tsv', - 'test_mismatched': 'test_mismatched.tsv', - # 'test_0.9_matched': 'multinli_0.9_test_matched_unlabeled.txt', - # 'test_0.9_mismatched': 'multinli_0.9_test_mismatched_unlabeled.txt', - - # test_0.9_mathed与mismatched是MNLI0.9版本的(数据来源:kaggle) - } - MatchingLoader.__init__(self, paths=paths) - CSVLoader.__init__(self, sep='\t') - self.fields = { - 'sentence1_binary_parse': Const.INPUTS(0), - 'sentence2_binary_parse': Const.INPUTS(1), - 'gold_label': Const.TARGET, - } - - def _load(self, path): - ds = CSVLoader._load(self, path) - - for k, v in self.fields.items(): - if k in ds.get_field_names(): - ds.rename_field(k, v) - - if Const.TARGET in ds.get_field_names(): - if ds[0][Const.TARGET] == 'hidden': - ds.delete_field(Const.TARGET) - - parentheses_table = str.maketrans({'(': None, ')': None}) - - ds.apply(lambda ins: ins[Const.INPUTS(0)].translate(parentheses_table).strip().split(), - new_field_name=Const.INPUTS(0)) - ds.apply(lambda ins: ins[Const.INPUTS(1)].translate(parentheses_table).strip().split(), - new_field_name=Const.INPUTS(1)) - if Const.TARGET in ds.get_field_names(): - ds.drop(lambda x: x[Const.TARGET] == '-') - return ds - - -class QuoraLoader(MatchingLoader, CSVLoader): - """ - 别名::class:`fastNLP.io.QuoraLoader` :class:`fastNLP.io.dataset_loader.QuoraLoader` - - 读取MNLI数据集,读取的DataSet包含fields:: - - words1: 
list(str),第一句文本, premise - words2: list(str), 第二句文本, hypothesis - target: str, 真实标签 - - 数据来源: - """ - - def __init__(self, paths: dict=None): - paths = paths if paths is not None else { - 'train': 'train.tsv', - 'dev': 'dev.tsv', - 'test': 'test.tsv', - } - MatchingLoader.__init__(self, paths=paths) - CSVLoader.__init__(self, sep='\t', headers=(Const.TARGET, Const.INPUTS(0), Const.INPUTS(1), 'pairID')) - - def _load(self, path): - ds = CSVLoader._load(self, path) - return ds diff --git a/reproduction/matching/matching_bert.py b/reproduction/matching/matching_bert.py index 3ed75fd1..323d81a3 100644 --- a/reproduction/matching/matching_bert.py +++ b/reproduction/matching/matching_bert.py @@ -2,8 +2,12 @@ import random import numpy as np import torch -from fastNLP.core import Trainer, Tester, AccuracyMetric, Const, Adam -from fastNLP.io.data_loader import SNLILoader, RTELoader, MNLILoader, QNLILoader, QuoraLoader +from fastNLP.core import Trainer, Tester, AccuracyMetric, Const +from fastNLP.core.callback import WarmupCallback, EvaluateCallback +from fastNLP.core.optimizer import AdamW +from fastNLP.embeddings import BertEmbedding +from fastNLP.io.pipe.matching import SNLIBertPipe, RTEBertPipe, MNLIBertPipe,\ + QNLIBertPipe, QuoraBertPipe from reproduction.matching.model.bert import BertForNLI @@ -12,16 +16,22 @@ from reproduction.matching.model.bert import BertForNLI class BERTConfig: task = 'snli' + batch_size_per_gpu = 6 n_epochs = 6 lr = 2e-5 - seq_len_type = 'bert' + warm_up_rate = 0.1 seed = 42 + save_path = None # 模型存储的位置,None表示不存储模型。 + train_dataset_name = 'train' dev_dataset_name = 'dev' test_dataset_name = 'test' - save_path = None # 模型存储的位置,None表示不存储模型。 - bert_dir = 'path/to/bert/dir' # 预训练BERT参数文件的文件夹 + + to_lower = True # 忽略大小写 + tokenizer = 'spacy' # 使用spacy进行分词 + + bert_model_dir_or_name = 'bert-base-uncased' arg = BERTConfig() @@ -37,58 +47,52 @@ if n_gpu > 0: # load data set if arg.task == 'snli': - data_info = SNLILoader().process( - paths='path/to/snli/data', to_lower=True, seq_len_type=arg.seq_len_type, - bert_tokenizer=arg.bert_dir, cut_text=512, - get_index=True, concat='bert', - ) + data_bundle = SNLIBertPipe(lower=arg.to_lower, tokenizer=arg.tokenizer).process_from_file() elif arg.task == 'rte': - data_info = RTELoader().process( - paths='path/to/rte/data', to_lower=True, seq_len_type=arg.seq_len_type, - bert_tokenizer=arg.bert_dir, cut_text=512, - get_index=True, concat='bert', - ) + data_bundle = RTEBertPipe(lower=arg.to_lower, tokenizer=arg.tokenizer).process_from_file() elif arg.task == 'qnli': - data_info = QNLILoader().process( - paths='path/to/qnli/data', to_lower=True, seq_len_type=arg.seq_len_type, - bert_tokenizer=arg.bert_dir, cut_text=512, - get_index=True, concat='bert', - ) + data_bundle = QNLIBertPipe(lower=arg.to_lower, tokenizer=arg.tokenizer).process_from_file() elif arg.task == 'mnli': - data_info = MNLILoader().process( - paths='path/to/mnli/data', to_lower=True, seq_len_type=arg.seq_len_type, - bert_tokenizer=arg.bert_dir, cut_text=512, - get_index=True, concat='bert', - ) + data_bundle = MNLIBertPipe(lower=arg.to_lower, tokenizer=arg.tokenizer).process_from_file() elif arg.task == 'quora': - data_info = QuoraLoader().process( - paths='path/to/quora/data', to_lower=True, seq_len_type=arg.seq_len_type, - bert_tokenizer=arg.bert_dir, cut_text=512, - get_index=True, concat='bert', - ) + data_bundle = QuoraBertPipe(lower=arg.to_lower, tokenizer=arg.tokenizer).process_from_file() else: raise RuntimeError(f'NOT support {arg.task} task yet!') 
+print(data_bundle) # print details in data_bundle + +# load embedding +embed = BertEmbedding(data_bundle.vocabs[Const.INPUT], model_dir_or_name=arg.bert_model_dir_or_name) + # define model -model = BertForNLI(class_num=len(data_info.vocabs[Const.TARGET]), bert_dir=arg.bert_dir) +model = BertForNLI(embed, class_num=len(data_bundle.vocabs[Const.TARGET])) + +# define optimizer and callback +optimizer = AdamW(lr=arg.lr, params=model.parameters()) +callbacks = [WarmupCallback(warmup=arg.warm_up_rate, schedule='linear'), ] + +if arg.task in ['snli']: + callbacks.append(EvaluateCallback(data=data_bundle.datasets[arg.test_dataset_name])) + # evaluate test set in every epoch if task is snli. # define trainer -trainer = Trainer(train_data=data_info.datasets[arg.train_dataset_name], model=model, - optimizer=Adam(lr=arg.lr, model_params=model.parameters()), +trainer = Trainer(train_data=data_bundle.datasets[arg.train_dataset_name], model=model, + optimizer=optimizer, batch_size=torch.cuda.device_count() * arg.batch_size_per_gpu, n_epochs=arg.n_epochs, print_every=-1, - dev_data=data_info.datasets[arg.dev_dataset_name], + dev_data=data_bundle.datasets[arg.dev_dataset_name], metrics=AccuracyMetric(), metric_key='acc', device=[i for i in range(torch.cuda.device_count())], check_code_level=-1, - save_path=arg.save_path) + save_path=arg.save_path, + callbacks=callbacks) # train model trainer.train(load_best_model=True) # define tester tester = Tester( - data=data_info.datasets[arg.test_dataset_name], + data=data_bundle.datasets[arg.test_dataset_name], model=model, metrics=AccuracyMetric(), batch_size=torch.cuda.device_count() * arg.batch_size_per_gpu, diff --git a/reproduction/matching/matching_cntn.py b/reproduction/matching/matching_cntn.py index 098f3bc4..9be716ba 100644 --- a/reproduction/matching/matching_cntn.py +++ b/reproduction/matching/matching_cntn.py @@ -1,9 +1,9 @@ import argparse import torch -from fastNLP.core import Trainer, Tester, Adam, AccuracyMetric, Const +from fastNLP.core import Trainer, Tester, Adam, AccuracyMetric, Const, CrossEntropyLoss from fastNLP.embeddings import StaticEmbedding -from fastNLP.io.data_loader import QNLILoader, RTELoader, SNLILoader, MNLILoader +from fastNLP.io.pipe.matching import SNLIPipe, RTEPipe, MNLIPipe, QNLIPipe from reproduction.matching.model.cntn import CNTNModel @@ -13,14 +13,12 @@ argument.add_argument('--embedding', choices=['glove', 'word2vec'], default='glo argument.add_argument('--batch-size-per-gpu', type=int, default=256) argument.add_argument('--n-epochs', type=int, default=200) argument.add_argument('--lr', type=float, default=1e-5) -argument.add_argument('--seq-len-type', choices=['mask', 'seq_len'], default='mask') argument.add_argument('--save-dir', type=str, default=None) argument.add_argument('--cntn-depth', type=int, default=1) argument.add_argument('--cntn-ns', type=int, default=200) argument.add_argument('--cntn-k-top', type=int, default=10) argument.add_argument('--cntn-r', type=int, default=5) argument.add_argument('--dataset', choices=['qnli', 'rte', 'snli', 'mnli'], default='qnli') -argument.add_argument('--max-len', type=int, default=50) arg = argument.parse_args() # dataset dict @@ -45,30 +43,25 @@ else: num_labels = 3 # load data set -if arg.dataset == 'qnli': - data_info = QNLILoader().process( - paths='path/to/qnli/data', to_lower=True, seq_len_type=arg.seq_len_type, bert_tokenizer=None, - get_index=True, concat=False, auto_pad_length=arg.max_len) +if arg.dataset == 'snli': + data_bundle = SNLIPipe(lower=True, 
tokenizer='raw').process_from_file() elif arg.dataset == 'rte': - data_info = RTELoader().process( - paths='path/to/rte/data', to_lower=True, seq_len_type=arg.seq_len_type, bert_tokenizer=None, - get_index=True, concat=False, auto_pad_length=arg.max_len) -elif arg.dataset == 'snli': - data_info = SNLILoader().process( - paths='path/to/snli/data', to_lower=True, seq_len_type=arg.seq_len_type, bert_tokenizer=None, - get_index=True, concat=False, auto_pad_length=arg.max_len) + data_bundle = RTEPipe(lower=True, tokenizer='raw').process_from_file() +elif arg.dataset == 'qnli': + data_bundle = QNLIPipe(lower=True, tokenizer='raw').process_from_file() elif arg.dataset == 'mnli': - data_info = MNLILoader().process( - paths='path/to/mnli/data', to_lower=True, seq_len_type=arg.seq_len_type, bert_tokenizer=None, - get_index=True, concat=False, auto_pad_length=arg.max_len) + data_bundle = MNLIPipe(lower=True, tokenizer='raw').process_from_file() else: - raise ValueError(f'now we only support [qnli,rte,snli,mnli] dataset for cntn model!') + raise RuntimeError(f'NOT support {arg.task} task yet!') + +print(data_bundle) # print details in data_bundle # load embedding if arg.embedding == 'word2vec': - embedding = StaticEmbedding(data_info.vocabs[Const.INPUT], model_dir_or_name='en-word2vec-300', requires_grad=True) + embedding = StaticEmbedding(data_bundle.vocabs[Const.INPUTS(0)], model_dir_or_name='en-word2vec-300', + requires_grad=True) elif arg.embedding == 'glove': - embedding = StaticEmbedding(data_info.vocabs[Const.INPUT], model_dir_or_name='en-glove-840b-300', + embedding = StaticEmbedding(data_bundle.vocabs[Const.INPUTS(0)], model_dir_or_name='en-glove-840b-300d', requires_grad=True) else: raise ValueError(f'now we only support word2vec or glove embedding for cntn model!') @@ -79,11 +72,12 @@ model = CNTNModel(embedding, ns=arg.cntn_ns, k_top=arg.cntn_k_top, num_labels=nu print(model) # define trainer -trainer = Trainer(train_data=data_info.datasets['train'], model=model, +trainer = Trainer(train_data=data_bundle.datasets['train'], model=model, optimizer=Adam(lr=arg.lr, model_params=model.parameters()), + loss=CrossEntropyLoss(), batch_size=torch.cuda.device_count() * arg.batch_size_per_gpu, n_epochs=arg.n_epochs, print_every=-1, - dev_data=data_info.datasets[dev_dict[arg.dataset]], + dev_data=data_bundle.datasets[dev_dict[arg.dataset]], metrics=AccuracyMetric(), metric_key='acc', device=[i for i in range(torch.cuda.device_count())], check_code_level=-1) @@ -93,7 +87,7 @@ trainer.train(load_best_model=True) # define tester tester = Tester( - data=data_info.datasets[test_dict[arg.dataset]], + data=data_bundle.datasets[test_dict[arg.dataset]], model=model, metrics=AccuracyMetric(), batch_size=torch.cuda.device_count() * arg.batch_size_per_gpu, diff --git a/reproduction/matching/matching_esim.py b/reproduction/matching/matching_esim.py index 2ff6916a..9d50c0fb 100644 --- a/reproduction/matching/matching_esim.py +++ b/reproduction/matching/matching_esim.py @@ -6,10 +6,11 @@ from torch.optim import Adamax from torch.optim.lr_scheduler import StepLR from fastNLP.core import Trainer, Tester, AccuracyMetric, Const -from fastNLP.core.callback import GradientClipCallback, LRScheduler -from fastNLP.embeddings.static_embedding import StaticEmbedding -from fastNLP.embeddings.elmo_embedding import ElmoEmbedding -from fastNLP.io.data_loader import SNLILoader, RTELoader, MNLILoader, QNLILoader, QuoraLoader +from fastNLP.core.callback import GradientClipCallback, LRScheduler, EvaluateCallback +from 
fastNLP.core.losses import CrossEntropyLoss +from fastNLP.embeddings import StaticEmbedding +from fastNLP.embeddings import ElmoEmbedding +from fastNLP.io.pipe.matching import SNLIPipe, RTEPipe, MNLIPipe, QNLIPipe, QuoraPipe from fastNLP.models.snli import ESIM @@ -17,18 +18,21 @@ from fastNLP.models.snli import ESIM class ESIMConfig: task = 'snli' + embedding = 'glove' + batch_size_per_gpu = 196 n_epochs = 30 lr = 2e-3 - seq_len_type = 'seq_len' - # seq_len表示在process的时候用len(words)来表示长度信息; - # mask表示用0/1掩码矩阵来表示长度信息; seed = 42 + save_path = None # 模型存储的位置,None表示不存储模型。 + train_dataset_name = 'train' dev_dataset_name = 'dev' test_dataset_name = 'test' - save_path = None # 模型存储的位置,None表示不存储模型。 + + to_lower = True # 忽略大小写 + tokenizer = 'spacy' # 使用spacy进行分词 arg = ESIMConfig() @@ -44,43 +48,32 @@ if n_gpu > 0: # load data set if arg.task == 'snli': - data_info = SNLILoader().process( - paths='path/to/snli/data', to_lower=False, seq_len_type=arg.seq_len_type, - get_index=True, concat=False, - ) + data_bundle = SNLIPipe(lower=arg.to_lower, tokenizer=arg.tokenizer).process_from_file() elif arg.task == 'rte': - data_info = RTELoader().process( - paths='path/to/rte/data', to_lower=False, seq_len_type=arg.seq_len_type, - get_index=True, concat=False, - ) + data_bundle = RTEPipe(lower=arg.to_lower, tokenizer=arg.tokenizer).process_from_file() elif arg.task == 'qnli': - data_info = QNLILoader().process( - paths='path/to/qnli/data', to_lower=False, seq_len_type=arg.seq_len_type, - get_index=True, concat=False, - ) + data_bundle = QNLIPipe(lower=arg.to_lower, tokenizer=arg.tokenizer).process_from_file() elif arg.task == 'mnli': - data_info = MNLILoader().process( - paths='path/to/mnli/data', to_lower=False, seq_len_type=arg.seq_len_type, - get_index=True, concat=False, - ) + data_bundle = MNLIPipe(lower=arg.to_lower, tokenizer=arg.tokenizer).process_from_file() elif arg.task == 'quora': - data_info = QuoraLoader().process( - paths='path/to/quora/data', to_lower=False, seq_len_type=arg.seq_len_type, - get_index=True, concat=False, - ) + data_bundle = QuoraPipe(lower=arg.to_lower, tokenizer=arg.tokenizer).process_from_file() else: raise RuntimeError(f'NOT support {arg.task} task yet!') +print(data_bundle) # print details in data_bundle + # load embedding if arg.embedding == 'elmo': - embedding = ElmoEmbedding(data_info.vocabs[Const.INPUT], requires_grad=True) + embedding = ElmoEmbedding(data_bundle.vocabs[Const.INPUTS(0)], model_dir_or_name='en-medium', + requires_grad=True) elif arg.embedding == 'glove': - embedding = StaticEmbedding(data_info.vocabs[Const.INPUT], requires_grad=True, normalize=False) + embedding = StaticEmbedding(data_bundle.vocabs[Const.INPUTS(0)], model_dir_or_name='en-glove-840b-300d', + requires_grad=True, normalize=False) else: raise RuntimeError(f'NOT support {arg.embedding} embedding yet!') # define model -model = ESIM(embedding, num_labels=len(data_info.vocabs[Const.TARGET])) +model = ESIM(embedding, num_labels=len(data_bundle.vocabs[Const.TARGET])) # define optimizer and callback optimizer = Adamax(lr=arg.lr, params=model.parameters()) @@ -91,23 +84,29 @@ callbacks = [ LRScheduler(scheduler), ] +if arg.task in ['snli']: + callbacks.append(EvaluateCallback(data=data_bundle.datasets[arg.test_dataset_name])) + # evaluate test set in every epoch if task is snli. 
+ # define trainer -trainer = Trainer(train_data=data_info.datasets[arg.train_dataset_name], model=model, +trainer = Trainer(train_data=data_bundle.datasets[arg.train_dataset_name], model=model, optimizer=optimizer, + loss=CrossEntropyLoss(), batch_size=torch.cuda.device_count() * arg.batch_size_per_gpu, n_epochs=arg.n_epochs, print_every=-1, - dev_data=data_info.datasets[arg.dev_dataset_name], + dev_data=data_bundle.datasets[arg.dev_dataset_name], metrics=AccuracyMetric(), metric_key='acc', device=[i for i in range(torch.cuda.device_count())], check_code_level=-1, - save_path=arg.save_path) + save_path=arg.save_path, + callbacks=callbacks) # train model trainer.train(load_best_model=True) # define tester tester = Tester( - data=data_info.datasets[arg.test_dataset_name], + data=data_bundle.datasets[arg.test_dataset_name], model=model, metrics=AccuracyMetric(), batch_size=torch.cuda.device_count() * arg.batch_size_per_gpu, diff --git a/reproduction/matching/matching_mwan.py b/reproduction/matching/matching_mwan.py index 31af54c5..026ea7b4 100644 --- a/reproduction/matching/matching_mwan.py +++ b/reproduction/matching/matching_mwan.py @@ -6,12 +6,11 @@ from torch.optim import Adadelta from torch.optim.lr_scheduler import StepLR from fastNLP import CrossEntropyLoss -from fastNLP import cache_results from fastNLP.core import Trainer, Tester, AccuracyMetric, Const -from fastNLP.core.callback import LRScheduler, FitlogCallback +from fastNLP.core.callback import LRScheduler, EvaluateCallback from fastNLP.embeddings import StaticEmbedding -from fastNLP.io.data_loader import MNLILoader, QNLILoader, SNLILoader, RTELoader +from fastNLP.io.pipe.matching import SNLIPipe, RTEPipe, MNLIPipe, QNLIPipe, QuoraPipe from reproduction.matching.model.mwan import MwanModel import fitlog @@ -46,47 +45,25 @@ for k in arg.__dict__: # load data set if arg.task == 'snli': - @cache_results(f'snli_mwan.pkl') - def read_snli(): - data_info = SNLILoader().process( - paths='path/to/snli/data', to_lower=True, seq_len_type=None, bert_tokenizer=None, - get_index=True, concat=False, extra_split=['/','%','-'], - ) - return data_info - data_info = read_snli() + data_bundle = SNLIPipe(lower=True, tokenizer='spacy').process_from_file() elif arg.task == 'rte': - @cache_results(f'rte_mwan.pkl') - def read_rte(): - data_info = RTELoader().process( - paths='path/to/rte/data', to_lower=True, seq_len_type=None, bert_tokenizer=None, - get_index=True, concat=False, extra_split=['/','%','-'], - ) - return data_info - data_info = read_rte() + data_bundle = RTEPipe(lower=True, tokenizer='spacy').process_from_file() elif arg.task == 'qnli': - data_info = QNLILoader().process( - paths='path/to/qnli/data', to_lower=True, seq_len_type=None, bert_tokenizer=None, - get_index=True, concat=False , cut_text=512, extra_split=['/','%','-'], - ) + data_bundle = QNLIPipe(lower=True, tokenizer='spacy').process_from_file() elif arg.task == 'mnli': - @cache_results(f'mnli_v0.9_mwan.pkl') - def read_mnli(): - data_info = MNLILoader().process( - paths='path/to/mnli/data', to_lower=True, seq_len_type=None, bert_tokenizer=None, - get_index=True, concat=False, extra_split=['/','%','-'], - ) - return data_info - data_info = read_mnli() + data_bundle = MNLIPipe(lower=True, tokenizer='spacy').process_from_file() +elif arg.task == 'quora': + data_bundle = QuoraPipe(lower=True, tokenizer='spacy').process_from_file() else: raise RuntimeError(f'NOT support {arg.task} task yet!') -print(data_info) -print(len(data_info.vocabs['words'])) +print(data_bundle) 
+print(len(data_bundle.vocabs[Const.INPUTS(0)])) model = MwanModel( - num_class = len(data_info.vocabs[Const.TARGET]), - EmbLayer = StaticEmbedding(data_info.vocabs[Const.INPUT], requires_grad=False, normalize=False), + num_class = len(data_bundle.vocabs[Const.TARGET]), + EmbLayer = StaticEmbedding(data_bundle.vocabs[Const.INPUTS(0)], requires_grad=False, normalize=False), ElmoLayer = None, args_of_imm = { "input_size" : 300 , @@ -105,21 +82,20 @@ callbacks = [ ] if arg.task in ['snli']: - callbacks.append(FitlogCallback(data_info.datasets[arg.testset_name], verbose=1)) + callbacks.append(EvaluateCallback(data=data_bundle.datasets[arg.testset_name])) elif arg.task == 'mnli': - callbacks.append(FitlogCallback({'dev_matched': data_info.datasets['dev_matched'], - 'dev_mismatched': data_info.datasets['dev_mismatched']}, - verbose=1)) + callbacks.append(EvaluateCallback(data={'dev_matched': data_bundle.datasets['dev_matched'], + 'dev_mismatched': data_bundle.datasets['dev_mismatched']},)) trainer = Trainer( - train_data = data_info.datasets['train'], + train_data = data_bundle.datasets['train'], model = model, optimizer = optimizer, num_workers = 0, batch_size = arg.batch_size, n_epochs = arg.n_epochs, print_every = -1, - dev_data = data_info.datasets[arg.devset_name], + dev_data = data_bundle.datasets[arg.devset_name], metrics = AccuracyMetric(pred = "pred" , target = "target"), metric_key = 'acc', device = [i for i in range(torch.cuda.device_count())], @@ -130,7 +106,7 @@ trainer = Trainer( trainer.train(load_best_model=True) tester = Tester( - data=data_info.datasets[arg.testset_name], + data=data_bundle.datasets[arg.testset_name], model=model, metrics=AccuracyMetric(), batch_size=arg.batch_size, diff --git a/reproduction/matching/model/bert.py b/reproduction/matching/model/bert.py index a21f8c36..73a0c533 100644 --- a/reproduction/matching/model/bert.py +++ b/reproduction/matching/model/bert.py @@ -3,39 +3,28 @@ import torch import torch.nn as nn from fastNLP.core.const import Const -from fastNLP.models import BaseModel -from fastNLP.embeddings.bert import BertModel +from fastNLP.models.base_model import BaseModel +from fastNLP.embeddings import BertEmbedding class BertForNLI(BaseModel): - # TODO: still in progress - def __init__(self, class_num=3, bert_dir=None): + def __init__(self, bert_embed: BertEmbedding, class_num=3): super(BertForNLI, self).__init__() - if bert_dir is not None: - self.bert = BertModel.from_pretrained(bert_dir) - else: - self.bert = BertModel() - hidden_size = self.bert.pooler.dense._parameters['bias'].size(-1) - self.classifier = nn.Linear(hidden_size, class_num) - - def forward(self, words, seq_len1, seq_len2, target=None): + self.embed = bert_embed + self.classifier = nn.Linear(self.embed.embedding_dim, class_num) + + def forward(self, words): """ :param torch.Tensor words: [batch_size, seq_len] input_ids - :param torch.Tensor seq_len1: [batch_size, seq_len] token_type_ids - :param torch.Tensor seq_len2: [batch_size, seq_len] attention_mask - :param torch.Tensor target: [batch] :return: """ - _, pooled_output = self.bert(words, seq_len1, seq_len2) - logits = self.classifier(pooled_output) + hidden = self.embed(words) + logits = self.classifier(hidden) - if target is not None: - loss_func = torch.nn.CrossEntropyLoss() - loss = loss_func(logits, target) - return {Const.OUTPUT: logits, Const.LOSS: loss} return {Const.OUTPUT: logits} - def predict(self, words, seq_len1, seq_len2, target=None): - return self.forward(words, seq_len1, seq_len2) + def predict(self, 
words): + logits = self.forward(words)[Const.OUTPUT] + return {Const.OUTPUT: logits.argmax(dim=-1)} diff --git a/reproduction/matching/model/cntn.py b/reproduction/matching/model/cntn.py index a0a104a3..cfa5e5a8 100644 --- a/reproduction/matching/model/cntn.py +++ b/reproduction/matching/model/cntn.py @@ -3,10 +3,8 @@ import torch.nn as nn import torch.nn.functional as F import numpy as np -from torch.nn import CrossEntropyLoss - -from fastNLP.models import BaseModel -from fastNLP.embeddings.embedding import TokenEmbedding +from fastNLP.models.base_model import BaseModel +from fastNLP.embeddings import TokenEmbedding from fastNLP.core.const import Const @@ -83,13 +81,12 @@ class CNTNModel(BaseModel): self.weight_V = nn.Linear(2 * ns, r) self.weight_u = nn.Sequential(nn.Dropout(p=dropout_rate), nn.Linear(r, num_labels)) - def forward(self, words1, words2, seq_len1, seq_len2, target=None): + def forward(self, words1, words2, seq_len1, seq_len2): """ :param words1: [batch, seq_len, emb_size] Question. :param words2: [batch, seq_len, emb_size] Answer. :param seq_len1: [batch] :param seq_len2: [batch] - :param target: [batch] Glod labels. :return: """ in_q = self.embedding(words1) @@ -109,12 +106,7 @@ class CNTNModel(BaseModel): in_a = self.fc_q(in_a.view(in_a.size(0), -1)) score = torch.tanh(self.weight_u(self.weight_M(in_q, in_a) + self.weight_V(torch.cat((in_q, in_a), -1)))) - if target is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(score, target) - return {Const.LOSS: loss, Const.OUTPUT: score} - else: - return {Const.OUTPUT: score} + return {Const.OUTPUT: score} - def predict(self, **kwargs): - return self.forward(**kwargs) + def predict(self, words1, words2, seq_len1, seq_len2): + return self.forward(words1, words2, seq_len1, seq_len2) diff --git a/reproduction/matching/model/esim.py b/reproduction/matching/model/esim.py index 87e5ba65..d704e2f8 100644 --- a/reproduction/matching/model/esim.py +++ b/reproduction/matching/model/esim.py @@ -2,10 +2,8 @@ import torch import torch.nn as nn import torch.nn.functional as F -from torch.nn import CrossEntropyLoss - -from fastNLP.models import BaseModel -from fastNLP.embeddings.embedding import TokenEmbedding +from fastNLP.models.base_model import BaseModel +from fastNLP.embeddings import TokenEmbedding from fastNLP.core.const import Const from fastNLP.core.utils import seq_len_to_mask @@ -42,13 +40,12 @@ class ESIMModel(BaseModel): nn.init.xavier_uniform_(self.classifier[1].weight.data) nn.init.xavier_uniform_(self.classifier[4].weight.data) - def forward(self, words1, words2, seq_len1, seq_len2, target=None): + def forward(self, words1, words2, seq_len1, seq_len2): """ :param words1: [batch, seq_len] :param words2: [batch, seq_len] :param seq_len1: [batch] :param seq_len2: [batch] - :param target: :return: """ mask1 = seq_len_to_mask(seq_len1, words1.size(1)) @@ -82,16 +79,10 @@ class ESIMModel(BaseModel): logits = torch.tanh(self.classifier(out)) # logits = self.classifier(out) - if target is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits, target) - - return {Const.LOSS: loss, Const.OUTPUT: logits} - else: - return {Const.OUTPUT: logits} + return {Const.OUTPUT: logits} - def predict(self, **kwargs): - pred = self.forward(**kwargs)[Const.OUTPUT].argmax(-1) + def predict(self, words1, words2, seq_len1, seq_len2): + pred = self.forward(words1, words2, seq_len1, seq_len2)[Const.OUTPUT].argmax(-1) return {Const.OUTPUT: pred} # input [batch_size, len , hidden] diff --git 
a/reproduction/matching/test/test_snlidataloader.py b/reproduction/matching/test/test_snlidataloader.py deleted file mode 100644 index 60b3ad59..00000000 --- a/reproduction/matching/test/test_snlidataloader.py +++ /dev/null @@ -1,10 +0,0 @@ -import unittest -from ..data import MatchingDataLoader -from fastNLP.core.vocabulary import Vocabulary - - -class TestCWSDataLoader(unittest.TestCase): - def test_case1(self): - snli_loader = MatchingDataLoader() - # TODO: still in progress - From f381703e80efabbcd1c43fd915ee7a2d003935f3 Mon Sep 17 00:00:00 2001 From: xuyige Date: Mon, 19 Aug 2019 20:48:30 +0800 Subject: [PATCH 076/153] export TokenEmbedding. --- fastNLP/embeddings/__init__.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fastNLP/embeddings/__init__.py b/fastNLP/embeddings/__init__.py index 4f90ac63..37881f17 100644 --- a/fastNLP/embeddings/__init__.py +++ b/fastNLP/embeddings/__init__.py @@ -7,6 +7,7 @@ torch.FloatTensor。所有的embedding都可以使用 `self.num_embedding` 获 __all__ = [ "Embedding", + "TokenEmbedding", "StaticEmbedding", "ElmoEmbedding", "BertEmbedding", @@ -14,14 +15,14 @@ __all__ = [ "StackEmbedding", "LSTMCharEmbedding", "CNNCharEmbedding", - "get_embeddings" + "get_embeddings", ] -from .embedding import Embedding +from .embedding import Embedding, TokenEmbedding from .static_embedding import StaticEmbedding from .elmo_embedding import ElmoEmbedding from .bert_embedding import BertEmbedding, BertWordPieceEncoder from .char_embedding import CNNCharEmbedding, LSTMCharEmbedding from .stack_embedding import StackEmbedding -from .utils import get_embeddings \ No newline at end of file +from .utils import get_embeddings From 7fb7c1b5b41d29c15f8322e12385ee9057e540e5 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Mon, 19 Aug 2019 21:48:41 +0800 Subject: [PATCH 077/153] make a tool to check the alias name and __all__ part --- docs/count.py | 98 +++++++++++++++++++++++++++++++++++++++++++++ fastNLP/__init__.py | 7 ++-- 2 files changed, 102 insertions(+), 3 deletions(-) create mode 100644 docs/count.py diff --git a/docs/count.py b/docs/count.py new file mode 100644 index 00000000..d906f4c0 --- /dev/null +++ b/docs/count.py @@ -0,0 +1,98 @@ +import os + + +def find_all(path='../fastNLP'): + head_list = [] + alias_list = [] + for path, dirs, files in os.walk(path): + for file in files: + if file.endswith('.py'): + name = ".".join(path.split('/')[1:]) + if file.split('.')[0] != "__init__": + name = name + '.' + file.split('.')[0] + if len(name.split('.')) < 3 or name.startswith('fastNLP.core'): + heads, alias = find_one(path + '/' + file) + for h in heads: + head_list.append(name + "." 
+ h) + for a in alias: + alias_list.append(a) + heads = {} + for h in head_list: + end = h.split('.')[-1] + file = h[:-len(end) - 1] + if end not in heads: + heads[end] = set() + heads[end].add(file) + alias = {} + for a in alias_list: + for each in a: + end = each.split('.')[-1] + file = each[:-len(end) - 1] + if end not in alias: + alias[end] = set() + alias[end].add(file) + print("IN alias NOT IN heads") + for item in alias: + if item not in heads: + print(item, alias[item]) + elif len(heads[item]) != 2: + print(item, alias[item], heads[item]) + + print("\n\nIN heads NOT IN alias") + for item in heads: + if item not in alias: + print(item, heads[item]) + + +def find_class(path): + with open(path, 'r') as fin: + lines = fin.readlines() + pars = {} + for i, line in enumerate(lines): + if line.strip().startswith('class'): + line = line.strip()[len('class'):-1].strip() + if line[-1] == ')': + line = line[:-1].split('(') + name = line[0].strip() + parents = line[1].split(',') + for i in range(len(parents)): + parents[i] = parents[i].strip() + if len(parents) == 1: + pars[name] = parents[0] + else: + pars[name] = tuple(parents) + return pars + + +def find_one(path): + head_list = [] + alias = [] + with open(path, 'r') as fin: + lines = fin.readlines() + flag = False + for i, line in enumerate(lines): + if line.strip().startswith('__all__'): + line = line.strip()[len('__all__'):].strip() + if line[-1] == ']': + line = line[1:-1].strip()[1:].strip() + head_list.append(line.strip("\"").strip("\'").strip()) + else: + flag = True + elif line.strip() == ']': + flag = False + elif flag: + line = line.strip()[:-1].strip("\"").strip("\'").strip() + if len(line) == 0 or line[0] == '#': + continue + head_list.append(line) + if line.startswith('def') or line.startswith('class'): + if lines[i + 2].strip().startswith("别名:"): + names = lines[i + 2].strip()[len("别名:"):].split() + names[0] = names[0][len(":class:`"):-1] + names[1] = names[1][len(":class:`"):-1] + alias.append((names[0], names[1])) + return head_list, alias + + +if __name__ == "__main__": + find_all() # use to check __all__ diff --git a/fastNLP/__init__.py b/fastNLP/__init__.py index ec192568..a6767088 100644 --- a/fastNLP/__init__.py +++ b/fastNLP/__init__.py @@ -13,11 +13,11 @@ fastNLP 中最常用的组件可以直接从 fastNLP 包中 import ,他们的 __all__ = [ "Instance", "FieldArray", - + "DataSetIter", "BatchIter", "TorchLoaderIter", - + "Vocabulary", "DataSet", "Const", @@ -51,7 +51,8 @@ __all__ = [ "LossFunc", "CrossEntropyLoss", - "L1Loss", "BCELoss", + "L1Loss", + "BCELoss", "NLLLoss", "LossInForward", From 0de2ec88239c82736491aef284513e1dea2deddf Mon Sep 17 00:00:00 2001 From: Yunfan Shao Date: Mon, 19 Aug 2019 22:08:48 +0800 Subject: [PATCH 078/153] [update] add default args for logger method --- fastNLP/io/_logger.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fastNLP/io/_logger.py b/fastNLP/io/_logger.py index 73c47d42..a69e297e 100644 --- a/fastNLP/io/_logger.py +++ b/fastNLP/io/_logger.py @@ -133,10 +133,12 @@ def _get_logger(name=None, level='INFO'): class FastNLPLogger(logging.Logger): - def add_file(self, path, level): + def add_file(self, path='./log.txt', level='INFO'): + """add log output file and level""" _add_file_handler(self, path, level) - def set_stdout(self, stdout, level): + def set_stdout(self, stdout='tqdm', level='INFO'): + """set stdout format and level""" _set_stdout_handler(self, stdout, level) _logger = _init_logger(path=None) From 3624f7dafddc23bfd60faeaeb4e8eefe541fa2eb Mon Sep 17 00:00:00 2001 From: yh 
Date: Mon, 19 Aug 2019 23:35:47 +0800 Subject: [PATCH 079/153] =?UTF-8?q?=E5=A2=9E=E5=8A=A0conll2003Pipe?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/callback.py | 2 +- fastNLP/core/metrics.py | 1 + fastNLP/io/pipe/conll.py | 92 ++++++++++++++++++- fastNLP/io/pipe/utils.py | 38 ++++---- .../chinese_ner/train_cn_ner.py | 2 +- 5 files changed, 113 insertions(+), 22 deletions(-) diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py index 53767011..47d4174b 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -646,7 +646,7 @@ class EvaluateCallback(Callback): raise TypeError("data receives dict[DataSet] or DataSet object.") def on_train_begin(self): - if len(self.datasets) > 0and self.trainer.dev_data is None: + if len(self.datasets) > 0 and self.trainer.dev_data is None: raise RuntimeError("Trainer has no dev data, you cannot pass extra DataSet to do evaluation.") if len(self.datasets) > 0: diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index 8dd51eb6..ef6f8b69 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -358,6 +358,7 @@ def _bmes_tag_to_spans(tags, ignore_labels=None): """ 给定一个tags的lis,比如['S-song', 'B-singer', 'M-singer', 'E-singer', 'S-moive', 'S-actor']。 返回[('song', (0, 1)), ('singer', (1, 4)), ('moive', (4, 5)), ('actor', (5, 6))] (左闭右开区间) + 也可以是单纯的['S', 'B', 'M', 'E', 'B', 'M', 'M',...]序列 :param tags: List[str], :param ignore_labels: List[str], 在该list中的label将被忽略 diff --git a/fastNLP/io/pipe/conll.py b/fastNLP/io/pipe/conll.py index fb599340..58fab281 100644 --- a/fastNLP/io/pipe/conll.py +++ b/fastNLP/io/pipe/conll.py @@ -5,8 +5,8 @@ from ...core.const import Const from ..loader.conll import Conll2003NERLoader, OntoNotesNERLoader from .utils import _indexize, _add_words_field from .utils import _add_chars_field -from ..loader.conll import PeopleDailyNERLoader, WeiboNERLoader, MsraNERLoader - +from ..loader.conll import PeopleDailyNERLoader, WeiboNERLoader, MsraNERLoader, ConllLoader +from ...core.vocabulary import Vocabulary class _NERPipe(Pipe): """ @@ -78,7 +78,7 @@ class Conll2003NERPipe(_NERPipe): :header: "raw_words", "words", "target", "seq_len" "[Nadim, Ladki]", "[2, 3]", "[1, 2]", 2 - "[AL-AIN, United, Arab, ...]", "[4, 5, 6,...]", "[3, 4]", 6 + "[AL-AIN, United, Arab, ...]", "[4, 5, 6,...]", "[3, 4,...]", 6 "[...]", "[...]", "[...]", . raw_words列为List[str], 是未转换的原始数据; words列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 @@ -102,6 +102,90 @@ class Conll2003NERPipe(_NERPipe): return data_bundle +class Conll2003Pipe(Pipe): + def __init__(self, chunk_encoding_type='bioes', ner_encoding_type='bioes', lower: bool = False, target_pad_val=0): + """ + 经过该Pipe后,DataSet中的内容如下 + + .. csv-table:: + :header: "raw_words", "words", "pos", "chunk", "ner", "seq_len" + + "[Nadim, Ladki]", "[2, 3]", "[0, 0]", "[1, 2]", "[1, 2]", 2 + "[AL-AIN, United, Arab, ...]", "[4, 5, 6,...]", "[1, 2...]", "[3, 4...]", "[3, 4...]", 6 + "[...]", "[...]", "[...]", "[...]", "[...]". 
+ + 其中words, seq_len是input; pos, chunk, ner, seq_len是target + + :param str chunk_encoding_type: 支持bioes, bio。 + :param str ner_encoding_type: 支持bioes, bio。 + :param bool lower: 是否将words列小写化后再建立词表 + :param int target_pad_val: pos, ner, chunk列的padding值 + """ + if chunk_encoding_type == 'bio': + self.chunk_convert_tag = iob2 + else: + self.chunk_convert_tag = lambda tags: iob2bioes(iob2(tags)) + if ner_encoding_type == 'bio': + self.ner_convert_tag = iob2 + else: + self.ner_convert_tag = lambda tags: iob2bioes(iob2(tags)) + self.lower = lower + self.target_pad_val = int(target_pad_val) + + def process(self, data_bundle)->DataBundle: + """ + 输入的DataSet应该类似于如下的形式 + + .. csv-table:: + :header: "raw_words", "pos", "chunk", "ner" + + "[Nadim, Ladki]", "[NNP, NNP]", "[B-NP, I-NP]", "[B-PER, I-PER]" + "[AL-AIN, United, Arab, ...]", "[NNP, NNP...]", "[B-NP, B-NP, ...]", "[B-LOC, B-LOC,...]" + "[...]", "[...]", "[...]", "[...]". + + :param data_bundle: + :return: 传入的DataBundle + """ + # 转换tag + for name, dataset in data_bundle.datasets.items(): + dataset.drop(lambda x: "-DOCSTART-" in x[Const.RAW_WORD]) + dataset.apply_field(self.chunk_convert_tag, field_name='chunk', new_field_name='chunk') + dataset.apply_field(self.ner_convert_tag, field_name='ner', new_field_name='ner') + + _add_words_field(data_bundle, lower=self.lower) + + # index + _indexize(data_bundle, input_field_names=Const.INPUT, target_field_names=['pos', 'ner']) + # chunk中存在一些tag只在dev中出现,没在train中 + tgt_vocab = Vocabulary(unknown=None, padding=None) + tgt_vocab.from_dataset(*data_bundle.datasets.values(), field_name='ner') + tgt_vocab.index_dataset(*data_bundle.datasets.values(), field_name='ner') + data_bundle.set_vocab(tgt_vocab, 'ner') + + input_fields = [Const.INPUT, Const.INPUT_LEN] + target_fields = ['pos', 'ner', 'chunk', Const.INPUT_LEN] + + for name, dataset in data_bundle.datasets.items(): + dataset.set_pad_val('pos', self.target_pad_val) + dataset.set_pad_val('ner', self.target_pad_val) + dataset.set_pad_val('chunk', self.target_pad_val) + dataset.add_seq_len(Const.INPUT) + + data_bundle.set_input(*input_fields) + data_bundle.set_target(*target_fields) + + return data_bundle + + def process_from_file(self, paths): + """ + + :param paths: + :return: + """ + data_bundle = ConllLoader(headers=['raw_words', 'pos', 'chunk', 'ner']).load(paths) + return self.process(data_bundle) + + class OntoNotesNERPipe(_NERPipe): """ 处理OntoNotes的NER数据,处理之后DataSet中的field情况为 @@ -171,7 +255,7 @@ class _CNNERPipe(Pipe): _add_chars_field(data_bundle, lower=False) # index - _indexize(data_bundle, input_field_name=Const.CHAR_INPUT, target_field_name=Const.TARGET) + _indexize(data_bundle, input_field_names=Const.CHAR_INPUT, target_field_names=Const.TARGET) input_fields = [Const.TARGET, Const.CHAR_INPUT, Const.INPUT_LEN] target_fields = [Const.TARGET, Const.INPUT_LEN] diff --git a/fastNLP/io/pipe/utils.py b/fastNLP/io/pipe/utils.py index 7d011446..8facd8d9 100644 --- a/fastNLP/io/pipe/utils.py +++ b/fastNLP/io/pipe/utils.py @@ -4,7 +4,8 @@ from ...core.const import Const def iob2(tags:List[str])->List[str]: """ - 检查数据是否是合法的IOB数据,如果是IOB1会被自动转换为IOB2。两种格式的区别见https://datascience.stackexchange.com/questions/37824/difference-between-iob-and-iob2-format + 检查数据是否是合法的IOB数据,如果是IOB1会被自动转换为IOB2。两种格式的区别见 + https://datascience.stackexchange.com/questions/37824/difference-between-iob-and-iob2-format :param tags: 需要转换的tags """ @@ -76,27 +77,32 @@ def _raw_split(sent): return sent.split() -def _indexize(data_bundle, input_field_name=Const.INPUT, 
target_field_name=Const.TARGET): +def _indexize(data_bundle, input_field_names=Const.INPUT, target_field_names=Const.TARGET): """ 在dataset中的field_name列建立词表,Const.TARGET列建立词表,并把词表加入到data_bundle中。 :param data_bundle: - :param: str input_field_name: - :param: str target_field_name: 这一列的vocabulary没有unknown和padding + :param: str,list input_field_names: + :param: str,list target_field_names: 这一列的vocabulary没有unknown和padding :return: """ - src_vocab = Vocabulary() - src_vocab.from_dataset(data_bundle.datasets['train'], field_name=input_field_name, - no_create_entry_dataset=[dataset for name, dataset in data_bundle.datasets.items() if - name != 'train']) - src_vocab.index_dataset(*data_bundle.datasets.values(), field_name=input_field_name) - - tgt_vocab = Vocabulary(unknown=None, padding=None) - tgt_vocab.from_dataset(data_bundle.datasets['train'], field_name=target_field_name) - tgt_vocab.index_dataset(*data_bundle.datasets.values(), field_name=target_field_name) - - data_bundle.set_vocab(src_vocab, input_field_name) - data_bundle.set_vocab(tgt_vocab, target_field_name) + if isinstance(input_field_names, str): + input_field_names = [input_field_names] + if isinstance(target_field_names, str): + target_field_names = [target_field_names] + for input_field_name in input_field_names: + src_vocab = Vocabulary() + src_vocab.from_dataset(data_bundle.datasets['train'], field_name=input_field_name, + no_create_entry_dataset=[dataset for name, dataset in data_bundle.datasets.items() if + name != 'train']) + src_vocab.index_dataset(*data_bundle.datasets.values(), field_name=input_field_name) + data_bundle.set_vocab(src_vocab, input_field_name) + + for target_field_name in target_field_names: + tgt_vocab = Vocabulary(unknown=None, padding=None) + tgt_vocab.from_dataset(data_bundle.datasets['train'], field_name=target_field_name) + tgt_vocab.index_dataset(*data_bundle.datasets.values(), field_name=target_field_name) + data_bundle.set_vocab(tgt_vocab, target_field_name) return data_bundle diff --git a/reproduction/seqence_labelling/chinese_ner/train_cn_ner.py b/reproduction/seqence_labelling/chinese_ner/train_cn_ner.py index 1005ea23..58b32265 100644 --- a/reproduction/seqence_labelling/chinese_ner/train_cn_ner.py +++ b/reproduction/seqence_labelling/chinese_ner/train_cn_ner.py @@ -47,7 +47,7 @@ class ChineseNERPipe(Pipe): _add_chars_field(data_bundle, lower=False) # index - _indexize(data_bundle, input_field_name=C.CHAR_INPUT, target_field_name=C.TARGET) + _indexize(data_bundle, input_field_names=C.CHAR_INPUT, target_field_names=C.TARGET) for name, dataset in data_bundle.datasets.items(): dataset.set_pad_val(C.TARGET, self.target_pad_val) From fc8438587b7d3064e8e498aa046b1dccec276f70 Mon Sep 17 00:00:00 2001 From: yh Date: Mon, 19 Aug 2019 23:37:25 +0800 Subject: [PATCH 080/153] Conll2003Pipe --- fastNLP/io/pipe/conll.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fastNLP/io/pipe/conll.py b/fastNLP/io/pipe/conll.py index 58fab281..d253f3be 100644 --- a/fastNLP/io/pipe/conll.py +++ b/fastNLP/io/pipe/conll.py @@ -158,9 +158,9 @@ class Conll2003Pipe(Pipe): _indexize(data_bundle, input_field_names=Const.INPUT, target_field_names=['pos', 'ner']) # chunk中存在一些tag只在dev中出现,没在train中 tgt_vocab = Vocabulary(unknown=None, padding=None) - tgt_vocab.from_dataset(*data_bundle.datasets.values(), field_name='ner') - tgt_vocab.index_dataset(*data_bundle.datasets.values(), field_name='ner') - data_bundle.set_vocab(tgt_vocab, 'ner') + tgt_vocab.from_dataset(*data_bundle.datasets.values(), 
field_name='chunk') + tgt_vocab.index_dataset(*data_bundle.datasets.values(), field_name='chunk') + data_bundle.set_vocab(tgt_vocab, 'chunk') input_fields = [Const.INPUT, Const.INPUT_LEN] target_fields = ['pos', 'ner', 'chunk', Const.INPUT_LEN] From d0354d8e2883b4433378c4a6a247972de664a06d Mon Sep 17 00:00:00 2001 From: ChenXin Date: Tue, 20 Aug 2019 00:00:31 +0800 Subject: [PATCH 081/153] fix some importing bugs --- fastNLP/__init__.py | 5 +- fastNLP/core/__init__.py | 2 +- fastNLP/core/field.py | 191 +++++++++++++++++++++------------------ 3 files changed, 108 insertions(+), 90 deletions(-) diff --git a/fastNLP/__init__.py b/fastNLP/__init__.py index a6767088..879fd644 100644 --- a/fastNLP/__init__.py +++ b/fastNLP/__init__.py @@ -14,6 +14,7 @@ __all__ = [ "Instance", "FieldArray", + "DataSetIter", "BatchIter", "TorchLoaderIter", @@ -31,6 +32,7 @@ __all__ = [ "TensorboardCallback", "LRScheduler", "ControlC", + "LRFinder", "Padder", "AutoPadder", @@ -43,7 +45,8 @@ __all__ = [ "Optimizer", "SGD", "Adam", - + "AdamW", + "Sampler", "SequentialSampler", "BucketSampler", diff --git a/fastNLP/core/__init__.py b/fastNLP/core/__init__.py index acf0efc4..4a43b73d 100644 --- a/fastNLP/core/__init__.py +++ b/fastNLP/core/__init__.py @@ -22,7 +22,7 @@ from .field import FieldArray, Padder, AutoPadder, EngChar2DPadder from .instance import Instance from .losses import LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, LossInForward from .metrics import AccuracyMetric, SpanFPreRecMetric, ExtractiveQAMetric -from .optimizer import Optimizer, SGD, Adam +from .optimizer import Optimizer, SGD, Adam, AdamW from .sampler import SequentialSampler, BucketSampler, RandomSampler, Sampler from .tester import Tester from .trainer import Trainer diff --git a/fastNLP/core/field.py b/fastNLP/core/field.py index 65bd9be4..26d22ada 100644 --- a/fastNLP/core/field.py +++ b/fastNLP/core/field.py @@ -1,4 +1,8 @@ - +__all__ = [ + "Padder", + "AutoPadder", + "EngChar2DPadder", +] from numbers import Number import torch @@ -9,24 +13,27 @@ from copy import deepcopy from collections import Counter from .utils import _is_iterable + class SetInputOrTargetException(Exception): def __init__(self, msg, index=None, field_name=None): super().__init__(msg) self.msg = msg self.index = index # 标示在哪个数据遭遇到问题了 - self.field_name = field_name # 标示当前field的名称 + self.field_name = field_name # 标示当前field的名称 + class AppendToTargetOrInputException(Exception): def __init__(self, msg, index=None, field_name=None): super().__init__(msg) self.msg = msg self.index = index # 标示在哪个数据遭遇到问题了 - self.field_name = field_name # 标示当前field的名称 + self.field_name = field_name # 标示当前field的名称 + class FieldArray: def __init__(self, name, content, is_target=False, is_input=False, padder=None, ignore_type=False, use_1st_ins_infer_dim_type=True): - if len(content)==0: + if len(content) == 0: raise RuntimeError("Empty fieldarray is not allowed.") _content = content try: @@ -43,34 +50,34 @@ class FieldArray: self._use_1st_ins_infer_dim_type = bool(use_1st_ins_infer_dim_type) self._is_input = False self._is_target = False - + if is_input: self.is_input = is_input if is_target: self.is_target = is_target - + if padder is None: padder = AutoPadder(pad_val=0) else: assert isinstance(padder, Padder), "padder must be of type fastNLP.Padder." 
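            # The padder given by the caller is deep-copied (both on the next line and inside set_padder),
            # so later changes to the original padder object do not affect this FieldArray. A rough usage
            # sketch of the resulting behaviour:
            #     fa = FieldArray('words', [[1, 2, 3], [4, 5]], is_input=True, padder=AutoPadder(pad_val=0))
            #     fa.get([0, 1])   # -> numpy array of shape (2, 3), the short row padded with 0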
padder = deepcopy(padder) self.set_padder(padder) - + @property def ignore_type(self): return self._ignore_type - + @ignore_type.setter def ignore_type(self, value): if value: self._cell_ndim = None self.dtype = None self._ignore_type = value - + @property def is_input(self): return self._is_input - + @is_input.setter def is_input(self, value): """ @@ -85,11 +92,11 @@ class FieldArray: self.dtype = None self._cell_ndim = None self._is_input = value - + @property def is_target(self): return self._is_target - + @is_target.setter def is_target(self, value): """ @@ -103,7 +110,7 @@ class FieldArray: self.dtype = None self._cell_ndim = None self._is_target = value - + def _check_dtype_and_ndim(self, only_check_1st_ins_dim_type=True): """ 检查当前content所有的element是否是同一个类型,且是否每个元素具有相同的维度。通过的话,设置_cell_ndim与_ele_type属性;没有 @@ -120,35 +127,37 @@ class FieldArray: for cell in self.content[1:]: index += 1 type_i, dim_i = _get_ele_type_and_dim(cell) - if type_i!=type_0: - raise SetInputOrTargetException("Type:{} in index {} is different from the first element with type:{}." - ".".format(type_i, index, type_0)) - if dim_0!=dim_i: - raise SetInputOrTargetException("Dimension:{} in index {} is different from the first element with " - "dimension:{}.".format(dim_i, index, dim_0)) + if type_i != type_0: + raise SetInputOrTargetException( + "Type:{} in index {} is different from the first element with type:{}." + ".".format(type_i, index, type_0)) + if dim_0 != dim_i: + raise SetInputOrTargetException( + "Dimension:{} in index {} is different from the first element with " + "dimension:{}.".format(dim_i, index, dim_0)) self._cell_ndim = dim_0 self.dtype = type_0 except SetInputOrTargetException as e: e.index = index raise e - - def append(self, val:Any): + + def append(self, val: Any): """ :param val: 把该val append到fieldarray。 :return: """ if (self._is_target or self._is_input) and self._ignore_type is False and not self._use_1st_ins_infer_dim_type: type_, dim_ = _get_ele_type_and_dim(val) - if self.dtype!=type_: + if self.dtype != type_: raise AppendToTargetOrInputException(f"Value(type:{type_}) are of different types with " f"previous values(type:{self.dtype}).") - if self._cell_ndim!=dim_: + if self._cell_ndim != dim_: raise AppendToTargetOrInputException(f"Value(dim:{dim_}) are of different dimensions with " f"previous values(dim:{self._cell_ndim}).") self.content.append(val) else: self.content.append(val) - + def pop(self, index): """ 删除该field中index处的元素 @@ -156,22 +165,22 @@ class FieldArray: :return: """ self.content.pop(index) - + def __getitem__(self, indices): return self.get(indices, pad=False) - + def __setitem__(self, idx, val): assert isinstance(idx, int) if (self._is_target or self._is_input) and self.ignore_type is False: # 需要检测类型 type_, dim_ = _get_ele_type_and_dim(val) - if self.dtype!=type_: + if self.dtype != type_: raise RuntimeError(f"Value(type:{type_}) are of different types with " - f"other values(type:{self.dtype}).") - if self._cell_ndim!=dim_: + f"other values(type:{self.dtype}).") + if self._cell_ndim != dim_: raise RuntimeError(f"Value(dim:{dim_}) are of different dimensions with " - f"previous values(dim:{self._cell_ndim}).") + f"previous values(dim:{self._cell_ndim}).") self.content[idx] = val - + def get(self, indices, pad=True): """ 根据给定的indices返回内容 @@ -184,16 +193,16 @@ class FieldArray: return self.content[indices] if self.is_input is False and self.is_target is False: raise RuntimeError("Please specify either is_input or is_target to True for {}".format(self.name)) - + contents = 
[self.content[i] for i in indices] if self.padder is None or pad is False: return np.array(contents) else: return self.pad(contents) - + def pad(self, contents): return self.padder(contents, field_name=self.name, field_ele_dtype=self.dtype, dim=self._cell_ndim) - + def set_padder(self, padder): """ 设置padder,在这个field进行pad的时候用这个padder进行pad,如果为None则不进行pad。 @@ -205,7 +214,7 @@ class FieldArray: self.padder = deepcopy(padder) else: self.padder = None - + def set_pad_val(self, pad_val): """ 修改padder的pad_val. @@ -215,7 +224,7 @@ class FieldArray: if self.padder is not None: self.padder.set_pad_val(pad_val) return self - + def __len__(self): """ Returns the size of FieldArray. @@ -223,7 +232,7 @@ class FieldArray: :return int length: """ return len(self.content) - + def to(self, other): """ 将other的属性复制给本FieldArray(other必须为FieldArray类型). @@ -233,15 +242,15 @@ class FieldArray: :return: :class:`~fastNLP.FieldArray` """ assert isinstance(other, FieldArray), "Only supports fastNLP.FieldArray type, not {}.".format(type(other)) - + self.ignore_type = other.ignore_type self.is_input = other.is_input self.is_target = other.is_target self.padder = other.padder - + return self - - def split(self, sep:str=None, inplace:bool=True): + + def split(self, sep: str = None, inplace: bool = True): """ 依次对自身的元素使用.split()方法,应该只有当本field的元素为str时,该方法才有用。将返回值 @@ -257,8 +266,8 @@ class FieldArray: print(f"Exception happens when process value in index {index}.") raise e return self._after_process(new_contents, inplace=inplace) - - def int(self, inplace:bool=True): + + def int(self, inplace: bool = True): """ 将本field中的值调用int(cell). 支持field中内容为以下两种情况(1)['1', '2', ...](即field中每个值为str的), (2) [['1', '2', ..], ['3', ..], ...](即field中每个值为一个list,list中的值会被依次转换。) @@ -277,7 +286,7 @@ class FieldArray: print(f"Exception happens when process value in index {index}.") print(e) return self._after_process(new_contents, inplace=inplace) - + def float(self, inplace=True): """ 将本field中的值调用float(cell). 支持field中内容为以下两种情况(1)['1', '2', ...](即field中每个值为str的), @@ -297,7 +306,7 @@ class FieldArray: print(f"Exception happens when process value in index {index}.") raise e return self._after_process(new_contents, inplace=inplace) - + def bool(self, inplace=True): """ 将本field中的值调用bool(cell). 支持field中内容为以下两种情况(1)['1', '2', ...](即field中每个值为str的), @@ -316,9 +325,9 @@ class FieldArray: except Exception as e: print(f"Exception happens when process value in index {index}.") raise e - + return self._after_process(new_contents, inplace=inplace) - + def lower(self, inplace=True): """ 将本field中的值调用cell.lower(). 支持field中内容为以下两种情况(1)['1', '2', ...](即field中每个值为str的), @@ -338,7 +347,7 @@ class FieldArray: print(f"Exception happens when process value in index {index}.") raise e return self._after_process(new_contents, inplace=inplace) - + def upper(self, inplace=True): """ 将本field中的值调用cell.lower(). 
支持field中内容为以下两种情况(1)['1', '2', ...](即field中每个值为str的), @@ -358,7 +367,7 @@ class FieldArray: print(f"Exception happens when process value in index {index}.") raise e return self._after_process(new_contents, inplace=inplace) - + def value_count(self): """ 返回该field下不同value的数量。多用于统计label数量 @@ -366,17 +375,18 @@ class FieldArray: :return: Counter, key是label,value是出现次数 """ count = Counter() - + def cum(cell): if _is_iterable(cell) and not isinstance(cell, str): for cell_ in cell: cum(cell_) else: count[cell] += 1 + for cell in self.content: cum(cell) return count - + def _after_process(self, new_contents, inplace): """ 当调用处理函数之后,决定是否要替换field。 @@ -398,7 +408,7 @@ class FieldArray: return new_contents -def _get_ele_type_and_dim(cell:Any, dim=0): +def _get_ele_type_and_dim(cell: Any, dim=0): """ 识别cell的类别与dimension的数量 @@ -414,13 +424,13 @@ def _get_ele_type_and_dim(cell:Any, dim=0): elif isinstance(cell, list): dim += 1 res = [_get_ele_type_and_dim(cell_i, dim) for cell_i in cell] - types = set([i for i,j in res]) - dims = set([j for i,j in res]) - if len(types)>1: + types = set([i for i, j in res]) + dims = set([j for i, j in res]) + if len(types) > 1: raise SetInputOrTargetException("Mixed types detected: {}.".format(list(types))) - elif len(types)==0: + elif len(types) == 0: raise SetInputOrTargetException("Empty value encountered.") - if len(dims)>1: + if len(dims) > 1: raise SetInputOrTargetException("Mixed dimension detected: {}.".format(list(dims))) return types.pop(), dims.pop() elif isinstance(cell, torch.Tensor): @@ -431,16 +441,16 @@ def _get_ele_type_and_dim(cell:Any, dim=0): # 否则需要继续往下iterate dim += 1 res = [_get_ele_type_and_dim(cell_i, dim) for cell_i in cell] - types = set([i for i,j in res]) - dims = set([j for i,j in res]) - if len(types)>1: + types = set([i for i, j in res]) + dims = set([j for i, j in res]) + if len(types) > 1: raise SetInputOrTargetException("Mixed types detected: {}.".format(list(types))) - elif len(types)==0: + elif len(types) == 0: raise SetInputOrTargetException("Empty value encountered.") - if len(dims)>1: + if len(dims) > 1: raise SetInputOrTargetException("Mixed dimension detected: {}.".format(list(dims))) return types.pop(), dims.pop() - else: # 包含tuple, set, dict以及其它的类型 + else: # 包含tuple, set, dict以及其它的类型 raise SetInputOrTargetException(f"Cannot process type:{type(cell)}.") @@ -462,15 +472,15 @@ class Padder: :return: np.array([padded_element]) """ - + def __init__(self, pad_val=0, **kwargs): self.pad_val = pad_val - + def set_pad_val(self, pad_val): self.pad_val = pad_val - + @abstractmethod - def __call__(self, contents, field_name, field_ele_dtype, dim:int): + def __call__(self, contents, field_name, field_ele_dtype, dim: int): """ 传入的是List内容。假设有以下的DataSet。 @@ -537,23 +547,24 @@ class AutoPadder(Padder): 3 其它情况不进行处理,返回一个np.array类型。 """ + def __init__(self, pad_val=0): super().__init__(pad_val=pad_val) - + def __call__(self, contents, field_name, field_ele_dtype, dim): if field_ele_dtype: - if dim>3: + if dim > 3: return np.array(contents) if isinstance(field_ele_dtype, type) and \ (issubclass(field_ele_dtype, np.number) or issubclass(field_ele_dtype, Number)): - if dim==0: + if dim == 0: array = np.array(contents, dtype=field_ele_dtype) - elif dim==1: + elif dim == 1: max_len = max(map(len, contents)) array = np.full((len(contents), max_len), self.pad_val, dtype=field_ele_dtype) for i, content_i in enumerate(contents): array[i, :len(content_i)] = content_i - elif dim==2: + elif dim == 2: max_len = max(map(len, contents)) max_word_len = 
max([max([len(content_ii) for content_ii in content_i]) for content_i in contents]) @@ -563,20 +574,21 @@ class AutoPadder(Padder): array[i, j, :len(content_ii)] = content_ii else: shape = np.shape(contents) - if len(shape)==4: # 说明各dimension是相同的大小 + if len(shape) == 4: # 说明各dimension是相同的大小 array = np.array(contents, dtype=field_ele_dtype) else: - raise RuntimeError(f"Field:{field_name} has 3 dimensions, every sample should have the same shape.") + raise RuntimeError( + f"Field:{field_name} has 3 dimensions, every sample should have the same shape.") return array elif str(field_ele_dtype).startswith('torch'): - if dim==0: + if dim == 0: tensor = torch.tensor(contents).to(field_ele_dtype) - elif dim==1: + elif dim == 1: max_len = max(map(len, contents)) tensor = torch.full((len(contents), max_len), fill_value=self.pad_val, dtype=field_ele_dtype) for i, content_i in enumerate(contents): tensor[i, :len(content_i)] = torch.tensor(content_i) - elif dim==2: + elif dim == 2: max_len = max(map(len, contents)) max_word_len = max([max([len(content_ii) for content_ii in content_i]) for content_i in contents]) @@ -587,15 +599,18 @@ class AutoPadder(Padder): tensor[i, j, :len(content_ii)] = torch.tensor(content_ii) else: shapes = set([np.shape(content_i) for content_i in contents]) - if len(shapes)>1: - raise RuntimeError(f"Field:{field_name} has 3 dimensions, every sample should have the same shape.") + if len(shapes) > 1: + raise RuntimeError( + f"Field:{field_name} has 3 dimensions, every sample should have the same shape.") shape = shapes.pop() - if len(shape)==3: - tensor = torch.full([len(contents)]+list(shape), fill_value=self.pad_val, dtype=field_ele_dtype) + if len(shape) == 3: + tensor = torch.full([len(contents)] + list(shape), fill_value=self.pad_val, + dtype=field_ele_dtype) for i, content_i in enumerate(contents): tensor[i] = torch.tensor(content_i, dtype=field_ele_dtype) else: - raise RuntimeError(f"Field:{field_name} has 3 dimensions, every sample should have the same shape.") + raise RuntimeError( + f"Field:{field_name} has 3 dimensions, every sample should have the same shape.") return tensor else: return np.array(contents) # 不进行任何操作 @@ -626,7 +641,7 @@ class EngChar2DPadder(Padder): dataset.set_padder('chars', padder) # chars这个field的设置为了EnChar2DPadder """ - + def __init__(self, pad_val=0, pad_length=0): """ :param pad_val: int, pad的位置使用该index @@ -634,9 +649,9 @@ class EngChar2DPadder(Padder): 都pad或截取到该长度. """ super().__init__(pad_val=pad_val) - + self.pad_length = pad_length - + def __call__(self, contents, field_name, field_ele_dtype, dim): """ 期望输入类似于 @@ -655,7 +670,7 @@ class EngChar2DPadder(Padder): raise TypeError('dtype of Field:{} should be np.int64 or np.float64 to do 2D padding, get {}.'.format( field_name, field_ele_dtype )) - assert dim==2, f"Field:{field_name} has {dim}, EngChar2DPadder only supports input with 2 dimensions." + assert dim == 2, f"Field:{field_name} has {dim}, EngChar2DPadder only supports input with 2 dimensions." 
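        # Shape sketch: for contents like [[[1, 2], [3]], [[4, 5, 6]]] (two sentences, each a list of
        # per-word char-id lists), the result below is a (batch=2, max_sent_len=2, max_char_len=3)
        # array: [[[1, 2, 0], [3, 0, 0]], [[4, 5, 6], [0, 0, 0]]] when pad_val is 0.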
if self.pad_length < 1: max_char_length = max([max(len(char_lst) for char_lst in word_lst) for word_lst in contents]) else: @@ -663,12 +678,12 @@ class EngChar2DPadder(Padder): max_sent_length = max(len(word_lst) for word_lst in contents) batch_size = len(contents) dtype = type(contents[0][0][0]) - + padded_array = np.full((batch_size, max_sent_length, max_char_length), fill_value=self.pad_val, dtype=dtype) for b_idx, word_lst in enumerate(contents): for c_idx, char_lst in enumerate(word_lst): chars = char_lst[:max_char_length] padded_array[b_idx, c_idx, :len(chars)] = chars - + return padded_array From 32a2f197e12bd67b7485ab77dd19e9b7eb55e552 Mon Sep 17 00:00:00 2001 From: yh Date: Tue, 20 Aug 2019 15:06:01 +0800 Subject: [PATCH 082/153] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E5=AF=B9metric=20nam?= =?UTF-8?q?e=E8=AE=BE=E7=BD=AE=E7=9A=84=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/metrics.py | 17 +++++++++++++++++ fastNLP/core/tester.py | 2 +- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index ef6f8b69..007485b2 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -118,6 +118,7 @@ class MetricBase(object): def __init__(self): self._param_map = {} # key is param in function, value is input param. self._checked = False + self._metric_name = self.__class__.__name__ @property def param_map(self): @@ -135,6 +136,22 @@ class MetricBase(object): @abstractmethod def get_metric(self, reset=True): raise NotImplemented + + def set_metric_name(self, name:str): + """ + 设置metric的名称,默认是Metric的class name. + + :param str name: + :return: + """ + self._metric_name = name + + def get_metric_name(self): + """ + 返回metric的名称 + :return: + """ + return self._metric_name def _init_param_map(self, key_map=None, **kwargs): """检查key_map和其他参数map,并将这些映射关系添加到self._param_map diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index ab86fb62..e4d67261 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -178,7 +178,7 @@ class Tester(object): if not isinstance(eval_result, dict): raise TypeError(f"The return value of {_get_func_signature(metric.get_metric)} must be " f"`dict`, got {type(eval_result)}") - metric_name = metric.__class__.__name__ + metric_name = metric.get_metric_name() eval_results[metric_name] = eval_result end_time = time.time() From 4e59d887245ba0fe77a9c4ea8fb610667118246d Mon Sep 17 00:00:00 2001 From: yunfan Date: Tue, 20 Aug 2019 15:28:21 +0800 Subject: [PATCH 083/153] [update] move logger to fastNLP.core, update logging format --- fastNLP/__init__.py | 4 +- fastNLP/core/__init__.py | 1 + fastNLP/{io => core}/_logger.py | 40 +++++++++++-------- fastNLP/core/callback.py | 2 +- fastNLP/core/tester.py | 2 +- fastNLP/core/trainer.py | 3 +- fastNLP/core/utils.py | 4 +- fastNLP/io/__init__.py | 2 - .../text_classification/train_dpcnn.py | 4 +- 9 files changed, 34 insertions(+), 28 deletions(-) rename fastNLP/{io => core}/_logger.py (88%) diff --git a/fastNLP/__init__.py b/fastNLP/__init__.py index 879fd644..2720f292 100644 --- a/fastNLP/__init__.py +++ b/fastNLP/__init__.py @@ -59,7 +59,9 @@ __all__ = [ "NLLLoss", "LossInForward", - "cache_results" + "cache_results", + + 'logger' ] __version__ = '0.4.5' diff --git a/fastNLP/core/__init__.py b/fastNLP/core/__init__.py index 4a43b73d..1feaf3fb 100644 --- a/fastNLP/core/__init__.py +++ b/fastNLP/core/__init__.py @@ -28,3 +28,4 @@ from .tester import Tester from .trainer import 
Trainer from .utils import cache_results, seq_len_to_mask, get_seq_len from .vocabulary import Vocabulary +from ._logger import logger diff --git a/fastNLP/io/_logger.py b/fastNLP/core/_logger.py similarity index 88% rename from fastNLP/io/_logger.py rename to fastNLP/core/_logger.py index a69e297e..50266d7a 100644 --- a/fastNLP/io/_logger.py +++ b/fastNLP/core/_logger.py @@ -69,7 +69,7 @@ def _add_file_handler(logger, path, level='INFO'): file_handler = logging.FileHandler(path, mode='a') file_handler.setLevel(_get_level(level)) - file_formatter = logging.Formatter(fmt='%(asctime)s - [%(levelname)s] - %(message)s', + file_formatter = logging.Formatter(fmt='%(asctime)s - %(module)s - [%(levelname)s] - %(message)s', datefmt='%Y/%m/%d %H:%M:%S') file_handler.setFormatter(file_formatter) logger.addHandler(file_handler) @@ -97,18 +97,36 @@ def _set_stdout_handler(logger, stdout='tqdm', level='INFO'): stream_handler = None if stream_handler is not None: - stream_formatter = logging.Formatter('[%(levelname)s] %(message)s') + stream_formatter = logging.Formatter('%(message)s') stream_handler.setLevel(level) stream_handler.setFormatter(stream_formatter) logger.addHandler(stream_handler) + +class FastNLPLogger(logging.getLoggerClass()): + def __init__(self, name): + super().__init__(name) + + def add_file(self, path='./log.txt', level='INFO'): + """add log output file and level""" + _add_file_handler(self, path, level) + + def set_stdout(self, stdout='tqdm', level='INFO'): + """set stdout format and level""" + _set_stdout_handler(self, stdout, level) + +logging.setLoggerClass(FastNLPLogger) +# print(logging.getLoggerClass()) +# print(logging.getLogger()) + def _init_logger(path=None, stdout='tqdm', level='INFO'): """initialize logger""" level = _get_level(level) - # logger = logging.getLogger(ROOT_NAME) - logger = logging.getLogger() + # logger = logging.getLogger() + logger = logging.getLogger(ROOT_NAME) + logger.propagate = False logger.setLevel(level) _set_stdout_handler(logger, stdout, level) @@ -132,16 +150,4 @@ def _get_logger(name=None, level='INFO'): return logger -class FastNLPLogger(logging.Logger): - def add_file(self, path='./log.txt', level='INFO'): - """add log output file and level""" - _add_file_handler(self, path, level) - - def set_stdout(self, stdout='tqdm', level='INFO'): - """set stdout format and level""" - _set_stdout_handler(self, stdout, level) - -_logger = _init_logger(path=None) -logger = FastNLPLogger(ROOT_NAME) -logger.__dict__.update(_logger.__dict__) -del _logger +logger = _init_logger(path=None) diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py index 47d4174b..4ba4b945 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -86,7 +86,7 @@ except: from ..io.model_io import ModelSaver, ModelLoader from .dataset import DataSet from .tester import Tester -from ..io import logger +from ._logger import logger try: import fitlog diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index ab86fb62..89d2eb6a 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -56,7 +56,7 @@ from .utils import _move_model_to_device from ._parallel_utils import _data_parallel_wrapper from ._parallel_utils import _model_contains_inner_module from functools import partial -from ..io import logger +from ._logger import logger __all__ = [ "Tester" diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 783997a7..16aec472 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -353,8 +353,7 @@ from .utils import 
_get_func_signature from .utils import _get_model_device from .utils import _move_model_to_device from ._parallel_utils import _model_contains_inner_module -from ..io import logger - +from ._logger import logger class Trainer(object): """ diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index a49d203d..a023c29e 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -17,7 +17,7 @@ import numpy as np import torch import torch.nn as nn from typing import List -import logging +from ._logger import logger _CheckRes = namedtuple('_CheckRes', ['missing', 'unused', 'duplicated', 'required', 'all_needed', 'varargs']) @@ -661,7 +661,7 @@ class _pseudo_tqdm: 当无法引入tqdm,或者Trainer中设置use_tqdm为false的时候,用该方法打印数据 """ def __init__(self, **kwargs): - self.logger = logging.getLogger(__name__) + self.logger = logger def write(self, info): self.logger.info(info) diff --git a/fastNLP/io/__init__.py b/fastNLP/io/__init__.py index a19428d3..a8193c9b 100644 --- a/fastNLP/io/__init__.py +++ b/fastNLP/io/__init__.py @@ -73,7 +73,6 @@ __all__ = [ 'ModelLoader', 'ModelSaver', - 'logger', ] from .embed_loader import EmbedLoader @@ -83,4 +82,3 @@ from .model_io import ModelLoader, ModelSaver from .loader import * from .pipe import * -from ._logger import * diff --git a/reproduction/text_classification/train_dpcnn.py b/reproduction/text_classification/train_dpcnn.py index 704b9f43..f3f4e231 100644 --- a/reproduction/text_classification/train_dpcnn.py +++ b/reproduction/text_classification/train_dpcnn.py @@ -15,14 +15,14 @@ from fastNLP.core.const import Const as C from fastNLP.core.vocabulary import VocabularyOption from fastNLP.core.dist_trainer import DistTrainer from utils.util_init import set_rng_seeds -from fastNLP.io import logger +from fastNLP import logger import os # os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/' # os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches' os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - # hyper logger.add_file('log', 'INFO') +print(logger.handlers) class Config(): seed = 12345 From 7a0903d9ba03a0211defe4915db1846732983ff0 Mon Sep 17 00:00:00 2001 From: yh Date: Tue, 20 Aug 2019 16:04:28 +0800 Subject: [PATCH 084/153] =?UTF-8?q?1.=E5=88=A0=E9=99=A4Trainer=E4=B8=AD?= =?UTF-8?q?=E7=9A=84prefetch=E5=8F=82=E6=95=B0;=202.=E5=A2=9E=E5=8A=A0?= =?UTF-8?q?=E4=B8=AD=E6=96=87=E5=88=86=E8=AF=8D=E7=9A=84=E4=B8=8B=E8=BD=BD?= =?UTF-8?q?;=203.=E5=A2=9E=E5=8A=A0DataBundle=E7=9A=84delete=5Fdataset,=20?= =?UTF-8?q?delete=5Fvocab?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/metrics.py | 3 ++- fastNLP/io/__init__.py | 1 + fastNLP/io/data_bundle.py | 19 +++++++++++++++++++ fastNLP/io/file_utils.py | 7 ++++++- fastNLP/modules/decoder/crf.py | 6 ++++-- 5 files changed, 32 insertions(+), 4 deletions(-) diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index 007485b2..1d1e3819 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -142,9 +142,10 @@ class MetricBase(object): 设置metric的名称,默认是Metric的class name. 
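        Example (a minimal sketch; AccuracyMetric is used here only for illustration)::

            metric = AccuracyMetric().set_metric_name('acc_on_dev')
            # Tester/Trainer then report this metric under the key 'acc_on_dev'
            # instead of the default class name 'AccuracyMetric'.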
:param str name: - :return: + :return: self """ self._metric_name = name + return self def get_metric_name(self): """ diff --git a/fastNLP/io/__init__.py b/fastNLP/io/__init__.py index a19428d3..3888e255 100644 --- a/fastNLP/io/__init__.py +++ b/fastNLP/io/__init__.py @@ -50,6 +50,7 @@ __all__ = [ "SSTPipe", "SST2Pipe", "IMDBPipe", + "Conll2003Pipe", "Conll2003NERPipe", "OntoNotesNERPipe", diff --git a/fastNLP/io/data_bundle.py b/fastNLP/io/data_bundle.py index 6bb53914..8df73d06 100644 --- a/fastNLP/io/data_bundle.py +++ b/fastNLP/io/data_bundle.py @@ -158,6 +158,16 @@ class DataBundle: """ return self.datasets[name] + def delete_dataset(self, name:str): + """ + 删除名为name的DataSet + + :param str name: + :return: self + """ + self.datasets.pop(name, None) + return self + def get_vocab(self, field_name:str)->Vocabulary: """ 获取field名为field_name对应的vocab @@ -167,6 +177,15 @@ class DataBundle: """ return self.vocabs[field_name] + def delete_vocab(self, field_name:str): + """ + 删除vocab + :param str field_name: + :return: self + """ + self.vocabs.pop(field_name, None) + return self + def set_input(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True, ignore_miss_dataset=True): """ 将field_names中的field设置为input, 对data_bundle中所有的dataset执行该操作:: diff --git a/fastNLP/io/file_utils.py b/fastNLP/io/file_utils.py index dbe94633..5af3c4ff 100644 --- a/fastNLP/io/file_utils.py +++ b/fastNLP/io/file_utils.py @@ -75,7 +75,12 @@ DATASET_DIR = { "rte": "RTE.zip", "msra-ner": "MSRA_NER.zip", "peopledaily": "peopledaily.zip", - "weibo-ner": "weibo_NER.zip" + "weibo-ner": "weibo_NER.zip", + + "cws-pku": 'cws_pku.zip', + "cws-cityu": "cws_cityu.zip", + "cws-as": 'cws_as.zip', + "cws-msra": 'cws_msra.zip' } PRETRAIN_MAP = {'elmo': PRETRAINED_ELMO_MODEL_DIR, diff --git a/fastNLP/modules/decoder/crf.py b/fastNLP/modules/decoder/crf.py index 7c496868..b7a7547f 100644 --- a/fastNLP/modules/decoder/crf.py +++ b/fastNLP/modules/decoder/crf.py @@ -7,7 +7,7 @@ import torch from torch import nn from ..utils import initial_parameter - +from ...core import Vocabulary def allowed_transitions(id2target, encoding_type='bio', include_start_end=False): """ @@ -15,7 +15,7 @@ def allowed_transitions(id2target, encoding_type='bio', include_start_end=False) 给定一个id到label的映射表,返回所有可以跳转的(from_tag_id, to_tag_id)列表。 - :param dict id2target: key是label的indices,value是str类型的tag或tag-label。value可以是只有tag的, 比如"B", "M"; 也可以是 + :param dict,Vocabulary id2target: key是label的indices,value是str类型的tag或tag-label。value可以是只有tag的, 比如"B", "M"; 也可以是 "B-NN", "M-NN", tag和label之间一定要用"-"隔开。一般可以通过Vocabulary.idx2word得到id2label。 :param str encoding_type: 支持"bio", "bmes", "bmeso", "bioes"。 :param bool include_start_end: 是否包含开始与结尾的转换。比如在bio中,b/o可以在开头,但是i不能在开头; @@ -23,6 +23,8 @@ def allowed_transitions(id2target, encoding_type='bio', include_start_end=False) start_idx=len(id2label), end_idx=len(id2label)+1。为False, 返回的结果中不含与开始结尾相关的内容 :return: List[Tuple(int, int)]], 内部的Tuple是可以进行跳转的(from_tag_id, to_tag_id)。 """ + if isinstance(id2target, Vocabulary): + id2target = id2target.idx2word num_tags = len(id2target) start_idx = num_tags end_idx = num_tags + 1 From ce083de26b6c28b28c507ea25ad499586dc032f8 Mon Sep 17 00:00:00 2001 From: yh Date: Tue, 20 Aug 2019 16:04:51 +0800 Subject: [PATCH 085/153] =?UTF-8?q?1.=E5=88=A0=E9=99=A4Trainer=E4=B8=AD?= =?UTF-8?q?=E7=9A=84prefetch=E5=8F=82=E6=95=B0;=202.=E5=A2=9E=E5=8A=A0?= =?UTF-8?q?=E4=B8=AD=E6=96=87=E5=88=86=E8=AF=8D=E7=9A=84=E4=B8=8B=E8=BD=BD?= =?UTF-8?q?;=203.=E5=A2=9E=E5=8A=A0DataBundle=E7=9A=84delete=5Fdataset,=20?= 
=?UTF-8?q?delete=5Fvocab?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/tester.py | 3 +-- fastNLP/core/trainer.py | 16 +++++++--------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index e4d67261..47959fd2 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -180,12 +180,11 @@ class Tester(object): f"`dict`, got {type(eval_result)}") metric_name = metric.get_metric_name() eval_results[metric_name] = eval_result - + pbar.close() end_time = time.time() test_str = f'Evaluate data in {round(end_time - start_time, 2)} seconds!' # pbar.write(test_str) self.logger.info(test_str) - pbar.close() except _CheckError as e: prev_func_signature = _get_func_signature(self._predict_func) _check_loss_evaluate(prev_func_signature=prev_func_signature, func_signature=e.func_signature, diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 783997a7..787ea313 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -336,7 +336,7 @@ except: import warnings from .batch import DataSetIter, BatchIter -from .callback import CallbackManager, CallbackException +from .callback import CallbackManager, CallbackException, Callback from .dataset import DataSet from .losses import _prepare_losser from .metrics import _prepare_metrics @@ -422,13 +422,8 @@ class Trainer(object): batch_size=32, sampler=None, drop_last=False, update_every=1, num_workers=0, n_epochs=10, print_every=5, dev_data=None, metrics=None, metric_key=None, - validate_every=-1, save_path=None, use_tqdm=True, device=None, prefetch=False, + validate_every=-1, save_path=None, use_tqdm=True, device=None, callbacks=None, check_code_level=0, **kwargs): - if prefetch and num_workers==0: - num_workers = 1 - if prefetch: - warnings.warn("prefetch is deprecated, will be removed in version 0.5.0, please use num_workers instead.") - super(Trainer, self).__init__() if not isinstance(model, nn.Module): raise TypeError(f"The type of model must be torch.nn.Module, got {type(model)}.") @@ -566,6 +561,9 @@ class Trainer(object): self.step = 0 self.start_time = None # start timestamp + if isinstance(callbacks, Callback): + callbacks = [callbacks] + self.callback_manager = CallbackManager(env={"trainer": self}, callbacks=callbacks) @@ -617,8 +615,8 @@ class Trainer(object): if self.dev_data is not None and self.best_dev_perf is not None: self.logger.info( - "\nIn Epoch:{}/Step:{}, got best dev performance:".format(self.best_dev_epoch, self.best_dev_step) + - self.tester._format_eval_results(self.best_dev_perf), ) + "\nIn Epoch:{}/Step:{}, got best dev performance:".format(self.best_dev_epoch, self.best_dev_step)) + self.logger.info(self.tester._format_eval_results(self.best_dev_perf)) results['best_eval'] = self.best_dev_perf results['best_epoch'] = self.best_dev_epoch results['best_step'] = self.best_dev_step From 44d569dadee621a2cd3d36225c7cb3516e32495b Mon Sep 17 00:00:00 2001 From: yunfan Date: Tue, 20 Aug 2019 16:28:12 +0800 Subject: [PATCH 086/153] [fix] logger in dist_trainer --- fastNLP/core/dist_trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastNLP/core/dist_trainer.py b/fastNLP/core/dist_trainer.py index 8ad282c9..346539cd 100644 --- a/fastNLP/core/dist_trainer.py +++ b/fastNLP/core/dist_trainer.py @@ -21,7 +21,7 @@ from .optimizer import Optimizer from .utils import _build_args from .utils import _move_dict_value_to_device from .utils import _get_func_signature -from 
..io import logger +from ._logger import logger import logging from pkg_resources import parse_version From e2232ac39f78e0d796dac844994b4045a425318c Mon Sep 17 00:00:00 2001 From: yh Date: Tue, 20 Aug 2019 21:35:12 +0800 Subject: [PATCH 087/153] =?UTF-8?q?1.=E4=BF=AE=E5=A4=8DEmbedding=E4=B8=AD?= =?UTF-8?q?=E6=BD=9C=E5=9C=A8=E7=9A=84=E5=AF=BB=E6=89=BE=E8=B7=AF=E5=BE=84?= =?UTF-8?q?=E5=A4=B1=E8=B4=A5;=202.reproduction=E4=B8=AD=E5=A2=9E=E5=8A=A0?= =?UTF-8?q?=E5=88=86=E8=AF=8D=E6=A8=A1=E5=9E=8B;=203.=E4=BF=AE=E6=94=B9ner?= =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E4=B8=BA=E6=9C=80=E6=96=B0=E7=9A=84pipe?= =?UTF-8?q?=E7=89=88=E6=9C=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/trainer.py | 10 +- fastNLP/embeddings/bert_embedding.py | 4 +- fastNLP/embeddings/elmo_embedding.py | 2 +- fastNLP/embeddings/static_embedding.py | 14 +- fastNLP/io/data_bundle.py | 52 +++- fastNLP/io/loader/cws.py | 58 +++- fastNLP/io/pipe/__init__.py | 2 + fastNLP/io/pipe/conll.py | 20 +- fastNLP/io/pipe/cws.py | 246 +++++++++++++++++ .../cws/data/CWSDataLoader.py | 249 ------------------ .../cws/data/cws_shift_pipe.py | 202 ++++++++++++++ .../cws/model/bilstm_crf_cws.py | 60 +++++ .../model/{model.py => bilstm_shift_relay.py} | 20 +- .../seqence_labelling/cws/train_bilstm_crf.py | 52 ++++ .../cws/train_shift_relay.py | 69 ++--- .../ner/model/lstm_cnn_crf.py | 9 +- .../ner/train_cnn_lstm_crf_conll2003.py | 7 +- .../seqence_labelling/ner/train_ontonote.py | 7 +- test/io/loader/test_cws_loader.py | 13 + test/io/pipe/test_cws.py | 13 + 20 files changed, 753 insertions(+), 356 deletions(-) create mode 100644 fastNLP/io/pipe/cws.py delete mode 100644 reproduction/seqence_labelling/cws/data/CWSDataLoader.py create mode 100644 reproduction/seqence_labelling/cws/data/cws_shift_pipe.py create mode 100644 reproduction/seqence_labelling/cws/model/bilstm_crf_cws.py rename reproduction/seqence_labelling/cws/model/{model.py => bilstm_shift_relay.py} (74%) create mode 100644 reproduction/seqence_labelling/cws/train_bilstm_crf.py create mode 100644 test/io/loader/test_cws_loader.py create mode 100644 test/io/pipe/test_cws.py diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 7ae34de9..2c52d104 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -690,11 +690,11 @@ class Trainer(object): (self.validate_every < 0 and self.step % len(data_iterator) == 0)) \ and self.dev_data is not None: eval_res = self._do_validation(epoch=epoch, step=self.step) - eval_str = "Evaluation on dev at Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step, - self.n_steps) + \ - self.tester._format_eval_results(eval_res) + eval_str = "Evaluation on dev at Epoch {}/{}. Step:{}/{}: ".format(epoch, self.n_epochs, self.step, + self.n_steps) # pbar.write(eval_str + '\n') - self.logger.info(eval_str + '\n') + self.logger.info(eval_str) + self.logger.info(self.tester._format_eval_results(eval_res)+'\n') # ================= mini-batch end ==================== # # lr decay; early stopping @@ -907,7 +907,7 @@ def _check_code(dataset, model, losser, metrics, forward_func, batch_size=DEFAUL info_str += '\n' else: info_str += 'There is no target field.' 
- print(info_str) + logger.info(info_str) _check_forward_error(forward_func=forward_func, dataset=dataset, batch_x=batch_x, check_level=check_level) refined_batch_x = _build_args(forward_func, **batch_x) diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index cf0b57b0..bc0d46e2 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -67,8 +67,8 @@ class BertEmbedding(ContextualEmbedding): model_url = _get_embedding_url('bert', model_dir_or_name.lower()) model_dir = cached_path(model_url, name='embedding') # 检查是否存在 - elif os.path.isdir(os.path.expanduser(os.path.abspath(model_dir_or_name))): - model_dir = os.path.expanduser(os.path.abspath(model_dir_or_name)) + elif os.path.isdir(os.path.abspath(os.path.expanduser(model_dir_or_name))): + model_dir = os.path.abspath(os.path.expanduser(model_dir_or_name)) else: raise ValueError(f"Cannot recognize {model_dir_or_name}.") diff --git a/fastNLP/embeddings/elmo_embedding.py b/fastNLP/embeddings/elmo_embedding.py index 435e0b98..24cd052e 100644 --- a/fastNLP/embeddings/elmo_embedding.py +++ b/fastNLP/embeddings/elmo_embedding.py @@ -59,7 +59,7 @@ class ElmoEmbedding(ContextualEmbedding): model_url = _get_embedding_url('elmo', model_dir_or_name.lower()) model_dir = cached_path(model_url, name='embedding') # 检查是否存在 - elif os.path.isdir(os.path.expanduser(os.path.abspath(model_dir_or_name))): + elif os.path.isdir(os.path.abspath(os.path.expanduser(model_dir_or_name))): model_dir = model_dir_or_name else: raise ValueError(f"Cannot recognize {model_dir_or_name}.") diff --git a/fastNLP/embeddings/static_embedding.py b/fastNLP/embeddings/static_embedding.py index ac9611fe..4079b2a2 100644 --- a/fastNLP/embeddings/static_embedding.py +++ b/fastNLP/embeddings/static_embedding.py @@ -70,10 +70,10 @@ class StaticEmbedding(TokenEmbedding): model_url = _get_embedding_url('static', model_dir_or_name.lower()) model_path = cached_path(model_url, name='embedding') # 检查是否存在 - elif os.path.isfile(os.path.expanduser(os.path.abspath(model_dir_or_name))): - model_path = model_dir_or_name - elif os.path.isdir(os.path.expanduser(os.path.abspath(model_dir_or_name))): - model_path = _get_file_name_base_on_postfix(model_dir_or_name, '.txt') + elif os.path.isfile(os.path.abspath(os.path.expanduser(model_dir_or_name))): + model_path = os.path.abspath(os.path.expanduser(model_dir_or_name)) + elif os.path.isdir(os.path.abspath(os.path.expanduser(model_dir_or_name))): + model_path = _get_file_name_base_on_postfix(os.path.abspath(os.path.expanduser(model_dir_or_name)), '.txt') else: raise ValueError(f"Cannot recognize {model_dir_or_name}.") @@ -94,7 +94,7 @@ class StaticEmbedding(TokenEmbedding): no_create_entry=truncated_vocab._is_word_no_create_entry(word)) # 只限制在train里面的词语使用min_freq筛选 - if kwargs.get('only_train_min_freq', False): + if kwargs.get('only_train_min_freq', False) and model_dir_or_name is not None: for word in truncated_vocab.word_count.keys(): if truncated_vocab._is_word_no_create_entry(word) and truncated_vocab.word_count[word]str: + """ + 如果你使用了该数据集,请引用以下的文章:Thomas Emerson, The Second International Chinese Word Segmentation Bakeoff, + 2005. 更多信息可以在http://sighan.cs.uchicago.edu/bakeoff2005/查看 + + :param float dev_ratio: 如果路径中没有dev集,从train划分多少作为dev的数据. 
如果为0,则不划分dev。 + :param bool re_download: 是否重新下载数据,以重新切分数据。 + :return: str + """ + if self.dataset_name is None: + return None + data_dir = self._get_dataset_path(dataset_name=self.dataset_name) + modify_time = 0 + for filepath in glob.glob(os.path.join(data_dir, '*')): + modify_time = os.stat(filepath).st_mtime + break + if time.time() - modify_time > 1 and re_download: # 通过这种比较丑陋的方式判断一下文件是否是才下载的 + shutil.rmtree(data_dir) + data_dir = self._get_dataset_path(dataset_name=self.dataset_name) + + if not os.path.exists(os.path.join(data_dir, 'dev.txt')): + if dev_ratio > 0: + assert 0 < dev_ratio < 1, "dev_ratio should be in range (0,1)." + try: + with open(os.path.join(data_dir, 'train.txt'), 'r', encoding='utf-8') as f, \ + open(os.path.join(data_dir, 'middle_file.txt'), 'w', encoding='utf-8') as f1, \ + open(os.path.join(data_dir, 'dev.txt'), 'w', encoding='utf-8') as f2: + for line in f: + if random.random() < dev_ratio: + f2.write(line) + else: + f1.write(line) + os.remove(os.path.join(data_dir, 'train.txt')) + os.renames(os.path.join(data_dir, 'middle_file.txt'), os.path.join(data_dir, 'train.txt')) + finally: + if os.path.exists(os.path.join(data_dir, 'middle_file.txt')): + os.remove(os.path.join(data_dir, 'middle_file.txt')) + + return data_dir diff --git a/fastNLP/io/pipe/__init__.py b/fastNLP/io/pipe/__init__.py index 9ffb9ed6..1907af4a 100644 --- a/fastNLP/io/pipe/__init__.py +++ b/fastNLP/io/pipe/__init__.py @@ -21,6 +21,7 @@ __all__ = [ "MsraNERPipe", "WeiboNERPipe", "PeopleDailyPipe", + "Conll2003Pipe", "MatchingBertPipe", "RTEBertPipe", @@ -41,3 +42,4 @@ from .conll import Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, WeiboNERPipe from .matching import MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, \ MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe from .pipe import Pipe +from .conll import Conll2003Pipe diff --git a/fastNLP/io/pipe/conll.py b/fastNLP/io/pipe/conll.py index d253f3be..617d1236 100644 --- a/fastNLP/io/pipe/conll.py +++ b/fastNLP/io/pipe/conll.py @@ -19,16 +19,14 @@ class _NERPipe(Pipe): :param: str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。 :param bool lower: 是否将words小写化后再建立词表,绝大多数情况都不需要设置为True。 - :param int target_pad_val: target的padding值,target这一列pad的位置值为target_pad_val。默认为0。 """ - def __init__(self, encoding_type: str = 'bio', lower: bool = False, target_pad_val=0): + def __init__(self, encoding_type: str = 'bio', lower: bool = False): if encoding_type == 'bio': self.convert_tag = iob2 else: self.convert_tag = lambda words: iob2bioes(iob2(words)) self.lower = lower - self.target_pad_val = int(target_pad_val) def process(self, data_bundle: DataBundle) -> DataBundle: """ @@ -58,7 +56,6 @@ class _NERPipe(Pipe): target_fields = [Const.TARGET, Const.INPUT_LEN] for name, dataset in data_bundle.datasets.items(): - dataset.set_pad_val(Const.TARGET, self.target_pad_val) dataset.add_seq_len(Const.INPUT) data_bundle.set_input(*input_fields) @@ -86,7 +83,6 @@ class Conll2003NERPipe(_NERPipe): :param: str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。 :param bool lower: 是否将words小写化后再建立词表,绝大多数情况都不需要设置为True。 - :param int target_pad_val: target的padding值,target这一列pad的位置值为target_pad_val。默认为0。 """ def process_from_file(self, paths) -> DataBundle: @@ -103,7 +99,7 @@ class Conll2003NERPipe(_NERPipe): class Conll2003Pipe(Pipe): - def __init__(self, chunk_encoding_type='bioes', ner_encoding_type='bioes', lower: bool = False, target_pad_val=0): + def __init__(self, chunk_encoding_type='bioes', 
ner_encoding_type='bioes', lower: bool = False): """ 经过该Pipe后,DataSet中的内容如下 @@ -119,7 +115,6 @@ class Conll2003Pipe(Pipe): :param str chunk_encoding_type: 支持bioes, bio。 :param str ner_encoding_type: 支持bioes, bio。 :param bool lower: 是否将words列小写化后再建立词表 - :param int target_pad_val: pos, ner, chunk列的padding值 """ if chunk_encoding_type == 'bio': self.chunk_convert_tag = iob2 @@ -130,7 +125,6 @@ class Conll2003Pipe(Pipe): else: self.ner_convert_tag = lambda tags: iob2bioes(iob2(tags)) self.lower = lower - self.target_pad_val = int(target_pad_val) def process(self, data_bundle)->DataBundle: """ @@ -166,9 +160,6 @@ class Conll2003Pipe(Pipe): target_fields = ['pos', 'ner', 'chunk', Const.INPUT_LEN] for name, dataset in data_bundle.datasets.items(): - dataset.set_pad_val('pos', self.target_pad_val) - dataset.set_pad_val('ner', self.target_pad_val) - dataset.set_pad_val('chunk', self.target_pad_val) dataset.add_seq_len(Const.INPUT) data_bundle.set_input(*input_fields) @@ -202,7 +193,6 @@ class OntoNotesNERPipe(_NERPipe): :param: str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。 :param bool lower: 是否将words小写化后再建立词表,绝大多数情况都不需要设置为True。 - :param int target_pad_val: target的padding值,target这一列pad的位置值为target_pad_val。默认为0。 """ def process_from_file(self, paths): @@ -220,15 +210,13 @@ class _CNNERPipe(Pipe): target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target, seq_len。 :param: str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。 - :param int target_pad_val: target的padding值,target这一列pad的位置值为target_pad_val。默认为0。 """ - def __init__(self, encoding_type: str = 'bio', target_pad_val=0): + def __init__(self, encoding_type: str = 'bio'): if encoding_type == 'bio': self.convert_tag = iob2 else: self.convert_tag = lambda words: iob2bioes(iob2(words)) - self.target_pad_val = int(target_pad_val) def process(self, data_bundle: DataBundle) -> DataBundle: """ @@ -261,7 +249,6 @@ class _CNNERPipe(Pipe): target_fields = [Const.TARGET, Const.INPUT_LEN] for name, dataset in data_bundle.datasets.items(): - dataset.set_pad_val(Const.TARGET, self.target_pad_val) dataset.add_seq_len(Const.CHAR_INPUT) data_bundle.set_input(*input_fields) @@ -324,7 +311,6 @@ class WeiboNERPipe(_CNNERPipe): target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。 :param: str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。 - :param int target_pad_val: target的padding值,target这一列pad的位置值为target_pad_val。默认为0。 """ def process_from_file(self, paths=None) -> DataBundle: data_bundle = WeiboNERLoader().load(paths) diff --git a/fastNLP/io/pipe/cws.py b/fastNLP/io/pipe/cws.py new file mode 100644 index 00000000..6ea1ae0c --- /dev/null +++ b/fastNLP/io/pipe/cws.py @@ -0,0 +1,246 @@ +from .pipe import Pipe +from .. import DataBundle +from ..loader import CWSLoader +from ... 
import Const +from itertools import chain +from .utils import _indexize +import re +def _word_lens_to_bmes(word_lens): + """ + + :param list word_lens: List[int], 每个词语的长度 + :return: List[str], BMES的序列 + """ + tags = [] + for word_len in word_lens: + if word_len==1: + tags.append('S') + else: + tags.append('B') + tags.extend(['M']*(word_len-2)) + tags.append('E') + return tags + + +def _word_lens_to_segapp(word_lens): + """ + + :param list word_lens: List[int], 每个词语的长度 + :return: List[str], BMES的序列 + """ + tags = [] + for word_len in word_lens: + if word_len==1: + tags.append('SEG') + else: + tags.extend(['APP']*(word_len-1)) + tags.append('SEG') + return tags + + +def _alpha_span_to_special_tag(span): + """ + 将span替换成特殊的字符 + + :param str span: + :return: + """ + if 'oo' == span.lower(): # speical case when represent 2OO8 + return span + if len(span) == 1: + return span + else: + return '' + + +def _find_and_replace_alpha_spans(line): + """ + 传入原始句子,替换其中的字母为特殊标记 + + :param str line:原始数据 + :return: str + """ + new_line = '' + pattern = '[a-zA-Z]+(?=[\u4e00-\u9fff ,%,.。!<-“])' + prev_end = 0 + for match in re.finditer(pattern, line): + start, end = match.span() + span = line[start:end] + new_line += line[prev_end:start] + _alpha_span_to_special_tag(span) + prev_end = end + new_line += line[prev_end:] + return new_line + + +def _digit_span_to_special_tag(span): + """ + + :param str span: 需要替换的str + :return: + """ + if span[0] == '0' and len(span) > 2: + return '' + decimal_point_count = 0 # one might have more than one decimal pointers + for idx, char in enumerate(span): + if char == '.' or char == '﹒' or char == '·': + decimal_point_count += 1 + if span[-1] == '.' or span[-1] == '﹒' or span[ + -1] == '·': # last digit being decimal point means this is not a number + if decimal_point_count == 1: + return span + else: + return '' + if decimal_point_count == 1: + return '' + elif decimal_point_count > 1: + return '' + else: + return '' + +def _find_and_replace_digit_spans(line): + # only consider words start with number, contains '.', characters. + # If ends with space, will be processed + # If ends with Chinese character, will be processed + # If ends with or contains english char, not handled. + # floats are replaced by + # otherwise unkdgt + new_line = '' + pattern = '\d[\d\\.﹒·]*(?=[\u4e00-\u9fff ,%,。!<-“])' + prev_end = 0 + for match in re.finditer(pattern, line): + start, end = match.span() + span = line[start:end] + new_line += line[prev_end:start] + _digit_span_to_special_tag(span) + prev_end = end + new_line += line[prev_end:] + return new_line + + +class CWSPipe(Pipe): + """ + 对CWS数据进行预处理, 处理之后的数据,具备以下的结构 + + .. csv-table:: + :header: "raw_words", "chars", "target", "bigrams", "trigrams", "seq_len" + + "共同 创造 美好...", "[2, 3, 4...]", "[0, 2, 0, 2,...]", "[10, 4, 1,...]","[6, 4, 1,...]", 13 + "2001年 新年 钟声...", "[8, 9, 9, 7, ...]", "[0, 1, 1, 1, 2...]", "[11, 12, ...]","[3, 9, ...]", 20 + "...", "[...]","[...]", "[...]","[...]", . + + 其中bigrams仅当bigrams列为True的时候为真 + + :param str,None dataset_name: 支持'pku', 'msra', 'cityu', 'as', None + :param str encoding_type: 可以选择'bmes', 'segapp'两种。"我 来自 复旦大学...", bmes的tag为[S, B, E, B, M, M, E...]; segapp + 的tag为[seg, app, seg, app, app, app, seg, ...] + :param bool replace_num_alpha: 是否将数字和字母用特殊字符替换。 + :param bool bigrams: 是否增加一列bigram. bigram的构成是['复', '旦', '大', '学', ...]->["复旦", "旦大", ...] + :param bool trigrams: 是否增加一列trigram. trigram的构成是 ['复', '旦', '大', '学', ...]->["复旦大", "旦大学", ...] 
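    A minimal usage sketch (using the built-in 'pku' dataset name, in which case no paths are passed
    and the underlying CWSLoader is expected to fetch the data itself)::

        from fastNLP.io.pipe.cws import CWSPipe

        pipe = CWSPipe(dataset_name='pku', encoding_type='bmes', bigrams=True)
        data_bundle = pipe.process_from_file()    # paths must stay None when dataset_name is set
        print(data_bundle.get_dataset('train'))   # chars, bigrams, target, seq_len
        print(data_bundle.get_vocab('target'))    # B/M/E/S tag vocabulary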
+ """ + def __init__(self, dataset_name=None, encoding_type='bmes', replace_num_alpha=True, bigrams=False, trigrams=False): + if encoding_type=='bmes': + self.word_lens_to_tags = _word_lens_to_bmes + else: + self.word_lens_to_tags = _word_lens_to_segapp + + self.dataset_name = dataset_name + self.bigrams = bigrams + self.trigrams = trigrams + self.replace_num_alpha = replace_num_alpha + + def _tokenize(self, data_bundle): + """ + 将data_bundle中的'chars'列切分成一个一个的word. + 例如输入是"共同 创造 美好.."->[[共, 同], [创, 造], [...], ] + + :param data_bundle: + :return: + """ + def split_word_into_chars(raw_chars): + words = raw_chars.split() + chars = [] + for word in words: + char = [] + subchar = [] + for c in word: + if c=='<': + subchar.append(c) + continue + if c=='>' and subchar[0]=='<': + char.append(''.join(subchar)) + subchar = [] + if subchar: + subchar.append(c) + else: + char.append(c) + char.extend(subchar) + chars.append(char) + return chars + + for name, dataset in data_bundle.datasets.items(): + dataset.apply_field(split_word_into_chars, field_name=Const.CHAR_INPUT, + new_field_name=Const.CHAR_INPUT) + return data_bundle + + def process(self, data_bundle: DataBundle) -> DataBundle: + """ + 可以处理的DataSet需要包含raw_words列 + + .. csv-table:: + :header: "raw_words" + + "上海 浦东 开发 与 法制 建设 同步" + "新华社 上海 二月 十日 电 ( 记者 谢金虎 、 张持坚 )" + "..." + + :param data_bundle: + :return: + """ + data_bundle.copy_field(Const.RAW_WORD, Const.CHAR_INPUT) + + if self.replace_num_alpha: + data_bundle.apply_field(_find_and_replace_alpha_spans, Const.CHAR_INPUT, Const.CHAR_INPUT) + data_bundle.apply_field(_find_and_replace_digit_spans, Const.CHAR_INPUT, Const.CHAR_INPUT) + + self._tokenize(data_bundle) + + for name, dataset in data_bundle.datasets.items(): + dataset.apply_field(lambda chars:self.word_lens_to_tags(map(len, chars)), field_name=Const.CHAR_INPUT, + new_field_name=Const.TARGET) + dataset.apply_field(lambda chars:list(chain(*chars)), field_name=Const.CHAR_INPUT, + new_field_name=Const.CHAR_INPUT) + input_field_names = [Const.CHAR_INPUT] + if self.bigrams: + for name, dataset in data_bundle.datasets.items(): + dataset.apply_field(lambda chars: [c1+c2 for c1, c2 in zip(chars, chars[1:]+[''])], + field_name=Const.CHAR_INPUT, new_field_name='bigrams') + input_field_names.append('bigrams') + if self.trigrams: + for name, dataset in data_bundle.datasets.items(): + dataset.apply_field(lambda chars: [c1+c2+c3 for c1, c2, c3 in zip(chars, chars[1:]+[''], chars[2:]+['']*2)], + field_name=Const.CHAR_INPUT, new_field_name='trigrams') + input_field_names.append('trigrams') + + _indexize(data_bundle, input_field_names, Const.TARGET) + + input_fields = [Const.TARGET, Const.INPUT_LEN] + input_field_names + target_fields = [Const.TARGET, Const.INPUT_LEN] + for name, dataset in data_bundle.datasets.items(): + dataset.add_seq_len(Const.CHAR_INPUT) + + data_bundle.set_input(*input_fields) + data_bundle.set_target(*target_fields) + + return data_bundle + + def process_from_file(self, paths=None) -> DataBundle: + """ + + :param str paths: + :return: + """ + if self.dataset_name is None and paths is None: + raise RuntimeError("You have to set `paths` when calling process_from_file() or `dataset_name `when initialization.") + if self.dataset_name is not None and paths is not None: + raise RuntimeError("You cannot specify `paths` and `dataset_name` simultaneously") + data_bundle = CWSLoader(self.dataset_name).load(paths) + return self.process(data_bundle) \ No newline at end of file diff --git 
a/reproduction/seqence_labelling/cws/data/CWSDataLoader.py b/reproduction/seqence_labelling/cws/data/CWSDataLoader.py deleted file mode 100644 index 5f69c0ad..00000000 --- a/reproduction/seqence_labelling/cws/data/CWSDataLoader.py +++ /dev/null @@ -1,249 +0,0 @@ - -from fastNLP.io.embed_loader import EmbeddingOption, EmbedLoader -from fastNLP.core.vocabulary import VocabularyOption -from fastNLP.io.data_bundle import DataSetLoader, DataBundle -from typing import Union, Dict, List, Iterator -from fastNLP import DataSet -from fastNLP import Instance -from fastNLP import Vocabulary -from fastNLP import Const -from reproduction.utils import check_dataloader_paths -from functools import partial - -class SigHanLoader(DataSetLoader): - """ - 任务相关的说明可以在这里找到http://sighan.cs.uchicago.edu/ - 支持的数据格式为,一行一句,不同的word用空格隔开。如下例 - - 共同 创造 美好 的 新 世纪 —— 二○○一年 新年 - 女士 们 , 先生 们 , 同志 们 , 朋友 们 : - - 读取sighan中的数据集,返回的DataSet将包含以下的内容fields: - raw_chars: list(str), 每个元素是一个汉字 - chars: list(str), 每个元素是一个index(汉字对应的index) - target: list(int), 根据不同的encoding_type会有不同的变化 - - :param target_type: target的类型,当前支持以下的两种: "bmes", "shift_relay" - """ - - def __init__(self, target_type:str): - super().__init__() - - if target_type.lower() not in ('bmes', 'shift_relay'): - raise ValueError("target_type only supports 'bmes', 'shift_relay'.") - - self.target_type = target_type - if target_type=='bmes': - self._word_len_to_target = self._word_len_to_bems - elif target_type=='shift_relay': - self._word_len_to_target = self._word_lens_to_relay - - @staticmethod - def _word_lens_to_relay(word_lens: Iterator[int]): - """ - [1, 2, 3, ..] 转换为[0, 1, 0, 2, 1, 0,](start指示seg有多长); - :param word_lens: - :return: {'target': , 'end_seg_mask':, 'start_seg_mask':} - """ - tags = [] - end_seg_mask = [] - start_seg_mask = [] - for word_len in word_lens: - tags.extend([idx for idx in range(word_len - 1, -1, -1)]) - end_seg_mask.extend([0] * (word_len - 1) + [1]) - start_seg_mask.extend([1] + [0] * (word_len - 1)) - return {'target': tags, 'end_seg_mask': end_seg_mask, 'start_seg_mask': start_seg_mask} - - @staticmethod - def _word_len_to_bems(word_lens:Iterator[int])->Dict[str, List[str]]: - """ - - :param word_lens: 每个word的长度 - :return: - """ - tags = [] - for word_len in word_lens: - if word_len==1: - tags.append('S') - else: - tags.append('B') - for _ in range(word_len-2): - tags.append('M') - tags.append('E') - return {'target':tags} - - @staticmethod - def _gen_bigram(chars:List[str])->List[str]: - """ - - :param chars: - :return: - """ - return [c1+c2 for c1, c2 in zip(chars, chars[1:]+[''])] - - def load(self, path:str, bigram:bool=False)->DataSet: - """ - :param path: str - :param bigram: 是否使用bigram feature - :return: - """ - dataset = DataSet() - with open(path, 'r', encoding='utf-8') as f: - for line in f: - line = line.strip() - if not line: # 去掉空行 - continue - parts = line.split() - word_lens = map(len, parts) - chars = list(''.join(parts)) - tags = self._word_len_to_target(word_lens) - assert len(chars)==len(tags['target']) - dataset.append(Instance(raw_chars=chars, **tags, seq_len=len(chars))) - if len(dataset)==0: - raise RuntimeError(f"{path} has no valid data.") - if bigram: - dataset.apply_field(self._gen_bigram, field_name='raw_chars', new_field_name='bigrams') - return dataset - - def process(self, paths: Union[str, Dict[str, str]], char_vocab_opt:VocabularyOption=None, - char_embed_opt:EmbeddingOption=None, bigram_vocab_opt:VocabularyOption=None, - bigram_embed_opt:EmbeddingOption=None, L:int=4): - """ - 
支持的数据格式为一行一个sample,并且用空格隔开不同的词语。例如 - - Option:: - - 共同 创造 美好 的 新 世纪 —— 二○○一年 新年 贺词 - ( 二○○○年 十二月 三十一日 ) ( 附 图片 1 张 ) - 女士 们 , 先生 们 , 同志 们 , 朋友 们 : - - paths支持两种格式,第一种是str,第二种是Dict[str, str]. - - Option:: - - # 1. str类型 - # 1.1 传入具体的文件路径 - data = SigHanLoader('bmes').process('/path/to/cws/data.txt') # 将读取data.txt的内容 - # 包含以下的内容data.vocabs['chars']:Vocabulary对象, - # data.vocabs['target']: Vocabulary对象,根据encoding_type可能会没有该值 - # data.embeddings['chars']: Embedding对象. 只有提供了预训练的词向量的路径才有该项 - # data.datasets['train']: DataSet对象 - # 包含的field有: - # raw_chars: list[str], 每个元素是一个汉字 - # chars: list[int], 每个元素是汉字对应的index - # target: list[int], 根据encoding_type有对应的变化 - # 1.2 传入一个目录, 里面必须包含train.txt文件 - data = SigHanLoader('bmes').process('path/to/cws/') #将尝试在该目录下读取 train.txt, test.txt以及dev.txt - # 包含以下的内容data.vocabs['chars']: Vocabulary对象 - # data.vocabs['target']:Vocabulary对象 - # data.embeddings['chars']: 仅在提供了预训练embedding路径的情况下,为Embedding对象; - # data.datasets['train']: DataSet对象 - # 包含的field有: - # raw_chars: list[str], 每个元素是一个汉字 - # chars: list[int], 每个元素是汉字对应的index - # target: list[int], 根据encoding_type有对应的变化 - # data.datasets['dev']: DataSet对象,如果文件夹下包含了dev.txt;内容与data.datasets['train']一样 - - # 2. dict类型, key是文件的名称,value是对应的读取路径. 必须包含'train'这个key - paths = {'train': '/path/to/train/train.txt', 'test':'/path/to/test/test.txt', 'dev':'/path/to/dev/dev.txt'} - data = SigHanLoader(paths).process(paths) - # 结果与传入目录时是一致的,但是可以传入多个数据集。data.datasets中的key将与这里传入的一致 - - :param paths: 支持传入目录,文件路径,以及dict。 - :param char_vocab_opt: 用于构建chars的vocabulary参数,默认为min_freq=2 - :param char_embed_opt: 用于读取chars的Embedding的参数,默认不读取pretrained的embedding - :param bigram_vocab_opt: 用于构建bigram的vocabulary参数,默认不使用bigram, 仅在指定该参数的情况下会带有bigrams这个field。 - 为List[int], 每个instance长度与chars一样, abcde的bigram为ab bc cd de e - :param bigram_embed_opt: 用于读取预训练bigram的参数,仅在传入bigram_vocab_opt有效 - :param L: 当target_type为shift_relay时传入的segment长度 - :return: - """ - # 推荐大家使用这个check_data_loader_paths进行paths的验证 - paths = check_dataloader_paths(paths) - datasets = {} - data = DataBundle() - bigram = bigram_vocab_opt is not None - for name, path in paths.items(): - dataset = self.load(path, bigram=bigram) - datasets[name] = dataset - input_fields = [] - target_fields = [] - # 创建vocab - char_vocab = Vocabulary(min_freq=2) if char_vocab_opt is None else Vocabulary(**char_vocab_opt) - char_vocab.from_dataset(datasets['train'], field_name='raw_chars') - char_vocab.index_dataset(*datasets.values(), field_name='raw_chars', new_field_name='chars') - data.vocabs[Const.CHAR_INPUT] = char_vocab - input_fields.extend([Const.CHAR_INPUT, Const.INPUT_LEN, Const.TARGET]) - target_fields.append(Const.TARGET) - # 创建target - if self.target_type == 'bmes': - target_vocab = Vocabulary(unknown=None, padding=None) - target_vocab.add_word_lst(['B']*4+['M']*3+['E']*2+['S']) - target_vocab.index_dataset(*datasets.values(), field_name='target') - data.vocabs[Const.TARGET] = target_vocab - if char_embed_opt is not None: - char_embed = EmbedLoader.load_with_vocab(**char_embed_opt, vocab=char_vocab) - data.embeddings['chars'] = char_embed - if bigram: - bigram_vocab = Vocabulary(**bigram_vocab_opt) - bigram_vocab.from_dataset(datasets['train'], field_name='bigrams') - bigram_vocab.index_dataset(*datasets.values(), field_name='bigrams') - data.vocabs['bigrams'] = bigram_vocab - if bigram_embed_opt is not None: - bigram_embed = EmbedLoader.load_with_vocab(**bigram_embed_opt, vocab=bigram_vocab) - data.embeddings['bigrams'] = bigram_embed - input_fields.append('bigrams') - if 
self.target_type == 'shift_relay': - func = partial(self._clip_target, L=L) - for name, dataset in datasets.items(): - res = dataset.apply_field(func, field_name='target') - relay_target = [res_i[0] for res_i in res] - relay_mask = [res_i[1] for res_i in res] - dataset.add_field('relay_target', relay_target, is_input=True, is_target=False, ignore_type=False) - dataset.add_field('relay_mask', relay_mask, is_input=True, is_target=False, ignore_type=False) - if self.target_type == 'shift_relay': - input_fields.extend(['end_seg_mask']) - target_fields.append('start_seg_mask') - # 将dataset加入DataInfo - for name, dataset in datasets.items(): - dataset.set_input(*input_fields) - dataset.set_target(*target_fields) - data.datasets[name] = dataset - - return data - - @staticmethod - def _clip_target(target:List[int], L:int): - """ - - 只有在target_type为shift_relay的使用 - :param target: List[int] - :param L: - :return: - """ - relay_target_i = [] - tmp = [] - for j in range(len(target) - 1): - tmp.append(target[j]) - if target[j] > target[j + 1]: - pass - else: - relay_target_i.extend([L - 1 if t >= L else t for t in tmp[::-1]]) - tmp = [] - # 处理未结束的部分 - if len(tmp) == 0: - relay_target_i.append(0) - else: - tmp.append(target[-1]) - relay_target_i.extend([L - 1 if t >= L else t for t in tmp[::-1]]) - relay_mask_i = [] - j = 0 - while j < len(target): - seg_len = target[j] + 1 - if target[j] < L: - relay_mask_i.extend([0] * (seg_len)) - else: - relay_mask_i.extend([1] * (seg_len - L) + [0] * L) - j = seg_len + j - return relay_target_i, relay_mask_i - diff --git a/reproduction/seqence_labelling/cws/data/cws_shift_pipe.py b/reproduction/seqence_labelling/cws/data/cws_shift_pipe.py new file mode 100644 index 00000000..0ae4064d --- /dev/null +++ b/reproduction/seqence_labelling/cws/data/cws_shift_pipe.py @@ -0,0 +1,202 @@ +from fastNLP.io.pipe import Pipe +from fastNLP.io import DataBundle +from fastNLP.io.loader import CWSLoader +from fastNLP import Const +from itertools import chain +from fastNLP.io.pipe.utils import _indexize +from functools import partial +from fastNLP.io.pipe.cws import _find_and_replace_alpha_spans, _find_and_replace_digit_spans + + +def _word_lens_to_relay(word_lens): + """ + [1, 2, 3, ..] 转换为[0, 1, 0, 2, 1, 0,](start指示seg有多长); + :param word_lens: + :return: + """ + tags = [] + for word_len in word_lens: + tags.extend([idx for idx in range(word_len - 1, -1, -1)]) + return tags + +def _word_lens_to_end_seg_mask(word_lens): + """ + [1, 2, 3, ..] 转换为[0, 1, 0, 2, 1, 0,](start指示seg有多长); + :param word_lens: + :return: + """ + end_seg_mask = [] + for word_len in word_lens: + end_seg_mask.extend([0] * (word_len - 1) + [1]) + return end_seg_mask + +def _word_lens_to_start_seg_mask(word_lens): + """ + [1, 2, 3, ..] 转换为[0, 1, 0, 2, 1, 0,](start指示seg有多长); + :param word_lens: + :return: + """ + start_seg_mask = [] + for word_len in word_lens: + start_seg_mask.extend([1] + [0] * (word_len - 1)) + return start_seg_mask + + +class CWSShiftRelayPipe(Pipe): + """ + + :param str,None dataset_name: 支持'pku', 'msra', 'cityu', 'as', None + :param int L: ShiftRelay模型的超参数 + :param bool replace_num_alpha: 是否将数字和字母用特殊字符替换。 + :param bool bigrams: 是否增加一列bigram. bigram的构成是['复', '旦', '大', '学', ...]->["复旦", "旦大", ...] + :param bool trigrams: 是否增加一列trigram. trigram的构成是 ['复', '旦', '大', '学', ...]->["复旦大", "旦大学", ...] 
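        A quick illustration (sketch only) of what the three helpers above produce for word lengths
        [1, 2, 3], i.e. a 1-char, a 2-char and a 3-char word::

            _word_lens_to_relay([1, 2, 3])           # -> [0, 1, 0, 2, 1, 0]  chars left until the word ends
            _word_lens_to_end_seg_mask([1, 2, 3])    # -> [1, 0, 1, 0, 0, 1]  1 on the last char of each word
            _word_lens_to_start_seg_mask([1, 2, 3])  # -> [1, 1, 0, 1, 0, 0]  1 on the first char of each word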
+ """ + def __init__(self, dataset_name=None, L=5, replace_num_alpha=True, bigrams=True): + self.dataset_name = dataset_name + self.bigrams = bigrams + self.replace_num_alpha = replace_num_alpha + self.L = L + + def _tokenize(self, data_bundle): + """ + 将data_bundle中的'chars'列切分成一个一个的word. + 例如输入是"共同 创造 美好.."->[[共, 同], [创, 造], [...], ] + + :param data_bundle: + :return: + """ + def split_word_into_chars(raw_chars): + words = raw_chars.split() + chars = [] + for word in words: + char = [] + subchar = [] + for c in word: + if c=='<': + subchar.append(c) + continue + if c=='>' and subchar[0]=='<': + char.append(''.join(subchar)) + subchar = [] + if subchar: + subchar.append(c) + else: + char.append(c) + char.extend(subchar) + chars.append(char) + return chars + + for name, dataset in data_bundle.datasets.items(): + dataset.apply_field(split_word_into_chars, field_name=Const.CHAR_INPUT, + new_field_name=Const.CHAR_INPUT) + return data_bundle + + def process(self, data_bundle: DataBundle) -> DataBundle: + """ + 可以处理的DataSet需要包含raw_words列 + + .. csv-table:: + :header: "raw_words" + + "上海 浦东 开发 与 法制 建设 同步" + "新华社 上海 二月 十日 电 ( 记者 谢金虎 、 张持坚 )" + "..." + + :param data_bundle: + :return: + """ + data_bundle.copy_field(Const.RAW_WORD, Const.CHAR_INPUT) + + if self.replace_num_alpha: + data_bundle.apply_field(_find_and_replace_alpha_spans, Const.CHAR_INPUT, Const.CHAR_INPUT) + data_bundle.apply_field(_find_and_replace_digit_spans, Const.CHAR_INPUT, Const.CHAR_INPUT) + + self._tokenize(data_bundle) + input_field_names = [Const.CHAR_INPUT] + target_field_names = [] + + for name, dataset in data_bundle.datasets.items(): + dataset.apply_field(lambda chars:_word_lens_to_relay(map(len, chars)), field_name=Const.CHAR_INPUT, + new_field_name=Const.TARGET) + dataset.apply_field(lambda chars:_word_lens_to_start_seg_mask(map(len, chars)), field_name=Const.CHAR_INPUT, + new_field_name='start_seg_mask') + dataset.apply_field(lambda chars:_word_lens_to_end_seg_mask(map(len, chars)), field_name=Const.CHAR_INPUT, + new_field_name='end_seg_mask') + dataset.apply_field(lambda chars:list(chain(*chars)), field_name=Const.CHAR_INPUT, + new_field_name=Const.CHAR_INPUT) + target_field_names.append('start_seg_mask') + input_field_names.append('end_seg_mask') + if self.bigrams: + for name, dataset in data_bundle.datasets.items(): + dataset.apply_field(lambda chars: [c1+c2 for c1, c2 in zip(chars, chars[1:]+[''])], + field_name=Const.CHAR_INPUT, new_field_name='bigrams') + input_field_names.append('bigrams') + + _indexize(data_bundle, ['chars', 'bigrams'], []) + + func = partial(_clip_target, L=self.L) + for name, dataset in data_bundle.datasets.items(): + res = dataset.apply_field(func, field_name='target') + relay_target = [res_i[0] for res_i in res] + relay_mask = [res_i[1] for res_i in res] + dataset.add_field('relay_target', relay_target, is_input=True, is_target=False, ignore_type=False) + dataset.add_field('relay_mask', relay_mask, is_input=True, is_target=False, ignore_type=False) + input_field_names.append('relay_target') + input_field_names.append('relay_mask') + + input_fields = [Const.TARGET, Const.INPUT_LEN] + input_field_names + target_fields = [Const.TARGET, Const.INPUT_LEN] + target_field_names + for name, dataset in data_bundle.datasets.items(): + dataset.add_seq_len(Const.CHAR_INPUT) + + data_bundle.set_input(*input_fields) + data_bundle.set_target(*target_fields) + + return data_bundle + + def process_from_file(self, paths=None) -> DataBundle: + """ + + :param str paths: + :return: + """ + if self.dataset_name 
is None and paths is None: + raise RuntimeError("You have to set `paths` when calling process_from_file() or `dataset_name `when initialization.") + if self.dataset_name is not None and paths is not None: + raise RuntimeError("You cannot specify `paths` and `dataset_name` simultaneously") + data_bundle = CWSLoader(self.dataset_name).load(paths) + return self.process(data_bundle) + +def _clip_target(target, L:int): + """ + + 只有在target_type为shift_relay的使用 + :param target: List[int] + :param L: + :return: + """ + relay_target_i = [] + tmp = [] + for j in range(len(target) - 1): + tmp.append(target[j]) + if target[j] > target[j + 1]: + pass + else: + relay_target_i.extend([L - 1 if t >= L else t for t in tmp[::-1]]) + tmp = [] + # 处理未结束的部分 + if len(tmp) == 0: + relay_target_i.append(0) + else: + tmp.append(target[-1]) + relay_target_i.extend([L - 1 if t >= L else t for t in tmp[::-1]]) + relay_mask_i = [] + j = 0 + while j < len(target): + seg_len = target[j] + 1 + if target[j] < L: + relay_mask_i.extend([0] * (seg_len)) + else: + relay_mask_i.extend([1] * (seg_len - L) + [0] * L) + j = seg_len + j + return relay_target_i, relay_mask_i diff --git a/reproduction/seqence_labelling/cws/model/bilstm_crf_cws.py b/reproduction/seqence_labelling/cws/model/bilstm_crf_cws.py new file mode 100644 index 00000000..4f87a81c --- /dev/null +++ b/reproduction/seqence_labelling/cws/model/bilstm_crf_cws.py @@ -0,0 +1,60 @@ + +import torch +from fastNLP.modules import LSTM +from fastNLP.modules import allowed_transitions, ConditionalRandomField +from fastNLP import seq_len_to_mask +from torch import nn +from fastNLP import Const +import torch.nn.functional as F + +class BiLSTMCRF(nn.Module): + def __init__(self, char_embed, hidden_size, num_layers, target_vocab=None, bigram_embed=None, trigram_embed=None, + dropout=0.5): + super().__init__() + + embed_size = char_embed.embed_size + self.char_embed = char_embed + if bigram_embed: + embed_size += bigram_embed.embed_size + self.bigram_embed = bigram_embed + if trigram_embed: + embed_size += trigram_embed.embed_size + self.trigram_embed = trigram_embed + + self.lstm = LSTM(embed_size, hidden_size=hidden_size//2, bidirectional=True, batch_first=True, + num_layers=num_layers) + self.dropout = nn.Dropout(p=dropout) + self.fc = nn.Linear(hidden_size, len(target_vocab)) + + transitions = None + if target_vocab: + transitions = allowed_transitions(target_vocab, include_start_end=True, encoding_type='bmes') + + self.crf = ConditionalRandomField(num_tags=len(target_vocab), allowed_transitions=transitions) + + def _forward(self, chars, bigrams, trigrams, seq_len, target=None): + chars = self.char_embed(chars) + if bigrams is not None: + bigrams = self.bigram_embed(bigrams) + chars = torch.cat([chars, bigrams], dim=-1) + if trigrams is not None: + trigrams = self.trigram_embed(trigrams) + chars = torch.cat([chars, trigrams], dim=-1) + + output, _ = self.lstm(chars, seq_len) + output = self.dropout(output) + output = self.fc(output) + output = F.log_softmax(output, dim=-1) + mask = seq_len_to_mask(seq_len) + if target is None: + pred, _ = self.crf.viterbi_decode(output, mask) + return {Const.OUTPUT:pred} + else: + loss = self.crf.forward(output, tags=target, mask=mask) + return {Const.LOSS:loss} + + def forward(self, chars, seq_len, target, bigrams=None, trigrams=None): + return self._forward(chars, bigrams, trigrams, seq_len, target) + + def predict(self, chars, seq_len, bigrams=None, trigrams=None): + return self._forward(chars, bigrams, trigrams, seq_len) \ No newline at 
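How the BiLSTMCRF above is driven (a sketch, not part of the patch; chars/bigrams/seq_len/target
stand for the batches produced by CWSPipe, as in train_bilstm_crf.py below):

    # training: with target given, forward() scores the gold tags through the CRF
    loss_dict = model(chars, seq_len, target, bigrams=bigrams)    # -> {Const.LOSS: per-sample CRF loss}
    loss_dict[Const.LOSS].mean().backward()                       # the CRF loss is not reduced by the model

    # inference: predict() runs viterbi decoding over the masked emissions
    pred_dict = model.predict(chars, seq_len, bigrams=bigrams)    # -> {Const.OUTPUT: decoded tag indices}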
end of file diff --git a/reproduction/seqence_labelling/cws/model/model.py b/reproduction/seqence_labelling/cws/model/bilstm_shift_relay.py similarity index 74% rename from reproduction/seqence_labelling/cws/model/model.py rename to reproduction/seqence_labelling/cws/model/bilstm_shift_relay.py index de945ac3..4ce1cc51 100644 --- a/reproduction/seqence_labelling/cws/model/model.py +++ b/reproduction/seqence_labelling/cws/model/bilstm_shift_relay.py @@ -1,7 +1,5 @@ from torch import nn import torch -from fastNLP.embeddings import Embedding -import numpy as np from reproduction.seqence_labelling.cws.model.module import FeatureFunMax, SemiCRFShiftRelay from fastNLP.modules import LSTM @@ -21,25 +19,21 @@ class ShiftRelayCWSModel(nn.Module): :param num_bigram_per_char: 每个character对应的bigram的数量 :param drop_p: Dropout的大小 """ - def __init__(self, char_embed:Embedding, bigram_embed:Embedding, hidden_size:int=400, num_layers:int=1, - L:int=6, num_bigram_per_char:int=1, drop_p:float=0.2): + def __init__(self, char_embed, bigram_embed, hidden_size:int=400, num_layers:int=1, L:int=6, drop_p:float=0.2): super().__init__() - self.char_embedding = Embedding(char_embed, dropout=drop_p) - self._pretrained_embed = False - if isinstance(char_embed, np.ndarray): - self._pretrained_embed = True - self.bigram_embedding = Embedding(bigram_embed, dropout=drop_p) - self.lstm = LSTM(100 * (num_bigram_per_char + 1), hidden_size // 2, num_layers=num_layers, bidirectional=True, + self.char_embedding = char_embed + self.bigram_embedding = bigram_embed + self.lstm = LSTM(char_embed.embed_size+bigram_embed.embed_size, hidden_size // 2, num_layers=num_layers, + bidirectional=True, batch_first=True) self.feature_fn = FeatureFunMax(hidden_size, L) self.semi_crf_relay = SemiCRFShiftRelay(L) self.feat_drop = nn.Dropout(drop_p) self.reset_param() - # self.feature_fn.reset_parameters() def reset_param(self): for name, param in self.named_parameters(): - if 'embedding' in name and self._pretrained_embed: + if 'embedding' in name: continue if 'bias_hh' in name: nn.init.constant_(param, 0) @@ -51,10 +45,8 @@ class ShiftRelayCWSModel(nn.Module): nn.init.xavier_uniform_(param) def get_feats(self, chars, bigrams, seq_len): - batch_size, max_len = chars.size() chars = self.char_embedding(chars) bigrams = self.bigram_embedding(bigrams) - bigrams = bigrams.view(bigrams.size(0), max_len, -1) chars = torch.cat([chars, bigrams], dim=-1) feats, _ = self.lstm(chars, seq_len) feats = self.feat_drop(feats) diff --git a/reproduction/seqence_labelling/cws/train_bilstm_crf.py b/reproduction/seqence_labelling/cws/train_bilstm_crf.py new file mode 100644 index 00000000..b9a77249 --- /dev/null +++ b/reproduction/seqence_labelling/cws/train_bilstm_crf.py @@ -0,0 +1,52 @@ +import sys +sys.path.append('../../..') + +from fastNLP.io.pipe.cws import CWSPipe +from reproduction.seqence_labelling.cws.model.bilstm_crf_cws import BiLSTMCRF +from fastNLP import Trainer, cache_results +from fastNLP.embeddings import StaticEmbedding +from fastNLP import EvaluateCallback, BucketSampler, SpanFPreRecMetric, GradientClipCallback +from torch.optim import Adagrad + +###########hyper +dataname = 'pku' +hidden_size = 400 +num_layers = 1 +lr = 0.05 +###########hyper + + +@cache_results('{}.pkl'.format(dataname), _refresh=False) +def get_data(): + data_bundle = CWSPipe(dataset_name=dataname, bigrams=True, trigrams=False).process_from_file() + char_embed = StaticEmbedding(data_bundle.get_vocab('chars'), dropout=0.33, word_dropout=0.01, + 
model_dir_or_name='~/exps/CWS/pretrain/vectors/1grams_t3_m50_corpus.txt') + bigram_embed = StaticEmbedding(data_bundle.get_vocab('bigrams'), dropout=0.33,min_freq=3, word_dropout=0.01, + model_dir_or_name='~/exps/CWS/pretrain/vectors/2grams_t3_m50_corpus.txt') + return data_bundle, char_embed, bigram_embed + +data_bundle, char_embed, bigram_embed = get_data() +print(data_bundle) + +model = BiLSTMCRF(char_embed, hidden_size, num_layers, target_vocab=data_bundle.get_vocab('target'), bigram_embed=bigram_embed, + trigram_embed=None, dropout=0.3) +model.cuda() + +callbacks = [] +callbacks.append(EvaluateCallback(data_bundle.get_dataset('test'))) +callbacks.append(GradientClipCallback(clip_type='value', clip_value=5)) +optimizer = Adagrad(model.parameters(), lr=lr) + +metrics = [] +metric1 = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'), encoding_type='bmes') +metrics.append(metric1) + +trainer = Trainer(data_bundle.get_dataset('train'), model, optimizer=optimizer, loss=None, + batch_size=128, sampler=BucketSampler(), update_every=1, + num_workers=1, n_epochs=10, print_every=5, + dev_data=data_bundle.get_dataset('dev'), + metrics=metrics, + metric_key=None, + validate_every=-1, save_path=None, use_tqdm=True, device=0, + callbacks=callbacks, check_code_level=0, dev_batch_size=128) +trainer.train() diff --git a/reproduction/seqence_labelling/cws/train_shift_relay.py b/reproduction/seqence_labelling/cws/train_shift_relay.py index 55576575..322f42bb 100644 --- a/reproduction/seqence_labelling/cws/train_shift_relay.py +++ b/reproduction/seqence_labelling/cws/train_shift_relay.py @@ -1,64 +1,53 @@ -import os +import sys +sys.path.append('../../..') from fastNLP import cache_results -from reproduction.seqence_labelling.cws.data.CWSDataLoader import SigHanLoader -from reproduction.seqence_labelling.cws.model.model import ShiftRelayCWSModel -from fastNLP.io.embed_loader import EmbeddingOption -from fastNLP.core.vocabulary import VocabularyOption +from reproduction.seqence_labelling.cws.data.cws_shift_pipe import CWSShiftRelayPipe +from reproduction.seqence_labelling.cws.model.bilstm_shift_relay import ShiftRelayCWSModel from fastNLP import Trainer from torch.optim import Adam from fastNLP import BucketSampler from fastNLP import GradientClipCallback from reproduction.seqence_labelling.cws.model.metric import RelayMetric - - -# 借助一下fastNLP的自动缓存机制,但是只能缓存4G以下的结果 -@cache_results(None) -def prepare_data(): - data = SigHanLoader(target_type='shift_relay').process(file_dir, char_embed_opt=char_embed_opt, - bigram_vocab_opt=bigram_vocab_opt, - bigram_embed_opt=bigram_embed_opt, - L=L) - return data +from fastNLP.embeddings import StaticEmbedding +from fastNLP import EvaluateCallback #########hyper L = 4 hidden_size = 200 num_layers = 1 drop_p = 0.2 -lr = 0.02 - +lr = 0.008 +data_name = 'pku' #########hyper device = 0 -# !!!!这里千万不要放完全路径,因为这样会暴露你们在服务器上的用户名,比较危险。所以一定要使用相对路径,最好把数据放到 -# 你们的reproduction路径下,然后设置.gitignore -file_dir = '/path/to/' -char_embed_path = '/pretrain/vectors/1grams_t3_m50_corpus.txt' -bigram_embed_path = '/pretrain/vectors/2grams_t3_m50_corpus.txt' -bigram_vocab_opt = VocabularyOption(min_freq=3) -char_embed_opt = EmbeddingOption(embed_filepath=char_embed_path) -bigram_embed_opt = EmbeddingOption(embed_filepath=bigram_embed_path) - -data_name = os.path.basename(file_dir) cache_fp = 'caches/{}.pkl'.format(data_name) +@cache_results(_cache_fp=cache_fp, _refresh=True) # 将结果缓存到cache_fp中,这样下次运行就直接读取,而不需要再次运行 +def prepare_data(): + data_bundle = CWSShiftRelayPipe(dataset_name=data_name, 
L=L).process_from_file() + # 预训练的character embedding和bigram embedding + char_embed = StaticEmbedding(data_bundle.get_vocab('chars'), dropout=0.5, word_dropout=0.01, + model_dir_or_name='~/exps/CWS/pretrain/vectors/1grams_t3_m50_corpus.txt') + bigram_embed = StaticEmbedding(data_bundle.get_vocab('bigrams'), dropout=0.5, min_freq=3, word_dropout=0.01, + model_dir_or_name='~/exps/CWS/pretrain/vectors/2grams_t3_m50_corpus.txt') -data = prepare_data(_cache_fp=cache_fp, _refresh=True) + return data_bundle, char_embed, bigram_embed -model = ShiftRelayCWSModel(char_embed=data.embeddings['chars'], bigram_embed=data.embeddings['bigrams'], - hidden_size=hidden_size, num_layers=num_layers, - L=L, num_bigram_per_char=1, drop_p=drop_p) +data, char_embed, bigram_embed = prepare_data() -sampler = BucketSampler(batch_size=32) +model = ShiftRelayCWSModel(char_embed=char_embed, bigram_embed=bigram_embed, + hidden_size=hidden_size, num_layers=num_layers, drop_p=drop_p, L=L) + +sampler = BucketSampler() optimizer = Adam(model.parameters(), lr=lr) -clipper = GradientClipCallback(clip_value=5, clip_type='value') -callbacks = [clipper] -# if pretrain: -# fixer = FixEmbedding([model.char_embedding, model.bigram_embedding], fix_until=fix_until) -# callbacks.append(fixer) -trainer = Trainer(data.datasets['train'], model, optimizer=optimizer, loss=None, batch_size=32, sampler=sampler, - update_every=5, n_epochs=3, print_every=5, dev_data=data.datasets['dev'], metrics=RelayMetric(), +clipper = GradientClipCallback(clip_value=5, clip_type='value') # 截断太大的梯度 +evaluator = EvaluateCallback(data.get_dataset('test')) # 额外测试在test集上的效果 +callbacks = [clipper, evaluator] + +trainer = Trainer(data.get_dataset('train'), model, optimizer=optimizer, loss=None, batch_size=128, sampler=sampler, + update_every=1, n_epochs=10, print_every=5, dev_data=data.get_dataset('dev'), metrics=RelayMetric(), metric_key='f', validate_every=-1, save_path=None, use_tqdm=True, device=device, callbacks=callbacks, - check_code_level=0) + check_code_level=0, num_workers=1) trainer.train() \ No newline at end of file diff --git a/reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py b/reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py index 249e2851..c38dce38 100644 --- a/reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py +++ b/reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py @@ -8,11 +8,10 @@ import torch.nn.functional as F from fastNLP import Const class CNNBiLSTMCRF(nn.Module): - def __init__(self, embed, char_embed, hidden_size, num_layers, tag_vocab, dropout=0.5, encoding_type='bioes'): + def __init__(self, embed, hidden_size, num_layers, tag_vocab, dropout=0.5, encoding_type='bioes'): super().__init__() self.embedding = embed - self.char_embedding = char_embed - self.lstm = LSTM(input_size=self.embedding.embedding_dim+self.char_embedding.embedding_dim, + self.lstm = LSTM(input_size=self.embedding.embedding_dim, hidden_size=hidden_size//2, num_layers=num_layers, bidirectional=True, batch_first=True) self.fc = nn.Linear(hidden_size, len(tag_vocab)) @@ -32,9 +31,7 @@ class CNNBiLSTMCRF(nn.Module): nn.init.zeros_(param) def _forward(self, words, seq_len, target=None): - word_embeds = self.embedding(words) - char_embeds = self.char_embedding(words) - words = torch.cat((word_embeds, char_embeds), dim=-1) + words = self.embedding(words) outputs, _ = self.lstm(words, seq_len) self.dropout(outputs) diff --git a/reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py 
b/reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py index 10c5bdea..3138a6c2 100644 --- a/reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py +++ b/reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py @@ -1,7 +1,7 @@ import sys sys.path.append('../../..') -from fastNLP.embeddings import CNNCharEmbedding, StaticEmbedding +from fastNLP.embeddings import CNNCharEmbedding, StaticEmbedding, StackEmbedding from reproduction.seqence_labelling.ner.model.lstm_cnn_crf import CNNBiLSTMCRF from fastNLP import Trainer @@ -22,7 +22,7 @@ def load_data(): paths = {'test':"NER/corpus/CoNLL-2003/eng.testb", 'train':"NER/corpus/CoNLL-2003/eng.train", 'dev':"NER/corpus/CoNLL-2003/eng.testa"} - data = Conll2003NERPipe(encoding_type=encoding_type, target_pad_val=0).process_from_file(paths) + data = Conll2003NERPipe(encoding_type=encoding_type).process_from_file(paths) return data data = load_data() print(data) @@ -33,8 +33,9 @@ word_embed = StaticEmbedding(vocab=data.get_vocab('words'), model_dir_or_name='en-glove-6b-100d', requires_grad=True, lower=True, word_dropout=0.01, dropout=0.5) word_embed.embedding.weight.data = word_embed.embedding.weight.data/word_embed.embedding.weight.data.std() +embed = StackEmbedding([word_embed, char_embed]) -model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET], +model = CNNBiLSTMCRF(embed, hidden_size=200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type) callbacks = [ diff --git a/reproduction/seqence_labelling/ner/train_ontonote.py b/reproduction/seqence_labelling/ner/train_ontonote.py index 7b465d77..ee80b6f7 100644 --- a/reproduction/seqence_labelling/ner/train_ontonote.py +++ b/reproduction/seqence_labelling/ner/train_ontonote.py @@ -2,7 +2,7 @@ import sys sys.path.append('../../..') -from fastNLP.embeddings import CNNCharEmbedding, StaticEmbedding +from fastNLP.embeddings import CNNCharEmbedding, StaticEmbedding, StackEmbedding from reproduction.seqence_labelling.ner.model.lstm_cnn_crf import CNNBiLSTMCRF from fastNLP import Trainer @@ -35,7 +35,7 @@ def cache(): char_embed = CNNCharEmbedding(vocab=data.vocabs['words'], embed_size=30, char_emb_size=30, filter_nums=[30], kernel_sizes=[3], dropout=dropout) word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT], - model_dir_or_name='en-glove-100d', + model_dir_or_name='en-glove-6b-100d', requires_grad=True, normalize=normalize, word_dropout=0.01, @@ -47,7 +47,8 @@ data, char_embed, word_embed = cache() print(data) -model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=1200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET], +embed = StackEmbedding([word_embed, char_embed]) +model = CNNBiLSTMCRF(embed, hidden_size=1200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type, dropout=dropout) callbacks = [ diff --git a/test/io/loader/test_cws_loader.py b/test/io/loader/test_cws_loader.py new file mode 100644 index 00000000..6ad607c3 --- /dev/null +++ b/test/io/loader/test_cws_loader.py @@ -0,0 +1,13 @@ +import unittest +import os +from fastNLP.io.loader import CWSLoader + + +class CWSLoaderTest(unittest.TestCase): + @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") + def test_download(self): + dataset_names = ['pku', 'cityu', 'as', 'msra'] + for dataset_name in dataset_names: + with self.subTest(dataset_name=dataset_name): + data_bundle = CWSLoader(dataset_name=dataset_name).load() + print(data_bundle) \ No newline at end of file diff --git 
a/test/io/pipe/test_cws.py b/test/io/pipe/test_cws.py new file mode 100644 index 00000000..2fc57ae2 --- /dev/null +++ b/test/io/pipe/test_cws.py @@ -0,0 +1,13 @@ + +import unittest +import os +from fastNLP.io.pipe.cws import CWSPipe + +class CWSPipeTest(unittest.TestCase): + @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") + def test_process_from_file(self): + dataset_names = ['pku', 'cityu', 'as', 'msra'] + for dataset_name in dataset_names: + with self.subTest(dataset_name=dataset_name): + data_bundle = CWSPipe(dataset_name=dataset_name).process_from_file() + print(data_bundle) \ No newline at end of file From f18ab642d70cb304212e71fd9b22e16fe3aa5699 Mon Sep 17 00:00:00 2001 From: yh_cc Date: Thu, 22 Aug 2019 15:51:44 +0800 Subject: [PATCH 088/153] =?UTF-8?q?pytorch1.2=E7=89=88=E6=9C=AC=E4=B8=AD?= =?UTF-8?q?=E6=96=B0=E5=A2=9EboolTensor=E7=B1=BB=E5=9E=8B=EF=BC=8C?= =?UTF-8?q?=E6=89=80=E6=9C=89=E7=9A=84masked=5Ffill=E5=BF=85=E9=A1=BB?= =?UTF-8?q?=E4=B8=BAByteTensor=E7=B1=BB=E5=9E=8B=E7=9A=84=E7=B4=A2?= =?UTF-8?q?=E5=BC=95,=E4=BF=AE=E6=94=B9fastNLP=E4=BB=A5=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/embeddings/bert_embedding.py | 4 ++-- fastNLP/embeddings/embedding.py | 4 ++-- fastNLP/models/biaffine_parser.py | 2 +- fastNLP/modules/decoder/crf.py | 6 +++--- fastNLP/modules/decoder/utils.py | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index bc0d46e2..6a10c489 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -115,7 +115,7 @@ class BertEmbedding(ContextualEmbedding): if self._word_sep_index: # 不能drop sep sep_mask = words.eq(self._word_sep_index) mask = torch.ones_like(words).float() * self.word_dropout - mask = torch.bernoulli(mask).byte() # dropout_word越大,越多位置为1 + mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1 words = words.masked_fill(mask, self._word_unk_index) if self._word_sep_index: words.masked_fill_(sep_mask, self._word_sep_index) @@ -252,7 +252,7 @@ class BertWordPieceEncoder(nn.Module): if self._word_sep_index: # 不能drop sep sep_mask = words.eq(self._wordpiece_unk_index) mask = torch.ones_like(words).float() * self.word_dropout - mask = torch.bernoulli(mask).byte() # dropout_word越大,越多位置为1 + mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1 words = words.masked_fill(mask, self._word_unk_index) if self._word_sep_index: words.masked_fill_(sep_mask, self._wordpiece_unk_index) diff --git a/fastNLP/embeddings/embedding.py b/fastNLP/embeddings/embedding.py index 8c5396b7..8b746c0d 100644 --- a/fastNLP/embeddings/embedding.py +++ b/fastNLP/embeddings/embedding.py @@ -63,7 +63,7 @@ class Embedding(nn.Module): """ if self.word_dropout>0 and self.training: mask = torch.ones_like(words).float() * self.word_dropout - mask = torch.bernoulli(mask).byte() # dropout_word越大,越多位置为1 + mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1 words = words.masked_fill(mask, self.unk_index) words = self.embed(words) return self.dropout(words) @@ -135,7 +135,7 @@ class TokenEmbedding(nn.Module): """ if self.word_dropout > 0 and self.training: mask = torch.ones_like(words).float() * self.word_dropout - mask = torch.bernoulli(mask).byte() # dropout_word越大,越多位置为1 + mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1 words = words.masked_fill(mask, self._word_unk_index) return words diff --git a/fastNLP/models/biaffine_parser.py 
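All of the `.byte()` -> `.eq(1)` edits in this commit address the same issue: starting with
PyTorch 1.2 (which introduces BoolTensor), masked_fill expects a bool mask and complains about
uint8 masks. A minimal sketch of the word-dropout case above, with illustrative values:

    import torch

    words = torch.randint(0, 100, (2, 5))       # fake batch of word indices
    word_dropout, unk_index = 0.1, 1            # illustrative hyper-parameter / index

    mask = torch.ones_like(words).float() * word_dropout
    # before: torch.bernoulli(mask).byte() gave a uint8 mask, deprecated for masked_fill in torch>=1.2
    mask = torch.bernoulli(mask).eq(1)          # bool mask; larger word_dropout -> more positions set
    words = words.masked_fill(mask, unk_index)  # dropped positions are replaced by the unk index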
b/fastNLP/models/biaffine_parser.py index 29487864..bead09fc 100644 --- a/fastNLP/models/biaffine_parser.py +++ b/fastNLP/models/biaffine_parser.py @@ -150,7 +150,7 @@ class GraphParser(BaseModel): """ _, seq_len, _ = arc_matrix.shape matrix = arc_matrix + torch.diag(arc_matrix.new(seq_len).fill_(-np.inf)) - flip_mask = (mask == 0).byte() + flip_mask = mask.eq(0) matrix.masked_fill_(flip_mask.unsqueeze(1), -np.inf) _, heads = torch.max(matrix, dim=2) if mask is not None: diff --git a/fastNLP/modules/decoder/crf.py b/fastNLP/modules/decoder/crf.py index b7a7547f..9f19afef 100644 --- a/fastNLP/modules/decoder/crf.py +++ b/fastNLP/modules/decoder/crf.py @@ -210,7 +210,7 @@ class ConditionalRandomField(nn.Module): trans_score = self.trans_m.view(1, n_tags, n_tags) tmp = alpha.view(batch_size, n_tags, 1) + emit_score + trans_score alpha = torch.logsumexp(tmp, 1).masked_fill(flip_mask[i].view(batch_size, 1), 0) + \ - alpha.masked_fill(mask[i].byte().view(batch_size, 1), 0) + alpha.masked_fill(mask[i].eq(1).view(batch_size, 1), 0) if self.include_start_end_trans: alpha = alpha + self.end_scores.view(1, -1) @@ -230,7 +230,7 @@ class ConditionalRandomField(nn.Module): seq_idx = torch.arange(seq_len, dtype=torch.long, device=logits.device) # trans_socre [L-1, B] - mask = mask.byte() + mask = mask.eq(1) flip_mask = mask.eq(0) trans_score = self.trans_m[tags[:seq_len - 1], tags[1:]].masked_fill(flip_mask[1:, :], 0) # emit_score [L, B] @@ -278,7 +278,7 @@ class ConditionalRandomField(nn.Module): """ batch_size, seq_len, n_tags = logits.size() logits = logits.transpose(0, 1).data # L, B, H - mask = mask.transpose(0, 1).data.byte() # L, B + mask = mask.transpose(0, 1).data.eq(1) # L, B # dp vpath = logits.new_zeros((seq_len, batch_size, n_tags), dtype=torch.long) diff --git a/fastNLP/modules/decoder/utils.py b/fastNLP/modules/decoder/utils.py index 9e773336..3d5ac3f8 100644 --- a/fastNLP/modules/decoder/utils.py +++ b/fastNLP/modules/decoder/utils.py @@ -27,7 +27,7 @@ def viterbi_decode(logits, transitions, mask=None, unpad=False): "compatible." 
logits = logits.transpose(0, 1).data # L, B, H if mask is not None: - mask = mask.transpose(0, 1).data.byte() # L, B + mask = mask.transpose(0, 1).data.eq(1) # L, B else: mask = logits.new_ones((seq_len, batch_size), dtype=torch.uint8) From c38e8986cc5c692df17ba35c0eeaf59cb36383fc Mon Sep 17 00:00:00 2001 From: yh_cc Date: Thu, 22 Aug 2019 19:20:24 +0800 Subject: [PATCH 089/153] =?UTF-8?q?=E5=9C=A8linux=E6=A1=8C=E9=9D=A2?= =?UTF-8?q?=E7=B3=BB=E7=BB=9F=E4=B8=8ATrainer=E4=B8=AD=E4=BD=BF=E7=94=A8Te?= =?UTF-8?q?ster=E7=9A=84tqdm=E5=AD=98=E5=9C=A8bug;=20=E5=A2=9E=E5=8A=A0?= =?UTF-8?q?=E4=B8=80=E4=B8=AA=E5=8F=AF=E9=80=89=E9=A1=B9=E4=BD=BF=E5=BE=97?= =?UTF-8?q?=E7=94=A8=E6=88=B7=E5=8F=AF=E4=BB=A5=E5=85=B3=E9=97=ADTester?= =?UTF-8?q?=E7=9A=84tqdm?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/callback.py | 4 ++-- fastNLP/core/trainer.py | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py index 4ba4b945..24b42b6e 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -569,7 +569,7 @@ class FitlogCallback(Callback): batch_size=self.trainer.kwargs.get('dev_batch_size', self.batch_size), metrics=self.trainer.metrics, verbose=0, - use_tqdm=self.trainer.use_tqdm) + use_tqdm=self.trainer.test_use_tqdm) self.testers[key] = tester fitlog.add_progress(total_steps=self.n_steps) @@ -654,7 +654,7 @@ class EvaluateCallback(Callback): tester = Tester(data=data, model=self.model, batch_size=self.trainer.kwargs.get('dev_batch_size', self.batch_size), metrics=self.trainer.metrics, verbose=0, - use_tqdm=self.trainer.use_tqdm) + use_tqdm=self.trainer.test_use_tqdm) self.testers[key] = tester def on_valid_end(self, eval_result, metric_key, optimizer, better_result): diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 2c52d104..290a89c1 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -545,6 +545,10 @@ class Trainer(object): self.logger = logger self.use_tqdm = use_tqdm + if 'test_use_tqdm' in kwargs: + self.test_use_tqdm = kwargs.get('test_use_tqdm') + else: + self.test_use_tqdm = self.use_tqdm self.pbar = None self.print_every = abs(self.print_every) self.kwargs = kwargs @@ -555,7 +559,7 @@ class Trainer(object): batch_size=kwargs.get("dev_batch_size", self.batch_size), device=None, # 由上面的部分处理device verbose=0, - use_tqdm=self.use_tqdm) + use_tqdm=self.test_use_tqdm) self.step = 0 self.start_time = None # start timestamp From 85f01f01d1ee320140fa9d63910c98ff1540cadb Mon Sep 17 00:00:00 2001 From: yh_cc Date: Fri, 23 Aug 2019 11:08:28 +0800 Subject: [PATCH 090/153] =?UTF-8?q?1.=E4=BF=AE=E5=A4=8D=E9=83=A8=E5=88=86?= =?UTF-8?q?=E6=B5=8B=E8=AF=95;=202.=E4=BF=AE=E5=A4=8DStaticEmbedding?= =?UTF-8?q?=E4=B8=AD=E6=9C=AA=E6=89=BE=E5=88=B0=E8=AF=8D=E5=88=9D=E5=A7=8B?= =?UTF-8?q?=E5=8C=96bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/embeddings/static_embedding.py | 9 +++++-- test/embeddings/test_bert_embedding.py | 2 +- test/embeddings/test_static_embedding.py | 31 ++++++++++++++++++------ test/test_tutorials.py | 7 +++--- 4 files changed, 35 insertions(+), 14 deletions(-) diff --git a/fastNLP/embeddings/static_embedding.py b/fastNLP/embeddings/static_embedding.py index 4079b2a2..a75ad18f 100644 --- a/fastNLP/embeddings/static_embedding.py +++ b/fastNLP/embeddings/static_embedding.py @@ -106,6 +106,7 @@ class StaticEmbedding(TokenEmbedding): print(f"{len(vocab) - 
len(truncated_vocab)} out of {len(vocab)} words have frequency less than {min_freq}.") vocab = truncated_vocab + self.only_norm_found_vector = kwargs.get('only_norm_found_vector', False) # 读取embedding if lower: lowered_vocab = Vocabulary(padding=vocab.padding, unknown=vocab.unknown) @@ -142,7 +143,7 @@ class StaticEmbedding(TokenEmbedding): else: embedding = self._randomly_init_embed(len(vocab), embedding_dim, init_method) self.words_to_words = nn.Parameter(torch.arange(len(vocab)).long(), requires_grad=False) - if normalize: + if not self.only_norm_found_vector and normalize: embedding /= (torch.norm(embedding, dim=1, keepdim=True) + 1e-12) if truncate_vocab: @@ -233,6 +234,7 @@ class StaticEmbedding(TokenEmbedding): if vocab.unknown: matrix[vocab.unknown_idx] = torch.zeros(dim) found_count = 0 + found_unknown = False for idx, line in enumerate(f, start_idx): try: parts = line.strip().split() @@ -243,9 +245,12 @@ class StaticEmbedding(TokenEmbedding): word = vocab.padding elif word == unknown and vocab.unknown is not None: word = vocab.unknown + found_unknown = True if word in vocab: index = vocab.to_index(word) matrix[index] = torch.from_numpy(np.fromstring(' '.join(nums), sep=' ', dtype=dtype, count=dim)) + if self.only_norm_found_vector: + matrix[index] = matrix[index]/np.linalg.norm(matrix[index]) found_count += 1 except Exception as e: if error == 'ignore': @@ -256,7 +261,7 @@ class StaticEmbedding(TokenEmbedding): print("Found {} out of {} words in the pre-training embedding.".format(found_count, len(vocab))) for word, index in vocab: if index not in matrix and not vocab._is_word_no_create_entry(word): - if vocab.unknown_idx in matrix: # 如果有unkonwn,用unknown初始化 + if found_unknown: # 如果有unkonwn,用unknown初始化 matrix[index] = matrix[vocab.unknown_idx] else: matrix[index] = None diff --git a/test/embeddings/test_bert_embedding.py b/test/embeddings/test_bert_embedding.py index c27ebd40..760029a3 100644 --- a/test/embeddings/test_bert_embedding.py +++ b/test/embeddings/test_bert_embedding.py @@ -9,6 +9,6 @@ class TestDownload(unittest.TestCase): def test_download(self): # import os vocab = Vocabulary().add_word_lst("This is a test .".split()) - embed = BertEmbedding(vocab, model_dir_or_name='/remote-home/source/fastnlp_caches/embedding/bert-base-cased') + embed = BertEmbedding(vocab, model_dir_or_name='en') words = torch.LongTensor([[0, 1, 2]]) print(embed(words).size()) diff --git a/test/embeddings/test_static_embedding.py b/test/embeddings/test_static_embedding.py index ca97dd75..83137345 100644 --- a/test/embeddings/test_static_embedding.py +++ b/test/embeddings/test_static_embedding.py @@ -5,6 +5,23 @@ from fastNLP import Vocabulary import torch import os +class TestLoad(unittest.TestCase): + def test_norm1(self): + # 测试只对可以找到的norm + vocab = Vocabulary().add_word_lst(['the', 'a', 'notinfile']) + embed = StaticEmbedding(vocab, model_dir_or_name='test/data_for_tests/glove.6B.50d_test.txt', + only_norm_found_vector=True) + self.assertEqual(round(torch.norm(embed(torch.LongTensor([[2]]))).item(), 4), 1) + self.assertNotEqual(torch.norm(embed(torch.LongTensor([[4]]))).item(), 1) + + def test_norm2(self): + # 测试对所有都norm + vocab = Vocabulary().add_word_lst(['the', 'a', 'notinfile']) + embed = StaticEmbedding(vocab, model_dir_or_name='test/data_for_tests/glove.6B.50d_test.txt', + normalize=True) + self.assertEqual(round(torch.norm(embed(torch.LongTensor([[2]]))).item(), 4), 1) + self.assertEqual(round(torch.norm(embed(torch.LongTensor([[4]]))).item(), 4), 1) + class 
TestRandomSameEntry(unittest.TestCase): def test_same_vector(self): vocab = Vocabulary().add_word_lst(["The", "the", "THE", 'a', "A"]) @@ -21,7 +38,7 @@ class TestRandomSameEntry(unittest.TestCase): @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") def test_same_vector2(self): vocab = Vocabulary().add_word_lst(["The", 'a', 'b', "the", "THE", "B", 'a', "A"]) - embed = StaticEmbedding(vocab, model_dir_or_name='/remote-home/source/fastnlp_caches/glove.6B.100d/glove.6B.100d.txt', + embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6B-100d', lower=True) words = torch.LongTensor([[vocab.to_index(word) for word in ["The", "the", "THE", 'b', "B", 'a', 'A']]]) words = embed(words) @@ -39,7 +56,7 @@ class TestRandomSameEntry(unittest.TestCase): no_create_word_lst = ['of', 'Of', 'With', 'with'] vocab = Vocabulary().add_word_lst(word_lst) vocab.add_word_lst(no_create_word_lst, no_create_entry=True) - embed = StaticEmbedding(vocab, model_dir_or_name='/remote-home/source/fastnlp_caches/glove.6B.100d/glove.demo.txt', + embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6B-100d', lower=True) words = torch.LongTensor([[vocab.to_index(word) for word in word_lst+no_create_word_lst]]) words = embed(words) @@ -48,7 +65,7 @@ class TestRandomSameEntry(unittest.TestCase): lowered_no_create_word_lst = [word.lower() for word in no_create_word_lst] lowered_vocab = Vocabulary().add_word_lst(lowered_word_lst) lowered_vocab.add_word_lst(lowered_no_create_word_lst, no_create_entry=True) - lowered_embed = StaticEmbedding(lowered_vocab, model_dir_or_name='/remote-home/source/fastnlp_caches/glove.6B.100d/glove.demo.txt', + lowered_embed = StaticEmbedding(lowered_vocab, model_dir_or_name='en-glove-6B-100d', lower=False) lowered_words = torch.LongTensor([[lowered_vocab.to_index(word) for word in lowered_word_lst+lowered_no_create_word_lst]]) lowered_words = lowered_embed(lowered_words) @@ -67,7 +84,7 @@ class TestRandomSameEntry(unittest.TestCase): all_words = word_lst[:-2] + no_create_word_lst[:-2] vocab = Vocabulary(min_freq=2).add_word_lst(word_lst) vocab.add_word_lst(no_create_word_lst, no_create_entry=True) - embed = StaticEmbedding(vocab, model_dir_or_name='/remote-home/source/fastnlp_caches/glove.6B.100d/glove.demo.txt', + embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6B-100d', lower=True) words = torch.LongTensor([[vocab.to_index(word) for word in all_words]]) words = embed(words) @@ -76,7 +93,7 @@ class TestRandomSameEntry(unittest.TestCase): lowered_no_create_word_lst = [word.lower() for word in no_create_word_lst] lowered_vocab = Vocabulary().add_word_lst(lowered_word_lst) lowered_vocab.add_word_lst(lowered_no_create_word_lst, no_create_entry=True) - lowered_embed = StaticEmbedding(lowered_vocab, model_dir_or_name='/remote-home/source/fastnlp_caches/glove.6B.100d/glove.demo.txt', + lowered_embed = StaticEmbedding(lowered_vocab, model_dir_or_name='en-glove-6B-100d', lower=False) lowered_words = torch.LongTensor([[lowered_vocab.to_index(word.lower()) for word in all_words]]) lowered_words = lowered_embed(lowered_words) @@ -94,14 +111,14 @@ class TestRandomSameEntry(unittest.TestCase): all_words = word_lst[:-2] + no_create_word_lst[:-2] vocab = Vocabulary().add_word_lst(word_lst) vocab.add_word_lst(no_create_word_lst, no_create_entry=True) - embed = StaticEmbedding(vocab, model_dir_or_name='/remote-home/source/fastnlp_caches/glove.6B.100d/glove.demo.txt', + embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6B-100d', lower=False, min_freq=2) words = 
torch.LongTensor([[vocab.to_index(word) for word in all_words]]) words = embed(words) min_freq_vocab = Vocabulary(min_freq=2).add_word_lst(word_lst) min_freq_vocab.add_word_lst(no_create_word_lst, no_create_entry=True) - min_freq_embed = StaticEmbedding(min_freq_vocab, model_dir_or_name='/remote-home/source/fastnlp_caches/glove.6B.100d/glove.demo.txt', + min_freq_embed = StaticEmbedding(min_freq_vocab, model_dir_or_name='en-glove-6B-100d', lower=False) min_freq_words = torch.LongTensor([[min_freq_vocab.to_index(word.lower()) for word in all_words]]) min_freq_words = min_freq_embed(min_freq_words) diff --git a/test/test_tutorials.py b/test/test_tutorials.py index 6f4a8347..3ec0e381 100644 --- a/test/test_tutorials.py +++ b/test/test_tutorials.py @@ -5,14 +5,13 @@ from fastNLP import Instance from fastNLP import Vocabulary from fastNLP.core.losses import CrossEntropyLoss from fastNLP.core.metrics import AccuracyMetric - +from fastNLP.io.loader import CSVLoader class TestTutorial(unittest.TestCase): def test_fastnlp_10min_tutorial(self): # 从csv读取数据到DataSet sample_path = "test/data_for_tests/tutorial_sample_dataset.csv" - dataset = DataSet.read_csv(sample_path, headers=('raw_sentence', 'label'), - sep='\t') + dataset = CSVLoader(headers=['raw_sentence', 'label'], sep=' ')._load(sample_path) print(len(dataset)) print(dataset[0]) print(dataset[-3]) @@ -110,7 +109,7 @@ class TestTutorial(unittest.TestCase): def test_fastnlp_1min_tutorial(self): # tutorials/fastnlp_1min_tutorial.ipynb data_path = "test/data_for_tests/tutorial_sample_dataset.csv" - ds = DataSet.read_csv(data_path, headers=('raw_sentence', 'label'), sep='\t') + ds = CSVLoader(headers=['raw_sentence', 'label'], sep=' ')._load(data_path) print(ds[1]) # 将所有数字转为小写 From ed6fd60aa9ee4f689d688a5de2efe5a3c2121895 Mon Sep 17 00:00:00 2001 From: wyg <1505116161@qq.com> Date: Fri, 23 Aug 2019 14:47:46 +0800 Subject: [PATCH 091/153] [verify] char_cnn use pipe --- .../text_classification/model/BertTC.py | 24 ++++++++++ .../text_classification/train_char_cnn.py | 45 +++++++++++++++---- 2 files changed, 60 insertions(+), 9 deletions(-) create mode 100644 reproduction/text_classification/model/BertTC.py diff --git a/reproduction/text_classification/model/BertTC.py b/reproduction/text_classification/model/BertTC.py new file mode 100644 index 00000000..702c0cd1 --- /dev/null +++ b/reproduction/text_classification/model/BertTC.py @@ -0,0 +1,24 @@ +from fastNLP.embeddings import BertEmbedding +import torch +import torch.nn as nn +from fastNLP.core.const import Const as C + +class BertTC(nn.Module): + def __init__(self, vocab,num_class,bert_model_dir_or_name,fine_tune=False): + super(BertTC, self).__init__() + self.embed=BertEmbedding(vocab, requires_grad=fine_tune, + model_dir_or_name=bert_model_dir_or_name,include_cls_sep=True) + self.classifier = nn.Linear(self.embed.embedding_dim, num_class) + + def forward(self, words): + embedding_cls=self.embed(words)[:,0] + output=self.classifier(embedding_cls) + return {C.OUTPUT: output} + + def predict(self,words): + return self.forward(words) + +if __name__=="__main__": + ta=torch.tensor([[1,2,3],[4,5,6],[7,8,9]]) + tb=ta[:,0] + print(tb) diff --git a/reproduction/text_classification/train_char_cnn.py b/reproduction/text_classification/train_char_cnn.py index 3482de70..6b56608a 100644 --- a/reproduction/text_classification/train_char_cnn.py +++ b/reproduction/text_classification/train_char_cnn.py @@ -8,6 +8,7 @@ sys.path.append('../..') from fastNLP.core.const import Const as C import torch.nn as nn from 
fastNLP.io.data_loader import YelpLoader +from fastNLP.io.pipe.classification import YelpFullPipe,YelpPolarityPipe,SST2Pipe,IMDBPipe #from data.sstLoader import sst2Loader from model.char_cnn import CharacterLevelCNN from fastNLP import CrossEntropyLoss, AccuracyMetric @@ -46,6 +47,8 @@ class Config(): extra_characters='' max_length=1014 weight_decay = 1e-5 + to_lower=True + tokenizer = 'spacy' # 使用spacy进行分词 char_cnn_config={ "alphabet": { @@ -111,12 +114,35 @@ ops=Config ##1.task相关信息:利用dataloader载入dataInfo #dataloader=SST2Loader() #dataloader=IMDBLoader() -dataloader=YelpLoader(fine_grained=True) -datainfo=dataloader.process(ops.datapath,char_level_op=True,split_dev_op=False) +# dataloader=YelpLoader(fine_grained=True) +# datainfo=dataloader.process(ops.datapath,char_level_op=True,split_dev_op=False) char_vocab=ops.char_cnn_config["alphabet"]["en"]["lower"]["alphabet"] ops.number_of_characters=len(char_vocab) ops.embedding_dim=ops.number_of_characters +# load data set +if ops.task == 'yelp_p': + data_bundle = YelpPolarityPipe(lower=ops.to_lower, tokenizer=ops.tokenizer).process_from_file() +elif ops.task == 'yelp_f': + data_bundle = YelpFullPipe(lower=ops.to_lower, tokenizer=ops.tokenizer).process_from_file() +elif ops.task == 'imdb': + data_bundle = IMDBPipe(lower=ops.to_lower, tokenizer=ops.tokenizer).process_from_file() +elif ops.task == 'sst-2': + data_bundle = SST2Pipe(lower=ops.to_lower, tokenizer=ops.tokenizer).process_from_file() +else: + raise RuntimeError(f'NOT support {ops.task} task yet!') + + +def wordtochar(words): + chars = [] + for word in words: + #word = word.lower() + for char in word: + chars.append(char) + chars.append('') + chars.pop() + return chars + #chartoindex def chartoindex(chars): max_seq_len=ops.max_length @@ -136,13 +162,14 @@ def chartoindex(chars): char_index_list=[zero_index]*max_seq_len return char_index_list -for dataset in datainfo.datasets.values(): +for dataset in data_bundle.datasets.values(): + dataset.apply_field(wordtochar, field_name="raw_words", new_field_name='chars') dataset.apply_field(chartoindex,field_name='chars',new_field_name='chars') -datainfo.datasets['train'].set_input('chars') -datainfo.datasets['test'].set_input('chars') -datainfo.datasets['train'].set_target('target') -datainfo.datasets['test'].set_target('target') +data_bundle.datasets['train'].set_input('chars') +data_bundle.datasets['test'].set_input('chars') +data_bundle.datasets['train'].set_target('target') +data_bundle.datasets['test'].set_target('target') ##2. 
定义/组装模型,这里可以随意,就如果是fastNLP封装好的,类似CNNText就直接用初始化调用就好了,这里只是给出一个伪框架表示占位,在这里建立符合fastNLP输入输出规范的model class ModelFactory(nn.Module): @@ -165,7 +192,7 @@ class ModelFactory(nn.Module): ## 2.或直接复用fastNLP的模型 #vocab=datainfo.vocabs['words'] -vocab_label=datainfo.vocabs['target'] +vocab_label=data_bundle.vocabs['target'] ''' # emded_char=CNNCharEmbedding(vocab) # embed_word = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50', requires_grad=True) @@ -212,5 +239,5 @@ if __name__=="__main__": #print(vocab_label) #print(datainfo.datasets["train"]) - train(model,datainfo,loss,metric,optimizer,num_epochs=ops.train_epoch) + train(model,data_bundle,loss,metric,optimizer,num_epochs=ops.train_epoch) \ No newline at end of file From 8d7c3ba1409c8dc8cc4554a3623c905afb686b3c Mon Sep 17 00:00:00 2001 From: yh Date: Sat, 24 Aug 2019 01:09:28 +0800 Subject: [PATCH 092/153] =?UTF-8?q?=E4=BF=AE=E5=A4=8DCharacterEmbedding?= =?UTF-8?q?=E4=B8=AD=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/embeddings/char_embedding.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fastNLP/embeddings/char_embedding.py b/fastNLP/embeddings/char_embedding.py index e772703a..520e85e6 100644 --- a/fastNLP/embeddings/char_embedding.py +++ b/fastNLP/embeddings/char_embedding.py @@ -14,7 +14,7 @@ from ..modules.encoder.lstm import LSTM from ..core.vocabulary import Vocabulary from .embedding import TokenEmbedding from .utils import _construct_char_vocab_from_vocab - +from .utils import get_embeddings class CNNCharEmbedding(TokenEmbedding): """ @@ -50,7 +50,7 @@ class CNNCharEmbedding(TokenEmbedding): 没有的话将自动下载。如果输入为None则使用embedding_dim的维度随机初始化一个embedding. """ def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, word_dropout:float=0, - dropout:float=0.5, filter_nums: List[int]=(40, 30, 20), kernel_sizes: List[int]=(5, 3, 1), + dropout:float=0, filter_nums: List[int]=(40, 30, 20), kernel_sizes: List[int]=(5, 3, 1), pool_method: str='max', activation='relu', min_char_freq: int=2, pre_train_char_embed: str=None): super(CNNCharEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) @@ -58,7 +58,6 @@ class CNNCharEmbedding(TokenEmbedding): assert kernel % 2 == 1, "Only odd kernel is allowed." assert pool_method in ('max', 'avg') - self.dropout = nn.Dropout(dropout) self.pool_method = pool_method # activation function if isinstance(activation, str): @@ -96,7 +95,7 @@ class CNNCharEmbedding(TokenEmbedding): if pre_train_char_embed: self.char_embedding = StaticEmbedding(self.char_vocab, model_dir_or_name=pre_train_char_embed) else: - self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size) + self.char_embedding = get_embeddings((len(self.char_vocab), char_emb_size)) self.convs = nn.ModuleList([nn.Conv1d( char_emb_size, filter_nums[i], kernel_size=kernel_sizes[i], bias=True, padding=kernel_sizes[i] // 2) @@ -164,6 +163,8 @@ class CNNCharEmbedding(TokenEmbedding): for name, param in self.named_parameters(): if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能reset continue + if 'char_embedding' in name: + continue if param.data.dim()>1: nn.init.xavier_uniform_(param, 1) else: @@ -203,15 +204,14 @@ class LSTMCharEmbedding(TokenEmbedding): 没有的话将自动下载。如果输入为None则使用embedding_dim的维度随机初始化一个embedding. 
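    Example (a usage sketch, not added by this patch)::

        >>> import torch
        >>> from fastNLP import Vocabulary
        >>> from fastNLP.embeddings import LSTMCharEmbedding
        >>> vocab = Vocabulary().add_word_lst("The whether is good .".split())
        >>> embed = LSTMCharEmbedding(vocab, embed_size=50)
        >>> words = torch.LongTensor([[vocab.to_index(w) for w in "The whether is good .".split()]])
        >>> embed(words).size()   # one sentence, 5 words, an embed_size-dim vector per word
        torch.Size([1, 5, 50])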
""" def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, word_dropout:float=0, - dropout:float=0.5, hidden_size=50,pool_method: str='max', activation='relu', min_char_freq: int=2, + dropout:float=0, hidden_size=50,pool_method: str='max', activation='relu', min_char_freq: int=2, bidirectional=True, pre_train_char_embed: str=None): - super(LSTMCharEmbedding, self).__init__(vocab) + super(LSTMCharEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) assert hidden_size % 2 == 0, "Only even kernel is allowed." assert pool_method in ('max', 'avg') self.pool_method = pool_method - self.dropout = nn.Dropout(dropout) # activation function if isinstance(activation, str): if activation.lower() == 'relu': From d6c597d32e66121a4f24c3fdbf6f5f0a9ee6e56e Mon Sep 17 00:00:00 2001 From: ChenXin Date: Sun, 25 Aug 2019 11:13:25 +0800 Subject: [PATCH 093/153] add __doc__ & __all__ in module 'embeddings' --- fastNLP/embeddings/__init__.py | 1 - fastNLP/embeddings/bert_embedding.py | 158 ++++++++++++--------- fastNLP/embeddings/char_embedding.py | 68 +++++---- fastNLP/embeddings/contextual_embedding.py | 29 ++-- fastNLP/embeddings/elmo_embedding.py | 77 +++++----- fastNLP/embeddings/embedding.py | 56 ++++---- fastNLP/embeddings/stack_embedding.py | 24 +++- fastNLP/embeddings/static_embedding.py | 61 ++++---- fastNLP/embeddings/utils.py | 16 ++- 9 files changed, 277 insertions(+), 213 deletions(-) diff --git a/fastNLP/embeddings/__init__.py b/fastNLP/embeddings/__init__.py index 37881f17..8a970e25 100644 --- a/fastNLP/embeddings/__init__.py +++ b/fastNLP/embeddings/__init__.py @@ -18,7 +18,6 @@ __all__ = [ "get_embeddings", ] - from .embedding import Embedding, TokenEmbedding from .static_embedding import StaticEmbedding from .elmo_embedding import ElmoEmbedding diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index 6a10c489..e8844aa1 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -1,3 +1,12 @@ +""" +.. 
todo:: + doc +""" + +__all__ = [ + "BertEmbedding", + "BertWordPieceEncoder" +] import os import collections @@ -13,6 +22,7 @@ from ..modules.encoder.bert import _WordPieceBertModel, BertModel, BertTokenizer from .contextual_embedding import ContextualEmbedding import warnings + class BertEmbedding(ContextualEmbedding): """ 别名::class:`fastNLP.embeddings.BertEmbedding` :class:`fastNLP.embeddings.bert_embedding.BertEmbedding` @@ -54,11 +64,12 @@ class BertEmbedding(ContextualEmbedding): word pieces后的内容,并将第512个word piece置为[SEP]。超过长度的部分的encode结果直接全部置零。一般仅有只使用[CLS] 来进行分类的任务将auto_truncate置为True。 """ - def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en-base-uncased', layers: str='-1', - pool_method: str='first', word_dropout=0, dropout=0, include_cls_sep: bool=False, - pooled_cls=True, requires_grad: bool=False, auto_truncate:bool=False): + + def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en-base-uncased', layers: str = '-1', + pool_method: str = 'first', word_dropout=0, dropout=0, include_cls_sep: bool = False, + pooled_cls=True, requires_grad: bool = False, auto_truncate: bool = False): super(BertEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) - + # 根据model_dir_or_name检查是否存在并下载 if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR: if 'cn' in model_dir_or_name.lower() and pool_method not in ('first', 'last'): @@ -71,21 +82,21 @@ class BertEmbedding(ContextualEmbedding): model_dir = os.path.abspath(os.path.expanduser(model_dir_or_name)) else: raise ValueError(f"Cannot recognize {model_dir_or_name}.") - + self._word_sep_index = None if '[SEP]' in vocab: self._word_sep_index = vocab['[SEP]'] - + self.model = _WordBertModel(model_dir=model_dir, vocab=vocab, layers=layers, pool_method=pool_method, include_cls_sep=include_cls_sep, pooled_cls=pooled_cls, auto_truncate=auto_truncate, min_freq=2) - + self.requires_grad = requires_grad - self._embed_size = len(self.model.layers)*self.model.encoder.hidden_size - + self._embed_size = len(self.model.layers) * self.model.encoder.hidden_size + def _delete_model_weights(self): del self.model - + def forward(self, words): """ 计算words的bert embedding表示。计算之前会在每句话的开始增加[CLS]在结束增加[SEP], 并根据include_cls_sep判断要不要 @@ -100,9 +111,9 @@ class BertEmbedding(ContextualEmbedding): return self.dropout(outputs) outputs = self.model(words) outputs = torch.cat([*outputs], dim=-1) - + return self.dropout(outputs) - + def drop_word(self, words): """ 按照设定随机将words设置为unknown_index。 @@ -120,7 +131,7 @@ class BertEmbedding(ContextualEmbedding): if self._word_sep_index: words.masked_fill_(sep_mask, self._word_sep_index) return words - + @property def requires_grad(self): """ @@ -129,12 +140,12 @@ class BertEmbedding(ContextualEmbedding): :return: """ requires_grads = set([param.requires_grad for name, param in self.named_parameters() - if 'word_pieces_lengths' not in name]) + if 'word_pieces_lengths' not in name]) if len(requires_grads) == 1: return requires_grads.pop() else: return None - + @requires_grad.setter def requires_grad(self, value): for name, param in self.named_parameters(): @@ -155,10 +166,11 @@ class BertWordPieceEncoder(nn.Module): :param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 :param bool requires_grad: 是否需要gradient。 """ - def __init__(self, model_dir_or_name: str='en-base-uncased', layers: str='-1', pooled_cls: bool = False, - word_dropout=0, dropout=0, requires_grad: bool=False): + + def __init__(self, model_dir_or_name: str = 'en-base-uncased', layers: str = '-1', pooled_cls: bool = 
False, + word_dropout=0, dropout=0, requires_grad: bool = False): super().__init__() - + if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR: model_url = _get_embedding_url('bert', model_dir_or_name.lower()) model_dir = cached_path(model_url, name='embedding') @@ -167,7 +179,7 @@ class BertWordPieceEncoder(nn.Module): model_dir = model_dir_or_name else: raise ValueError(f"Cannot recognize {model_dir_or_name}.") - + self.model = _WordPieceBertModel(model_dir=model_dir, layers=layers, pooled_cls=pooled_cls) self._sep_index = self.model._sep_index self._wordpiece_unk_index = self.model._wordpiece_unknown_index @@ -175,7 +187,7 @@ class BertWordPieceEncoder(nn.Module): self.requires_grad = requires_grad self.word_dropout = word_dropout self.dropout_layer = nn.Dropout(dropout) - + @property def requires_grad(self): """ @@ -187,24 +199,24 @@ class BertWordPieceEncoder(nn.Module): return requires_grads.pop() else: return None - + @requires_grad.setter def requires_grad(self, value): for name, param in self.named_parameters(): param.requires_grad = value - + @property def embed_size(self): return self._embed_size - + @property def embedding_dim(self): return self._embed_size - + @property def num_embedding(self): return self.model.encoder.config.vocab_size - + def index_datasets(self, *datasets, field_name, add_cls_sep=True): """ 使用bert的tokenizer新生成word_pieces列加入到datasets中,并将他们设置为input,且将word_pieces这一列的pad value设置为了 @@ -216,7 +228,7 @@ class BertWordPieceEncoder(nn.Module): :return: """ self.model.index_dataset(*datasets, field_name=field_name, add_cls_sep=add_cls_sep) - + def forward(self, word_pieces, token_type_ids=None): """ 计算words的bert embedding表示。传入的words中应该自行包含[CLS]与[SEP]的tag。 @@ -233,13 +245,13 @@ class BertWordPieceEncoder(nn.Module): token_type_ids = sep_mask_cumsum.fmod(2) if token_type_ids[0, 0].item(): # 如果开头是奇数,则需要flip一下结果,因为需要保证开头为0 token_type_ids = token_type_ids.eq(0).long() - + word_pieces = self.drop_word(word_pieces) outputs = self.model(word_pieces, token_type_ids) outputs = torch.cat([*outputs], dim=-1) - + return self.dropout_layer(outputs) - + def drop_word(self, words): """ 按照设定随机将words设置为unknown_index。 @@ -260,10 +272,10 @@ class BertWordPieceEncoder(nn.Module): class _WordBertModel(nn.Module): - def __init__(self, model_dir:str, vocab:Vocabulary, layers:str='-1', pool_method:str='first', - include_cls_sep:bool=False, pooled_cls:bool=False, auto_truncate:bool=False, min_freq=2): + def __init__(self, model_dir: str, vocab: Vocabulary, layers: str = '-1', pool_method: str = 'first', + include_cls_sep: bool = False, pooled_cls: bool = False, auto_truncate: bool = False, min_freq=2): super().__init__() - + self.tokenzier = BertTokenizer.from_pretrained(model_dir) self.encoder = BertModel.from_pretrained(model_dir) self._max_position_embeddings = self.encoder.config.max_position_embeddings @@ -271,23 +283,23 @@ class _WordBertModel(nn.Module): encoder_layer_number = len(self.encoder.encoder.layer) self.layers = list(map(int, layers.split(','))) for layer in self.layers: - if layer<0: - assert -layer<=encoder_layer_number, f"The layer index:{layer} is out of scope for " \ - f"a bert model with {encoder_layer_number} layers." + if layer < 0: + assert -layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \ + f"a bert model with {encoder_layer_number} layers." 
else: - assert layer=min_freq and not vocab._is_word_no_create_entry(word): #出现次数大于这个次数才新增 + if index != vocab.unknown_idx and word_pieces[0] == '[UNK]': # 说明这个词不在原始的word里面 + if vocab.word_count[word] >= min_freq and not vocab._is_word_no_create_entry( + word): # 出现次数大于这个次数才新增 word_piece_dict[word] = 1 # 新增一个值 continue for word_piece in word_pieces: @@ -327,7 +340,7 @@ class _WordBertModel(nn.Module): new_word_piece_vocab[token] = len(new_word_piece_vocab) self.tokenzier._reinit_on_new_vocab(new_word_piece_vocab) self.encoder.embeddings.word_embeddings = embed - + word_to_wordpieces = [] word_pieces_lengths = [] for word, index in vocab: @@ -347,7 +360,7 @@ class _WordBertModel(nn.Module): self.word_to_wordpieces = np.array(word_to_wordpieces) self.word_pieces_lengths = nn.Parameter(torch.LongTensor(word_pieces_lengths), requires_grad=False) print("Successfully generate word pieces.") - + def forward(self, words): """ @@ -358,34 +371,37 @@ class _WordBertModel(nn.Module): batch_size, max_word_len = words.size() word_mask = words.ne(self._word_pad_index) # 为1的地方有word seq_len = word_mask.sum(dim=-1) - batch_word_pieces_length = self.word_pieces_lengths[words].masked_fill(word_mask.eq(0), 0) # batch_size x max_len + batch_word_pieces_length = self.word_pieces_lengths[words].masked_fill(word_mask.eq(0), + 0) # batch_size x max_len word_pieces_lengths = batch_word_pieces_length.sum(dim=-1) # batch_size word_piece_length = batch_word_pieces_length.sum(dim=-1).max().item() # 表示word piece的长度(包括padding) - if word_piece_length+2>self._max_position_embeddings: + if word_piece_length + 2 > self._max_position_embeddings: if self.auto_truncate: - word_pieces_lengths = word_pieces_lengths.masked_fill(word_pieces_lengths+2>self._max_position_embeddings, - self._max_position_embeddings-2) + word_pieces_lengths = word_pieces_lengths.masked_fill( + word_pieces_lengths + 2 > self._max_position_embeddings, + self._max_position_embeddings - 2) else: - raise RuntimeError("After split words into word pieces, the lengths of word pieces are longer than the " - f"maximum allowed sequence length:{self._max_position_embeddings} of bert.") - + raise RuntimeError( + "After split words into word pieces, the lengths of word pieces are longer than the " + f"maximum allowed sequence length:{self._max_position_embeddings} of bert.") + # +2是由于需要加入[CLS]与[SEP] - word_pieces = words.new_full((batch_size, min(word_piece_length+2, self._max_position_embeddings)), + word_pieces = words.new_full((batch_size, min(word_piece_length + 2, self._max_position_embeddings)), fill_value=self._wordpiece_pad_index) attn_masks = torch.zeros_like(word_pieces) # 1. 
获取words的word_pieces的id,以及对应的span范围 word_indexes = words.cpu().numpy() for i in range(batch_size): word_pieces_i = list(chain(*self.word_to_wordpieces[word_indexes[i, :seq_len[i]]])) - if self.auto_truncate and len(word_pieces_i)>self._max_position_embeddings-2: - word_pieces_i = word_pieces_i[:self._max_position_embeddings-2] - word_pieces[i, 1:word_pieces_lengths[i]+1] = torch.LongTensor(word_pieces_i) - attn_masks[i, :word_pieces_lengths[i]+2].fill_(1) + if self.auto_truncate and len(word_pieces_i) > self._max_position_embeddings - 2: + word_pieces_i = word_pieces_i[:self._max_position_embeddings - 2] + word_pieces[i, 1:word_pieces_lengths[i] + 1] = torch.LongTensor(word_pieces_i) + attn_masks[i, :word_pieces_lengths[i] + 2].fill_(1) # 添加[cls]和[sep] word_pieces[:, 0].fill_(self._cls_index) batch_indexes = torch.arange(batch_size).to(words) - word_pieces[batch_indexes, word_pieces_lengths+1] = self._sep_index - if self._has_sep_in_vocab: #但[SEP]在vocab中出现应该才会需要token_ids + word_pieces[batch_indexes, word_pieces_lengths + 1] = self._sep_index + if self._has_sep_in_vocab: # 但[SEP]在vocab中出现应该才会需要token_ids sep_mask = word_pieces.eq(self._sep_index) # batch_size x max_len sep_mask_cumsum = sep_mask.flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1]) token_type_ids = sep_mask_cumsum.fmod(2) @@ -396,9 +412,9 @@ class _WordBertModel(nn.Module): # 2. 获取hidden的结果,根据word_pieces进行对应的pool计算 # all_outputs: [batch_size x max_len x hidden_size, batch_size x max_len x hidden_size, ...] bert_outputs, pooled_cls = self.encoder(word_pieces, token_type_ids=token_type_ids, attention_mask=attn_masks, - output_all_encoded_layers=True) + output_all_encoded_layers=True) # output_layers = [self.layers] # len(self.layers) x batch_size x real_word_piece_length x hidden_size - + if self.include_cls_sep: outputs = bert_outputs[-1].new_zeros(len(self.layers), batch_size, max_word_len + 2, bert_outputs[-1].size(-1)) @@ -414,7 +430,7 @@ class _WordBertModel(nn.Module): real_word_piece_length = output_layer.size(1) - 2 if word_piece_length > real_word_piece_length: # 如果实际上是截取出来的 paddings = output_layer.new_zeros(batch_size, - word_piece_length-real_word_piece_length, + word_piece_length - real_word_piece_length, output_layer.size(2)) output_layer = torch.cat((output_layer, paddings), dim=1).contiguous() # 从word_piece collapse到word的表示 @@ -423,27 +439,27 @@ class _WordBertModel(nn.Module): if self.pool_method == 'first': for i in range(batch_size): i_word_pieces_cum_length = batch_word_pieces_cum_length[i, :seq_len[i]] # 每个word的start位置 - outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[i, i_word_pieces_cum_length] # num_layer x batch_size x len x hidden_size + outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[ + i, i_word_pieces_cum_length] # num_layer x batch_size x len x hidden_size elif self.pool_method == 'last': for i in range(batch_size): - i_word_pieces_cum_length = batch_word_pieces_cum_length[i, 1:seq_len[i]+1] - 1 # 每个word的end + i_word_pieces_cum_length = batch_word_pieces_cum_length[i, 1:seq_len[i] + 1] - 1 # 每个word的end outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[i, i_word_pieces_cum_length] elif self.pool_method == 'max': for i in range(batch_size): for j in range(seq_len[i]): - start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j+1] - outputs[l_index, i, j+s_shift], _ = torch.max(truncate_output_layer[i, start:end], dim=-2) + start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j + 1] + 
outputs[l_index, i, j + s_shift], _ = torch.max(truncate_output_layer[i, start:end], dim=-2) else: for i in range(batch_size): for j in range(seq_len[i]): - start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j+1] - outputs[l_index, i, j+s_shift] = torch.mean(truncate_output_layer[i, start:end], dim=-2) + start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j + 1] + outputs[l_index, i, j + s_shift] = torch.mean(truncate_output_layer[i, start:end], dim=-2) if self.include_cls_sep: - if l in (len(bert_outputs)-1, -1) and self.pooled_cls: + if l in (len(bert_outputs) - 1, -1) and self.pooled_cls: outputs[l_index, :, 0] = pooled_cls else: outputs[l_index, :, 0] = output_layer[:, 0] - outputs[l_index, batch_indexes, seq_len+s_shift] = output_layer[batch_indexes, seq_len+s_shift] + outputs[l_index, batch_indexes, seq_len + s_shift] = output_layer[batch_indexes, seq_len + s_shift] # 3. 最终的embedding结果 return outputs - diff --git a/fastNLP/embeddings/char_embedding.py b/fastNLP/embeddings/char_embedding.py index 520e85e6..24c84314 100644 --- a/fastNLP/embeddings/char_embedding.py +++ b/fastNLP/embeddings/char_embedding.py @@ -3,6 +3,10 @@ 词的index而不需要使用词语中的char的index来获取表达。 """ +__all__ = [ + "CNNCharEmbedding", + "LSTMCharEmbedding" +] import torch import torch.nn as nn @@ -16,6 +20,7 @@ from .embedding import TokenEmbedding from .utils import _construct_char_vocab_from_vocab from .utils import get_embeddings + class CNNCharEmbedding(TokenEmbedding): """ 别名::class:`fastNLP.embeddings.CNNCharEmbedding` :class:`fastNLP.embeddings.char_embedding.CNNCharEmbedding` @@ -49,14 +54,15 @@ class CNNCharEmbedding(TokenEmbedding): (文件夹下应该只有一个以.txt作为后缀的文件)或文件路径;第二种是传入embedding的名称,第二种情况将自动查看缓存中是否存在该模型, 没有的话将自动下载。如果输入为None则使用embedding_dim的维度随机初始化一个embedding. """ - def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, word_dropout:float=0, - dropout:float=0, filter_nums: List[int]=(40, 30, 20), kernel_sizes: List[int]=(5, 3, 1), - pool_method: str='max', activation='relu', min_char_freq: int=2, pre_train_char_embed: str=None): + + def __init__(self, vocab: Vocabulary, embed_size: int = 50, char_emb_size: int = 50, word_dropout: float = 0, + dropout: float = 0, filter_nums: List[int] = (40, 30, 20), kernel_sizes: List[int] = (5, 3, 1), + pool_method: str = 'max', activation='relu', min_char_freq: int = 2, pre_train_char_embed: str = None): super(CNNCharEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) - + for kernel in kernel_sizes: assert kernel % 2 == 1, "Only odd kernel is allowed." 
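        # Odd kernel sizes are required because the Conv1d layers built below use padding = kernel_size // 2,
        # which keeps each word's character feature map the same length as its character sequence.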
- + assert pool_method in ('max', 'avg') self.pool_method = pool_method # activation function @@ -74,7 +80,7 @@ class CNNCharEmbedding(TokenEmbedding): else: raise Exception( "Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]") - + print("Start constructing character vocabulary.") # 建立char的词表 self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq) @@ -96,14 +102,14 @@ class CNNCharEmbedding(TokenEmbedding): self.char_embedding = StaticEmbedding(self.char_vocab, model_dir_or_name=pre_train_char_embed) else: self.char_embedding = get_embeddings((len(self.char_vocab), char_emb_size)) - + self.convs = nn.ModuleList([nn.Conv1d( char_emb_size, filter_nums[i], kernel_size=kernel_sizes[i], bias=True, padding=kernel_sizes[i] // 2) for i in range(len(kernel_sizes))]) self._embed_size = embed_size self.fc = nn.Linear(sum(filter_nums), embed_size) self.reset_parameters() - + def forward(self, words): """ 输入words的index后,生成对应的words的表示。 @@ -114,14 +120,14 @@ class CNNCharEmbedding(TokenEmbedding): words = self.drop_word(words) batch_size, max_len = words.size() chars = self.words_to_chars_embedding[words] # batch_size x max_len x max_word_len - word_lengths = self.word_lengths[words] # batch_size x max_len + word_lengths = self.word_lengths[words] # batch_size x max_len max_word_len = word_lengths.max() chars = chars[:, :, :max_word_len] # 为1的地方为mask chars_masks = chars.eq(self.char_pad_index) # batch_size x max_len x max_word_len 如果为0, 说明是padding的位置了 chars = self.char_embedding(chars) # batch_size x max_len x max_word_len x embed_size chars = self.dropout(chars) - reshaped_chars = chars.reshape(batch_size*max_len, max_word_len, -1) + reshaped_chars = chars.reshape(batch_size * max_len, max_word_len, -1) reshaped_chars = reshaped_chars.transpose(1, 2) # B' x E x M conv_chars = [conv(reshaped_chars).transpose(1, 2).reshape(batch_size, max_len, max_word_len, -1) for conv in self.convs] @@ -129,13 +135,13 @@ class CNNCharEmbedding(TokenEmbedding): conv_chars = self.activation(conv_chars) if self.pool_method == 'max': conv_chars = conv_chars.masked_fill(chars_masks.unsqueeze(-1), float('-inf')) - chars, _ = torch.max(conv_chars, dim=-2) # batch_size x max_len x sum(filters) + chars, _ = torch.max(conv_chars, dim=-2) # batch_size x max_len x sum(filters) else: conv_chars = conv_chars.masked_fill(chars_masks.unsqueeze(-1), 0) - chars = torch.sum(conv_chars, dim=-2)/chars_masks.eq(0).sum(dim=-1, keepdim=True).float() + chars = torch.sum(conv_chars, dim=-2) / chars_masks.eq(0).sum(dim=-1, keepdim=True).float() chars = self.fc(chars) return self.dropout(chars) - + @property def requires_grad(self): """ @@ -151,21 +157,21 @@ class CNNCharEmbedding(TokenEmbedding): return requires_grads.pop() else: return None - + @requires_grad.setter def requires_grad(self, value): for name, param in self.named_parameters(): if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能加入到requires_grad中 continue param.requires_grad = value - + def reset_parameters(self): for name, param in self.named_parameters(): if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能reset continue if 'char_embedding' in name: continue - if param.data.dim()>1: + if param.data.dim() > 1: nn.init.xavier_uniform_(param, 1) else: nn.init.uniform_(param, -1, 1) @@ -203,13 +209,15 @@ class LSTMCharEmbedding(TokenEmbedding): (文件夹下应该只有一个以.txt作为后缀的文件)或文件路径;第二种是传入embedding的名称,第二种情况将自动查看缓存中是否存在该模型, 没有的话将自动下载。如果输入为None则使用embedding_dim的维度随机初始化一个embedding. 
""" - def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, word_dropout:float=0, - dropout:float=0, hidden_size=50,pool_method: str='max', activation='relu', min_char_freq: int=2, - bidirectional=True, pre_train_char_embed: str=None): + + def __init__(self, vocab: Vocabulary, embed_size: int = 50, char_emb_size: int = 50, word_dropout: float = 0, + dropout: float = 0, hidden_size=50, pool_method: str = 'max', activation='relu', + min_char_freq: int = 2, + bidirectional=True, pre_train_char_embed: str = None): super(LSTMCharEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) - + assert hidden_size % 2 == 0, "Only even kernel is allowed." - + assert pool_method in ('max', 'avg') self.pool_method = pool_method # activation function @@ -227,7 +235,7 @@ class LSTMCharEmbedding(TokenEmbedding): else: raise Exception( "Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]") - + print("Start constructing character vocabulary.") # 建立char的词表 self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq) @@ -249,14 +257,14 @@ class LSTMCharEmbedding(TokenEmbedding): self.char_embedding = StaticEmbedding(self.char_vocab, pre_train_char_embed) else: self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size) - + self.fc = nn.Linear(hidden_size, embed_size) hidden_size = hidden_size // 2 if bidirectional else hidden_size - + self.lstm = LSTM(char_emb_size, hidden_size, bidirectional=bidirectional, batch_first=True) self._embed_size = embed_size self.bidirectional = bidirectional - + def forward(self, words): """ 输入words的index后,生成对应的words的表示。 @@ -278,7 +286,7 @@ class LSTMCharEmbedding(TokenEmbedding): char_seq_len = chars_masks.eq(0).sum(dim=-1).reshape(batch_size * max_len) lstm_chars = self.lstm(reshaped_chars, char_seq_len)[0].reshape(batch_size, max_len, max_word_len, -1) # B x M x M x H - + lstm_chars = self.activation(lstm_chars) if self.pool_method == 'max': lstm_chars = lstm_chars.masked_fill(chars_masks.unsqueeze(-1), float('-inf')) @@ -286,11 +294,11 @@ class LSTMCharEmbedding(TokenEmbedding): else: lstm_chars = lstm_chars.masked_fill(chars_masks.unsqueeze(-1), 0) chars = torch.sum(lstm_chars, dim=-2) / chars_masks.eq(0).sum(dim=-1, keepdim=True).float() - + chars = self.fc(chars) - + return self.dropout(chars) - + @property def requires_grad(self): """ @@ -307,7 +315,7 @@ class LSTMCharEmbedding(TokenEmbedding): return requires_grads.pop() else: return None - + @requires_grad.setter def requires_grad(self, value): for name, param in self.named_parameters(): diff --git a/fastNLP/embeddings/contextual_embedding.py b/fastNLP/embeddings/contextual_embedding.py index 152b0ab9..2a1e2f82 100644 --- a/fastNLP/embeddings/contextual_embedding.py +++ b/fastNLP/embeddings/contextual_embedding.py @@ -1,3 +1,12 @@ +""" +.. 
todo:: + doc +""" + +__all__ = [ + "ContextualEmbedding" +] + from abc import abstractmethod import torch @@ -8,16 +17,12 @@ from ..core.sampler import SequentialSampler from ..core.utils import _move_model_to_device, _get_model_device from .embedding import TokenEmbedding -__all__ = [ - "ContextualEmbedding" -] - class ContextualEmbedding(TokenEmbedding): - def __init__(self, vocab: Vocabulary, word_dropout:float=0.0, dropout:float=0.0): + def __init__(self, vocab: Vocabulary, word_dropout: float = 0.0, dropout: float = 0.0): super(ContextualEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) - - def add_sentence_cache(self, *datasets, batch_size=32, device='cpu', delete_weights: bool=True): + + def add_sentence_cache(self, *datasets, batch_size=32, device='cpu', delete_weights: bool = True): """ 由于动态embedding生成比较耗时,所以可以把每句话embedding缓存下来,这样就不需要每次都运行生成过程。 @@ -34,7 +39,7 @@ class ContextualEmbedding(TokenEmbedding): except Exception as e: print(f"Exception happens at {index} dataset.") raise e - + sent_embeds = {} _move_model_to_device(self, device=device) device = _get_model_device(self) @@ -54,7 +59,7 @@ class ContextualEmbedding(TokenEmbedding): word_embeds = self(words).detach().cpu().numpy() for b in range(words.size(0)): length = seq_len_from_behind[b] - if length==0: + if length == 0: sent_embeds[tuple(words_list[b][:seq_len[b]])] = word_embeds[b] else: sent_embeds[tuple(words_list[b][:seq_len[b]])] = word_embeds[b, :-length] @@ -65,7 +70,7 @@ class ContextualEmbedding(TokenEmbedding): self.sent_embeds = sent_embeds if delete_weights: self._delete_model_weights() - + def _get_sent_reprs(self, words): """ 获取sentence的表示,如果有缓存,则返回缓存的值; 没有缓存则返回None @@ -88,12 +93,12 @@ class ContextualEmbedding(TokenEmbedding): embeds[i, :len(embed)] = torch.FloatTensor(embed).to(words.device) return embeds return None - + @abstractmethod def _delete_model_weights(self): """删除计算表示的模型以节省资源""" raise NotImplementedError - + def remove_sentence_cache(self): """ 删除缓存的句子表示. 删除之后如果模型权重没有被删除,将开始使用动态计算权重。 diff --git a/fastNLP/embeddings/elmo_embedding.py b/fastNLP/embeddings/elmo_embedding.py index 24cd052e..fb5388fd 100644 --- a/fastNLP/embeddings/elmo_embedding.py +++ b/fastNLP/embeddings/elmo_embedding.py @@ -1,6 +1,13 @@ +""" +.. 
todo:: + doc +""" -import os +__all__ = [ + "ElmoEmbedding" +] +import os import torch import torch.nn as nn import torch.nn.functional as F @@ -49,11 +56,11 @@ class ElmoEmbedding(ContextualEmbedding): :param cache_word_reprs: 可以选择对word的表示进行cache; 设置为True的话,将在初始化的时候为每个word生成对应的embedding, 并删除character encoder,之后将直接使用cache的embedding。默认为False。 """ - + def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en', layers: str = '2', requires_grad: bool = False, word_dropout=0.0, dropout=0.0, cache_word_reprs: bool = False): super(ElmoEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) - + # 根据model_dir_or_name检查是否存在并下载 if model_dir_or_name.lower() in PRETRAINED_ELMO_MODEL_DIR: model_url = _get_embedding_url('elmo', model_dir_or_name.lower()) @@ -64,7 +71,7 @@ class ElmoEmbedding(ContextualEmbedding): else: raise ValueError(f"Cannot recognize {model_dir_or_name}.") self.model = _ElmoModel(model_dir, vocab, cache_word_reprs=cache_word_reprs) - + if layers == 'mix': self.layer_weights = nn.Parameter(torch.zeros(self.model.config['lstm']['n_layers'] + 1), requires_grad=requires_grad) @@ -79,16 +86,16 @@ class ElmoEmbedding(ContextualEmbedding): self.layers = layers self._get_outputs = self._get_layer_outputs self._embed_size = len(self.layers) * self.model.config['lstm']['projection_dim'] * 2 - + self.requires_grad = requires_grad - + def _get_mixed_outputs(self, outputs): # outputs: num_layers x batch_size x max_len x hidden_size # return: batch_size x max_len x hidden_size weights = F.softmax(self.layer_weights + 1 / len(outputs), dim=0).to(outputs) outputs = torch.einsum('l,lbij->bij', weights, outputs) return self.gamma.to(outputs) * outputs - + def set_mix_weights_requires_grad(self, flag=True): """ 当初始化ElmoEmbedding时layers被设置为mix时,可以通过调用该方法设置mix weights是否可训练。如果layers不是mix,调用 @@ -100,15 +107,15 @@ class ElmoEmbedding(ContextualEmbedding): if hasattr(self, 'layer_weights'): self.layer_weights.requires_grad = flag self.gamma.requires_grad = flag - + def _get_layer_outputs(self, outputs): if len(self.layers) == 1: outputs = outputs[self.layers[0]] else: outputs = torch.cat(tuple([*outputs[self.layers]]), dim=-1) - + return outputs - + def forward(self, words: torch.LongTensor): """ 计算words的elmo embedding表示。根据elmo文章中介绍的ELMO实际上是有2L+1层结果,但是为了让结果比较容易拆分,token的 @@ -125,12 +132,12 @@ class ElmoEmbedding(ContextualEmbedding): outputs = self.model(words) outputs = self._get_outputs(outputs) return self.dropout(outputs) - + def _delete_model_weights(self): for name in ['layers', 'model', 'layer_weights', 'gamma']: if hasattr(self, name): delattr(self, name) - + @property def requires_grad(self): """ @@ -144,7 +151,7 @@ class ElmoEmbedding(ContextualEmbedding): return requires_grads.pop() else: return None - + @requires_grad.setter def requires_grad(self, value): for name, param in self.named_parameters(): @@ -162,7 +169,7 @@ class _ElmoModel(nn.Module): (4) 设计一个保存token的embedding,允许缓存word的表示。 """ - + def __init__(self, model_dir: str, vocab: Vocabulary = None, cache_word_reprs: bool = False): super(_ElmoModel, self).__init__() self.model_dir = model_dir @@ -187,14 +194,14 @@ class _ElmoModel(nn.Module): config = json.load(config_f) self.weight_file = os.path.join(model_dir, weight_file) self.config = config - + OOV_TAG = '' PAD_TAG = '' BOS_TAG = '' EOS_TAG = '' BOW_TAG = '' EOW_TAG = '' - + # For the model trained with character-based word encoder. 
char_lexicon = {} with codecs.open(os.path.join(model_dir, 'char.dic'), 'r', encoding='utf-8') as fpi: @@ -204,29 +211,29 @@ class _ElmoModel(nn.Module): tokens.insert(0, '\u3000') token, i = tokens char_lexicon[token] = int(i) - + # 做一些sanity check for special_word in [PAD_TAG, OOV_TAG, BOW_TAG, EOW_TAG]: assert special_word in char_lexicon, f"{special_word} not found in char.dic." - + # 从vocab中构建char_vocab char_vocab = Vocabulary(unknown=OOV_TAG, padding=PAD_TAG) # 需要保证在里面 char_vocab.add_word_lst([BOW_TAG, EOW_TAG, BOS_TAG, EOS_TAG]) - + for word, index in vocab: char_vocab.add_word_lst(list(word)) - + self.bos_index, self.eos_index, self._pad_index = len(vocab), len(vocab) + 1, vocab.padding_idx # 根据char_lexicon调整, 多设置一位,是预留给word padding的(该位置的char表示为全0表示) char_emb_layer = nn.Embedding(len(char_vocab) + 1, int(config['char_cnn']['embedding']['dim']), padding_idx=len(char_vocab)) - + # 读入预训练权重 这里的elmo_model 包含char_cnn和 lstm 的 state_dict elmo_model = torch.load(os.path.join(self.model_dir, weight_file), map_location='cpu') - + char_embed_weights = elmo_model["char_cnn"]['char_emb_layer.weight'] - + found_char_count = 0 for char, index in char_vocab: # 调整character embedding if char in char_lexicon: @@ -235,11 +242,11 @@ class _ElmoModel(nn.Module): else: index_in_pre = char_lexicon[OOV_TAG] char_emb_layer.weight.data[index] = char_embed_weights[index_in_pre] - + print(f"{found_char_count} out of {len(char_vocab)} characters were found in pretrained elmo embedding.") # 生成words到chars的映射 max_chars = config['char_cnn']['max_characters_per_token'] - + self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab) + 2, max_chars), fill_value=len(char_vocab), dtype=torch.long), @@ -258,20 +265,20 @@ class _ElmoModel(nn.Module): char_vocab.to_index(EOW_TAG)] char_ids += [char_vocab.to_index(PAD_TAG)] * (max_chars - len(char_ids)) self.words_to_chars_embedding[index] = torch.LongTensor(char_ids) - + self.char_vocab = char_vocab - + self.token_embedder = ConvTokenEmbedder( config, self.weight_file, None, char_emb_layer) elmo_model["char_cnn"]['char_emb_layer.weight'] = char_emb_layer.weight self.token_embedder.load_state_dict(elmo_model["char_cnn"]) - + self.output_dim = config['lstm']['projection_dim'] - + # lstm encoder self.encoder = ElmobiLm(config) self.encoder.load_state_dict(elmo_model["lstm"]) - + if cache_word_reprs: if config['char_cnn']['embedding']['dim'] > 0: # 只有在使用了chars的情况下有用 print("Start to generate cache word representations.") @@ -280,7 +287,7 @@ class _ElmoModel(nn.Module): word_size = self.words_to_chars_embedding.size(0) num_batches = word_size // batch_size + \ int(word_size % batch_size != 0) - + self.cached_word_embedding = nn.Embedding(word_size, config['lstm']['projection_dim']) with torch.no_grad(): @@ -291,12 +298,12 @@ class _ElmoModel(nn.Module): word_reprs = self.token_embedder(words.unsqueeze(1), chars).detach() # batch_size x 1 x config['encoder']['projection_dim'] self.cached_word_embedding.weight.data[words] = word_reprs.squeeze(1) - + print("Finish generating cached word representations. 
Going to delete the character encoder.") del self.token_embedder, self.words_to_chars_embedding else: print("There is no need to cache word representations, since no character information is used.") - + def forward(self, words): """ @@ -321,7 +328,7 @@ class _ElmoModel(nn.Module): else: chars = None token_embedding = self.token_embedder(expanded_words, chars) # batch_size x max_len x embed_dim - + encoder_output = self.encoder(token_embedding, seq_len) if encoder_output.size(2) < max_len + 2: num_layers, _, output_len, hidden_size = encoder_output.size() @@ -332,7 +339,7 @@ class _ElmoModel(nn.Module): token_embedding = token_embedding.masked_fill(mask, 0) token_embedding = torch.cat((token_embedding, token_embedding), dim=2).view(1, sz[1], sz[2], sz[3]) encoder_output = torch.cat((token_embedding, encoder_output), dim=0) - + # 删除, . 这里没有精确地删除,但应该也不会影响最后的结果了。 encoder_output = encoder_output[:, :, 1:-1] return encoder_output diff --git a/fastNLP/embeddings/embedding.py b/fastNLP/embeddings/embedding.py index 8b746c0d..7ac841ce 100644 --- a/fastNLP/embeddings/embedding.py +++ b/fastNLP/embeddings/embedding.py @@ -3,6 +3,10 @@ """ +__all__ = [ + "Embedding", + "TokenEmbedding" +] import torch.nn as nn from abc import abstractmethod @@ -33,11 +37,11 @@ class Embedding(nn.Module): :param float dropout: 对Embedding的输出的dropout。 :param int unk_index: drop word时替换为的index。fastNLP的Vocabulary的unk_index默认为1。 """ - + def __init__(self, init_embed, word_dropout=0, dropout=0.0, unk_index=None): - + super(Embedding, self).__init__() - + self.embed = get_embeddings(init_embed) self.dropout = nn.Dropout(dropout) @@ -48,44 +52,44 @@ class Embedding(nn.Module): self._embed_size = self.embed.embedding_dim else: self._embed_size = self.embed.weight.size(1) - if word_dropout>0 and not isinstance(unk_index, int): + if word_dropout > 0 and not isinstance(unk_index, int): raise ValueError("When drop word is set, you need to pass in the unk_index.") else: self._embed_size = self.embed.embed_size unk_index = self.embed.get_word_vocab().unknown_idx self.unk_index = unk_index self.word_dropout = word_dropout - + def forward(self, words): """ :param torch.LongTensor words: [batch, seq_len] :return: torch.Tensor : [batch, seq_len, embed_dim] """ - if self.word_dropout>0 and self.training: + if self.word_dropout > 0 and self.training: mask = torch.ones_like(words).float() * self.word_dropout mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1 words = words.masked_fill(mask, self.unk_index) words = self.embed(words) return self.dropout(words) - + @property - def num_embedding(self)->int: + def num_embedding(self) -> int: if isinstance(self.embed, nn.Embedding): return self.embed.weight.size(0) else: return self.embed.num_embedding - + def __len__(self): return len(self.embed) - + @property def embed_size(self) -> int: return self._embed_size - + @property def embedding_dim(self) -> int: return self._embed_size - + @property def requires_grad(self): """ @@ -96,14 +100,14 @@ class Embedding(nn.Module): return self.embed.weight.requires_grad else: return self.embed.requires_grad - + @requires_grad.setter def requires_grad(self, value): if not isinstance(self.embed, TokenEmbedding): self.embed.weight.requires_grad = value else: self.embed.requires_grad = value - + @property def size(self): if isinstance(self.embed, TokenEmbedding): @@ -120,12 +124,12 @@ class TokenEmbedding(nn.Module): assert vocab.padding is not None, "Vocabulary must have a padding entry." 
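        # word_dropout randomly replaces token ids with the unknown index at training time (see drop_word
        # below), which is why a vocabulary with an unknown entry is additionally required when word_dropout > 0.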
self._word_vocab = vocab self._word_pad_index = vocab.padding_idx - if word_dropout>0: + if word_dropout > 0: assert vocab.unknown is not None, "Vocabulary must have unknown entry when you want to drop a word." self.word_dropout = word_dropout self._word_unk_index = vocab.unknown_idx self.dropout_layer = nn.Dropout(dropout) - + def drop_word(self, words): """ 按照设定随机将words设置为unknown_index。 @@ -138,7 +142,7 @@ class TokenEmbedding(nn.Module): mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1 words = words.masked_fill(mask, self._word_unk_index) return words - + def dropout(self, words): """ 对embedding后的word表示进行drop。 @@ -147,7 +151,7 @@ class TokenEmbedding(nn.Module): :return: """ return self.dropout_layer(words) - + @property def requires_grad(self): """ @@ -159,23 +163,23 @@ class TokenEmbedding(nn.Module): return requires_grads.pop() else: return None - + @requires_grad.setter def requires_grad(self, value): for param in self.parameters(): param.requires_grad = value - + def __len__(self): return len(self._word_vocab) - + @property def embed_size(self) -> int: return self._embed_size - + @property def embedding_dim(self) -> int: return self._embed_size - + @property def num_embedding(self) -> int: """ @@ -183,7 +187,7 @@ class TokenEmbedding(nn.Module): :return: """ return len(self._word_vocab) - + def get_word_vocab(self): """ 返回embedding的词典。 @@ -191,11 +195,11 @@ class TokenEmbedding(nn.Module): :return: Vocabulary """ return self._word_vocab - + @property def size(self): return torch.Size(self.num_embedding, self._embed_size) - + @abstractmethod def forward(self, words): raise NotImplementedError diff --git a/fastNLP/embeddings/stack_embedding.py b/fastNLP/embeddings/stack_embedding.py index d3ce462b..14781945 100644 --- a/fastNLP/embeddings/stack_embedding.py +++ b/fastNLP/embeddings/stack_embedding.py @@ -1,3 +1,12 @@ +""" +.. todo:: + doc +""" + +__all__ = [ + "StackEmbedding", +] + from typing import List import torch @@ -26,6 +35,7 @@ class StackEmbedding(TokenEmbedding): :param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 """ + def __init__(self, embeds: List[TokenEmbedding], word_dropout=0, dropout=0): vocabs = [] for embed in embeds: @@ -34,14 +44,14 @@ class StackEmbedding(TokenEmbedding): _vocab = vocabs[0] for vocab in vocabs[1:]: assert vocab == _vocab, "All embeddings in StackEmbedding should use the same word vocabulary." - + super(StackEmbedding, self).__init__(_vocab, word_dropout=word_dropout, dropout=dropout) assert isinstance(embeds, list) for embed in embeds: assert isinstance(embed, TokenEmbedding), "Only TokenEmbedding type is supported." 
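        # Illustrative sketch (assuming `vocab` is the shared word-level Vocabulary and `words` a LongTensor
        # of word ids): the stacked output concatenates the member embeddings, so its size is the sum of
        # their embed_size values, e.g.
        #     embed = StackEmbedding([StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=50),
        #                             CNNCharEmbedding(vocab, embed_size=30)])
        #     embed(words).size(-1)  # -> 50 + 30 = 80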
self.embeds = nn.ModuleList(embeds) self._embed_size = sum([embed.embed_size for embed in self.embeds]) - + def append(self, embed: TokenEmbedding): """ 添加一个embedding到结尾。 @@ -50,18 +60,18 @@ class StackEmbedding(TokenEmbedding): """ assert isinstance(embed, TokenEmbedding) self.embeds.append(embed) - + def pop(self): """ 弹出最后一个embed :return: """ return self.embeds.pop() - + @property def embed_size(self): return self._embed_size - + @property def requires_grad(self): """ @@ -73,12 +83,12 @@ class StackEmbedding(TokenEmbedding): return requires_grads.pop() else: return None - + @requires_grad.setter def requires_grad(self, value): for embed in self.embeds(): embed.requires_grad = value - + def forward(self, words): """ 得到多个embedding的结果,并把结果按照顺序concat起来。 diff --git a/fastNLP/embeddings/static_embedding.py b/fastNLP/embeddings/static_embedding.py index a75ad18f..1c66e52b 100644 --- a/fastNLP/embeddings/static_embedding.py +++ b/fastNLP/embeddings/static_embedding.py @@ -1,4 +1,11 @@ +""" +.. todo:: + doc +""" +__all__ = [ + "StaticEmbedding" +] import os import torch @@ -13,6 +20,7 @@ from ..modules.utils import _get_file_name_base_on_postfix from copy import deepcopy from collections import defaultdict + class StaticEmbedding(TokenEmbedding): """ 别名::class:`fastNLP.embeddings.StaticEmbedding` :class:`fastNLP.embeddings.static_embedding.StaticEmbedding` @@ -55,15 +63,16 @@ class StaticEmbedding(TokenEmbedding): :param bool normalize: 是否对vector进行normalize,使得每个vector的norm为1。 :param int min_freq: Vocabulary词频数小于这个数量的word将被指向unk。 """ - def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', embedding_dim=-1, requires_grad: bool=True, + + def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en', embedding_dim=-1, requires_grad: bool = True, init_method=None, lower=False, dropout=0, word_dropout=0, normalize=False, min_freq=1, **kwargs): super(StaticEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) - if embedding_dim>0: + if embedding_dim > 0: model_dir_or_name = None - + # 得到cache_path if model_dir_or_name is None: - assert embedding_dim>=1, "The dimension of embedding should be larger than 1." + assert embedding_dim >= 1, "The dimension of embedding should be larger than 1." 
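            # e.g. StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=100) builds a randomly
            # initialized 100-dim embedding, whereas a registered name such as 'en-glove-6b-50' (or a local
            # directory containing a .txt vector file) loads pretrained vectors instead.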
embedding_dim = int(embedding_dim) model_path = None elif model_dir_or_name.lower() in PRETRAIN_STATIC_FILES: @@ -76,9 +85,9 @@ class StaticEmbedding(TokenEmbedding): model_path = _get_file_name_base_on_postfix(os.path.abspath(os.path.expanduser(model_dir_or_name)), '.txt') else: raise ValueError(f"Cannot recognize {model_dir_or_name}.") - + # 根据min_freq缩小vocab - truncate_vocab = (vocab.min_freq is None and min_freq>1) or (vocab.min_freq and vocab.min_freq 1) or (vocab.min_freq and vocab.min_freq < min_freq) if truncate_vocab: truncated_vocab = deepcopy(vocab) truncated_vocab.min_freq = min_freq @@ -89,14 +98,14 @@ class StaticEmbedding(TokenEmbedding): lowered_word_count[word.lower()] += count for word in truncated_vocab.word_count.keys(): word_count = truncated_vocab.word_count[word] - if lowered_word_count[word.lower()]>=min_freq and word_count= min_freq and word_count < min_freq: + truncated_vocab.add_word_lst([word] * (min_freq - word_count), no_create_entry=truncated_vocab._is_word_no_create_entry(word)) - + # 只限制在train里面的词语使用min_freq筛选 if kwargs.get('only_train_min_freq', False) and model_dir_or_name is not None: for word in truncated_vocab.word_count.keys(): - if truncated_vocab._is_word_no_create_entry(word) and truncated_vocab.word_count[word] Date: Sun, 25 Aug 2019 16:57:47 +0800 Subject: [PATCH 094/153] delete predictor.py --- fastNLP/core/predictor.py | 79 ------------------------------------- test/core/test_predictor.py | 48 ---------------------- 2 files changed, 127 deletions(-) delete mode 100644 fastNLP/core/predictor.py delete mode 100644 test/core/test_predictor.py diff --git a/fastNLP/core/predictor.py b/fastNLP/core/predictor.py deleted file mode 100644 index 2d6a7380..00000000 --- a/fastNLP/core/predictor.py +++ /dev/null @@ -1,79 +0,0 @@ -""" - ..todo:: - 检查这个类是否需要 -""" -from collections import defaultdict - -import torch - -from . import DataSetIter -from . import DataSet -from . import SequentialSampler -from .utils import _build_args, _move_dict_value_to_device, _get_model_device - - -class Predictor(object): - """ - 一个根据训练模型预测输出的预测器(Predictor) - - 与测试器(Tester)不同的是,predictor不关心模型性能的评价指标,只做inference。 - 这是一个fastNLP调用的高级模型包装器。它与Trainer、Tester不共享任何操作。 - - :param torch.nn.Module network: 用来完成预测任务的模型 - """ - - def __init__(self, network): - if not isinstance(network, torch.nn.Module): - raise ValueError( - "Only fastNLP.models.BaseModel or torch.nn,Module is allowed, not {}".format(type(network))) - self.network = network - self.batch_size = 1 - self.batch_output = [] - - def predict(self, data: DataSet, seq_len_field_name=None): - """用已经训练好的模型进行inference. 
- - :param fastNLP.DataSet data: 待预测的数据集 - :param str seq_len_field_name: 表示序列长度信息的field名字 - :return: dict dict里面的内容为模型预测的结果 - """ - if not isinstance(data, DataSet): - raise ValueError("Only Dataset class is allowed, not {}.".format(type(data))) - if seq_len_field_name is not None and seq_len_field_name not in data.field_arrays: - raise ValueError("Field name {} not found in DataSet {}.".format(seq_len_field_name, data)) - - prev_training = self.network.training - self.network.eval() - network_device = _get_model_device(self.network) - batch_output = defaultdict(list) - data_iterator = DataSetIter(data, batch_size=self.batch_size, sampler=SequentialSampler(), as_numpy=False) - - if hasattr(self.network, "predict"): - predict_func = self.network.predict - else: - predict_func = self.network.forward - - with torch.no_grad(): - for batch_x, _ in data_iterator: - _move_dict_value_to_device(batch_x, _, device=network_device) - refined_batch_x = _build_args(predict_func, **batch_x) - prediction = predict_func(**refined_batch_x) - - if seq_len_field_name is not None: - seq_lens = batch_x[seq_len_field_name].tolist() - - for key, value in prediction.items(): - value = value.cpu().numpy() - if len(value.shape) == 1 or (len(value.shape) == 2 and value.shape[1] == 1): - batch_output[key].extend(value.tolist()) - else: - if seq_len_field_name is not None: - tmp_batch = [] - for idx, seq_len in enumerate(seq_lens): - tmp_batch.append(value[idx, :seq_len]) - batch_output[key].extend(tmp_batch) - else: - batch_output[key].append(value) - - self.network.train(prev_training) - return batch_output diff --git a/test/core/test_predictor.py b/test/core/test_predictor.py deleted file mode 100644 index 701353dc..00000000 --- a/test/core/test_predictor.py +++ /dev/null @@ -1,48 +0,0 @@ -import unittest -from collections import defaultdict - -import numpy as np -import torch - -from fastNLP.core.dataset import DataSet -from fastNLP.core.instance import Instance -from fastNLP.core.predictor import Predictor - - -def prepare_fake_dataset(): - mean = np.array([-3, -3]) - cov = np.array([[1, 0], [0, 1]]) - class_A = np.random.multivariate_normal(mean, cov, size=(1000,)) - - mean = np.array([3, 3]) - cov = np.array([[1, 0], [0, 1]]) - class_B = np.random.multivariate_normal(mean, cov, size=(1000,)) - - data_set = DataSet([Instance(x=[float(item[0]), float(item[1])], y=[0.0]) for item in class_A] + - [Instance(x=[float(item[0]), float(item[1])], y=[1.0]) for item in class_B]) - return data_set - - -class LinearModel(torch.nn.Module): - def __init__(self): - super(LinearModel, self).__init__() - self.linear = torch.nn.Linear(2, 1) - - def forward(self, x): - return {"predict": self.linear(x)} - - -class TestPredictor(unittest.TestCase): - def test_simple(self): - model = LinearModel() - predictor = Predictor(model) - data = prepare_fake_dataset() - data.set_input("x") - ans = predictor.predict(data) - self.assertTrue(isinstance(ans, defaultdict)) - self.assertTrue("predict" in ans) - self.assertTrue(isinstance(ans["predict"], list)) - - def test_sequence(self): - # test sequence input/output - pass From 65a6fd3dc721508f40dab11aac1d0ffac9781eee Mon Sep 17 00:00:00 2001 From: xuyige Date: Sun, 25 Aug 2019 17:48:01 +0800 Subject: [PATCH 095/153] Revert "delete predictor.py" This reverts commit 8445bdbc793c69e998efd9381229820ae9a5ba9d. 
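The restored Predictor wraps a trained model and runs pure inference over a DataSet, with no metric computation. A minimal usage sketch, assuming `model` is a trained torch.nn.Module whose forward()/predict() returns a dict and `data` is a DataSet prepared elsewhere (the "x"/"predict" names mirror the restored test case):

    from fastNLP.core.predictor import Predictor

    predictor = Predictor(model)
    data.set_input("x")                  # only fields marked as input are fed to the model
    outputs = predictor.predict(data)    # dict of lists, keyed by the names in the model's output dict
    print(outputs["predict"][:3])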
--- fastNLP/core/predictor.py | 79 +++++++++++++++++++++++++++++++++++++ test/core/test_predictor.py | 48 ++++++++++++++++++++++ 2 files changed, 127 insertions(+) create mode 100644 fastNLP/core/predictor.py create mode 100644 test/core/test_predictor.py diff --git a/fastNLP/core/predictor.py b/fastNLP/core/predictor.py new file mode 100644 index 00000000..2d6a7380 --- /dev/null +++ b/fastNLP/core/predictor.py @@ -0,0 +1,79 @@ +""" + ..todo:: + 检查这个类是否需要 +""" +from collections import defaultdict + +import torch + +from . import DataSetIter +from . import DataSet +from . import SequentialSampler +from .utils import _build_args, _move_dict_value_to_device, _get_model_device + + +class Predictor(object): + """ + 一个根据训练模型预测输出的预测器(Predictor) + + 与测试器(Tester)不同的是,predictor不关心模型性能的评价指标,只做inference。 + 这是一个fastNLP调用的高级模型包装器。它与Trainer、Tester不共享任何操作。 + + :param torch.nn.Module network: 用来完成预测任务的模型 + """ + + def __init__(self, network): + if not isinstance(network, torch.nn.Module): + raise ValueError( + "Only fastNLP.models.BaseModel or torch.nn,Module is allowed, not {}".format(type(network))) + self.network = network + self.batch_size = 1 + self.batch_output = [] + + def predict(self, data: DataSet, seq_len_field_name=None): + """用已经训练好的模型进行inference. + + :param fastNLP.DataSet data: 待预测的数据集 + :param str seq_len_field_name: 表示序列长度信息的field名字 + :return: dict dict里面的内容为模型预测的结果 + """ + if not isinstance(data, DataSet): + raise ValueError("Only Dataset class is allowed, not {}.".format(type(data))) + if seq_len_field_name is not None and seq_len_field_name not in data.field_arrays: + raise ValueError("Field name {} not found in DataSet {}.".format(seq_len_field_name, data)) + + prev_training = self.network.training + self.network.eval() + network_device = _get_model_device(self.network) + batch_output = defaultdict(list) + data_iterator = DataSetIter(data, batch_size=self.batch_size, sampler=SequentialSampler(), as_numpy=False) + + if hasattr(self.network, "predict"): + predict_func = self.network.predict + else: + predict_func = self.network.forward + + with torch.no_grad(): + for batch_x, _ in data_iterator: + _move_dict_value_to_device(batch_x, _, device=network_device) + refined_batch_x = _build_args(predict_func, **batch_x) + prediction = predict_func(**refined_batch_x) + + if seq_len_field_name is not None: + seq_lens = batch_x[seq_len_field_name].tolist() + + for key, value in prediction.items(): + value = value.cpu().numpy() + if len(value.shape) == 1 or (len(value.shape) == 2 and value.shape[1] == 1): + batch_output[key].extend(value.tolist()) + else: + if seq_len_field_name is not None: + tmp_batch = [] + for idx, seq_len in enumerate(seq_lens): + tmp_batch.append(value[idx, :seq_len]) + batch_output[key].extend(tmp_batch) + else: + batch_output[key].append(value) + + self.network.train(prev_training) + return batch_output diff --git a/test/core/test_predictor.py b/test/core/test_predictor.py new file mode 100644 index 00000000..701353dc --- /dev/null +++ b/test/core/test_predictor.py @@ -0,0 +1,48 @@ +import unittest +from collections import defaultdict + +import numpy as np +import torch + +from fastNLP.core.dataset import DataSet +from fastNLP.core.instance import Instance +from fastNLP.core.predictor import Predictor + + +def prepare_fake_dataset(): + mean = np.array([-3, -3]) + cov = np.array([[1, 0], [0, 1]]) + class_A = np.random.multivariate_normal(mean, cov, size=(1000,)) + + mean = np.array([3, 3]) + cov = np.array([[1, 0], [0, 1]]) + class_B = np.random.multivariate_normal(mean, 
cov, size=(1000,)) + + data_set = DataSet([Instance(x=[float(item[0]), float(item[1])], y=[0.0]) for item in class_A] + + [Instance(x=[float(item[0]), float(item[1])], y=[1.0]) for item in class_B]) + return data_set + + +class LinearModel(torch.nn.Module): + def __init__(self): + super(LinearModel, self).__init__() + self.linear = torch.nn.Linear(2, 1) + + def forward(self, x): + return {"predict": self.linear(x)} + + +class TestPredictor(unittest.TestCase): + def test_simple(self): + model = LinearModel() + predictor = Predictor(model) + data = prepare_fake_dataset() + data.set_input("x") + ans = predictor.predict(data) + self.assertTrue(isinstance(ans, defaultdict)) + self.assertTrue("predict" in ans) + self.assertTrue(isinstance(ans["predict"], list)) + + def test_sequence(self): + # test sequence input/output + pass From 74934271dc77e53a3deb0e7efc85f401f5d1f349 Mon Sep 17 00:00:00 2001 From: yh_cc Date: Sun, 25 Aug 2019 18:20:58 +0800 Subject: [PATCH 096/153] =?UTF-8?q?1.=E5=A2=9E=E5=8A=A0sequence=20labellin?= =?UTF-8?q?g=E4=B8=ADbert=20ner;=202.=E5=B0=86print=E6=9B=BF=E6=8D=A2?= =?UTF-8?q?=E4=B8=BAlogger?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/batch.py | 4 +- fastNLP/core/callback.py | 18 +-- fastNLP/core/dataset.py | 11 +- fastNLP/core/dist_trainer.py | 1 - fastNLP/core/field.py | 19 +-- fastNLP/core/tester.py | 2 +- fastNLP/core/utils.py | 4 +- fastNLP/core/vocabulary.py | 7 +- fastNLP/embeddings/bert_embedding.py | 16 +- fastNLP/embeddings/char_embedding.py | 9 +- fastNLP/embeddings/contextual_embedding.py | 10 +- fastNLP/embeddings/elmo_embedding.py | 10 +- fastNLP/embeddings/embedding.py | 4 +- fastNLP/embeddings/static_embedding.py | 9 +- fastNLP/io/embed_loader.py | 8 +- fastNLP/io/file_reader.py | 9 +- fastNLP/io/file_utils.py | 13 +- fastNLP/io/pipe/classification.py | 2 +- fastNLP/io/utils.py | 6 +- fastNLP/modules/encoder/bert.py | 15 +- .../ner/data/Conll2003Loader.py | 93 ----------- .../ner/data/OntoNoteLoader.py | 152 ------------------ .../seqence_labelling/ner/data/utils.py | 49 ------ .../seqence_labelling/ner/model/bert_crf.py | 31 ++++ .../seqence_labelling/ner/test/__init__.py | 0 .../seqence_labelling/ner/test/test.py | 33 ---- .../seqence_labelling/ner/train_bert.py | 52 ++++++ .../seqence_labelling/ner/train_idcnn.py | 22 +-- 28 files changed, 182 insertions(+), 427 deletions(-) delete mode 100644 reproduction/seqence_labelling/ner/data/Conll2003Loader.py delete mode 100644 reproduction/seqence_labelling/ner/data/OntoNoteLoader.py delete mode 100644 reproduction/seqence_labelling/ner/data/utils.py create mode 100644 reproduction/seqence_labelling/ner/model/bert_crf.py delete mode 100644 reproduction/seqence_labelling/ner/test/__init__.py delete mode 100644 reproduction/seqence_labelling/ner/test/test.py create mode 100644 reproduction/seqence_labelling/ner/train_bert.py diff --git a/fastNLP/core/batch.py b/fastNLP/core/batch.py index 8d97783e..ff710b30 100644 --- a/fastNLP/core/batch.py +++ b/fastNLP/core/batch.py @@ -17,7 +17,7 @@ from numbers import Number from .sampler import SequentialSampler from .dataset import DataSet - +from ._logger import logger _python_is_exit = False @@ -75,7 +75,7 @@ class DataSetGetter: try: data, flag = _to_tensor(data, f.dtype) except TypeError as e: - print(f"Field {n} cannot be converted to torch.tensor.") + logger.error(f"Field {n} cannot be converted to torch.tensor.") raise e batch_dict[n] = data return batch_dict diff --git a/fastNLP/core/callback.py 
b/fastNLP/core/callback.py index 24b42b6e..2c130061 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -83,7 +83,6 @@ try: except: tensorboardX_flag = False -from ..io.model_io import ModelSaver, ModelLoader from .dataset import DataSet from .tester import Tester from ._logger import logger @@ -505,7 +504,7 @@ class EarlyStopCallback(Callback): def on_exception(self, exception): if isinstance(exception, EarlyStopError): - print("Early Stopping triggered in epoch {}!".format(self.epoch)) + logger.info("Early Stopping triggered in epoch {}!".format(self.epoch)) else: raise exception # 抛出陌生Error @@ -752,8 +751,7 @@ class LRFinder(Callback): self.smooth_value = SmoothValue(0.8) self.opt = None self.find = None - self.loader = ModelLoader() - + @property def lr_gen(self): scale = (self.end_lr - self.start_lr) / self.batch_per_epoch @@ -768,7 +766,7 @@ class LRFinder(Callback): self.opt = self.trainer.optimizer # pytorch optimizer self.opt.param_groups[0]["lr"] = self.start_lr # save model - ModelSaver("tmp").save_pytorch(self.trainer.model, param_only=True) + torch.save(self.model.state_dict(), 'tmp') self.find = True def on_backward_begin(self, loss): @@ -797,7 +795,9 @@ class LRFinder(Callback): self.opt.param_groups[0]["lr"] = self.best_lr self.find = False # reset model - ModelLoader().load_pytorch(self.trainer.model, "tmp") + states = torch.load('tmp') + self.model.load_state_dict(states) + os.remove('tmp') self.pbar.write("Model reset. \nFind best lr={}".format(self.best_lr)) @@ -988,14 +988,14 @@ class SaveModelCallback(Callback): try: _save_model(self.model, model_name=name, save_dir=self.save_dir, only_param=self.only_param) except Exception as e: - print(f"The following exception:{e} happens when save model to {self.save_dir}.") + logger.error(f"The following exception:{e} happens when save model to {self.save_dir}.") if delete_pair: try: delete_model_path = os.path.join(self.save_dir, delete_pair[1]) if os.path.exists(delete_model_path): os.remove(delete_model_path) except Exception as e: - print(f"Fail to delete model {name} at {self.save_dir} caused by exception:{e}.") + logger.error(f"Fail to delete model {name} at {self.save_dir} caused by exception:{e}.") def on_exception(self, exception): if self.save_on_exception: @@ -1032,7 +1032,7 @@ class EchoCallback(Callback): def __getattribute__(self, item): if item.startswith('on_'): - print('{}.{} has been called at pid: {}'.format(self.name, item, os.getpid()), + logger.info('{}.{} has been called at pid: {}'.format(self.name, item, os.getpid()), file=self.out) return super(EchoCallback, self).__getattribute__(item) diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 4c689842..51bcef43 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -300,6 +300,7 @@ from .utils import _get_func_signature from .field import AppendToTargetOrInputException from .field import SetInputOrTargetException from .const import Const +from ._logger import logger class DataSet(object): """ @@ -452,7 +453,7 @@ class DataSet(object): try: self.field_arrays[name].append(field) except AppendToTargetOrInputException as e: - print(f"Cannot append to field:{name}.") + logger.error(f"Cannot append to field:{name}.") raise e def add_fieldarray(self, field_name, fieldarray): @@ -609,7 +610,7 @@ class DataSet(object): self.field_arrays[name]._use_1st_ins_infer_dim_type = bool(use_1st_ins_infer_dim_type) self.field_arrays[name].is_target = flag except SetInputOrTargetException as e: - print(f"Cannot set field:{name} 
as target.") + logger.error(f"Cannot set field:{name} as target.") raise e else: raise KeyError("{} is not a valid field name.".format(name)) @@ -633,7 +634,7 @@ class DataSet(object): self.field_arrays[name]._use_1st_ins_infer_dim_type = bool(use_1st_ins_infer_dim_type) self.field_arrays[name].is_input = flag except SetInputOrTargetException as e: - print(f"Cannot set field:{name} as input, exception happens at the {e.index} value.") + logger.error(f"Cannot set field:{name} as input, exception happens at the {e.index} value.") raise e else: raise KeyError("{} is not a valid field name.".format(name)) @@ -728,7 +729,7 @@ class DataSet(object): results.append(func(ins[field_name])) except Exception as e: if idx != -1: - print("Exception happens at the `{}`th(from 1) instance.".format(idx+1)) + logger.error("Exception happens at the `{}`th(from 1) instance.".format(idx+1)) raise e if not (new_field_name is None) and len(list(filter(lambda x: x is not None, results))) == 0: # all None raise ValueError("{} always return None.".format(_get_func_signature(func=func))) @@ -795,7 +796,7 @@ class DataSet(object): results.append(func(ins)) except BaseException as e: if idx != -1: - print("Exception happens at the `{}`th instance.".format(idx)) + logger.error("Exception happens at the `{}`th instance.".format(idx)) raise e # results = [func(ins) for ins in self._inner_iter()] diff --git a/fastNLP/core/dist_trainer.py b/fastNLP/core/dist_trainer.py index 346539cd..7c64fee4 100644 --- a/fastNLP/core/dist_trainer.py +++ b/fastNLP/core/dist_trainer.py @@ -54,7 +54,6 @@ class DistTrainer(): num_workers=1, drop_last=False, dev_data=None, metrics=None, metric_key=None, update_every=1, print_every=10, validate_every=-1, - log_path=None, save_every=-1, save_path=None, device='auto', fp16='', backend=None, init_method=None): diff --git a/fastNLP/core/field.py b/fastNLP/core/field.py index 26d22ada..b3f024f8 100644 --- a/fastNLP/core/field.py +++ b/fastNLP/core/field.py @@ -12,6 +12,7 @@ from abc import abstractmethod from copy import deepcopy from collections import Counter from .utils import _is_iterable +from ._logger import logger class SetInputOrTargetException(Exception): @@ -39,7 +40,7 @@ class FieldArray: try: _content = list(_content) except BaseException as e: - print(f"Cannot convert content(of type:{type(content)}) into list.") + logger.error(f"Cannot convert content(of type:{type(content)}) into list.") raise e self.name = name self.content = _content @@ -263,7 +264,7 @@ class FieldArray: try: new_contents.append(cell.split(sep)) except Exception as e: - print(f"Exception happens when process value in index {index}.") + logger.error(f"Exception happens when process value in index {index}.") raise e return self._after_process(new_contents, inplace=inplace) @@ -283,8 +284,8 @@ class FieldArray: else: new_contents.append(int(cell)) except Exception as e: - print(f"Exception happens when process value in index {index}.") - print(e) + logger.error(f"Exception happens when process value in index {index}.") + raise e return self._after_process(new_contents, inplace=inplace) def float(self, inplace=True): @@ -303,7 +304,7 @@ class FieldArray: else: new_contents.append(float(cell)) except Exception as e: - print(f"Exception happens when process value in index {index}.") + logger.error(f"Exception happens when process value in index {index}.") raise e return self._after_process(new_contents, inplace=inplace) @@ -323,7 +324,7 @@ class FieldArray: else: new_contents.append(bool(cell)) except Exception as e: - 
print(f"Exception happens when process value in index {index}.") + logger.error(f"Exception happens when process value in index {index}.") raise e return self._after_process(new_contents, inplace=inplace) @@ -344,7 +345,7 @@ class FieldArray: else: new_contents.append(cell.lower()) except Exception as e: - print(f"Exception happens when process value in index {index}.") + logger.error(f"Exception happens when process value in index {index}.") raise e return self._after_process(new_contents, inplace=inplace) @@ -364,7 +365,7 @@ class FieldArray: else: new_contents.append(cell.upper()) except Exception as e: - print(f"Exception happens when process value in index {index}.") + logger.error(f"Exception happens when process value in index {index}.") raise e return self._after_process(new_contents, inplace=inplace) @@ -401,7 +402,7 @@ class FieldArray: self.is_input = self.is_input self.is_target = self.is_input except SetInputOrTargetException as e: - print("The newly generated field cannot be set as input or target.") + logger.error("The newly generated field cannot be set as input or target.") raise e return self else: diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index b339f671..e549df81 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -192,7 +192,7 @@ class Tester(object): dataset=self.data, check_level=0) if self.verbose >= 1: - print("[tester] \n{}".format(self._format_eval_results(eval_results))) + logger.info("[tester] \n{}".format(self._format_eval_results(eval_results))) self._mode(network, is_test=False) return eval_results diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index a023c29e..fcb2a07b 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -145,7 +145,7 @@ def cache_results(_cache_fp, _refresh=False, _verbose=1): with open(cache_filepath, 'rb') as f: results = _pickle.load(f) if verbose == 1: - print("Read cache from {}.".format(cache_filepath)) + logger.info("Read cache from {}.".format(cache_filepath)) refresh_flag = False if refresh_flag: @@ -156,7 +156,7 @@ def cache_results(_cache_fp, _refresh=False, _verbose=1): _prepare_cache_filepath(cache_filepath) with open(cache_filepath, 'wb') as f: _pickle.dump(results, f) - print("Save cache to {}.".format(cache_filepath)) + logger.info("Save cache to {}.".format(cache_filepath)) return results diff --git a/fastNLP/core/vocabulary.py b/fastNLP/core/vocabulary.py index 330d73dd..92f54f9a 100644 --- a/fastNLP/core/vocabulary.py +++ b/fastNLP/core/vocabulary.py @@ -10,6 +10,7 @@ from .utils import Option from functools import partial import numpy as np from .utils import _is_iterable +from ._logger import logger class VocabularyOption(Option): def __init__(self, @@ -49,7 +50,7 @@ def _check_build_status(func): if self.rebuild is False: self.rebuild = True if self.max_size is not None and len(self.word_count) >= self.max_size: - print("[Warning] Vocabulary has reached the max size {} when calling {} method. " + logger.info("[Warning] Vocabulary has reached the max size {} when calling {} method. " "Adding more words may cause unexpected behaviour of Vocabulary. 
".format( self.max_size, func.__name__)) return func(self, *args, **kwargs) @@ -297,7 +298,7 @@ class Vocabulary(object): for f_n, n_f_n in zip(field_name, new_field_name): dataset.apply_field(index_instance, field_name=f_n, new_field_name=n_f_n) except Exception as e: - print("When processing the `{}` dataset, the following error occurred.".format(idx)) + logger.info("When processing the `{}` dataset, the following error occurred.".format(idx)) raise e else: raise RuntimeError("Only DataSet type is allowed.") @@ -353,7 +354,7 @@ class Vocabulary(object): try: dataset.apply(construct_vocab) except BaseException as e: - print("When processing the `{}` dataset, the following error occurred:".format(idx)) + log("When processing the `{}` dataset, the following error occurred:".format(idx)) raise e else: raise TypeError("Only DataSet type is allowed.") diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index e8844aa1..4bd06ec3 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -21,6 +21,7 @@ from ..io.file_utils import _get_embedding_url, cached_path, PRETRAINED_BERT_MOD from ..modules.encoder.bert import _WordPieceBertModel, BertModel, BertTokenizer from .contextual_embedding import ContextualEmbedding import warnings +from ..core import logger class BertEmbedding(ContextualEmbedding): @@ -125,8 +126,10 @@ class BertEmbedding(ContextualEmbedding): with torch.no_grad(): if self._word_sep_index: # 不能drop sep sep_mask = words.eq(self._word_sep_index) - mask = torch.ones_like(words).float() * self.word_dropout + mask = torch.full_like(words, fill_value=self.word_dropout) mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1 + pad_mask = words.ne(0) + mask = pad_mask.__and__(mask) # pad的位置不为unk words = words.masked_fill(mask, self._word_unk_index) if self._word_sep_index: words.masked_fill_(sep_mask, self._word_sep_index) @@ -182,6 +185,7 @@ class BertWordPieceEncoder(nn.Module): self.model = _WordPieceBertModel(model_dir=model_dir, layers=layers, pooled_cls=pooled_cls) self._sep_index = self.model._sep_index + self._wordpiece_pad_index = self.model._wordpiece_pad_index self._wordpiece_unk_index = self.model._wordpiece_unknown_index self._embed_size = len(self.model.layers) * self.model.encoder.hidden_size self.requires_grad = requires_grad @@ -263,8 +267,10 @@ class BertWordPieceEncoder(nn.Module): with torch.no_grad(): if self._word_sep_index: # 不能drop sep sep_mask = words.eq(self._wordpiece_unk_index) - mask = torch.ones_like(words).float() * self.word_dropout + mask = torch.full_like(words, fill_value=self.word_dropout) mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1 + pad_mask = words.ne(self._wordpiece_pad_index) + mask = pad_mask.__and__(mask) # pad的位置不为unk words = words.masked_fill(mask, self._word_unk_index) if self._word_sep_index: words.masked_fill_(sep_mask, self._wordpiece_unk_index) @@ -297,7 +303,7 @@ class _WordBertModel(nn.Module): self.auto_truncate = auto_truncate # 将所有vocab中word的wordpiece计算出来, 需要额外考虑[CLS]和[SEP] - print("Start to generating word pieces for word.") + logger.info("Start to generating word pieces for word.") # 第一步统计出需要的word_piece, 然后创建新的embed和word_piece_vocab, 然后填入值 word_piece_dict = {'[CLS]': 1, '[SEP]': 1} # 用到的word_piece以及新增的 found_count = 0 @@ -356,10 +362,10 @@ class _WordBertModel(nn.Module): self._sep_index = self.tokenzier.vocab['[SEP]'] self._word_pad_index = vocab.padding_idx self._wordpiece_pad_index = self.tokenzier.vocab['[PAD]'] # 需要用于生成word_piece - 
print("Found(Or segment into word pieces) {} words out of {}.".format(found_count, len(vocab))) + logger.info("Found(Or segment into word pieces) {} words out of {}.".format(found_count, len(vocab))) self.word_to_wordpieces = np.array(word_to_wordpieces) self.word_pieces_lengths = nn.Parameter(torch.LongTensor(word_pieces_lengths), requires_grad=False) - print("Successfully generate word pieces.") + logger.debug("Successfully generate word pieces.") def forward(self, words): """ diff --git a/fastNLP/embeddings/char_embedding.py b/fastNLP/embeddings/char_embedding.py index 24c84314..acffa054 100644 --- a/fastNLP/embeddings/char_embedding.py +++ b/fastNLP/embeddings/char_embedding.py @@ -19,6 +19,7 @@ from ..core.vocabulary import Vocabulary from .embedding import TokenEmbedding from .utils import _construct_char_vocab_from_vocab from .utils import get_embeddings +from ..core import logger class CNNCharEmbedding(TokenEmbedding): @@ -81,11 +82,11 @@ class CNNCharEmbedding(TokenEmbedding): raise Exception( "Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]") - print("Start constructing character vocabulary.") + logger.info("Start constructing character vocabulary.") # 建立char的词表 self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq) self.char_pad_index = self.char_vocab.padding_idx - print(f"In total, there are {len(self.char_vocab)} distinct characters.") + logger.info(f"In total, there are {len(self.char_vocab)} distinct characters.") # 对vocab进行index max_word_len = max(map(lambda x: len(x[0]), vocab)) self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab), max_word_len), @@ -236,11 +237,11 @@ class LSTMCharEmbedding(TokenEmbedding): raise Exception( "Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]") - print("Start constructing character vocabulary.") + logger.info("Start constructing character vocabulary.") # 建立char的词表 self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq) self.char_pad_index = self.char_vocab.padding_idx - print(f"In total, there are {len(self.char_vocab)} distinct characters.") + logger.info(f"In total, there are {len(self.char_vocab)} distinct characters.") # 对vocab进行index self.max_word_len = max(map(lambda x: len(x[0]), vocab)) self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab), self.max_word_len), diff --git a/fastNLP/embeddings/contextual_embedding.py b/fastNLP/embeddings/contextual_embedding.py index 2a1e2f82..2c304da7 100644 --- a/fastNLP/embeddings/contextual_embedding.py +++ b/fastNLP/embeddings/contextual_embedding.py @@ -16,7 +16,7 @@ from ..core.batch import DataSetIter from ..core.sampler import SequentialSampler from ..core.utils import _move_model_to_device, _get_model_device from .embedding import TokenEmbedding - +from ..core import logger class ContextualEmbedding(TokenEmbedding): def __init__(self, vocab: Vocabulary, word_dropout: float = 0.0, dropout: float = 0.0): @@ -37,14 +37,14 @@ class ContextualEmbedding(TokenEmbedding): assert isinstance(dataset, DataSet), "Only fastNLP.DataSet object is allowed." assert 'words' in dataset.get_input_name(), "`words` field has to be set as input." 
except Exception as e: - print(f"Exception happens at {index} dataset.") + logger.error(f"Exception happens at {index} dataset.") raise e sent_embeds = {} _move_model_to_device(self, device=device) device = _get_model_device(self) pad_index = self._word_vocab.padding_idx - print("Start to calculate sentence representations.") + logger.info("Start to calculate sentence representations.") with torch.no_grad(): for index, dataset in enumerate(datasets): try: @@ -64,9 +64,9 @@ class ContextualEmbedding(TokenEmbedding): else: sent_embeds[tuple(words_list[b][:seq_len[b]])] = word_embeds[b, :-length] except Exception as e: - print(f"Exception happens at {index} dataset.") + logger.error(f"Exception happens at {index} dataset.") raise e - print("Finish calculating sentence representations.") + logger.info("Finish calculating sentence representations.") self.sent_embeds = sent_embeds if delete_weights: self._delete_model_weights() diff --git a/fastNLP/embeddings/elmo_embedding.py b/fastNLP/embeddings/elmo_embedding.py index fb5388fd..3df424a2 100644 --- a/fastNLP/embeddings/elmo_embedding.py +++ b/fastNLP/embeddings/elmo_embedding.py @@ -18,7 +18,7 @@ from ..core.vocabulary import Vocabulary from ..io.file_utils import cached_path, _get_embedding_url, PRETRAINED_ELMO_MODEL_DIR from ..modules.encoder._elmo import ElmobiLm, ConvTokenEmbedder from .contextual_embedding import ContextualEmbedding - +from ..core import logger class ElmoEmbedding(ContextualEmbedding): """ @@ -243,7 +243,7 @@ class _ElmoModel(nn.Module): index_in_pre = char_lexicon[OOV_TAG] char_emb_layer.weight.data[index] = char_embed_weights[index_in_pre] - print(f"{found_char_count} out of {len(char_vocab)} characters were found in pretrained elmo embedding.") + logger.info(f"{found_char_count} out of {len(char_vocab)} characters were found in pretrained elmo embedding.") # 生成words到chars的映射 max_chars = config['char_cnn']['max_characters_per_token'] @@ -281,7 +281,7 @@ class _ElmoModel(nn.Module): if cache_word_reprs: if config['char_cnn']['embedding']['dim'] > 0: # 只有在使用了chars的情况下有用 - print("Start to generate cache word representations.") + logger.info("Start to generate cache word representations.") batch_size = 320 # bos eos word_size = self.words_to_chars_embedding.size(0) @@ -299,10 +299,10 @@ class _ElmoModel(nn.Module): chars).detach() # batch_size x 1 x config['encoder']['projection_dim'] self.cached_word_embedding.weight.data[words] = word_reprs.squeeze(1) - print("Finish generating cached word representations. Going to delete the character encoder.") + logger.info("Finish generating cached word representations. 
Going to delete the character encoder.") del self.token_embedder, self.words_to_chars_embedding else: - print("There is no need to cache word representations, since no character information is used.") + logger.info("There is no need to cache word representations, since no character information is used.") def forward(self, words): """ diff --git a/fastNLP/embeddings/embedding.py b/fastNLP/embeddings/embedding.py index 7ac841ce..a94985c1 100644 --- a/fastNLP/embeddings/embedding.py +++ b/fastNLP/embeddings/embedding.py @@ -138,8 +138,10 @@ class TokenEmbedding(nn.Module): :return: """ if self.word_dropout > 0 and self.training: - mask = torch.ones_like(words).float() * self.word_dropout + mask = torch.full_like(words, fill_value=self.word_dropout) mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1 + pad_mask = words.ne(self._word_pad_index) + mask = mask.__and__(pad_mask) words = words.masked_fill(mask, self._word_unk_index) return words diff --git a/fastNLP/embeddings/static_embedding.py b/fastNLP/embeddings/static_embedding.py index 1c66e52b..98986565 100644 --- a/fastNLP/embeddings/static_embedding.py +++ b/fastNLP/embeddings/static_embedding.py @@ -19,6 +19,7 @@ from .embedding import TokenEmbedding from ..modules.utils import _get_file_name_base_on_postfix from copy import deepcopy from collections import defaultdict +from ..core import logger class StaticEmbedding(TokenEmbedding): @@ -112,7 +113,7 @@ class StaticEmbedding(TokenEmbedding): truncated_words_to_words = torch.arange(len(vocab)).long() for word, index in vocab: truncated_words_to_words[index] = truncated_vocab.to_index(word) - print(f"{len(vocab) - len(truncated_vocab)} out of {len(vocab)} words have frequency less than {min_freq}.") + logger.info(f"{len(vocab) - len(truncated_vocab)} out of {len(vocab)} words have frequency less than {min_freq}.") vocab = truncated_vocab self.only_norm_found_vector = kwargs.get('only_norm_found_vector', False) @@ -124,7 +125,7 @@ class StaticEmbedding(TokenEmbedding): lowered_vocab.add_word(word.lower(), no_create_entry=True) else: lowered_vocab.add_word(word.lower()) # 先加入需要创建entry的 - print(f"All word in the vocab have been lowered. There are {len(vocab)} words, {len(lowered_vocab)} " + logger.info(f"All word in the vocab have been lowered. 
There are {len(vocab)} words, {len(lowered_vocab)} " f"unique lowered words.") if model_path: embedding = self._load_with_vocab(model_path, vocab=lowered_vocab, init_method=init_method) @@ -265,9 +266,9 @@ class StaticEmbedding(TokenEmbedding): if error == 'ignore': warnings.warn("Error occurred at the {} line.".format(idx)) else: - print("Error occurred at the {} line.".format(idx)) + logger.error("Error occurred at the {} line.".format(idx)) raise e - print("Found {} out of {} words in the pre-training embedding.".format(found_count, len(vocab))) + logger.info("Found {} out of {} words in the pre-training embedding.".format(found_count, len(vocab))) for word, index in vocab: if index not in matrix and not vocab._is_word_no_create_entry(word): if found_unknown: # 如果有unkonwn,用unknown初始化 diff --git a/fastNLP/io/embed_loader.py b/fastNLP/io/embed_loader.py index 48048983..c58385e1 100644 --- a/fastNLP/io/embed_loader.py +++ b/fastNLP/io/embed_loader.py @@ -11,7 +11,7 @@ import numpy as np from ..core.vocabulary import Vocabulary from .data_bundle import BaseLoader from ..core.utils import Option - +import logging class EmbeddingOption(Option): def __init__(self, @@ -91,10 +91,10 @@ class EmbedLoader(BaseLoader): if error == 'ignore': warnings.warn("Error occurred at the {} line.".format(idx)) else: - print("Error occurred at the {} line.".format(idx)) + logging.error("Error occurred at the {} line.".format(idx)) raise e total_hits = sum(hit_flags) - print("Found {} out of {} words in the pre-training embedding.".format(total_hits, len(vocab))) + logging.info("Found {} out of {} words in the pre-training embedding.".format(total_hits, len(vocab))) if init_method is None: found_vectors = matrix[hit_flags] if len(found_vectors) != 0: @@ -157,7 +157,7 @@ class EmbedLoader(BaseLoader): warnings.warn("Error occurred at the {} line.".format(idx)) pass else: - print("Error occurred at the {} line.".format(idx)) + logging.error("Error occurred at the {} line.".format(idx)) raise e if dim == -1: raise RuntimeError("{} is an empty file.".format(embed_filepath)) diff --git a/fastNLP/io/file_reader.py b/fastNLP/io/file_reader.py index 6aa89b80..0320572c 100644 --- a/fastNLP/io/file_reader.py +++ b/fastNLP/io/file_reader.py @@ -2,7 +2,8 @@ 此模块用于给其它模块提供读取文件的函数,没有为用户提供 API """ import json -import warnings +from ..core import logger + def _read_csv(path, encoding='utf-8', headers=None, sep=',', dropna=True): """ @@ -103,9 +104,9 @@ def _read_conll(path, encoding='utf-8', indexes=None, dropna=True): yield line_idx, res except Exception as e: if dropna: - warnings.warn('Invalid instance ends at line: {} has been dropped.'.format(line_idx)) + logger.warn('Invalid instance which ends at line: {} has been dropped.'.format(line_idx)) continue - raise ValueError('Invalid instance ends at line: {}'.format(line_idx)) + raise ValueError('Invalid instance which ends at line: {}'.format(line_idx)) elif line.startswith('#'): continue else: @@ -117,5 +118,5 @@ def _read_conll(path, encoding='utf-8', indexes=None, dropna=True): except Exception as e: if dropna: return - print('invalid instance ends at line: {}'.format(line_idx)) + logger.error('invalid instance ends at line: {}'.format(line_idx)) raise e diff --git a/fastNLP/io/file_utils.py b/fastNLP/io/file_utils.py index 5af3c4ff..9dbb515d 100644 --- a/fastNLP/io/file_utils.py +++ b/fastNLP/io/file_utils.py @@ -7,6 +7,7 @@ import tempfile from tqdm import tqdm import shutil from requests import HTTPError +from ..core import logger PRETRAINED_BERT_MODEL_DIR = { 'en': 
'bert-base-cased.zip', @@ -336,7 +337,7 @@ def get_from_cache(url: str, cache_dir: Path = None) -> Path: content_length = req.headers.get("Content-Length") total = int(content_length) if content_length is not None else None progress = tqdm(unit="B", total=total, unit_scale=1) - print("%s not found in cache, downloading to %s" % (url, temp_filename)) + logger.info("%s not found in cache, downloading to %s" % (url, temp_filename)) with open(temp_filename, "wb") as temp_file: for chunk in req.iter_content(chunk_size=1024 * 16): @@ -344,12 +345,12 @@ def get_from_cache(url: str, cache_dir: Path = None) -> Path: progress.update(len(chunk)) temp_file.write(chunk) progress.close() - print(f"Finish download from {url}") + logger.info(f"Finish download from {url}") # 开始解压 if suffix in ('.zip', '.tar.gz', '.gz'): uncompress_temp_dir = tempfile.mkdtemp() - print(f"Start to uncompress file to {uncompress_temp_dir}") + logger.debug(f"Start to uncompress file to {uncompress_temp_dir}") if suffix == '.zip': unzip_file(Path(temp_filename), Path(uncompress_temp_dir)) elif suffix == '.gz': @@ -362,13 +363,13 @@ def get_from_cache(url: str, cache_dir: Path = None) -> Path: uncompress_temp_dir = os.path.join(uncompress_temp_dir, filenames[0]) cache_path.mkdir(parents=True, exist_ok=True) - print("Finish un-compressing file.") + logger.debug("Finish un-compressing file.") else: uncompress_temp_dir = temp_filename cache_path = str(cache_path) + suffix # 复制到指定的位置 - print(f"Copy file to {cache_path}") + logger.info(f"Copy file to {cache_path}") if os.path.isdir(uncompress_temp_dir): for filename in os.listdir(uncompress_temp_dir): if os.path.isdir(os.path.join(uncompress_temp_dir, filename)): @@ -379,7 +380,7 @@ def get_from_cache(url: str, cache_dir: Path = None) -> Path: shutil.copyfile(uncompress_temp_dir, cache_path) success = True except Exception as e: - print(e) + logger.error(e) raise e finally: if not success: diff --git a/fastNLP/io/pipe/classification.py b/fastNLP/io/pipe/classification.py index daa17da9..f42d5400 100644 --- a/fastNLP/io/pipe/classification.py +++ b/fastNLP/io/pipe/classification.py @@ -11,7 +11,7 @@ from .utils import get_tokenizer, _indexize, _add_words_field, _drop_empty_insta from .pipe import Pipe import re nonalpnum = re.compile('[^0-9a-zA-Z?!\']+') -from ...core.utils import cache_results + class _CLSPipe(Pipe): """ diff --git a/fastNLP/io/utils.py b/fastNLP/io/utils.py index 76b32b0a..faec2a55 100644 --- a/fastNLP/io/utils.py +++ b/fastNLP/io/utils.py @@ -2,7 +2,7 @@ import os from typing import Union, Dict from pathlib import Path - +from ..core import logger def check_loader_paths(paths:Union[str, Dict[str, str]])->Dict[str, str]: """ @@ -70,8 +70,8 @@ def get_tokenizer(): import spacy spacy.prefer_gpu() en = spacy.load('en') - print('use spacy tokenizer') + logger.info('use spacy tokenizer') return lambda x: [w.text for w in en.tokenizer(x)] except Exception as e: - print('use raw tokenizer') + logger.error('use raw tokenizer') return lambda x: x.split() diff --git a/fastNLP/modules/encoder/bert.py b/fastNLP/modules/encoder/bert.py index ffc43863..b74c4da0 100644 --- a/fastNLP/modules/encoder/bert.py +++ b/fastNLP/modules/encoder/bert.py @@ -17,8 +17,7 @@ import os import torch from torch import nn -import sys - +from ...core import logger from ..utils import _get_file_name_base_on_postfix CONFIG_FILE = 'bert_config.json' @@ -489,10 +488,10 @@ class BertModel(nn.Module): load(model, prefix='' if hasattr(model, 'bert') else 'bert.') if len(missing_keys) > 0: - print("Weights 
of {} not initialized from pretrained model: {}".format( + logger.warn("Weights of {} not initialized from pretrained model: {}".format( model.__class__.__name__, missing_keys)) if len(unexpected_keys) > 0: - print("Weights from pretrained model not used in {}: {}".format( + logger.warn("Weights from pretrained model not used in {}: {}".format( model.__class__.__name__, unexpected_keys)) return model @@ -799,7 +798,7 @@ class BertTokenizer(object): for token in tokens: ids.append(self.vocab[token]) if len(ids) > self.max_len: - print( + logger.warn( "Token indices sequence length is longer than the specified maximum " " sequence length for this BERT model ({} > {}). Running this" " sequence through BERT will result in indexing errors".format(len(ids), self.max_len) @@ -823,7 +822,7 @@ class BertTokenizer(object): with open(vocab_file, "w", encoding="utf-8") as writer: for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): if index != token_index: - print("Saving vocabulary to {}: vocabulary indices are not consecutive." + logger.warn("Saving vocabulary to {}: vocabulary indices are not consecutive." " Please check that the vocabulary is not corrupted!".format(vocab_file)) index = token_index writer.write(token + u'\n') @@ -837,7 +836,7 @@ class BertTokenizer(object): """ pretrained_model_name_or_path = _get_file_name_base_on_postfix(model_dir, '.txt') - print("loading vocabulary file {}".format(pretrained_model_name_or_path)) + logger.info("loading vocabulary file {}".format(pretrained_model_name_or_path)) max_len = 512 kwargs['max_len'] = min(kwargs.get('max_position_embeddings', int(1e12)), max_len) # Instantiate tokenizer. @@ -901,7 +900,7 @@ class _WordPieceBertModel(nn.Module): is_input=True) dataset.set_pad_val('word_pieces', self._wordpiece_pad_index) except Exception as e: - print(f"Exception happens when processing the {index} dataset.") + logger.error(f"Exception happens when processing the {index} dataset.") raise e def forward(self, word_pieces, token_type_ids=None): diff --git a/reproduction/seqence_labelling/ner/data/Conll2003Loader.py b/reproduction/seqence_labelling/ner/data/Conll2003Loader.py deleted file mode 100644 index 0af4681e..00000000 --- a/reproduction/seqence_labelling/ner/data/Conll2003Loader.py +++ /dev/null @@ -1,93 +0,0 @@ - -from fastNLP.core.vocabulary import VocabularyOption -from fastNLP.io.data_bundle import DataSetLoader, DataBundle -from typing import Union, Dict -from fastNLP import Vocabulary -from fastNLP import Const -from reproduction.utils import check_dataloader_paths - -from fastNLP.io import ConllLoader -from reproduction.seqence_labelling.ner.data.utils import iob2bioes, iob2 - - -class Conll2003DataLoader(DataSetLoader): - def __init__(self, task:str='ner', encoding_type:str='bioes'): - """ - 加载Conll2003格式的英语语料,该数据集的信息可以在https://www.clips.uantwerpen.be/conll2003/ner/找到。当task为pos - 时,返回的DataSet中target取值于第2列; 当task为chunk时,返回的DataSet中target取值于第3列;当task为ner时,返回 - 的DataSet中target取值于第4列。所有"-DOCSTART- -X- O O"将被忽略,这会导致数据的数量少于很多文献报道的值,但 - 鉴于"-DOCSTART- -X- O O"只是用于文档分割的符号,并不应该作为预测对象,所以我们忽略了数据中的-DOCTSTART-开头的行 - ner与chunk任务读取后的数据的target将为encoding_type类型。pos任务读取后就是pos列的数据。 - - :param task: 指定需要标注任务。可选ner, pos, chunk - """ - assert task in ('ner', 'pos', 'chunk') - index = {'ner':3, 'pos':1, 'chunk':2}[task] - self._loader = ConllLoader(headers=['raw_words', 'target'], indexes=[0, index]) - self._tag_converters = [] - if task in ('ner', 'chunk'): - self._tag_converters = [iob2] - if encoding_type == 'bioes': - 
self._tag_converters.append(iob2bioes) - - def load(self, path: str): - dataset = self._loader.load(path) - def convert_tag_schema(tags): - for converter in self._tag_converters: - tags = converter(tags) - return tags - if self._tag_converters: - dataset.apply_field(convert_tag_schema, field_name=Const.TARGET, new_field_name=Const.TARGET) - return dataset - - def process(self, paths: Union[str, Dict[str, str]], word_vocab_opt:VocabularyOption=None, lower:bool=False): - """ - 读取并处理数据。数据中的'-DOCSTART-'开头的行会被忽略 - - :param paths: - :param word_vocab_opt: vocabulary的初始化值 - :param lower: 是否将所有字母转为小写。 - :return: - """ - # 读取数据 - paths = check_dataloader_paths(paths) - data = DataBundle() - input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN] - target_fields = [Const.TARGET, Const.INPUT_LEN] - for name, path in paths.items(): - dataset = self.load(path) - dataset.apply_field(lambda words: words, field_name='raw_words', new_field_name=Const.INPUT) - if lower: - dataset.words.lower() - data.datasets[name] = dataset - - # 对construct vocab - word_vocab = Vocabulary(min_freq=2) if word_vocab_opt is None else Vocabulary(**word_vocab_opt) - word_vocab.from_dataset(data.datasets['train'], field_name=Const.INPUT, - no_create_entry_dataset=[dataset for name, dataset in data.datasets.items() if name!='train']) - word_vocab.index_dataset(*data.datasets.values(), field_name=Const.INPUT, new_field_name=Const.INPUT) - data.vocabs[Const.INPUT] = word_vocab - - # cap words - cap_word_vocab = Vocabulary() - cap_word_vocab.from_dataset(data.datasets['train'], field_name='raw_words', - no_create_entry_dataset=[dataset for name, dataset in data.datasets.items() if name!='train']) - cap_word_vocab.index_dataset(*data.datasets.values(), field_name='raw_words', new_field_name='cap_words') - input_fields.append('cap_words') - data.vocabs['cap_words'] = cap_word_vocab - - # 对target建vocab - target_vocab = Vocabulary(unknown=None, padding=None) - target_vocab.from_dataset(*data.datasets.values(), field_name=Const.TARGET) - target_vocab.index_dataset(*data.datasets.values(), field_name=Const.TARGET) - data.vocabs[Const.TARGET] = target_vocab - - for name, dataset in data.datasets.items(): - dataset.add_seq_len(Const.INPUT, new_field_name=Const.INPUT_LEN) - dataset.set_input(*input_fields) - dataset.set_target(*target_fields) - - return data - -if __name__ == '__main__': - pass \ No newline at end of file diff --git a/reproduction/seqence_labelling/ner/data/OntoNoteLoader.py b/reproduction/seqence_labelling/ner/data/OntoNoteLoader.py deleted file mode 100644 index 25c6f29b..00000000 --- a/reproduction/seqence_labelling/ner/data/OntoNoteLoader.py +++ /dev/null @@ -1,152 +0,0 @@ -from fastNLP.core.vocabulary import VocabularyOption -from fastNLP.io.data_bundle import DataSetLoader, DataBundle -from typing import Union, Dict -from fastNLP import DataSet -from fastNLP import Vocabulary -from fastNLP import Const -from reproduction.utils import check_dataloader_paths - -from fastNLP.io import ConllLoader -from reproduction.seqence_labelling.ner.data.utils import iob2bioes, iob2 - -class OntoNoteNERDataLoader(DataSetLoader): - """ - 用于读取处理为Conll格式后的OntoNote数据。将OntoNote数据处理为conll格式的过程可以参考https://github.com/yhcc/OntoNotes-5.0-NER。 - - """ - def __init__(self, encoding_type:str='bioes'): - assert encoding_type in ('bioes', 'bio') - self.encoding_type = encoding_type - if encoding_type=='bioes': - self.encoding_method = iob2bioes - else: - self.encoding_method = iob2 - - def load(self, path:str)->DataSet: - """ - 
给定一个文件路径,读取数据。返回的DataSet包含以下的field - raw_words: List[str] - target: List[str] - - :param path: - :return: - """ - dataset = ConllLoader(headers=['raw_words', 'target'], indexes=[3, 10]).load(path) - def convert_to_bio(tags): - bio_tags = [] - flag = None - for tag in tags: - label = tag.strip("()*") - if '(' in tag: - bio_label = 'B-' + label - flag = label - elif flag: - bio_label = 'I-' + flag - else: - bio_label = 'O' - if ')' in tag: - flag = None - bio_tags.append(bio_label) - return self.encoding_method(bio_tags) - - def convert_word(words): - converted_words = [] - for word in words: - word = word.replace('/.', '.') # 有些结尾的.是/.形式的 - if not word.startswith('-'): - converted_words.append(word) - continue - # 以下是由于这些符号被转义了,再转回来 - tfrs = {'-LRB-':'(', - '-RRB-': ')', - '-LSB-': '[', - '-RSB-': ']', - '-LCB-': '{', - '-RCB-': '}' - } - if word in tfrs: - converted_words.append(tfrs[word]) - else: - converted_words.append(word) - return converted_words - - dataset.apply_field(convert_word, field_name='raw_words', new_field_name='raw_words') - dataset.apply_field(convert_to_bio, field_name='target', new_field_name='target') - - return dataset - - def process(self, paths: Union[str, Dict[str, str]], word_vocab_opt:VocabularyOption=None, - lower:bool=True)->DataBundle: - """ - 读取并处理数据。返回的DataInfo包含以下的内容 - vocabs: - word: Vocabulary - target: Vocabulary - datasets: - train: DataSet - words: List[int], 被设置为input - target: int. label,被同时设置为input和target - seq_len: int. 句子的长度,被同时设置为input和target - raw_words: List[str] - xxx(根据传入的paths可能有所变化) - - :param paths: - :param word_vocab_opt: vocabulary的初始化值 - :param lower: 是否使用小写 - :return: - """ - paths = check_dataloader_paths(paths) - data = DataBundle() - input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN] - target_fields = [Const.TARGET, Const.INPUT_LEN] - for name, path in paths.items(): - dataset = self.load(path) - dataset.apply_field(lambda words: words, field_name='raw_words', new_field_name=Const.INPUT) - if lower: - dataset.words.lower() - data.datasets[name] = dataset - - # 对construct vocab - word_vocab = Vocabulary(min_freq=2) if word_vocab_opt is None else Vocabulary(**word_vocab_opt) - word_vocab.from_dataset(data.datasets['train'], field_name=Const.INPUT, - no_create_entry_dataset=[dataset for name, dataset in data.datasets.items() if name!='train']) - word_vocab.index_dataset(*data.datasets.values(), field_name=Const.INPUT, new_field_name=Const.INPUT) - data.vocabs[Const.INPUT] = word_vocab - - # cap words - cap_word_vocab = Vocabulary() - cap_word_vocab.from_dataset(*data.datasets.values(), field_name='raw_words') - cap_word_vocab.index_dataset(*data.datasets.values(), field_name='raw_words', new_field_name='cap_words') - input_fields.append('cap_words') - data.vocabs['cap_words'] = cap_word_vocab - - # 对target建vocab - target_vocab = Vocabulary(unknown=None, padding=None) - target_vocab.from_dataset(*data.datasets.values(), field_name=Const.TARGET) - target_vocab.index_dataset(*data.datasets.values(), field_name=Const.TARGET) - data.vocabs[Const.TARGET] = target_vocab - - for name, dataset in data.datasets.items(): - dataset.add_seq_len(Const.INPUT, new_field_name=Const.INPUT_LEN) - dataset.set_input(*input_fields) - dataset.set_target(*target_fields) - - return data - - -if __name__ == '__main__': - loader = OntoNoteNERDataLoader() - dataset = loader.load('/hdd/fudanNLP/fastNLP/others/data/v4/english/test.txt') - print(dataset.target.value_count()) - print(dataset[:4]) - - -""" -train 115812 2200752 -development 15680 304684 
-test 12217 230111 - -train 92403 1901772 -valid 13606 279180 -test 10258 204135 -""" \ No newline at end of file diff --git a/reproduction/seqence_labelling/ner/data/utils.py b/reproduction/seqence_labelling/ner/data/utils.py deleted file mode 100644 index 8f7af792..00000000 --- a/reproduction/seqence_labelling/ner/data/utils.py +++ /dev/null @@ -1,49 +0,0 @@ -from typing import List - -def iob2(tags:List[str])->List[str]: - """ - 检查数据是否是合法的IOB数据,如果是IOB1会被自动转换为IOB2。 - - :param tags: 需要转换的tags - """ - for i, tag in enumerate(tags): - if tag == "O": - continue - split = tag.split("-") - if len(split) != 2 or split[0] not in ["I", "B"]: - raise TypeError("The encoding schema is not a valid IOB type.") - if split[0] == "B": - continue - elif i == 0 or tags[i - 1] == "O": # conversion IOB1 to IOB2 - tags[i] = "B" + tag[1:] - elif tags[i - 1][1:] == tag[1:]: - continue - else: # conversion IOB1 to IOB2 - tags[i] = "B" + tag[1:] - return tags - -def iob2bioes(tags:List[str])->List[str]: - """ - 将iob的tag转换为bmeso编码 - :param tags: - :return: - """ - new_tags = [] - for i, tag in enumerate(tags): - if tag == 'O': - new_tags.append(tag) - else: - split = tag.split('-')[0] - if split == 'B': - if i+1!=len(tags) and tags[i+1].split('-')[0] == 'I': - new_tags.append(tag) - else: - new_tags.append(tag.replace('B-', 'S-')) - elif split == 'I': - if i + 1 Date: Sun, 25 Aug 2019 18:58:03 +0800 Subject: [PATCH 097/153] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dword=20drop=20bug,=20?= =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E7=9B=B8=E5=BA=94=E6=B5=8B=E8=AF=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/embeddings/bert_embedding.py | 4 ++-- fastNLP/embeddings/embedding.py | 2 +- test/embeddings/test_bert_embedding.py | 9 ++++++++- test/embeddings/test_static_embedding.py | 11 +++++++++++ 4 files changed, 22 insertions(+), 4 deletions(-) diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index 4bd06ec3..047048d8 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -126,7 +126,7 @@ class BertEmbedding(ContextualEmbedding): with torch.no_grad(): if self._word_sep_index: # 不能drop sep sep_mask = words.eq(self._word_sep_index) - mask = torch.full_like(words, fill_value=self.word_dropout) + mask = torch.full_like(words, fill_value=self.word_dropout, dtype=torch.float, device=words.device) mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1 pad_mask = words.ne(0) mask = pad_mask.__and__(mask) # pad的位置不为unk @@ -267,7 +267,7 @@ class BertWordPieceEncoder(nn.Module): with torch.no_grad(): if self._word_sep_index: # 不能drop sep sep_mask = words.eq(self._wordpiece_unk_index) - mask = torch.full_like(words, fill_value=self.word_dropout) + mask = torch.full_like(words, fill_value=self.word_dropout, dtype=torch.float, device=words.device) mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1 pad_mask = words.ne(self._wordpiece_pad_index) mask = pad_mask.__and__(mask) # pad的位置不为unk diff --git a/fastNLP/embeddings/embedding.py b/fastNLP/embeddings/embedding.py index a94985c1..5e7b9803 100644 --- a/fastNLP/embeddings/embedding.py +++ b/fastNLP/embeddings/embedding.py @@ -138,7 +138,7 @@ class TokenEmbedding(nn.Module): :return: """ if self.word_dropout > 0 and self.training: - mask = torch.full_like(words, fill_value=self.word_dropout) + mask = torch.full_like(words, fill_value=self.word_dropout, dtype=torch.float, device=words.device) mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1 pad_mask = 
words.ne(self._word_pad_index) mask = mask.__and__(pad_mask) diff --git a/test/embeddings/test_bert_embedding.py b/test/embeddings/test_bert_embedding.py index 760029a3..da81c8c9 100644 --- a/test/embeddings/test_bert_embedding.py +++ b/test/embeddings/test_bert_embedding.py @@ -10,5 +10,12 @@ class TestDownload(unittest.TestCase): # import os vocab = Vocabulary().add_word_lst("This is a test .".split()) embed = BertEmbedding(vocab, model_dir_or_name='en') - words = torch.LongTensor([[0, 1, 2]]) + words = torch.LongTensor([[2, 3, 4, 0]]) print(embed(words).size()) + + def test_word_drop(self): + vocab = Vocabulary().add_word_lst("This is a test .".split()) + embed = BertEmbedding(vocab, model_dir_or_name='en', dropout=0.1, word_dropout=0.2) + for i in range(10): + words = torch.LongTensor([[2, 3, 4, 0]]) + print(embed(words).size()) \ No newline at end of file diff --git a/test/embeddings/test_static_embedding.py b/test/embeddings/test_static_embedding.py index 83137345..c17daa0a 100644 --- a/test/embeddings/test_static_embedding.py +++ b/test/embeddings/test_static_embedding.py @@ -5,6 +5,7 @@ from fastNLP import Vocabulary import torch import os + class TestLoad(unittest.TestCase): def test_norm1(self): # 测试只对可以找到的norm @@ -22,6 +23,16 @@ class TestLoad(unittest.TestCase): self.assertEqual(round(torch.norm(embed(torch.LongTensor([[2]]))).item(), 4), 1) self.assertEqual(round(torch.norm(embed(torch.LongTensor([[4]]))).item(), 4), 1) + def test_dropword(self): + # 测试是否可以通过drop word + vocab = Vocabulary().add_word_lst([chr(i) for i in range(1, 200)]) + embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=10, dropout=0.1, word_dropout=0.4) + for i in range(10): + length = torch.randint(1, 50, (1,)).item() + batch = torch.randint(1, 4, (1,)).item() + words = torch.randint(1, 200, (batch, length)).long() + embed(words) + class TestRandomSameEntry(unittest.TestCase): def test_same_vector(self): vocab = Vocabulary().add_word_lst(["The", "the", "THE", 'a', "A"]) From 584a92c64c62f7319bd2966070d4e138bdf39801 Mon Sep 17 00:00:00 2001 From: yh_cc Date: Mon, 26 Aug 2019 01:33:17 +0800 Subject: [PATCH 098/153] =?UTF-8?q?1.=E5=A2=9E=E5=8A=A0sequence=20labeling?= =?UTF-8?q?=E4=BB=BB=E5=8A=A1=E7=9A=84=E6=95=B0=E6=8D=AE=E8=AF=B4=E6=98=8E?= =?UTF-8?q?;=202.=E5=A2=9E=E5=8A=A0=E5=AF=B9CWSPipe=E7=9A=84=E5=BC=95?= =?UTF-8?q?=E7=94=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/io/__init__.py | 1 + fastNLP/io/pipe/__init__.py | 3 ++ .../seqence_labelling/chinese_ner/readme.md | 30 +++++++++++++++++ reproduction/seqence_labelling/cws/readme.md | 32 +++++++++++++++++++ .../seqence_labelling/cws/test/__init__.py | 0 .../cws/test/test_CWSDataLoader.py | 17 ---------- 6 files changed, 66 insertions(+), 17 deletions(-) create mode 100644 reproduction/seqence_labelling/chinese_ner/readme.md create mode 100644 reproduction/seqence_labelling/cws/readme.md delete mode 100644 reproduction/seqence_labelling/cws/test/__init__.py delete mode 100644 reproduction/seqence_labelling/cws/test/test_CWSDataLoader.py diff --git a/fastNLP/io/__init__.py b/fastNLP/io/__init__.py index 01683628..a3ea0148 100644 --- a/fastNLP/io/__init__.py +++ b/fastNLP/io/__init__.py @@ -38,6 +38,7 @@ __all__ = [ 'JsonLoader', 'CWSLoader', + "CWSPipe", 'MNLILoader', "QuoraLoader", diff --git a/fastNLP/io/pipe/__init__.py b/fastNLP/io/pipe/__init__.py index 1907af4a..048e4cfe 100644 --- a/fastNLP/io/pipe/__init__.py +++ b/fastNLP/io/pipe/__init__.py @@ -10,6 +10,8 @@ Pipe用于处理通过 
Loader 读取的数据,所有的 Pipe 都包含 ``proce __all__ = [ "Pipe", + "CWSPipe", + "YelpFullPipe", "YelpPolarityPipe", "SSTPipe", @@ -43,3 +45,4 @@ from .matching import MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe from .pipe import Pipe from .conll import Conll2003Pipe +from .cws import CWSPipe diff --git a/reproduction/seqence_labelling/chinese_ner/readme.md b/reproduction/seqence_labelling/chinese_ner/readme.md new file mode 100644 index 00000000..3a9d37d8 --- /dev/null +++ b/reproduction/seqence_labelling/chinese_ner/readme.md @@ -0,0 +1,30 @@ +使用以下中文NERPipe自动下载的统计数据 + +| MsraNERPipe | # of sents | # of tokens | +| ----------- | ---------- | ----------- | +| train | 41747 | 1954374 | +| dev | 4617 | 215505 | +| test | 4365 | 172601 | +| total | 50729 | 2342480 | +这里报道的统计数据,与[https://arxiv.org/pdf/1805.02023.pdf]()报道的一致 + + + +| WeiboNERPipe | # of sents | # of tokens | +| ------------ | ---------- | ----------- | +| train | 1350 | 73778 | +| dev | 270 | 14509 | +| test | 270 | 14842 | +| total | 1890 | 1890 | +这里报道的统计数据与[https://www.cs.cmu.edu/~ark/EMNLP-2015/proceedings/EMNLP/pdf/EMNLP064.pdf]()一致 + + + + +| PeopleDailyPipe | # of sents | # of tokens | +| --------------- | ---------- | ----------- | +| train | 50658 | 2169879 | +| dev | 4631 | 172601 | +| test | 68 | 2270 | +| total | 55357 | 2344750 | +这里使用的数据与[https://arxiv.org/pdf/1906.08101.pdf]()的数据是一致的 diff --git a/reproduction/seqence_labelling/cws/readme.md b/reproduction/seqence_labelling/cws/readme.md new file mode 100644 index 00000000..a25bb0ed --- /dev/null +++ b/reproduction/seqence_labelling/cws/readme.md @@ -0,0 +1,32 @@ +四个数据集的统计信息,最原始的数据可以从[http://sighan.cs.uchicago.edu/bakeoff2005/]()下载。 + +| pku | # of sents | # of tokens | +| ----- | ---------- | ----------- | +| train | 17173 | 1650222 | +| dev | 1881 | 176226 | +| test | 1944 | 172733 | +| total | 20998 | 1999181 | + + +| cityu | # of sents | # of tokens | +| ----- | ---------- | ----------- | +| train | 47696 | 2164907 | +| dev | 5323 | 238447 | +| test | 1492 | 67690 | +| total | 54511 | 2471044 | + + +| msra | # of sents | # of tokens | +| ----- | ---------- | ----------- | +| train | 78242 | 3644550 | +| dev | 8676 | 405919 | +| test | 3985 | 184355 | +| total | 90903 | 4234824 | + + +| as | # of sents | # of tokens | +| ----- | ---------- | ----------- | +| train | 638273 | 7536586 | +| dev | 70680 | 831464 | +| test | 14429 | 197681 | +| total | 723382 | 8565731 | diff --git a/reproduction/seqence_labelling/cws/test/__init__.py b/reproduction/seqence_labelling/cws/test/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/reproduction/seqence_labelling/cws/test/test_CWSDataLoader.py b/reproduction/seqence_labelling/cws/test/test_CWSDataLoader.py deleted file mode 100644 index f4260849..00000000 --- a/reproduction/seqence_labelling/cws/test/test_CWSDataLoader.py +++ /dev/null @@ -1,17 +0,0 @@ - - -import unittest -from ..data.CWSDataLoader import SigHanLoader -from fastNLP.core.vocabulary import VocabularyOption - - -class TestCWSDataLoader(unittest.TestCase): - def test_case1(self): - cws_loader = SigHanLoader(target_type='bmes') - data = cws_loader.process('pku_demo.txt') - print(data.datasets) - - def test_calse2(self): - cws_loader = SigHanLoader(target_type='bmes') - data = cws_loader.process('pku_demo.txt', bigram_vocab_opt=VocabularyOption()) - print(data.datasets) \ No newline at end of file From 78be840ab97b47acbf517962173b9781cc6fbebe Mon Sep 17 00:00:00 2001 From: 
xuyige Date: Mon, 26 Aug 2019 01:56:20 +0800 Subject: [PATCH 099/153] 1.update README 2. fix a filename-bug in pretrain_static_file; 3. add Pipe to documents; 4. update documents in some loaders; 5. update tutorial 2 & 3 to adapt version 0.5.0 --- README.md | 13 +- .../tutorials/tutorial_2_load_dataset.rst | 220 ++++++------------ .../source/tutorials/tutorial_3_embedding.rst | 89 ++----- docs/source/user/tutorials.rst | 2 +- fastNLP/io/__init__.py | 5 +- fastNLP/io/file_utils.py | 2 +- fastNLP/io/loader/__init__.py | 4 +- fastNLP/io/loader/classification.py | 1 - fastNLP/io/loader/conll.py | 3 +- fastNLP/io/loader/csv.py | 2 +- fastNLP/io/pipe/matching.py | 4 +- fastNLP/io/pipe/pipe.py | 3 + 12 files changed, 117 insertions(+), 231 deletions(-) diff --git a/README.md b/README.md index b35776dc..531fbc83 100644 --- a/README.md +++ b/README.md @@ -6,11 +6,12 @@ ![Hex.pm](https://img.shields.io/hexpm/l/plug.svg) [![Documentation Status](https://readthedocs.org/projects/fastnlp/badge/?version=latest)](http://fastnlp.readthedocs.io/?badge=latest) -fastNLP 是一款轻量级的 NLP 处理套件。你既可以使用它快速地完成一个序列标注([NER](reproduction/seqence_labelling/ner)、POS-Tagging等)、中文分词、[文本分类](reproduction/text_classification)、[Matching](reproduction/matching)、[指代消解](reproduction/coreference_resolution)、[摘要](reproduction/Summarization)等任务; 也可以使用它构建许多复杂的网络模型,进行科研。它具有如下的特性: +fastNLP 是一款轻量级的 NLP 工具包。你既可以使用它快速地完成一个序列标注([NER](reproduction/seqence_labelling/ner)、POS-Tagging等)、中文分词、[文本分类](reproduction/text_classification)、[Matching](reproduction/matching)、[指代消解](reproduction/coreference_resolution)、[摘要](reproduction/Summarization)等任务; 也可以使用它快速构建许多复杂的网络模型,进行科研。它具有如下的特性: -- 统一的Tabular式数据容器,让数据预处理过程简洁明了。内置多种数据集的DataSet Loader,省去预处理代码; +- 统一的Tabular式数据容器,让数据预处理过程简洁明了。内置多种数据集的Loader和Pipe,省去预处理代码; - 多种训练、测试组件,例如训练器Trainer;测试器Tester;以及各种评测metrics等等; - 各种方便的NLP工具,例如预处理embedding加载(包括ELMo和BERT); 中间数据cache等; +- 部分[数据集与预训练模型](https://docs.qq.com/sheet/DVnpkTnF6VW9UeXdh?c=A1A0A0)的自动下载 - 详尽的中文[文档](https://fastnlp.readthedocs.io/)、[教程](https://fastnlp.readthedocs.io/zh/latest/user/tutorials.html)以供查阅; - 提供诸多高级模块,例如Variational LSTM, Transformer, CRF等; - 在序列标注、中文分词、文本分类、Matching、指代消解、摘要等任务上封装了各种模型可供直接使用,详细内容见 [reproduction](reproduction) 部分; @@ -36,7 +37,7 @@ pip install fastNLP python -m spacy download en ``` -目前使用pip安装fastNLP的版本是0.4.1,有较多功能仍未更新,最新内容以master分支为准。 +目前使用pypi安装fastNLP的版本是0.4.1,有较多功能仍未更新,最新内容以master分支为准。 fastNLP0.5.0版本将在近期推出,请密切关注。 @@ -44,7 +45,7 @@ fastNLP0.5.0版本将在近期推出,请密切关注。 - [0. 快速入门](https://fastnlp.readthedocs.io/zh/latest/user/quickstart.html) - [1. 使用DataSet预处理文本](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_1_data_preprocess.html) -- [2. 使用DataSetLoader加载数据集](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_2_load_dataset.html) +- [2. 使用Loader和Pipe加载并处理数据集](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_2_load_dataset.html) - [3. 使用Embedding模块将文本转成向量](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_3_embedding.html) - [4. 动手实现一个文本分类器I-使用Trainer和Tester快速训练和测试](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_4_loss_optimizer.html) - [5. 
动手实现一个文本分类器II-使用DataSetIter实现自定义训练过程](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_5_datasetiter.html) @@ -91,7 +92,7 @@ fastNLP 在 embeddings 模块中内置了几种不同的embedding:静态embedd ## 项目结构 -![](./docs/source/figures/workflow.png) + fastNLP的大致工作流程如上图所示,而项目结构如下: @@ -118,7 +119,7 @@ fastNLP的大致工作流程如上图所示,而项目结构如下: fastNLP.io - 实现了读写功能,包括数据读入,模型读写等 + 实现了读写功能,包括数据读入与预处理,模型读写,自动下载等 diff --git a/docs/source/tutorials/tutorial_2_load_dataset.rst b/docs/source/tutorials/tutorial_2_load_dataset.rst index 4fa4a84d..17ad6baf 100644 --- a/docs/source/tutorials/tutorial_2_load_dataset.rst +++ b/docs/source/tutorials/tutorial_2_load_dataset.rst @@ -1,57 +1,53 @@ -================================= -使用DataSetLoader加载数据集 -================================= +======================================= +使用Loader和Pipe加载并处理数据集 +======================================= 这一部分是一个关于如何加载数据集的教程 教程目录: - - `Part I: 数据集容器`_ - - `Part II: 数据集的使用方式`_ - - `Part III: 不同数据类型的DataSetLoader`_ - - `Part IV: DataSetLoader举例`_ - - `Part V: fastNLP封装好的数据集加载器`_ + - `Part I: 数据集容器DataBundle`_ + - `Part II: 加载数据集的基类Loader`_ + - `Part III: 不同格式类型的基础Loader`_ + - `Part IV: 使用Pipe对数据集进行预处理`_ + - `Part V: fastNLP封装好的Loader和Pipe`_ ----------------------------- -Part I: 数据集容器 ----------------------------- +------------------------------------ +Part I: 数据集容器DataBundle +------------------------------------ -在fastNLP中,我们使用 :class:`~fastNLP.io.base_loader.DataBundle` 来存储数据集信息。 -:class:`~fastNLP.io.base_loader.DataBundle` 类包含了两个重要内容: `datasets` 和 `vocabs` 。 +在fastNLP中,我们使用 :class:`~fastNLP.io.data_bundle.DataBundle` 来存储数据集信息。 +:class:`~fastNLP.io.data_bundle.DataBundle` 类包含了两个重要内容: `datasets` 和 `vocabs` 。 `datasets` 是一个 `key` 为数据集名称(如 `train` , `dev` ,和 `test` 等), `value` 为 :class:`~fastNLP.DataSet` 的字典。 `vocabs` 是一个 `key` 为词表名称(如 :attr:`fastNLP.Const.INPUT` 表示输入文本的词表名称, :attr:`fastNLP.Const.TARGET` 表示目标 的真实标签词表的名称,等等), `value` 为词表内容( :class:`~fastNLP.Vocabulary` )的字典。 ----------------------------- -Part II: 数据集的使用方式 ----------------------------- +------------------------------------- +Part II: 加载数据集的基类Loader +------------------------------------- -在fastNLP中,我们采用 :class:`~fastNLP.io.base_loader.DataSetLoader` 来作为加载数据集的基类。 -:class:`~fastNLP.io.base_loader.DataSetLoader` 定义了各种DataSetLoader所需的API接口,开发者应该继承它实现各种的DataSetLoader。 -在各种数据集的DataSetLoader当中,至少应该编写如下内容: +在fastNLP中,我们采用 :class:`~fastNLP.io.loader.Loader` 来作为加载数据集的基类。 +:class:`~fastNLP.io.loader.Loader` 定义了各种Loader所需的API接口,开发者应该继承它实现各种的Loader。 +在各种数据集的Loader当中,至少应该编写如下内容: - - _load 函数:从一个数据文件中读取数据到一个 :class:`~fastNLP.DataSet` - - load 函数(可以使用基类的方法):从一个或多个数据文件中读取数据到一个或多个 :class:`~fastNLP.DataSet` - - process 函数:一个或多个从数据文件中读取数据,并处理成可以训练的 :class:`~fastNLP.io.DataBundle` + - _load 函数:从一个数据文件中读取数据,返回一个 :class:`~fastNLP.DataSet` + - load 函数:从文件或者文件夹中读取数据并组装成 :class:`~fastNLP.io.data_bundle.DataBundle` - **\*process函数中可以调用load函数或_load函数** - -DataSetLoader的_load或者load函数返回的 :class:`~fastNLP.DataSet` 当中,内容为数据集的文本信息,process函数返回的 -:class:`~fastNLP.io.DataBundle` 当中, `datasets` 的内容为已经index好的、可以直接被 :class:`~fastNLP.Trainer` -接受的内容。 +Loader的load函数返回的 :class:`~fastNLP.io.data_bundle.DataBundle` 里面包含了数据集的原始数据。 -------------------------------------------------------- -Part III: 不同数据类型的DataSetLoader +Part III: 不同格式类型的基础Loader -------------------------------------------------------- -:class:`~fastNLP.io.dataset_loader.CSVLoader` +:class:`~fastNLP.io.loader.CSVLoader` 读取CSV类型的数据集文件。例子如下: .. 
code-block:: python + from fastNLP.io.loader import CSVLoader data_set_loader = CSVLoader( headers=('words', 'target'), sep='\t' ) @@ -67,17 +63,18 @@ Part III: 不同数据类型的DataSetLoader The performances are an absolute joy . 4 -:class:`~fastNLP.io.dataset_loader.JsonLoader` +:class:`~fastNLP.io.loader.JsonLoader` 读取Json类型的数据集文件,数据必须按行存储,每行是一个包含各类属性的Json对象。例子如下: .. code-block:: python - data_set_loader = JsonLoader( + from fastNLP.io.loader import JsonLoader + oader = JsonLoader( fields={'sentence1': 'words1', 'sentence2': 'words2', 'gold_label': 'target'} ) # 表示将Json对象中'sentence1'、'sentence2'和'gold_label'对应的值赋给'words1'、'words2'、'target'这三个fields - data_set = data_set_loader._load('path/to/your/file') + data_set = loader._load('path/to/your/file') 数据集内容样例如下 :: @@ -86,139 +83,68 @@ Part III: 不同数据类型的DataSetLoader {"annotator_labels": ["entailment"], "captionID": "3416050480.jpg#4", "gold_label": "entailment", "pairID": "3416050480.jpg#4r1e", "sentence1": "A person on a horse jumps over a broken down airplane.", "sentence1_binary_parse": "( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))", "sentence2": "A person is outdoors, on a horse.", "sentence2_binary_parse": "( ( A person ) ( ( ( ( is outdoors ) , ) ( on ( a horse ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (ADVP (RB outdoors)) (, ,) (PP (IN on) (NP (DT a) (NN horse)))) (. .)))"} ------------------------------------------ -Part IV: DataSetLoader举例 +Part IV: 使用Pipe对数据集进行预处理 ------------------------------------------ -以Matching任务为例子: - - :class:`~fastNLP.io.data_loader.MatchingLoader` - 我们在fastNLP当中封装了一个Matching任务数据集的数据加载类: :class:`~fastNLP.io.data_loader.MatchingLoader` . - - 在MatchingLoader类当中我们封装了一个对数据集中的文本内容进行进一步的预处理的函数: - :meth:`~fastNLP.io.data_loader.MatchingLoader.process` - 这个函数具有各种预处理option,如: - - 是否将文本转成全小写 - - 是否需要序列长度信息,需要什么类型的序列长度信息 - - 是否需要用BertTokenizer来获取序列的WordPiece信息 - - 等等 +在fastNLP中,我们采用 :class:`~fastNLP.io.pipe.Pipe` 来作为加载数据集的基类。 +:class:`~fastNLP.io.pipe.Pipe` 定义了各种Pipe所需的API接口,开发者应该继承它实现各种的Pipe。 +在各种数据集的Pipe当中,至少应该编写如下内容: - 具体内容参见 :meth:`fastNLP.io.MatchingLoader.process` 。 + - process 函数:对输入的 :class:`~fastNLP.io.data_bundle.DataBundle` 进行处理(如构建词表、 + 将dataset的文本内容转成index等等),然后返回该 :class:`~fastNLP.io.data_bundle.DataBundle` + - process_from_file 函数:输入数据集所在文件夹,读取内容并组装成 :class:`~fastNLP.io.data_bundle.DataBundle` , + 然后调用相对应的process函数对数据进行预处理 - :class:`~fastNLP.io.data_loader.SNLILoader` - 一个关于SNLI数据集的DataSetLoader。SNLI数据集来自 - `SNLI Data Set `_ . +以SNLI数据集为例,写一个自定义Pipe的例子如下: - 在 :class:`~fastNLP.io.data_loader.SNLILoader` 的 :meth:`~fastNLP.io.data_loader.SNLILoader._load` - 函数中,我们用以下代码将数据集内容从文本文件读入内存: +.. code-block:: python - .. code-block:: python + from fastNLP.io.loader import SNLILoader + from fastNLP.io.pipe import MatchingPipe - data = SNLILoader().process( - paths='path/to/snli/data', to_lower=False, seq_len_type='seq_len', - get_index=True, concat=False, - ) - print(data) + class MySNLIPipe(MatchingPipe): - 输出的内容是:: + def process(self, data_bundle): + data_bundle = super(MySNLIPipe, self).process(data_bundle) + # MatchingPipe类里封装了一个关于matching任务的process函数,可以直接继承使用 + # 如果有需要进行额外的预处理操作可以在这里加入您的代码 + return data_bundle - In total 3 datasets: - train has 549367 instances. - dev has 9842 instances. - test has 9824 instances. 
- In total 2 vocabs: - words has 43154 entries. - target has 3 entries. + def process_from_file(self, paths=None): + data_bundle = SNLILoader().load(paths) # 使用SNLILoader读取原始数据集 + # SNLILoader的load函数中,paths如果为None则会自动下载 + return self.process(data_bundle) # 调用相对应的process函数对data_bundle进行处理 +调用Pipe示例: - 这里的data是一个 :class:`~fastNLP.io.base_loader.DataBundle` ,取 ``datasets`` 字典里的内容即可直接传入 - :class:`~fastNLP.Trainer` 或者 :class:`~fastNLP.Tester` 进行训练或者测试。 +.. code-block:: python - :class:`~fastNLP.io.data_loader.IMDBLoader` - 以IMDB数据集为例,在 :class:`~fastNLP.io.data_loader.IMDBLoader` 的 :meth:`~fastNLP.io.data_loader.IMDBLoader._load` - 函数中,我们用以下代码将数据集内容从文本文件读入内存: + from fastNLP.io.pipe import SNLIBertPipe + data_bundle = SNLIBertPipe(lower=True, tokenizer=arg.tokenizer).process_from_file() + print(data_bundle) - .. code-block:: python +输出的内容是:: - data = IMDBLoader().process( - paths={'train': 'path/to/train/file', 'test': 'path/to/test/file'} - ) - print(data) + In total 3 datasets: + train has 549367 instances. + dev has 9842 instances. + test has 9824 instances. + In total 2 vocabs: + words has 34184 entries. + target has 3 entries. - 输出的内容是:: - - In total 3 datasets: - train has 22500 instances. - test has 25000 instances. - dev has 2500 instances. - In total 2 vocabs: - words has 82846 entries. - target has 2 entries. - - - 这里的将原来的train集按9:1的比例分成了训练集和验证集。 +这里表示一共有3个数据集和2个词表。其中: + - 3个数据集分别为train、dev、test数据集,分别有549367、9842、9824个instance + - 2个词表分别为words词表与target词表。其中words词表为句子文本所构建的词表,一共有34184个单词; + target词表为目标标签所构建的词表,一共有3种标签。(注:如果有多个输入,则句子文本所构建的词表将 + 会被命名为words1以对应相对应的列名) ------------------------------------------ -Part V: fastNLP封装好的数据集加载器 +Part V: fastNLP封装好的Loader和Pipe ------------------------------------------ -fastNLP封装好的数据集加载器可以适用于多种类型的任务: - - - `文本分类任务`_ - - `序列标注任务`_ - - `Matching任务`_ - - -文本分类任务 -------------------- - -========================== ================================================================== -数据集名称 数据集加载器 --------------------------- ------------------------------------------------------------------ -IMDb :class:`~fastNLP.io.data_loader.IMDBLoader` --------------------------- ------------------------------------------------------------------ -SST :class:`~fastNLP.io.data_loader.SSTLoader` --------------------------- ------------------------------------------------------------------ -SST-2 :class:`~fastNLP.io.data_loader.SST2Loader` --------------------------- ------------------------------------------------------------------ -Yelp Polarity :class:`~fastNLP.io.data_loader.YelpLoader` --------------------------- ------------------------------------------------------------------ -Yelp Full :class:`~fastNLP.io.data_loader.YelpLoader` --------------------------- ------------------------------------------------------------------ -MTL16 :class:`~fastNLP.io.data_loader.MTL16Loader` -========================== ================================================================== - - - -序列标注任务 -------------------- - -========================== ================================================================== -数据集名称 数据集加载器 --------------------------- ------------------------------------------------------------------ -Conll :class:`~fastNLP.io.data_loader.ConllLoader` --------------------------- ------------------------------------------------------------------ -Conll2003 :class:`~fastNLP.io.data_loader.Conll2003Loader` --------------------------- ------------------------------------------------------------------ -人民日报数据集 :class:`~fastNLP.io.data_loader.PeopleDailyCorpusLoader` 
-========================== ================================================================== - - - -Matching任务 -------------------- - -========================== ================================================================== -数据集名称 数据集加载器 --------------------------- ------------------------------------------------------------------ -SNLI :class:`~fastNLP.io.data_loader.SNLILoader` --------------------------- ------------------------------------------------------------------ -MultiNLI :class:`~fastNLP.io.data_loader.MNLILoader` --------------------------- ------------------------------------------------------------------ -QNLI :class:`~fastNLP.io.data_loader.QNLILoader` --------------------------- ------------------------------------------------------------------ -RTE :class:`~fastNLP.io.data_loader.RTELoader` --------------------------- ------------------------------------------------------------------ -Quora Pair Dataset :class:`~fastNLP.io.data_loader.QuoraLoader` -========================== ================================================================== +fastNLP封装了多种任务/数据集的Loader和Pipe并提供自动下载功能,具体参见文档 + +`fastNLP可加载的embedding与数据集 `_ diff --git a/docs/source/tutorials/tutorial_3_embedding.rst b/docs/source/tutorials/tutorial_3_embedding.rst index 489b43b4..07dc30bc 100644 --- a/docs/source/tutorials/tutorial_3_embedding.rst +++ b/docs/source/tutorials/tutorial_3_embedding.rst @@ -12,6 +12,7 @@ - `Part IV: 使用预训练的Contextual Embedding(ELMo & BERT)`_ - `Part V: 使用character-level的embedding`_ - `Part VI: 叠加使用多个embedding`_ + - `Part VII: fastNLP支持的预训练Embedding`_ @@ -35,12 +36,14 @@ Part II: 使用随机初始化的embedding .. code-block:: python + from fastNLP import Embedding embed = Embedding(10000, 50) 也可以传入一个初始化的参数矩阵: .. code-block:: python + from fastNLP import Embedding embed = Embedding(init_embed) 其中的init_embed可以是torch.FloatTensor、torch.nn.Embedding或者numpy.ndarray。 @@ -59,6 +62,7 @@ Embedding,例子如下: .. 
code-block:: python + from fastNLP import StaticEmbedding embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50', requires_grad=True) vocab为根据数据集构建的词表,model_dir_or_name可以是一个路径,也可以是embedding模型的名称: @@ -67,34 +71,13 @@ vocab为根据数据集构建的词表,model_dir_or_name可以是一个路径 和word2vec类型的权重文件都支持) 2 如果传入的是模型名称,那么fastNLP将会根据名称查找embedding模型,如果在cache目录下找到模型则会 - 自动加载;如果找不到则会自动下载。可以通过环境变量 ``FASTNLP_CACHE_DIR`` 来自定义cache目录,如:: + 自动加载;如果找不到则会自动下载到cache目录。默认的cache目录为 `~/.fastNLP` 文件夹。可以通过环境 + 变量 ``FASTNLP_CACHE_DIR`` 来自定义cache目录,如:: $ FASTNLP_CACHE_DIR=~/fastnlp_cache_dir python your_python_file.py 这个命令表示fastNLP将会在 `~/fastnlp_cache_dir` 这个目录下寻找模型,找不到则会自动将模型下载到这个目录 -目前支持的静态embedding模型有: - - ========================== ================================ - 模型名称 模型 - -------------------------- -------------------------------- - en glove.840B.300d - -------------------------- -------------------------------- - en-glove-840d-300 glove.840B.300d - -------------------------- -------------------------------- - en-glove-6b-50 glove.6B.50d - -------------------------- -------------------------------- - en-word2vec-300 谷歌word2vec 300维 - -------------------------- -------------------------------- - en-fasttext 英文fasttext 300维 - -------------------------- -------------------------------- - cn 腾讯中文词向量 200维 - -------------------------- -------------------------------- - cn-fasttext 中文fasttext 300维 - ========================== ================================ - - - ----------------------------------------------------------- Part IV: 使用预训练的Contextual Embedding(ELMo & BERT) ----------------------------------------------------------- @@ -106,62 +89,20 @@ Part IV: 使用预训练的Contextual Embedding(ELMo & BERT) .. code-block:: python + from fastNLP import ElmoEmbedding embed = ElmoEmbedding(vocab, model_dir_or_name='small', requires_grad=False) -目前支持的ElmoEmbedding模型有: - - ========================== ================================ - 模型名称 模型 - -------------------------- -------------------------------- - small allennlp ELMo的small - -------------------------- -------------------------------- - medium allennlp ELMo的medium - -------------------------- -------------------------------- - original allennlp ELMo的original - -------------------------- -------------------------------- - 5.5b-original allennlp ELMo的5.5B original - ========================== ================================ - BERT-embedding的使用方法如下: .. 
code-block:: python + from fastNLP import BertEmbedding embed = BertEmbedding( vocab, model_dir_or_name='en-base-cased', requires_grad=False, layers='4,-2,-1' ) 其中layers变量表示需要取哪几层的encode结果。 -目前支持的BertEmbedding模型有: - - ========================== ==================================== - 模型名称 模型 - -------------------------- ------------------------------------ - en bert-base-cased - -------------------------- ------------------------------------ - en-base-uncased bert-base-uncased - -------------------------- ------------------------------------ - en-base-cased bert-base-cased - -------------------------- ------------------------------------ - en-large-uncased bert-large-uncased - -------------------------- ------------------------------------ - en-large-cased bert-large-cased - -------------------------- ------------------------------------ - -------------------------- ------------------------------------ - en-large-cased-wwm bert-large-cased-whole-word-mask - -------------------------- ------------------------------------ - en-large-uncased-wwm bert-large-uncased-whole-word-mask - -------------------------- ------------------------------------ - en-base-cased-mrpc bert-base-cased-finetuned-mrpc - -------------------------- ------------------------------------ - -------------------------- ------------------------------------ - multilingual bert-base-multilingual-cased - -------------------------- ------------------------------------ - multilingual-base-uncased bert-base-multilingual-uncased - -------------------------- ------------------------------------ - multilingual-base-cased bert-base-multilingual-cased - ========================== ==================================== - ----------------------------------------------------- Part V: 使用character-level的embedding ----------------------------------------------------- @@ -173,6 +114,7 @@ CNNCharEmbedding的使用例子如下: .. code-block:: python + from fastNLP import CNNCharEmbedding embed = CNNCharEmbedding(vocab, embed_size=100, char_emb_size=50) 这表示这个CNNCharEmbedding当中character的embedding维度大小为50,返回的embedding结果维度大小为100。 @@ -181,12 +123,12 @@ CNNCharEmbedding的使用例子如下: .. code-block:: python + from fastNLP import LSTMCharEmbedding embed = LSTMCharEmbedding(vocab, embed_size=100, char_emb_size=50) 这表示这个LSTMCharEmbedding当中character的embedding维度大小为50,返回的embedding结果维度大小为100。 - ----------------------------------------------------- Part VI: 叠加使用多个embedding ----------------------------------------------------- @@ -197,6 +139,7 @@ Part VI: 叠加使用多个embedding .. code-block:: python + from fastNLP import StaticEmbedding, StackEmbedding embed_1 = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50', requires_grad=True) embed_2 = StaticEmbedding(vocab, model_dir_or_name='en-word2vec-300', requires_grad=True) @@ -208,7 +151,17 @@ StackEmbedding会把多个embedding的结果拼接起来,如上面例子的sta .. 
code-block:: python + from fastNLP import StaticEmbedding, StackEmbedding, ElmoEmbedding elmo_embedding = ElmoEmbedding(vocab, model_dir_or_name='medium', layers='0,1,2', requires_grad=False) glove_embedding = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50', requires_grad=True) stack_embed = StackEmbedding([elmo_embedding, glove_embedding]) + +------------------------------------------ +Part VII: fastNLP支持的预训练Embedding +------------------------------------------ + +fastNLP支持多种预训练Embedding并提供自动下载功能,具体参见文档 + +`fastNLP可加载的embedding与数据集 `_ + diff --git a/docs/source/user/tutorials.rst b/docs/source/user/tutorials.rst index 196f9c29..3e9e1b54 100644 --- a/docs/source/user/tutorials.rst +++ b/docs/source/user/tutorials.rst @@ -8,7 +8,7 @@ fastNLP 详细使用教程 :maxdepth: 1 使用DataSet预处理文本 - 使用DataSetLoader加载数据集 + 使用Loader和Pipe加载并处理数据集 使用Embedding模块将文本转成向量 动手实现一个文本分类器I-使用Trainer和Tester快速训练和测试 动手实现一个文本分类器II-使用DataSetIter实现自定义训练过程 diff --git a/fastNLP/io/__init__.py b/fastNLP/io/__init__.py index a3ea0148..8ed1956a 100644 --- a/fastNLP/io/__init__.py +++ b/fastNLP/io/__init__.py @@ -38,7 +38,6 @@ __all__ = [ 'JsonLoader', 'CWSLoader', - "CWSPipe", 'MNLILoader', "QuoraLoader", @@ -46,6 +45,8 @@ __all__ = [ "QNLILoader", "RTELoader", + "Pipe", + "YelpFullPipe", "YelpPolarityPipe", "SSTPipe", @@ -59,6 +60,8 @@ __all__ = [ "PeopleDailyPipe", "WeiboNERPipe", + "CWSPipe", + "MatchingBertPipe", "RTEBertPipe", "SNLIBertPipe", diff --git a/fastNLP/io/file_utils.py b/fastNLP/io/file_utils.py index 9dbb515d..bd02158e 100644 --- a/fastNLP/io/file_utils.py +++ b/fastNLP/io/file_utils.py @@ -59,7 +59,7 @@ PRETRAIN_STATIC_FILES = { 'en-fasttext-crawl': "crawl-300d-2M.vec.zip", 'cn': "tencent_cn.zip", - 'cn-tencent': "tencent_cn.txt.zip", + 'cn-tencent': "tencent_cn.zip", 'cn-fasttext': "cc.zh.300.vec.gz", 'cn-sgns-literature-word': 'sgns.literature.word.txt.zip', } diff --git a/fastNLP/io/loader/__init__.py b/fastNLP/io/loader/__init__.py index 820c33be..6c23f213 100644 --- a/fastNLP/io/loader/__init__.py +++ b/fastNLP/io/loader/__init__.py @@ -62,8 +62,8 @@ __all__ = [ "PeopleDailyNERLoader", "WeiboNERLoader", - # 'CSVLoader', - # 'JsonLoader', + 'CSVLoader', + 'JsonLoader', 'CWSLoader', diff --git a/fastNLP/io/loader/classification.py b/fastNLP/io/loader/classification.py index 67e19773..f64a26e7 100644 --- a/fastNLP/io/loader/classification.py +++ b/fastNLP/io/loader/classification.py @@ -5,7 +5,6 @@ import warnings import os import random import shutil -import numpy as np import glob import time diff --git a/fastNLP/io/loader/conll.py b/fastNLP/io/loader/conll.py index 5dc4c6d7..b5241cff 100644 --- a/fastNLP/io/loader/conll.py +++ b/fastNLP/io/loader/conll.py @@ -11,9 +11,10 @@ import shutil import time import random + class ConllLoader(Loader): """ - 别名::class:`fastNLP.io.ConllLoader` :class:`fastNLP.io.data_loader.ConllLoader` + 别名::class:`fastNLP.io.ConllLoader` :class:`fastNLP.io.loader.ConllLoader` ConllLoader支持读取的数据格式: 以空行隔开两个sample,除了分割行,每一行用空格或者制表符隔开不同的元素。如下例所示: diff --git a/fastNLP/io/loader/csv.py b/fastNLP/io/loader/csv.py index 166f912b..5195cc8e 100644 --- a/fastNLP/io/loader/csv.py +++ b/fastNLP/io/loader/csv.py @@ -6,7 +6,7 @@ from .loader import Loader class CSVLoader(Loader): """ - 别名::class:`fastNLP.io.CSVLoader` :class:`fastNLP.io.dataset_loader.CSVLoader` + 别名::class:`fastNLP.io.CSVLoader` :class:`fastNLP.io.loader.CSVLoader` 读取CSV格式的数据集, 返回 ``DataSet`` 。 diff --git a/fastNLP/io/pipe/matching.py b/fastNLP/io/pipe/matching.py index 0d1b4e82..ffa6375b 100644 --- 
a/fastNLP/io/pipe/matching.py +++ b/fastNLP/io/pipe/matching.py @@ -181,8 +181,8 @@ class MatchingPipe(Pipe): "This site includes a...", "The Government Executive...", "not_entailment" "...", "..." - :param data_bundle: - :return: + :param data_bundle: 通过loader读取得到的data_bundle,里面包含了数据集的原始数据内容 + :return: data_bundle """ data_bundle = self._tokenize(data_bundle, [Const.RAW_WORDS(0), Const.RAW_WORDS(1)], [Const.INPUTS(0), Const.INPUTS(1)]) diff --git a/fastNLP/io/pipe/pipe.py b/fastNLP/io/pipe/pipe.py index a2b74301..cc45dee4 100644 --- a/fastNLP/io/pipe/pipe.py +++ b/fastNLP/io/pipe/pipe.py @@ -2,6 +2,9 @@ from .. import DataBundle class Pipe: + """ + 别名::class:`fastNLP.io.Pipe` :class:`fastNLP.io.pipe.Pipe` + """ def process(self, data_bundle: DataBundle) -> DataBundle: """ 对输入的DataBundle进行处理,然后返回该DataBundle。 From 9e16791c538b856184efd4095ab0faed5ff4d2ce Mon Sep 17 00:00:00 2001 From: ChenXin Date: Sun, 25 Aug 2019 17:08:19 +0800 Subject: [PATCH 100/153] fix some importing bugs --- fastNLP/io/pipe/cws.py | 84 ++++++++++++++++++++++++------------------ 1 file changed, 49 insertions(+), 35 deletions(-) diff --git a/fastNLP/io/pipe/cws.py b/fastNLP/io/pipe/cws.py index 6ea1ae0c..4ca0219c 100644 --- a/fastNLP/io/pipe/cws.py +++ b/fastNLP/io/pipe/cws.py @@ -1,10 +1,13 @@ +import re +from itertools import chain + from .pipe import Pipe +from .utils import _indexize from .. import DataBundle from ..loader import CWSLoader -from ... import Const -from itertools import chain -from .utils import _indexize -import re +from ...core.const import Const + + def _word_lens_to_bmes(word_lens): """ @@ -13,11 +16,11 @@ def _word_lens_to_bmes(word_lens): """ tags = [] for word_len in word_lens: - if word_len==1: + if word_len == 1: tags.append('S') else: tags.append('B') - tags.extend(['M']*(word_len-2)) + tags.extend(['M'] * (word_len - 2)) tags.append('E') return tags @@ -30,10 +33,10 @@ def _word_lens_to_segapp(word_lens): """ tags = [] for word_len in word_lens: - if word_len==1: + if word_len == 1: tags.append('SEG') else: - tags.extend(['APP']*(word_len-1)) + tags.extend(['APP'] * (word_len - 1)) tags.append('SEG') return tags @@ -97,13 +100,21 @@ def _digit_span_to_special_tag(span): else: return '' + def _find_and_replace_digit_spans(line): - # only consider words start with number, contains '.', characters. - # If ends with space, will be processed - # If ends with Chinese character, will be processed - # If ends with or contains english char, not handled. - # floats are replaced by - # otherwise unkdgt + """ + only consider words start with number, contains '.', characters. + + If ends with space, will be processed + + If ends with Chinese character, will be processed + + If ends with or contains english char, not handled. + + floats are replaced by + + otherwise unkdgt + """ new_line = '' pattern = '\d[\d\\.﹒·]*(?=[\u4e00-\u9fff ,%,。!<-“])' prev_end = 0 @@ -136,17 +147,18 @@ class CWSPipe(Pipe): :param bool bigrams: 是否增加一列bigram. bigram的构成是['复', '旦', '大', '学', ...]->["复旦", "旦大", ...] :param bool trigrams: 是否增加一列trigram. trigram的构成是 ['复', '旦', '大', '学', ...]->["复旦大", "旦大学", ...] 
""" + def __init__(self, dataset_name=None, encoding_type='bmes', replace_num_alpha=True, bigrams=False, trigrams=False): - if encoding_type=='bmes': + if encoding_type == 'bmes': self.word_lens_to_tags = _word_lens_to_bmes else: self.word_lens_to_tags = _word_lens_to_segapp - + self.dataset_name = dataset_name self.bigrams = bigrams self.trigrams = trigrams self.replace_num_alpha = replace_num_alpha - + def _tokenize(self, data_bundle): """ 将data_bundle中的'chars'列切分成一个一个的word. @@ -162,10 +174,10 @@ class CWSPipe(Pipe): char = [] subchar = [] for c in word: - if c=='<': + if c == '<': subchar.append(c) continue - if c=='>' and subchar[0]=='<': + if c == '>' and subchar[0] == '<': char.append(''.join(subchar)) subchar = [] if subchar: @@ -175,12 +187,12 @@ class CWSPipe(Pipe): char.extend(subchar) chars.append(char) return chars - + for name, dataset in data_bundle.datasets.items(): dataset.apply_field(split_word_into_chars, field_name=Const.CHAR_INPUT, new_field_name=Const.CHAR_INPUT) return data_bundle - + def process(self, data_bundle: DataBundle) -> DataBundle: """ 可以处理的DataSet需要包含raw_words列 @@ -196,42 +208,43 @@ class CWSPipe(Pipe): :return: """ data_bundle.copy_field(Const.RAW_WORD, Const.CHAR_INPUT) - + if self.replace_num_alpha: data_bundle.apply_field(_find_and_replace_alpha_spans, Const.CHAR_INPUT, Const.CHAR_INPUT) data_bundle.apply_field(_find_and_replace_digit_spans, Const.CHAR_INPUT, Const.CHAR_INPUT) - + self._tokenize(data_bundle) - + for name, dataset in data_bundle.datasets.items(): - dataset.apply_field(lambda chars:self.word_lens_to_tags(map(len, chars)), field_name=Const.CHAR_INPUT, + dataset.apply_field(lambda chars: self.word_lens_to_tags(map(len, chars)), field_name=Const.CHAR_INPUT, new_field_name=Const.TARGET) - dataset.apply_field(lambda chars:list(chain(*chars)), field_name=Const.CHAR_INPUT, + dataset.apply_field(lambda chars: list(chain(*chars)), field_name=Const.CHAR_INPUT, new_field_name=Const.CHAR_INPUT) input_field_names = [Const.CHAR_INPUT] if self.bigrams: for name, dataset in data_bundle.datasets.items(): - dataset.apply_field(lambda chars: [c1+c2 for c1, c2 in zip(chars, chars[1:]+[''])], + dataset.apply_field(lambda chars: [c1 + c2 for c1, c2 in zip(chars, chars[1:] + [''])], field_name=Const.CHAR_INPUT, new_field_name='bigrams') input_field_names.append('bigrams') if self.trigrams: for name, dataset in data_bundle.datasets.items(): - dataset.apply_field(lambda chars: [c1+c2+c3 for c1, c2, c3 in zip(chars, chars[1:]+[''], chars[2:]+['']*2)], + dataset.apply_field(lambda chars: [c1 + c2 + c3 for c1, c2, c3 in + zip(chars, chars[1:] + [''], chars[2:] + [''] * 2)], field_name=Const.CHAR_INPUT, new_field_name='trigrams') input_field_names.append('trigrams') - + _indexize(data_bundle, input_field_names, Const.TARGET) - + input_fields = [Const.TARGET, Const.INPUT_LEN] + input_field_names target_fields = [Const.TARGET, Const.INPUT_LEN] for name, dataset in data_bundle.datasets.items(): dataset.add_seq_len(Const.CHAR_INPUT) - + data_bundle.set_input(*input_fields) data_bundle.set_target(*target_fields) - + return data_bundle - + def process_from_file(self, paths=None) -> DataBundle: """ @@ -239,8 +252,9 @@ class CWSPipe(Pipe): :return: """ if self.dataset_name is None and paths is None: - raise RuntimeError("You have to set `paths` when calling process_from_file() or `dataset_name `when initialization.") + raise RuntimeError( + "You have to set `paths` when calling process_from_file() or `dataset_name `when initialization.") if self.dataset_name is not None and 
paths is not None: raise RuntimeError("You cannot specify `paths` and `dataset_name` simultaneously") data_bundle = CWSLoader(self.dataset_name).load(paths) - return self.process(data_bundle) \ No newline at end of file + return self.process(data_bundle) From 34e17e97935f69aef54a9d75694713f0823c41fe Mon Sep 17 00:00:00 2001 From: ChenXin Date: Mon, 26 Aug 2019 10:07:52 +0800 Subject: [PATCH 101/153] update the fastNLP.__init__ : use loader&pipe to replace data_loader --- fastNLP/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fastNLP/__init__.py b/fastNLP/__init__.py index 2720f292..19efac31 100644 --- a/fastNLP/__init__.py +++ b/fastNLP/__init__.py @@ -65,8 +65,8 @@ __all__ = [ ] __version__ = '0.4.5' -from .core import * +from . import embeddings from . import models from . import modules -from . import embeddings -from .io import data_loader +from .core import * +from .io import loader, pipe From 9535ec60b65e7a2bc70394f444b5067bcb161ad9 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Mon, 26 Aug 2019 10:17:43 +0800 Subject: [PATCH 102/153] update the doc system: use customized tool to generate the rst files --- docs/count.py | 133 +++++++----------- docs/source/conf.py | 8 +- docs/source/fastNLP.core.batch.rst | 6 +- docs/source/fastNLP.core.callback.rst | 6 +- docs/source/fastNLP.core.const.rst | 6 +- docs/source/fastNLP.core.dataset.rst | 6 +- docs/source/fastNLP.core.field.rst | 6 +- docs/source/fastNLP.core.instance.rst | 6 +- docs/source/fastNLP.core.losses.rst | 6 +- docs/source/fastNLP.core.metrics.rst | 6 +- docs/source/fastNLP.core.optimizer.rst | 6 +- docs/source/fastNLP.core.rst | 9 +- docs/source/fastNLP.core.sampler.rst | 6 +- docs/source/fastNLP.core.tester.rst | 6 +- docs/source/fastNLP.core.trainer.rst | 6 +- docs/source/fastNLP.core.utils.rst | 6 +- docs/source/fastNLP.core.vocabulary.rst | 6 +- .../fastNLP.embeddings.bert_embedding.rst | 10 +- .../fastNLP.embeddings.char_embedding.rst | 10 +- ...astNLP.embeddings.contextual_embedding.rst | 7 + .../fastNLP.embeddings.elmo_embedding.rst | 10 +- docs/source/fastNLP.embeddings.embedding.rst | 6 +- docs/source/fastNLP.embeddings.rst | 10 +- .../fastNLP.embeddings.stack_embedding.rst | 10 +- .../fastNLP.embeddings.static_embedding.rst | 10 +- docs/source/fastNLP.embeddings.utils.rst | 6 +- docs/source/fastNLP.io.data_bundle.rst | 10 +- docs/source/fastNLP.io.data_loader.rst | 8 -- docs/source/fastNLP.io.dataset_loader.rst | 9 +- docs/source/fastNLP.io.embed_loader.rst | 10 +- docs/source/fastNLP.io.file_utils.rst | 10 +- docs/source/fastNLP.io.loader.rst | 5 +- docs/source/fastNLP.io.model_io.rst | 10 +- docs/source/fastNLP.io.pipe.rst | 5 +- docs/source/fastNLP.io.rst | 21 +-- docs/source/fastNLP.io.utils.rst | 6 +- .../source/fastNLP.models.biaffine_parser.rst | 10 +- ...fastNLP.models.cnn_text_classification.rst | 10 +- docs/source/fastNLP.models.rst | 9 +- .../fastNLP.models.sequence_labeling.rst | 10 +- docs/source/fastNLP.models.snli.rst | 6 +- .../fastNLP.models.star_transformer.rst | 10 +- docs/source/fastNLP.modules.decoder.rst | 5 +- docs/source/fastNLP.modules.encoder.rst | 5 +- docs/source/fastNLP.modules.rst | 15 +- docs/source/fastNLP.modules.utils.rst | 6 +- docs/source/fastNLP.rst | 9 +- 47 files changed, 223 insertions(+), 279 deletions(-) create mode 100644 docs/source/fastNLP.embeddings.contextual_embedding.rst delete mode 100644 docs/source/fastNLP.io.data_loader.rst diff --git a/docs/count.py b/docs/count.py index d906f4c0..e1aad115 100644 --- a/docs/count.py +++ 
b/docs/count.py @@ -1,98 +1,65 @@ import os +import sys -def find_all(path='../fastNLP'): - head_list = [] - alias_list = [] - for path, dirs, files in os.walk(path): +def find_all_modules(): + modules = {} + children = {} + to_doc = set() + root = '../fastNLP' + for path, dirs, files in os.walk(root): for file in files: if file.endswith('.py'): name = ".".join(path.split('/')[1:]) if file.split('.')[0] != "__init__": name = name + '.' + file.split('.')[0] - if len(name.split('.')) < 3 or name.startswith('fastNLP.core'): - heads, alias = find_one(path + '/' + file) - for h in heads: - head_list.append(name + "." + h) - for a in alias: - alias_list.append(a) - heads = {} - for h in head_list: - end = h.split('.')[-1] - file = h[:-len(end) - 1] - if end not in heads: - heads[end] = set() - heads[end].add(file) - alias = {} - for a in alias_list: - for each in a: - end = each.split('.')[-1] - file = each[:-len(end) - 1] - if end not in alias: - alias[end] = set() - alias[end].add(file) - print("IN alias NOT IN heads") - for item in alias: - if item not in heads: - print(item, alias[item]) - elif len(heads[item]) != 2: - print(item, alias[item], heads[item]) - - print("\n\nIN heads NOT IN alias") - for item in heads: - if item not in alias: - print(item, heads[item]) + __import__(name) + m = sys.modules[name] + modules[name] = m + try: + m.__all__ + except: + print(name, "__all__ missing") + continue + if m.__doc__ is None: + print(name, "__doc__ missing") + continue + if "undocumented" not in m.__doc__: + to_doc.add(name) + for module in to_doc: + t = ".".join(module.split('.')[:-1]) + if t in to_doc: + if t not in children: + children[t] = set() + children[t].add(module) + for m in children: + children[m] = sorted(children[m]) + return modules, to_doc, children -def find_class(path): - with open(path, 'r') as fin: - lines = fin.readlines() - pars = {} - for i, line in enumerate(lines): - if line.strip().startswith('class'): - line = line.strip()[len('class'):-1].strip() - if line[-1] == ')': - line = line[:-1].split('(') - name = line[0].strip() - parents = line[1].split(',') - for i in range(len(parents)): - parents[i] = parents[i].strip() - if len(parents) == 1: - pars[name] = parents[0] - else: - pars[name] = tuple(parents) - return pars +def create_rst_file(modules, name, children): + m = modules[name] + with open("./source/" + name + ".rst", "w") as fout: + t = "=" * len(name) + fout.write(name + "\n") + fout.write(t + "\n") + fout.write("\n") + fout.write(".. automodule:: " + name + "\n") + if len(m.__all__) > 0: + fout.write(" :members: " + ", ".join(m.__all__) + "\n") + fout.write(" :inherited-members:\n") + fout.write("\n") + if name in children: + fout.write("子模块\n------\n\n.. 
toctree::\n\n") + for module in children[name]: + fout.write(" " + module + "\n") -def find_one(path): - head_list = [] - alias = [] - with open(path, 'r') as fin: - lines = fin.readlines() - flag = False - for i, line in enumerate(lines): - if line.strip().startswith('__all__'): - line = line.strip()[len('__all__'):].strip() - if line[-1] == ']': - line = line[1:-1].strip()[1:].strip() - head_list.append(line.strip("\"").strip("\'").strip()) - else: - flag = True - elif line.strip() == ']': - flag = False - elif flag: - line = line.strip()[:-1].strip("\"").strip("\'").strip() - if len(line) == 0 or line[0] == '#': - continue - head_list.append(line) - if line.startswith('def') or line.startswith('class'): - if lines[i + 2].strip().startswith("别名:"): - names = lines[i + 2].strip()[len("别名:"):].split() - names[0] = names[0][len(":class:`"):-1] - names[1] = names[1][len(":class:`"):-1] - alias.append((names[0], names[1])) - return head_list, alias +def main(): + modules, to_doc, children = find_all_modules() + for name in to_doc: + create_rst_file(modules, name, children) if __name__ == "__main__": - find_all() # use to check __all__ + main() diff --git a/docs/source/conf.py b/docs/source/conf.py index 2e10bc89..83cb7185 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -48,12 +48,14 @@ extensions = [ autodoc_default_options = { 'member-order': 'bysource', 'special-members': '__init__', - 'undoc-members': True, + 'undoc-members': False, } +autoclass_content = "class" + # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] - +# template_bridge # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # @@ -113,7 +115,7 @@ html_static_path = ['_static'] # -- Options for HTMLHelp output --------------------------------------------- # Output file base name for HTML help builder. -htmlhelp_basename = 'fastNLPdoc' +htmlhelp_basename = 'fastNLP doc' # -- Options for LaTeX output ------------------------------------------------ diff --git a/docs/source/fastNLP.core.batch.rst b/docs/source/fastNLP.core.batch.rst index 03008b52..50ad6fed 100644 --- a/docs/source/fastNLP.core.batch.rst +++ b/docs/source/fastNLP.core.batch.rst @@ -2,6 +2,6 @@ fastNLP.core.batch ================== .. automodule:: fastNLP.core.batch - :members: - :undoc-members: - :show-inheritance: + :members: BatchIter, DataSetIter, TorchLoaderIter + :inherited-members: + diff --git a/docs/source/fastNLP.core.callback.rst b/docs/source/fastNLP.core.callback.rst index 74a7825d..d37ddb11 100644 --- a/docs/source/fastNLP.core.callback.rst +++ b/docs/source/fastNLP.core.callback.rst @@ -2,6 +2,6 @@ fastNLP.core.callback ===================== .. automodule:: fastNLP.core.callback - :members: - :undoc-members: - :show-inheritance: + :members: Callback, GradientClipCallback, EarlyStopCallback, FitlogCallback, EvaluateCallback, LRScheduler, ControlC, LRFinder, TensorboardCallback, WarmupCallback, SaveModelCallback, EchoCallback, TesterCallback, CallbackException, EarlyStopError + :inherited-members: + diff --git a/docs/source/fastNLP.core.const.rst b/docs/source/fastNLP.core.const.rst index 330a8883..82a1992e 100644 --- a/docs/source/fastNLP.core.const.rst +++ b/docs/source/fastNLP.core.const.rst @@ -2,6 +2,6 @@ fastNLP.core.const ================== .. 
automodule:: fastNLP.core.const - :members: - :undoc-members: - :show-inheritance: + :members: Const + :inherited-members: + diff --git a/docs/source/fastNLP.core.dataset.rst b/docs/source/fastNLP.core.dataset.rst index 1ad94bb6..e13d7f1c 100644 --- a/docs/source/fastNLP.core.dataset.rst +++ b/docs/source/fastNLP.core.dataset.rst @@ -2,6 +2,6 @@ fastNLP.core.dataset ==================== .. automodule:: fastNLP.core.dataset - :members: - :undoc-members: - :show-inheritance: + :members: DataSet + :inherited-members: + diff --git a/docs/source/fastNLP.core.field.rst b/docs/source/fastNLP.core.field.rst index 7fc099c9..73dad8af 100644 --- a/docs/source/fastNLP.core.field.rst +++ b/docs/source/fastNLP.core.field.rst @@ -2,6 +2,6 @@ fastNLP.core.field ================== .. automodule:: fastNLP.core.field - :members: - :undoc-members: - :show-inheritance: + :members: Padder, AutoPadder, EngChar2DPadder + :inherited-members: + diff --git a/docs/source/fastNLP.core.instance.rst b/docs/source/fastNLP.core.instance.rst index 6e496ac1..010567b9 100644 --- a/docs/source/fastNLP.core.instance.rst +++ b/docs/source/fastNLP.core.instance.rst @@ -2,6 +2,6 @@ fastNLP.core.instance ===================== .. automodule:: fastNLP.core.instance - :members: - :undoc-members: - :show-inheritance: + :members: Instance + :inherited-members: + diff --git a/docs/source/fastNLP.core.losses.rst b/docs/source/fastNLP.core.losses.rst index 8e63dfa1..daf246f8 100644 --- a/docs/source/fastNLP.core.losses.rst +++ b/docs/source/fastNLP.core.losses.rst @@ -2,6 +2,6 @@ fastNLP.core.losses =================== .. automodule:: fastNLP.core.losses - :members: - :undoc-members: - :show-inheritance: + :members: LossBase, LossFunc, LossInForward, CrossEntropyLoss, BCELoss, L1Loss, NLLLoss + :inherited-members: + diff --git a/docs/source/fastNLP.core.metrics.rst b/docs/source/fastNLP.core.metrics.rst index d3b87bb8..96748a78 100644 --- a/docs/source/fastNLP.core.metrics.rst +++ b/docs/source/fastNLP.core.metrics.rst @@ -2,6 +2,6 @@ fastNLP.core.metrics ==================== .. automodule:: fastNLP.core.metrics - :members: - :undoc-members: - :show-inheritance: + :members: MetricBase, AccuracyMetric, SpanFPreRecMetric, ExtractiveQAMetric + :inherited-members: + diff --git a/docs/source/fastNLP.core.optimizer.rst b/docs/source/fastNLP.core.optimizer.rst index c80be53f..44e45c4f 100644 --- a/docs/source/fastNLP.core.optimizer.rst +++ b/docs/source/fastNLP.core.optimizer.rst @@ -2,6 +2,6 @@ fastNLP.core.optimizer ====================== .. automodule:: fastNLP.core.optimizer - :members: - :undoc-members: - :show-inheritance: + :members: Optimizer, SGD, Adam, AdamW + :inherited-members: + diff --git a/docs/source/fastNLP.core.rst b/docs/source/fastNLP.core.rst index 08d161b7..56de46e9 100644 --- a/docs/source/fastNLP.core.rst +++ b/docs/source/fastNLP.core.rst @@ -2,12 +2,11 @@ fastNLP.core ============ .. 
automodule:: fastNLP.core - :members: - :undoc-members: - :show-inheritance: + :members: DataSet, Instance, FieldArray, Padder, AutoPadder, EngChar2DPadder, Vocabulary, DataSetIter, BatchIter, TorchLoaderIter, Const, Tester, Trainer, cache_results, seq_len_to_mask, get_seq_len, logger, Callback, GradientClipCallback, EarlyStopCallback, FitlogCallback, EvaluateCallback, LRScheduler, ControlC, LRFinder, TensorboardCallback, WarmupCallback, SaveModelCallback, EchoCallback, TesterCallback, CallbackException, EarlyStopError, LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, LossInForward, AccuracyMetric, SpanFPreRecMetric, ExtractiveQAMetric, Optimizer, SGD, Adam, AdamW, SequentialSampler, BucketSampler, RandomSampler, Sampler + :inherited-members: -Submodules ----------- +子模块 +------ .. toctree:: diff --git a/docs/source/fastNLP.core.sampler.rst b/docs/source/fastNLP.core.sampler.rst index 0110f0c0..56291894 100644 --- a/docs/source/fastNLP.core.sampler.rst +++ b/docs/source/fastNLP.core.sampler.rst @@ -2,6 +2,6 @@ fastNLP.core.sampler ==================== .. automodule:: fastNLP.core.sampler - :members: - :undoc-members: - :show-inheritance: + :members: Sampler, BucketSampler, SequentialSampler, RandomSampler + :inherited-members: + diff --git a/docs/source/fastNLP.core.tester.rst b/docs/source/fastNLP.core.tester.rst index 4d71a27b..90ec2a88 100644 --- a/docs/source/fastNLP.core.tester.rst +++ b/docs/source/fastNLP.core.tester.rst @@ -2,6 +2,6 @@ fastNLP.core.tester =================== .. automodule:: fastNLP.core.tester - :members: - :undoc-members: - :show-inheritance: + :members: Tester + :inherited-members: + diff --git a/docs/source/fastNLP.core.trainer.rst b/docs/source/fastNLP.core.trainer.rst index 60bf2d5b..92c08718 100644 --- a/docs/source/fastNLP.core.trainer.rst +++ b/docs/source/fastNLP.core.trainer.rst @@ -2,6 +2,6 @@ fastNLP.core.trainer ==================== .. automodule:: fastNLP.core.trainer - :members: - :undoc-members: - :show-inheritance: + :members: Trainer + :inherited-members: + diff --git a/docs/source/fastNLP.core.utils.rst b/docs/source/fastNLP.core.utils.rst index 3f80b4e8..027a43e9 100644 --- a/docs/source/fastNLP.core.utils.rst +++ b/docs/source/fastNLP.core.utils.rst @@ -2,6 +2,6 @@ fastNLP.core.utils ================== .. automodule:: fastNLP.core.utils - :members: - :undoc-members: - :show-inheritance: + :members: cache_results, seq_len_to_mask, get_seq_len + :inherited-members: + diff --git a/docs/source/fastNLP.core.vocabulary.rst b/docs/source/fastNLP.core.vocabulary.rst index ba9598b9..ac07a8c6 100644 --- a/docs/source/fastNLP.core.vocabulary.rst +++ b/docs/source/fastNLP.core.vocabulary.rst @@ -2,6 +2,6 @@ fastNLP.core.vocabulary ======================= .. automodule:: fastNLP.core.vocabulary - :members: - :undoc-members: - :show-inheritance: + :members: Vocabulary, VocabularyOption + :inherited-members: + diff --git a/docs/source/fastNLP.embeddings.bert_embedding.rst b/docs/source/fastNLP.embeddings.bert_embedding.rst index 24ceff1c..51828cb0 100644 --- a/docs/source/fastNLP.embeddings.bert_embedding.rst +++ b/docs/source/fastNLP.embeddings.bert_embedding.rst @@ -1,7 +1,7 @@ -fastNLP.embeddings.bert\_embedding -================================== +fastNLP.embeddings.bert_embedding +================================= .. 
automodule:: fastNLP.embeddings.bert_embedding - :members: - :undoc-members: - :show-inheritance: + :members: BertEmbedding, BertWordPieceEncoder + :inherited-members: + diff --git a/docs/source/fastNLP.embeddings.char_embedding.rst b/docs/source/fastNLP.embeddings.char_embedding.rst index 501089d8..a9b129d8 100644 --- a/docs/source/fastNLP.embeddings.char_embedding.rst +++ b/docs/source/fastNLP.embeddings.char_embedding.rst @@ -1,7 +1,7 @@ -fastNLP.embeddings.char\_embedding -================================== +fastNLP.embeddings.char_embedding +================================= .. automodule:: fastNLP.embeddings.char_embedding - :members: - :undoc-members: - :show-inheritance: + :members: CNNCharEmbedding, LSTMCharEmbedding + :inherited-members: + diff --git a/docs/source/fastNLP.embeddings.contextual_embedding.rst b/docs/source/fastNLP.embeddings.contextual_embedding.rst new file mode 100644 index 00000000..ee64c7a0 --- /dev/null +++ b/docs/source/fastNLP.embeddings.contextual_embedding.rst @@ -0,0 +1,7 @@ +fastNLP.embeddings.contextual_embedding +======================================= + +.. automodule:: fastNLP.embeddings.contextual_embedding + :members: ContextualEmbedding + :inherited-members: + diff --git a/docs/source/fastNLP.embeddings.elmo_embedding.rst b/docs/source/fastNLP.embeddings.elmo_embedding.rst index 76669ee3..06cc13af 100644 --- a/docs/source/fastNLP.embeddings.elmo_embedding.rst +++ b/docs/source/fastNLP.embeddings.elmo_embedding.rst @@ -1,7 +1,7 @@ -fastNLP.embeddings.elmo\_embedding -================================== +fastNLP.embeddings.elmo_embedding +================================= .. automodule:: fastNLP.embeddings.elmo_embedding - :members: - :undoc-members: - :show-inheritance: + :members: ElmoEmbedding + :inherited-members: + diff --git a/docs/source/fastNLP.embeddings.embedding.rst b/docs/source/fastNLP.embeddings.embedding.rst index 5960d2cd..4d5fcf46 100644 --- a/docs/source/fastNLP.embeddings.embedding.rst +++ b/docs/source/fastNLP.embeddings.embedding.rst @@ -2,6 +2,6 @@ fastNLP.embeddings.embedding ============================ .. automodule:: fastNLP.embeddings.embedding - :members: - :undoc-members: - :show-inheritance: + :members: Embedding, TokenEmbedding + :inherited-members: + diff --git a/docs/source/fastNLP.embeddings.rst b/docs/source/fastNLP.embeddings.rst index 6872e91d..8376408c 100644 --- a/docs/source/fastNLP.embeddings.rst +++ b/docs/source/fastNLP.embeddings.rst @@ -2,17 +2,17 @@ fastNLP.embeddings ================== .. automodule:: fastNLP.embeddings - :members: - :undoc-members: - :show-inheritance: + :members: Embedding, TokenEmbedding, StaticEmbedding, ElmoEmbedding, BertEmbedding, BertWordPieceEncoder, StackEmbedding, LSTMCharEmbedding, CNNCharEmbedding, get_embeddings + :inherited-members: -Submodules ----------- +子模块 +------ .. toctree:: fastNLP.embeddings.bert_embedding fastNLP.embeddings.char_embedding + fastNLP.embeddings.contextual_embedding fastNLP.embeddings.elmo_embedding fastNLP.embeddings.embedding fastNLP.embeddings.stack_embedding diff --git a/docs/source/fastNLP.embeddings.stack_embedding.rst b/docs/source/fastNLP.embeddings.stack_embedding.rst index 4d2115f7..6af91623 100644 --- a/docs/source/fastNLP.embeddings.stack_embedding.rst +++ b/docs/source/fastNLP.embeddings.stack_embedding.rst @@ -1,7 +1,7 @@ -fastNLP.embeddings.stack\_embedding -=================================== +fastNLP.embeddings.stack_embedding +================================== .. 
automodule:: fastNLP.embeddings.stack_embedding - :members: - :undoc-members: - :show-inheritance: + :members: StackEmbedding + :inherited-members: + diff --git a/docs/source/fastNLP.embeddings.static_embedding.rst b/docs/source/fastNLP.embeddings.static_embedding.rst index e46de81a..2df1c329 100644 --- a/docs/source/fastNLP.embeddings.static_embedding.rst +++ b/docs/source/fastNLP.embeddings.static_embedding.rst @@ -1,7 +1,7 @@ -fastNLP.embeddings.static\_embedding -==================================== +fastNLP.embeddings.static_embedding +=================================== .. automodule:: fastNLP.embeddings.static_embedding - :members: - :undoc-members: - :show-inheritance: + :members: StaticEmbedding + :inherited-members: + diff --git a/docs/source/fastNLP.embeddings.utils.rst b/docs/source/fastNLP.embeddings.utils.rst index 263bfbd6..13e5936b 100644 --- a/docs/source/fastNLP.embeddings.utils.rst +++ b/docs/source/fastNLP.embeddings.utils.rst @@ -2,6 +2,6 @@ fastNLP.embeddings.utils ======================== .. automodule:: fastNLP.embeddings.utils - :members: - :undoc-members: - :show-inheritance: + :members: get_embeddings + :inherited-members: + diff --git a/docs/source/fastNLP.io.data_bundle.rst b/docs/source/fastNLP.io.data_bundle.rst index a6273956..71a921f1 100644 --- a/docs/source/fastNLP.io.data_bundle.rst +++ b/docs/source/fastNLP.io.data_bundle.rst @@ -1,7 +1,7 @@ -fastNLP.io.data\_bundle -======================= +fastNLP.io.data_bundle +====================== .. automodule:: fastNLP.io.data_bundle - :members: - :undoc-members: - :show-inheritance: + :members: DataBundle + :inherited-members: + diff --git a/docs/source/fastNLP.io.data_loader.rst b/docs/source/fastNLP.io.data_loader.rst deleted file mode 100644 index 0b4f5d0b..00000000 --- a/docs/source/fastNLP.io.data_loader.rst +++ /dev/null @@ -1,8 +0,0 @@ -fastNLP.io.data\_loader -======================= - -.. automodule:: fastNLP.io.data_loader - :members: - :undoc-members: - :show-inheritance: - diff --git a/docs/source/fastNLP.io.dataset_loader.rst b/docs/source/fastNLP.io.dataset_loader.rst index e7990714..c211ecf9 100644 --- a/docs/source/fastNLP.io.dataset_loader.rst +++ b/docs/source/fastNLP.io.dataset_loader.rst @@ -1,7 +1,6 @@ -fastNLP.io.dataset\_loader -========================== +fastNLP.io.dataset_loader +========================= .. automodule:: fastNLP.io.dataset_loader - :members: - :undoc-members: - :show-inheritance: + :members: CSVLoader, JsonLoader + diff --git a/docs/source/fastNLP.io.embed_loader.rst b/docs/source/fastNLP.io.embed_loader.rst index 69e1f7ff..581f5c1b 100644 --- a/docs/source/fastNLP.io.embed_loader.rst +++ b/docs/source/fastNLP.io.embed_loader.rst @@ -1,7 +1,7 @@ -fastNLP.io.embed\_loader -======================== +fastNLP.io.embed_loader +======================= .. automodule:: fastNLP.io.embed_loader - :members: - :undoc-members: - :show-inheritance: + :members: EmbedLoader, EmbeddingOption + :inherited-members: + diff --git a/docs/source/fastNLP.io.file_utils.rst b/docs/source/fastNLP.io.file_utils.rst index 944550d7..0815e068 100644 --- a/docs/source/fastNLP.io.file_utils.rst +++ b/docs/source/fastNLP.io.file_utils.rst @@ -1,7 +1,7 @@ -fastNLP.io.file\_utils -====================== +fastNLP.io.file_utils +===================== .. 
automodule:: fastNLP.io.file_utils - :members: - :undoc-members: - :show-inheritance: + :members: cached_path, get_filepath, get_cache_path, split_filename_suffix, get_from_cache + :inherited-members: + diff --git a/docs/source/fastNLP.io.loader.rst b/docs/source/fastNLP.io.loader.rst index bbdc1d7a..060b5450 100644 --- a/docs/source/fastNLP.io.loader.rst +++ b/docs/source/fastNLP.io.loader.rst @@ -2,7 +2,6 @@ fastNLP.io.loader ================= .. automodule:: fastNLP.io.loader - :members: - :undoc-members: - :show-inheritance: + :members: Loader, YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader, MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader, CSVLoader, JsonLoader, CWSLoader, MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader + :inherited-members: diff --git a/docs/source/fastNLP.io.model_io.rst b/docs/source/fastNLP.io.model_io.rst index 537ce752..183122b1 100644 --- a/docs/source/fastNLP.io.model_io.rst +++ b/docs/source/fastNLP.io.model_io.rst @@ -1,7 +1,7 @@ -fastNLP.io.model\_io -==================== +fastNLP.io.model_io +=================== .. automodule:: fastNLP.io.model_io - :members: - :undoc-members: - :show-inheritance: + :members: ModelLoader, ModelSaver + :inherited-members: + diff --git a/docs/source/fastNLP.io.pipe.rst b/docs/source/fastNLP.io.pipe.rst index bf126585..d35d2ddc 100644 --- a/docs/source/fastNLP.io.pipe.rst +++ b/docs/source/fastNLP.io.pipe.rst @@ -2,7 +2,6 @@ fastNLP.io.pipe =============== .. automodule:: fastNLP.io.pipe - :members: - :undoc-members: - :show-inheritance: + :members: Pipe, CWSPipe, YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, WeiboNERPipe, PeopleDailyPipe, Conll2003Pipe, MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe + :inherited-members: diff --git a/docs/source/fastNLP.io.rst b/docs/source/fastNLP.io.rst index 0cd5d3f2..2aacb883 100644 --- a/docs/source/fastNLP.io.rst +++ b/docs/source/fastNLP.io.rst @@ -2,27 +2,18 @@ fastNLP.io ========== .. automodule:: fastNLP.io - :members: - :undoc-members: - :show-inheritance: + :members: DataBundle, EmbedLoader, Loader, YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader, MsraNERLoader, WeiboNERLoader, PeopleDailyNERLoader, CSVLoader, JsonLoader, CWSLoader, MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader, Pipe, YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, Conll2003Pipe, Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, PeopleDailyPipe, WeiboNERPipe, CWSPipe, MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe, ModelLoader, ModelSaver + :inherited-members: -Subpackages ------------ - -.. toctree:: - - fastNLP.io.data_loader - fastNLP.io.loader - fastNLP.io.pipe - -Submodules ----------- +子模块 +------ .. 
toctree:: fastNLP.io.data_bundle - fastNLP.io.dataset_loader fastNLP.io.embed_loader fastNLP.io.file_utils + fastNLP.io.loader fastNLP.io.model_io + fastNLP.io.pipe fastNLP.io.utils diff --git a/docs/source/fastNLP.io.utils.rst b/docs/source/fastNLP.io.utils.rst index 0b3f3938..3bff3c45 100644 --- a/docs/source/fastNLP.io.utils.rst +++ b/docs/source/fastNLP.io.utils.rst @@ -2,6 +2,6 @@ fastNLP.io.utils ================ .. automodule:: fastNLP.io.utils - :members: - :undoc-members: - :show-inheritance: + :members: check_loader_paths + :inherited-members: + diff --git a/docs/source/fastNLP.models.biaffine_parser.rst b/docs/source/fastNLP.models.biaffine_parser.rst index f19504e8..c3dbb0a5 100644 --- a/docs/source/fastNLP.models.biaffine_parser.rst +++ b/docs/source/fastNLP.models.biaffine_parser.rst @@ -1,7 +1,7 @@ -fastNLP.models.biaffine\_parser -=============================== +fastNLP.models.biaffine_parser +============================== .. automodule:: fastNLP.models.biaffine_parser - :members: - :undoc-members: - :show-inheritance: + :members: BiaffineParser, GraphParser + :inherited-members: + diff --git a/docs/source/fastNLP.models.cnn_text_classification.rst b/docs/source/fastNLP.models.cnn_text_classification.rst index eacf6916..fe4bb157 100644 --- a/docs/source/fastNLP.models.cnn_text_classification.rst +++ b/docs/source/fastNLP.models.cnn_text_classification.rst @@ -1,7 +1,7 @@ -fastNLP.models.cnn\_text\_classification -======================================== +fastNLP.models.cnn_text_classification +====================================== .. automodule:: fastNLP.models.cnn_text_classification - :members: - :undoc-members: - :show-inheritance: + :members: CNNText + :inherited-members: + diff --git a/docs/source/fastNLP.models.rst b/docs/source/fastNLP.models.rst index 36875b85..88854a79 100644 --- a/docs/source/fastNLP.models.rst +++ b/docs/source/fastNLP.models.rst @@ -2,12 +2,11 @@ fastNLP.models ============== .. automodule:: fastNLP.models - :members: - :undoc-members: - :show-inheritance: + :members: CNNText, SeqLabeling, AdvSeqLabel, ESIM, StarTransEnc, STSeqLabel, STNLICls, STSeqCls, BiaffineParser, GraphParser + :inherited-members: -Submodules ----------- +子模块 +------ .. toctree:: diff --git a/docs/source/fastNLP.models.sequence_labeling.rst b/docs/source/fastNLP.models.sequence_labeling.rst index 85e28f06..b66e637e 100644 --- a/docs/source/fastNLP.models.sequence_labeling.rst +++ b/docs/source/fastNLP.models.sequence_labeling.rst @@ -1,7 +1,7 @@ -fastNLP.models.sequence\_labeling -================================= +fastNLP.models.sequence_labeling +================================ .. automodule:: fastNLP.models.sequence_labeling - :members: - :undoc-members: - :show-inheritance: + :members: SeqLabeling, AdvSeqLabel + :inherited-members: + diff --git a/docs/source/fastNLP.models.snli.rst b/docs/source/fastNLP.models.snli.rst index 3b9b555c..8551051a 100644 --- a/docs/source/fastNLP.models.snli.rst +++ b/docs/source/fastNLP.models.snli.rst @@ -2,6 +2,6 @@ fastNLP.models.snli =================== .. 
automodule:: fastNLP.models.snli - :members: - :undoc-members: - :show-inheritance: + :members: ESIM + :inherited-members: + diff --git a/docs/source/fastNLP.models.star_transformer.rst b/docs/source/fastNLP.models.star_transformer.rst index 69d5c5b2..f4b5989e 100644 --- a/docs/source/fastNLP.models.star_transformer.rst +++ b/docs/source/fastNLP.models.star_transformer.rst @@ -1,7 +1,7 @@ -fastNLP.models.star\_transformer -================================ +fastNLP.models.star_transformer +=============================== .. automodule:: fastNLP.models.star_transformer - :members: - :undoc-members: - :show-inheritance: + :members: StarTransEnc, STNLICls, STSeqCls, STSeqLabel + :inherited-members: + diff --git a/docs/source/fastNLP.modules.decoder.rst b/docs/source/fastNLP.modules.decoder.rst index ecc2adbd..b121f9e9 100644 --- a/docs/source/fastNLP.modules.decoder.rst +++ b/docs/source/fastNLP.modules.decoder.rst @@ -2,7 +2,6 @@ fastNLP.modules.decoder ======================= .. automodule:: fastNLP.modules.decoder - :members: - :undoc-members: - :show-inheritance: + :members: MLP, ConditionalRandomField, viterbi_decode, allowed_transitions + :inherited-members: diff --git a/docs/source/fastNLP.modules.encoder.rst b/docs/source/fastNLP.modules.encoder.rst index e60f9fa4..6b44a192 100644 --- a/docs/source/fastNLP.modules.encoder.rst +++ b/docs/source/fastNLP.modules.encoder.rst @@ -2,7 +2,6 @@ fastNLP.modules.encoder ======================= .. automodule:: fastNLP.modules.encoder - :members: - :undoc-members: - :show-inheritance: + :members: ConvolutionCharEncoder, LSTMCharEncoder, ConvMaxpool, LSTM, StarTransformer, TransformerEncoder, VarRNN, VarLSTM, VarGRU, MaxPool, MaxPoolWithMask, AvgPool, AvgPoolWithMask, MultiHeadAttention + :inherited-members: diff --git a/docs/source/fastNLP.modules.rst b/docs/source/fastNLP.modules.rst index 06494b53..6134d0dd 100644 --- a/docs/source/fastNLP.modules.rst +++ b/docs/source/fastNLP.modules.rst @@ -2,21 +2,14 @@ fastNLP.modules =============== .. automodule:: fastNLP.modules - :members: - :undoc-members: - :show-inheritance: + :members: ConvolutionCharEncoder, LSTMCharEncoder, ConvMaxpool, LSTM, StarTransformer, TransformerEncoder, VarRNN, VarLSTM, VarGRU, MaxPool, MaxPoolWithMask, AvgPool, AvgPoolWithMask, MultiHeadAttention, MLP, ConditionalRandomField, viterbi_decode, allowed_transitions, TimestepDropout + :inherited-members: -Subpackages ------------ +子模块 +------ .. toctree:: fastNLP.modules.decoder fastNLP.modules.encoder - -Submodules ----------- - -.. toctree:: - fastNLP.modules.utils diff --git a/docs/source/fastNLP.modules.utils.rst b/docs/source/fastNLP.modules.utils.rst index c0219435..e28ca35a 100644 --- a/docs/source/fastNLP.modules.utils.rst +++ b/docs/source/fastNLP.modules.utils.rst @@ -2,6 +2,6 @@ fastNLP.modules.utils ===================== .. automodule:: fastNLP.modules.utils - :members: - :undoc-members: - :show-inheritance: + :members: initial_parameter, summary + :inherited-members: + diff --git a/docs/source/fastNLP.rst b/docs/source/fastNLP.rst index e3ba429d..f22ea936 100644 --- a/docs/source/fastNLP.rst +++ b/docs/source/fastNLP.rst @@ -2,12 +2,11 @@ fastNLP ======= .. 
automodule:: fastNLP - :members: - :undoc-members: - :show-inheritance: + :members: Instance, FieldArray, DataSetIter, BatchIter, TorchLoaderIter, Vocabulary, DataSet, Const, Trainer, Tester, Callback, GradientClipCallback, EarlyStopCallback, TensorboardCallback, LRScheduler, ControlC, LRFinder, Padder, AutoPadder, EngChar2DPadder, AccuracyMetric, SpanFPreRecMetric, ExtractiveQAMetric, Optimizer, SGD, Adam, AdamW, Sampler, SequentialSampler, BucketSampler, RandomSampler, LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, LossInForward, cache_results, logger + :inherited-members: -Subpackages ------------ +子模块 +------ .. toctree:: From efe88263bb2fb7bebacb8022eb86c390e266ec36 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Mon, 26 Aug 2019 10:21:10 +0800 Subject: [PATCH 103/153] add __all__ and __doc__ for all files in module 'core', using 'undocumented' tags --- fastNLP/core/__init__.py | 67 +++++++++++++++++++++- fastNLP/core/_logger.py | 38 ++++++------ fastNLP/core/_parallel_utils.py | 21 ++++--- fastNLP/core/const.py | 26 ++++++--- fastNLP/core/dist_trainer.py | 22 +++---- fastNLP/core/field.py | 19 ++++-- fastNLP/core/predictor.py | 28 ++++----- fastNLP/core/vocabulary.py | 28 +++++---- fastNLP/embeddings/contextual_embedding.py | 10 ++-- 9 files changed, 178 insertions(+), 81 deletions(-) diff --git a/fastNLP/core/__init__.py b/fastNLP/core/__init__.py index 1feaf3fb..efee08b5 100644 --- a/fastNLP/core/__init__.py +++ b/fastNLP/core/__init__.py @@ -10,8 +10,72 @@ core 模块里实现了 fastNLP 的核心框架,常用的功能都可以从 fa 对于常用的功能,你只需要在 :doc:`fastNLP` 中查看即可。如果想了解各个子模块的具体作用,您可以在下面找到每个子模块的具体文档。 - """ +__all__ = [ + "DataSet", + + "Instance", + + "FieldArray", + "Padder", + "AutoPadder", + "EngChar2DPadder", + + "Vocabulary", + + "DataSetIter", + "BatchIter", + "TorchLoaderIter", + + "Const", + + "Tester", + "Trainer", + + "cache_results", + "seq_len_to_mask", + "get_seq_len", + "logger", + + "Callback", + "GradientClipCallback", + "EarlyStopCallback", + "FitlogCallback", + "EvaluateCallback", + "LRScheduler", + "ControlC", + "LRFinder", + "TensorboardCallback", + "WarmupCallback", + 'SaveModelCallback', + "EchoCallback", + "TesterCallback", + "CallbackException", + "EarlyStopError", + + "LossFunc", + "CrossEntropyLoss", + "L1Loss", + "BCELoss", + "NLLLoss", + "LossInForward", + + "AccuracyMetric", + "SpanFPreRecMetric", + "ExtractiveQAMetric", + + "Optimizer", + "SGD", + "Adam", + "AdamW", + + "SequentialSampler", + "BucketSampler", + "RandomSampler", + "Sampler", +] + +from ._logger import logger from .batch import DataSetIter, BatchIter, TorchLoaderIter from .callback import Callback, GradientClipCallback, EarlyStopCallback, FitlogCallback, EvaluateCallback, \ LRScheduler, ControlC, LRFinder, TensorboardCallback, WarmupCallback, SaveModelCallback, EchoCallback, \ @@ -28,4 +92,3 @@ from .tester import Tester from .trainer import Trainer from .utils import cache_results, seq_len_to_mask, get_seq_len from .vocabulary import Vocabulary -from ._logger import logger diff --git a/fastNLP/core/_logger.py b/fastNLP/core/_logger.py index 50266d7a..7198cfbd 100644 --- a/fastNLP/core/_logger.py +++ b/fastNLP/core/_logger.py @@ -1,15 +1,15 @@ +"""undocumented""" + +__all__ = [ + 'logger', +] + import logging import logging.config -import torch -import _pickle as pickle import os import sys import warnings -__all__ = [ - 'logger', -] - ROOT_NAME = 'fastNLP' try: @@ -25,7 +25,7 @@ if tqdm is not None: class TqdmLoggingHandler(logging.Handler): def __init__(self, level=logging.INFO): super().__init__(level) - + def 
emit(self, record): try: msg = self.format(record) @@ -59,14 +59,14 @@ def _add_file_handler(logger, path, level='INFO'): if os.path.abspath(path) == h.baseFilename: # file path already added return - + # File Handler if os.path.exists(path): assert os.path.isfile(path) warnings.warn('log already exists in {}'.format(path)) dirname = os.path.abspath(os.path.dirname(path)) os.makedirs(dirname, exist_ok=True) - + file_handler = logging.FileHandler(path, mode='a') file_handler.setLevel(_get_level(level)) file_formatter = logging.Formatter(fmt='%(asctime)s - %(module)s - [%(levelname)s] - %(message)s', @@ -87,7 +87,7 @@ def _set_stdout_handler(logger, stdout='tqdm', level='INFO'): break if stream_handler is not None: logger.removeHandler(stream_handler) - + # Stream Handler if stdout == 'plain': stream_handler = logging.StreamHandler(sys.stdout) @@ -95,7 +95,7 @@ def _set_stdout_handler(logger, stdout='tqdm', level='INFO'): stream_handler = TqdmLoggingHandler(level) else: stream_handler = None - + if stream_handler is not None: stream_formatter = logging.Formatter('%(message)s') stream_handler.setLevel(level) @@ -103,38 +103,40 @@ def _set_stdout_handler(logger, stdout='tqdm', level='INFO'): logger.addHandler(stream_handler) - class FastNLPLogger(logging.getLoggerClass()): def __init__(self, name): super().__init__(name) - + def add_file(self, path='./log.txt', level='INFO'): """add log output file and level""" _add_file_handler(self, path, level) - + def set_stdout(self, stdout='tqdm', level='INFO'): """set stdout format and level""" _set_stdout_handler(self, stdout, level) + logging.setLoggerClass(FastNLPLogger) + + # print(logging.getLoggerClass()) # print(logging.getLogger()) def _init_logger(path=None, stdout='tqdm', level='INFO'): """initialize logger""" level = _get_level(level) - + # logger = logging.getLogger() logger = logging.getLogger(ROOT_NAME) logger.propagate = False logger.setLevel(level) - + _set_stdout_handler(logger, stdout, level) - + # File Handler if path is not None: _add_file_handler(logger, path, level) - + return logger diff --git a/fastNLP/core/_parallel_utils.py b/fastNLP/core/_parallel_utils.py index 6b24d9f9..ce745820 100644 --- a/fastNLP/core/_parallel_utils.py +++ b/fastNLP/core/_parallel_utils.py @@ -1,11 +1,14 @@ +"""undocumented""" + +__all__ = [] import threading + import torch from torch import nn from torch.nn.parallel.parallel_apply import get_a_var - -from torch.nn.parallel.scatter_gather import scatter_kwargs, gather from torch.nn.parallel.replicate import replicate +from torch.nn.parallel.scatter_gather import scatter_kwargs, gather def parallel_apply(modules, func_name, inputs, kwargs_tup=None, devices=None): @@ -27,11 +30,11 @@ def parallel_apply(modules, func_name, inputs, kwargs_tup=None, devices=None): assert len(modules) == len(devices) else: devices = [None] * len(modules) - + lock = threading.Lock() results = {} grad_enabled = torch.is_grad_enabled() - + def _worker(i, module, input, kwargs, device=None): torch.set_grad_enabled(grad_enabled) if device is None: @@ -47,20 +50,20 @@ def parallel_apply(modules, func_name, inputs, kwargs_tup=None, devices=None): except Exception as e: with lock: results[i] = e - + if len(modules) > 1: threads = [threading.Thread(target=_worker, args=(i, module, input, kwargs, device)) for i, (module, input, kwargs, device) in enumerate(zip(modules, inputs, kwargs_tup, devices))] - + for thread in threads: thread.start() for thread in threads: thread.join() else: _worker(0, modules[0], inputs[0], kwargs_tup[0], 
devices[0]) - + outputs = [] for i in range(len(inputs)): output = results[i] @@ -79,6 +82,7 @@ def _data_parallel_wrapper(func_name, device_ids, output_device): :param output_device: nn.DataParallel中的output_device :return: """ + def wrapper(network, *inputs, **kwargs): inputs, kwargs = scatter_kwargs(inputs, kwargs, device_ids, dim=0) if len(device_ids) == 1: @@ -86,6 +90,7 @@ def _data_parallel_wrapper(func_name, device_ids, output_device): replicas = replicate(network, device_ids[:len(inputs)]) outputs = parallel_apply(replicas, func_name, inputs, kwargs, device_ids[:len(replicas)]) return gather(outputs, output_device) + return wrapper @@ -99,4 +104,4 @@ def _model_contains_inner_module(model): if isinstance(model, nn.Module): if isinstance(model, (nn.DataParallel, nn.parallel.DistributedDataParallel)): return True - return False \ No newline at end of file + return False diff --git a/fastNLP/core/const.py b/fastNLP/core/const.py index 27e8d1cb..ad5d1f1e 100644 --- a/fastNLP/core/const.py +++ b/fastNLP/core/const.py @@ -1,3 +1,13 @@ +""" +.. todo:: + doc +""" + +__all__ = [ + "Const" +] + + class Const: """ fastNLP中field命名常量。 @@ -25,47 +35,47 @@ class Const: LOSS = 'loss' RAW_WORD = 'raw_words' RAW_CHAR = 'raw_chars' - + @staticmethod def INPUTS(i): """得到第 i 个 ``INPUT`` 的命名""" i = int(i) + 1 return Const.INPUT + str(i) - + @staticmethod def CHAR_INPUTS(i): """得到第 i 个 ``CHAR_INPUT`` 的命名""" i = int(i) + 1 return Const.CHAR_INPUT + str(i) - + @staticmethod def RAW_WORDS(i): i = int(i) + 1 return Const.RAW_WORD + str(i) - + @staticmethod def RAW_CHARS(i): i = int(i) + 1 return Const.RAW_CHAR + str(i) - + @staticmethod def INPUT_LENS(i): """得到第 i 个 ``INPUT_LEN`` 的命名""" i = int(i) + 1 return Const.INPUT_LEN + str(i) - + @staticmethod def OUTPUTS(i): """得到第 i 个 ``OUTPUT`` 的命名""" i = int(i) + 1 return Const.OUTPUT + str(i) - + @staticmethod def TARGETS(i): """得到第 i 个 ``TARGET`` 的命名""" i = int(i) + 1 return Const.TARGET + str(i) - + @staticmethod def LOSSES(i): """得到第 i 个 ``LOSS`` 的命名""" diff --git a/fastNLP/core/dist_trainer.py b/fastNLP/core/dist_trainer.py index 7c64fee4..3a293447 100644 --- a/fastNLP/core/dist_trainer.py +++ b/fastNLP/core/dist_trainer.py @@ -1,29 +1,29 @@ -""" +"""undocumented 正在开发中的分布式训练代码 """ +import logging +import os +import time +from datetime import datetime + import torch import torch.cuda -import torch.optim import torch.distributed as dist -from torch.utils.data.distributed import DistributedSampler +import torch.optim +from pkg_resources import parse_version from torch.nn.parallel import DistributedDataParallel as DDP -import os +from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm -import time -from datetime import datetime, timedelta -from functools import partial +from ._logger import logger from .batch import DataSetIter, BatchIter from .callback import DistCallbackManager, CallbackException, TesterCallback from .dataset import DataSet from .losses import _prepare_losser from .optimizer import Optimizer from .utils import _build_args -from .utils import _move_dict_value_to_device from .utils import _get_func_signature -from ._logger import logger -import logging -from pkg_resources import parse_version +from .utils import _move_dict_value_to_device __all__ = [ 'get_local_rank', diff --git a/fastNLP/core/field.py b/fastNLP/core/field.py index b3f024f8..05f987c2 100644 --- a/fastNLP/core/field.py +++ b/fastNLP/core/field.py @@ -1,18 +1,25 @@ +""" +.. 
todo:: + doc +""" + __all__ = [ "Padder", "AutoPadder", "EngChar2DPadder", ] -from numbers import Number -import torch -import numpy as np -from typing import Any from abc import abstractmethod -from copy import deepcopy from collections import Counter -from .utils import _is_iterable +from copy import deepcopy +from numbers import Number +from typing import Any + +import numpy as np +import torch + from ._logger import logger +from .utils import _is_iterable class SetInputOrTargetException(Exception): diff --git a/fastNLP/core/predictor.py b/fastNLP/core/predictor.py index 2d6a7380..c6b8fc90 100644 --- a/fastNLP/core/predictor.py +++ b/fastNLP/core/predictor.py @@ -1,13 +1,15 @@ -""" - ..todo:: - 检查这个类是否需要 -""" +"""undocumented""" + +__all__ = [ + "Predictor" +] + from collections import defaultdict import torch -from . import DataSetIter from . import DataSet +from . import DataSetIter from . import SequentialSampler from .utils import _build_args, _move_dict_value_to_device, _get_model_device @@ -21,7 +23,7 @@ class Predictor(object): :param torch.nn.Module network: 用来完成预测任务的模型 """ - + def __init__(self, network): if not isinstance(network, torch.nn.Module): raise ValueError( @@ -29,7 +31,7 @@ class Predictor(object): self.network = network self.batch_size = 1 self.batch_output = [] - + def predict(self, data: DataSet, seq_len_field_name=None): """用已经训练好的模型进行inference. @@ -41,27 +43,27 @@ class Predictor(object): raise ValueError("Only Dataset class is allowed, not {}.".format(type(data))) if seq_len_field_name is not None and seq_len_field_name not in data.field_arrays: raise ValueError("Field name {} not found in DataSet {}.".format(seq_len_field_name, data)) - + prev_training = self.network.training self.network.eval() network_device = _get_model_device(self.network) batch_output = defaultdict(list) data_iterator = DataSetIter(data, batch_size=self.batch_size, sampler=SequentialSampler(), as_numpy=False) - + if hasattr(self.network, "predict"): predict_func = self.network.predict else: predict_func = self.network.forward - + with torch.no_grad(): for batch_x, _ in data_iterator: _move_dict_value_to_device(batch_x, _, device=network_device) refined_batch_x = _build_args(predict_func, **batch_x) prediction = predict_func(**refined_batch_x) - + if seq_len_field_name is not None: seq_lens = batch_x[seq_len_field_name].tolist() - + for key, value in prediction.items(): value = value.cpu().numpy() if len(value.shape) == 1 or (len(value.shape) == 2 and value.shape[1] == 1): @@ -74,6 +76,6 @@ class Predictor(object): batch_output[key].extend(tmp_batch) else: batch_output[key].append(value) - + self.network.train(prev_training) return batch_output diff --git a/fastNLP/core/vocabulary.py b/fastNLP/core/vocabulary.py index 92f54f9a..52d33a5a 100644 --- a/fastNLP/core/vocabulary.py +++ b/fastNLP/core/vocabulary.py @@ -1,16 +1,22 @@ +""" +.. todo:: + doc +""" + __all__ = [ "Vocabulary", "VocabularyOption", ] -from functools import wraps from collections import Counter +from functools import partial +from functools import wraps + +from ._logger import logger from .dataset import DataSet from .utils import Option -from functools import partial -import numpy as np from .utils import _is_iterable -from ._logger import logger + class VocabularyOption(Option): def __init__(self, @@ -51,7 +57,7 @@ def _check_build_status(func): self.rebuild = True if self.max_size is not None and len(self.word_count) >= self.max_size: logger.info("[Warning] Vocabulary has reached the max size {} when calling {} method. 
" - "Adding more words may cause unexpected behaviour of Vocabulary. ".format( + "Adding more words may cause unexpected behaviour of Vocabulary. ".format( self.max_size, func.__name__)) return func(self, *args, **kwargs) @@ -199,7 +205,7 @@ class Vocabulary(object): self.build_reverse_vocab() self.rebuild = False return self - + def build_reverse_vocab(self): """ 基于 `word to index` dict, 构建 `index to word` dict. @@ -279,19 +285,19 @@ class Vocabulary(object): if not isinstance(field[0][0], str) and _is_iterable(field[0][0]): raise RuntimeError("Only support field with 2 dimensions.") return [[self.to_index(c) for c in w] for w in field] - + new_field_name = new_field_name or field_name - + if type(new_field_name) == type(field_name): if isinstance(new_field_name, list): assert len(new_field_name) == len(field_name), "new_field_name should have same number elements with " \ - "field_name." + "field_name." elif isinstance(new_field_name, str): field_name = [field_name] new_field_name = [new_field_name] else: raise TypeError("field_name and new_field_name can only be str or List[str].") - + for idx, dataset in enumerate(datasets): if isinstance(dataset, DataSet): try: @@ -377,7 +383,7 @@ class Vocabulary(object): :return: bool """ return word in self._no_create_word - + def to_index(self, w): """ 将词转为数字. 若词不再词典中被记录, 将视为 unknown, 若 ``unknown=None`` , 将抛出``ValueError``:: diff --git a/fastNLP/embeddings/contextual_embedding.py b/fastNLP/embeddings/contextual_embedding.py index 2c304da7..9910a44b 100644 --- a/fastNLP/embeddings/contextual_embedding.py +++ b/fastNLP/embeddings/contextual_embedding.py @@ -8,15 +8,17 @@ __all__ = [ ] from abc import abstractmethod + import torch -from ..core.vocabulary import Vocabulary -from ..core.dataset import DataSet +from .embedding import TokenEmbedding +from ..core import logger from ..core.batch import DataSetIter +from ..core.dataset import DataSet from ..core.sampler import SequentialSampler from ..core.utils import _move_model_to_device, _get_model_device -from .embedding import TokenEmbedding -from ..core import logger +from ..core.vocabulary import Vocabulary + class ContextualEmbedding(TokenEmbedding): def __init__(self, vocab: Vocabulary, word_dropout: float = 0.0, dropout: float = 0.0): From 0d5f43b451473fe25703cb1f9798fcf03eb64c76 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Mon, 26 Aug 2019 10:25:01 +0800 Subject: [PATCH 104/153] add __all__ and __doc__ for all files in module 'io', using 'undocumented' tags --- fastNLP/io/data_bundle.py | 7 +- fastNLP/io/dataset_loader.py | 6 +- fastNLP/io/embed_loader.py | 9 +- fastNLP/io/file_reader.py | 16 ++- fastNLP/io/file_utils.py | 23 +++- fastNLP/io/loader/classification.py | 26 +++-- fastNLP/io/loader/conll.py | 84 +++++++++------ fastNLP/io/loader/csv.py | 10 +- fastNLP/io/loader/cws.py | 17 ++- fastNLP/io/loader/json.py | 10 +- fastNLP/io/loader/loader.py | 13 ++- fastNLP/io/loader/matching.py | 82 ++++++++------ fastNLP/io/pipe/classification.py | 161 +++++++++++++++------------- fastNLP/io/pipe/conll.py | 79 ++++++++------ fastNLP/io/pipe/cws.py | 6 ++ fastNLP/io/pipe/matching.py | 75 ++++++++----- fastNLP/io/pipe/pipe.py | 6 ++ fastNLP/io/pipe/utils.py | 38 ++++--- fastNLP/io/utils.py | 25 +++-- 19 files changed, 439 insertions(+), 254 deletions(-) diff --git a/fastNLP/io/data_bundle.py b/fastNLP/io/data_bundle.py index 1e663f1e..db60a86f 100644 --- a/fastNLP/io/data_bundle.py +++ b/fastNLP/io/data_bundle.py @@ -1,10 +1,15 @@ +""" +.. 
todo:: + doc +""" __all__ = [ 'DataBundle', ] import _pickle as pickle -from typing import Union, Dict import os +from typing import Union, Dict + from ..core.dataset import DataSet from ..core.vocabulary import Vocabulary diff --git a/fastNLP/io/dataset_loader.py b/fastNLP/io/dataset_loader.py index 82e96597..fca0de69 100644 --- a/fastNLP/io/dataset_loader.py +++ b/fastNLP/io/dataset_loader.py @@ -1,4 +1,4 @@ -""" +"""undocumented .. warning:: 本模块将在 `0.5.0版本` 中被废弃,由 :mod:`~fastNLP.io.loader` 和 :mod:`~fastNLP.io.pipe` 模块替代。 @@ -23,10 +23,10 @@ __all__ = [ ] +from .data_bundle import DataSetLoader +from .file_reader import _read_csv, _read_json from ..core.dataset import DataSet from ..core.instance import Instance -from .file_reader import _read_csv, _read_json -from .data_bundle import DataSetLoader class JsonLoader(DataSetLoader): diff --git a/fastNLP/io/embed_loader.py b/fastNLP/io/embed_loader.py index c58385e1..780d91e4 100644 --- a/fastNLP/io/embed_loader.py +++ b/fastNLP/io/embed_loader.py @@ -1,17 +1,22 @@ +""" +.. todo:: + doc +""" __all__ = [ "EmbedLoader", "EmbeddingOption", ] +import logging import os import warnings import numpy as np -from ..core.vocabulary import Vocabulary from .data_bundle import BaseLoader from ..core.utils import Option -import logging +from ..core.vocabulary import Vocabulary + class EmbeddingOption(Option): def __init__(self, diff --git a/fastNLP/io/file_reader.py b/fastNLP/io/file_reader.py index 0320572c..7a953098 100644 --- a/fastNLP/io/file_reader.py +++ b/fastNLP/io/file_reader.py @@ -1,7 +1,11 @@ -""" +"""undocumented 此模块用于给其它模块提供读取文件的函数,没有为用户提供 API """ + +__all__ = [] + import json + from ..core import logger @@ -24,8 +28,8 @@ def _read_csv(path, encoding='utf-8', headers=None, sep=',', dropna=True): headers = headers.split(sep) start_idx += 1 elif not isinstance(headers, (list, tuple)): - raise TypeError("headers should be list or tuple, not {}." \ - .format(type(headers))) + raise TypeError("headers should be list or tuple, not {}." \ + .format(type(headers))) for line_idx, line in enumerate(f, start_idx): contents = line.rstrip('\r\n').split(sep) if len(contents) != len(headers): @@ -82,6 +86,7 @@ def _read_conll(path, encoding='utf-8', indexes=None, dropna=True): :if False, raise ValueError when reading invalid data. default: True :return: generator, every time yield (line number, conll item) """ + def parse_conll(sample): sample = list(map(list, zip(*sample))) sample = [sample[i] for i in indexes] @@ -89,14 +94,15 @@ def _read_conll(path, encoding='utf-8', indexes=None, dropna=True): if len(f) <= 0: raise ValueError('empty field') return sample + with open(path, 'r', encoding=encoding) as f: sample = [] start = next(f).strip() - if start!='': + if start != '': sample.append(start.split()) for line_idx, line in enumerate(f, 1): line = line.strip() - if line=='': + if line == '': if len(sample): try: res = parse_conll(sample) diff --git a/fastNLP/io/file_utils.py b/fastNLP/io/file_utils.py index bd02158e..8ecdff25 100644 --- a/fastNLP/io/file_utils.py +++ b/fastNLP/io/file_utils.py @@ -1,12 +1,27 @@ +""" +.. 
todo:: + doc +""" + +__all__ = [ + "cached_path", + "get_filepath", + "get_cache_path", + "split_filename_suffix", + "get_from_cache", +] + import os +import re +import shutil +import tempfile from pathlib import Path from urllib.parse import urlparse -import re + import requests -import tempfile -from tqdm import tqdm -import shutil from requests import HTTPError +from tqdm import tqdm + from ..core import logger PRETRAINED_BERT_MODEL_DIR = { diff --git a/fastNLP/io/loader/classification.py b/fastNLP/io/loader/classification.py index f64a26e7..ec00d2b4 100644 --- a/fastNLP/io/loader/classification.py +++ b/fastNLP/io/loader/classification.py @@ -1,12 +1,24 @@ -from ...core.dataset import DataSet -from ...core.instance import Instance -from .loader import Loader -import warnings +"""undocumented""" + +__all__ = [ + "YelpLoader", + "YelpFullLoader", + "YelpPolarityLoader", + "IMDBLoader", + "SSTLoader", + "SST2Loader", +] + +import glob import os import random import shutil -import glob import time +import warnings + +from .loader import Loader +from ...core.dataset import DataSet +from ...core.instance import Instance class YelpLoader(Loader): @@ -58,7 +70,7 @@ class YelpLoader(Loader): class YelpFullLoader(YelpLoader): - def download(self, dev_ratio: float = 0.1, re_download:bool=False): + def download(self, dev_ratio: float = 0.1, re_download: bool = False): """ 自动下载数据集,如果你使用了这个数据集,请引用以下的文章 @@ -127,7 +139,7 @@ class YelpPolarityLoader(YelpLoader): if time.time() - modify_time > 1 and re_download: # 通过这种比较丑陋的方式判断一下文件是否是才下载的 shutil.rmtree(data_dir) data_dir = self._get_dataset_path(dataset_name=dataset_name) - + if not os.path.exists(os.path.join(data_dir, 'dev.csv')): if dev_ratio > 0: assert 0 < dev_ratio < 1, "dev_ratio should be in range (0,1)." 
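# A minimal usage sketch for the classification loaders documented above (YelpFullLoader,
# YelpPolarityLoader, IMDBLoader, ...), following only what their docstrings state:
# download() fetches and caches the raw data (carving a dev split out of train when
# dev_ratio > 0), and load() reads the files into a DataBundle of DataSets. That
# download() returns the cached directory path is an assumption borrowed from the
# NER loaders changed in this same commit; adjust to your local setup as needed.
from fastNLP.io.loader.classification import YelpFullLoader

loader = YelpFullLoader()
data_dir = loader.download(dev_ratio=0.1)  # no-op if the data is already cached
data_bundle = loader.load(data_dir)        # DataBundle holding 'train'/'dev'/'test' DataSets
print(data_bundle)                         # quick look at which datasets were read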
diff --git a/fastNLP/io/loader/conll.py b/fastNLP/io/loader/conll.py index b5241cff..1bd1b448 100644 --- a/fastNLP/io/loader/conll.py +++ b/fastNLP/io/loader/conll.py @@ -1,15 +1,28 @@ -from typing import Dict, Union +"""undocumented""" + +__all__ = [ + "ConllLoader", + "Conll2003Loader", + "Conll2003NERLoader", + "OntoNotesNERLoader", + "CTBLoader", + "CNNERLoader", + "MsraNERLoader", + "WeiboNERLoader", + "PeopleDailyNERLoader" +] -from .loader import Loader -from ...core.dataset import DataSet -from ..file_reader import _read_conll -from ...core.instance import Instance -from ...core.const import Const import glob import os +import random import shutil import time -import random + +from .loader import Loader +from ..file_reader import _read_conll +from ...core.const import Const +from ...core.dataset import DataSet +from ...core.instance import Instance class ConllLoader(Loader): @@ -47,6 +60,7 @@ class ConllLoader(Loader): :param bool dropna: 是否忽略非法数据,若 ``False`` ,遇到非法数据时抛出 ``ValueError`` 。Default: ``True`` """ + def __init__(self, headers, indexes=None, dropna=True): super(ConllLoader, self).__init__() if not isinstance(headers, (list, tuple)): @@ -60,7 +74,7 @@ class ConllLoader(Loader): if len(indexes) != len(headers): raise ValueError self.indexes = indexes - + def _load(self, path): """ 传入的一个文件路径,将该文件读入DataSet中,field由ConllLoader初始化时指定的headers决定。 @@ -101,12 +115,13 @@ class Conll2003Loader(ConllLoader): "[...]", "[...]", "[...]", "[...]" """ + def __init__(self): headers = [ 'raw_words', 'pos', 'chunk', 'ner', ] super(Conll2003Loader, self).__init__(headers=headers) - + def _load(self, path): """ 传入的一个文件路径,将该文件读入DataSet中,field由ConllLoader初始化时指定的headers决定。 @@ -127,7 +142,7 @@ class Conll2003Loader(ConllLoader): ins = {h: data[i] for i, h in enumerate(self.headers)} ds.append(Instance(**ins)) return ds - + def download(self, output_dir=None): raise RuntimeError("conll2003 cannot be downloaded automatically.") @@ -158,12 +173,13 @@ class Conll2003NERLoader(ConllLoader): "[...]", "[...]" """ + def __init__(self): headers = [ 'raw_words', 'target', ] super().__init__(headers=headers, indexes=[0, 3]) - + def _load(self, path): """ 传入的一个文件路径,将该文件读入DataSet中,field由ConllLoader初始化时指定的headers决定。 @@ -184,7 +200,7 @@ class Conll2003NERLoader(ConllLoader): ins = {h: data[i] for i, h in enumerate(self.headers)} ds.append(Instance(**ins)) return ds - + def download(self): raise RuntimeError("conll2003 cannot be downloaded automatically.") @@ -204,13 +220,13 @@ class OntoNotesNERLoader(ConllLoader): "[...]", "[...]" """ - + def __init__(self): super().__init__(headers=[Const.RAW_WORD, Const.TARGET], indexes=[3, 10]) - - def _load(self, path:str): + + def _load(self, path: str): dataset = super()._load(path) - + def convert_to_bio(tags): bio_tags = [] flag = None @@ -227,7 +243,7 @@ class OntoNotesNERLoader(ConllLoader): flag = None bio_tags.append(bio_label) return bio_tags - + def convert_word(words): converted_words = [] for word in words: @@ -236,7 +252,7 @@ class OntoNotesNERLoader(ConllLoader): converted_words.append(word) continue # 以下是由于这些符号被转义了,再转回来 - tfrs = {'-LRB-':'(', + tfrs = {'-LRB-': '(', '-RRB-': ')', '-LSB-': '[', '-RSB-': ']', @@ -248,12 +264,12 @@ class OntoNotesNERLoader(ConllLoader): else: converted_words.append(word) return converted_words - + dataset.apply_field(convert_word, field_name=Const.RAW_WORD, new_field_name=Const.RAW_WORD) dataset.apply_field(convert_to_bio, field_name=Const.TARGET, new_field_name=Const.TARGET) - + return dataset - + def download(self): raise 
RuntimeError("Ontonotes cannot be downloaded automatically, you can refer " "https://github.com/yhcc/OntoNotes-5.0-NER to download and preprocess.") @@ -262,13 +278,13 @@ class OntoNotesNERLoader(ConllLoader): class CTBLoader(Loader): def __init__(self): super().__init__() - - def _load(self, path:str): + + def _load(self, path: str): pass class CNNERLoader(Loader): - def _load(self, path:str): + def _load(self, path: str): """ 支持加载形如以下格式的内容,一行两列,以空格隔开两个sample @@ -331,10 +347,11 @@ class MsraNERLoader(CNNERLoader): "[...]", "[...]" """ + def __init__(self): super().__init__() - - def download(self, dev_ratio:float=0.1, re_download:bool=False)->str: + + def download(self, dev_ratio: float = 0.1, re_download: bool = False) -> str: """ 自动下载MSAR-NER的数据,如果你使用该数据,请引用 Gina-Anne Levow, 2006, The Third International Chinese Language Processing Bakeoff: Word Segmentation and Named Entity Recognition. @@ -356,7 +373,7 @@ class MsraNERLoader(CNNERLoader): if time.time() - modify_time > 1 and re_download: # 通过这种比较丑陋的方式判断一下文件是否是才下载的 shutil.rmtree(data_dir) data_dir = self._get_dataset_path(dataset_name=dataset_name) - + if not os.path.exists(os.path.join(data_dir, 'dev.conll')): if dev_ratio > 0: assert 0 < dev_ratio < 1, "dev_ratio should be in range (0,1)." @@ -380,15 +397,15 @@ class MsraNERLoader(CNNERLoader): finally: if os.path.exists(os.path.join(data_dir, 'middle_file.conll')): os.remove(os.path.join(data_dir, 'middle_file.conll')) - + return data_dir class WeiboNERLoader(CNNERLoader): def __init__(self): super().__init__() - - def download(self)->str: + + def download(self) -> str: """ 自动下载Weibo-NER的数据,如果你使用了该数据,请引用 Nanyun Peng and Mark Dredze, 2015, Named Entity Recognition for Chinese Social Media with Jointly Trained Embeddings. @@ -397,7 +414,7 @@ class WeiboNERLoader(CNNERLoader): """ dataset_name = 'weibo-ner' data_dir = self._get_dataset_path(dataset_name=dataset_name) - + return data_dir @@ -427,11 +444,12 @@ class PeopleDailyNERLoader(CNNERLoader): "[...]", "[...]" """ + def __init__(self): super().__init__() - + def download(self) -> str: dataset_name = 'peopledaily' data_dir = self._get_dataset_path(dataset_name=dataset_name) - + return data_dir diff --git a/fastNLP/io/loader/csv.py b/fastNLP/io/loader/csv.py index 5195cc8e..0d6e35fa 100644 --- a/fastNLP/io/loader/csv.py +++ b/fastNLP/io/loader/csv.py @@ -1,7 +1,13 @@ +"""undocumented""" + +__all__ = [ + "CSVLoader", +] + +from .loader import Loader +from ..file_reader import _read_csv from ...core.dataset import DataSet from ...core.instance import Instance -from ..file_reader import _read_csv -from .loader import Loader class CSVLoader(Loader): diff --git a/fastNLP/io/loader/cws.py b/fastNLP/io/loader/cws.py index fab7639c..2fbb1091 100644 --- a/fastNLP/io/loader/cws.py +++ b/fastNLP/io/loader/cws.py @@ -1,11 +1,18 @@ -from .loader import Loader -from ...core.dataset import DataSet -from ...core.instance import Instance +"""undocumented""" + +__all__ = [ + "CWSLoader" +] + import glob import os -import time -import shutil import random +import shutil +import time + +from .loader import Loader +from ...core.dataset import DataSet +from ...core.instance import Instance class CWSLoader(Loader): diff --git a/fastNLP/io/loader/json.py b/fastNLP/io/loader/json.py index 8856b73a..012dee5a 100644 --- a/fastNLP/io/loader/json.py +++ b/fastNLP/io/loader/json.py @@ -1,7 +1,13 @@ +"""undocumented""" + +__all__ = [ + "JsonLoader" +] + +from .loader import Loader +from ..file_reader import _read_json from ...core.dataset import DataSet from 
...core.instance import Instance -from ..file_reader import _read_json -from .loader import Loader class JsonLoader(Loader): diff --git a/fastNLP/io/loader/loader.py b/fastNLP/io/loader/loader.py index e7b419ac..22636a27 100644 --- a/fastNLP/io/loader/loader.py +++ b/fastNLP/io/loader/loader.py @@ -1,8 +1,15 @@ -from ...core.dataset import DataSet -from .. import DataBundle -from ..utils import check_loader_paths +"""undocumented""" + +__all__ = [ + "Loader" +] + from typing import Union, Dict + +from .. import DataBundle from ..file_utils import _get_dataset_url, get_cache_path, cached_path +from ..utils import check_loader_paths +from ...core.dataset import DataSet class Loader: diff --git a/fastNLP/io/loader/matching.py b/fastNLP/io/loader/matching.py index 26455914..7f03ca3e 100644 --- a/fastNLP/io/loader/matching.py +++ b/fastNLP/io/loader/matching.py @@ -1,10 +1,21 @@ +"""undocumented""" + +__all__ = [ + "MNLILoader", + "SNLILoader", + "QNLILoader", + "RTELoader", + "QuoraLoader", +] + +import os import warnings -from .loader import Loader +from typing import Union, Dict + from .json import JsonLoader -from ...core.const import Const +from .loader import Loader from .. import DataBundle -import os -from typing import Union, Dict +from ...core.const import Const from ...core.dataset import DataSet from ...core.instance import Instance @@ -22,10 +33,11 @@ class MNLILoader(Loader): "...", "...","." """ + def __init__(self): super().__init__() - - def _load(self, path:str): + + def _load(self, path: str): ds = DataSet() with open(path, 'r', encoding='utf-8') as f: f.readline() # 跳过header @@ -50,8 +62,8 @@ class MNLILoader(Loader): if raw_words1 and raw_words2 and target: ds.append(Instance(raw_words1=raw_words1, raw_words2=raw_words2, target=target)) return ds - - def load(self, paths:str=None): + + def load(self, paths: str = None): """ :param str paths: 传入数据所在目录,会在该目录下寻找dev_matched.tsv, dev_mismatched.tsv, test_matched.tsv, @@ -64,13 +76,13 @@ class MNLILoader(Loader): paths = self.download() if not os.path.isdir(paths): raise NotADirectoryError(f"{paths} is not a valid directory.") - - files = {'dev_matched':"dev_matched.tsv", - "dev_mismatched":"dev_mismatched.tsv", - "test_matched":"test_matched.tsv", - "test_mismatched":"test_mismatched.tsv", - "train":'train.tsv'} - + + files = {'dev_matched': "dev_matched.tsv", + "dev_mismatched": "dev_mismatched.tsv", + "test_matched": "test_matched.tsv", + "test_mismatched": "test_mismatched.tsv", + "train": 'train.tsv'} + datasets = {} for name, filename in files.items(): filepath = os.path.join(paths, filename) @@ -78,11 +90,11 @@ class MNLILoader(Loader): if 'test' not in name: raise FileNotFoundError(f"{name} not found in directory {filepath}.") datasets[name] = self._load(filepath) - + data_bundle = DataBundle(datasets=datasets) - + return data_bundle - + def download(self): """ 如果你使用了这个数据,请引用 @@ -106,14 +118,15 @@ class SNLILoader(JsonLoader): "...", "...", "." 
""" + def __init__(self): super().__init__(fields={ 'sentence1': Const.RAW_WORDS(0), 'sentence2': Const.RAW_WORDS(1), 'gold_label': Const.TARGET, }) - - def load(self, paths: Union[str, Dict[str, str]]=None) -> DataBundle: + + def load(self, paths: Union[str, Dict[str, str]] = None) -> DataBundle: """ 从指定一个或多个路径中的文件中读取数据,返回:class:`~fastNLP.io.DataBundle` 。 @@ -138,11 +151,11 @@ class SNLILoader(JsonLoader): paths = _paths else: raise NotADirectoryError(f"{paths} is not a valid directory.") - + datasets = {name: self._load(path) for name, path in paths.items()} data_bundle = DataBundle(datasets=datasets) return data_bundle - + def download(self): """ 如果您的文章使用了这份数据,请引用 @@ -169,12 +182,13 @@ class QNLILoader(JsonLoader): test数据集没有target列 """ + def __init__(self): super().__init__() - + def _load(self, path): ds = DataSet() - + with open(path, 'r', encoding='utf-8') as f: f.readline() # 跳过header if path.endswith("test.tsv"): @@ -198,7 +212,7 @@ class QNLILoader(JsonLoader): if raw_words1 and raw_words2 and target: ds.append(Instance(raw_words1=raw_words1, raw_words2=raw_words2, target=target)) return ds - + def download(self): """ 如果您的实验使用到了该数据,请引用 @@ -225,12 +239,13 @@ class RTELoader(Loader): test数据集没有target列 """ + def __init__(self): super().__init__() - - def _load(self, path:str): + + def _load(self, path: str): ds = DataSet() - + with open(path, 'r', encoding='utf-8') as f: f.readline() # 跳过header if path.endswith("test.tsv"): @@ -254,7 +269,7 @@ class RTELoader(Loader): if raw_words1 and raw_words2 and target: ds.append(Instance(raw_words1=raw_words1, raw_words2=raw_words2, target=target)) return ds - + def download(self): return self._get_dataset_path('rte') @@ -281,12 +296,13 @@ class QuoraLoader(Loader): "...","." """ + def __init__(self): super().__init__() - - def _load(self, path:str): + + def _load(self, path: str): ds = DataSet() - + with open(path, 'r', encoding='utf-8') as f: for line in f: line = line.strip() @@ -298,6 +314,6 @@ class QuoraLoader(Loader): if raw_words1 and raw_words2 and target: ds.append(Instance(raw_words1=raw_words1, raw_words2=raw_words2, target=target)) return ds - + def download(self): raise RuntimeError("Quora cannot be downloaded automatically.") diff --git a/fastNLP/io/pipe/classification.py b/fastNLP/io/pipe/classification.py index f42d5400..30c591a4 100644 --- a/fastNLP/io/pipe/classification.py +++ b/fastNLP/io/pipe/classification.py @@ -1,26 +1,39 @@ +"""undocumented""" + +__all__ = [ + "YelpFullPipe", + "YelpPolarityPipe", + "SSTPipe", + "SST2Pipe", + 'IMDBPipe' +] + +import re + from nltk import Tree +from .pipe import Pipe +from .utils import get_tokenizer, _indexize, _add_words_field, _drop_empty_instance from ..data_bundle import DataBundle -from ...core.vocabulary import Vocabulary -from ...core.const import Const from ..loader.classification import IMDBLoader, YelpFullLoader, SSTLoader, SST2Loader, YelpPolarityLoader +from ...core.const import Const from ...core.dataset import DataSet from ...core.instance import Instance +from ...core.vocabulary import Vocabulary -from .utils import get_tokenizer, _indexize, _add_words_field, _drop_empty_instance -from .pipe import Pipe -import re nonalpnum = re.compile('[^0-9a-zA-Z?!\']+') + class _CLSPipe(Pipe): """ 分类问题的基类,负责对classification的数据进行tokenize操作。默认是对raw_words列操作,然后生成words列 """ - def __init__(self, tokenizer:str='spacy', lang='en'): + + def __init__(self, tokenizer: str = 'spacy', lang='en'): self.tokenizer = get_tokenizer(tokenizer, lang=lang) - + def _tokenize(self, data_bundle, 
field_name=Const.INPUT, new_field_name=None): """ 将DataBundle中的数据进行tokenize @@ -33,9 +46,9 @@ class _CLSPipe(Pipe): new_field_name = new_field_name or field_name for name, dataset in data_bundle.datasets.items(): dataset.apply_field(self.tokenizer, field_name=field_name, new_field_name=new_field_name) - + return data_bundle - + def _granularize(self, data_bundle, tag_map): """ 该函数对data_bundle中'target'列中的内容进行转换。 @@ -47,9 +60,9 @@ class _CLSPipe(Pipe): """ for name in list(data_bundle.datasets.keys()): dataset = data_bundle.get_dataset(name) - dataset.apply_field(lambda target:tag_map.get(target, -100), field_name=Const.TARGET, + dataset.apply_field(lambda target: tag_map.get(target, -100), field_name=Const.TARGET, new_field_name=Const.TARGET) - dataset.drop(lambda ins:ins[Const.TARGET] == -100) + dataset.drop(lambda ins: ins[Const.TARGET] == -100) data_bundle.set_dataset(dataset, name) return data_bundle @@ -69,7 +82,7 @@ def _clean_str(words): t = ''.join(tt) if t != '': words_collection.append(t) - + return words_collection @@ -89,19 +102,20 @@ class YelpFullPipe(_CLSPipe): 1、2归为1类,3归为1类,4、5归为1类;若为5, 则有5分类问题。 :param str tokenizer: 使用哪种tokenize方式将数据切成单词。支持'spacy'和'raw'。raw使用空格作为切分。 """ - def __init__(self, lower:bool=False, granularity=5, tokenizer:str='spacy'): + + def __init__(self, lower: bool = False, granularity=5, tokenizer: str = 'spacy'): super().__init__(tokenizer=tokenizer, lang='en') self.lower = lower assert granularity in (2, 3, 5), "granularity can only be 2,3,5." self.granularity = granularity - - if granularity==2: + + if granularity == 2: self.tag_map = {"1": 0, "2": 0, "4": 1, "5": 1} - elif granularity==3: - self.tag_map = {"1": 0, "2": 0, "3":1, "4": 2, "5": 2} + elif granularity == 3: + self.tag_map = {"1": 0, "2": 0, "3": 1, "4": 2, "5": 2} else: self.tag_map = {"1": 0, "2": 1, "3": 2, "4": 3, "5": 4} - + def _tokenize(self, data_bundle, field_name=Const.INPUT, new_field_name=None): """ 将DataBundle中的数据进行tokenize @@ -116,7 +130,7 @@ class YelpFullPipe(_CLSPipe): dataset.apply_field(self.tokenizer, field_name=field_name, new_field_name=new_field_name) dataset.apply_field(_clean_str, field_name=field_name, new_field_name=new_field_name) return data_bundle - + def process(self, data_bundle): """ 传入的DataSet应该具备如下的结构 @@ -131,30 +145,30 @@ class YelpFullPipe(_CLSPipe): :param data_bundle: :return: """ - + # 复制一列words data_bundle = _add_words_field(data_bundle, lower=self.lower) - + # 进行tokenize data_bundle = self._tokenize(data_bundle=data_bundle, field_name=Const.INPUT) - + # 根据granularity设置tag data_bundle = self._granularize(data_bundle, tag_map=self.tag_map) - + # 删除空行 data_bundle = _drop_empty_instance(data_bundle, field_name=Const.INPUT) - + # index data_bundle = _indexize(data_bundle=data_bundle) - + for name, dataset in data_bundle.datasets.items(): dataset.add_seq_len(Const.INPUT) - + data_bundle.set_input(Const.INPUT, Const.INPUT_LEN) data_bundle.set_target(Const.TARGET) - + return data_bundle - + def process_from_file(self, paths=None): """ @@ -179,27 +193,28 @@ class YelpPolarityPipe(_CLSPipe): :param bool lower: 是否对输入进行小写化。 :param str tokenizer: 使用哪种tokenize方式将数据切成单词。支持'spacy'和'raw'。raw使用空格作为切分。 """ - def __init__(self, lower:bool=False, tokenizer:str='spacy'): + + def __init__(self, lower: bool = False, tokenizer: str = 'spacy'): super().__init__(tokenizer=tokenizer, lang='en') self.lower = lower - + def process(self, data_bundle): # 复制一列words data_bundle = _add_words_field(data_bundle, lower=self.lower) - + # 进行tokenize data_bundle = 
self._tokenize(data_bundle=data_bundle, field_name=Const.INPUT) # index data_bundle = _indexize(data_bundle=data_bundle) - + for name, dataset in data_bundle.datasets.items(): dataset.add_seq_len(Const.INPUT) - + data_bundle.set_input(Const.INPUT, Const.INPUT_LEN) data_bundle.set_target(Const.TARGET) - + return data_bundle - + def process_from_file(self, paths=None): """ @@ -230,7 +245,7 @@ class SSTPipe(_CLSPipe): 0、1归为1类,2归为1类,3、4归为1类;若为5, 则有5分类问题。 :param str tokenizer: 使用哪种tokenize方式将数据切成单词。支持'spacy'和'raw'。raw使用空格作为切分。 """ - + def __init__(self, subtree=False, train_subtree=True, lower=False, granularity=5, tokenizer='spacy'): super().__init__(tokenizer=tokenizer, lang='en') self.subtree = subtree @@ -238,15 +253,15 @@ class SSTPipe(_CLSPipe): self.lower = lower assert granularity in (2, 3, 5), "granularity can only be 2,3,5." self.granularity = granularity - - if granularity==2: + + if granularity == 2: self.tag_map = {"0": 0, "1": 0, "3": 1, "4": 1} - elif granularity==3: - self.tag_map = {"0": 0, "1": 0, "2":1, "3": 2, "4": 2} + elif granularity == 3: + self.tag_map = {"0": 0, "1": 0, "2": 1, "3": 2, "4": 2} else: self.tag_map = {"0": 0, "1": 1, "2": 2, "3": 3, "4": 4} - - def process(self, data_bundle:DataBundle): + + def process(self, data_bundle: DataBundle): """ 对DataBundle中的数据进行预处理。输入的DataSet应该至少拥有raw_words这一列,且内容类似与 @@ -277,26 +292,26 @@ class SSTPipe(_CLSPipe): instance = Instance(raw_words=' '.join(tree.leaves()), target=tree.label()) ds.append(instance) data_bundle.set_dataset(ds, name) - + _add_words_field(data_bundle, lower=self.lower) - + # 进行tokenize data_bundle = self._tokenize(data_bundle=data_bundle, field_name=Const.INPUT) - + # 根据granularity设置tag data_bundle = self._granularize(data_bundle, tag_map=self.tag_map) - + # index data_bundle = _indexize(data_bundle=data_bundle) - + for name, dataset in data_bundle.datasets.items(): dataset.add_seq_len(Const.INPUT) - + data_bundle.set_input(Const.INPUT, Const.INPUT_LEN) data_bundle.set_target(Const.TARGET) - + return data_bundle - + def process_from_file(self, paths=None): data_bundle = SSTLoader().load(paths) return self.process(data_bundle=data_bundle) @@ -316,11 +331,12 @@ class SST2Pipe(_CLSPipe): :param bool lower: 是否对输入进行小写化。 :param str tokenizer: 使用哪种tokenize方式将数据切成单词。支持'spacy'和'raw'。raw使用空格作为切分。 """ + def __init__(self, lower=False, tokenizer='spacy'): super().__init__(tokenizer=tokenizer, lang='en') self.lower = lower - - def process(self, data_bundle:DataBundle): + + def process(self, data_bundle: DataBundle): """ 可以处理的DataSet应该具备如下的结构 @@ -335,15 +351,15 @@ class SST2Pipe(_CLSPipe): :return: """ _add_words_field(data_bundle, self.lower) - + data_bundle = self._tokenize(data_bundle=data_bundle) - + src_vocab = Vocabulary() src_vocab.from_dataset(data_bundle.datasets['train'], field_name=Const.INPUT, - no_create_entry_dataset=[dataset for name,dataset in data_bundle.datasets.items() if + no_create_entry_dataset=[dataset for name, dataset in data_bundle.datasets.items() if name != 'train']) src_vocab.index_dataset(*data_bundle.datasets.values(), field_name=Const.INPUT) - + tgt_vocab = Vocabulary(unknown=None, padding=None) tgt_vocab.from_dataset(data_bundle.datasets['train'], field_name=Const.TARGET) datasets = [] @@ -351,18 +367,18 @@ class SST2Pipe(_CLSPipe): if dataset.has_field(Const.TARGET): datasets.append(dataset) tgt_vocab.index_dataset(*datasets, field_name=Const.TARGET) - + data_bundle.set_vocab(src_vocab, Const.INPUT) data_bundle.set_vocab(tgt_vocab, Const.TARGET) - + for name, dataset in 
data_bundle.datasets.items(): dataset.add_seq_len(Const.INPUT) - + data_bundle.set_input(Const.INPUT, Const.INPUT_LEN) data_bundle.set_target(Const.TARGET) - + return data_bundle - + def process_from_file(self, paths=None): """ @@ -390,11 +406,12 @@ class IMDBPipe(_CLSPipe): :param bool lower: 是否将words列的数据小写。 :param str tokenizer: 使用什么tokenizer来将句子切分为words. 支持spacy, raw两种。raw即使用空格拆分。 """ - def __init__(self, lower:bool=False, tokenizer:str='spacy'): + + def __init__(self, lower: bool = False, tokenizer: str = 'spacy'): super().__init__(tokenizer=tokenizer, lang='en') self.lower = lower - - def process(self, data_bundle:DataBundle): + + def process(self, data_bundle: DataBundle): """ 期待的DataBunlde中输入的DataSet应该类似于如下,有两个field,raw_words和target,且均为str类型 @@ -409,25 +426,26 @@ class IMDBPipe(_CLSPipe): target列应该为str。 :return: DataBundle """ + # 替换
def replace_br(raw_words): raw_words = raw_words.replace("<br />
", ' ') return raw_words - + for name, dataset in data_bundle.datasets.items(): dataset.apply_field(replace_br, field_name=Const.RAW_WORD, new_field_name=Const.RAW_WORD) - + _add_words_field(data_bundle, lower=self.lower) self._tokenize(data_bundle, field_name=Const.INPUT, new_field_name=Const.INPUT) _indexize(data_bundle) - + for name, dataset in data_bundle.datasets.items(): dataset.add_seq_len(Const.INPUT) dataset.set_input(Const.INPUT, Const.INPUT_LEN) dataset.set_target(Const.TARGET) - + return data_bundle - + def process_from_file(self, paths=None): """ @@ -437,8 +455,5 @@ class IMDBPipe(_CLSPipe): # 读取数据 data_bundle = IMDBLoader().load(paths) data_bundle = self.process(data_bundle) - + return data_bundle - - - diff --git a/fastNLP/io/pipe/conll.py b/fastNLP/io/pipe/conll.py index 617d1236..2efec8e0 100644 --- a/fastNLP/io/pipe/conll.py +++ b/fastNLP/io/pipe/conll.py @@ -1,13 +1,25 @@ +"""undocumented""" + +__all__ = [ + "Conll2003NERPipe", + "Conll2003Pipe", + "OntoNotesNERPipe", + "MsraNERPipe", + "PeopleDailyPipe", + "WeiboNERPipe" +] + from .pipe import Pipe -from .. import DataBundle +from .utils import _add_chars_field +from .utils import _indexize, _add_words_field from .utils import iob2, iob2bioes -from ...core.const import Const +from .. import DataBundle from ..loader.conll import Conll2003NERLoader, OntoNotesNERLoader -from .utils import _indexize, _add_words_field -from .utils import _add_chars_field from ..loader.conll import PeopleDailyNERLoader, WeiboNERLoader, MsraNERLoader, ConllLoader +from ...core.const import Const from ...core.vocabulary import Vocabulary + class _NERPipe(Pipe): """ NER任务的处理Pipe, 该Pipe会(1)复制raw_words列,并命名为words; (2)在words, target列建立词表 @@ -20,14 +32,14 @@ class _NERPipe(Pipe): :param: str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。 :param bool lower: 是否将words小写化后再建立词表,绝大多数情况都不需要设置为True。 """ - + def __init__(self, encoding_type: str = 'bio', lower: bool = False): if encoding_type == 'bio': self.convert_tag = iob2 else: self.convert_tag = lambda words: iob2bioes(iob2(words)) self.lower = lower - + def process(self, data_bundle: DataBundle) -> DataBundle: """ 支持的DataSet的field为 @@ -46,21 +58,21 @@ class _NERPipe(Pipe): # 转换tag for name, dataset in data_bundle.datasets.items(): dataset.apply_field(self.convert_tag, field_name=Const.TARGET, new_field_name=Const.TARGET) - + _add_words_field(data_bundle, lower=self.lower) - + # index _indexize(data_bundle) - + input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN] target_fields = [Const.TARGET, Const.INPUT_LEN] - + for name, dataset in data_bundle.datasets.items(): dataset.add_seq_len(Const.INPUT) - + data_bundle.set_input(*input_fields) data_bundle.set_target(*target_fields) - + return data_bundle @@ -84,7 +96,7 @@ class Conll2003NERPipe(_NERPipe): :param: str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。 :param bool lower: 是否将words小写化后再建立词表,绝大多数情况都不需要设置为True。 """ - + def process_from_file(self, paths) -> DataBundle: """ @@ -94,7 +106,7 @@ class Conll2003NERPipe(_NERPipe): # 读取数据 data_bundle = Conll2003NERLoader().load(paths) data_bundle = self.process(data_bundle) - + return data_bundle @@ -125,8 +137,8 @@ class Conll2003Pipe(Pipe): else: self.ner_convert_tag = lambda tags: iob2bioes(iob2(tags)) self.lower = lower - - def process(self, data_bundle)->DataBundle: + + def process(self, data_bundle) -> DataBundle: """ 输入的DataSet应该类似于如下的形式 @@ -145,9 +157,9 @@ class Conll2003Pipe(Pipe): dataset.drop(lambda x: "-DOCSTART-" in x[Const.RAW_WORD]) 
dataset.apply_field(self.chunk_convert_tag, field_name='chunk', new_field_name='chunk') dataset.apply_field(self.ner_convert_tag, field_name='ner', new_field_name='ner') - + _add_words_field(data_bundle, lower=self.lower) - + # index _indexize(data_bundle, input_field_names=Const.INPUT, target_field_names=['pos', 'ner']) # chunk中存在一些tag只在dev中出现,没在train中 @@ -155,18 +167,18 @@ class Conll2003Pipe(Pipe): tgt_vocab.from_dataset(*data_bundle.datasets.values(), field_name='chunk') tgt_vocab.index_dataset(*data_bundle.datasets.values(), field_name='chunk') data_bundle.set_vocab(tgt_vocab, 'chunk') - + input_fields = [Const.INPUT, Const.INPUT_LEN] target_fields = ['pos', 'ner', 'chunk', Const.INPUT_LEN] - + for name, dataset in data_bundle.datasets.items(): dataset.add_seq_len(Const.INPUT) - + data_bundle.set_input(*input_fields) data_bundle.set_target(*target_fields) - + return data_bundle - + def process_from_file(self, paths): """ @@ -194,7 +206,7 @@ class OntoNotesNERPipe(_NERPipe): :param: str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。 :param bool lower: 是否将words小写化后再建立词表,绝大多数情况都不需要设置为True。 """ - + def process_from_file(self, paths): data_bundle = OntoNotesNERLoader().load(paths) return self.process(data_bundle) @@ -211,13 +223,13 @@ class _CNNERPipe(Pipe): :param: str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。 """ - + def __init__(self, encoding_type: str = 'bio'): if encoding_type == 'bio': self.convert_tag = iob2 else: self.convert_tag = lambda words: iob2bioes(iob2(words)) - + def process(self, data_bundle: DataBundle) -> DataBundle: """ 支持的DataSet的field为 @@ -239,21 +251,21 @@ class _CNNERPipe(Pipe): # 转换tag for name, dataset in data_bundle.datasets.items(): dataset.apply_field(self.convert_tag, field_name=Const.TARGET, new_field_name=Const.TARGET) - + _add_chars_field(data_bundle, lower=False) - + # index _indexize(data_bundle, input_field_names=Const.CHAR_INPUT, target_field_names=Const.TARGET) - + input_fields = [Const.TARGET, Const.CHAR_INPUT, Const.INPUT_LEN] target_fields = [Const.TARGET, Const.INPUT_LEN] - + for name, dataset in data_bundle.datasets.items(): dataset.add_seq_len(Const.CHAR_INPUT) - + data_bundle.set_input(*input_fields) data_bundle.set_target(*target_fields) - + return data_bundle @@ -272,6 +284,7 @@ class MsraNERPipe(_CNNERPipe): target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。 """ + def process_from_file(self, paths=None) -> DataBundle: data_bundle = MsraNERLoader().load(paths) return self.process(data_bundle) @@ -291,6 +304,7 @@ class PeopleDailyPipe(_CNNERPipe): raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。 """ + def process_from_file(self, paths=None) -> DataBundle: data_bundle = PeopleDailyNERLoader().load(paths) return self.process(data_bundle) @@ -312,6 +326,7 @@ class WeiboNERPipe(_CNNERPipe): :param: str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。 """ + def process_from_file(self, paths=None) -> DataBundle: data_bundle = WeiboNERLoader().load(paths) return self.process(data_bundle) diff --git a/fastNLP/io/pipe/cws.py b/fastNLP/io/pipe/cws.py index 4ca0219c..748cf10a 100644 --- a/fastNLP/io/pipe/cws.py +++ b/fastNLP/io/pipe/cws.py @@ -1,3 +1,9 @@ +"""undocumented""" + +__all__ = [ + "CWSPipe" +] + import re from itertools import chain diff --git a/fastNLP/io/pipe/matching.py b/fastNLP/io/pipe/matching.py index ffa6375b..699438c8 100644 --- a/fastNLP/io/pipe/matching.py +++ 
b/fastNLP/io/pipe/matching.py @@ -1,9 +1,25 @@ +"""undocumented""" + +__all__ = [ + "MatchingBertPipe", + "RTEBertPipe", + "SNLIBertPipe", + "QuoraBertPipe", + "QNLIBertPipe", + "MNLIBertPipe", + "MatchingPipe", + "RTEPipe", + "SNLIPipe", + "QuoraPipe", + "QNLIPipe", + "MNLIPipe", +] from .pipe import Pipe from .utils import get_tokenizer +from ..loader.matching import SNLILoader, MNLILoader, QNLILoader, RTELoader, QuoraLoader from ...core.const import Const from ...core.vocabulary import Vocabulary -from ..loader.matching import SNLILoader, MNLILoader, QNLILoader, RTELoader, QuoraLoader class MatchingBertPipe(Pipe): @@ -24,12 +40,13 @@ class MatchingBertPipe(Pipe): :param bool lower: 是否将word小写化。 :param str tokenizer: 使用什么tokenizer来将句子切分为words. 支持spacy, raw两种。raw即使用空格拆分。 """ - def __init__(self, lower=False, tokenizer: str='raw'): + + def __init__(self, lower=False, tokenizer: str = 'raw'): super().__init__() - + self.lower = bool(lower) self.tokenizer = get_tokenizer(tokenizer=tokenizer) - + def _tokenize(self, data_bundle, field_names, new_field_names): """ @@ -43,62 +60,62 @@ class MatchingBertPipe(Pipe): dataset.apply_field(lambda words: self.tokenizer(words), field_name=field_name, new_field_name=new_field_name) return data_bundle - + def process(self, data_bundle): for dataset in data_bundle.datasets.values(): if dataset.has_field(Const.TARGET): dataset.drop(lambda x: x[Const.TARGET] == '-') - + for name, dataset in data_bundle.datasets.items(): dataset.copy_field(Const.RAW_WORDS(0), Const.INPUTS(0), ) dataset.copy_field(Const.RAW_WORDS(1), Const.INPUTS(1), ) - + if self.lower: for name, dataset in data_bundle.datasets.items(): dataset[Const.INPUTS(0)].lower() dataset[Const.INPUTS(1)].lower() - + data_bundle = self._tokenize(data_bundle, [Const.INPUTS(0), Const.INPUTS(1)], [Const.INPUTS(0), Const.INPUTS(1)]) - + # concat两个words def concat(ins): words0 = ins[Const.INPUTS(0)] words1 = ins[Const.INPUTS(1)] words = words0 + ['[SEP]'] + words1 return words - + for name, dataset in data_bundle.datasets.items(): dataset.apply(concat, new_field_name=Const.INPUT) dataset.delete_field(Const.INPUTS(0)) dataset.delete_field(Const.INPUTS(1)) - + word_vocab = Vocabulary() word_vocab.from_dataset(*[dataset for name, dataset in data_bundle.datasets.items() if 'train' in name], field_name=Const.INPUT, no_create_entry_dataset=[dataset for name, dataset in data_bundle.datasets.items() if 'train' not in name]) word_vocab.index_dataset(*data_bundle.datasets.values(), field_name=Const.INPUT) - + target_vocab = Vocabulary(padding=None, unknown=None) target_vocab.from_dataset(data_bundle.datasets['train'], field_name=Const.TARGET) has_target_datasets = [dataset for name, dataset in data_bundle.datasets.items() if dataset.has_field(Const.TARGET)] target_vocab.index_dataset(*has_target_datasets, field_name=Const.TARGET) - + data_bundle.set_vocab(word_vocab, Const.INPUT) data_bundle.set_vocab(target_vocab, Const.TARGET) - + input_fields = [Const.INPUT, Const.INPUT_LEN] target_fields = [Const.TARGET] - + for name, dataset in data_bundle.datasets.items(): dataset.add_seq_len(Const.INPUT) dataset.set_input(*input_fields, flag=True) for fields in target_fields: if dataset.has_field(fields): dataset.set_target(fields, flag=True) - + return data_bundle @@ -150,12 +167,13 @@ class MatchingPipe(Pipe): :param bool lower: 是否将所有raw_words转为小写。 :param str tokenizer: 将原始数据tokenize的方式。支持spacy, raw. 
spacy是使用spacy切分,raw就是用空格切分。 """ - def __init__(self, lower=False, tokenizer: str='raw'): + + def __init__(self, lower=False, tokenizer: str = 'raw'): super().__init__() - + self.lower = bool(lower) self.tokenizer = get_tokenizer(tokenizer=tokenizer) - + def _tokenize(self, data_bundle, field_names, new_field_names): """ @@ -169,7 +187,7 @@ class MatchingPipe(Pipe): dataset.apply_field(lambda words: self.tokenizer(words), field_name=field_name, new_field_name=new_field_name) return data_bundle - + def process(self, data_bundle): """ 接受的DataBundle中的DataSet应该具有以下的field, target列可以没有 @@ -186,35 +204,35 @@ class MatchingPipe(Pipe): """ data_bundle = self._tokenize(data_bundle, [Const.RAW_WORDS(0), Const.RAW_WORDS(1)], [Const.INPUTS(0), Const.INPUTS(1)]) - + for dataset in data_bundle.datasets.values(): if dataset.has_field(Const.TARGET): dataset.drop(lambda x: x[Const.TARGET] == '-') - + if self.lower: for name, dataset in data_bundle.datasets.items(): dataset[Const.INPUTS(0)].lower() dataset[Const.INPUTS(1)].lower() - + word_vocab = Vocabulary() word_vocab.from_dataset(*[dataset for name, dataset in data_bundle.datasets.items() if 'train' in name], field_name=[Const.INPUTS(0), Const.INPUTS(1)], no_create_entry_dataset=[dataset for name, dataset in data_bundle.datasets.items() if 'train' not in name]) word_vocab.index_dataset(*data_bundle.datasets.values(), field_name=[Const.INPUTS(0), Const.INPUTS(1)]) - + target_vocab = Vocabulary(padding=None, unknown=None) target_vocab.from_dataset(data_bundle.datasets['train'], field_name=Const.TARGET) has_target_datasets = [dataset for name, dataset in data_bundle.datasets.items() if dataset.has_field(Const.TARGET)] target_vocab.index_dataset(*has_target_datasets, field_name=Const.TARGET) - + data_bundle.set_vocab(word_vocab, Const.INPUTS(0)) data_bundle.set_vocab(target_vocab, Const.TARGET) - + input_fields = [Const.INPUTS(0), Const.INPUTS(1), Const.INPUT_LENS(0), Const.INPUT_LENS(1)] target_fields = [Const.TARGET] - + for name, dataset in data_bundle.datasets.items(): dataset.add_seq_len(Const.INPUTS(0), Const.INPUT_LENS(0)) dataset.add_seq_len(Const.INPUTS(1), Const.INPUT_LENS(1)) @@ -222,7 +240,7 @@ class MatchingPipe(Pipe): for fields in target_fields: if dataset.has_field(fields): dataset.set_target(fields, flag=True) - + return data_bundle @@ -254,4 +272,3 @@ class MNLIPipe(MatchingPipe): def process_from_file(self, paths=None): data_bundle = MNLILoader().load(paths) return self.process(data_bundle) - diff --git a/fastNLP/io/pipe/pipe.py b/fastNLP/io/pipe/pipe.py index cc45dee4..a1435fd3 100644 --- a/fastNLP/io/pipe/pipe.py +++ b/fastNLP/io/pipe/pipe.py @@ -1,3 +1,9 @@ +"""undocumented""" + +__all__ = [ + "Pipe", +] + from .. 
import DataBundle diff --git a/fastNLP/io/pipe/utils.py b/fastNLP/io/pipe/utils.py index 8facd8d9..f32f58b7 100644 --- a/fastNLP/io/pipe/utils.py +++ b/fastNLP/io/pipe/utils.py @@ -1,8 +1,18 @@ +"""undocumented""" + +__all__ = [ + "iob2", + "iob2bioes", + "get_tokenizer", +] + from typing import List -from ...core.vocabulary import Vocabulary + from ...core.const import Const +from ...core.vocabulary import Vocabulary + -def iob2(tags:List[str])->List[str]: +def iob2(tags: List[str]) -> List[str]: """ 检查数据是否是合法的IOB数据,如果是IOB1会被自动转换为IOB2。两种格式的区别见 https://datascience.stackexchange.com/questions/37824/difference-between-iob-and-iob2-format @@ -25,7 +35,8 @@ def iob2(tags:List[str])->List[str]: tags[i] = "B" + tag[1:] return tags -def iob2bioes(tags:List[str])->List[str]: + +def iob2bioes(tags: List[str]) -> List[str]: """ 将iob的tag转换为bioes编码 :param tags: @@ -38,12 +49,12 @@ def iob2bioes(tags:List[str])->List[str]: else: split = tag.split('-')[0] if split == 'B': - if i+1!=len(tags) and tags[i+1].split('-')[0] == 'I': + if i + 1 != len(tags) and tags[i + 1].split('-')[0] == 'I': new_tags.append(tag) else: new_tags.append(tag.replace('B-', 'S-')) elif split == 'I': - if i + 1List[str]: return new_tags -def get_tokenizer(tokenizer:str, lang='en'): +def get_tokenizer(tokenizer: str, lang='en'): """ :param str tokenizer: 获取tokenzier方法 @@ -97,13 +108,13 @@ def _indexize(data_bundle, input_field_names=Const.INPUT, target_field_names=Con name != 'train']) src_vocab.index_dataset(*data_bundle.datasets.values(), field_name=input_field_name) data_bundle.set_vocab(src_vocab, input_field_name) - + for target_field_name in target_field_names: tgt_vocab = Vocabulary(unknown=None, padding=None) tgt_vocab.from_dataset(data_bundle.datasets['train'], field_name=target_field_name) tgt_vocab.index_dataset(*data_bundle.datasets.values(), field_name=target_field_name) data_bundle.set_vocab(tgt_vocab, target_field_name) - + return data_bundle @@ -116,7 +127,7 @@ def _add_words_field(data_bundle, lower=False): :return: 传入的DataBundle """ data_bundle.copy_field(field_name=Const.RAW_WORD, new_field_name=Const.INPUT, ignore_miss_dataset=True) - + if lower: for name, dataset in data_bundle.datasets.items(): dataset[Const.INPUT].lower() @@ -132,7 +143,7 @@ def _add_chars_field(data_bundle, lower=False): :return: 传入的DataBundle """ data_bundle.copy_field(field_name=Const.RAW_CHAR, new_field_name=Const.CHAR_INPUT, ignore_miss_dataset=True) - + if lower: for name, dataset in data_bundle.datasets.items(): dataset[Const.CHAR_INPUT].lower() @@ -147,6 +158,7 @@ def _drop_empty_instance(data_bundle, field_name): :param str field_name: 对哪个field进行检查,如果为None,则任意field为空都会删掉 :return: 传入的DataBundle """ + def empty_instance(ins): if field_name: field_value = ins[field_name] @@ -157,10 +169,8 @@ def _drop_empty_instance(data_bundle, field_name): if field_value in ((), {}, [], ''): return True return False - + for name, dataset in data_bundle.datasets.items(): dataset.drop(empty_instance) - + return data_bundle - - diff --git a/fastNLP/io/utils.py b/fastNLP/io/utils.py index faec2a55..e1de2ae7 100644 --- a/fastNLP/io/utils.py +++ b/fastNLP/io/utils.py @@ -1,10 +1,20 @@ -import os +""" +.. 
todo:: + doc +""" -from typing import Union, Dict +__all__ = [ + "check_loader_paths" +] + +import os from pathlib import Path +from typing import Union, Dict + from ..core import logger -def check_loader_paths(paths:Union[str, Dict[str, str]])->Dict[str, str]: + +def check_loader_paths(paths: Union[str, Dict[str, str]]) -> Dict[str, str]: """ 检查传入dataloader的文件的合法性。如果为合法路径,将返回至少包含'train'这个key的dict。类似于下面的结果:: @@ -33,11 +43,13 @@ def check_loader_paths(paths:Union[str, Dict[str, str]])->Dict[str, str]: path_pair = ('train', filename) if 'dev' in filename: if path_pair: - raise Exception("File:{} in {} contains bot `{}` and `dev`.".format(filename, paths, path_pair[0])) + raise Exception( + "File:{} in {} contains bot `{}` and `dev`.".format(filename, paths, path_pair[0])) path_pair = ('dev', filename) if 'test' in filename: if path_pair: - raise Exception("File:{} in {} contains bot `{}` and `test`.".format(filename, paths, path_pair[0])) + raise Exception( + "File:{} in {} contains bot `{}` and `test`.".format(filename, paths, path_pair[0])) path_pair = ('test', filename) if path_pair: files[path_pair[0]] = os.path.join(paths, path_pair[1]) @@ -46,7 +58,7 @@ def check_loader_paths(paths:Union[str, Dict[str, str]])->Dict[str, str]: return files else: raise FileNotFoundError(f"{paths} is not a valid file path.") - + elif isinstance(paths, dict): if paths: if 'train' not in paths: @@ -65,6 +77,7 @@ def check_loader_paths(paths:Union[str, Dict[str, str]])->Dict[str, str]: else: raise TypeError(f"paths only supports str and dict. not {type(paths)}.") + def get_tokenizer(): try: import spacy From efa9496d09d139658683eec0b4a6ae44b93dd88c Mon Sep 17 00:00:00 2001 From: ChenXin Date: Mon, 26 Aug 2019 10:25:51 +0800 Subject: [PATCH 105/153] add __all__ and __doc__ for all files in module 'models', using 'undocumented' tags --- fastNLP/models/base_model.py | 4 ++++ fastNLP/models/bert.py | 8 ++++++-- fastNLP/models/cnn_text_classification.py | 7 ++++++- fastNLP/models/enas_controller.py | 9 +++++++-- fastNLP/models/enas_model.py | 5 ++++- fastNLP/models/enas_trainer.py | 14 +++++++++----- fastNLP/models/enas_utils.py | 8 ++++++-- fastNLP/models/sequence_labeling.py | 12 ++++++------ fastNLP/models/snli.py | 7 +++++-- 9 files changed, 53 insertions(+), 21 deletions(-) diff --git a/fastNLP/models/base_model.py b/fastNLP/models/base_model.py index 2646d580..61edb91f 100644 --- a/fastNLP/models/base_model.py +++ b/fastNLP/models/base_model.py @@ -1,3 +1,7 @@ +"""undocumented""" + +__all__ = [] + import torch from ..modules.decoder.mlp import MLP diff --git a/fastNLP/models/bert.py b/fastNLP/models/bert.py index 3afccc14..0a89b765 100644 --- a/fastNLP/models/bert.py +++ b/fastNLP/models/bert.py @@ -1,16 +1,20 @@ -""" +"""undocumented bert.py is modified from huggingface/pytorch-pretrained-BERT, which is licensed under the Apache License 2.0. """ + +__all__ = [] + import os + import torch from torch import nn from .base_model import BaseModel from ..core.const import Const +from ..core.utils import seq_len_to_mask from ..modules.encoder import BertModel from ..modules.encoder.bert import BertConfig, CONFIG_FILE -from ..core.utils import seq_len_to_mask class BertForSequenceClassification(BaseModel): diff --git a/fastNLP/models/cnn_text_classification.py b/fastNLP/models/cnn_text_classification.py index e00a0697..37a60c35 100644 --- a/fastNLP/models/cnn_text_classification.py +++ b/fastNLP/models/cnn_text_classification.py @@ -1,3 +1,8 @@ +""" +.. 
todo:: + doc +""" + __all__ = [ "CNNText" ] @@ -7,8 +12,8 @@ import torch.nn as nn from ..core.const import Const as C from ..core.utils import seq_len_to_mask -from ..modules import encoder from ..embeddings import embedding +from ..modules import encoder class CNNText(torch.nn.Module): diff --git a/fastNLP/models/enas_controller.py b/fastNLP/models/enas_controller.py index e83c6b51..eec820e4 100644 --- a/fastNLP/models/enas_controller.py +++ b/fastNLP/models/enas_controller.py @@ -1,5 +1,10 @@ -# Code Modified from https://github.com/carpedm20/ENAS-pytorch -"""A module with NAS controller-related code.""" +"""undocumented +Code Modified from https://github.com/carpedm20/ENAS-pytorch +A module with NAS controller-related code. +""" + +__all__ = [] + import collections import os diff --git a/fastNLP/models/enas_model.py b/fastNLP/models/enas_model.py index b6b683c0..2e8ca713 100644 --- a/fastNLP/models/enas_model.py +++ b/fastNLP/models/enas_model.py @@ -1,7 +1,10 @@ -""" +"""undocumented Module containing the shared RNN model. Code Modified from https://github.com/carpedm20/ENAS-pytorch """ + +__all__ = [] + import collections import numpy as np diff --git a/fastNLP/models/enas_trainer.py b/fastNLP/models/enas_trainer.py index 7abcc45f..98d778cd 100644 --- a/fastNLP/models/enas_trainer.py +++ b/fastNLP/models/enas_trainer.py @@ -1,11 +1,15 @@ -# Code Modified from https://github.com/carpedm20/ENAS-pytorch +"""undocumented +Code Modified from https://github.com/carpedm20/ENAS-pytorch +""" + +__all__ = [] + import math -import numpy as np import time -import torch - from datetime import datetime, timedelta +import numpy as np +import torch from torch.optim import Adam try: @@ -15,7 +19,7 @@ except: from ..core.trainer import Trainer from ..core.batch import DataSetIter -from ..core.callback import CallbackManager, CallbackException +from ..core.callback import CallbackException from ..core.dataset import DataSet from ..core.utils import _move_dict_value_to_device from . 
import enas_utils as utils diff --git a/fastNLP/models/enas_utils.py b/fastNLP/models/enas_utils.py index 4e402a9a..cd6c2503 100644 --- a/fastNLP/models/enas_utils.py +++ b/fastNLP/models/enas_utils.py @@ -1,7 +1,11 @@ -# Code Modified from https://github.com/carpedm20/ENAS-pytorch +"""undocumented +Code Modified from https://github.com/carpedm20/ENAS-pytorch +""" + +__all__ = [] -from collections import defaultdict import collections +from collections import defaultdict import numpy as np import torch diff --git a/fastNLP/models/sequence_labeling.py b/fastNLP/models/sequence_labeling.py index 4bf3f95f..0dff21f0 100644 --- a/fastNLP/models/sequence_labeling.py +++ b/fastNLP/models/sequence_labeling.py @@ -1,5 +1,5 @@ """ - 本模块实现了几种序列标注模型 +本模块实现了几种序列标注模型 """ __all__ = [ "SeqLabeling", @@ -12,14 +12,14 @@ import torch.nn as nn import torch.nn.functional as F from .base_model import BaseModel -from ..embeddings import embedding -from ..modules import decoder, encoder -from ..modules.decoder.crf import allowed_transitions -from ..core.utils import seq_len_to_mask from ..core.const import Const as C -from ..modules import LSTM +from ..core.utils import seq_len_to_mask +from ..embeddings import embedding from ..embeddings import get_embeddings from ..modules import ConditionalRandomField +from ..modules import LSTM +from ..modules import decoder, encoder +from ..modules.decoder.crf import allowed_transitions class BiLSTMCRF(BaseModel): diff --git a/fastNLP/models/snli.py b/fastNLP/models/snli.py index 3be942e8..5ca4052d 100644 --- a/fastNLP/models/snli.py +++ b/fastNLP/models/snli.py @@ -1,3 +1,7 @@ +""" +.. todo:: + doc +""" __all__ = [ "ESIM" ] @@ -5,13 +9,12 @@ __all__ = [ import torch import torch.nn as nn import torch.nn.functional as F - from torch.nn import CrossEntropyLoss from .base_model import BaseModel -from ..embeddings.embedding import TokenEmbedding, Embedding from ..core.const import Const from ..core.utils import seq_len_to_mask +from ..embeddings.embedding import TokenEmbedding, Embedding class ESIM(BaseModel): From 2cf9c0ebb1722aae734ceb971b889c43198729a2 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Mon, 26 Aug 2019 10:26:55 +0800 Subject: [PATCH 106/153] add __all__ and __doc__ for all files in module 'modules', using 'undocumented' tags --- fastNLP/modules/decoder/__init__.py | 6 +++- fastNLP/modules/decoder/crf.py | 5 +++- fastNLP/modules/decoder/mlp.py | 2 ++ fastNLP/modules/decoder/utils.py | 2 ++ fastNLP/modules/dropout.py | 6 +++- fastNLP/modules/encoder/__init__.py | 10 +++++-- fastNLP/modules/encoder/_elmo.py | 4 ++- fastNLP/modules/encoder/attention.py | 2 ++ fastNLP/modules/encoder/bert.py | 8 +++--- fastNLP/modules/encoder/char_encoder.py | 2 ++ fastNLP/modules/encoder/conv_maxpool.py | 2 ++ fastNLP/modules/encoder/lstm.py | 3 +- fastNLP/modules/encoder/pooling.py | 2 ++ fastNLP/modules/encoder/star_transformer.py | 3 +- fastNLP/modules/encoder/transformer.py | 2 ++ fastNLP/modules/encoder/variational_rnn.py | 3 +- fastNLP/modules/utils.py | 32 ++++++++++++++------- 17 files changed, 69 insertions(+), 25 deletions(-) diff --git a/fastNLP/modules/decoder/__init__.py b/fastNLP/modules/decoder/__init__.py index 664618b2..57acb172 100644 --- a/fastNLP/modules/decoder/__init__.py +++ b/fastNLP/modules/decoder/__init__.py @@ -1,3 +1,7 @@ +""" +.. 
todo:: + doc +""" __all__ = [ "MLP", "ConditionalRandomField", @@ -6,6 +10,6 @@ __all__ = [ ] from .crf import ConditionalRandomField +from .crf import allowed_transitions from .mlp import MLP from .utils import viterbi_decode -from .crf import allowed_transitions diff --git a/fastNLP/modules/decoder/crf.py b/fastNLP/modules/decoder/crf.py index 9f19afef..b47d0162 100644 --- a/fastNLP/modules/decoder/crf.py +++ b/fastNLP/modules/decoder/crf.py @@ -1,3 +1,5 @@ +"""undocumented""" + __all__ = [ "ConditionalRandomField", "allowed_transitions" @@ -9,13 +11,14 @@ from torch import nn from ..utils import initial_parameter from ...core import Vocabulary + def allowed_transitions(id2target, encoding_type='bio', include_start_end=False): """ 别名::class:`fastNLP.modules.allowed_transitions` :class:`fastNLP.modules.decoder.allowed_transitions` 给定一个id到label的映射表,返回所有可以跳转的(from_tag_id, to_tag_id)列表。 - :param dict,Vocabulary id2target: key是label的indices,value是str类型的tag或tag-label。value可以是只有tag的, 比如"B", "M"; 也可以是 + :param dict, ~fastNLP.Vocabulary id2target: key是label的indices,value是str类型的tag或tag-label。value可以是只有tag的, 比如"B", "M"; 也可以是 "B-NN", "M-NN", tag和label之间一定要用"-"隔开。一般可以通过Vocabulary.idx2word得到id2label。 :param str encoding_type: 支持"bio", "bmes", "bmeso", "bioes"。 :param bool include_start_end: 是否包含开始与结尾的转换。比如在bio中,b/o可以在开头,但是i不能在开头; diff --git a/fastNLP/modules/decoder/mlp.py b/fastNLP/modules/decoder/mlp.py index 9d9d80f2..f6e687a7 100644 --- a/fastNLP/modules/decoder/mlp.py +++ b/fastNLP/modules/decoder/mlp.py @@ -1,3 +1,5 @@ +"""undocumented""" + __all__ = [ "MLP" ] diff --git a/fastNLP/modules/decoder/utils.py b/fastNLP/modules/decoder/utils.py index 3d5ac3f8..118b1414 100644 --- a/fastNLP/modules/decoder/utils.py +++ b/fastNLP/modules/decoder/utils.py @@ -1,3 +1,5 @@ +"""undocumented""" + __all__ = [ "viterbi_decode" ] diff --git a/fastNLP/modules/dropout.py b/fastNLP/modules/dropout.py index 0ea2a2d9..24c20cc6 100644 --- a/fastNLP/modules/dropout.py +++ b/fastNLP/modules/dropout.py @@ -1,4 +1,8 @@ -__all__ = [] +"""undocumented""" + +__all__ = [ + "TimestepDropout" +] import torch diff --git a/fastNLP/modules/encoder/__init__.py b/fastNLP/modules/encoder/__init__.py index 1e99a0fd..0dfc18de 100644 --- a/fastNLP/modules/encoder/__init__.py +++ b/fastNLP/modules/encoder/__init__.py @@ -1,3 +1,8 @@ +""" +.. 
todo:: + doc +""" + __all__ = [ # "BertModel", @@ -24,13 +29,12 @@ __all__ = [ "MultiHeadAttention", ] +from .attention import MultiHeadAttention from .bert import BertModel from .char_encoder import ConvolutionCharEncoder, LSTMCharEncoder from .conv_maxpool import ConvMaxpool from .lstm import LSTM +from .pooling import MaxPool, MaxPoolWithMask, AvgPool, AvgPoolWithMask from .star_transformer import StarTransformer from .transformer import TransformerEncoder from .variational_rnn import VarRNN, VarLSTM, VarGRU - -from .pooling import MaxPool, MaxPoolWithMask, AvgPool, AvgPoolWithMask -from .attention import MultiHeadAttention diff --git a/fastNLP/modules/encoder/_elmo.py b/fastNLP/modules/encoder/_elmo.py index befae8bc..554cf8a9 100644 --- a/fastNLP/modules/encoder/_elmo.py +++ b/fastNLP/modules/encoder/_elmo.py @@ -1,7 +1,9 @@ -""" +"""undocumented 这个页面的代码大量参考了 allenNLP """ +__all__ = [] + from typing import Optional, Tuple, List, Callable import torch diff --git a/fastNLP/modules/encoder/attention.py b/fastNLP/modules/encoder/attention.py index fe3f7fd8..02bd078a 100644 --- a/fastNLP/modules/encoder/attention.py +++ b/fastNLP/modules/encoder/attention.py @@ -1,3 +1,5 @@ +"""undocumented""" + __all__ = [ "MultiHeadAttention" ] diff --git a/fastNLP/modules/encoder/bert.py b/fastNLP/modules/encoder/bert.py index b74c4da0..5026f48a 100644 --- a/fastNLP/modules/encoder/bert.py +++ b/fastNLP/modules/encoder/bert.py @@ -1,4 +1,4 @@ -""" +"""undocumented 这个页面的代码很大程度上参考(复制粘贴)了https://github.com/huggingface/pytorch-pretrained-BERT的代码, 如果你发现该代码对你 有用,也请引用一下他们。 """ @@ -8,17 +8,17 @@ __all__ = [ ] import collections - -import unicodedata import copy import json import math import os +import unicodedata import torch from torch import nn -from ...core import logger + from ..utils import _get_file_name_base_on_postfix +from ...core import logger CONFIG_FILE = 'bert_config.json' VOCAB_NAME = 'vocab.txt' diff --git a/fastNLP/modules/encoder/char_encoder.py b/fastNLP/modules/encoder/char_encoder.py index 6a6e1470..e40bd0dd 100644 --- a/fastNLP/modules/encoder/char_encoder.py +++ b/fastNLP/modules/encoder/char_encoder.py @@ -1,3 +1,5 @@ +"""undocumented""" + __all__ = [ "ConvolutionCharEncoder", "LSTMCharEncoder" diff --git a/fastNLP/modules/encoder/conv_maxpool.py b/fastNLP/modules/encoder/conv_maxpool.py index 8ce6b163..68415189 100644 --- a/fastNLP/modules/encoder/conv_maxpool.py +++ b/fastNLP/modules/encoder/conv_maxpool.py @@ -1,3 +1,5 @@ +"""undocumented""" + __all__ = [ "ConvMaxpool" ] diff --git a/fastNLP/modules/encoder/lstm.py b/fastNLP/modules/encoder/lstm.py index e2358132..1f3eae6d 100644 --- a/fastNLP/modules/encoder/lstm.py +++ b/fastNLP/modules/encoder/lstm.py @@ -1,7 +1,8 @@ -""" +"""undocumented 轻量封装的 Pytorch LSTM 模块. 可在 forward 时传入序列的长度, 自动对padding做合适的处理. 
""" + __all__ = [ "LSTM" ] diff --git a/fastNLP/modules/encoder/pooling.py b/fastNLP/modules/encoder/pooling.py index d8aa54ad..b1272284 100644 --- a/fastNLP/modules/encoder/pooling.py +++ b/fastNLP/modules/encoder/pooling.py @@ -1,3 +1,5 @@ +"""undocumented""" + __all__ = [ "MaxPool", "MaxPoolWithMask", diff --git a/fastNLP/modules/encoder/star_transformer.py b/fastNLP/modules/encoder/star_transformer.py index 3927a494..02d7a6a0 100644 --- a/fastNLP/modules/encoder/star_transformer.py +++ b/fastNLP/modules/encoder/star_transformer.py @@ -1,6 +1,7 @@ -""" +"""undocumented Star-Transformer 的encoder部分的 Pytorch 实现 """ + __all__ = [ "StarTransformer" ] diff --git a/fastNLP/modules/encoder/transformer.py b/fastNLP/modules/encoder/transformer.py index bc488e54..ce9172d5 100644 --- a/fastNLP/modules/encoder/transformer.py +++ b/fastNLP/modules/encoder/transformer.py @@ -1,3 +1,5 @@ +"""undocumented""" + __all__ = [ "TransformerEncoder" ] diff --git a/fastNLP/modules/encoder/variational_rnn.py b/fastNLP/modules/encoder/variational_rnn.py index 8e5e804b..933555c8 100644 --- a/fastNLP/modules/encoder/variational_rnn.py +++ b/fastNLP/modules/encoder/variational_rnn.py @@ -1,6 +1,7 @@ -""" +"""undocumented Variational RNN 的 Pytorch 实现 """ + __all__ = [ "VarRNN", "VarLSTM", diff --git a/fastNLP/modules/utils.py b/fastNLP/modules/utils.py index ead75711..09574782 100644 --- a/fastNLP/modules/utils.py +++ b/fastNLP/modules/utils.py @@ -1,10 +1,20 @@ +""" +.. todo:: + doc +""" + +__all__ = [ + "initial_parameter", + "summary" +] + +import os from functools import reduce import torch import torch.nn as nn import torch.nn.init as init -import glob -import os + def initial_parameter(net, initial_method=None): """A method used to initialize the weights of PyTorch models. 
@@ -40,7 +50,7 @@ def initial_parameter(net, initial_method=None): init_method = init.uniform_ else: init_method = init.xavier_normal_ - + def weights_init(m): # classname = m.__class__.__name__ if isinstance(m, nn.Conv2d) or isinstance(m, nn.Conv1d) or isinstance(m, nn.Conv3d): # for all the cnn @@ -66,7 +76,7 @@ def initial_parameter(net, initial_method=None): else: init.normal_(w.data) # bias # print("init else") - + net.apply(weights_init) @@ -79,11 +89,11 @@ def summary(model: nn.Module): """ train = [] nontrain = [] - + def layer_summary(module: nn.Module): def count_size(sizes): - return reduce(lambda x, y: x*y, sizes) - + return reduce(lambda x, y: x * y, sizes) + for p in module.parameters(recurse=False): if p.requires_grad: train.append(count_size(p.shape)) @@ -91,7 +101,7 @@ def summary(model: nn.Module): nontrain.append(count_size(p.shape)) for subm in module.children(): layer_summary(subm) - + layer_summary(model) total_train = sum(train) total_nontrain = sum(nontrain) @@ -101,7 +111,7 @@ def summary(model: nn.Module): strings.append('Trainable params: {:,}'.format(total_train)) strings.append('Non-trainable params: {:,}'.format(total_nontrain)) max_len = len(max(strings, key=len)) - bar = '-'*(max_len + 3) + bar = '-' * (max_len + 3) strings = [bar] + strings + [bar] print('\n'.join(strings)) return total, total_train, total_nontrain @@ -128,9 +138,9 @@ def _get_file_name_base_on_postfix(dir_path, postfix): :param postfix: 形如".bin", ".json"等 :return: str,文件的路径 """ - files = list(filter(lambda filename:filename.endswith(postfix), os.listdir(os.path.join(dir_path)))) + files = list(filter(lambda filename: filename.endswith(postfix), os.listdir(os.path.join(dir_path)))) if len(files) == 0: raise FileNotFoundError(f"There is no file endswith *{postfix} file in {dir_path}") elif len(files) > 1: raise FileExistsError(f"There are multiple *{postfix} files in {dir_path}") - return os.path.join(dir_path, files[0]) \ No newline at end of file + return os.path.join(dir_path, files[0]) From e1f234841cf763839c767ebf4d6e750c5391adb4 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Mon, 26 Aug 2019 11:00:45 +0800 Subject: [PATCH 107/153] mark the dataloader.__init__ as undocumented --- fastNLP/io/data_loader/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastNLP/io/data_loader/__init__.py b/fastNLP/io/data_loader/__init__.py index b3ca9021..8a9dd60b 100644 --- a/fastNLP/io/data_loader/__init__.py +++ b/fastNLP/io/data_loader/__init__.py @@ -1,4 +1,4 @@ -""" +"""undocumented .. 
warning:: 本模块在 `0.5.0版本` 中被废弃,由 :mod:`~fastNLP.io.loader` 和 :mod:`~fastNLP.io.pipe` 模块替代。 From ffd5fd813559cee2930f5d0d0274357fb151cc4c Mon Sep 17 00:00:00 2001 From: ChenXin Date: Mon, 26 Aug 2019 11:58:20 +0800 Subject: [PATCH 108/153] delete the old doc-tool --- docs/format.py | 68 -------------------------------------------------- 1 file changed, 68 deletions(-) delete mode 100644 docs/format.py diff --git a/docs/format.py b/docs/format.py deleted file mode 100644 index 67671ae7..00000000 --- a/docs/format.py +++ /dev/null @@ -1,68 +0,0 @@ -import os - - -def shorten(file, to_delete, cut=False): - if file.endswith("index.rst") or file.endswith("conf.py"): - return - res = [] - with open(file, "r") as fin: - lines = fin.readlines() - for line in lines: - if cut and line.rstrip() == "Submodules": - break - else: - res.append(line.rstrip()) - for i, line in enumerate(res): - if line.endswith(" package"): - res[i] = res[i][:-len(" package")] - res[i + 1] = res[i + 1][:-len(" package")] - elif line.endswith(" module"): - res[i] = res[i][:-len(" module")] - res[i + 1] = res[i + 1][:-len(" module")] - else: - for name in to_delete: - if line.endswith(name): - res[i] = "del" - - with open(file, "w") as fout: - for line in res: - if line != "del": - print(line, file=fout) - - -def clear(path='./source/'): - files = os.listdir(path) - to_delete = [ - "fastNLP.core.dist_trainer", - "fastNLP.core.predictor", - - "fastNLP.io.file_reader", - "fastNLP.io.config_io", - - "fastNLP.embeddings.contextual_embedding", - - "fastNLP.modules.dropout", - "fastNLP.models.base_model", - "fastNLP.models.bert", - "fastNLP.models.enas_utils", - "fastNLP.models.enas_controller", - "fastNLP.models.enas_model", - "fastNLP.models.enas_trainer", - ] - for file in files: - if not os.path.isdir(path + file): - res = file.split('.') - if len(res) > 4: - to_delete.append(file[:-4]) - elif len(res) == 4: - shorten(path + file, to_delete, True) - else: - shorten(path + file, to_delete) - for file in to_delete: - try: - os.remove(path + file + ".rst") - except: - pass - - -clear() From 78af3491a432cb10b36d9cf17b75c12e40146026 Mon Sep 17 00:00:00 2001 From: zide05 <845465009@qq.com> Date: Mon, 26 Aug 2019 14:03:40 +0800 Subject: [PATCH 109/153] =?UTF-8?q?=E4=BF=AE=E6=94=B9tutorial?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/source/tutorials/tutorial_4_loss_optimizer.rst | 7 +++++-- docs/source/tutorials/tutorial_5_datasetiter.rst | 5 ++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/docs/source/tutorials/tutorial_4_loss_optimizer.rst b/docs/source/tutorials/tutorial_4_loss_optimizer.rst index f863a7a8..a53ef89b 100644 --- a/docs/source/tutorials/tutorial_4_loss_optimizer.rst +++ b/docs/source/tutorials/tutorial_4_loss_optimizer.rst @@ -1,4 +1,4 @@ -============================================================================== +============================================================================== 动手实现一个文本分类器I-使用Trainer和Tester快速训练和测试 ============================================================================== @@ -19,7 +19,9 @@ loader = SSTLoader() #这里的all.txt是下载好数据后train.txt、dev.txt、test.txt的组合 - dataset = loader.load("./trainDevTestTrees_PTB/trees/all.txt") + #loader.load(path)会首先判断path是否为none,若是则自动从网站下载数据,若不是则读入数据并返回databundle + databundle_ = loader.load("./trainDevTestTrees_PTB/trees/all.txt") + dataset = databundle_.datasets['train'] print(dataset[0]) 输出数据如下:: @@ -31,6 +33,7 @@ 数据处理 + 可以使用事先定义的 :class:`~fastNLP.io.SSTPipe` 
类对数据进行基本预处理,这里我们手动进行处理。 我们使用 :class:`~fastNLP.DataSet` 类的 :meth:`~fastNLP.DataSet.apply` 方法将 ``target`` :mod:`~fastNLP.core.field` 转化为整数。 .. code-block:: python diff --git a/docs/source/tutorials/tutorial_5_datasetiter.rst b/docs/source/tutorials/tutorial_5_datasetiter.rst index e81b18dd..2ec753c3 100644 --- a/docs/source/tutorials/tutorial_5_datasetiter.rst +++ b/docs/source/tutorials/tutorial_5_datasetiter.rst @@ -20,7 +20,9 @@ loader = SSTLoader() #这里的all.txt是下载好数据后train.txt、dev.txt、test.txt的组合 - dataset = loader.load("./trainDevTestTrees_PTB/trees/all.txt") + #loader.load(path)会首先判断path是否为none,若是则自动从网站下载数据,若不是则读入数据并返回databundle + databundle_ = loader.load("./trainDevTestTrees_PTB/trees/all.txt") + dataset = databundle_.datasets['train'] print(dataset[0]) 输出数据如下:: @@ -32,6 +34,7 @@ 数据处理 + 可以使用事先定义的 :class:`~fastNLP.io.SSTPipe` 类对数据进行基本预处理,这里我们手动进行处理。 我们使用 :class:`~fastNLP.DataSet` 类的 :meth:`~fastNLP.DataSet.apply` 方法将 ``target`` :mod:`~fastNLP.core.field` 转化为整数。 .. code-block:: python From 53975c045a6841e38d4a7cfcc23abea6de0fe3f3 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Mon, 26 Aug 2019 14:58:36 +0800 Subject: [PATCH 110/153] update the doc-tool & fix an importing bug --- docs/count.py | 42 ++++++++++++++++++++++++++++++++++ fastNLP/modules/decoder/crf.py | 2 +- 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/docs/count.py b/docs/count.py index e1aad115..72868403 100644 --- a/docs/count.py +++ b/docs/count.py @@ -1,7 +1,28 @@ +import inspect import os import sys +def _colored_string(string: str, color: str or int) -> str: + """在终端中显示一串有颜色的文字 + :param string: 在终端中显示的文字 + :param color: 文字的颜色 + :return: + """ + if isinstance(color, str): + color = { + "black": 30, "Black": 30, "BLACK": 30, + "red": 31, "Red": 31, "RED": 31, + "green": 32, "Green": 32, "GREEN": 32, + "yellow": 33, "Yellow": 33, "YELLOW": 33, + "blue": 34, "Blue": 34, "BLUE": 34, + "purple": 35, "Purple": 35, "PURPLE": 35, + "cyan": 36, "Cyan": 36, "CYAN": 36, + "white": 37, "White": 37, "WHITE": 37 + }[color] + return "\033[%dm%s\033[0m" % (color, string) + + def find_all_modules(): modules = {} children = {} @@ -55,10 +76,31 @@ def create_rst_file(modules, name, children): fout.write(" " + module + "\n") +def check_file(m, name): + for item, obj in inspect.getmembers(m): + if inspect.isclass(obj) and obj.__module__ == name: + print(obj) + if inspect.isfunction(obj) and obj.__module__ == name: + print("FUNC", obj) + + +def check_files(modules): + for name in sorted(modules.keys()): + if name == 'fastNLP.core.utils': + check_file(modules[name], name) + + def main(): + print(_colored_string('Getting modules...', "Blue")) modules, to_doc, children = find_all_modules() + print(_colored_string('Done!', "Green")) + print(_colored_string('Creating rst files...', "Blue")) for name in to_doc: create_rst_file(modules, name, children) + print(_colored_string('Done!', "Green")) + print(_colored_string('Checking all files...', "Blue")) + check_files(modules) + print(_colored_string('Done!', "Green")) if __name__ == "__main__": diff --git a/fastNLP/modules/decoder/crf.py b/fastNLP/modules/decoder/crf.py index b47d0162..f63d46e3 100644 --- a/fastNLP/modules/decoder/crf.py +++ b/fastNLP/modules/decoder/crf.py @@ -9,7 +9,7 @@ import torch from torch import nn from ..utils import initial_parameter -from ...core import Vocabulary +from ...core.vocabulary import Vocabulary def allowed_transitions(id2target, encoding_type='bio', include_start_end=False): From 19bbaf11b6989a1a29384d5b1516bf934ccac296 Mon Sep 17 00:00:00 2001 
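The check_file helper added to docs/count.py above relies on inspect.getmembers plus the __module__ attribute to tell objects defined in a module apart from names that are merely re-exported. A self-contained sketch of the same idea (the names below are illustrative; the fastNLP doc tool prints its findings instead of returning them):

    import importlib
    import inspect

    def objects_defined_in(module_name: str):
        # Classes and functions whose home module is `module_name` itself.
        module = importlib.import_module(module_name)
        return [name for name, obj in inspect.getmembers(module)
                if (inspect.isclass(obj) or inspect.isfunction(obj))
                and obj.__module__ == module_name]

    # objects_defined_in('collections') includes 'Counter' and 'namedtuple',
    # but not names re-exported from other modules.
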
From: yh Date: Tue, 27 Aug 2019 01:54:15 +0800 Subject: [PATCH 111/153] =?UTF-8?q?=E4=BD=BF=E7=94=A8=E6=9B=B4pytorch?= =?UTF-8?q?=E7=9A=84=E6=96=B9=E5=BC=8F=E5=A4=84=E7=90=86embedding=E4=B8=AD?= =?UTF-8?q?=E7=9A=84parameter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/embeddings/bert_embedding.py | 2 +- fastNLP/embeddings/char_embedding.py | 14 ++++++-------- fastNLP/embeddings/elmo_embedding.py | 5 ++--- fastNLP/embeddings/static_embedding.py | 16 +++++++--------- 4 files changed, 16 insertions(+), 21 deletions(-) diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index 6a10c489..f3ef69dd 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -345,7 +345,7 @@ class _WordBertModel(nn.Module): self._wordpiece_pad_index = self.tokenzier.vocab['[PAD]'] # 需要用于生成word_piece print("Found(Or segment into word pieces) {} words out of {}.".format(found_count, len(vocab))) self.word_to_wordpieces = np.array(word_to_wordpieces) - self.word_pieces_lengths = nn.Parameter(torch.LongTensor(word_pieces_lengths), requires_grad=False) + self.register_buffer('word_pieces_lengths', torch.LongTensor(word_pieces_lengths)) print("Successfully generate word pieces.") def forward(self, words): diff --git a/fastNLP/embeddings/char_embedding.py b/fastNLP/embeddings/char_embedding.py index 520e85e6..ea0d4e93 100644 --- a/fastNLP/embeddings/char_embedding.py +++ b/fastNLP/embeddings/char_embedding.py @@ -82,10 +82,9 @@ class CNNCharEmbedding(TokenEmbedding): print(f"In total, there are {len(self.char_vocab)} distinct characters.") # 对vocab进行index max_word_len = max(map(lambda x: len(x[0]), vocab)) - self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab), max_word_len), - fill_value=self.char_pad_index, dtype=torch.long), - requires_grad=False) - self.word_lengths = nn.Parameter(torch.zeros(len(vocab)).long(), requires_grad=False) + self.register_buffer('words_to_chars_embedding', torch.full((len(vocab), max_word_len), + fill_value=self.char_pad_index, dtype=torch.long)) + self.register_buffer('word_lengths', torch.zeros(len(vocab)).long()) for word, index in vocab: # if index!=vocab.padding_idx: # 如果是pad的话,直接就为pad_value了。修改为不区分pad, 这样所有的也是同一个embed self.words_to_chars_embedding[index, :len(word)] = \ @@ -235,10 +234,9 @@ class LSTMCharEmbedding(TokenEmbedding): print(f"In total, there are {len(self.char_vocab)} distinct characters.") # 对vocab进行index self.max_word_len = max(map(lambda x: len(x[0]), vocab)) - self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab), self.max_word_len), - fill_value=self.char_pad_index, dtype=torch.long), - requires_grad=False) - self.word_lengths = nn.Parameter(torch.zeros(len(vocab)).long(), requires_grad=False) + self.register_buffer('words_to_chars_embedding', torch.full((len(vocab), self.max_word_len), + fill_value=self.char_pad_index, dtype=torch.long)) + self.register_buffer('word_lengths', torch.zeros(len(vocab)).long()) for word, index in vocab: # if index!=vocab.padding_idx: # 如果是pad的话,直接就为pad_value了. 
修改为不区分pad与否 self.words_to_chars_embedding[index, :len(word)] = \ diff --git a/fastNLP/embeddings/elmo_embedding.py b/fastNLP/embeddings/elmo_embedding.py index 24cd052e..80178d21 100644 --- a/fastNLP/embeddings/elmo_embedding.py +++ b/fastNLP/embeddings/elmo_embedding.py @@ -240,10 +240,9 @@ class _ElmoModel(nn.Module): # 生成words到chars的映射 max_chars = config['char_cnn']['max_characters_per_token'] - self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab) + 2, max_chars), + self.register_buffer('words_to_chars_embedding', torch.full((len(vocab) + 2, max_chars), fill_value=len(char_vocab), - dtype=torch.long), - requires_grad=False) + dtype=torch.long)) for word, index in list(iter(vocab)) + [(BOS_TAG, len(vocab)), (EOS_TAG, len(vocab) + 1)]: if len(word) + 2 > max_chars: word = word[:max_chars - 2] diff --git a/fastNLP/embeddings/static_embedding.py b/fastNLP/embeddings/static_embedding.py index a75ad18f..b0141682 100644 --- a/fastNLP/embeddings/static_embedding.py +++ b/fastNLP/embeddings/static_embedding.py @@ -121,28 +121,27 @@ class StaticEmbedding(TokenEmbedding): embedding = self._load_with_vocab(model_path, vocab=lowered_vocab, init_method=init_method) else: embedding = self._randomly_init_embed(len(vocab), embedding_dim, init_method) - self.words_to_words = nn.Parameter(torch.arange(len(vocab)).long(), requires_grad=False) + self.register_buffer('words_to_words', torch.arange(len(vocab)).long()) if lowered_vocab.unknown: unknown_idx = lowered_vocab.unknown_idx else: unknown_idx = embedding.size(0) - 1 # 否则是最后一个为unknow - self.words_to_words = nn.Parameter(torch.arange(len(vocab)).long(), requires_grad=False) - words_to_words = nn.Parameter(torch.full((len(vocab),), fill_value=unknown_idx).long(), - requires_grad=False) + self.register_buffer('words_to_words', torch.arange(len(vocab)).long()) + words_to_words = torch.full((len(vocab),), fill_value=unknown_idx).long() for word, index in vocab: if word not in lowered_vocab: word = word.lower() if word not in lowered_vocab and lowered_vocab._is_word_no_create_entry(word): continue # 如果不需要创建entry,已经默认unknown了 words_to_words[index] = self.words_to_words[lowered_vocab.to_index(word)] - self.words_to_words = words_to_words + self.register_buffer('words_to_words', words_to_words) self._word_unk_index = lowered_vocab.unknown_idx # 替换一下unknown的index else: if model_path: embedding = self._load_with_vocab(model_path, vocab=vocab, init_method=init_method) else: embedding = self._randomly_init_embed(len(vocab), embedding_dim, init_method) - self.words_to_words = nn.Parameter(torch.arange(len(vocab)).long(), requires_grad=False) + self.register_buffer('words_to_words', torch.arange(len(vocab)).long()) if not self.only_norm_found_vector and normalize: embedding /= (torch.norm(embedding, dim=1, keepdim=True) + 1e-12) @@ -151,7 +150,7 @@ class StaticEmbedding(TokenEmbedding): index_in_truncated_vocab = truncated_words_to_words[i] truncated_words_to_words[i] = self.words_to_words[index_in_truncated_vocab] del self.words_to_words - self.words_to_words = nn.Parameter(truncated_words_to_words, requires_grad=False) + self.register_buffer('words_to_words', truncated_words_to_words) self.embedding = nn.Embedding(num_embeddings=embedding.shape[0], embedding_dim=embedding.shape[1], padding_idx=vocab.padding_idx, @@ -273,8 +272,7 @@ class StaticEmbedding(TokenEmbedding): vectors = torch.cat((vectors, torch.zeros(1, dim)), dim=0).contiguous() else: unknown_idx = vocab.unknown_idx - self.words_to_words = nn.Parameter(torch.full((len(vocab), ), 
fill_value=unknown_idx).long(), - requires_grad=False) + self.register_buffer('words_to_words', torch.full((len(vocab), ), fill_value=unknown_idx).long()) for index, (index_in_vocab, vec) in enumerate(matrix.items()): if vec is not None: From 04737a105d1d57c334ebb664cac64d4331a8593a Mon Sep 17 00:00:00 2001 From: ChenXin Date: Tue, 27 Aug 2019 20:46:05 +0800 Subject: [PATCH 112/153] update the doc-tool to show __init__ and class doc separately --- docs/count.py | 7 ++++--- docs/source/conf.py | 6 ++++-- docs/source/fastNLP.core.rst | 3 +-- docs/source/fastNLP.embeddings.rst | 1 + docs/source/fastNLP.io.rst | 1 + docs/source/fastNLP.models.biaffine_parser.rst | 1 - docs/source/fastNLP.models.cnn_text_classification.rst | 1 - docs/source/fastNLP.models.rst | 2 +- docs/source/fastNLP.models.sequence_labeling.rst | 1 - docs/source/fastNLP.models.snli.rst | 1 - docs/source/fastNLP.models.star_transformer.rst | 1 - docs/source/fastNLP.modules.decoder.rst | 1 - docs/source/fastNLP.modules.encoder.rst | 1 - docs/source/fastNLP.modules.rst | 2 +- docs/source/fastNLP.modules.utils.rst | 1 - docs/source/fastNLP.rst | 1 + 16 files changed, 14 insertions(+), 17 deletions(-) diff --git a/docs/count.py b/docs/count.py index 72868403..c75173ef 100644 --- a/docs/count.py +++ b/docs/count.py @@ -66,12 +66,13 @@ def create_rst_file(modules, name, children): fout.write(t + "\n") fout.write("\n") fout.write(".. automodule:: " + name + "\n") - if len(m.__all__) > 0: + if name != "fastNLP.core" and len(m.__all__) > 0: fout.write(" :members: " + ", ".join(m.__all__) + "\n") - fout.write(" :inherited-members:\n") + if not (name.startswith('fastNLP.models') or name.startswith('fastNLP.modules')): + fout.write(" :inherited-members:\n") fout.write("\n") if name in children: - fout.write("子模块\n------\n\n.. toctree::\n\n") + fout.write("子模块\n------\n\n.. toctree::\n :maxdepth: 1\n\n") for module in children[name]: fout.write(" " + module + "\n") diff --git a/docs/source/conf.py b/docs/source/conf.py index 83cb7185..7536ee32 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -168,10 +168,12 @@ texinfo_documents = [ # -- Extension configuration ------------------------------------------------- def maybe_skip_member(app, what, name, obj, skip, options): - if name.startswith("_"): - return True if obj.__doc__ is None: return True + if name == "__init__": + return False + if name.startswith("_"): + return True return False diff --git a/docs/source/fastNLP.core.rst b/docs/source/fastNLP.core.rst index 56de46e9..15fe29d5 100644 --- a/docs/source/fastNLP.core.rst +++ b/docs/source/fastNLP.core.rst @@ -2,13 +2,12 @@ fastNLP.core ============ .. automodule:: fastNLP.core - :members: DataSet, Instance, FieldArray, Padder, AutoPadder, EngChar2DPadder, Vocabulary, DataSetIter, BatchIter, TorchLoaderIter, Const, Tester, Trainer, cache_results, seq_len_to_mask, get_seq_len, logger, Callback, GradientClipCallback, EarlyStopCallback, FitlogCallback, EvaluateCallback, LRScheduler, ControlC, LRFinder, TensorboardCallback, WarmupCallback, SaveModelCallback, EchoCallback, TesterCallback, CallbackException, EarlyStopError, LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, LossInForward, AccuracyMetric, SpanFPreRecMetric, ExtractiveQAMetric, Optimizer, SGD, Adam, AdamW, SequentialSampler, BucketSampler, RandomSampler, Sampler - :inherited-members: 子模块 ------ .. 
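The patch above swaps frozen nn.Parameter(..., requires_grad=False) tensors for register_buffer calls throughout the embedding modules. The practical difference, shown on an illustrative toy module (not a fastNLP class): a buffer is saved in state_dict() and follows .to(device), yet never appears in .parameters(), so optimizers and gradient clipping cannot touch it.

    import torch
    import torch.nn as nn

    class IndexRemap(nn.Module):
        def __init__(self, num_words: int):
            super().__init__()
            # Fixed lookup table: persistent state, but not a learnable parameter.
            self.register_buffer('words_to_words', torch.arange(num_words).long())

        def forward(self, words: torch.Tensor) -> torch.Tensor:
            return self.words_to_words[words]

    m = IndexRemap(10)
    assert list(m.parameters()) == []              # the optimizer never sees it
    assert 'words_to_words' in m.state_dict()      # still saved and loaded
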
toctree:: + :maxdepth: 1 fastNLP.core.batch fastNLP.core.callback diff --git a/docs/source/fastNLP.embeddings.rst b/docs/source/fastNLP.embeddings.rst index 8376408c..b9e6a853 100644 --- a/docs/source/fastNLP.embeddings.rst +++ b/docs/source/fastNLP.embeddings.rst @@ -9,6 +9,7 @@ fastNLP.embeddings ------ .. toctree:: + :maxdepth: 1 fastNLP.embeddings.bert_embedding fastNLP.embeddings.char_embedding diff --git a/docs/source/fastNLP.io.rst b/docs/source/fastNLP.io.rst index 2aacb883..96df9d6c 100644 --- a/docs/source/fastNLP.io.rst +++ b/docs/source/fastNLP.io.rst @@ -9,6 +9,7 @@ fastNLP.io ------ .. toctree:: + :maxdepth: 1 fastNLP.io.data_bundle fastNLP.io.embed_loader diff --git a/docs/source/fastNLP.models.biaffine_parser.rst b/docs/source/fastNLP.models.biaffine_parser.rst index c3dbb0a5..395638fe 100644 --- a/docs/source/fastNLP.models.biaffine_parser.rst +++ b/docs/source/fastNLP.models.biaffine_parser.rst @@ -3,5 +3,4 @@ fastNLP.models.biaffine_parser .. automodule:: fastNLP.models.biaffine_parser :members: BiaffineParser, GraphParser - :inherited-members: diff --git a/docs/source/fastNLP.models.cnn_text_classification.rst b/docs/source/fastNLP.models.cnn_text_classification.rst index fe4bb157..e9ed7ee1 100644 --- a/docs/source/fastNLP.models.cnn_text_classification.rst +++ b/docs/source/fastNLP.models.cnn_text_classification.rst @@ -3,5 +3,4 @@ fastNLP.models.cnn_text_classification .. automodule:: fastNLP.models.cnn_text_classification :members: CNNText - :inherited-members: diff --git a/docs/source/fastNLP.models.rst b/docs/source/fastNLP.models.rst index 88854a79..fb782de1 100644 --- a/docs/source/fastNLP.models.rst +++ b/docs/source/fastNLP.models.rst @@ -3,12 +3,12 @@ fastNLP.models .. automodule:: fastNLP.models :members: CNNText, SeqLabeling, AdvSeqLabel, ESIM, StarTransEnc, STSeqLabel, STNLICls, STSeqCls, BiaffineParser, GraphParser - :inherited-members: 子模块 ------ .. toctree:: + :maxdepth: 1 fastNLP.models.biaffine_parser fastNLP.models.cnn_text_classification diff --git a/docs/source/fastNLP.models.sequence_labeling.rst b/docs/source/fastNLP.models.sequence_labeling.rst index b66e637e..f6551f8b 100644 --- a/docs/source/fastNLP.models.sequence_labeling.rst +++ b/docs/source/fastNLP.models.sequence_labeling.rst @@ -3,5 +3,4 @@ fastNLP.models.sequence_labeling .. automodule:: fastNLP.models.sequence_labeling :members: SeqLabeling, AdvSeqLabel - :inherited-members: diff --git a/docs/source/fastNLP.models.snli.rst b/docs/source/fastNLP.models.snli.rst index 8551051a..eed02139 100644 --- a/docs/source/fastNLP.models.snli.rst +++ b/docs/source/fastNLP.models.snli.rst @@ -3,5 +3,4 @@ fastNLP.models.snli .. automodule:: fastNLP.models.snli :members: ESIM - :inherited-members: diff --git a/docs/source/fastNLP.models.star_transformer.rst b/docs/source/fastNLP.models.star_transformer.rst index f4b5989e..80ab5b33 100644 --- a/docs/source/fastNLP.models.star_transformer.rst +++ b/docs/source/fastNLP.models.star_transformer.rst @@ -3,5 +3,4 @@ fastNLP.models.star_transformer .. automodule:: fastNLP.models.star_transformer :members: StarTransEnc, STNLICls, STSeqCls, STSeqLabel - :inherited-members: diff --git a/docs/source/fastNLP.modules.decoder.rst b/docs/source/fastNLP.modules.decoder.rst index b121f9e9..de6e0d9d 100644 --- a/docs/source/fastNLP.modules.decoder.rst +++ b/docs/source/fastNLP.modules.decoder.rst @@ -3,5 +3,4 @@ fastNLP.modules.decoder .. 
automodule:: fastNLP.modules.decoder :members: MLP, ConditionalRandomField, viterbi_decode, allowed_transitions - :inherited-members: diff --git a/docs/source/fastNLP.modules.encoder.rst b/docs/source/fastNLP.modules.encoder.rst index 6b44a192..fceabbdb 100644 --- a/docs/source/fastNLP.modules.encoder.rst +++ b/docs/source/fastNLP.modules.encoder.rst @@ -3,5 +3,4 @@ fastNLP.modules.encoder .. automodule:: fastNLP.modules.encoder :members: ConvolutionCharEncoder, LSTMCharEncoder, ConvMaxpool, LSTM, StarTransformer, TransformerEncoder, VarRNN, VarLSTM, VarGRU, MaxPool, MaxPoolWithMask, AvgPool, AvgPoolWithMask, MultiHeadAttention - :inherited-members: diff --git a/docs/source/fastNLP.modules.rst b/docs/source/fastNLP.modules.rst index 6134d0dd..b7c259ed 100644 --- a/docs/source/fastNLP.modules.rst +++ b/docs/source/fastNLP.modules.rst @@ -3,12 +3,12 @@ fastNLP.modules .. automodule:: fastNLP.modules :members: ConvolutionCharEncoder, LSTMCharEncoder, ConvMaxpool, LSTM, StarTransformer, TransformerEncoder, VarRNN, VarLSTM, VarGRU, MaxPool, MaxPoolWithMask, AvgPool, AvgPoolWithMask, MultiHeadAttention, MLP, ConditionalRandomField, viterbi_decode, allowed_transitions, TimestepDropout - :inherited-members: 子模块 ------ .. toctree:: + :maxdepth: 1 fastNLP.modules.decoder fastNLP.modules.encoder diff --git a/docs/source/fastNLP.modules.utils.rst b/docs/source/fastNLP.modules.utils.rst index e28ca35a..101a0f45 100644 --- a/docs/source/fastNLP.modules.utils.rst +++ b/docs/source/fastNLP.modules.utils.rst @@ -3,5 +3,4 @@ fastNLP.modules.utils .. automodule:: fastNLP.modules.utils :members: initial_parameter, summary - :inherited-members: diff --git a/docs/source/fastNLP.rst b/docs/source/fastNLP.rst index f22ea936..e01817f7 100644 --- a/docs/source/fastNLP.rst +++ b/docs/source/fastNLP.rst @@ -9,6 +9,7 @@ fastNLP ------ .. toctree:: + :maxdepth: 1 fastNLP.core fastNLP.embeddings From 169f519ffb0133b5f553d04c17c9f2cac0edebcb Mon Sep 17 00:00:00 2001 From: ChenXin Date: Tue, 27 Aug 2019 21:07:22 +0800 Subject: [PATCH 113/153] ignore the methods inherited from torch.nn.Embedding --- docs/count.py | 3 ++- docs/source/fastNLP.embeddings.bert_embedding.rst | 1 - docs/source/fastNLP.embeddings.char_embedding.rst | 1 - docs/source/fastNLP.embeddings.contextual_embedding.rst | 1 - docs/source/fastNLP.embeddings.elmo_embedding.rst | 1 - docs/source/fastNLP.embeddings.embedding.rst | 1 - docs/source/fastNLP.embeddings.rst | 1 - docs/source/fastNLP.embeddings.stack_embedding.rst | 1 - docs/source/fastNLP.embeddings.static_embedding.rst | 1 - docs/source/fastNLP.embeddings.utils.rst | 1 - docs/source/fastNLP.io.dataset_loader.rst | 6 ------ 11 files changed, 2 insertions(+), 16 deletions(-) delete mode 100644 docs/source/fastNLP.io.dataset_loader.rst diff --git a/docs/count.py b/docs/count.py index c75173ef..6a5d256b 100644 --- a/docs/count.py +++ b/docs/count.py @@ -68,7 +68,8 @@ def create_rst_file(modules, name, children): fout.write(".. 
automodule:: " + name + "\n") if name != "fastNLP.core" and len(m.__all__) > 0: fout.write(" :members: " + ", ".join(m.__all__) + "\n") - if not (name.startswith('fastNLP.models') or name.startswith('fastNLP.modules')): + short = name[len("fastNLP."):] + if not (short.startswith('models') or short.startswith('modules') or short.startswith('embeddings')): fout.write(" :inherited-members:\n") fout.write("\n") if name in children: diff --git a/docs/source/fastNLP.embeddings.bert_embedding.rst b/docs/source/fastNLP.embeddings.bert_embedding.rst index 51828cb0..1b59dc35 100644 --- a/docs/source/fastNLP.embeddings.bert_embedding.rst +++ b/docs/source/fastNLP.embeddings.bert_embedding.rst @@ -3,5 +3,4 @@ fastNLP.embeddings.bert_embedding .. automodule:: fastNLP.embeddings.bert_embedding :members: BertEmbedding, BertWordPieceEncoder - :inherited-members: diff --git a/docs/source/fastNLP.embeddings.char_embedding.rst b/docs/source/fastNLP.embeddings.char_embedding.rst index a9b129d8..bc8d64f9 100644 --- a/docs/source/fastNLP.embeddings.char_embedding.rst +++ b/docs/source/fastNLP.embeddings.char_embedding.rst @@ -3,5 +3,4 @@ fastNLP.embeddings.char_embedding .. automodule:: fastNLP.embeddings.char_embedding :members: CNNCharEmbedding, LSTMCharEmbedding - :inherited-members: diff --git a/docs/source/fastNLP.embeddings.contextual_embedding.rst b/docs/source/fastNLP.embeddings.contextual_embedding.rst index ee64c7a0..74e5f5be 100644 --- a/docs/source/fastNLP.embeddings.contextual_embedding.rst +++ b/docs/source/fastNLP.embeddings.contextual_embedding.rst @@ -3,5 +3,4 @@ fastNLP.embeddings.contextual_embedding .. automodule:: fastNLP.embeddings.contextual_embedding :members: ContextualEmbedding - :inherited-members: diff --git a/docs/source/fastNLP.embeddings.elmo_embedding.rst b/docs/source/fastNLP.embeddings.elmo_embedding.rst index 06cc13af..b8c6d41c 100644 --- a/docs/source/fastNLP.embeddings.elmo_embedding.rst +++ b/docs/source/fastNLP.embeddings.elmo_embedding.rst @@ -3,5 +3,4 @@ fastNLP.embeddings.elmo_embedding .. automodule:: fastNLP.embeddings.elmo_embedding :members: ElmoEmbedding - :inherited-members: diff --git a/docs/source/fastNLP.embeddings.embedding.rst b/docs/source/fastNLP.embeddings.embedding.rst index 4d5fcf46..6793446b 100644 --- a/docs/source/fastNLP.embeddings.embedding.rst +++ b/docs/source/fastNLP.embeddings.embedding.rst @@ -3,5 +3,4 @@ fastNLP.embeddings.embedding .. automodule:: fastNLP.embeddings.embedding :members: Embedding, TokenEmbedding - :inherited-members: diff --git a/docs/source/fastNLP.embeddings.rst b/docs/source/fastNLP.embeddings.rst index b9e6a853..f4f4a3e0 100644 --- a/docs/source/fastNLP.embeddings.rst +++ b/docs/source/fastNLP.embeddings.rst @@ -3,7 +3,6 @@ fastNLP.embeddings .. automodule:: fastNLP.embeddings :members: Embedding, TokenEmbedding, StaticEmbedding, ElmoEmbedding, BertEmbedding, BertWordPieceEncoder, StackEmbedding, LSTMCharEmbedding, CNNCharEmbedding, get_embeddings - :inherited-members: 子模块 ------ diff --git a/docs/source/fastNLP.embeddings.stack_embedding.rst b/docs/source/fastNLP.embeddings.stack_embedding.rst index 6af91623..a07d1ef5 100644 --- a/docs/source/fastNLP.embeddings.stack_embedding.rst +++ b/docs/source/fastNLP.embeddings.stack_embedding.rst @@ -3,5 +3,4 @@ fastNLP.embeddings.stack_embedding .. 
automodule:: fastNLP.embeddings.stack_embedding :members: StackEmbedding - :inherited-members: diff --git a/docs/source/fastNLP.embeddings.static_embedding.rst b/docs/source/fastNLP.embeddings.static_embedding.rst index 2df1c329..219ce0e5 100644 --- a/docs/source/fastNLP.embeddings.static_embedding.rst +++ b/docs/source/fastNLP.embeddings.static_embedding.rst @@ -3,5 +3,4 @@ fastNLP.embeddings.static_embedding .. automodule:: fastNLP.embeddings.static_embedding :members: StaticEmbedding - :inherited-members: diff --git a/docs/source/fastNLP.embeddings.utils.rst b/docs/source/fastNLP.embeddings.utils.rst index 13e5936b..077487c1 100644 --- a/docs/source/fastNLP.embeddings.utils.rst +++ b/docs/source/fastNLP.embeddings.utils.rst @@ -3,5 +3,4 @@ fastNLP.embeddings.utils .. automodule:: fastNLP.embeddings.utils :members: get_embeddings - :inherited-members: diff --git a/docs/source/fastNLP.io.dataset_loader.rst b/docs/source/fastNLP.io.dataset_loader.rst deleted file mode 100644 index c211ecf9..00000000 --- a/docs/source/fastNLP.io.dataset_loader.rst +++ /dev/null @@ -1,6 +0,0 @@ -fastNLP.io.dataset_loader -========================= - -.. automodule:: fastNLP.io.dataset_loader - :members: CSVLoader, JsonLoader - From fbbb2fcd8e6526143cd789f9bb7e370d966ac4c4 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Tue, 27 Aug 2019 21:33:18 +0800 Subject: [PATCH 114/153] fix some bugs in docs --- fastNLP/core/callback.py | 21 ++++++++++++--------- fastNLP/io/data_bundle.py | 4 ++-- fastNLP/io/pipe/conll.py | 4 ++-- fastNLP/io/pipe/matching.py | 4 ++-- fastNLP/io/pipe/pipe.py | 2 +- fastNLP/io/pipe/utils.py | 4 ++-- 6 files changed, 21 insertions(+), 18 deletions(-) diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py index 2c130061..dde9a31a 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -70,10 +70,11 @@ __all__ = [ ] import os +import sys +from copy import deepcopy import torch -from copy import deepcopy -import sys + from .utils import _save_model try: @@ -928,13 +929,15 @@ class WarmupCallback(Callback): class SaveModelCallback(Callback): """ 由于Trainer在训练过程中只会保存最佳的模型, 该callback可实现多种方式的结果存储。 - 会根据训练开始的时间戳在save_dir下建立文件夹,再在文件夹下存放多个模型 - -save_dir - -2019-07-03-15-06-36 - -epoch:0_step:20_{metric_key}:{evaluate_performance}.pt # metric是给定的metric_key, evaluate_performance是性能 - -epoch:1_step:40_{metric_key}:{evaluate_performance}.pt - -2019-07-03-15-10-00 - -epoch:0_step:20_{metric_key}:{evaluate_performance}.pt # metric是给定的metric_key, evaluate_perfomance是性能 + 会根据训练开始的时间戳在save_dir下建立文件夹,再在文件夹下存放多个模型:: + + -save_dir + -2019-07-03-15-06-36 + -epoch:0_step:20_{metric_key}:{evaluate_performance}.pt # metric是给定的metric_key, evaluate_performance是性能 + -epoch:1_step:40_{metric_key}:{evaluate_performance}.pt + -2019-07-03-15-10-00 + -epoch:0_step:20_{metric_key}:{evaluate_performance}.pt # metric是给定的metric_key, evaluate_perfomance是性能 + :param str save_dir: 将模型存放在哪个目录下,会在该目录下创建以时间戳命名的目录,并存放模型 :param int top: 保存dev表现top多少模型。-1为保存所有模型。 :param bool only_param: 是否只保存模型d饿权重。 diff --git a/fastNLP/io/data_bundle.py b/fastNLP/io/data_bundle.py index db60a86f..10f924f0 100644 --- a/fastNLP/io/data_bundle.py +++ b/fastNLP/io/data_bundle.py @@ -204,7 +204,7 @@ class DataBundle: 行的数据进行类型和维度推断本列的数据的类型和维度。 :param bool ignore_miss_dataset: 当某个field名称在某个dataset不存在时,如果为True,则直接忽略该DataSet; 如果为False,则报错 - :return self + :return: self """ for field_name in field_names: for name, dataset in self.datasets.items(): @@ -229,7 +229,7 @@ class DataBundle: 行的数据进行类型和维度推断本列的数据的类型和维度。 :param bool 
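Since the reformatted SaveModelCallback docstring above now spells out the checkpoint directory layout, a short usage sketch may help; the argument names follow that docstring, and the trainer call is abbreviated rather than complete:

    from fastNLP.core.callback import SaveModelCallback

    # Keep only the three best checkpoints under ./checkpoints/<timestamp>/
    callback = SaveModelCallback(save_dir='./checkpoints', top=3, only_param=True)
    # trainer = Trainer(..., callbacks=[callback]); trainer.train()
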
ignore_miss_dataset: 当某个field名称在某个dataset不存在时,如果为True,则直接忽略该DataSet; 如果为False,则报错 - :return self + :return: self """ for field_name in field_names: for name, dataset in self.datasets.items(): diff --git a/fastNLP/io/pipe/conll.py b/fastNLP/io/pipe/conll.py index 2efec8e0..eb7d4909 100644 --- a/fastNLP/io/pipe/conll.py +++ b/fastNLP/io/pipe/conll.py @@ -51,7 +51,7 @@ class _NERPipe(Pipe): "[AL-AIN, United, Arab, ...]", "[B-LOC, B-LOC, I-LOC, ...]" "[...]", "[...]" - :param DataBundle data_bundle: 传入的DataBundle中的DataSet必须包含raw_words和ner两个field,且两个field的内容均为List[str]。 + :param ~fastNLP.DataBundle data_bundle: 传入的DataBundle中的DataSet必须包含raw_words和ner两个field,且两个field的内容均为List[str]。 在传入DataBundle基础上原位修改。 :return: DataBundle """ @@ -244,7 +244,7 @@ class _CNNERPipe(Pipe): raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。 - :param DataBundle data_bundle: 传入的DataBundle中的DataSet必须包含raw_words和ner两个field,且两个field的内容均为List[str]。 + :param ~fastNLP.DataBundle data_bundle: 传入的DataBundle中的DataSet必须包含raw_words和ner两个field,且两个field的内容均为List[str]。 在传入DataBundle基础上原位修改。 :return: DataBundle """ diff --git a/fastNLP/io/pipe/matching.py b/fastNLP/io/pipe/matching.py index 699438c8..747e7b44 100644 --- a/fastNLP/io/pipe/matching.py +++ b/fastNLP/io/pipe/matching.py @@ -177,7 +177,7 @@ class MatchingPipe(Pipe): def _tokenize(self, data_bundle, field_names, new_field_names): """ - :param DataBundle data_bundle: DataBundle. + :param ~fastNLP.DataBundle data_bundle: DataBundle. :param list field_names: List[str], 需要tokenize的field名称 :param list new_field_names: List[str], tokenize之后field的名称,与field_names一一对应。 :return: 输入的DataBundle对象 @@ -199,7 +199,7 @@ class MatchingPipe(Pipe): "This site includes a...", "The Government Executive...", "not_entailment" "...", "..." 
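The `:param ~fastNLP.DataBundle data_bundle:` fixes in this patch all document the same contract: a Pipe's process method takes a DataBundle, modifies every DataSet it holds in place, and returns the same bundle. An illustrative subclass (not shipped with fastNLP, and assuming the raw_words field holds untokenized strings) that follows this contract:

    from fastNLP.io.pipe.pipe import Pipe

    class LowercasePipe(Pipe):
        def process(self, data_bundle):
            # Edit each DataSet in place, then hand the same bundle back.
            for name, dataset in data_bundle.datasets.items():
                dataset.apply_field(lambda raw: raw.lower(),
                                    field_name='raw_words', new_field_name='raw_words')
            return data_bundle
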
- :param data_bundle: 通过loader读取得到的data_bundle,里面包含了数据集的原始数据内容 + :param ~fastNLP.DataBundle data_bundle: 通过loader读取得到的data_bundle,里面包含了数据集的原始数据内容 :return: data_bundle """ data_bundle = self._tokenize(data_bundle, [Const.RAW_WORDS(0), Const.RAW_WORDS(1)], diff --git a/fastNLP/io/pipe/pipe.py b/fastNLP/io/pipe/pipe.py index a1435fd3..12d9c1cb 100644 --- a/fastNLP/io/pipe/pipe.py +++ b/fastNLP/io/pipe/pipe.py @@ -15,7 +15,7 @@ class Pipe: """ 对输入的DataBundle进行处理,然后返回该DataBundle。 - :param data_bundle: 需要处理的DataBundle对象 + :param ~fastNLP.DataBundle data_bundle: 需要处理的DataBundle对象 :return: """ raise NotImplementedError diff --git a/fastNLP/io/pipe/utils.py b/fastNLP/io/pipe/utils.py index f32f58b7..ea7e0aa8 100644 --- a/fastNLP/io/pipe/utils.py +++ b/fastNLP/io/pipe/utils.py @@ -92,7 +92,7 @@ def _indexize(data_bundle, input_field_names=Const.INPUT, target_field_names=Con """ 在dataset中的field_name列建立词表,Const.TARGET列建立词表,并把词表加入到data_bundle中。 - :param data_bundle: + :param ~fastNLP.DataBundle data_bundle: :param: str,list input_field_names: :param: str,list target_field_names: 这一列的vocabulary没有unknown和padding :return: @@ -154,7 +154,7 @@ def _drop_empty_instance(data_bundle, field_name): """ 删除data_bundle的DataSet中存在的某个field为空的情况 - :param data_bundle: DataBundle + :param ~fastNLP.DataBundle data_bundle: :param str field_name: 对哪个field进行检查,如果为None,则任意field为空都会删掉 :return: 传入的DataBundle """ From 6201f661789e36c4e1e116846cc84d586aca2abd Mon Sep 17 00:00:00 2001 From: yh_cc Date: Wed, 28 Aug 2019 22:56:02 +0800 Subject: [PATCH 115/153] =?UTF-8?q?Trainer=E4=B8=AD=E4=BF=9D=E5=AD=98?= =?UTF-8?q?=E6=9C=80=E4=BD=B3=E6=A8=A1=E5=9E=8B=E5=AD=98=E5=9C=A8bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 290a89c1..61969c2e 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -718,7 +718,7 @@ class Trainer(object): self._save_model(self.model, "best_" + "_".join([self.model.__class__.__name__, self.metric_key, self.start_time])) elif self._load_best_model: - self._best_model_states = {name: param.cpu().clone() for name, param in self.model.named_parameters()} + self._best_model_states = {name: param.cpu().clone() for name, param in self.model.state_dict()} self.best_dev_perf = res self.best_dev_epoch = epoch self.best_dev_step = step From a46b8f129b88ef5b53692f18cf609ceeb31e48c0 Mon Sep 17 00:00:00 2001 From: yh_cc Date: Wed, 28 Aug 2019 23:06:13 +0800 Subject: [PATCH 116/153] =?UTF-8?q?Trainer=E4=B8=AD=E4=BF=9D=E5=AD=98?= =?UTF-8?q?=E6=9C=80=E4=BD=B3=E6=A8=A1=E5=9E=8B=E5=AD=98=E5=9C=A8bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 61969c2e..a47f108b 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -718,7 +718,7 @@ class Trainer(object): self._save_model(self.model, "best_" + "_".join([self.model.__class__.__name__, self.metric_key, self.start_time])) elif self._load_best_model: - self._best_model_states = {name: param.cpu().clone() for name, param in self.model.state_dict()} + self._best_model_states = {name: param.cpu().clone() for name, param in self.model.state_dict().items()} self.best_dev_perf = res self.best_dev_epoch = epoch self.best_dev_step = step From 
55e736bf4c9020ce404400b605d1c2febd8d0766 Mon Sep 17 00:00:00 2001 From: yh_cc Date: Wed, 28 Aug 2019 23:53:20 +0800 Subject: [PATCH 117/153] =?UTF-8?q?SpanFMetric=E5=A2=9E=E5=8A=A0=E5=AF=B9e?= =?UTF-8?q?ncoding=5Ftype=E5=92=8Ctag=5Fvocab=E7=9A=84=E6=A3=80=E6=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/metrics.py | 26 ++++++++++++++++++++++++++ test/core/test_metrics.py | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index 1d1e3819..28d88fbc 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -23,6 +23,7 @@ from .utils import _get_func_signature from .utils import seq_len_to_mask from .vocabulary import Vocabulary from abc import abstractmethod +import warnings class MetricBase(object): @@ -492,6 +493,30 @@ def _bio_tag_to_spans(tags, ignore_labels=None): return [(span[0], (span[1][0], span[1][1] + 1)) for span in spans if span[0] not in ignore_labels] +def _check_tag_vocab_and_encoding_type(vocab:Vocabulary, encoding_type:str): + """ + 检查vocab中的tag是否与encoding_type是匹配的 + + :param vocab: target的Vocabulary + :param encoding_type: bio, bmes, bioes, bmeso + :return: + """ + tag_set = set() + for tag, idx in vocab: + if idx in (vocab.unknown_idx, vocab.padding_idx): + continue + tag = tag[:1] + tag_set.add(tag) + tags = encoding_type + for tag in tag_set: + assert tag in tags, f"{tag} is not a valid tag in encoding type:{encoding_type}. Please check your " \ + f"encoding_type." + tags = tags.replace(tag, '') # 删除该值 + if tags: # 如果不为空,说明出现了未使用的tag + warnings.warn(f"Tag:{tags} in encoding type:{encoding_type} is not presented in your Vocabulary. Check your " + "encoding_type.") + + class SpanFPreRecMetric(MetricBase): r""" 别名::class:`fastNLP.SpanFPreRecMetric` :class:`fastNLP.core.metrics.SpanFPreRecMetric` @@ -546,6 +571,7 @@ class SpanFPreRecMetric(MetricBase): raise ValueError("f_type only supports `micro` or `macro`', got {}.".format(f_type)) self.encoding_type = encoding_type + _check_tag_vocab_and_encoding_type(tag_vocab, encoding_type) if self.encoding_type == 'bmes': self.tag_to_span_func = _bmes_tag_to_spans elif self.encoding_type == 'bio': diff --git a/test/core/test_metrics.py b/test/core/test_metrics.py index 236066d6..5a7c55cf 100644 --- a/test/core/test_metrics.py +++ b/test/core/test_metrics.py @@ -338,6 +338,41 @@ class SpanF1PreRecMetric(unittest.TestCase): for key, value in expected_metric.items(): self.assertAlmostEqual(value, metric_value[key], places=5) + def test_encoding_type(self): + # 检查传入的tag_vocab与encoding_type不符合时,是否会报错 + vocabs = {} + import random + from itertools import product + for encoding_type in ['bio', 'bioes', 'bmeso']: + vocab = Vocabulary(unknown=None, padding=None) + for i in range(random.randint(10, 100)): + label = str(random.randint(1, 10)) + for tag in encoding_type: + if tag!='o': + vocab.add_word(f'{tag}-{label}') + else: + vocab.add_word('o') + vocabs[encoding_type] = vocab + for e1, e2 in product(['bio', 'bioes', 'bmeso'], ['bio', 'bioes', 'bmeso']): + with self.subTest(e1=e1, e2=e2): + if e1==e2: + metric = SpanFPreRecMetric(vocabs[e1], encoding_type=e2) + else: + s2 = set(e2) + s2.update(set(e1)) + if s2==set(e2): + continue + with self.assertRaises(AssertionError): + metric = SpanFPreRecMetric(vocabs[e1], encoding_type=e2) + for encoding_type in ['bio', 'bioes', 'bmeso']: + with self.assertRaises(AssertionError): + metric = SpanFPreRecMetric(vocabs[encoding_type], 
encoding_type='bmes') + + with self.assertWarns(Warning): + vocab = Vocabulary(unknown=None, padding=None).add_word_lst(list('bmes')) + metric = SpanFPreRecMetric(vocab, encoding_type='bmeso') + vocab = Vocabulary().add_word_lst(list('bmes')) + metric = SpanFPreRecMetric(vocab, encoding_type='bmeso') class TestUsefulFunctions(unittest.TestCase): # 测试metrics.py中一些看上去挺有用的函数 From cbe5b347e54ce5181887743c62b06aabcd00b778 Mon Sep 17 00:00:00 2001 From: yh_cc Date: Wed, 28 Aug 2019 23:53:53 +0800 Subject: [PATCH 118/153] =?UTF-8?q?SpanFMetric=E5=A2=9E=E5=8A=A0=E5=AF=B9e?= =?UTF-8?q?ncoding=5Ftype=E5=92=8Ctag=5Fvocab=E7=9A=84=E6=A3=80=E6=9F=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/metrics.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index 28d88fbc..0dc601a3 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -505,7 +505,7 @@ def _check_tag_vocab_and_encoding_type(vocab:Vocabulary, encoding_type:str): for tag, idx in vocab: if idx in (vocab.unknown_idx, vocab.padding_idx): continue - tag = tag[:1] + tag = tag[:1].lower() tag_set.add(tag) tags = encoding_type for tag in tag_set: From 5d8a8c98c6997fda7afa236de8523c0c1916201d Mon Sep 17 00:00:00 2001 From: xuyige Date: Thu, 29 Aug 2019 01:43:04 +0800 Subject: [PATCH 119/153] 1. delete io/data_loader dir; 2. delete model/enas*; 3. delete legacy dir; 4. delete DateSetLoader and relevant codes; 5. fix a test code error in core/test_dataset.py; 6. delete io.BaseLoader and relevant code. --- fastNLP/io/__init__.py | 1 - fastNLP/io/data_bundle.py | 197 +---------- fastNLP/io/data_loader/__init__.py | 39 --- fastNLP/io/data_loader/conll.py | 109 ------ fastNLP/io/data_loader/imdb.py | 99 ------ fastNLP/io/data_loader/matching.py | 248 ------------- fastNLP/io/data_loader/mnli.py | 62 ---- fastNLP/io/data_loader/mtl.py | 68 ---- fastNLP/io/data_loader/people_daily.py | 85 ----- fastNLP/io/data_loader/qnli.py | 47 --- fastNLP/io/data_loader/quora.py | 34 -- fastNLP/io/data_loader/rte.py | 47 --- fastNLP/io/data_loader/snli.py | 46 --- fastNLP/io/data_loader/sst.py | 180 ---------- fastNLP/io/data_loader/yelp.py | 132 ------- fastNLP/io/dataset_loader.py | 121 ------- fastNLP/io/embed_loader.py | 13 +- fastNLP/io/model_io.py | 4 +- fastNLP/models/enas_controller.py | 228 ------------ fastNLP/models/enas_model.py | 393 --------------------- fastNLP/models/enas_trainer.py | 384 -------------------- fastNLP/models/enas_utils.py | 58 ---- legacy/api/README.md | 44 --- legacy/api/__init__.py | 2 - legacy/api/api.py | 463 ------------------------- legacy/api/converter.py | 181 ---------- legacy/api/examples.py | 56 --- legacy/api/pipeline.py | 33 -- legacy/api/processor.py | 428 ----------------------- legacy/api/utils.py | 134 ------- legacy/automl/__init__.py | 0 legacy/automl/enas_controller.py | 223 ------------ legacy/automl/enas_model.py | 388 --------------------- legacy/automl/enas_trainer.py | 383 -------------------- legacy/automl/enas_utils.py | 53 --- legacy/component/__init__.py | 1 - legacy/component/bert_tokenizer.py | 378 -------------------- test/core/test_dataset.py | 5 +- test/io/test_data_loader.py | 15 - test/io/test_dataset_loader.py | 77 ---- 40 files changed, 14 insertions(+), 5445 deletions(-) delete mode 100644 fastNLP/io/data_loader/__init__.py delete mode 100644 fastNLP/io/data_loader/conll.py delete mode 100644 fastNLP/io/data_loader/imdb.py delete mode 100644 
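Stripped of the Vocabulary plumbing and warnings.warn, the rule that the new _check_tag_vocab_and_encoding_type enforces can be sketched on plain strings as follows (an illustrative helper, not the fastNLP function itself):

    def check_tags_match_encoding(tags, encoding_type='bio'):
        valid = set(encoding_type)                 # e.g. {'b', 'i', 'o'}
        seen = {tag[:1].lower() for tag in tags}   # 'B-PER' -> 'b', 'O' -> 'o'
        if seen - valid:
            raise ValueError(f"tags {seen - valid} do not fit encoding type {encoding_type!r}")
        if valid - seen:
            print(f"warning: {valid - seen} of {encoding_type!r} never appear in the tag vocabulary")

    check_tags_match_encoding(['B-PER', 'I-PER', 'O'], 'bio')          # passes silently
    # check_tags_match_encoding(['B-PER', 'M-PER', 'E-PER'], 'bio')    # raises ValueError
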
fastNLP/io/data_loader/matching.py delete mode 100644 fastNLP/io/data_loader/mnli.py delete mode 100644 fastNLP/io/data_loader/mtl.py delete mode 100644 fastNLP/io/data_loader/people_daily.py delete mode 100644 fastNLP/io/data_loader/qnli.py delete mode 100644 fastNLP/io/data_loader/quora.py delete mode 100644 fastNLP/io/data_loader/rte.py delete mode 100644 fastNLP/io/data_loader/snli.py delete mode 100644 fastNLP/io/data_loader/sst.py delete mode 100644 fastNLP/io/data_loader/yelp.py delete mode 100644 fastNLP/io/dataset_loader.py delete mode 100644 fastNLP/models/enas_controller.py delete mode 100644 fastNLP/models/enas_model.py delete mode 100644 fastNLP/models/enas_trainer.py delete mode 100644 fastNLP/models/enas_utils.py delete mode 100644 legacy/api/README.md delete mode 100644 legacy/api/__init__.py delete mode 100644 legacy/api/api.py delete mode 100644 legacy/api/converter.py delete mode 100644 legacy/api/examples.py delete mode 100644 legacy/api/pipeline.py delete mode 100644 legacy/api/processor.py delete mode 100644 legacy/api/utils.py delete mode 100644 legacy/automl/__init__.py delete mode 100644 legacy/automl/enas_controller.py delete mode 100644 legacy/automl/enas_model.py delete mode 100644 legacy/automl/enas_trainer.py delete mode 100644 legacy/automl/enas_utils.py delete mode 100644 legacy/component/__init__.py delete mode 100644 legacy/component/bert_tokenizer.py delete mode 100644 test/io/test_data_loader.py delete mode 100644 test/io/test_dataset_loader.py diff --git a/fastNLP/io/__init__.py b/fastNLP/io/__init__.py index 8ed1956a..251b7292 100644 --- a/fastNLP/io/__init__.py +++ b/fastNLP/io/__init__.py @@ -82,7 +82,6 @@ __all__ = [ from .embed_loader import EmbedLoader from .data_bundle import DataBundle -from .dataset_loader import CSVLoader, JsonLoader from .model_io import ModelLoader, ModelSaver from .loader import * diff --git a/fastNLP/io/data_bundle.py b/fastNLP/io/data_bundle.py index 10f924f0..969730a3 100644 --- a/fastNLP/io/data_bundle.py +++ b/fastNLP/io/data_bundle.py @@ -6,112 +6,10 @@ __all__ = [ 'DataBundle', ] -import _pickle as pickle -import os -from typing import Union, Dict - from ..core.dataset import DataSet from ..core.vocabulary import Vocabulary -class BaseLoader(object): - """ - 各个 Loader 的基类,提供了 API 的参考。 - - """ - - def __init__(self): - super(BaseLoader, self).__init__() - - @staticmethod - def load_lines(data_path): - """ - 按行读取,舍弃每行两侧空白字符,返回list of str - - :param data_path: 读取数据的路径 - """ - with open(data_path, "r", encoding="utf=8") as f: - text = f.readlines() - return [line.strip() for line in text] - - @classmethod - def load(cls, data_path): - """ - 先按行读取,去除一行两侧空白,再提取每行的字符。返回list of list of str - - :param data_path: - """ - with open(data_path, "r", encoding="utf-8") as f: - text = f.readlines() - return [[word for word in sent.strip()] for sent in text] - - @classmethod - def load_with_cache(cls, data_path, cache_path): - """缓存版的load - """ - if os.path.isfile(cache_path) and os.path.getmtime(data_path) < os.path.getmtime(cache_path): - with open(cache_path, 'rb') as f: - return pickle.load(f) - else: - obj = cls.load(data_path) - with open(cache_path, 'wb') as f: - pickle.dump(obj, f) - return obj - - -def _download_from_url(url, path): - try: - from tqdm.auto import tqdm - except: - from ..core.utils import _pseudo_tqdm as tqdm - import requests - - """Download file""" - r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, stream=True) - chunk_size = 16 * 1024 - total_size = int(r.headers.get('Content-length', 0)) - 
with open(path, "wb") as file, \ - tqdm(total=total_size, unit='B', unit_scale=1, desc=path.split('/')[-1]) as t: - for chunk in r.iter_content(chunk_size): - if chunk: - file.write(chunk) - t.update(len(chunk)) - - -def _uncompress(src, dst): - import zipfile - import gzip - import tarfile - import os - - def unzip(src, dst): - with zipfile.ZipFile(src, 'r') as f: - f.extractall(dst) - - def ungz(src, dst): - with gzip.open(src, 'rb') as f, open(dst, 'wb') as uf: - length = 16 * 1024 # 16KB - buf = f.read(length) - while buf: - uf.write(buf) - buf = f.read(length) - - def untar(src, dst): - with tarfile.open(src, 'r:gz') as f: - f.extractall(dst) - - fn, ext = os.path.splitext(src) - _, ext_2 = os.path.splitext(fn) - if ext == '.zip': - unzip(src, dst) - elif ext == '.gz' and ext_2 != '.tar': - ungz(src, dst) - elif (ext == '.gz' and ext_2 == '.tar') or ext_2 == '.tgz': - untar(src, dst) - else: - raise ValueError('unsupported file {}'.format(src)) - - class DataBundle: """ 经过处理的数据信息,包括一系列数据集(比如:分开的训练集、验证集和测试集)以及各个field对应的vocabulary。该对象一般由fastNLP中各种 @@ -154,7 +52,7 @@ class DataBundle: self.datasets[name] = dataset return self - def get_dataset(self, name:str)->DataSet: + def get_dataset(self, name: str) -> DataSet: """ 获取名为name的dataset @@ -163,7 +61,7 @@ class DataBundle: """ return self.datasets[name] - def delete_dataset(self, name:str): + def delete_dataset(self, name: str): """ 删除名为name的DataSet @@ -173,7 +71,7 @@ class DataBundle: self.datasets.pop(name, None) return self - def get_vocab(self, field_name:str)->Vocabulary: + def get_vocab(self, field_name: str) -> Vocabulary: """ 获取field名为field_name对应的vocab @@ -182,7 +80,7 @@ class DataBundle: """ return self.vocabs[field_name] - def delete_vocab(self, field_name:str): + def delete_vocab(self, field_name: str): """ 删除vocab :param str field_name: @@ -312,90 +210,3 @@ class DataBundle: return _str -class DataSetLoader: - """ - 别名::class:`fastNLP.io.DataSetLoader` :class:`fastNLP.io.dataset_loader.DataSetLoader` - - 定义了各种 DataSetLoader 所需的API 接口,开发者应该继承它实现各种的 DataSetLoader。 - - 开发者至少应该编写如下内容: - - - _load 函数:从一个数据文件中读取数据到一个 :class:`~fastNLP.DataSet` - - load 函数(可以使用基类的方法):从一个或多个数据文件中读取数据到一个或多个 :class:`~fastNLP.DataSet` - - process 函数:一个或多个从数据文件中读取数据,并处理成可以训练的一个或多个 :class:`~fastNLP.DataSet` - - **process 函数中可以 调用load 函数或 _load 函数** - - """ - URL = '' - DATA_DIR = '' - - ROOT_DIR = '.fastnlp/datasets/' - UNCOMPRESS = True - - def _download(self, url: str, pdir: str, uncompress=True) -> str: - """ - - 从 ``url`` 下载数据到 ``path``, 如果 ``uncompress`` 为 ``True`` ,自动解压。 - - :param url: 下载的网站 - :param pdir: 下载到的目录 - :param uncompress: 是否自动解压缩 - :return: 数据的存放路径 - """ - fn = os.path.basename(url) - path = os.path.join(pdir, fn) - """check data exists""" - if not os.path.exists(path): - os.makedirs(pdir, exist_ok=True) - _download_from_url(url, path) - if uncompress: - dst = os.path.join(pdir, 'data') - if not os.path.exists(dst): - _uncompress(path, dst) - return dst - return path - - def download(self): - return self._download( - self.URL, - os.path.join(self.ROOT_DIR, self.DATA_DIR), - uncompress=self.UNCOMPRESS) - - def load(self, paths: Union[str, Dict[str, str]]) -> Union[DataSet, Dict[str, DataSet]]: - """ - 从指定一个或多个路径中的文件中读取数据,返回一个或多个数据集 :class:`~fastNLP.DataSet` 。 - 如果处理多个路径,传入的 dict 中的 key 与返回的 dict 中的 key 保存一致。 - - :param Union[str, Dict[str, str]] paths: 文件路径 - :return: :class:`~fastNLP.DataSet` 类的对象或存储多个 :class:`~fastNLP.DataSet` 的字典 - """ - if isinstance(paths, str): - return self._load(paths) - return {name: self._load(path) for name, 
path in paths.items()} - - def _load(self, path: str) -> DataSet: - """从指定路径的文件中读取数据,返回 :class:`~fastNLP.DataSet` 类型的对象 - - :param str path: 文件路径 - :return: 一个 :class:`~fastNLP.DataSet` 类型的对象 - """ - raise NotImplementedError - - def process(self, paths: Union[str, Dict[str, str]], **options) -> DataBundle: - """ - 对于特定的任务和数据集,读取并处理数据,返回处理DataInfo类对象或字典。 - - 从指定一个或多个路径中的文件中读取数据,DataInfo对象中可以包含一个或多个数据集 。 - 如果处理多个路径,传入的 dict 的 key 与返回DataInfo中的 dict 中的 key 保存一致。 - - 返回的 :class:`DataBundle` 对象有如下属性: - - - vocabs: 由从数据集中获取的词表组成的字典,每个词表 - - datasets: 一个dict,包含一系列 :class:`~fastNLP.DataSet` 类型的对象。其中 field 的命名参考 :mod:`~fastNLP.core.const` - - :param paths: 原始数据读取的路径 - :param options: 根据不同的任务和数据集,设计自己的参数 - :return: 返回一个 DataBundle - """ - raise NotImplementedError diff --git a/fastNLP/io/data_loader/__init__.py b/fastNLP/io/data_loader/__init__.py deleted file mode 100644 index 8a9dd60b..00000000 --- a/fastNLP/io/data_loader/__init__.py +++ /dev/null @@ -1,39 +0,0 @@ -"""undocumented -.. warning:: - - 本模块在 `0.5.0版本` 中被废弃,由 :mod:`~fastNLP.io.loader` 和 :mod:`~fastNLP.io.pipe` 模块替代。 - -用于读数据集的模块, 可以读取文本分类、序列标注、Matching任务的数据集 - -这些模块的具体介绍如下,您可以通过阅读 :doc:`教程` 来进行了解。 -""" -__all__ = [ - 'ConllLoader', - 'Conll2003Loader', - 'IMDBLoader', - 'MatchingLoader', - 'SNLILoader', - 'MNLILoader', - 'MTL16Loader', - 'PeopleDailyCorpusLoader', - 'QNLILoader', - 'QuoraLoader', - 'RTELoader', - 'SSTLoader', - 'SST2Loader', - 'YelpLoader', -] - - -from .conll import ConllLoader, Conll2003Loader -from .imdb import IMDBLoader -from .matching import MatchingLoader -from .mnli import MNLILoader -from .mtl import MTL16Loader -from .people_daily import PeopleDailyCorpusLoader -from .qnli import QNLILoader -from .quora import QuoraLoader -from .rte import RTELoader -from .snli import SNLILoader -from .sst import SSTLoader, SST2Loader -from .yelp import YelpLoader diff --git a/fastNLP/io/data_loader/conll.py b/fastNLP/io/data_loader/conll.py deleted file mode 100644 index 31a90881..00000000 --- a/fastNLP/io/data_loader/conll.py +++ /dev/null @@ -1,109 +0,0 @@ - -from ...core.dataset import DataSet -from ...core.instance import Instance -from ..data_bundle import DataSetLoader -from ..file_reader import _read_conll -from typing import Union, Dict -from ..utils import check_loader_paths -from ..data_bundle import DataBundle - -class ConllLoader(DataSetLoader): - """ - 别名::class:`fastNLP.io.ConllLoader` :class:`fastNLP.io.data_loader.ConllLoader` - - 该ConllLoader支持读取的数据格式: 以空行隔开两个sample,除了分割行,每一行用空格或者制表符隔开不同的元素。如下例所示: - - Example:: - - # 文件中的内容 - Nadim NNP B-NP B-PER - Ladki NNP I-NP I-PER - - AL-AIN NNP B-NP B-LOC - United NNP B-NP B-LOC - Arab NNP I-NP I-LOC - Emirates NNPS I-NP I-LOC - 1996-12-06 CD I-NP O - ... 
- - # 如果用以下的参数读取,返回的DataSet将包含raw_words和pos两个field, 这两个field的值分别取自于第0列与第1列 - dataset = ConllLoader(headers=['raw_words', 'pos'], indexes=[0, 1])._load('/path/to/train.conll') - # 如果用以下的参数读取,返回的DataSet将包含raw_words和ner两个field, 这两个field的值分别取自于第0列与第2列 - dataset = ConllLoader(headers=['raw_words', 'ner'], indexes=[0, 3])._load('/path/to/train.conll') - # 如果用以下的参数读取,返回的DataSet将包含raw_words, pos和ner三个field - dataset = ConllLoader(headers=['raw_words', 'pos', 'ner'], indexes=[0, 1, 3])._load('/path/to/train.conll') - - dataset = ConllLoader(headers=['raw_words', 'pos'], indexes=[0, 1])._load('/path/to/train.conll')中DataSet的raw_words - 列与pos列的内容都是List[str] - - 数据中以"-DOCSTART-"开头的行将被忽略,因为该符号在conll 2003中被用为文档分割符。 - - :param list headers: 每一列数据的名称,需为List or Tuple of str。``header`` 与 ``indexes`` 一一对应 - :param list indexes: 需要保留的数据列下标,从0开始。若为 ``None`` ,则所有列都保留。Default: ``None`` - :param bool dropna: 是否忽略非法数据,若 ``False`` ,遇到非法数据时抛出 ``ValueError`` 。Default: ``True`` - """ - - def __init__(self, headers, indexes=None, dropna=True): - super(ConllLoader, self).__init__() - if not isinstance(headers, (list, tuple)): - raise TypeError( - 'invalid headers: {}, should be list of strings'.format(headers)) - self.headers = headers - self.dropna = dropna - if indexes is None: - self.indexes = list(range(len(self.headers))) - else: - if len(indexes) != len(headers): - raise ValueError - self.indexes = indexes - - def _load(self, path): - """ - 传入的一个文件路径,将该文件读入DataSet中,field由Loader初始化时指定的headers决定。 - - :param str path: 文件的路径 - :return: DataSet - """ - ds = DataSet() - for idx, data in _read_conll(path, indexes=self.indexes, dropna=self.dropna): - ins = {h: data[i] for i, h in enumerate(self.headers)} - ds.append(Instance(**ins)) - return ds - - def load(self, paths: Union[str, Dict[str, str]]) -> DataBundle: - """ - 从指定一个或多个路径中的文件中读取数据,返回:class:`~fastNLP.io.DataBundle` 。 - - 读取的field根据ConllLoader初始化时传入的headers决定。 - - :param Union[str, Dict[str, str]] paths: - :return: :class:`~fastNLP.DataSet` 类的对象或 :class:`~fastNLP.io.DataBundle` 的字典 - """ - paths = check_loader_paths(paths) - datasets = {name: self._load(path) for name, path in paths.items()} - data_bundle = DataBundle(datasets=datasets) - return data_bundle - - -class Conll2003Loader(ConllLoader): - """ - 别名::class:`fastNLP.io.Conll2003Loader` :class:`fastNLP.io.data_loader.Conll2003Loader` - - 该Loader用以读取Conll2003数据,conll2003的数据可以在https://github.com/davidsbatista/NER-datasets/tree/master/CONLL2003 - 找到。数据中以"-DOCSTART-"开头的行将被忽略,因为该符号在conll 2003中被用为文档分割符。 - - 返回的DataSet将具有以下['raw_words', 'pos', 'chunks', 'ner']四个field, 每个field中的内容都是List[str]。 - - .. csv-table:: Conll2003Loader处理之 :header: "raw_words", "words", "target", "seq_len" - - "[Nadim, Ladki]", "[1, 2]", "[1, 2]", 2 - "[AL-AIN, United, Arab, ...]", "[3, 4, 5,...]", "[3, 4]", 5 - "[...]", "[...]", "[...]", . 
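    Example::

        # usage sketch with a hypothetical local path; __init__ below fixes the
        # headers to ['raw_words', 'pos', 'chunks', 'ner']
        loader = Conll2003Loader()
        train_ds = loader._load('/path/to/conll2003/train.txt')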
- - """ - - def __init__(self): - headers = [ - 'raw_words', 'pos', 'chunks', 'ner', - ] - super(Conll2003Loader, self).__init__(headers=headers) diff --git a/fastNLP/io/data_loader/imdb.py b/fastNLP/io/data_loader/imdb.py deleted file mode 100644 index c9dda76e..00000000 --- a/fastNLP/io/data_loader/imdb.py +++ /dev/null @@ -1,99 +0,0 @@ - -from typing import Union, Dict - -from ..embed_loader import EmbeddingOption, EmbedLoader -from ..data_bundle import DataSetLoader, DataBundle -from ...core.vocabulary import VocabularyOption, Vocabulary -from ...core.dataset import DataSet -from ...core.instance import Instance -from ...core.const import Const - -from ..utils import get_tokenizer - - -class IMDBLoader(DataSetLoader): - """ - 别名::class:`fastNLP.io.IMDBLoader` :class:`fastNLP.io.data_loader.IMDBLoader` - - 读取IMDB数据集,DataSet包含以下fields: - - words: list(str), 需要分类的文本 - - target: str, 文本的标签 - - """ - - def __init__(self): - super(IMDBLoader, self).__init__() - self.tokenizer = get_tokenizer() - - def _load(self, path): - dataset = DataSet() - with open(path, 'r', encoding="utf-8") as f: - for line in f: - line = line.strip() - if not line: - continue - parts = line.split('\t') - target = parts[0] - words = self.tokenizer(parts[1].lower()) - dataset.append(Instance(words=words, target=target)) - - if len(dataset) == 0: - raise RuntimeError(f"{path} has no valid data.") - - return dataset - - def process(self, - paths: Union[str, Dict[str, str]], - src_vocab_opt: VocabularyOption = None, - tgt_vocab_opt: VocabularyOption = None, - char_level_op=False): - - datasets = {} - info = DataBundle() - for name, path in paths.items(): - dataset = self.load(path) - datasets[name] = dataset - - def wordtochar(words): - chars = [] - for word in words: - word = word.lower() - for char in word: - chars.append(char) - chars.append('') - chars.pop() - return chars - - if char_level_op: - for dataset in datasets.values(): - dataset.apply_field(wordtochar, field_name="words", new_field_name='chars') - - datasets["train"], datasets["dev"] = datasets["train"].split(0.1, shuffle=False) - - src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt) - src_vocab.from_dataset(datasets['train'], field_name='words') - - src_vocab.index_dataset(*datasets.values(), field_name='words') - - tgt_vocab = Vocabulary(unknown=None, padding=None) \ - if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt) - tgt_vocab.from_dataset(datasets['train'], field_name='target') - tgt_vocab.index_dataset(*datasets.values(), field_name='target') - - info.vocabs = { - Const.INPUT: src_vocab, - Const.TARGET: tgt_vocab - } - - info.datasets = datasets - - for name, dataset in info.datasets.items(): - dataset.set_input(Const.INPUT) - dataset.set_target(Const.TARGET) - - return info - - - diff --git a/fastNLP/io/data_loader/matching.py b/fastNLP/io/data_loader/matching.py deleted file mode 100644 index 41c9a98d..00000000 --- a/fastNLP/io/data_loader/matching.py +++ /dev/null @@ -1,248 +0,0 @@ -import os - -from typing import Union, Dict, List - -from ...core.const import Const -from ...core.vocabulary import Vocabulary -from ..data_bundle import DataBundle, DataSetLoader -from ..file_utils import _get_base_url, cached_path, PRETRAINED_BERT_MODEL_DIR -from ...modules.encoder.bert import BertTokenizer - - -class MatchingLoader(DataSetLoader): - """ - 别名::class:`fastNLP.io.MatchingLoader` :class:`fastNLP.io.data_loader.MatchingLoader` - - 读取Matching任务的数据集 - - :param dict paths: key是数据集名称(如train、dev、test),value是对应的文件名 - 
""" - - def __init__(self, paths: dict=None): - self.paths = paths - - def _load(self, path): - """ - :param str path: 待读取数据集的路径名 - :return: fastNLP.DataSet ds: 返回一个DataSet对象,里面必须包含3个field:其中两个分别为两个句子 - 的原始字符串文本,第三个为标签 - """ - raise NotImplementedError - - def process(self, paths: Union[str, Dict[str, str]], dataset_name: str=None, - to_lower=False, seq_len_type: str=None, bert_tokenizer: str=None, - cut_text: int = None, get_index=True, auto_pad_length: int=None, - auto_pad_token: str='', set_input: Union[list, str, bool]=True, - set_target: Union[list, str, bool]=True, concat: Union[str, list, bool]=None, - extra_split: List[str]=None, ) -> DataBundle: - """ - :param paths: str或者Dict[str, str]。如果是str,则为数据集所在的文件夹或者是全路径文件名:如果是文件夹, - 则会从self.paths里面找对应的数据集名称与文件名。如果是Dict,则为数据集名称(如train、dev、test)和 - 对应的全路径文件名。 - :param str dataset_name: 如果在paths里传入的是一个数据集的全路径文件名,那么可以用dataset_name来定义 - 这个数据集的名字,如果不定义则默认为train。 - :param bool to_lower: 是否将文本自动转为小写。默认值为False。 - :param str seq_len_type: 提供的seq_len类型,支持 ``seq_len`` :提供一个数字作为句子长度; ``mask`` : - 提供一个0/1的mask矩阵作为句子长度; ``bert`` :提供segment_type_id(第一个句子为0,第二个句子为1)和 - attention mask矩阵(0/1的mask矩阵)。默认值为None,即不提供seq_len - :param str bert_tokenizer: bert tokenizer所使用的词表所在的文件夹路径 - :param int cut_text: 将长于cut_text的内容截掉。默认为None,即不截。 - :param bool get_index: 是否需要根据词表将文本转为index - :param int auto_pad_length: 是否需要将文本自动pad到一定长度(超过这个长度的文本将会被截掉),默认为不会自动pad - :param str auto_pad_token: 自动pad的内容 - :param set_input: 如果为True,则会自动将相关的field(名字里含有Const.INPUT的)设置为input,如果为False - 则不会将任何field设置为input。如果传入str或者List[str],则会根据传入的内容将相对应的field设置为input, - 于此同时其他field不会被设置为input。默认值为True。 - :param set_target: set_target将控制哪些field可以被设置为target,用法与set_input一致。默认值为True。 - :param concat: 是否需要将两个句子拼接起来。如果为False则不会拼接。如果为True则会在两个句子之间插入一个。 - 如果传入一个长度为4的list,则分别表示插在第一句开始前、第一句结束后、第二句开始前、第二句结束后的标识符。如果 - 传入字符串 ``bert`` ,则会采用bert的拼接方式,等价于['[CLS]', '[SEP]', '', '[SEP]']. 
- :param extra_split: 额外的分隔符,即除了空格之外的用于分词的字符。 - :return: - """ - if isinstance(set_input, str): - set_input = [set_input] - if isinstance(set_target, str): - set_target = [set_target] - if isinstance(set_input, bool): - auto_set_input = set_input - else: - auto_set_input = False - if isinstance(set_target, bool): - auto_set_target = set_target - else: - auto_set_target = False - if isinstance(paths, str): - if os.path.isdir(paths): - path = {n: os.path.join(paths, self.paths[n]) for n in self.paths.keys()} - else: - path = {dataset_name if dataset_name is not None else 'train': paths} - else: - path = paths - - data_info = DataBundle() - for data_name in path.keys(): - data_info.datasets[data_name] = self._load(path[data_name]) - - for data_name, data_set in data_info.datasets.items(): - if auto_set_input: - data_set.set_input(Const.INPUTS(0), Const.INPUTS(1)) - if auto_set_target: - if Const.TARGET in data_set.get_field_names(): - data_set.set_target(Const.TARGET) - - if extra_split is not None: - for data_name, data_set in data_info.datasets.items(): - data_set.apply(lambda x: ' '.join(x[Const.INPUTS(0)]), new_field_name=Const.INPUTS(0)) - data_set.apply(lambda x: ' '.join(x[Const.INPUTS(1)]), new_field_name=Const.INPUTS(1)) - - for s in extra_split: - data_set.apply(lambda x: x[Const.INPUTS(0)].replace(s, ' ' + s + ' '), - new_field_name=Const.INPUTS(0)) - data_set.apply(lambda x: x[Const.INPUTS(0)].replace(s, ' ' + s + ' '), - new_field_name=Const.INPUTS(0)) - - _filt = lambda x: x - data_set.apply(lambda x: list(filter(_filt, x[Const.INPUTS(0)].split(' '))), - new_field_name=Const.INPUTS(0), is_input=auto_set_input) - data_set.apply(lambda x: list(filter(_filt, x[Const.INPUTS(1)].split(' '))), - new_field_name=Const.INPUTS(1), is_input=auto_set_input) - _filt = None - - if to_lower: - for data_name, data_set in data_info.datasets.items(): - data_set.apply(lambda x: [w.lower() for w in x[Const.INPUTS(0)]], new_field_name=Const.INPUTS(0), - is_input=auto_set_input) - data_set.apply(lambda x: [w.lower() for w in x[Const.INPUTS(1)]], new_field_name=Const.INPUTS(1), - is_input=auto_set_input) - - if bert_tokenizer is not None: - if bert_tokenizer.lower() in PRETRAINED_BERT_MODEL_DIR: - PRETRAIN_URL = _get_base_url('bert') - model_name = PRETRAINED_BERT_MODEL_DIR[bert_tokenizer] - model_url = PRETRAIN_URL + model_name - model_dir = cached_path(model_url, name='embedding') - # 检查是否存在 - elif os.path.isdir(bert_tokenizer): - model_dir = bert_tokenizer - else: - raise ValueError(f"Cannot recognize BERT tokenizer from {bert_tokenizer}.") - - words_vocab = Vocabulary(padding='[PAD]', unknown='[UNK]') - with open(os.path.join(model_dir, 'vocab.txt'), 'r') as f: - lines = f.readlines() - lines = [line.strip() for line in lines] - words_vocab.add_word_lst(lines) - words_vocab.build_vocab() - - tokenizer = BertTokenizer.from_pretrained(model_dir) - - for data_name, data_set in data_info.datasets.items(): - for fields in data_set.get_field_names(): - if Const.INPUT in fields: - data_set.apply(lambda x: tokenizer.tokenize(' '.join(x[fields])), new_field_name=fields, - is_input=auto_set_input) - - if isinstance(concat, bool): - concat = 'default' if concat else None - if concat is not None: - if isinstance(concat, str): - CONCAT_MAP = {'bert': ['[CLS]', '[SEP]', '', '[SEP]'], - 'default': ['', '', '', '']} - if concat.lower() in CONCAT_MAP: - concat = CONCAT_MAP[concat] - else: - concat = 4 * [concat] - assert len(concat) == 4, \ - f'Please choose a list with 4 symbols which at the beginning of first 
sentence ' \ - f'the end of first sentence, the begin of second sentence, and the end of second' \ - f'sentence. Your input is {concat}' - - for data_name, data_set in data_info.datasets.items(): - data_set.apply(lambda x: [concat[0]] + x[Const.INPUTS(0)] + [concat[1]] + [concat[2]] + - x[Const.INPUTS(1)] + [concat[3]], new_field_name=Const.INPUT) - data_set.apply(lambda x: [w for w in x[Const.INPUT] if len(w) > 0], new_field_name=Const.INPUT, - is_input=auto_set_input) - - if seq_len_type is not None: - if seq_len_type == 'seq_len': # - for data_name, data_set in data_info.datasets.items(): - for fields in data_set.get_field_names(): - if Const.INPUT in fields: - data_set.apply(lambda x: len(x[fields]), - new_field_name=fields.replace(Const.INPUT, Const.INPUT_LEN), - is_input=auto_set_input) - elif seq_len_type == 'mask': - for data_name, data_set in data_info.datasets.items(): - for fields in data_set.get_field_names(): - if Const.INPUT in fields: - data_set.apply(lambda x: [1] * len(x[fields]), - new_field_name=fields.replace(Const.INPUT, Const.INPUT_LEN), - is_input=auto_set_input) - elif seq_len_type == 'bert': - for data_name, data_set in data_info.datasets.items(): - if Const.INPUT not in data_set.get_field_names(): - raise KeyError(f'Field ``{Const.INPUT}`` not in {data_name} data set: ' - f'got {data_set.get_field_names()}') - data_set.apply(lambda x: [0] * (len(x[Const.INPUTS(0)]) + 2) + [1] * (len(x[Const.INPUTS(1)]) + 1), - new_field_name=Const.INPUT_LENS(0), is_input=auto_set_input) - data_set.apply(lambda x: [1] * len(x[Const.INPUT_LENS(0)]), - new_field_name=Const.INPUT_LENS(1), is_input=auto_set_input) - - if auto_pad_length is not None: - cut_text = min(auto_pad_length, cut_text if cut_text is not None else auto_pad_length) - - if cut_text is not None: - for data_name, data_set in data_info.datasets.items(): - for fields in data_set.get_field_names(): - if (Const.INPUT in fields) or ((Const.INPUT_LEN in fields) and (seq_len_type != 'seq_len')): - data_set.apply(lambda x: x[fields][: cut_text], new_field_name=fields, - is_input=auto_set_input) - - data_set_list = [d for n, d in data_info.datasets.items()] - assert len(data_set_list) > 0, f'There are NO data sets in data info!' 
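# What follows builds the vocabularies when no BERT tokenizer was given (with a
# BERT tokenizer the word vocabulary was already read from vocab.txt above): the
# word vocabulary is fitted on the train split(s) only, with the remaining splits
# passed as no_create_entry_dataset so fastNLP can treat words seen only outside
# of training specially, and the label vocabulary is built without padding or
# unknown tokens.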
- - if bert_tokenizer is None: - words_vocab = Vocabulary(padding=auto_pad_token) - words_vocab = words_vocab.from_dataset(*[d for n, d in data_info.datasets.items() if 'train' in n], - field_name=[n for n in data_set_list[0].get_field_names() - if (Const.INPUT in n)], - no_create_entry_dataset=[d for n, d in data_info.datasets.items() - if 'train' not in n]) - target_vocab = Vocabulary(padding=None, unknown=None) - target_vocab = target_vocab.from_dataset(*[d for n, d in data_info.datasets.items() if 'train' in n], - field_name=Const.TARGET) - data_info.vocabs = {Const.INPUT: words_vocab, Const.TARGET: target_vocab} - - if get_index: - for data_name, data_set in data_info.datasets.items(): - for fields in data_set.get_field_names(): - if Const.INPUT in fields: - data_set.apply(lambda x: [words_vocab.to_index(w) for w in x[fields]], new_field_name=fields, - is_input=auto_set_input) - - if Const.TARGET in data_set.get_field_names(): - data_set.apply(lambda x: target_vocab.to_index(x[Const.TARGET]), new_field_name=Const.TARGET, - is_input=auto_set_input, is_target=auto_set_target) - - if auto_pad_length is not None: - if seq_len_type == 'seq_len': - raise RuntimeError(f'the sequence will be padded with the length {auto_pad_length}, ' - f'so the seq_len_type cannot be `{seq_len_type}`!') - for data_name, data_set in data_info.datasets.items(): - for fields in data_set.get_field_names(): - if Const.INPUT in fields: - data_set.apply(lambda x: x[fields] + [words_vocab.to_index(words_vocab.padding)] * - (auto_pad_length - len(x[fields])), new_field_name=fields, - is_input=auto_set_input) - elif (Const.INPUT_LEN in fields) and (seq_len_type != 'seq_len'): - data_set.apply(lambda x: x[fields] + [0] * (auto_pad_length - len(x[fields])), - new_field_name=fields, is_input=auto_set_input) - - for data_name, data_set in data_info.datasets.items(): - if isinstance(set_input, list): - data_set.set_input(*[inputs for inputs in set_input if inputs in data_set.get_field_names()]) - if isinstance(set_target, list): - data_set.set_target(*[target for target in set_target if target in data_set.get_field_names()]) - - return data_info diff --git a/fastNLP/io/data_loader/mnli.py b/fastNLP/io/data_loader/mnli.py deleted file mode 100644 index 65863f3d..00000000 --- a/fastNLP/io/data_loader/mnli.py +++ /dev/null @@ -1,62 +0,0 @@ - -from ...core.const import Const - -from .matching import MatchingLoader -from ..dataset_loader import CSVLoader - - -class MNLILoader(MatchingLoader, CSVLoader): - """ - 别名::class:`fastNLP.io.MNLILoader` :class:`fastNLP.io.data_loader.MNLILoader` - - 读取MNLI数据集,读取的DataSet包含fields:: - - words1: list(str),第一句文本, premise - - words2: list(str), 第二句文本, hypothesis - - target: str, 真实标签 - - 数据来源: - """ - - def __init__(self, paths: dict=None): - paths = paths if paths is not None else { - 'train': 'train.tsv', - 'dev_matched': 'dev_matched.tsv', - 'dev_mismatched': 'dev_mismatched.tsv', - 'test_matched': 'test_matched.tsv', - 'test_mismatched': 'test_mismatched.tsv', - # 'test_0.9_matched': 'multinli_0.9_test_matched_unlabeled.txt', - # 'test_0.9_mismatched': 'multinli_0.9_test_mismatched_unlabeled.txt', - - # test_0.9_mathed与mismatched是MNLI0.9版本的(数据来源:kaggle) - } - MatchingLoader.__init__(self, paths=paths) - CSVLoader.__init__(self, sep='\t') - self.fields = { - 'sentence1_binary_parse': Const.INPUTS(0), - 'sentence2_binary_parse': Const.INPUTS(1), - 'gold_label': Const.TARGET, - } - - def _load(self, path): - ds = CSVLoader._load(self, path) - - for k, v in self.fields.items(): - if k in 
ds.get_field_names(): - ds.rename_field(k, v) - - if Const.TARGET in ds.get_field_names(): - if ds[0][Const.TARGET] == 'hidden': - ds.delete_field(Const.TARGET) - - parentheses_table = str.maketrans({'(': None, ')': None}) - - ds.apply(lambda ins: ins[Const.INPUTS(0)].translate(parentheses_table).strip().split(), - new_field_name=Const.INPUTS(0)) - ds.apply(lambda ins: ins[Const.INPUTS(1)].translate(parentheses_table).strip().split(), - new_field_name=Const.INPUTS(1)) - if Const.TARGET in ds.get_field_names(): - ds.drop(lambda x: x[Const.TARGET] == '-') - return ds diff --git a/fastNLP/io/data_loader/mtl.py b/fastNLP/io/data_loader/mtl.py deleted file mode 100644 index 923aadfb..00000000 --- a/fastNLP/io/data_loader/mtl.py +++ /dev/null @@ -1,68 +0,0 @@ - -from typing import Union, Dict - -from ..data_bundle import DataBundle -from ..dataset_loader import CSVLoader -from ...core.vocabulary import Vocabulary, VocabularyOption -from ...core.const import Const -from ..utils import check_loader_paths - - -class MTL16Loader(CSVLoader): - """ - 别名::class:`fastNLP.io.MTL16Loader` :class:`fastNLP.io.data_loader.MTL16Loader` - - 读取MTL16数据集,DataSet包含以下fields: - - words: list(str), 需要分类的文本 - - target: str, 文本的标签 - - 数据来源:https://pan.baidu.com/s/1c2L6vdA - - """ - - def __init__(self): - super(MTL16Loader, self).__init__(headers=(Const.TARGET, Const.INPUT), sep='\t') - - def _load(self, path): - dataset = super(MTL16Loader, self)._load(path) - dataset.apply(lambda x: x[Const.INPUT].lower().split(), new_field_name=Const.INPUT) - if len(dataset) == 0: - raise RuntimeError(f"{path} has no valid data.") - - return dataset - - def process(self, - paths: Union[str, Dict[str, str]], - src_vocab_opt: VocabularyOption = None, - tgt_vocab_opt: VocabularyOption = None,): - - paths = check_loader_paths(paths) - datasets = {} - info = DataBundle() - for name, path in paths.items(): - dataset = self.load(path) - datasets[name] = dataset - - src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt) - src_vocab.from_dataset(datasets['train'], field_name=Const.INPUT) - src_vocab.index_dataset(*datasets.values(), field_name=Const.INPUT) - - tgt_vocab = Vocabulary(unknown=None, padding=None) \ - if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt) - tgt_vocab.from_dataset(datasets['train'], field_name=Const.TARGET) - tgt_vocab.index_dataset(*datasets.values(), field_name=Const.TARGET) - - info.vocabs = { - Const.INPUT: src_vocab, - Const.TARGET: tgt_vocab - } - - info.datasets = datasets - - for name, dataset in info.datasets.items(): - dataset.set_input(Const.INPUT) - dataset.set_target(Const.TARGET) - - return info diff --git a/fastNLP/io/data_loader/people_daily.py b/fastNLP/io/data_loader/people_daily.py deleted file mode 100644 index afd66744..00000000 --- a/fastNLP/io/data_loader/people_daily.py +++ /dev/null @@ -1,85 +0,0 @@ - -from ..data_bundle import DataSetLoader -from ...core.dataset import DataSet -from ...core.instance import Instance -from ...core.const import Const - - -class PeopleDailyCorpusLoader(DataSetLoader): - """ - 别名::class:`fastNLP.io.PeopleDailyCorpusLoader` :class:`fastNLP.io.data_loader.PeopleDailyCorpusLoader` - - 读取人民日报数据集 - """ - - def __init__(self, pos=True, ner=True): - super(PeopleDailyCorpusLoader, self).__init__() - self.pos = pos - self.ner = ner - - def _load(self, data_path): - with open(data_path, "r", encoding="utf-8") as f: - sents = f.readlines() - examples = [] - for sent in sents: - if len(sent) <= 2: - continue - inside_ne = False - 
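# The loop below decodes the bracketed named-entity annotation: a token carrying
# both '[' and ']' is a single-token entity (tag "U"), '[' alone opens a
# multi-token entity ("B"), ']' alone closes it ("L"), tokens in between get "I",
# and everything else "O"; each token is then split on '/' into the surface word
# and its POS tag. The first whitespace-separated token of each line (presumably
# the sentence id) is skipped via [1:].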
sent_pos_tag = [] - sent_words = [] - sent_ner = [] - words = sent.strip().split()[1:] - for word in words: - if "[" in word and "]" in word: - ner_tag = "U" - print(word) - elif "[" in word: - inside_ne = True - ner_tag = "B" - word = word[1:] - elif "]" in word: - ner_tag = "L" - word = word[:word.index("]")] - if inside_ne is True: - inside_ne = False - else: - raise RuntimeError("only ] appears!") - else: - if inside_ne is True: - ner_tag = "I" - else: - ner_tag = "O" - tmp = word.split("/") - token, pos = tmp[0], tmp[1] - sent_ner.append(ner_tag) - sent_pos_tag.append(pos) - sent_words.append(token) - example = [sent_words] - if self.pos is True: - example.append(sent_pos_tag) - if self.ner is True: - example.append(sent_ner) - examples.append(example) - return self.convert(examples) - - def convert(self, data): - """ - - :param data: python 内置对象 - :return: 一个 :class:`~fastNLP.DataSet` 类型的对象 - """ - data_set = DataSet() - for item in data: - sent_words = item[0] - if self.pos is True and self.ner is True: - instance = Instance( - words=sent_words, pos_tags=item[1], ner=item[2]) - elif self.pos is True: - instance = Instance(words=sent_words, pos_tags=item[1]) - elif self.ner is True: - instance = Instance(words=sent_words, ner=item[1]) - else: - instance = Instance(words=sent_words) - data_set.append(instance) - data_set.apply(lambda ins: len(ins[Const.INPUT]), new_field_name=Const.INPUT_LEN) - return data_set diff --git a/fastNLP/io/data_loader/qnli.py b/fastNLP/io/data_loader/qnli.py deleted file mode 100644 index 84b0f3d6..00000000 --- a/fastNLP/io/data_loader/qnli.py +++ /dev/null @@ -1,47 +0,0 @@ - -from ...core.const import Const - -from .matching import MatchingLoader -from ..dataset_loader import CSVLoader - - -class QNLILoader(MatchingLoader, CSVLoader): - """ - 别名::class:`fastNLP.io.QNLILoader` :class:`fastNLP.io.data_loader.QNLILoader` - - 读取QNLI数据集,读取的DataSet包含fields:: - - words1: list(str),第一句文本, premise - - words2: list(str), 第二句文本, hypothesis - - target: str, 真实标签 - - 数据来源: - """ - - def __init__(self, paths: dict=None): - paths = paths if paths is not None else { - 'train': 'train.tsv', - 'dev': 'dev.tsv', - 'test': 'test.tsv' # test set has not label - } - MatchingLoader.__init__(self, paths=paths) - self.fields = { - 'question': Const.INPUTS(0), - 'sentence': Const.INPUTS(1), - 'label': Const.TARGET, - } - CSVLoader.__init__(self, sep='\t') - - def _load(self, path): - ds = CSVLoader._load(self, path) - - for k, v in self.fields.items(): - if k in ds.get_field_names(): - ds.rename_field(k, v) - for fields in ds.get_all_fields(): - if Const.INPUT in fields: - ds.apply(lambda x: x[fields].strip().split(), new_field_name=fields) - - return ds diff --git a/fastNLP/io/data_loader/quora.py b/fastNLP/io/data_loader/quora.py deleted file mode 100644 index d0ee41ec..00000000 --- a/fastNLP/io/data_loader/quora.py +++ /dev/null @@ -1,34 +0,0 @@ - -from ...core.const import Const - -from .matching import MatchingLoader -from ..dataset_loader import CSVLoader - - -class QuoraLoader(MatchingLoader, CSVLoader): - """ - 别名::class:`fastNLP.io.QuoraLoader` :class:`fastNLP.io.data_loader.QuoraLoader` - - 读取MNLI数据集,读取的DataSet包含fields:: - - words1: list(str),第一句文本, premise - - words2: list(str), 第二句文本, hypothesis - - target: str, 真实标签 - - 数据来源: - """ - - def __init__(self, paths: dict=None): - paths = paths if paths is not None else { - 'train': 'train.tsv', - 'dev': 'dev.tsv', - 'test': 'test.tsv', - } - MatchingLoader.__init__(self, paths=paths) - CSVLoader.__init__(self, sep='\t', 
headers=(Const.TARGET, Const.INPUTS(0), Const.INPUTS(1), 'pairID')) - - def _load(self, path): - ds = CSVLoader._load(self, path) - return ds diff --git a/fastNLP/io/data_loader/rte.py b/fastNLP/io/data_loader/rte.py deleted file mode 100644 index f8c5e2fc..00000000 --- a/fastNLP/io/data_loader/rte.py +++ /dev/null @@ -1,47 +0,0 @@ - -from ...core.const import Const - -from .matching import MatchingLoader -from ..dataset_loader import CSVLoader - - -class RTELoader(MatchingLoader, CSVLoader): - """ - 别名::class:`fastNLP.io.RTELoader` :class:`fastNLP.io.data_loader.RTELoader` - - 读取RTE数据集,读取的DataSet包含fields:: - - words1: list(str),第一句文本, premise - - words2: list(str), 第二句文本, hypothesis - - target: str, 真实标签 - - 数据来源: - """ - - def __init__(self, paths: dict=None): - paths = paths if paths is not None else { - 'train': 'train.tsv', - 'dev': 'dev.tsv', - 'test': 'test.tsv' # test set has not label - } - MatchingLoader.__init__(self, paths=paths) - self.fields = { - 'sentence1': Const.INPUTS(0), - 'sentence2': Const.INPUTS(1), - 'label': Const.TARGET, - } - CSVLoader.__init__(self, sep='\t') - - def _load(self, path): - ds = CSVLoader._load(self, path) - - for k, v in self.fields.items(): - if k in ds.get_field_names(): - ds.rename_field(k, v) - for fields in ds.get_all_fields(): - if Const.INPUT in fields: - ds.apply(lambda x: x[fields].strip().split(), new_field_name=fields) - - return ds diff --git a/fastNLP/io/data_loader/snli.py b/fastNLP/io/data_loader/snli.py deleted file mode 100644 index 1db0ac5b..00000000 --- a/fastNLP/io/data_loader/snli.py +++ /dev/null @@ -1,46 +0,0 @@ - -from ...core.const import Const - -from .matching import MatchingLoader -from ..dataset_loader import JsonLoader - - -class SNLILoader(MatchingLoader, JsonLoader): - """ - 别名::class:`fastNLP.io.SNLILoader` :class:`fastNLP.io.data_loader.SNLILoader` - - 读取SNLI数据集,读取的DataSet包含fields:: - - words1: list(str),第一句文本, premise - - words2: list(str), 第二句文本, hypothesis - - target: str, 真实标签 - - 数据来源: https://nlp.stanford.edu/projects/snli/snli_1.0.zip - """ - - def __init__(self, paths: dict=None): - fields = { - 'sentence1_binary_parse': Const.INPUTS(0), - 'sentence2_binary_parse': Const.INPUTS(1), - 'gold_label': Const.TARGET, - } - paths = paths if paths is not None else { - 'train': 'snli_1.0_train.jsonl', - 'dev': 'snli_1.0_dev.jsonl', - 'test': 'snli_1.0_test.jsonl'} - MatchingLoader.__init__(self, paths=paths) - JsonLoader.__init__(self, fields=fields) - - def _load(self, path): - ds = JsonLoader._load(self, path) - - parentheses_table = str.maketrans({'(': None, ')': None}) - - ds.apply(lambda ins: ins[Const.INPUTS(0)].translate(parentheses_table).strip().split(), - new_field_name=Const.INPUTS(0)) - ds.apply(lambda ins: ins[Const.INPUTS(1)].translate(parentheses_table).strip().split(), - new_field_name=Const.INPUTS(1)) - ds.drop(lambda x: x[Const.TARGET] == '-') - return ds diff --git a/fastNLP/io/data_loader/sst.py b/fastNLP/io/data_loader/sst.py deleted file mode 100644 index 2034fc2b..00000000 --- a/fastNLP/io/data_loader/sst.py +++ /dev/null @@ -1,180 +0,0 @@ - -from typing import Union, Dict -from nltk import Tree - -from ..data_bundle import DataBundle, DataSetLoader -from ..dataset_loader import CSVLoader -from ...core.vocabulary import VocabularyOption, Vocabulary -from ...core.dataset import DataSet -from ...core.const import Const -from ...core.instance import Instance -from ..utils import check_loader_paths, get_tokenizer - - -class SSTLoader(DataSetLoader): - """ - 别名::class:`fastNLP.io.SSTLoader` 
:class:`fastNLP.io.data_loader.SSTLoader` - - 读取SST数据集, DataSet包含fields:: - - words: list(str) 需要分类的文本 - target: str 文本的标签 - - 数据来源: https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip - - :param subtree: 是否将数据展开为子树,扩充数据量. Default: ``False`` - :param fine_grained: 是否使用SST-5标准,若 ``False`` , 使用SST-2。Default: ``False`` - """ - - URL = 'https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip' - DATA_DIR = 'sst/' - - def __init__(self, subtree=False, fine_grained=False): - self.subtree = subtree - - tag_v = {'0': 'very negative', '1': 'negative', '2': 'neutral', - '3': 'positive', '4': 'very positive'} - if not fine_grained: - tag_v['0'] = tag_v['1'] - tag_v['4'] = tag_v['3'] - self.tag_v = tag_v - self.tokenizer = get_tokenizer() - - def _load(self, path): - """ - - :param str path: 存储数据的路径 - :return: 一个 :class:`~fastNLP.DataSet` 类型的对象 - """ - datalist = [] - with open(path, 'r', encoding='utf-8') as f: - datas = [] - for l in f: - datas.extend([(s, self.tag_v[t]) - for s, t in self._get_one(l, self.subtree)]) - ds = DataSet() - for words, tag in datas: - ds.append(Instance(words=words, target=tag)) - return ds - - def _get_one(self, data, subtree): - tree = Tree.fromstring(data) - if subtree: - return [(self.tokenizer(' '.join(t.leaves())), t.label()) for t in tree.subtrees() ] - return [(self.tokenizer(' '.join(tree.leaves())), tree.label())] - - def process(self, - paths, train_subtree=True, - src_vocab_op: VocabularyOption = None, - tgt_vocab_op: VocabularyOption = None,): - paths = check_loader_paths(paths) - input_name, target_name = 'words', 'target' - src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op) - tgt_vocab = Vocabulary(unknown=None, padding=None) \ - if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op) - - info = DataBundle() - origin_subtree = self.subtree - self.subtree = train_subtree - info.datasets['train'] = self._load(paths['train']) - self.subtree = origin_subtree - for n, p in paths.items(): - if n != 'train': - info.datasets[n] = self._load(p) - - src_vocab.from_dataset( - info.datasets['train'], - field_name=input_name, - no_create_entry_dataset=[ds for n, ds in info.datasets.items() if n != 'train']) - tgt_vocab.from_dataset(info.datasets['train'], field_name=target_name) - - src_vocab.index_dataset( - *info.datasets.values(), - field_name=input_name, new_field_name=input_name) - tgt_vocab.index_dataset( - *info.datasets.values(), - field_name=target_name, new_field_name=target_name) - info.vocabs = { - input_name: src_vocab, - target_name: tgt_vocab - } - - return info - - -class SST2Loader(CSVLoader): - """ - 别名::class:`fastNLP.io.SST2Loader` :class:`fastNLP.io.data_loader.SST2Loader` - - 数据来源 SST: https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8 - """ - - def __init__(self): - super(SST2Loader, self).__init__(sep='\t') - self.tokenizer = get_tokenizer() - self.field = {'sentence': Const.INPUT, 'label': Const.TARGET} - - def _load(self, path: str) -> DataSet: - ds = super(SST2Loader, self)._load(path) - for k, v in self.field.items(): - if k in ds.get_field_names(): - ds.rename_field(k, v) - ds.apply(lambda x: self.tokenizer(x[Const.INPUT]), new_field_name=Const.INPUT) - print("all count:", len(ds)) - return ds - - def process(self, - paths: Union[str, Dict[str, str]], - src_vocab_opt: VocabularyOption = None, - tgt_vocab_opt: VocabularyOption = None, - char_level_op=False): - - paths = check_loader_paths(paths) 
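# The rest of process() loads each split, copies the token list into a
# `raw_words` field before any indexing, optionally adds a character-level field
# (char_level_op), and then fits the word vocabulary on the train split (other
# splits passed as no_create_entry_dataset) and a label vocabulary without
# padding/unknown before indexing every split in place.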
- datasets = {} - info = DataBundle() - for name, path in paths.items(): - dataset = self.load(path) - dataset.apply_field(lambda words:words.copy(), field_name='words', new_field_name='raw_words') - datasets[name] = dataset - - def wordtochar(words): - chars = [] - for word in words: - word = word.lower() - for char in word: - chars.append(char) - chars.append('') - chars.pop() - return chars - - input_name, target_name = Const.INPUT, Const.TARGET - info.vocabs={} - - # 就分隔为char形式 - if char_level_op: - for dataset in datasets.values(): - dataset.apply_field(wordtochar, field_name=Const.INPUT, new_field_name=Const.CHAR_INPUT) - src_vocab = Vocabulary() if src_vocab_opt is None else Vocabulary(**src_vocab_opt) - src_vocab.from_dataset(datasets['train'], field_name=Const.INPUT, no_create_entry_dataset=[ - dataset for name, dataset in datasets.items() if name!='train' - ]) - src_vocab.index_dataset(*datasets.values(), field_name=Const.INPUT) - - tgt_vocab = Vocabulary(unknown=None, padding=None) \ - if tgt_vocab_opt is None else Vocabulary(**tgt_vocab_opt) - tgt_vocab.from_dataset(datasets['train'], field_name=Const.TARGET) - tgt_vocab.index_dataset(*datasets.values(), field_name=Const.TARGET) - - info.vocabs = { - Const.INPUT: src_vocab, - Const.TARGET: tgt_vocab - } - - info.datasets = datasets - - for name, dataset in info.datasets.items(): - dataset.set_input(Const.INPUT) - dataset.set_target(Const.TARGET) - - return info - diff --git a/fastNLP/io/data_loader/yelp.py b/fastNLP/io/data_loader/yelp.py deleted file mode 100644 index f2bc60c8..00000000 --- a/fastNLP/io/data_loader/yelp.py +++ /dev/null @@ -1,132 +0,0 @@ - -import csv -from typing import Iterable - -from ...core.const import Const -from ...core.dataset import DataSet -from ...core.instance import Instance -from ...core.vocabulary import VocabularyOption, Vocabulary -from ..data_bundle import DataBundle, DataSetLoader -from typing import Union, Dict -from ..utils import check_loader_paths, get_tokenizer - - -class YelpLoader(DataSetLoader): - """ - 别名::class:`fastNLP.io.YelpLoader` :class:`fastNLP.io.data_loader.YelpLoader` - 读取Yelp_full/Yelp_polarity数据集, DataSet包含fields: - - words: list(str), 需要分类的文本 - - target: str, 文本的标签 - - chars:list(str),未index的字符列表 - - 数据集:yelp_full/yelp_polarity - - :param fine_grained: 是否使用SST-5标准,若 ``False`` , 使用SST-2。Default: ``False`` - :param lower: 是否需要自动转小写,默认为False。 - """ - - def __init__(self, fine_grained=False, lower=False): - super(YelpLoader, self).__init__() - tag_v = {'1.0': 'very negative', '2.0': 'negative', '3.0': 'neutral', - '4.0': 'positive', '5.0': 'very positive'} - if not fine_grained: - tag_v['1.0'] = tag_v['2.0'] - tag_v['5.0'] = tag_v['4.0'] - self.fine_grained = fine_grained - self.tag_v = tag_v - self.lower = lower - self.tokenizer = get_tokenizer() - - def _load(self, path): - ds = DataSet() - csv_reader = csv.reader(open(path, encoding='utf-8')) - all_count = 0 - real_count = 0 - for row in csv_reader: - all_count += 1 - if len(row) == 2: - target = self.tag_v[row[0] + ".0"] - words = clean_str(row[1], self.tokenizer, self.lower) - if len(words) != 0: - ds.append(Instance(words=words, target=target)) - real_count += 1 - print("all count:", all_count) - print("real count:", real_count) - return ds - - def process(self, paths: Union[str, Dict[str, str]], - train_ds: Iterable[str] = None, - src_vocab_op: VocabularyOption = None, - tgt_vocab_op: VocabularyOption = None, - char_level_op=False): - paths = check_loader_paths(paths) - info = DataBundle(datasets=self.load(paths)) - 
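# What follows either re-splits every word into characters (char_level_op) or
# fits and applies the source word vocabulary on the splits named in `train_ds`
# (all loaded splits when it is None), fits a label vocabulary without
# padding/unknown, divides the training data into train and dev sets via
# split(0.1, shuffle=False), and marks the INPUT/TARGET fields as model input
# and target.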
src_vocab = Vocabulary() if src_vocab_op is None else Vocabulary(**src_vocab_op) - tgt_vocab = Vocabulary(unknown=None, padding=None) \ - if tgt_vocab_op is None else Vocabulary(**tgt_vocab_op) - _train_ds = [info.datasets[name] - for name in train_ds] if train_ds else info.datasets.values() - - def wordtochar(words): - chars = [] - for word in words: - word = word.lower() - for char in word: - chars.append(char) - chars.append('') - chars.pop() - return chars - - input_name, target_name = Const.INPUT, Const.TARGET - info.vocabs = {} - # 就分隔为char形式 - if char_level_op: - for dataset in info.datasets.values(): - dataset.apply_field(wordtochar, field_name=Const.INPUT, new_field_name=Const.CHAR_INPUT) - else: - src_vocab.from_dataset(*_train_ds, field_name=input_name) - src_vocab.index_dataset(*info.datasets.values(), field_name=input_name, new_field_name=input_name) - info.vocabs[input_name] = src_vocab - - tgt_vocab.from_dataset(*_train_ds, field_name=target_name) - tgt_vocab.index_dataset( - *info.datasets.values(), - field_name=target_name, new_field_name=target_name) - - info.vocabs[target_name] = tgt_vocab - - info.datasets['train'], info.datasets['dev'] = info.datasets['train'].split(0.1, shuffle=False) - - for name, dataset in info.datasets.items(): - dataset.set_input(Const.INPUT) - dataset.set_target(Const.TARGET) - - return info - - -def clean_str(sentence, tokenizer, char_lower=False): - """ - heavily borrowed from github - https://github.com/LukeZhuang/Hierarchical-Attention-Network/blob/master/yelp-preprocess.ipynb - :param sentence: is a str - :return: - """ - if char_lower: - sentence = sentence.lower() - import re - nonalpnum = re.compile('[^0-9a-zA-Z?!\']+') - words = tokenizer(sentence) - words_collection = [] - for word in words: - if word in ['-lrb-', '-rrb-', '', '-r', '-l', 'b-']: - continue - tt = nonalpnum.split(word) - t = ''.join(tt) - if t != '': - words_collection.append(t) - - return words_collection - diff --git a/fastNLP/io/dataset_loader.py b/fastNLP/io/dataset_loader.py deleted file mode 100644 index fca0de69..00000000 --- a/fastNLP/io/dataset_loader.py +++ /dev/null @@ -1,121 +0,0 @@ -"""undocumented -.. warning:: - - 本模块将在 `0.5.0版本` 中被废弃,由 :mod:`~fastNLP.io.loader` 和 :mod:`~fastNLP.io.pipe` 模块替代。 - -dataset_loader模块实现了许多 DataSetLoader, 用于读取不同格式的数据, 并返回 `DataSet` , -得到的 :class:`~fastNLP.DataSet` 对象可以直接传入 :class:`~fastNLP.Trainer` 和 :class:`~fastNLP.Tester`, 用于模型的训练和测试。 -以SNLI数据集为例:: - - loader = SNLILoader() - train_ds = loader.load('path/to/train') - dev_ds = loader.load('path/to/dev') - test_ds = loader.load('path/to/test') - - # ... do stuff - -为 fastNLP 提供 DataSetLoader 的开发者请参考 :class:`~fastNLP.io.DataSetLoader` 的介绍。 - -""" -__all__ = [ - 'CSVLoader', - 'JsonLoader', -] - - -from .data_bundle import DataSetLoader -from .file_reader import _read_csv, _read_json -from ..core.dataset import DataSet -from ..core.instance import Instance - - -class JsonLoader(DataSetLoader): - """ - 别名::class:`fastNLP.io.JsonLoader` :class:`fastNLP.io.dataset_loader.JsonLoader` - - 读取json格式数据.数据必须按行存储,每行是一个包含各类属性的json对象 - - :param dict fields: 需要读入的json属性名称, 和读入后在DataSet中存储的field_name - ``fields`` 的 `key` 必须是json对象的属性名. ``fields`` 的 `value` 为读入后在DataSet存储的 `field_name` , - `value` 也可为 ``None`` , 这时读入后的 `field_name` 与json对象对应属性同名 - ``fields`` 可为 ``None`` , 这时,json对象所有属性都保存在DataSet中. Default: ``None`` - :param bool dropna: 是否忽略非法数据,若 ``True`` 则忽略,若 ``False`` ,在遇到非法数据时,抛出 ``ValueError`` . 
- Default: ``False`` - """ - - def __init__(self, fields=None, dropna=False): - super(JsonLoader, self).__init__() - self.dropna = dropna - self.fields = None - self.fields_list = None - if fields: - self.fields = {} - for k, v in fields.items(): - self.fields[k] = k if v is None else v - self.fields_list = list(self.fields.keys()) - - def _load(self, path): - ds = DataSet() - for idx, d in _read_json(path, fields=self.fields_list, dropna=self.dropna): - if self.fields: - ins = {self.fields[k]: v for k, v in d.items()} - else: - ins = d - ds.append(Instance(**ins)) - return ds - - -class CSVLoader(DataSetLoader): - """ - 别名::class:`fastNLP.io.CSVLoader` :class:`fastNLP.io.dataset_loader.CSVLoader` - - 读取CSV格式的数据集。返回 ``DataSet`` - - :param List[str] headers: CSV文件的文件头.定义每一列的属性名称,即返回的DataSet中`field`的名称 - 若为 ``None`` ,则将读入文件的第一行视作 ``headers`` . Default: ``None`` - :param str sep: CSV文件中列与列之间的分隔符. Default: "," - :param bool dropna: 是否忽略非法数据,若 ``True`` 则忽略,若 ``False`` ,在遇到非法数据时,抛出 ``ValueError`` . - Default: ``False`` - """ - - def __init__(self, headers=None, sep=",", dropna=False): - self.headers = headers - self.sep = sep - self.dropna = dropna - - def _load(self, path): - ds = DataSet() - for idx, data in _read_csv(path, headers=self.headers, - sep=self.sep, dropna=self.dropna): - ds.append(Instance(**data)) - return ds - - -def _cut_long_sentence(sent, max_sample_length=200): - """ - 将长于max_sample_length的sentence截成多段,只会在有空格的地方发生截断。 - 所以截取的句子可能长于或者短于max_sample_length - - :param sent: str. - :param max_sample_length: int. - :return: list of str. - """ - sent_no_space = sent.replace(' ', '') - cutted_sentence = [] - if len(sent_no_space) > max_sample_length: - parts = sent.strip().split() - new_line = '' - length = 0 - for part in parts: - length += len(part) - new_line += part + ' ' - if length > max_sample_length: - new_line = new_line[:-1] - cutted_sentence.append(new_line) - length = 0 - new_line = '' - if new_line != '': - cutted_sentence.append(new_line[:-1]) - else: - cutted_sentence.append(sent) - return cutted_sentence diff --git a/fastNLP/io/embed_loader.py b/fastNLP/io/embed_loader.py index 780d91e4..a157901f 100644 --- a/fastNLP/io/embed_loader.py +++ b/fastNLP/io/embed_loader.py @@ -13,7 +13,6 @@ import warnings import numpy as np -from .data_bundle import BaseLoader from ..core.utils import Option from ..core.vocabulary import Vocabulary @@ -32,7 +31,7 @@ class EmbeddingOption(Option): ) -class EmbedLoader(BaseLoader): +class EmbedLoader: """ 别名::class:`fastNLP.io.EmbedLoader` :class:`fastNLP.io.embed_loader.EmbedLoader` @@ -84,9 +83,9 @@ class EmbedLoader(BaseLoader): word = ''.join(parts[:-dim]) nums = parts[-dim:] # 对齐unk与pad - if word==padding and vocab.padding is not None: + if word == padding and vocab.padding is not None: word = vocab.padding - elif word==unknown and vocab.unknown is not None: + elif word == unknown and vocab.unknown is not None: word = vocab.unknown if word in vocab: index = vocab.to_index(word) @@ -171,7 +170,7 @@ class EmbedLoader(BaseLoader): index = vocab.to_index(key) matrix[index] = vec - if (unknown is not None and not found_unknown) or (padding is not None and not found_pad): + if ((unknown is not None) and (not found_unknown)) or ((padding is not None) and (not found_pad)): start_idx = 0 if padding is not None: start_idx += 1 @@ -180,9 +179,9 @@ class EmbedLoader(BaseLoader): mean = np.mean(matrix[start_idx:], axis=0, keepdims=True) std = np.std(matrix[start_idx:], axis=0, keepdims=True) - if (unknown is not None and not found_unknown): + if 
(unknown is not None) and (not found_unknown): matrix[start_idx - 1] = np.random.randn(1, dim).astype(dtype) * std + mean - if (padding is not None and not found_pad): + if (padding is not None) and (not found_pad): matrix[0] = np.random.randn(1, dim).astype(dtype) * std + mean if normalize: diff --git a/fastNLP/io/model_io.py b/fastNLP/io/model_io.py index 22ced1ce..a1899f51 100644 --- a/fastNLP/io/model_io.py +++ b/fastNLP/io/model_io.py @@ -8,10 +8,8 @@ __all__ = [ import torch -from .data_bundle import BaseLoader - -class ModelLoader(BaseLoader): +class ModelLoader: """ 别名::class:`fastNLP.io.ModelLoader` :class:`fastNLP.io.model_io.ModelLoader` diff --git a/fastNLP/models/enas_controller.py b/fastNLP/models/enas_controller.py deleted file mode 100644 index eec820e4..00000000 --- a/fastNLP/models/enas_controller.py +++ /dev/null @@ -1,228 +0,0 @@ -"""undocumented -Code Modified from https://github.com/carpedm20/ENAS-pytorch -A module with NAS controller-related code. -""" - -__all__ = [] - -import collections -import os - -import torch -import torch.nn.functional as F - -from . import enas_utils as utils -from .enas_utils import Node - - -def _construct_dags(prev_nodes, activations, func_names, num_blocks): - """Constructs a set of DAGs based on the actions, i.e., previous nodes and - activation functions, sampled from the controller/policy pi. - - Args: - prev_nodes: Previous node actions from the policy. - activations: Activations sampled from the policy. - func_names: Mapping from activation function names to functions. - num_blocks: Number of blocks in the target RNN cell. - - Returns: - A list of DAGs defined by the inputs. - - RNN cell DAGs are represented in the following way: - - 1. Each element (node) in a DAG is a list of `Node`s. - - 2. The `Node`s in the list dag[i] correspond to the subsequent nodes - that take the output from node i as their own input. - - 3. dag[-1] is the node that takes input from x^{(t)} and h^{(t - 1)}. - dag[-1] always feeds dag[0]. - dag[-1] acts as if `w_xc`, `w_hc`, `w_xh` and `w_hh` are its - weights. - - 4. dag[N - 1] is the node that produces the hidden state passed to - the next timestep. dag[N - 1] is also always a leaf node, and therefore - is always averaged with the other leaf nodes and fed to the output - decoder. - """ - dags = [] - for nodes, func_ids in zip(prev_nodes, activations): - dag = collections.defaultdict(list) - - # add first node - dag[-1] = [Node(0, func_names[func_ids[0]])] - dag[-2] = [Node(0, func_names[func_ids[0]])] - - # add following nodes - for jdx, (idx, func_id) in enumerate(zip(nodes, func_ids[1:])): - dag[utils.to_item(idx)].append(Node(jdx + 1, func_names[func_id])) - - leaf_nodes = set(range(num_blocks)) - dag.keys() - - # merge with avg - for idx in leaf_nodes: - dag[idx] = [Node(num_blocks, 'avg')] - - # This is actually y^{(t)}. h^{(t)} is node N - 1 in - # the graph, where N Is the number of nodes. I.e., h^{(t)} takes - # only one other node as its input. - # last h[t] node - last_node = Node(num_blocks + 1, 'h[t]') - dag[num_blocks] = [last_node] - dags.append(dag) - - return dags - - -class Controller(torch.nn.Module): - """Based on - https://github.com/pytorch/examples/blob/master/word_language_model/model.py - - RL controllers do not necessarily have much to do with - language models. 
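    In this implementation the controller is a single torch.nn.LSTMCell run over a
    learned embedding of its own previous decision; for a cell with `num_blocks`
    blocks it emits 2*num_blocks - 1 decisions (an activation function for the
    first block, then alternating previous-node / activation choices for the
    remaining blocks), each read out by its own linear decoder head.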
- - Base the controller RNN on the GRU from: - https://github.com/ikostrikov/pytorch-a2c-ppo-acktr/blob/master/model.py - """ - def __init__(self, num_blocks=4, controller_hid=100, cuda=False): - torch.nn.Module.__init__(self) - - # `num_tokens` here is just the activation function - # for every even step, - self.shared_rnn_activations = ['tanh', 'ReLU', 'identity', 'sigmoid'] - self.num_tokens = [len(self.shared_rnn_activations)] - self.controller_hid = controller_hid - self.use_cuda = cuda - self.num_blocks = num_blocks - for idx in range(num_blocks): - self.num_tokens += [idx + 1, len(self.shared_rnn_activations)] - self.func_names = self.shared_rnn_activations - - num_total_tokens = sum(self.num_tokens) - - self.encoder = torch.nn.Embedding(num_total_tokens, - controller_hid) - self.lstm = torch.nn.LSTMCell(controller_hid, controller_hid) - - # Perhaps these weights in the decoder should be - # shared? At least for the activation functions, which all have the - # same size. - self.decoders = [] - for idx, size in enumerate(self.num_tokens): - decoder = torch.nn.Linear(controller_hid, size) - self.decoders.append(decoder) - - self._decoders = torch.nn.ModuleList(self.decoders) - - self.reset_parameters() - self.static_init_hidden = utils.keydefaultdict(self.init_hidden) - - def _get_default_hidden(key): - return utils.get_variable( - torch.zeros(key, self.controller_hid), - self.use_cuda, - requires_grad=False) - - self.static_inputs = utils.keydefaultdict(_get_default_hidden) - - def reset_parameters(self): - init_range = 0.1 - for param in self.parameters(): - param.data.uniform_(-init_range, init_range) - for decoder in self.decoders: - decoder.bias.data.fill_(0) - - def forward(self, # pylint:disable=arguments-differ - inputs, - hidden, - block_idx, - is_embed): - if not is_embed: - embed = self.encoder(inputs) - else: - embed = inputs - - hx, cx = self.lstm(embed, hidden) - logits = self.decoders[block_idx](hx) - - logits /= 5.0 - - # # exploration - # if self.args.mode == 'train': - # logits = (2.5 * F.tanh(logits)) - - return logits, (hx, cx) - - def sample(self, batch_size=1, with_details=False, save_dir=None): - """Samples a set of `args.num_blocks` many computational nodes from the - controller, where each node is made up of an activation function, and - each node except the last also includes a previous node. - """ - if batch_size < 1: - raise Exception(f'Wrong batch_size: {batch_size} < 1') - - # [B, L, H] - inputs = self.static_inputs[batch_size] - hidden = self.static_init_hidden[batch_size] - - activations = [] - entropies = [] - log_probs = [] - prev_nodes = [] - # The RNN controller alternately outputs an activation, - # followed by a previous node, for each block except the last one, - # which only gets an activation function. The last node is the output - # node, and its previous node is the average of all leaf nodes. - for block_idx in range(2*(self.num_blocks - 1) + 1): - logits, hidden = self.forward(inputs, - hidden, - block_idx, - is_embed=(block_idx == 0)) - - probs = F.softmax(logits, dim=-1) - log_prob = F.log_softmax(logits, dim=-1) - # .mean() for entropy? - entropy = -(log_prob * probs).sum(1, keepdim=False) - - action = probs.multinomial(num_samples=1).data - selected_log_prob = log_prob.gather( - 1, utils.get_variable(action, requires_grad=False)) - - # why the [:, 0] here? Should it be .squeeze(), or - # .view()? Same below with `action`. 
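# `action` has shape (batch, 1) after probs.multinomial(num_samples=1), so the
# [:, 0] indexing below simply drops the trailing sample dimension. The offset
# sum(self.num_tokens[:mode]) shifts the sampled id so that activation choices
# (mode 0) and previous-node choices (mode 1) occupy disjoint rows of the shared
# `self.encoder` embedding when fed back as the next input.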
- entropies.append(entropy) - log_probs.append(selected_log_prob[:, 0]) - - # 0: function, 1: previous node - mode = block_idx % 2 - inputs = utils.get_variable( - action[:, 0] + sum(self.num_tokens[:mode]), - requires_grad=False) - - if mode == 0: - activations.append(action[:, 0]) - elif mode == 1: - prev_nodes.append(action[:, 0]) - - prev_nodes = torch.stack(prev_nodes).transpose(0, 1) - activations = torch.stack(activations).transpose(0, 1) - - dags = _construct_dags(prev_nodes, - activations, - self.func_names, - self.num_blocks) - - if save_dir is not None: - for idx, dag in enumerate(dags): - utils.draw_network(dag, - os.path.join(save_dir, f'graph{idx}.png')) - - if with_details: - return dags, torch.cat(log_probs), torch.cat(entropies) - - return dags - - def init_hidden(self, batch_size): - zeros = torch.zeros(batch_size, self.controller_hid) - return (utils.get_variable(zeros, self.use_cuda, requires_grad=False), - utils.get_variable(zeros.clone(), self.use_cuda, requires_grad=False)) diff --git a/fastNLP/models/enas_model.py b/fastNLP/models/enas_model.py deleted file mode 100644 index 2e8ca713..00000000 --- a/fastNLP/models/enas_model.py +++ /dev/null @@ -1,393 +0,0 @@ -"""undocumented -Module containing the shared RNN model. -Code Modified from https://github.com/carpedm20/ENAS-pytorch -""" - -__all__ = [] - -import collections - -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.autograd import Variable - -from . import enas_utils as utils -from .base_model import BaseModel - - -def _get_dropped_weights(w_raw, dropout_p, is_training): - """Drops out weights to implement DropConnect. - - Args: - w_raw: Full, pre-dropout, weights to be dropped out. - dropout_p: Proportion of weights to drop out. - is_training: True iff _shared_ model is training. - - Returns: - The dropped weights. - - Why does torch.nn.functional.dropout() return: - 1. `torch.autograd.Variable()` on the training loop - 2. `torch.nn.Parameter()` on the controller or eval loop, when - training = False... - - Even though the call to `_setweights` in the Smerity repo's - `weight_drop.py` does not have this behaviour, and `F.dropout` always - returns `torch.autograd.Variable` there, even when `training=False`? - - The above TODO is the reason for the hacky check for `torch.nn.Parameter`. - """ - dropped_w = F.dropout(w_raw, p=dropout_p, training=is_training) - - if isinstance(dropped_w, torch.nn.Parameter): - dropped_w = dropped_w.clone() - - return dropped_w - - -class EmbeddingDropout(torch.nn.Embedding): - """Class for dropping out embeddings by zero'ing out parameters in the - embedding matrix. - - This is equivalent to dropping out particular words, e.g., in the sentence - 'the quick brown fox jumps over the lazy dog', dropping out 'the' would - lead to the sentence '### quick brown fox jumps over ### lazy dog' (in the - embedding vector space). - - See 'A Theoretically Grounded Application of Dropout in Recurrent Neural - Networks', (Gal and Ghahramani, 2016). - """ - - def __init__(self, - num_embeddings, - embedding_dim, - max_norm=None, - norm_type=2, - scale_grad_by_freq=False, - sparse=False, - dropout=0.1, - scale=None): - """Embedding constructor. - - Args: - dropout: Dropout probability. - scale: Used to scale parameters of embedding weight matrix that are - not dropped out. Note that this is _in addition_ to the - `1/(1 - dropout)` scaling. - - See `torch.nn.Embedding` for remaining arguments. 
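        Concretely, the forward pass below zeroes whole rows of the embedding
        weight matrix with probability `dropout` and rescales the surviving rows
        by 1 / (1 - dropout), which is equivalent to dropping entire word types
        for that batch (no dropout is applied at evaluation time).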
- """ - torch.nn.Embedding.__init__(self, - num_embeddings=num_embeddings, - embedding_dim=embedding_dim, - max_norm=max_norm, - norm_type=norm_type, - scale_grad_by_freq=scale_grad_by_freq, - sparse=sparse) - self.dropout = dropout - assert (dropout >= 0.0) and (dropout < 1.0), ('Dropout must be >= 0.0 ' - 'and < 1.0') - self.scale = scale - - def forward(self, inputs): # pylint:disable=arguments-differ - """Embeds `inputs` with the dropped out embedding weight matrix.""" - if self.training: - dropout = self.dropout - else: - dropout = 0 - - if dropout: - mask = self.weight.data.new(self.weight.size(0), 1) - mask.bernoulli_(1 - dropout) - mask = mask.expand_as(self.weight) - mask = mask / (1 - dropout) - masked_weight = self.weight * Variable(mask) - else: - masked_weight = self.weight - if self.scale and self.scale != 1: - masked_weight = masked_weight * self.scale - - return F.embedding(inputs, - masked_weight, - max_norm=self.max_norm, - norm_type=self.norm_type, - scale_grad_by_freq=self.scale_grad_by_freq, - sparse=self.sparse) - - -class LockedDropout(nn.Module): - # code from https://github.com/salesforce/awd-lstm-lm/blob/master/locked_dropout.py - def __init__(self): - super().__init__() - - def forward(self, x, dropout=0.5): - if not self.training or not dropout: - return x - m = x.data.new(1, x.size(1), x.size(2)).bernoulli_(1 - dropout) - mask = Variable(m, requires_grad=False) / (1 - dropout) - mask = mask.expand_as(x) - return mask * x - - -class ENASModel(BaseModel): - """Shared RNN model.""" - - def __init__(self, embed_num, num_classes, num_blocks=4, cuda=False, shared_hid=1000, shared_embed=1000): - super(ENASModel, self).__init__() - - self.use_cuda = cuda - - self.shared_hid = shared_hid - self.num_blocks = num_blocks - self.decoder = nn.Linear(self.shared_hid, num_classes) - self.encoder = EmbeddingDropout(embed_num, - shared_embed, - dropout=0.1) - self.lockdrop = LockedDropout() - self.dag = None - - # Tie weights - # self.decoder.weight = self.encoder.weight - - # Since W^{x, c} and W^{h, c} are always summed, there - # is no point duplicating their bias offset parameter. Likewise for - # W^{x, h} and W^{h, h}. - self.w_xc = nn.Linear(shared_embed, self.shared_hid) - self.w_xh = nn.Linear(shared_embed, self.shared_hid) - - # The raw weights are stored here because the hidden-to-hidden weights - # are weight dropped on the forward pass. 
- self.w_hc_raw = torch.nn.Parameter( - torch.Tensor(self.shared_hid, self.shared_hid)) - self.w_hh_raw = torch.nn.Parameter( - torch.Tensor(self.shared_hid, self.shared_hid)) - self.w_hc = None - self.w_hh = None - - self.w_h = collections.defaultdict(dict) - self.w_c = collections.defaultdict(dict) - - for idx in range(self.num_blocks): - for jdx in range(idx + 1, self.num_blocks): - self.w_h[idx][jdx] = nn.Linear(self.shared_hid, - self.shared_hid, - bias=False) - self.w_c[idx][jdx] = nn.Linear(self.shared_hid, - self.shared_hid, - bias=False) - - self._w_h = nn.ModuleList([self.w_h[idx][jdx] - for idx in self.w_h - for jdx in self.w_h[idx]]) - self._w_c = nn.ModuleList([self.w_c[idx][jdx] - for idx in self.w_c - for jdx in self.w_c[idx]]) - - self.batch_norm = None - # if args.mode == 'train': - # self.batch_norm = nn.BatchNorm1d(self.shared_hid) - # else: - # self.batch_norm = None - - self.reset_parameters() - self.static_init_hidden = utils.keydefaultdict(self.init_hidden) - - def setDAG(self, dag): - if self.dag is None: - self.dag = dag - - def forward(self, word_seq, hidden=None): - inputs = torch.transpose(word_seq, 0, 1) - - time_steps = inputs.size(0) - batch_size = inputs.size(1) - - self.w_hh = _get_dropped_weights(self.w_hh_raw, - 0.5, - self.training) - self.w_hc = _get_dropped_weights(self.w_hc_raw, - 0.5, - self.training) - - # hidden = self.static_init_hidden[batch_size] if hidden is None else hidden - hidden = self.static_init_hidden[batch_size] - - embed = self.encoder(inputs) - - embed = self.lockdrop(embed, 0.65 if self.training else 0) - - # The norm of hidden states are clipped here because - # otherwise ENAS is especially prone to exploding activations on the - # forward pass. This could probably be fixed in a more elegant way, but - # it might be exposing a weakness in the ENAS algorithm as currently - # proposed. - # - # For more details, see - # https://github.com/carpedm20/ENAS-pytorch/issues/6 - clipped_num = 0 - max_clipped_norm = 0 - h1tohT = [] - logits = [] - for step in range(time_steps): - x_t = embed[step] - logit, hidden = self.cell(x_t, hidden, self.dag) - - hidden_norms = hidden.norm(dim=-1) - max_norm = 25.0 - if hidden_norms.data.max() > max_norm: - # Just directly use the torch slice operations - # in PyTorch v0.4. - # - # This workaround for PyTorch v0.3.1 does everything in numpy, - # because the PyTorch slicing and slice assignment is too - # flaky. 
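Editor's note: the numpy-based workaround referenced above was only needed for PyTorch 0.3.x; on any recent version the same hidden-state norm clipping can be written directly with tensor ops. A sketch of the equivalent operation:

```python
import torch

torch.manual_seed(0)
max_norm = 25.0
hidden = torch.randn(4, 1000) * 3.0        # toy hidden states; some rows exceed the limit

norms = hidden.norm(dim=-1, keepdim=True)
scale = (max_norm / norms).clamp(max=1.0)  # shrink only rows whose norm exceeds max_norm
hidden = hidden * scale                    # every row now has norm <= max_norm
```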
- hidden_norms = hidden_norms.data.cpu().numpy() - - clipped_num += 1 - if hidden_norms.max() > max_clipped_norm: - max_clipped_norm = hidden_norms.max() - - clip_select = hidden_norms > max_norm - clip_norms = hidden_norms[clip_select] - - mask = np.ones(hidden.size()) - normalizer = max_norm / clip_norms - normalizer = normalizer[:, np.newaxis] - - mask[clip_select] = normalizer - - if self.use_cuda: - hidden *= torch.autograd.Variable( - torch.FloatTensor(mask).cuda(), requires_grad=False) - else: - hidden *= torch.autograd.Variable( - torch.FloatTensor(mask), requires_grad=False) - logits.append(logit) - h1tohT.append(hidden) - - h1tohT = torch.stack(h1tohT) - output = torch.stack(logits) - raw_output = output - - output = self.lockdrop(output, 0.4 if self.training else 0) - - # Pooling - output = torch.mean(output, 0) - - decoded = self.decoder(output) - - extra_out = {'dropped': decoded, - 'hiddens': h1tohT, - 'raw': raw_output} - return {'pred': decoded, 'hidden': hidden, 'extra_out': extra_out} - - def cell(self, x, h_prev, dag): - """Computes a single pass through the discovered RNN cell.""" - c = {} - h = {} - f = {} - - f[0] = self.get_f(dag[-1][0].name) - c[0] = torch.sigmoid(self.w_xc(x) + F.linear(h_prev, self.w_hc, None)) - h[0] = (c[0] * f[0](self.w_xh(x) + F.linear(h_prev, self.w_hh, None)) + - (1 - c[0]) * h_prev) - - leaf_node_ids = [] - q = collections.deque() - q.append(0) - - # Computes connections from the parent nodes `node_id` - # to their child nodes `next_id` recursively, skipping leaf nodes. A - # leaf node is a node whose id == `self.num_blocks`. - # - # Connections between parent i and child j should be computed as - # h_j = c_j*f_{ij}{(W^h_{ij}*h_i)} + (1 - c_j)*h_i, - # where c_j = \sigmoid{(W^c_{ij}*h_i)} - # - # See Training details from Section 3.1 of the paper. - # - # The following algorithm does a breadth-first (since `q.popleft()` is - # used) search over the nodes and computes all the hidden states. - while True: - if len(q) == 0: - break - - node_id = q.popleft() - nodes = dag[node_id] - - for next_node in nodes: - next_id = next_node.id - if next_id == self.num_blocks: - leaf_node_ids.append(node_id) - assert len(nodes) == 1, ('parent of leaf node should have ' - 'only one child') - continue - - w_h = self.w_h[node_id][next_id] - w_c = self.w_c[node_id][next_id] - - f[next_id] = self.get_f(next_node.name) - c[next_id] = torch.sigmoid(w_c(h[node_id])) - h[next_id] = (c[next_id] * f[next_id](w_h(h[node_id])) + - (1 - c[next_id]) * h[node_id]) - - q.append(next_id) - - # Instead of averaging loose ends, perhaps there should - # be a set of separate unshared weights for each "loose" connection - # between each node in a cell and the output. - # - # As it stands, all weights W^h_{ij} are doing double duty by - # connecting both from i to j, as well as from i to the output. 
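Editor's note: each parent-to-child edge in the sampled cell applies the highway-style update quoted in the comments above, h_j = c_j * f_ij(W^h_ij h_i) + (1 - c_j) * h_i with c_j = sigmoid(W^c_ij h_i). A self-contained sketch of one such edge, where the sizes and the activation are arbitrary stand-ins:

```python
import torch
import torch.nn as nn

torch.manual_seed(0)
hid = 16
h_i = torch.randn(4, hid)                  # parent node's hidden state (toy batch)
w_h = nn.Linear(hid, hid, bias=False)      # W^h_{ij}
w_c = nn.Linear(hid, hid, bias=False)      # W^c_{ij}
f_ij = torch.tanh                          # activation chosen by the controller

c_j = torch.sigmoid(w_c(h_i))              # gate: how much of the parent state to overwrite
h_j = c_j * f_ij(w_h(h_i)) + (1.0 - c_j) * h_i
```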
- - # average all the loose ends - leaf_nodes = [h[node_id] for node_id in leaf_node_ids] - output = torch.mean(torch.stack(leaf_nodes, 2), -1) - - # stabilizing the Updates of omega - if self.batch_norm is not None: - output = self.batch_norm(output) - - return output, h[self.num_blocks - 1] - - def init_hidden(self, batch_size): - zeros = torch.zeros(batch_size, self.shared_hid) - return utils.get_variable(zeros, self.use_cuda, requires_grad=False) - - def get_f(self, name): - name = name.lower() - if name == 'relu': - f = torch.relu - elif name == 'tanh': - f = torch.tanh - elif name == 'identity': - f = lambda x: x - elif name == 'sigmoid': - f = torch.sigmoid - return f - - @property - def num_parameters(self): - def size(p): - return np.prod(p.size()) - - return sum([size(param) for param in self.parameters()]) - - def reset_parameters(self): - init_range = 0.025 - # init_range = 0.025 if self.args.mode == 'train' else 0.04 - for param in self.parameters(): - param.data.uniform_(-init_range, init_range) - self.decoder.bias.data.fill_(0) - - def predict(self, word_seq): - """ - - :param word_seq: torch.LongTensor, [batch_size, seq_len] - :return predict: dict of torch.LongTensor, [batch_size, seq_len] - """ - output = self(word_seq) - _, predict = output['pred'].max(dim=1) - return {'pred': predict} diff --git a/fastNLP/models/enas_trainer.py b/fastNLP/models/enas_trainer.py deleted file mode 100644 index 98d778cd..00000000 --- a/fastNLP/models/enas_trainer.py +++ /dev/null @@ -1,384 +0,0 @@ -"""undocumented -Code Modified from https://github.com/carpedm20/ENAS-pytorch -""" - -__all__ = [] - -import math -import time -from datetime import datetime, timedelta - -import numpy as np -import torch -from torch.optim import Adam - -try: - from tqdm.auto import tqdm -except: - from ..core.utils import _pseudo_tqdm as tqdm - -from ..core.trainer import Trainer -from ..core.batch import DataSetIter -from ..core.callback import CallbackException -from ..core.dataset import DataSet -from ..core.utils import _move_dict_value_to_device -from . import enas_utils as utils -from ..core.utils import _build_args - - -def _get_no_grad_ctx_mgr(): - """Returns a the `torch.no_grad` context manager for PyTorch version >= - 0.4, or a no-op context manager otherwise. - """ - return torch.no_grad() - - -class ENASTrainer(Trainer): - """A class to wrap training code.""" - - def __init__(self, train_data, model, controller, **kwargs): - """Constructor for training algorithm. 
- :param DataSet train_data: the training data - :param torch.nn.modules.module model: a PyTorch model - :param torch.nn.modules.module controller: a PyTorch model - """ - self.final_epochs = kwargs['final_epochs'] - kwargs.pop('final_epochs') - super(ENASTrainer, self).__init__(train_data, model, **kwargs) - self.controller_step = 0 - self.shared_step = 0 - self.max_length = 35 - - self.shared = model - self.controller = controller - - self.shared_optim = Adam( - self.shared.parameters(), - lr=20.0, - weight_decay=1e-7) - - self.controller_optim = Adam( - self.controller.parameters(), - lr=3.5e-4) - - def train(self, load_best_model=True): - """ - :param bool load_best_model: 该参数只有在初始化提供了dev_data的情况下有效,如果True, trainer将在返回之前重新加载dev表现 - 最好的模型参数。 - :return results: 返回一个字典类型的数据, - 内含以下内容:: - - seconds: float, 表示训练时长 - 以下三个内容只有在提供了dev_data的情况下会有。 - best_eval: Dict of Dict, 表示evaluation的结果 - best_epoch: int,在第几个epoch取得的最佳值 - best_step: int, 在第几个step(batch)更新取得的最佳值 - - """ - results = {} - if self.n_epochs <= 0: - print(f"training epoch is {self.n_epochs}, nothing was done.") - results['seconds'] = 0. - return results - try: - if torch.cuda.is_available() and "cuda" in self.device: - self.model = self.model.cuda() - self._model_device = self.model.parameters().__next__().device - self._mode(self.model, is_test=False) - - self.start_time = str(datetime.now().strftime('%Y-%m-%d-%H-%M-%S')) - start_time = time.time() - print("training epochs started " + self.start_time, flush=True) - - try: - self.callback_manager.on_train_begin() - self._train() - self.callback_manager.on_train_end() - except (CallbackException, KeyboardInterrupt) as e: - self.callback_manager.on_exception(e) - - if self.dev_data is not None: - print( - "\nIn Epoch:{}/Step:{}, got best dev performance:".format(self.best_dev_epoch, self.best_dev_step) + - self.tester._format_eval_results(self.best_dev_perf), ) - results['best_eval'] = self.best_dev_perf - results['best_epoch'] = self.best_dev_epoch - results['best_step'] = self.best_dev_step - if load_best_model: - model_name = "best_" + "_".join([self.model.__class__.__name__, self.metric_key, self.start_time]) - load_succeed = self._load_model(self.model, model_name) - if load_succeed: - print("Reloaded the best model.") - else: - print("Fail to reload best model.") - finally: - pass - results['seconds'] = round(time.time() - start_time, 2) - - return results - - def _train(self): - if not self.use_tqdm: - from fastNLP.core.utils import _pseudo_tqdm as inner_tqdm - else: - inner_tqdm = tqdm - self.step = 0 - start = time.time() - total_steps = (len(self.train_data) // self.batch_size + int( - len(self.train_data) % self.batch_size != 0)) * self.n_epochs - with inner_tqdm(total=total_steps, postfix='loss:{0:<6.5f}', leave=False, dynamic_ncols=True) as pbar: - avg_loss = 0 - data_iterator = DataSetIter(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, - prefetch=self.prefetch) - for epoch in range(1, self.n_epochs + 1): - pbar.set_description_str(desc="Epoch {}/{}".format(epoch, self.n_epochs)) - last_stage = (epoch > self.n_epochs + 1 - self.final_epochs) - if epoch == self.n_epochs + 1 - self.final_epochs: - print('Entering the final stage. (Only train the selected structure)') - # early stopping - self.callback_manager.on_epoch_begin() - - # 1. Training the shared parameters omega of the child models - self.train_shared(pbar) - - # 2. 
Training the controller parameters theta - if not last_stage: - self.train_controller() - - if ((self.validate_every > 0 and self.step % self.validate_every == 0) or - (self.validate_every < 0 and self.step % len(data_iterator) == 0)) \ - and self.dev_data is not None: - if not last_stage: - self.derive() - eval_res = self._do_validation(epoch=epoch, step=self.step) - eval_str = "Evaluation at Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step, - total_steps) + \ - self.tester._format_eval_results(eval_res) - pbar.write(eval_str) - - # lr decay; early stopping - self.callback_manager.on_epoch_end() - # =============== epochs end =================== # - pbar.close() - # ============ tqdm end ============== # - - def get_loss(self, inputs, targets, hidden, dags): - """Computes the loss for the same batch for M models. - - This amounts to an estimate of the loss, which is turned into an - estimate for the gradients of the shared model. - """ - if not isinstance(dags, list): - dags = [dags] - - loss = 0 - for dag in dags: - self.shared.setDAG(dag) - inputs = _build_args(self.shared.forward, **inputs) - inputs['hidden'] = hidden - result = self.shared(**inputs) - output, hidden, extra_out = result['pred'], result['hidden'], result['extra_out'] - - self.callback_manager.on_loss_begin(targets, result) - sample_loss = self._compute_loss(result, targets) - loss += sample_loss - - assert len(dags) == 1, 'there are multiple `hidden` for multple `dags`' - return loss, hidden, extra_out - - def train_shared(self, pbar=None, max_step=None, dag=None): - """Train the language model for 400 steps of minibatches of 64 - examples. - - Args: - max_step: Used to run extra training steps as a warm-up. - dag: If not None, is used instead of calling sample(). - - BPTT is truncated at 35 timesteps. - - For each weight update, gradients are estimated by sampling M models - from the fixed controller policy, and averaging their gradients - computed on a batch of training data. - """ - model = self.shared - model.train() - self.controller.eval() - - hidden = self.shared.init_hidden(self.batch_size) - - abs_max_grad = 0 - abs_max_hidden_norm = 0 - step = 0 - raw_total_loss = 0 - total_loss = 0 - train_idx = 0 - avg_loss = 0 - data_iterator = DataSetIter(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, - prefetch=self.prefetch) - - for batch_x, batch_y in data_iterator: - _move_dict_value_to_device(batch_x, batch_y, device=self._model_device) - indices = data_iterator.get_batch_indices() - # negative sampling; replace unknown; re-weight batch_y - self.callback_manager.on_batch_begin(batch_x, batch_y, indices) - # prediction = self._data_forward(self.model, batch_x) - - dags = self.controller.sample(1) - inputs, targets = batch_x, batch_y - # self.callback_manager.on_loss_begin(batch_y, prediction) - loss, hidden, extra_out = self.get_loss(inputs, - targets, - hidden, - dags) - hidden.detach_() - - avg_loss += loss.item() - - # Is loss NaN or inf? 
requires_grad = False - self.callback_manager.on_backward_begin(loss) - self._grad_backward(loss) - self.callback_manager.on_backward_end() - - self._update() - self.callback_manager.on_step_end() - - if (self.step + 1) % self.print_every == 0: - if self.use_tqdm: - print_output = "loss:{0:<6.5f}".format(avg_loss / self.print_every) - pbar.update(self.print_every) - else: - end = time.time() - diff = timedelta(seconds=round(end - start)) - print_output = "[epoch: {:>3} step: {:>4}] train loss: {:>4.6} time: {}".format( - epoch, self.step, avg_loss, diff) - pbar.set_postfix_str(print_output) - avg_loss = 0 - self.step += 1 - step += 1 - self.shared_step += 1 - self.callback_manager.on_batch_end() - # ================= mini-batch end ==================== # - - def get_reward(self, dag, entropies, hidden, valid_idx=0): - """Computes the perplexity of a single sampled model on a minibatch of - validation data. - """ - if not isinstance(entropies, np.ndarray): - entropies = entropies.data.cpu().numpy() - - data_iterator = DataSetIter(self.dev_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, - prefetch=self.prefetch) - - for inputs, targets in data_iterator: - valid_loss, hidden, _ = self.get_loss(inputs, targets, hidden, dag) - valid_loss = utils.to_item(valid_loss.data) - - valid_ppl = math.exp(valid_loss) - - R = 80 / valid_ppl - - rewards = R + 1e-4 * entropies - - return rewards, hidden - - def train_controller(self): - """Fixes the shared parameters and updates the controller parameters. - - The controller is updated with a score function gradient estimator - (i.e., REINFORCE), with the reward being c/valid_ppl, where valid_ppl - is computed on a minibatch of validation data. - - A moving average baseline is used. - - The controller is trained for 2000 steps per epoch (i.e., - first (Train Shared) phase -> second (Train Controller) phase). - """ - model = self.controller - model.train() - # Why can't we call shared.eval() here? Leads to loss - # being uniformly zero for the controller. - # self.shared.eval() - - avg_reward_base = None - baseline = None - adv_history = [] - entropy_history = [] - reward_history = [] - - hidden = self.shared.init_hidden(self.batch_size) - total_loss = 0 - valid_idx = 0 - for step in range(20): - # sample models - dags, log_probs, entropies = self.controller.sample( - with_details=True) - - # calculate reward - np_entropies = entropies.data.cpu().numpy() - # No gradients should be backpropagated to the - # shared model during controller training, obviously. 
- with _get_no_grad_ctx_mgr(): - rewards, hidden = self.get_reward(dags, - np_entropies, - hidden, - valid_idx) - - reward_history.extend(rewards) - entropy_history.extend(np_entropies) - - # moving average baseline - if baseline is None: - baseline = rewards - else: - decay = 0.95 - baseline = decay * baseline + (1 - decay) * rewards - - adv = rewards - baseline - adv_history.extend(adv) - - # policy loss - loss = -log_probs * utils.get_variable(adv, - 'cuda' in self.device, - requires_grad=False) - - loss = loss.sum() # or loss.mean() - - # update - self.controller_optim.zero_grad() - loss.backward() - - self.controller_optim.step() - - total_loss += utils.to_item(loss.data) - - if ((step % 50) == 0) and (step > 0): - reward_history, adv_history, entropy_history = [], [], [] - total_loss = 0 - - self.controller_step += 1 - # prev_valid_idx = valid_idx - # valid_idx = ((valid_idx + self.max_length) % - # (self.valid_data.size(0) - 1)) - # # Whenever we wrap around to the beginning of the - # # validation data, we reset the hidden states. - # if prev_valid_idx > valid_idx: - # hidden = self.shared.init_hidden(self.batch_size) - - def derive(self, sample_num=10, valid_idx=0): - """We are always deriving based on the very first batch - of validation data? This seems wrong... - """ - hidden = self.shared.init_hidden(self.batch_size) - - dags, _, entropies = self.controller.sample(sample_num, - with_details=True) - - max_R = 0 - best_dag = None - for dag in dags: - R, _ = self.get_reward(dag, entropies, hidden, valid_idx) - if R.max() > max_R: - max_R = R.max() - best_dag = dag - - self.model.setDAG(best_dag) diff --git a/fastNLP/models/enas_utils.py b/fastNLP/models/enas_utils.py deleted file mode 100644 index cd6c2503..00000000 --- a/fastNLP/models/enas_utils.py +++ /dev/null @@ -1,58 +0,0 @@ -"""undocumented -Code Modified from https://github.com/carpedm20/ENAS-pytorch -""" - -__all__ = [] - -import collections -from collections import defaultdict - -import numpy as np -import torch -from torch.autograd import Variable - - -def detach(h): - if type(h) == Variable: - return Variable(h.data) - else: - return tuple(detach(v) for v in h) - - -def get_variable(inputs, cuda=False, **kwargs): - if type(inputs) in [list, np.ndarray]: - inputs = torch.Tensor(inputs) - if cuda: - out = Variable(inputs.cuda(), **kwargs) - else: - out = Variable(inputs, **kwargs) - return out - - -def update_lr(optimizer, lr): - for param_group in optimizer.param_groups: - param_group['lr'] = lr - - -Node = collections.namedtuple('Node', ['id', 'name']) - - -class keydefaultdict(defaultdict): - def __missing__(self, key): - if self.default_factory is None: - raise KeyError(key) - else: - ret = self[key] = self.default_factory(key) - return ret - - -def to_item(x): - """Converts x, possibly scalar and possibly tensor, to a Python scalar.""" - if isinstance(x, (float, int)): - return x - - if float(torch.__version__[0:3]) < 0.4: - assert (x.dim() == 1) and (len(x) == 1) - return x[0] - - return x.item() diff --git a/legacy/api/README.md b/legacy/api/README.md deleted file mode 100644 index 73560f9f..00000000 --- a/legacy/api/README.md +++ /dev/null @@ -1,44 +0,0 @@ -# fastNLP 高级接口 - -### 环境与配置 -1. 系统环境:linux/ubuntu(推荐) -2. 编程语言:Python>=3.6 -3. 
Python包依赖 - - **torch==1.0** - - numpy>=1.14.2 - -### 中文分词 -```python -text = ['编者按:7月12日,英国航空航天系统公司公布了该公司研制的第一款高科技隐形无人机雷电之神。', - '这款飞行从外型上来看酷似电影中的太空飞行器,据英国方面介绍,可以实现洲际远程打击。', - '那么这款无人机到底有多厉害?'] -from fastNLP.api import CWS -cws = CWS(device='cpu') -print(cws.predict(text)) -# ['编者 按 : 7月 12日 , 英国 航空 航天 系统 公司 公布 了 该 公司 研制 的 第一 款 高 科技 隐形 无人 机雷电 之 神 。', '这 款 飞行 从 外型 上 来 看 酷似 电影 中 的 太空 飞行器 , 据 英国 方面 介绍 , 可以 实现 洲际 远程 打击 。', '那么 这 款 无人 机 到底 有 多 厉害 ?'] -``` - -### 词性标注 -```python -# 输入已分词序列 -text = [['编者', '按:', '7月', '12日', ',', '英国', '航空', '航天', '系统', '公司', '公布', '了', '该', '公司', - '研制', '的', '第一款', '高科技', '隐形', '无人机', '雷电之神', '。'], - ['那么', '这', '款', '无人机', '到底', '有', '多', '厉害', '?']] -from fastNLP.api import POS -pos = POS(device='cpu') -print(pos.predict(text)) -# [['编者/NN', '按:/NN', '7月/NT', '12日/NT', ',/PU', '英国/NR', '航空/NN', '航天/NN', '系统/NN', '公司/NN', '公布/VV', '了/AS', '该/DT', '公司/NN', '研制/VV', '的/DEC', '第一款/NN', '高科技/NN', '隐形/AD', '无人机/VV', '雷电之神/NN', '。/PU'], ['那么/AD', '这/DT', '款/NN', '无人机/VV', '到底/AD', '有/VE', '多/AD', '厉害/VA', '?/PU']] -``` - -### 句法分析 -```python -text = [['编者', '按:', '7月', '12日', ',', '英国', '航空', '航天', '系统', '公司', '公布', '了', '该', '公司', - '研制', '的', '第一款', '高科技', '隐形', '无人机', '雷电之神', '。'], - ['那么', '这', '款', '无人机', '到底', '有', '多', '厉害', '?']] -from fastNLP.api import Parser -parser = Parser(device='cpu') -print(parser.predict(text)) -# [['2/nn', '4/nn', '4/nn', '20/tmod', '11/punct', '10/nn', '10/nn', '10/nn', '10/nn', '11/nsubj', '20/dep', '11/asp', '14/det', '15/nsubj', '18/rcmod', '15/cpm', '18/nn', '11/dobj', '20/advmod', '0/root', '20/dobj', '20/punct'], ['4/advmod', '3/det', '8/xsubj', '8/dep', '8/advmod', '8/dep', '8/advmod', '0/root', '8/punct']] -``` - -完整样例见`examples.py` \ No newline at end of file diff --git a/legacy/api/__init__.py b/legacy/api/__init__.py deleted file mode 100644 index 5171d8c2..00000000 --- a/legacy/api/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -__all__ = ["CWS", "POS", "Parser"] -from .api import CWS, POS, Parser diff --git a/legacy/api/api.py b/legacy/api/api.py deleted file mode 100644 index 1408731f..00000000 --- a/legacy/api/api.py +++ /dev/null @@ -1,463 +0,0 @@ -import warnings - -import torch - -warnings.filterwarnings('ignore') -import os - -from fastNLP.core.dataset import DataSet -from .utils import load_url -from .processor import ModelProcessor -from fastNLP.io.dataset_loader import _cut_long_sentence -from fastNLP.io.data_loader import ConllLoader -from fastNLP.core.instance import Instance -from ..api.pipeline import Pipeline -from fastNLP.core.metrics import SpanFPreRecMetric -from .processor import IndexerProcessor - -# TODO add pretrain urls -model_urls = { - "cws": "http://123.206.98.91:8888/download/cws_lstm_ctb9_1_20-09908656.pkl", - "pos": "http://123.206.98.91:8888/download/pos_tag_model_20190119-43f8b435.pkl", - "parser": "http://123.206.98.91:8888/download/parser_20190204-c72ca5c0.pkl" -} - - -class ConllCWSReader(object): - """Deprecated. 
Use ConllLoader for all types of conll-format files.""" - - def __init__(self): - pass - - def load(self, path, cut_long_sent=False): - """ - 返回的DataSet只包含raw_sentence这个field,内容为str。 - 假定了输入为conll的格式,以空行隔开两个句子,每行共7列,即 - :: - - 1 编者按 编者按 NN O 11 nmod:topic - 2 : : PU O 11 punct - 3 7月 7月 NT DATE 4 compound:nn - 4 12日 12日 NT DATE 11 nmod:tmod - 5 , , PU O 11 punct - - 1 这 这 DT O 3 det - 2 款 款 M O 1 mark:clf - 3 飞行 飞行 NN O 8 nsubj - 4 从 从 P O 5 case - 5 外型 外型 NN O 8 nmod:prep - - """ - datalist = [] - with open(path, 'r', encoding='utf-8') as f: - sample = [] - for line in f: - if line.startswith('\n'): - datalist.append(sample) - sample = [] - elif line.startswith('#'): - continue - else: - sample.append(line.strip().split()) - if len(sample) > 0: - datalist.append(sample) - - ds = DataSet() - for sample in datalist: - # print(sample) - res = self.get_char_lst(sample) - if res is None: - continue - line = ' '.join(res) - if cut_long_sent: - sents = _cut_long_sentence(line) - else: - sents = [line] - for raw_sentence in sents: - ds.append(Instance(raw_sentence=raw_sentence)) - return ds - - def get_char_lst(self, sample): - if len(sample) == 0: - return None - text = [] - for w in sample: - t1, t2, t3, t4 = w[1], w[3], w[6], w[7] - if t3 == '_': - return None - text.append(t1) - return text - - -class ConllxDataLoader(ConllLoader): - """返回“词级别”的标签信息,包括词、词性、(句法)头依赖、(句法)边标签。跟``ZhConllPOSReader``完全不同。 - - Deprecated. Use ConllLoader for all types of conll-format files. - """ - - def __init__(self): - headers = [ - 'words', 'pos_tags', 'heads', 'labels', - ] - indexs = [ - 1, 3, 6, 7, - ] - super(ConllxDataLoader, self).__init__(headers=headers, indexes=indexs) - - -class API: - def __init__(self): - self.pipeline = None - self._dict = None - - def predict(self, *args, **kwargs): - """Do prediction for the given input. - """ - raise NotImplementedError - - def test(self, file_path): - """Test performance over the given data set. - - :param str file_path: - :return: a dictionary of metric values - """ - raise NotImplementedError - - def load(self, path, device): - if os.path.exists(os.path.expanduser(path)): - _dict = torch.load(path, map_location='cpu') - else: - _dict = load_url(path, map_location='cpu') - self._dict = _dict - self.pipeline = _dict['pipeline'] - for processor in self.pipeline.pipeline: - if isinstance(processor, ModelProcessor): - processor.set_model_device(device) - - -class POS(API): - """FastNLP API for Part-Of-Speech tagging. - - :param str model_path: the path to the model. - :param str device: device name such as "cpu" or "cuda:0". Use the same notation as PyTorch. - - """ - - def __init__(self, model_path=None, device='cpu'): - super(POS, self).__init__() - if model_path is None: - model_path = model_urls['pos'] - - self.load(model_path, device) - - def predict(self, content): - """predict函数的介绍, - 函数介绍的第二句,这句话不会换行 - - :param content: list of list of str. Each string is a token(word). - :return answer: list of list of str. Each string is a tag. - """ - if not hasattr(self, "pipeline"): - raise ValueError("You have to load model first.") - - sentence_list = content - # 1. 检查sentence的类型 - for sentence in sentence_list: - if not all((type(obj) == str for obj in sentence)): - raise ValueError("Input must be list of list of string.") - - # 2. 组建dataset - dataset = DataSet() - dataset.add_field("words", sentence_list) - - # 3. 
使用pipeline - self.pipeline(dataset) - - def merge_tag(words_list, tags_list): - rtn = [] - for words, tags in zip(words_list, tags_list): - rtn.append([w + "/" + t for w, t in zip(words, tags)]) - return rtn - - output = dataset.field_arrays["tag"].content - if isinstance(content, str): - return output[0] - elif isinstance(content, list): - return merge_tag(content, output) - - def test(self, file_path): - test_data = ConllxDataLoader().load(file_path) - - save_dict = self._dict - tag_vocab = save_dict["tag_vocab"] - pipeline = save_dict["pipeline"] - index_tag = IndexerProcessor(vocab=tag_vocab, field_name="tag", new_added_field_name="truth", is_input=False) - pipeline.pipeline = [index_tag] + pipeline.pipeline - - test_data.rename_field("pos_tags", "tag") - pipeline(test_data) - test_data.set_target("truth") - prediction = test_data.field_arrays["predict"].content - truth = test_data.field_arrays["truth"].content - seq_len = test_data.field_arrays["word_seq_origin_len"].content - - # padding by hand - max_length = max([len(seq) for seq in prediction]) - for idx in range(len(prediction)): - prediction[idx] = list(prediction[idx]) + ([0] * (max_length - len(prediction[idx]))) - truth[idx] = list(truth[idx]) + ([0] * (max_length - len(truth[idx]))) - evaluator = SpanFPreRecMetric(tag_vocab=tag_vocab, pred="predict", target="truth", - seq_len="word_seq_origin_len") - evaluator({"predict": torch.Tensor(prediction), "word_seq_origin_len": torch.Tensor(seq_len)}, - {"truth": torch.Tensor(truth)}) - test_result = evaluator.get_metric() - f1 = round(test_result['f'] * 100, 2) - pre = round(test_result['pre'] * 100, 2) - rec = round(test_result['rec'] * 100, 2) - - return {"F1": f1, "precision": pre, "recall": rec} - - -class CWS(API): - """ - 中文分词高级接口。 - - :param model_path: 当model_path为None,使用默认位置的model。如果默认位置不存在,则自动下载模型 - :param device: str,可以为'cpu', 'cuda'或'cuda:0'等。会将模型load到相应device进行推断。 - """ - - def __init__(self, model_path=None, device='cpu'): - - super(CWS, self).__init__() - if model_path is None: - model_path = model_urls['cws'] - - self.load(model_path, device) - - def predict(self, content): - """ - 分词接口。 - - :param content: str或List[str], 例如: "中文分词很重要!", 返回的结果是"中文 分词 很 重要 !"。 如果传入的为List[str],比如 - [ "中文分词很重要!", ...], 返回的结果["中文 分词 很 重要 !", ...]。 - :return: str或List[str], 根据输入的的类型决定。 - """ - if not hasattr(self, 'pipeline'): - raise ValueError("You have to load model first.") - - sentence_list = [] - # 1. 检查sentence的类型 - if isinstance(content, str): - sentence_list.append(content) - elif isinstance(content, list): - sentence_list = content - - # 2. 组建dataset - dataset = DataSet() - dataset.add_field('raw_sentence', sentence_list) - - # 3. 使用pipeline - self.pipeline(dataset) - - output = dataset.get_field('output').content - if isinstance(content, str): - return output[0] - elif isinstance(content, list): - return output - - def test(self, filepath): - """ - 传入一个分词文件路径,返回该数据集上分词f1, precision, recall。 - 分词文件应该为:: - - 1 编者按 编者按 NN O 11 nmod:topic - 2 : : PU O 11 punct - 3 7月 7月 NT DATE 4 compound:nn - 4 12日 12日 NT DATE 11 nmod:tmod - 5 , , PU O 11 punct - - 1 这 这 DT O 3 det - 2 款 款 M O 1 mark:clf - 3 飞行 飞行 NN O 8 nsubj - 4 从 从 P O 5 case - 5 外型 外型 NN O 8 nmod:prep - - 以空行分割两个句子,有内容的每行有7列。 - - :param filepath: str, 文件路径路径。 - :return: float, float, float. 分别f1, precision, recall. 
- """ - tag_proc = self._dict['tag_proc'] - cws_model = self.pipeline.pipeline[-2].model - pipeline = self.pipeline.pipeline[:-2] - - pipeline.insert(1, tag_proc) - pp = Pipeline(pipeline) - - reader = ConllCWSReader() - - # te_filename = '/home/hyan/ctb3/test.conllx' - te_dataset = reader.load(filepath) - pp(te_dataset) - - from ..core.tester import Tester - from ..core.metrics import SpanFPreRecMetric - - tester = Tester(data=te_dataset, model=cws_model, metrics=SpanFPreRecMetric(tag_proc.get_vocab()), batch_size=64, - verbose=0) - eval_res = tester.test() - - f1 = eval_res['SpanFPreRecMetric']['f'] - pre = eval_res['SpanFPreRecMetric']['pre'] - rec = eval_res['SpanFPreRecMetric']['rec'] - # print("f1:{:.2f}, pre:{:.2f}, rec:{:.2f}".format(f1, pre, rec)) - - return {"F1": f1, "precision": pre, "recall": rec} - - -class Parser(API): - def __init__(self, model_path=None, device='cpu'): - super(Parser, self).__init__() - if model_path is None: - model_path = model_urls['parser'] - - self.pos_tagger = POS(device=device) - self.load(model_path, device) - - def predict(self, content): - if not hasattr(self, 'pipeline'): - raise ValueError("You have to load model first.") - - # 1. 利用POS得到分词和pos tagging结果 - pos_out = self.pos_tagger.predict(content) - # pos_out = ['这里/NN 是/VB 分词/NN 结果/NN'.split()] - - # 2. 组建dataset - dataset = DataSet() - dataset.add_field('wp', pos_out) - dataset.apply(lambda x: [''] + [w.split('/')[0] for w in x['wp']], new_field_name='words') - dataset.apply(lambda x: [''] + [w.split('/')[1] for w in x['wp']], new_field_name='pos') - dataset.rename_field("words", "raw_words") - - # 3. 使用pipeline - self.pipeline(dataset) - dataset.apply(lambda x: [str(arc) for arc in x['arc_pred']], new_field_name='arc_pred') - dataset.apply(lambda x: [arc + '/' + label for arc, label in - zip(x['arc_pred'], x['label_pred_seq'])][1:], new_field_name='output') - # output like: [['2/top', '0/root', '4/nn', '2/dep']] - return dataset.field_arrays['output'].content - - def load_test_file(self, path): - def get_one(sample): - sample = list(map(list, zip(*sample))) - if len(sample) == 0: - return None - for w in sample[7]: - if w == '_': - print('Error Sample {}'.format(sample)) - return None - # return word_seq, pos_seq, head_seq, head_tag_seq - return sample[1], sample[3], list(map(int, sample[6])), sample[7] - - datalist = [] - with open(path, 'r', encoding='utf-8') as f: - sample = [] - for line in f: - if line.startswith('\n'): - datalist.append(sample) - sample = [] - elif line.startswith('#'): - continue - else: - sample.append(line.split('\t')) - if len(sample) > 0: - datalist.append(sample) - - data = [get_one(sample) for sample in datalist] - data_list = list(filter(lambda x: x is not None, data)) - return data_list - - def test(self, filepath): - data = self.load_test_file(filepath) - - def convert(data): - BOS = '' - dataset = DataSet() - for sample in data: - word_seq = [BOS] + sample[0] - pos_seq = [BOS] + sample[1] - heads = [0] + sample[2] - head_tags = [BOS] + sample[3] - dataset.append(Instance(raw_words=word_seq, - pos=pos_seq, - gold_heads=heads, - arc_true=heads, - tags=head_tags)) - return dataset - - ds = convert(data) - pp = self.pipeline - for p in pp: - if p.field_name == 'word_list': - p.field_name = 'gold_words' - elif p.field_name == 'pos_list': - p.field_name = 'gold_pos' - # ds.rename_field("words", "raw_words") - # ds.rename_field("tag", "pos") - pp(ds) - head_cor, label_cor, total = 0, 0, 0 - for ins in ds: - head_gold = ins['gold_heads'] - head_pred = 
ins['arc_pred'] - length = len(head_gold) - total += length - for i in range(length): - head_cor += 1 if head_pred[i] == head_gold[i] else 0 - uas = head_cor / total - # print('uas:{:.2f}'.format(uas)) - - for p in pp: - if p.field_name == 'gold_words': - p.field_name = 'word_list' - elif p.field_name == 'gold_pos': - p.field_name = 'pos_list' - - return {"USA": round(uas, 5)} - - -class Analyzer: - def __init__(self, device='cpu'): - - self.cws = CWS(device=device) - self.pos = POS(device=device) - self.parser = Parser(device=device) - - def predict(self, content, seg=False, pos=False, parser=False): - if seg is False and pos is False and parser is False: - seg = True - output_dict = {} - if seg: - seg_output = self.cws.predict(content) - output_dict['seg'] = seg_output - if pos: - pos_output = self.pos.predict(content) - output_dict['pos'] = pos_output - if parser: - parser_output = self.parser.predict(content) - output_dict['parser'] = parser_output - - return output_dict - - def test(self, filepath): - output_dict = {} - if self.cws: - seg_output = self.cws.test(filepath) - output_dict['seg'] = seg_output - if self.pos: - pos_output = self.pos.test(filepath) - output_dict['pos'] = pos_output - if self.parser: - parser_output = self.parser.test(filepath) - output_dict['parser'] = parser_output - - return output_dict diff --git a/legacy/api/converter.py b/legacy/api/converter.py deleted file mode 100644 index 4e03e465..00000000 --- a/legacy/api/converter.py +++ /dev/null @@ -1,181 +0,0 @@ -import re - - -class SpanConverter: - def __init__(self, replace_tag, pattern): - super(SpanConverter, self).__init__() - - self.replace_tag = replace_tag - self.pattern = pattern - - def find_certain_span_and_replace(self, sentence): - replaced_sentence = '' - prev_end = 0 - for match in re.finditer(self.pattern, sentence): - start, end = match.span() - span = sentence[start:end] - replaced_sentence += sentence[prev_end:start] + self.span_to_special_tag(span) - prev_end = end - replaced_sentence += sentence[prev_end:] - - return replaced_sentence - - def span_to_special_tag(self, span): - - return self.replace_tag - - def find_certain_span(self, sentence): - spans = [] - for match in re.finditer(self.pattern, sentence): - spans.append(match.span()) - return spans - - -class AlphaSpanConverter(SpanConverter): - def __init__(self): - replace_tag = '' - # 理想状态下仅处理纯为字母的情况, 但不处理<[a-zA-Z]+>(因为这应该是特殊的tag). - pattern = '[a-zA-Z]+(?=[\u4e00-\u9fff ,%.!<\\-"])' - - super(AlphaSpanConverter, self).__init__(replace_tag, pattern) - - -class DigitSpanConverter(SpanConverter): - def __init__(self): - replace_tag = '' - pattern = '\d[\d\\.]*(?=[\u4e00-\u9fff ,%.!<-])' - - super(DigitSpanConverter, self).__init__(replace_tag, pattern) - - def span_to_special_tag(self, span): - # return self.special_tag - if span[0] == '0' and len(span) > 2: - return '' - decimal_point_count = 0 # one might have more than one decimal pointers - for idx, char in enumerate(span): - if char == '.' or char == '﹒' or char == '·': - decimal_point_count += 1 - if span[-1] == '.' 
or span[-1] == '﹒' or span[-1] == '·': - # last digit being decimal point means this is not a number - if decimal_point_count == 1: - return span - else: - return '' - if decimal_point_count == 1: - return '' - elif decimal_point_count > 1: - return '' - else: - return '' - - -class TimeConverter(SpanConverter): - def __init__(self): - replace_tag = '' - pattern = '\d+[::∶][\d::∶]+(?=[\u4e00-\u9fff ,%.!<-])' - - super().__init__(replace_tag, pattern) - - -class MixNumAlphaConverter(SpanConverter): - def __init__(self): - replace_tag = '' - pattern = None - - super().__init__(replace_tag, pattern) - - def find_certain_span_and_replace(self, sentence): - replaced_sentence = '' - start = 0 - matching_flag = False - number_flag = False - alpha_flag = False - link_flag = False - slash_flag = False - bracket_flag = False - for idx in range(len(sentence)): - if re.match('[0-9a-zA-Z/\\(\\)\'′&\\-]', sentence[idx]): - if not matching_flag: - replaced_sentence += sentence[start:idx] - start = idx - if re.match('[0-9]', sentence[idx]): - number_flag = True - elif re.match('[\'′&\\-]', sentence[idx]): - link_flag = True - elif re.match('/', sentence[idx]): - slash_flag = True - elif re.match('[\\(\\)]', sentence[idx]): - bracket_flag = True - else: - alpha_flag = True - matching_flag = True - elif re.match('[\\.]', sentence[idx]): - pass - else: - if matching_flag: - if (number_flag and alpha_flag) or (link_flag and alpha_flag) \ - or (slash_flag and alpha_flag) or (link_flag and number_flag) \ - or (number_flag and bracket_flag) or (bracket_flag and alpha_flag): - span = sentence[start:idx] - start = idx - replaced_sentence += self.span_to_special_tag(span) - matching_flag = False - number_flag = False - alpha_flag = False - link_flag = False - slash_flag = False - bracket_flag = False - - replaced_sentence += sentence[start:] - return replaced_sentence - - def find_certain_span(self, sentence): - spans = [] - start = 0 - matching_flag = False - number_flag = False - alpha_flag = False - link_flag = False - slash_flag = False - bracket_flag = False - for idx in range(len(sentence)): - if re.match('[0-9a-zA-Z/\\(\\)\'′&\\-]', sentence[idx]): - if not matching_flag: - start = idx - if re.match('[0-9]', sentence[idx]): - number_flag = True - elif re.match('[\'′&\\-]', sentence[idx]): - link_flag = True - elif re.match('/', sentence[idx]): - slash_flag = True - elif re.match('[\\(\\)]', sentence[idx]): - bracket_flag = True - else: - alpha_flag = True - matching_flag = True - elif re.match('[\\.]', sentence[idx]): - pass - else: - if matching_flag: - if (number_flag and alpha_flag) or (link_flag and alpha_flag) \ - or (slash_flag and alpha_flag) or (link_flag and number_flag) \ - or (number_flag and bracket_flag) or (bracket_flag and alpha_flag): - spans.append((start, idx)) - start = idx - - matching_flag = False - number_flag = False - alpha_flag = False - link_flag = False - slash_flag = False - bracket_flag = False - - return spans - - -class EmailConverter(SpanConverter): - def __init__(self): - replaced_tag = "" - pattern = '[0-9a-zA-Z]+[@][.﹒0-9a-zA-Z@]+(?=[\u4e00-\u9fff ,%.!<\\-"$])' - - super(EmailConverter, self).__init__(replaced_tag, pattern) diff --git a/legacy/api/examples.py b/legacy/api/examples.py deleted file mode 100644 index c1b2e155..00000000 --- a/legacy/api/examples.py +++ /dev/null @@ -1,56 +0,0 @@ -""" -api/example.py contains all API examples provided by fastNLP. -It is used as a tutorial for API or a test script since it is difficult to test APIs in travis. - -""" -from . 
import CWS, POS, Parser - -text = ['编者按:7月12日,英国航空航天系统公司公布了该公司研制的第一款高科技隐形无人机雷电之神。', - '这款飞行从外型上来看酷似电影中的太空飞行器,据英国方面介绍,可以实现洲际远程打击。', - '那么这款无人机到底有多厉害?'] - - -def chinese_word_segmentation(): - cws = CWS(device='cpu') - print(cws.predict(text)) - - -def chinese_word_segmentation_test(): - cws = CWS(device='cpu') - print(cws.test("../../test/data_for_tests/zh_sample.conllx")) - - -def pos_tagging(): - # 输入已分词序列 - text = [['编者', '按:', '7月', '12日', ',', '英国', '航空', '航天', '系统', '公司', '公布', '了', '该', '公司', - '研制', '的', '第一款', '高科技', '隐形', '无人机', '雷电之神', '。'], - ['那么', '这', '款', '无人机', '到底', '有', '多', '厉害', '?']] - pos = POS(device='cpu') - print(pos.predict(text)) - - -def pos_tagging_test(): - pos = POS(device='cpu') - print(pos.test("../../test/data_for_tests/zh_sample.conllx")) - - -def syntactic_parsing(): - text = [['编者', '按:', '7月', '12日', ',', '英国', '航空', '航天', '系统', '公司', '公布', '了', '该', '公司', - '研制', '的', '第一款', '高科技', '隐形', '无人机', '雷电之神', '。'], - ['那么', '这', '款', '无人机', '到底', '有', '多', '厉害', '?']] - parser = Parser(device='cpu') - print(parser.predict(text)) - - -def syntactic_parsing_test(): - parser = Parser(device='cpu') - print(parser.test("../../test/data_for_tests/zh_sample.conllx")) - - -if __name__ == "__main__": - # chinese_word_segmentation() - # chinese_word_segmentation_test() - # pos_tagging() - # pos_tagging_test() - syntactic_parsing() - # syntactic_parsing_test() diff --git a/legacy/api/pipeline.py b/legacy/api/pipeline.py deleted file mode 100644 index 2cec16b3..00000000 --- a/legacy/api/pipeline.py +++ /dev/null @@ -1,33 +0,0 @@ -from ..api.processor import Processor - - -class Pipeline: - """ - Pipeline takes a DataSet object as input, runs multiple processors sequentially, and - outputs a DataSet object. - """ - - def __init__(self, processors=None): - self.pipeline = [] - if isinstance(processors, list): - for proc in processors: - assert isinstance(proc, Processor), "Must be a Processor, not {}.".format(type(proc)) - self.pipeline = processors - - def add_processor(self, processor): - assert isinstance(processor, Processor), "Must be a Processor, not {}.".format(type(processor)) - self.pipeline.append(processor) - - def process(self, dataset): - assert len(self.pipeline) != 0, "You need to add some processor first." 
- - for proc in self.pipeline: - dataset = proc(dataset) - - return dataset - - def __call__(self, *args, **kwargs): - return self.process(*args, **kwargs) - - def __getitem__(self, item): - return self.pipeline[item] diff --git a/legacy/api/processor.py b/legacy/api/processor.py deleted file mode 100644 index 4c442ed2..00000000 --- a/legacy/api/processor.py +++ /dev/null @@ -1,428 +0,0 @@ -import re -from collections import defaultdict - -import torch - -from fastNLP.core.batch import Batch -from fastNLP.core.dataset import DataSet -from fastNLP.core.sampler import SequentialSampler -from fastNLP.core.vocabulary import Vocabulary - - -class Processor(object): - def __init__(self, field_name, new_added_field_name): - """ - - :param field_name: 处理哪个field - :param new_added_field_name: 如果为None,则认为是field_name,即覆盖原有的field - """ - self.field_name = field_name - if new_added_field_name is None: - self.new_added_field_name = field_name - else: - self.new_added_field_name = new_added_field_name - - def process(self, *args, **kwargs): - raise NotImplementedError - - def __call__(self, *args, **kwargs): - return self.process(*args, **kwargs) - - -class FullSpaceToHalfSpaceProcessor(Processor): - """全角转半角,以字符为处理单元 - - """ - - def __init__(self, field_name, change_alpha=True, change_digit=True, change_punctuation=True, - change_space=True): - super(FullSpaceToHalfSpaceProcessor, self).__init__(field_name, None) - - self.change_alpha = change_alpha - self.change_digit = change_digit - self.change_punctuation = change_punctuation - self.change_space = change_space - - FH_SPACE = [(u" ", u" ")] - FH_NUM = [ - (u"0", u"0"), (u"1", u"1"), (u"2", u"2"), (u"3", u"3"), (u"4", u"4"), - (u"5", u"5"), (u"6", u"6"), (u"7", u"7"), (u"8", u"8"), (u"9", u"9")] - FH_ALPHA = [ - (u"a", u"a"), (u"b", u"b"), (u"c", u"c"), (u"d", u"d"), (u"e", u"e"), - (u"f", u"f"), (u"g", u"g"), (u"h", u"h"), (u"i", u"i"), (u"j", u"j"), - (u"k", u"k"), (u"l", u"l"), (u"m", u"m"), (u"n", u"n"), (u"o", u"o"), - (u"p", u"p"), (u"q", u"q"), (u"r", u"r"), (u"s", u"s"), (u"t", u"t"), - (u"u", u"u"), (u"v", u"v"), (u"w", u"w"), (u"x", u"x"), (u"y", u"y"), - (u"z", u"z"), - (u"A", u"A"), (u"B", u"B"), (u"C", u"C"), (u"D", u"D"), (u"E", u"E"), - (u"F", u"F"), (u"G", u"G"), (u"H", u"H"), (u"I", u"I"), (u"J", u"J"), - (u"K", u"K"), (u"L", u"L"), (u"M", u"M"), (u"N", u"N"), (u"O", u"O"), - (u"P", u"P"), (u"Q", u"Q"), (u"R", u"R"), (u"S", u"S"), (u"T", u"T"), - (u"U", u"U"), (u"V", u"V"), (u"W", u"W"), (u"X", u"X"), (u"Y", u"Y"), - (u"Z", u"Z")] - # 谨慎使用标点符号转换, 因为"5.12特大地震"转换后可能就成了"5.12特大地震" - FH_PUNCTUATION = [ - (u'%', u'%'), (u'!', u'!'), (u'"', u'\"'), (u''', u'\''), (u'#', u'#'), - (u'¥', u'$'), (u'&', u'&'), (u'(', u'('), (u')', u')'), (u'*', u'*'), - (u'+', u'+'), (u',', u','), (u'-', u'-'), (u'.', u'.'), (u'/', u'/'), - (u':', u':'), (u';', u';'), (u'<', u'<'), (u'=', u'='), (u'>', u'>'), - (u'?', u'?'), (u'@', u'@'), (u'[', u'['), (u']', u']'), (u'\', u'\\'), - (u'^', u'^'), (u'_', u'_'), (u'`', u'`'), (u'~', u'~'), (u'{', u'{'), - (u'}', u'}'), (u'|', u'|')] - FHs = [] - if self.change_alpha: - FHs = FH_ALPHA - if self.change_digit: - FHs += FH_NUM - if self.change_punctuation: - FHs += FH_PUNCTUATION - if self.change_space: - FHs += FH_SPACE - self.convert_map = {k: v for k, v in FHs} - - def process(self, dataset): - assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset)) - - def inner_proc(ins): - sentence = ins[self.field_name] - new_sentence = [""] * len(sentence) - for idx, char in 
enumerate(sentence): - if char in self.convert_map: - char = self.convert_map[char] - new_sentence[idx] = char - return "".join(new_sentence) - - dataset.apply(inner_proc, new_field_name=self.field_name) - return dataset - - -class PreAppendProcessor(Processor): - """ - 向某个field的起始增加data(应该为str类型)。该field需要为list类型。即新增的field为 - [data] + instance[field_name] - - """ - - def __init__(self, data, field_name, new_added_field_name=None): - super(PreAppendProcessor, self).__init__(field_name, new_added_field_name) - self.data = data - - def process(self, dataset): - dataset.apply(lambda ins: [self.data] + ins[self.field_name], new_field_name=self.new_added_field_name) - return dataset - - -class SliceProcessor(Processor): - """ - 从某个field中只取部分内容。等价于instance[field_name][start:end:step] - - """ - - def __init__(self, start, end, step, field_name, new_added_field_name=None): - super(SliceProcessor, self).__init__(field_name, new_added_field_name) - for o in (start, end, step): - assert isinstance(o, int) or o is None - self.slice = slice(start, end, step) - - def process(self, dataset): - dataset.apply(lambda ins: ins[self.field_name][self.slice], new_field_name=self.new_added_field_name) - return dataset - - -class Num2TagProcessor(Processor): - """ - 将一句话中的数字转换为某个tag。 - - """ - - def __init__(self, tag, field_name, new_added_field_name=None): - """ - - :param tag: str, 将数字转换为该tag - :param field_name: - :param new_added_field_name: - """ - super(Num2TagProcessor, self).__init__(field_name, new_added_field_name) - self.tag = tag - self.pattern = r'[-+]?([0-9]+[.]?[0-9]*)+[/eE]?[-+]?([0-9]+[.]?[0-9]*)' - - def process(self, dataset): - - def inner_proc(ins): - s = ins[self.field_name] - new_s = [None] * len(s) - for i, w in enumerate(s): - if re.search(self.pattern, w) is not None: - w = self.tag - new_s[i] = w - return new_s - - dataset.apply(inner_proc, new_field_name=self.new_added_field_name) - return dataset - - -class IndexerProcessor(Processor): - """ - 给定一个vocabulary , 将指定field转换为index形式。指定field应该是一维的list,比如 - ['我', '是', xxx] - """ - - def __init__(self, vocab, field_name, new_added_field_name, delete_old_field=False, is_input=True): - - assert isinstance(vocab, Vocabulary), "Only Vocabulary class is allowed, not {}.".format(type(vocab)) - - super(IndexerProcessor, self).__init__(field_name, new_added_field_name) - self.vocab = vocab - self.delete_old_field = delete_old_field - self.is_input = is_input - - def set_vocab(self, vocab): - assert isinstance(vocab, Vocabulary), "Only Vocabulary class is allowed, not {}.".format(type(vocab)) - - self.vocab = vocab - - def process(self, dataset): - assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset)) - dataset.apply(lambda ins: [self.vocab.to_index(token) for token in ins[self.field_name]], - new_field_name=self.new_added_field_name) - if self.is_input: - dataset.set_input(self.new_added_field_name) - - if self.delete_old_field: - dataset.delete_field(self.field_name) - - return dataset - - -class VocabProcessor(Processor): - """ - 传入若干个DataSet以建立vocabulary。 - - """ - - def __init__(self, field_name, min_freq=1, max_size=None): - super(VocabProcessor, self).__init__(field_name, None) - self.vocab = Vocabulary(min_freq=min_freq, max_size=max_size) - - def process(self, *datasets): - for dataset in datasets: - assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset)) - dataset.apply(lambda ins: self.vocab.update(ins[self.field_name])) - - def get_vocab(self): - 
self.vocab.build_vocab() - return self.vocab - - -class SeqLenProcessor(Processor): - """ - 根据某个field新增一个sequence length的field。取该field的第一维 - - """ - - def __init__(self, field_name, new_added_field_name='seq_lens', is_input=True): - super(SeqLenProcessor, self).__init__(field_name, new_added_field_name) - self.is_input = is_input - - def process(self, dataset): - assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset)) - dataset.apply(lambda ins: len(ins[self.field_name]), new_field_name=self.new_added_field_name) - if self.is_input: - dataset.set_input(self.new_added_field_name) - return dataset - - -from fastNLP.core.utils import _build_args - - -class ModelProcessor(Processor): - def __init__(self, model, seq_len_field_name='seq_lens', batch_size=32): - """ - 传入一个model,在process()时传入一个dataset,该processor会通过Batch将DataSet的内容输出给model.predict或者model.forward. - model输出的内容会被增加到dataset中,field_name由model输出决定。如果生成的内容维度不是(Batch_size, )与 - (Batch_size, 1),则使用seqence length这个field进行unpad - TODO 这个类需要删除对seq_lens的依赖。 - - :param seq_len_field_name: - :param batch_size: - """ - super(ModelProcessor, self).__init__(None, None) - self.batch_size = batch_size - self.seq_len_field_name = seq_len_field_name - self.model = model - - def process(self, dataset): - self.model.eval() - assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset)) - data_iterator = Batch(dataset, batch_size=self.batch_size, sampler=SequentialSampler()) - - batch_output = defaultdict(list) - predict_func = self.model.forward - with torch.no_grad(): - for batch_x, _ in data_iterator: - refined_batch_x = _build_args(predict_func, **batch_x) - prediction = predict_func(**refined_batch_x) - seq_lens = batch_x[self.seq_len_field_name].tolist() - - for key, value in prediction.items(): - tmp_batch = [] - value = value.cpu().numpy() - if len(value.shape) == 1 or (len(value.shape) == 2 and value.shape[1] == 1): - batch_output[key].extend(value.tolist()) - else: - for idx, seq_len in enumerate(seq_lens): - tmp_batch.append(value[idx, :seq_len]) - batch_output[key].extend(tmp_batch) - if not self.seq_len_field_name in prediction: - batch_output[self.seq_len_field_name].extend(seq_lens) - - # TODO 当前的实现会导致之后的processor需要知道model输出的output的key是什么 - for field_name, fields in batch_output.items(): - dataset.add_field(field_name, fields, is_input=True, is_target=False) - - return dataset - - def set_model(self, model): - self.model = model - - def set_model_device(self, device): - device = torch.device(device) - self.model.to(device) - - -class Index2WordProcessor(Processor): - """ - 将DataSet中某个为index的field根据vocab转换为str - - """ - - def __init__(self, vocab, field_name, new_added_field_name): - super(Index2WordProcessor, self).__init__(field_name, new_added_field_name) - self.vocab = vocab - - def process(self, dataset): - dataset.apply(lambda ins: [self.vocab.to_word(w) for w in ins[self.field_name]], - new_field_name=self.new_added_field_name) - return dataset - - -class SetTargetProcessor(Processor): - def __init__(self, *fields, flag=True): - super(SetTargetProcessor, self).__init__(None, None) - self.fields = fields - self.flag = flag - - def process(self, dataset): - dataset.set_target(*self.fields, flag=self.flag) - return dataset - - -class SetInputProcessor(Processor): - def __init__(self, *fields, flag=True): - super(SetInputProcessor, self).__init__(None, None) - self.fields = fields - self.flag = flag - - def process(self, dataset): - 
dataset.set_input(*self.fields, flag=self.flag) - return dataset - - -class VocabIndexerProcessor(Processor): - """ - 根据DataSet创建Vocabulary,并将其用数字index。新生成的index的field会被放在new_added_filed_name, 如果没有提供 - new_added_field_name, 则覆盖原有的field_name. - - """ - - def __init__(self, field_name, new_added_filed_name=None, min_freq=1, max_size=None, - verbose=0, is_input=True): - """ - - :param field_name: 从哪个field_name创建词表,以及对哪个field_name进行index操作 - :param new_added_filed_name: index时,生成的index field的名称,如果不传入,则覆盖field_name. - :param min_freq: 创建的Vocabulary允许的单词最少出现次数. - :param max_size: 创建的Vocabulary允许的最大的单词数量 - :param verbose: 0, 不输出任何信息;1,输出信息 - :param bool is_input: - """ - super(VocabIndexerProcessor, self).__init__(field_name, new_added_filed_name) - self.min_freq = min_freq - self.max_size = max_size - - self.verbose = verbose - self.is_input = is_input - - def construct_vocab(self, *datasets): - """ - 使用传入的DataSet创建vocabulary - - :param datasets: DataSet类型的数据,用于构建vocabulary - :return: - """ - self.vocab = Vocabulary(min_freq=self.min_freq, max_size=self.max_size) - for dataset in datasets: - assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset)) - dataset.apply(lambda ins: self.vocab.update(ins[self.field_name])) - self.vocab.build_vocab() - if self.verbose: - print("Vocabulary Constructed, has {} items.".format(len(self.vocab))) - - def process(self, *datasets, only_index_dataset=None): - """ - 若还未建立Vocabulary,则使用dataset中的DataSet建立vocabulary;若已经有了vocabulary则使用已有的vocabulary。得到vocabulary - 后,则会index datasets与only_index_dataset。 - - :param datasets: DataSet类型的数据 - :param only_index_dataset: DataSet, or list of DataSet. 该参数中的内容只会被用于index,不会被用于生成vocabulary。 - :return: - """ - if len(datasets) == 0 and not hasattr(self, 'vocab'): - raise RuntimeError("You have to construct vocabulary first. 
Or you have to pass datasets to construct it.") - if not hasattr(self, 'vocab'): - self.construct_vocab(*datasets) - else: - if self.verbose: - print("Using constructed vocabulary with {} items.".format(len(self.vocab))) - to_index_datasets = [] - if len(datasets) != 0: - for dataset in datasets: - assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset)) - to_index_datasets.append(dataset) - - if not (only_index_dataset is None): - if isinstance(only_index_dataset, list): - for dataset in only_index_dataset: - assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset)) - to_index_datasets.append(dataset) - elif isinstance(only_index_dataset, DataSet): - to_index_datasets.append(only_index_dataset) - else: - raise TypeError('Only DataSet or list of DataSet is allowed, not {}.'.format(type(only_index_dataset))) - - for dataset in to_index_datasets: - assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset)) - dataset.apply(lambda ins: [self.vocab.to_index(token) for token in ins[self.field_name]], - new_field_name=self.new_added_field_name, is_input=self.is_input) - # 只返回一个,infer时为了跟其他processor保持一致 - if len(to_index_datasets) == 1: - return to_index_datasets[0] - - def set_vocab(self, vocab): - assert isinstance(vocab, Vocabulary), "Only fastNLP.core.Vocabulary is allowed, not {}.".format(type(vocab)) - self.vocab = vocab - - def delete_vocab(self): - del self.vocab - - def get_vocab_size(self): - return len(self.vocab) - - def set_verbose(self, verbose): - """ - 设置processor verbose状态。 - - :param verbose: int, 0,不输出任何信息;1,输出vocab 信息。 - :return: - """ - self.verbose = verbose diff --git a/legacy/api/utils.py b/legacy/api/utils.py deleted file mode 100644 index 184e5fe6..00000000 --- a/legacy/api/utils.py +++ /dev/null @@ -1,134 +0,0 @@ -import hashlib -import os -import re -import shutil -import sys -import tempfile - -import torch - -try: - from requests.utils import urlparse - from requests import get as urlopen - requests_available = True -except ImportError: - requests_available = False - if sys.version_info[0] == 2: - from urlparse import urlparse # noqa f811 - from urllib2 import urlopen # noqa f811 - else: - from urllib.request import urlopen - from urllib.parse import urlparse -try: - from tqdm.auto import tqdm -except: - from fastNLP.core.utils import _pseudo_tqdm as tqdm - -# matches bfd8deac from resnet18-bfd8deac.pth -HASH_REGEX = re.compile(r'-([a-f0-9]*)\.') - - -def load_url(url, model_dir=None, map_location=None, progress=True): - r"""Loads the Torch serialized object at the given URL. - - If the object is already present in `model_dir`, it's deserialized and - returned. The filename part of the URL should follow the naming convention - ``filename-.ext`` where ```` is the first eight or more - digits of the SHA256 hash of the contents of the file. The hash is used to - ensure unique names and to verify the contents of the file. - - The default value of `model_dir` is ``$TORCH_HOME/models`` where - ``$TORCH_HOME`` defaults to ``~/.torch``. The default directory can be - overridden with the ``$TORCH_MODEL_ZOO`` environment variable. 
- - Args: - url (string): URL of the object to download - model_dir (string, optional): directory in which to save the object - map_location (optional): a function or a dict specifying how to remap storage locations (see torch.load) - progress (bool, optional): whether or not to display a progress bar to stderr - - Example: - # >>> state_dict = model_zoo.load_url('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth') - - """ - if model_dir is None: - torch_home = os.path.expanduser(os.getenv('fastNLP_HOME', '~/.fastNLP')) - model_dir = os.getenv('fastNLP_MODEL_ZOO', os.path.join(torch_home, 'models')) - if not os.path.exists(model_dir): - os.makedirs(model_dir) - parts = urlparse(url) - filename = os.path.basename(parts.path) - cached_file = os.path.join(model_dir, filename) - if not os.path.exists(cached_file): - sys.stderr.write('Downloading: "{}" to {}\n'.format(url, cached_file)) - # hash_prefix = HASH_REGEX.search(filename).group(1) - _download_url_to_file(url, cached_file, hash_prefix=None, progress=progress) - return torch.load(cached_file, map_location=map_location) - - -def _download_url_to_file(url, dst, hash_prefix, progress): - if requests_available: - u = urlopen(url, stream=True) - file_size = int(u.headers["Content-Length"]) - u = u.raw - else: - u = urlopen(url) - meta = u.info() - if hasattr(meta, 'getheaders'): - file_size = int(meta.getheaders("Content-Length")[0]) - else: - file_size = int(meta.get_all("Content-Length")[0]) - - f = tempfile.NamedTemporaryFile(delete=False) - try: - if hash_prefix is not None: - sha256 = hashlib.sha256() - with tqdm(total=file_size, disable=not progress) as pbar: - while True: - buffer = u.read(8192) - if len(buffer) == 0: - break - f.write(buffer) - if hash_prefix is not None: - sha256.update(buffer) - pbar.update(len(buffer)) - - f.close() - if hash_prefix is not None: - digest = sha256.hexdigest() - if digest[:len(hash_prefix)] != hash_prefix: - raise RuntimeError('invalid hash value (expected "{}", got "{}")' - .format(hash_prefix, digest)) - shutil.move(f.name, dst) - finally: - f.close() - if os.path.exists(f.name): - os.remove(f.name) - - -if tqdm is None: - # fake tqdm if it's not installed - class tqdm(object): - - def __init__(self, total, disable=False): - self.total = total - self.disable = disable - self.n = 0 - - def update(self, n): - if self.disable: - return - - self.n += n - sys.stderr.write("\r{0:.1f}%".format(100 * self.n / float(self.total))) - sys.stderr.flush() - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - if self.disable: - return - - sys.stderr.write('\n') - diff --git a/legacy/automl/__init__.py b/legacy/automl/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/legacy/automl/enas_controller.py b/legacy/automl/enas_controller.py deleted file mode 100644 index 6ddbb211..00000000 --- a/legacy/automl/enas_controller.py +++ /dev/null @@ -1,223 +0,0 @@ -# Code Modified from https://github.com/carpedm20/ENAS-pytorch -"""A module with NAS controller-related code.""" -import collections -import os - -import torch -import torch.nn.functional as F - -import fastNLP.automl.enas_utils as utils -from fastNLP.automl.enas_utils import Node - - -def _construct_dags(prev_nodes, activations, func_names, num_blocks): - """Constructs a set of DAGs based on the actions, i.e., previous nodes and - activation functions, sampled from the controller/policy pi. - - Args: - prev_nodes: Previous node actions from the policy. 
- activations: Activations sampled from the policy. - func_names: Mapping from activation function names to functions. - num_blocks: Number of blocks in the target RNN cell. - - Returns: - A list of DAGs defined by the inputs. - - RNN cell DAGs are represented in the following way: - - 1. Each element (node) in a DAG is a list of `Node`s. - - 2. The `Node`s in the list dag[i] correspond to the subsequent nodes - that take the output from node i as their own input. - - 3. dag[-1] is the node that takes input from x^{(t)} and h^{(t - 1)}. - dag[-1] always feeds dag[0]. - dag[-1] acts as if `w_xc`, `w_hc`, `w_xh` and `w_hh` are its - weights. - - 4. dag[N - 1] is the node that produces the hidden state passed to - the next timestep. dag[N - 1] is also always a leaf node, and therefore - is always averaged with the other leaf nodes and fed to the output - decoder. - """ - dags = [] - for nodes, func_ids in zip(prev_nodes, activations): - dag = collections.defaultdict(list) - - # add first node - dag[-1] = [Node(0, func_names[func_ids[0]])] - dag[-2] = [Node(0, func_names[func_ids[0]])] - - # add following nodes - for jdx, (idx, func_id) in enumerate(zip(nodes, func_ids[1:])): - dag[utils.to_item(idx)].append(Node(jdx + 1, func_names[func_id])) - - leaf_nodes = set(range(num_blocks)) - dag.keys() - - # merge with avg - for idx in leaf_nodes: - dag[idx] = [Node(num_blocks, 'avg')] - - # This is actually y^{(t)}. h^{(t)} is node N - 1 in - # the graph, where N Is the number of nodes. I.e., h^{(t)} takes - # only one other node as its input. - # last h[t] node - last_node = Node(num_blocks + 1, 'h[t]') - dag[num_blocks] = [last_node] - dags.append(dag) - - return dags - - -class Controller(torch.nn.Module): - """Based on - https://github.com/pytorch/examples/blob/master/word_language_model/model.py - - RL controllers do not necessarily have much to do with - language models. - - Base the controller RNN on the GRU from: - https://github.com/ikostrikov/pytorch-a2c-ppo-acktr/blob/master/model.py - """ - def __init__(self, num_blocks=4, controller_hid=100, cuda=False): - torch.nn.Module.__init__(self) - - # `num_tokens` here is just the activation function - # for every even step, - self.shared_rnn_activations = ['tanh', 'ReLU', 'identity', 'sigmoid'] - self.num_tokens = [len(self.shared_rnn_activations)] - self.controller_hid = controller_hid - self.use_cuda = cuda - self.num_blocks = num_blocks - for idx in range(num_blocks): - self.num_tokens += [idx + 1, len(self.shared_rnn_activations)] - self.func_names = self.shared_rnn_activations - - num_total_tokens = sum(self.num_tokens) - - self.encoder = torch.nn.Embedding(num_total_tokens, - controller_hid) - self.lstm = torch.nn.LSTMCell(controller_hid, controller_hid) - - # Perhaps these weights in the decoder should be - # shared? At least for the activation functions, which all have the - # same size. 
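        # With the default num_blocks=4, self.num_tokens works out to
        # [4, 1, 4, 2, 4, 3, 4, 4, 4]: an activation choice (4 functions)
        # alternates with a previous-node choice (idx + 1 options), and one
        # linear decoder per position is built below.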
- self.decoders = [] - for idx, size in enumerate(self.num_tokens): - decoder = torch.nn.Linear(controller_hid, size) - self.decoders.append(decoder) - - self._decoders = torch.nn.ModuleList(self.decoders) - - self.reset_parameters() - self.static_init_hidden = utils.keydefaultdict(self.init_hidden) - - def _get_default_hidden(key): - return utils.get_variable( - torch.zeros(key, self.controller_hid), - self.use_cuda, - requires_grad=False) - - self.static_inputs = utils.keydefaultdict(_get_default_hidden) - - def reset_parameters(self): - init_range = 0.1 - for param in self.parameters(): - param.data.uniform_(-init_range, init_range) - for decoder in self.decoders: - decoder.bias.data.fill_(0) - - def forward(self, # pylint:disable=arguments-differ - inputs, - hidden, - block_idx, - is_embed): - if not is_embed: - embed = self.encoder(inputs) - else: - embed = inputs - - hx, cx = self.lstm(embed, hidden) - logits = self.decoders[block_idx](hx) - - logits /= 5.0 - - # # exploration - # if self.args.mode == 'train': - # logits = (2.5 * F.tanh(logits)) - - return logits, (hx, cx) - - def sample(self, batch_size=1, with_details=False, save_dir=None): - """Samples a set of `args.num_blocks` many computational nodes from the - controller, where each node is made up of an activation function, and - each node except the last also includes a previous node. - """ - if batch_size < 1: - raise Exception(f'Wrong batch_size: {batch_size} < 1') - - # [B, L, H] - inputs = self.static_inputs[batch_size] - hidden = self.static_init_hidden[batch_size] - - activations = [] - entropies = [] - log_probs = [] - prev_nodes = [] - # The RNN controller alternately outputs an activation, - # followed by a previous node, for each block except the last one, - # which only gets an activation function. The last node is the output - # node, and its previous node is the average of all leaf nodes. - for block_idx in range(2*(self.num_blocks - 1) + 1): - logits, hidden = self.forward(inputs, - hidden, - block_idx, - is_embed=(block_idx == 0)) - - probs = F.softmax(logits, dim=-1) - log_prob = F.log_softmax(logits, dim=-1) - # .mean() for entropy? - entropy = -(log_prob * probs).sum(1, keepdim=False) - - action = probs.multinomial(num_samples=1).data - selected_log_prob = log_prob.gather( - 1, utils.get_variable(action, requires_grad=False)) - - # why the [:, 0] here? Should it be .squeeze(), or - # .view()? Same below with `action`. 
- entropies.append(entropy) - log_probs.append(selected_log_prob[:, 0]) - - # 0: function, 1: previous node - mode = block_idx % 2 - inputs = utils.get_variable( - action[:, 0] + sum(self.num_tokens[:mode]), - requires_grad=False) - - if mode == 0: - activations.append(action[:, 0]) - elif mode == 1: - prev_nodes.append(action[:, 0]) - - prev_nodes = torch.stack(prev_nodes).transpose(0, 1) - activations = torch.stack(activations).transpose(0, 1) - - dags = _construct_dags(prev_nodes, - activations, - self.func_names, - self.num_blocks) - - if save_dir is not None: - for idx, dag in enumerate(dags): - utils.draw_network(dag, - os.path.join(save_dir, f'graph{idx}.png')) - - if with_details: - return dags, torch.cat(log_probs), torch.cat(entropies) - - return dags - - def init_hidden(self, batch_size): - zeros = torch.zeros(batch_size, self.controller_hid) - return (utils.get_variable(zeros, self.use_cuda, requires_grad=False), - utils.get_variable(zeros.clone(), self.use_cuda, requires_grad=False)) diff --git a/legacy/automl/enas_model.py b/legacy/automl/enas_model.py deleted file mode 100644 index 4f9fb449..00000000 --- a/legacy/automl/enas_model.py +++ /dev/null @@ -1,388 +0,0 @@ -# Code Modified from https://github.com/carpedm20/ENAS-pytorch - -"""Module containing the shared RNN model.""" -import collections - -import numpy as np -import torch -import torch.nn.functional as F -from torch import nn -from torch.autograd import Variable - -import fastNLP.automl.enas_utils as utils -from fastNLP.models.base_model import BaseModel - - -def _get_dropped_weights(w_raw, dropout_p, is_training): - """Drops out weights to implement DropConnect. - - Args: - w_raw: Full, pre-dropout, weights to be dropped out. - dropout_p: Proportion of weights to drop out. - is_training: True iff _shared_ model is training. - - Returns: - The dropped weights. - - Why does torch.nn.functional.dropout() return: - 1. `torch.autograd.Variable()` on the training loop - 2. `torch.nn.Parameter()` on the controller or eval loop, when - training = False... - - Even though the call to `_setweights` in the Smerity repo's - `weight_drop.py` does not have this behaviour, and `F.dropout` always - returns `torch.autograd.Variable` there, even when `training=False`? - - The above TODO is the reason for the hacky check for `torch.nn.Parameter`. - """ - dropped_w = F.dropout(w_raw, p=dropout_p, training=is_training) - - if isinstance(dropped_w, torch.nn.Parameter): - dropped_w = dropped_w.clone() - - return dropped_w - -class EmbeddingDropout(torch.nn.Embedding): - """Class for dropping out embeddings by zero'ing out parameters in the - embedding matrix. - - This is equivalent to dropping out particular words, e.g., in the sentence - 'the quick brown fox jumps over the lazy dog', dropping out 'the' would - lead to the sentence '### quick brown fox jumps over ### lazy dog' (in the - embedding vector space). - - See 'A Theoretically Grounded Application of Dropout in Recurrent Neural - Networks', (Gal and Ghahramani, 2016). - """ - def __init__(self, - num_embeddings, - embedding_dim, - max_norm=None, - norm_type=2, - scale_grad_by_freq=False, - sparse=False, - dropout=0.1, - scale=None): - """Embedding constructor. - - Args: - dropout: Dropout probability. - scale: Used to scale parameters of embedding weight matrix that are - not dropped out. Note that this is _in addition_ to the - `1/(1 - dropout)` scaling. - - See `torch.nn.Embedding` for remaining arguments. 
- """ - torch.nn.Embedding.__init__(self, - num_embeddings=num_embeddings, - embedding_dim=embedding_dim, - max_norm=max_norm, - norm_type=norm_type, - scale_grad_by_freq=scale_grad_by_freq, - sparse=sparse) - self.dropout = dropout - assert (dropout >= 0.0) and (dropout < 1.0), ('Dropout must be >= 0.0 ' - 'and < 1.0') - self.scale = scale - - def forward(self, inputs): # pylint:disable=arguments-differ - """Embeds `inputs` with the dropped out embedding weight matrix.""" - if self.training: - dropout = self.dropout - else: - dropout = 0 - - if dropout: - mask = self.weight.data.new(self.weight.size(0), 1) - mask.bernoulli_(1 - dropout) - mask = mask.expand_as(self.weight) - mask = mask / (1 - dropout) - masked_weight = self.weight * Variable(mask) - else: - masked_weight = self.weight - if self.scale and self.scale != 1: - masked_weight = masked_weight * self.scale - - return F.embedding(inputs, - masked_weight, - max_norm=self.max_norm, - norm_type=self.norm_type, - scale_grad_by_freq=self.scale_grad_by_freq, - sparse=self.sparse) - - -class LockedDropout(nn.Module): - # code from https://github.com/salesforce/awd-lstm-lm/blob/master/locked_dropout.py - def __init__(self): - super().__init__() - - def forward(self, x, dropout=0.5): - if not self.training or not dropout: - return x - m = x.data.new(1, x.size(1), x.size(2)).bernoulli_(1 - dropout) - mask = Variable(m, requires_grad=False) / (1 - dropout) - mask = mask.expand_as(x) - return mask * x - - -class ENASModel(BaseModel): - """Shared RNN model.""" - def __init__(self, embed_num, num_classes, num_blocks=4, cuda=False, shared_hid=1000, shared_embed=1000): - super(ENASModel, self).__init__() - - self.use_cuda = cuda - - self.shared_hid = shared_hid - self.num_blocks = num_blocks - self.decoder = nn.Linear(self.shared_hid, num_classes) - self.encoder = EmbeddingDropout(embed_num, - shared_embed, - dropout=0.1) - self.lockdrop = LockedDropout() - self.dag = None - - # Tie weights - # self.decoder.weight = self.encoder.weight - - # Since W^{x, c} and W^{h, c} are always summed, there - # is no point duplicating their bias offset parameter. Likewise for - # W^{x, h} and W^{h, h}. - self.w_xc = nn.Linear(shared_embed, self.shared_hid) - self.w_xh = nn.Linear(shared_embed, self.shared_hid) - - # The raw weights are stored here because the hidden-to-hidden weights - # are weight dropped on the forward pass. 
- self.w_hc_raw = torch.nn.Parameter( - torch.Tensor(self.shared_hid, self.shared_hid)) - self.w_hh_raw = torch.nn.Parameter( - torch.Tensor(self.shared_hid, self.shared_hid)) - self.w_hc = None - self.w_hh = None - - self.w_h = collections.defaultdict(dict) - self.w_c = collections.defaultdict(dict) - - for idx in range(self.num_blocks): - for jdx in range(idx + 1, self.num_blocks): - self.w_h[idx][jdx] = nn.Linear(self.shared_hid, - self.shared_hid, - bias=False) - self.w_c[idx][jdx] = nn.Linear(self.shared_hid, - self.shared_hid, - bias=False) - - self._w_h = nn.ModuleList([self.w_h[idx][jdx] - for idx in self.w_h - for jdx in self.w_h[idx]]) - self._w_c = nn.ModuleList([self.w_c[idx][jdx] - for idx in self.w_c - for jdx in self.w_c[idx]]) - - self.batch_norm = None - # if args.mode == 'train': - # self.batch_norm = nn.BatchNorm1d(self.shared_hid) - # else: - # self.batch_norm = None - - self.reset_parameters() - self.static_init_hidden = utils.keydefaultdict(self.init_hidden) - - def setDAG(self, dag): - if self.dag is None: - self.dag = dag - - def forward(self, word_seq, hidden=None): - inputs = torch.transpose(word_seq, 0, 1) - - time_steps = inputs.size(0) - batch_size = inputs.size(1) - - - self.w_hh = _get_dropped_weights(self.w_hh_raw, - 0.5, - self.training) - self.w_hc = _get_dropped_weights(self.w_hc_raw, - 0.5, - self.training) - - # hidden = self.static_init_hidden[batch_size] if hidden is None else hidden - hidden = self.static_init_hidden[batch_size] - - embed = self.encoder(inputs) - - embed = self.lockdrop(embed, 0.65 if self.training else 0) - - # The norm of hidden states are clipped here because - # otherwise ENAS is especially prone to exploding activations on the - # forward pass. This could probably be fixed in a more elegant way, but - # it might be exposing a weakness in the ENAS algorithm as currently - # proposed. - # - # For more details, see - # https://github.com/carpedm20/ENAS-pytorch/issues/6 - clipped_num = 0 - max_clipped_norm = 0 - h1tohT = [] - logits = [] - for step in range(time_steps): - x_t = embed[step] - logit, hidden = self.cell(x_t, hidden, self.dag) - - hidden_norms = hidden.norm(dim=-1) - max_norm = 25.0 - if hidden_norms.data.max() > max_norm: - # Just directly use the torch slice operations - # in PyTorch v0.4. - # - # This workaround for PyTorch v0.3.1 does everything in numpy, - # because the PyTorch slicing and slice assignment is too - # flaky. 
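                # On PyTorch >= 0.4 the same renorm could be written roughly as:
                #   scale = (max_norm / hidden_norms.clamp(min=max_norm)).unsqueeze(-1)
                #   hidden = hidden * scale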
- hidden_norms = hidden_norms.data.cpu().numpy() - - clipped_num += 1 - if hidden_norms.max() > max_clipped_norm: - max_clipped_norm = hidden_norms.max() - - clip_select = hidden_norms > max_norm - clip_norms = hidden_norms[clip_select] - - mask = np.ones(hidden.size()) - normalizer = max_norm/clip_norms - normalizer = normalizer[:, np.newaxis] - - mask[clip_select] = normalizer - - if self.use_cuda: - hidden *= torch.autograd.Variable( - torch.FloatTensor(mask).cuda(), requires_grad=False) - else: - hidden *= torch.autograd.Variable( - torch.FloatTensor(mask), requires_grad=False) - logits.append(logit) - h1tohT.append(hidden) - - h1tohT = torch.stack(h1tohT) - output = torch.stack(logits) - raw_output = output - - output = self.lockdrop(output, 0.4 if self.training else 0) - - #Pooling - output = torch.mean(output, 0) - - decoded = self.decoder(output) - - extra_out = {'dropped': decoded, - 'hiddens': h1tohT, - 'raw': raw_output} - return {'pred': decoded, 'hidden': hidden, 'extra_out': extra_out} - - def cell(self, x, h_prev, dag): - """Computes a single pass through the discovered RNN cell.""" - c = {} - h = {} - f = {} - - f[0] = self.get_f(dag[-1][0].name) - c[0] = torch.sigmoid(self.w_xc(x) + F.linear(h_prev, self.w_hc, None)) - h[0] = (c[0]*f[0](self.w_xh(x) + F.linear(h_prev, self.w_hh, None)) + - (1 - c[0])*h_prev) - - leaf_node_ids = [] - q = collections.deque() - q.append(0) - - # Computes connections from the parent nodes `node_id` - # to their child nodes `next_id` recursively, skipping leaf nodes. A - # leaf node is a node whose id == `self.num_blocks`. - # - # Connections between parent i and child j should be computed as - # h_j = c_j*f_{ij}{(W^h_{ij}*h_i)} + (1 - c_j)*h_i, - # where c_j = \sigmoid{(W^c_{ij}*h_i)} - # - # See Training details from Section 3.1 of the paper. - # - # The following algorithm does a breadth-first (since `q.popleft()` is - # used) search over the nodes and computes all the hidden states. - while True: - if len(q) == 0: - break - - node_id = q.popleft() - nodes = dag[node_id] - - for next_node in nodes: - next_id = next_node.id - if next_id == self.num_blocks: - leaf_node_ids.append(node_id) - assert len(nodes) == 1, ('parent of leaf node should have ' - 'only one child') - continue - - w_h = self.w_h[node_id][next_id] - w_c = self.w_c[node_id][next_id] - - f[next_id] = self.get_f(next_node.name) - c[next_id] = torch.sigmoid(w_c(h[node_id])) - h[next_id] = (c[next_id]*f[next_id](w_h(h[node_id])) + - (1 - c[next_id])*h[node_id]) - - q.append(next_id) - - # Instead of averaging loose ends, perhaps there should - # be a set of separate unshared weights for each "loose" connection - # between each node in a cell and the output. - # - # As it stands, all weights W^h_{ij} are doing double duty by - # connecting both from i to j, as well as from i to the output. 
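        # y^{(t)} below is just the unweighted mean of the leaf-node hidden
        # states, which is what gets fed to the decoder (cf. the
        # _construct_dags docstring).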
- - # average all the loose ends - leaf_nodes = [h[node_id] for node_id in leaf_node_ids] - output = torch.mean(torch.stack(leaf_nodes, 2), -1) - - # stabilizing the Updates of omega - if self.batch_norm is not None: - output = self.batch_norm(output) - - return output, h[self.num_blocks - 1] - - def init_hidden(self, batch_size): - zeros = torch.zeros(batch_size, self.shared_hid) - return utils.get_variable(zeros, self.use_cuda, requires_grad=False) - - def get_f(self, name): - name = name.lower() - if name == 'relu': - f = torch.relu - elif name == 'tanh': - f = torch.tanh - elif name == 'identity': - f = lambda x: x - elif name == 'sigmoid': - f = torch.sigmoid - return f - - - @property - def num_parameters(self): - def size(p): - return np.prod(p.size()) - return sum([size(param) for param in self.parameters()]) - - - def reset_parameters(self): - init_range = 0.025 - # init_range = 0.025 if self.args.mode == 'train' else 0.04 - for param in self.parameters(): - param.data.uniform_(-init_range, init_range) - self.decoder.bias.data.fill_(0) - - def predict(self, word_seq): - """ - - :param word_seq: torch.LongTensor, [batch_size, seq_len] - :return predict: dict of torch.LongTensor, [batch_size, seq_len] - """ - output = self(word_seq) - _, predict = output['pred'].max(dim=1) - return {'pred': predict} diff --git a/legacy/automl/enas_trainer.py b/legacy/automl/enas_trainer.py deleted file mode 100644 index e3524aa9..00000000 --- a/legacy/automl/enas_trainer.py +++ /dev/null @@ -1,383 +0,0 @@ -# Code Modified from https://github.com/carpedm20/ENAS-pytorch - -import math -import time -from datetime import datetime -from datetime import timedelta - -import numpy as np -import torch - -try: - from tqdm.auto import tqdm -except: - from fastNLP.core.utils import _pseudo_tqdm as tqdm - -from fastNLP.core.batch import Batch -from fastNLP.core.callback import CallbackException -from fastNLP.core.dataset import DataSet -from fastNLP.core.utils import _move_dict_value_to_device -import fastNLP -from . import enas_utils as utils -from fastNLP.core.utils import _build_args - -from torch.optim import Adam - - -def _get_no_grad_ctx_mgr(): - """Returns a the `torch.no_grad` context manager for PyTorch version >= - 0.4, or a no-op context manager otherwise. - """ - return torch.no_grad() - - -class ENASTrainer(fastNLP.Trainer): - """A class to wrap training code.""" - def __init__(self, train_data, model, controller, **kwargs): - """Constructor for training algorithm. 
- :param DataSet train_data: the training data - :param torch.nn.modules.module model: a PyTorch model - :param torch.nn.modules.module controller: a PyTorch model - """ - self.final_epochs = kwargs['final_epochs'] - kwargs.pop('final_epochs') - super(ENASTrainer, self).__init__(train_data, model, **kwargs) - self.controller_step = 0 - self.shared_step = 0 - self.max_length = 35 - - self.shared = model - self.controller = controller - - self.shared_optim = Adam( - self.shared.parameters(), - lr=20.0, - weight_decay=1e-7) - - self.controller_optim = Adam( - self.controller.parameters(), - lr=3.5e-4) - - def train(self, load_best_model=True): - """ - :param bool load_best_model: 该参数只有在初始化提供了dev_data的情况下有效,如果True, trainer将在返回之前重新加载dev表现 - 最好的模型参数。 - :return results: 返回一个字典类型的数据, - 内含以下内容:: - - seconds: float, 表示训练时长 - 以下三个内容只有在提供了dev_data的情况下会有。 - best_eval: Dict of Dict, 表示evaluation的结果 - best_epoch: int,在第几个epoch取得的最佳值 - best_step: int, 在第几个step(batch)更新取得的最佳值 - - """ - results = {} - if self.n_epochs <= 0: - print(f"training epoch is {self.n_epochs}, nothing was done.") - results['seconds'] = 0. - return results - try: - if torch.cuda.is_available() and self.use_cuda: - self.model = self.model.cuda() - self._model_device = self.model.parameters().__next__().device - self._mode(self.model, is_test=False) - - self.start_time = str(datetime.now().strftime('%Y-%m-%d-%H-%M-%S')) - start_time = time.time() - print("training epochs started " + self.start_time, flush=True) - - try: - self.callback_manager.on_train_begin() - self._train() - self.callback_manager.on_train_end(self.model) - except (CallbackException, KeyboardInterrupt) as e: - self.callback_manager.on_exception(e, self.model) - - if self.dev_data is not None: - print("\nIn Epoch:{}/Step:{}, got best dev performance:".format(self.best_dev_epoch, self.best_dev_step) + - self.tester._format_eval_results(self.best_dev_perf),) - results['best_eval'] = self.best_dev_perf - results['best_epoch'] = self.best_dev_epoch - results['best_step'] = self.best_dev_step - if load_best_model: - model_name = "best_" + "_".join([self.model.__class__.__name__, self.metric_key, self.start_time]) - load_succeed = self._load_model(self.model, model_name) - if load_succeed: - print("Reloaded the best model.") - else: - print("Fail to reload best model.") - finally: - pass - results['seconds'] = round(time.time() - start_time, 2) - - return results - - def _train(self): - if not self.use_tqdm: - from fastNLP.core.utils import _pseudo_tqdm as inner_tqdm - else: - inner_tqdm = tqdm - self.step = 0 - start = time.time() - total_steps = (len(self.train_data) // self.batch_size + int( - len(self.train_data) % self.batch_size != 0)) * self.n_epochs - with inner_tqdm(total=total_steps, postfix='loss:{0:<6.5f}', leave=False, dynamic_ncols=True) as pbar: - avg_loss = 0 - data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, - prefetch=self.prefetch) - for epoch in range(1, self.n_epochs+1): - pbar.set_description_str(desc="Epoch {}/{}".format(epoch, self.n_epochs)) - last_stage = (epoch > self.n_epochs + 1 - self.final_epochs) - if epoch == self.n_epochs + 1 - self.final_epochs: - print('Entering the final stage. (Only train the selected structure)') - # early stopping - self.callback_manager.on_epoch_begin(epoch, self.n_epochs) - - # 1. Training the shared parameters omega of the child models - self.train_shared(pbar) - - # 2. 
Training the controller parameters theta - if not last_stage: - self.train_controller() - - if ((self.validate_every > 0 and self.step % self.validate_every == 0) or - (self.validate_every < 0 and self.step % len(data_iterator) == 0)) \ - and self.dev_data is not None: - if not last_stage: - self.derive() - eval_res = self._do_validation(epoch=epoch, step=self.step) - eval_str = "Evaluation at Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step, - total_steps) + \ - self.tester._format_eval_results(eval_res) - pbar.write(eval_str) - - # lr decay; early stopping - self.callback_manager.on_epoch_end(epoch, self.n_epochs, self.optimizer) - # =============== epochs end =================== # - pbar.close() - # ============ tqdm end ============== # - - - def get_loss(self, inputs, targets, hidden, dags): - """Computes the loss for the same batch for M models. - - This amounts to an estimate of the loss, which is turned into an - estimate for the gradients of the shared model. - """ - if not isinstance(dags, list): - dags = [dags] - - loss = 0 - for dag in dags: - self.shared.setDAG(dag) - inputs = _build_args(self.shared.forward, **inputs) - inputs['hidden'] = hidden - result = self.shared(**inputs) - output, hidden, extra_out = result['pred'], result['hidden'], result['extra_out'] - - self.callback_manager.on_loss_begin(targets, result) - sample_loss = self._compute_loss(result, targets) - loss += sample_loss - - assert len(dags) == 1, 'there are multiple `hidden` for multple `dags`' - return loss, hidden, extra_out - - def train_shared(self, pbar=None, max_step=None, dag=None): - """Train the language model for 400 steps of minibatches of 64 - examples. - - Args: - max_step: Used to run extra training steps as a warm-up. - dag: If not None, is used instead of calling sample(). - - BPTT is truncated at 35 timesteps. - - For each weight update, gradients are estimated by sampling M models - from the fixed controller policy, and averaging their gradients - computed on a batch of training data. - """ - model = self.shared - model.train() - self.controller.eval() - - hidden = self.shared.init_hidden(self.batch_size) - - abs_max_grad = 0 - abs_max_hidden_norm = 0 - step = 0 - raw_total_loss = 0 - total_loss = 0 - train_idx = 0 - avg_loss = 0 - data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, - prefetch=self.prefetch) - - for batch_x, batch_y in data_iterator: - _move_dict_value_to_device(batch_x, batch_y, device=self._model_device) - indices = data_iterator.get_batch_indices() - # negative sampling; replace unknown; re-weight batch_y - self.callback_manager.on_batch_begin(batch_x, batch_y, indices) - # prediction = self._data_forward(self.model, batch_x) - - dags = self.controller.sample(1) - inputs, targets = batch_x, batch_y - # self.callback_manager.on_loss_begin(batch_y, prediction) - loss, hidden, extra_out = self.get_loss(inputs, - targets, - hidden, - dags) - hidden.detach_() - - avg_loss += loss.item() - - # Is loss NaN or inf? 
requires_grad = False - self.callback_manager.on_backward_begin(loss, self.model) - self._grad_backward(loss) - self.callback_manager.on_backward_end(self.model) - - self._update() - self.callback_manager.on_step_end(self.optimizer) - - if (self.step+1) % self.print_every == 0: - if self.use_tqdm: - print_output = "loss:{0:<6.5f}".format(avg_loss / self.print_every) - pbar.update(self.print_every) - else: - end = time.time() - diff = timedelta(seconds=round(end - start)) - print_output = "[epoch: {:>3} step: {:>4}] train loss: {:>4.6} time: {}".format( - epoch, self.step, avg_loss, diff) - pbar.set_postfix_str(print_output) - avg_loss = 0 - self.step += 1 - step += 1 - self.shared_step += 1 - self.callback_manager.on_batch_end() - # ================= mini-batch end ==================== # - - - def get_reward(self, dag, entropies, hidden, valid_idx=0): - """Computes the perplexity of a single sampled model on a minibatch of - validation data. - """ - if not isinstance(entropies, np.ndarray): - entropies = entropies.data.cpu().numpy() - - data_iterator = Batch(self.dev_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, - prefetch=self.prefetch) - - for inputs, targets in data_iterator: - valid_loss, hidden, _ = self.get_loss(inputs, targets, hidden, dag) - valid_loss = utils.to_item(valid_loss.data) - - valid_ppl = math.exp(valid_loss) - - R = 80 / valid_ppl - - rewards = R + 1e-4 * entropies - - return rewards, hidden - - def train_controller(self): - """Fixes the shared parameters and updates the controller parameters. - - The controller is updated with a score function gradient estimator - (i.e., REINFORCE), with the reward being c/valid_ppl, where valid_ppl - is computed on a minibatch of validation data. - - A moving average baseline is used. - - The controller is trained for 2000 steps per epoch (i.e., - first (Train Shared) phase -> second (Train Controller) phase). - """ - model = self.controller - model.train() - # Why can't we call shared.eval() here? Leads to loss - # being uniformly zero for the controller. - # self.shared.eval() - - avg_reward_base = None - baseline = None - adv_history = [] - entropy_history = [] - reward_history = [] - - hidden = self.shared.init_hidden(self.batch_size) - total_loss = 0 - valid_idx = 0 - for step in range(20): - # sample models - dags, log_probs, entropies = self.controller.sample( - with_details=True) - - # calculate reward - np_entropies = entropies.data.cpu().numpy() - # No gradients should be backpropagated to the - # shared model during controller training, obviously. 
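            # This is plain REINFORCE with a moving-average baseline: the
            # advantage (rewards - baseline) computed below scales -log_prob,
            # the standard score-function (policy-gradient) estimator of
            # d/d(theta) E[R].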
- with _get_no_grad_ctx_mgr(): - rewards, hidden = self.get_reward(dags, - np_entropies, - hidden, - valid_idx) - - - reward_history.extend(rewards) - entropy_history.extend(np_entropies) - - # moving average baseline - if baseline is None: - baseline = rewards - else: - decay = 0.95 - baseline = decay * baseline + (1 - decay) * rewards - - adv = rewards - baseline - adv_history.extend(adv) - - # policy loss - loss = -log_probs*utils.get_variable(adv, - self.use_cuda, - requires_grad=False) - - loss = loss.sum() # or loss.mean() - - # update - self.controller_optim.zero_grad() - loss.backward() - - self.controller_optim.step() - - total_loss += utils.to_item(loss.data) - - if ((step % 50) == 0) and (step > 0): - reward_history, adv_history, entropy_history = [], [], [] - total_loss = 0 - - self.controller_step += 1 - # prev_valid_idx = valid_idx - # valid_idx = ((valid_idx + self.max_length) % - # (self.valid_data.size(0) - 1)) - # # Whenever we wrap around to the beginning of the - # # validation data, we reset the hidden states. - # if prev_valid_idx > valid_idx: - # hidden = self.shared.init_hidden(self.batch_size) - - def derive(self, sample_num=10, valid_idx=0): - """We are always deriving based on the very first batch - of validation data? This seems wrong... - """ - hidden = self.shared.init_hidden(self.batch_size) - - dags, _, entropies = self.controller.sample(sample_num, - with_details=True) - - max_R = 0 - best_dag = None - for dag in dags: - R, _ = self.get_reward(dag, entropies, hidden, valid_idx) - if R.max() > max_R: - max_R = R.max() - best_dag = dag - - self.model.setDAG(best_dag) diff --git a/legacy/automl/enas_utils.py b/legacy/automl/enas_utils.py deleted file mode 100644 index 7a53dd12..00000000 --- a/legacy/automl/enas_utils.py +++ /dev/null @@ -1,53 +0,0 @@ -# Code Modified from https://github.com/carpedm20/ENAS-pytorch - -from __future__ import print_function - -import collections -from collections import defaultdict - -import numpy as np -import torch -from torch.autograd import Variable - - -def detach(h): - if type(h) == Variable: - return Variable(h.data) - else: - return tuple(detach(v) for v in h) - -def get_variable(inputs, cuda=False, **kwargs): - if type(inputs) in [list, np.ndarray]: - inputs = torch.Tensor(inputs) - if cuda: - out = Variable(inputs.cuda(), **kwargs) - else: - out = Variable(inputs, **kwargs) - return out - -def update_lr(optimizer, lr): - for param_group in optimizer.param_groups: - param_group['lr'] = lr - -Node = collections.namedtuple('Node', ['id', 'name']) - - -class keydefaultdict(defaultdict): - def __missing__(self, key): - if self.default_factory is None: - raise KeyError(key) - else: - ret = self[key] = self.default_factory(key) - return ret - - -def to_item(x): - """Converts x, possibly scalar and possibly tensor, to a Python scalar.""" - if isinstance(x, (float, int)): - return x - - if float(torch.__version__[0:3]) < 0.4: - assert (x.dim() == 1) and (len(x) == 1) - return x[0] - - return x.item() diff --git a/legacy/component/__init__.py b/legacy/component/__init__.py deleted file mode 100644 index c6784aef..00000000 --- a/legacy/component/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .bert_tokenizer import BertTokenizer diff --git a/legacy/component/bert_tokenizer.py b/legacy/component/bert_tokenizer.py deleted file mode 100644 index 6354076d..00000000 --- a/legacy/component/bert_tokenizer.py +++ /dev/null @@ -1,378 +0,0 @@ -""" -bert_tokenizer.py is modified from huggingface/pytorch-pretrained-BERT, which is licensed under 
the Apache License 2.0. -""" -import collections -import os -import unicodedata -from io import open - - -PRETRAINED_VOCAB_ARCHIVE_MAP = { - 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", - 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", - 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", - 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", - 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", - 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", - 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", -} -PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { - 'bert-base-uncased': 512, - 'bert-large-uncased': 512, - 'bert-base-cased': 512, - 'bert-large-cased': 512, - 'bert-base-multilingual-uncased': 512, - 'bert-base-multilingual-cased': 512, - 'bert-base-chinese': 512, -} -VOCAB_NAME = 'vocab.txt' - - -def load_vocab(vocab_file): - """Loads a vocabulary file into a dictionary.""" - vocab = collections.OrderedDict() - index = 0 - with open(vocab_file, "r", encoding="utf-8") as reader: - while True: - token = reader.readline() - if not token: - break - token = token.strip() - vocab[token] = index - index += 1 - return vocab - - -def whitespace_tokenize(text): - """Runs basic whitespace cleaning and splitting on a piece of text.""" - text = text.strip() - if not text: - return [] - tokens = text.split() - return tokens - - -class BertTokenizer(object): - """Runs end-to-end tokenization: punctuation splitting + wordpiece""" - - def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokenize=True, - never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): - """Constructs a BertTokenizer. - Args: - vocab_file: Path to a one-wordpiece-per-line vocabulary file - do_lower_case: Whether to lower case the input - Only has an effect when do_wordpiece_only=False - do_basic_tokenize: Whether to do basic tokenization before wordpiece. - max_len: An artificial maximum length to truncate tokenized sequences to; - Effective maximum length is always the minimum of this - value (if specified) and the underlying BERT model's - sequence length. - never_split: List of tokens which will never be split during tokenization. - Only has an effect when do_wordpiece_only=False - """ - if not os.path.isfile(vocab_file): - raise ValueError( - "Can't find a vocabulary file at path '{}'. 
To load the vocabulary from a Google pretrained " - "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) - self.vocab = load_vocab(vocab_file) - self.ids_to_tokens = collections.OrderedDict( - [(ids, tok) for tok, ids in self.vocab.items()]) - self.do_basic_tokenize = do_basic_tokenize - if do_basic_tokenize: - self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, - never_split=never_split) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) - self.max_len = max_len if max_len is not None else int(1e12) - - def tokenize(self, text): - split_tokens = [] - if self.do_basic_tokenize: - for token in self.basic_tokenizer.tokenize(text): - for sub_token in self.wordpiece_tokenizer.tokenize(token): - split_tokens.append(sub_token) - else: - split_tokens = self.wordpiece_tokenizer.tokenize(text) - return split_tokens - - def convert_tokens_to_ids(self, tokens): - """Converts a sequence of tokens into ids using the vocab.""" - ids = [] - for token in tokens: - ids.append(self.vocab[token]) - if len(ids) > self.max_len: - print( - "WARNING!\n\"" - "Token indices sequence length is longer than the specified maximum " - "sequence length for this BERT model ({} > {}). Running this" - " sequence through BERT will result in indexing errors".format(len(ids), self.max_len) - ) - return ids - - def convert_ids_to_tokens(self, ids): - """Converts a sequence of ids in wordpiece tokens using the vocab.""" - tokens = [] - for i in ids: - tokens.append(self.ids_to_tokens[i]) - return tokens - - def save_vocabulary(self, vocab_path): - """Save the tokenizer vocabulary to a directory or file.""" - index = 0 - if os.path.isdir(vocab_path): - vocab_file = os.path.join(vocab_path, VOCAB_NAME) - with open(vocab_file, "w", encoding="utf-8") as writer: - for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): - if index != token_index: - print("Saving vocabulary to {}: vocabulary indices are not consecutive." - " Please check that the vocabulary is not corrupted!".format(vocab_file)) - index = token_index - writer.write(token + u'\n') - index += 1 - return vocab_file - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): - """ - Instantiate a PreTrainedBertModel from a pre-trained model file. - Download and cache the pre-trained model file if needed. - """ - if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: - vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path] - if '-cased' in pretrained_model_name_or_path and kwargs.get('do_lower_case', True): - print("The pre-trained model you are loading is a cased model but you have not set " - "`do_lower_case` to False. We are setting `do_lower_case=False` for you but " - "you may want to check this behavior.") - kwargs['do_lower_case'] = False - elif '-cased' not in pretrained_model_name_or_path and not kwargs.get('do_lower_case', True): - print("The pre-trained model you are loading is an uncased model but you have set " - "`do_lower_case` to False. 
We are setting `do_lower_case=True` for you " - "but you may want to check this behavior.") - kwargs['do_lower_case'] = True - else: - vocab_file = pretrained_model_name_or_path - if os.path.isdir(vocab_file): - vocab_file = os.path.join(vocab_file, VOCAB_NAME) - # redirect to the cache, if necessary - resolved_vocab_file = vocab_file - print("loading vocabulary file {}".format(vocab_file)) - if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: - # if we're using a pretrained model, ensure the tokenizer wont index sequences longer - # than the number of positional embeddings - max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path] - kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) - # Instantiate tokenizer. - tokenizer = cls(resolved_vocab_file, *inputs, **kwargs) - return tokenizer - - -class BasicTokenizer(object): - """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" - - def __init__(self, - do_lower_case=True, - never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): - """Constructs a BasicTokenizer. - Args: - do_lower_case: Whether to lower case the input. - """ - self.do_lower_case = do_lower_case - self.never_split = never_split - - def tokenize(self, text): - """Tokenizes a piece of text.""" - text = self._clean_text(text) - # This was added on November 1st, 2018 for the multilingual and Chinese - # models. This is also applied to the English models now, but it doesn't - # matter since the English models were not trained on any Chinese data - # and generally don't have any Chinese data in them (there are Chinese - # characters in the vocabulary because Wikipedia does have some Chinese - # words in the English Wikipedia.). - text = self._tokenize_chinese_chars(text) - orig_tokens = whitespace_tokenize(text) - split_tokens = [] - for token in orig_tokens: - if self.do_lower_case and token not in self.never_split: - token = token.lower() - token = self._run_strip_accents(token) - split_tokens.extend(self._run_split_on_punc(token)) - - output_tokens = whitespace_tokenize(" ".join(split_tokens)) - return output_tokens - - def _run_strip_accents(self, text): - """Strips accents from a piece of text.""" - text = unicodedata.normalize("NFD", text) - output = [] - for char in text: - cat = unicodedata.category(char) - if cat == "Mn": - continue - output.append(char) - return "".join(output) - - def _run_split_on_punc(self, text): - """Splits punctuation on a piece of text.""" - if text in self.never_split: - return [text] - chars = list(text) - i = 0 - start_new_word = True - output = [] - while i < len(chars): - char = chars[i] - if _is_punctuation(char): - output.append([char]) - start_new_word = True - else: - if start_new_word: - output.append([]) - start_new_word = False - output[-1].append(char) - i += 1 - - return ["".join(x) for x in output] - - def _tokenize_chinese_chars(self, text): - """Adds whitespace around any CJK character.""" - output = [] - for char in text: - cp = ord(char) - if self._is_chinese_char(cp): - output.append(" ") - output.append(char) - output.append(" ") - else: - output.append(char) - return "".join(output) - - def _is_chinese_char(self, cp): - """Checks whether CP is the codepoint of a CJK character.""" - # This defines a "chinese character" as anything in the CJK Unicode block: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - # - # Note that the CJK Unicode block is NOT all Japanese and Korean characters, - # despite 
its name. The modern Korean Hangul alphabet is a different block, - # as is Japanese Hiragana and Katakana. Those alphabets are used to write - # space-separated words, so they are not treated specially and handled - # like the all of the other languages. - if ((cp >= 0x4E00 and cp <= 0x9FFF) or # - (cp >= 0x3400 and cp <= 0x4DBF) or # - (cp >= 0x20000 and cp <= 0x2A6DF) or # - (cp >= 0x2A700 and cp <= 0x2B73F) or # - (cp >= 0x2B740 and cp <= 0x2B81F) or # - (cp >= 0x2B820 and cp <= 0x2CEAF) or - (cp >= 0xF900 and cp <= 0xFAFF) or # - (cp >= 0x2F800 and cp <= 0x2FA1F)): # - return True - - return False - - def _clean_text(self, text): - """Performs invalid character removal and whitespace cleanup on text.""" - output = [] - for char in text: - cp = ord(char) - if cp == 0 or cp == 0xfffd or _is_control(char): - continue - if _is_whitespace(char): - output.append(" ") - else: - output.append(char) - return "".join(output) - - -class WordpieceTokenizer(object): - """Runs WordPiece tokenization.""" - - def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): - self.vocab = vocab - self.unk_token = unk_token - self.max_input_chars_per_word = max_input_chars_per_word - - def tokenize(self, text): - """Tokenizes a piece of text into its word pieces. - This uses a greedy longest-match-first algorithm to perform tokenization - using the given vocabulary. - For example: - input = "unaffable" - output = ["un", "##aff", "##able"] - Args: - text: A single token or whitespace separated tokens. This should have - already been passed through `BasicTokenizer`. - Returns: - A list of wordpiece tokens. - """ - - output_tokens = [] - for token in whitespace_tokenize(text): - chars = list(token) - if len(chars) > self.max_input_chars_per_word: - output_tokens.append(self.unk_token) - continue - - is_bad = False - start = 0 - sub_tokens = [] - while start < len(chars): - end = len(chars) - cur_substr = None - while start < end: - substr = "".join(chars[start:end]) - if start > 0: - substr = "##" + substr - if substr in self.vocab: - cur_substr = substr - break - end -= 1 - if cur_substr is None: - is_bad = True - break - sub_tokens.append(cur_substr) - start = end - - if is_bad: - output_tokens.append(self.unk_token) - else: - output_tokens.extend(sub_tokens) - return output_tokens - - -def _is_whitespace(char): - """Checks whether `chars` is a whitespace character.""" - # \t, \n, and \r are technically contorl characters but we treat them - # as whitespace since they are generally considered as such. - if char == " " or char == "\t" or char == "\n" or char == "\r": - return True - cat = unicodedata.category(char) - if cat == "Zs": - return True - return False - - -def _is_control(char): - """Checks whether `chars` is a control character.""" - # These are technically control characters but we count them as whitespace - # characters. - if char == "\t" or char == "\n" or char == "\r": - return False - cat = unicodedata.category(char) - if cat.startswith("C"): - return True - return False - - -def _is_punctuation(char): - """Checks whether `chars` is a punctuation character.""" - cp = ord(char) - # We treat all non-letter/number ASCII as punctuation. - # Characters such as "^", "$", and "`" are not in the Unicode - # Punctuation class but we treat them as punctuation anyways, for - # consistency. 
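    # (The four ASCII ranges below cover !"#$%&'()*+,-./ , :;<=>?@ , [\]^_`
    #  and {|}~ respectively.)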
- if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or - (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): - return True - cat = unicodedata.category(char) - if cat.startswith("P"): - return True - return False - diff --git a/test/core/test_dataset.py b/test/core/test_dataset.py index 9c05c334..059d52d2 100644 --- a/test/core/test_dataset.py +++ b/test/core/test_dataset.py @@ -182,8 +182,9 @@ class TestDataSetMethods(unittest.TestCase): def test_apply2(self): def split_sent(ins): return ins['raw_sentence'].split() - csv_loader = CSVLoader(headers=['raw_sentence', 'label'],sep='\t') - dataset = csv_loader.load('test/data_for_tests/tutorial_sample_dataset.csv') + csv_loader = CSVLoader(headers=['raw_sentence', 'label'], sep='\t') + data_bundle = csv_loader.load('test/data_for_tests/tutorial_sample_dataset.csv') + dataset = data_bundle.datasets['train'] dataset.drop(lambda x: len(x['raw_sentence'].split()) == 0, inplace=True) dataset.apply(split_sent, new_field_name='words', is_input=True) # print(dataset) diff --git a/test/io/test_data_loader.py b/test/io/test_data_loader.py deleted file mode 100644 index 5b1bb749..00000000 --- a/test/io/test_data_loader.py +++ /dev/null @@ -1,15 +0,0 @@ -import unittest - -from fastNLP.core.const import Const -from fastNLP.io.data_loader import MNLILoader - - -class TestDataLoader(unittest.TestCase): - - def test_mnli_loader(self): - ds = MNLILoader().process('test/data_for_tests/sample_mnli.tsv', - to_lower=True, get_index=True, seq_len_type='mask') - self.assertTrue('train' in ds.datasets) - self.assertTrue(len(ds.datasets) == 1) - self.assertTrue(len(ds.datasets['train']) == 11) - self.assertTrue(isinstance(ds.datasets['train'][0][Const.INPUT_LENS(0)], list)) diff --git a/test/io/test_dataset_loader.py b/test/io/test_dataset_loader.py deleted file mode 100644 index 6fb8e4f7..00000000 --- a/test/io/test_dataset_loader.py +++ /dev/null @@ -1,77 +0,0 @@ -import unittest -import os -from fastNLP.io import CSVLoader, JsonLoader -from fastNLP.io.data_loader import SSTLoader, SNLILoader, Conll2003Loader, PeopleDailyCorpusLoader - - -class TestDatasetLoader(unittest.TestCase): - - def test_Conll2003Loader(self): - """ - Test the the loader of Conll2003 dataset - """ - dataset_path = "test/data_for_tests/conll_2003_example.txt" - loader = Conll2003Loader() - dataset_2003 = loader.load(dataset_path) - - def test_PeopleDailyCorpusLoader(self): - data_set = PeopleDailyCorpusLoader().load("test/data_for_tests/people_daily_raw.txt") - - def test_CSVLoader(self): - ds = CSVLoader(sep='\t', headers=['words', 'label']) \ - .load('test/data_for_tests/tutorial_sample_dataset.csv') - assert len(ds) > 0 - - def test_SNLILoader(self): - ds = SNLILoader().load('test/data_for_tests/sample_snli.jsonl') - assert len(ds) == 3 - - def test_JsonLoader(self): - ds = JsonLoader().load('test/data_for_tests/sample_snli.jsonl') - assert len(ds) == 3 - - def no_test_SST(self): - train_data = """(3 (2 (2 The) (2 Rock)) (4 (3 (2 is) (4 (2 destined) (2 (2 (2 (2 (2 to) (2 (2 be) (2 (2 the) (2 (2 21st) (2 (2 (2 Century) (2 's)) (2 (3 new) (2 (2 ``) (2 Conan)))))))) (2 '')) (2 and)) (3 (2 that) (3 (2 he) (3 (2 's) (3 (2 going) (3 (2 to) (4 (3 (2 make) (3 (3 (2 a) (3 splash)) (2 (2 even) (3 greater)))) (2 (2 than) (2 (2 (2 (2 (1 (2 Arnold) (2 Schwarzenegger)) (2 ,)) (2 (2 Jean-Claud) (2 (2 Van) (2 Damme)))) (2 or)) (2 (2 Steven) (2 Segal))))))))))))) (2 .))) -(4 (4 (4 (2 The) (4 (3 gorgeously) (3 (2 elaborate) (2 continuation)))) (2 (2 (2 of) (2 ``)) (2 (2 The) (2 (2 (2 Lord) (2 (2 of) 
(2 (2 the) (2 Rings)))) (2 (2 '') (2 trilogy)))))) (2 (3 (2 (2 is) (2 (2 so) (2 huge))) (2 (2 that) (3 (2 (2 (2 a) (2 column)) (2 (2 of) (2 words))) (2 (2 (2 (2 can) (1 not)) (3 adequately)) (2 (2 describe) (2 (3 (2 (2 co-writer\/director) (2 (2 Peter) (3 (2 Jackson) (2 's)))) (3 (2 expanded) (2 vision))) (2 (2 of) (2 (2 (2 J.R.R.) (2 (2 Tolkien) (2 's))) (2 Middle-earth))))))))) (2 .))) -(3 (3 (2 (2 (2 (2 (2 Singer\/composer) (2 (2 Bryan) (2 Adams))) (2 (2 contributes) (2 (2 (2 a) (2 slew)) (2 (2 of) (2 songs))))) (2 (2 --) (2 (2 (2 (2 a) (2 (2 few) (3 potential))) (2 (2 (2 hits) (2 ,)) (2 (2 (2 a) (2 few)) (1 (1 (2 more) (1 (2 simply) (2 intrusive))) (2 (2 to) (2 (2 the) (2 story))))))) (2 --)))) (2 but)) (3 (4 (2 the) (3 (2 whole) (2 package))) (2 (3 certainly) (3 (2 captures) (2 (1 (2 the) (2 (2 (2 intended) (2 (2 ,) (2 (2 er) (2 ,)))) (3 spirit))) (2 (2 of) (2 (2 the) (2 piece)))))))) (2 .)) -(2 (2 (2 You) (2 (2 'd) (2 (2 think) (2 (2 by) (2 now))))) (2 (2 America) (2 (2 (2 would) (1 (2 have) (2 (2 (2 had) (1 (2 enough) (2 (2 of) (2 (2 plucky) (2 (2 British) (1 eccentrics)))))) (4 (2 with) (4 (3 hearts) (3 (2 of) (3 gold))))))) (2 .)))) -""" - test_data = """(3 (2 Yet) (3 (2 (2 the) (2 act)) (3 (4 (3 (2 is) (3 (2 still) (4 charming))) (2 here)) (2 .)))) -(4 (2 (2 Whether) (2 (2 (2 (2 or) (1 not)) (3 (2 you) (2 (2 're) (3 (3 enlightened) (2 (2 by) (2 (2 any) (2 (2 of) (2 (2 Derrida) (2 's))))))))) (2 (2 lectures) (2 (2 on) (2 (2 ``) (2 (2 (2 (2 (2 (2 the) (2 other)) (2 '')) (2 and)) (2 ``)) (2 (2 the) (2 self)))))))) (3 (2 ,) (3 (2 '') (3 (2 Derrida) (3 (3 (2 is) (4 (2 an) (4 (4 (2 undeniably) (3 (4 (3 fascinating) (2 and)) (4 playful))) (2 fellow)))) (2 .)))))) -(4 (3 (2 (2 Just) (2 (2 the) (2 labour))) (3 (2 involved) (3 (2 in) (4 (2 creating) (3 (3 (2 the) (3 (3 layered) (2 richness))) (3 (2 of) (3 (2 (2 the) (2 imagery)) (2 (2 in) (3 (2 (2 this) (2 chiaroscuro)) (2 (2 of) (2 (2 (2 madness) (2 and)) (2 light)))))))))))) (3 (3 (2 is) (4 astonishing)) (2 .))) -(3 (3 (2 Part) (3 (2 of) (4 (2 (2 the) (3 charm)) (2 (2 of) (2 (2 Satin) (2 Rouge)))))) (3 (3 (2 is) (3 (2 that) (3 (2 it) (2 (1 (2 avoids) (2 (2 the) (1 obvious))) (3 (2 with) (3 (3 (3 humour) (2 and)) (2 lightness))))))) (2 .))) -(4 (2 (2 a) (2 (2 screenplay) (2 more))) (3 (4 ingeniously) (2 (2 constructed) (2 (2 (2 (2 than) (2 ``)) (2 Memento)) (2 ''))))) -(3 (2 ``) (3 (2 (2 Extreme) (2 Ops)) (3 (2 '') (4 (4 (3 exceeds) (2 expectations)) (2 .))))) -""" - train, test = 'train--', 'test--' - with open(train, 'w', encoding='utf-8') as f: - f.write(train_data) - with open(test, 'w', encoding='utf-8') as f: - f.write(test_data) - - loader = SSTLoader() - info = loader.process( - {train: train, test: test}, - train_ds=[train], - src_vocab_op=dict(min_freq=2) - ) - assert len(list(info.vocabs.items())) == 2 - assert len(list(info.datasets.items())) == 2 - print(info.vocabs) - print(info.datasets) - os.remove(train), os.remove(test) - - # def test_import(self): - # import fastNLP - # from fastNLP.io import SNLILoader - # ds = SNLILoader().process('test/data_for_tests/sample_snli.jsonl', to_lower=True, - # get_index=True, seq_len_type='seq_len', extra_split=['-']) - # assert 'train' in ds.datasets - # assert len(ds.datasets) == 1 - # assert len(ds.datasets['train']) == 3 - # - # ds = SNLILoader().process('test/data_for_tests/sample_snli.jsonl', to_lower=True, - # get_index=True, seq_len_type='seq_len') - # assert 'train' in ds.datasets - # assert len(ds.datasets) == 1 - # assert len(ds.datasets['train']) == 3 From 
39de27f472fab631b97b47d4934b05f10019b081 Mon Sep 17 00:00:00 2001 From: Yige Xu Date: Thu, 29 Aug 2019 08:19:36 +0800 Subject: [PATCH 120/153] Update BertModel.from_pretrained function. Now can pass a model_dir_or_name instead of model_dir. --- fastNLP/embeddings/bert_embedding.py | 29 ++++------ fastNLP/modules/encoder/bert.py | 81 ++++++++++++++-------------- 2 files changed, 52 insertions(+), 58 deletions(-) diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index b1b1a200..e15c15f5 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -18,7 +18,7 @@ from itertools import chain from ..core.vocabulary import Vocabulary from ..io.file_utils import _get_embedding_url, cached_path, PRETRAINED_BERT_MODEL_DIR -from ..modules.encoder.bert import _WordPieceBertModel, BertModel, BertTokenizer +from ..modules.encoder.bert import _WordPieceBertModel, BertModel, BertTokenizer, _get_bert_dir from .contextual_embedding import ContextualEmbedding import warnings from ..core import logger @@ -70,19 +70,16 @@ class BertEmbedding(ContextualEmbedding): pool_method: str = 'first', word_dropout=0, dropout=0, include_cls_sep: bool = False, pooled_cls=True, requires_grad: bool = False, auto_truncate: bool = False): super(BertEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) - - # 根据model_dir_or_name检查是否存在并下载 + if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR: if 'cn' in model_dir_or_name.lower() and pool_method not in ('first', 'last'): + logger.warn("For Chinese bert, pooled_method should choose from 'first', 'last' in order to achieve" + " faster speed.") warnings.warn("For Chinese bert, pooled_method should choose from 'first', 'last' in order to achieve" " faster speed.") - model_url = _get_embedding_url('bert', model_dir_or_name.lower()) - model_dir = cached_path(model_url, name='embedding') - # 检查是否存在 - elif os.path.isdir(os.path.abspath(os.path.expanduser(model_dir_or_name))): - model_dir = os.path.abspath(os.path.expanduser(model_dir_or_name)) - else: - raise ValueError(f"Cannot recognize {model_dir_or_name}.") + + # 根据model_dir_or_name检查是否存在并下载 + model_dir = _get_bert_dir(model_dir_or_name) self._word_sep_index = None if '[SEP]' in vocab: @@ -173,15 +170,9 @@ class BertWordPieceEncoder(nn.Module): def __init__(self, model_dir_or_name: str = 'en-base-uncased', layers: str = '-1', pooled_cls: bool = False, word_dropout=0, dropout=0, requires_grad: bool = False): super().__init__() - - if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR: - model_url = _get_embedding_url('bert', model_dir_or_name.lower()) - model_dir = cached_path(model_url, name='embedding') - # 检查是否存在 - elif os.path.isdir(os.path.expanduser(os.path.abspath(model_dir_or_name))): - model_dir = model_dir_or_name - else: - raise ValueError(f"Cannot recognize {model_dir_or_name}.") + + # 根据model_dir_or_name检查是否存在并下载 + model_dir = _get_bert_dir(model_dir_or_name) self.model = _WordPieceBertModel(model_dir=model_dir, layers=layers, pooled_cls=pooled_cls) self._sep_index = self.model._sep_index diff --git a/fastNLP/modules/encoder/bert.py b/fastNLP/modules/encoder/bert.py index 5026f48a..89a1b09d 100644 --- a/fastNLP/modules/encoder/bert.py +++ b/fastNLP/modules/encoder/bert.py @@ -18,13 +18,13 @@ import torch from torch import nn from ..utils import _get_file_name_base_on_postfix +from ...io.file_utils import _get_embedding_url, cached_path, PRETRAINED_BERT_MODEL_DIR from ...core import logger CONFIG_FILE = 
'bert_config.json' VOCAB_NAME = 'vocab.txt' - class BertConfig(object): """Configuration class to store the configuration of a `BertModel`. """ @@ -133,6 +133,19 @@ def swish(x): ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} +def _get_bert_dir(model_dir_or_name: str = 'en-base-uncased'): + if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR: + model_url = _get_embedding_url('bert', model_dir_or_name.lower()) + model_dir = cached_path(model_url, name='embedding') + # 检查是否存在 + elif os.path.isdir(os.path.abspath(os.path.expanduser(model_dir_or_name))): + model_dir = os.path.abspath(os.path.expanduser(model_dir_or_name)) + else: + logger.error(f"Cannot recognize BERT dir or name ``{model_dir_or_name}``.") + raise ValueError(f"Cannot recognize BERT dir or name ``{model_dir_or_name}``.") + return model_dir + + class BertLayerNorm(nn.Module): def __init__(self, hidden_size, eps=1e-12): """Construct a layernorm module in the TF style (epsilon inside the square root). @@ -339,27 +352,9 @@ class BertModel(nn.Module): BERT(Bidirectional Embedding Representations from Transformers). - 如果你想使用预训练好的权重矩阵,请在以下网址下载. - sources:: - - 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin", - 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin", - 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin", - 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin", - 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin", - 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin", - 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin", - 'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-pytorch_model.bin", - 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin", - 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin", - 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin", - 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin", - 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin" - - 用预训练权重矩阵来建立BERT模型:: - model = BertModel.from_pretrained("path/to/weights/directory") + model = BertModel.from_pretrained(model_dir_or_name) 用随机初始化权重矩阵来建立BERT模型:: @@ -440,11 +435,15 @@ class BertModel(nn.Module): return encoded_layers, pooled_output @classmethod - def from_pretrained(cls, pretrained_model_dir, *inputs, **kwargs): + def from_pretrained(cls, pretrained_model_dir_or_name, *inputs, **kwargs): state_dict = kwargs.get('state_dict', None) kwargs.pop('state_dict', None) kwargs.pop('cache_dir', None) kwargs.pop('from_tf', None) + + # get model dir from name or dir + 
pretrained_model_dir = _get_bert_dir(pretrained_model_dir_or_name) + # Load config config_file = _get_file_name_base_on_postfix(pretrained_model_dir, '.json') config = BertConfig.from_json_file(config_file) @@ -493,6 +492,8 @@ class BertModel(nn.Module): if len(unexpected_keys) > 0: logger.warn("Weights from pretrained model not used in {}: {}".format( model.__class__.__name__, unexpected_keys)) + + logger.info(f"Load pre-trained BERT parameters from dir {pretrained_model_dir}.") return model @@ -562,7 +563,7 @@ class WordpieceTokenizer(object): output_tokens.append(self.unk_token) else: output_tokens.extend(sub_tokens) - if len(output_tokens)==0: #防止里面全是空格或者回车符号 + if len(output_tokens) == 0: # 防止里面全是空格或者回车符号 return [self.unk_token] return output_tokens @@ -673,14 +674,14 @@ class BasicTokenizer(object): # as is Japanese Hiragana and Katakana. Those alphabets are used to write # space-separated words, so they are not treated specially and handled # like the all of the other languages. - if ((cp >= 0x4E00 and cp <= 0x9FFF) or # - (cp >= 0x3400 and cp <= 0x4DBF) or # - (cp >= 0x20000 and cp <= 0x2A6DF) or # - (cp >= 0x2A700 and cp <= 0x2B73F) or # - (cp >= 0x2B740 and cp <= 0x2B81F) or # - (cp >= 0x2B820 and cp <= 0x2CEAF) or - (cp >= 0xF900 and cp <= 0xFAFF) or # - (cp >= 0x2F800 and cp <= 0x2FA1F)): # + if (((cp >= 0x4E00) and (cp <= 0x9FFF)) or # + ((cp >= 0x3400) and (cp <= 0x4DBF)) or # + ((cp >= 0x20000) and (cp <= 0x2A6DF)) or # + ((cp >= 0x2A700) and (cp <= 0x2B73F)) or # + ((cp >= 0x2B740) and (cp <= 0x2B81F)) or # + ((cp >= 0x2B820) and (cp <= 0x2CEAF)) or + ((cp >= 0xF900) and (cp <= 0xFAFF)) or # + ((cp >= 0x2F800) and (cp <= 0x2FA1F))): # return True return False @@ -730,8 +731,8 @@ def _is_punctuation(char): # Characters such as "^", "$", and "`" are not in the Unicode # Punctuation class but we treat them as punctuation anyways, for # consistency. - if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or - (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + if (((cp >= 33) and (cp <= 47)) or ((cp >= 58) and (cp <= 64)) or + ((cp >= 91) and (cp <= 96)) or ((cp >= 123) and (cp <= 126))): return True cat = unicodedata.category(char) if cat.startswith("P"): @@ -830,11 +831,11 @@ class BertTokenizer(object): return vocab_file @classmethod - def from_pretrained(cls, model_dir, *inputs, **kwargs): + def from_pretrained(cls, model_dir_or_name, *inputs, **kwargs): """ - 给定path,直接读取vocab. - + 给定模型的名字或者路径,直接读取vocab. """ + model_dir = _get_bert_dir(model_dir_or_name) pretrained_model_name_or_path = _get_file_name_base_on_postfix(model_dir, '.txt') logger.info("loading vocabulary file {}".format(pretrained_model_name_or_path)) max_len = 512 @@ -843,17 +844,19 @@ class BertTokenizer(object): tokenizer = cls(pretrained_model_name_or_path, *inputs, **kwargs) return tokenizer + class _WordPieceBertModel(nn.Module): """ 这个模块用于直接计算word_piece的结果. 
""" - def __init__(self, model_dir: str, layers: str = '-1', pooled_cls:bool=False): + def __init__(self, model_dir_or_name: str, layers: str = '-1', pooled_cls: bool=False): super().__init__() - self.tokenzier = BertTokenizer.from_pretrained(model_dir) - self.encoder = BertModel.from_pretrained(model_dir) + self.model_dir = _get_bert_dir(model_dir_or_name) + self.tokenzier = BertTokenizer.from_pretrained(self.model_dir) + self.encoder = BertModel.from_pretrained(self.model_dir) # 检查encoder_layer_number是否合理 encoder_layer_number = len(self.encoder.encoder.layer) self.layers = list(map(int, layers.split(','))) @@ -914,7 +917,7 @@ class _WordPieceBertModel(nn.Module): attn_masks = word_pieces.ne(self._wordpiece_pad_index) bert_outputs, pooled_cls = self.encoder(word_pieces, token_type_ids=token_type_ids, attention_mask=attn_masks, - output_all_encoded_layers=True) + output_all_encoded_layers=True) # output_layers = [self.layers] # len(self.layers) x batch_size x max_word_piece_length x hidden_size outputs = bert_outputs[0].new_zeros((len(self.layers), batch_size, max_len, bert_outputs[0].size(-1))) for l_index, l in enumerate(self.layers): From 09d0b74595c8273b8bcb3af48a84cdcd5e6c982e Mon Sep 17 00:00:00 2001 From: yhcc Date: Thu, 29 Aug 2019 09:56:36 +0800 Subject: [PATCH 121/153] Update .travis.yml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TRAVIS默认已经加入了 --- .travis.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 0d63417a..210d158a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,9 +1,6 @@ language: python python: - "3.6" - -env: - - TRAVIS=1 # command to install dependencies install: - pip install --quiet -r requirements.txt From 146a004deee58f139ba7317e7b66740a709947ba Mon Sep 17 00:00:00 2001 From: yh_cc Date: Thu, 29 Aug 2019 10:12:30 +0800 Subject: [PATCH 122/153] =?UTF-8?q?=E4=BF=AE=E6=94=B9travis=20converage?= =?UTF-8?q?=E8=AE=BE=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .coverage | 1 + .travis.yml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 .coverage diff --git a/.coverage b/.coverage new file mode 100644 index 00000000..a6d89bc8 --- /dev/null +++ b/.coverage @@ -0,0 +1 @@ +!coverage.py: This is a private format, don't read it 
directly!
\ No newline at end of file diff --git a/.travis.yml b/.travis.yml index 210d158a..bd7a34f5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,7 +8,7 @@ install: - pip install pytest-cov # command to run tests script: - - pytest --cov=./ test/ + - pytest --cov=fastNLP test/ after_success: - bash <(curl -s https://codecov.io/bash) From 1756e3ffdf1ffa7ac4d296883fc5ebf4e3ad38c9 Mon Sep 17 00:00:00 2001 From: yh_cc Date: Thu, 29 Aug 2019 11:16:59 +0800 Subject: [PATCH 123/153] =?UTF-8?q?1.=E4=BF=AE=E5=A4=8DMNLILoader=E4=B8=AD?= =?UTF-8?q?=E7=9A=84bug;=202.=E4=BF=AE=E5=A4=8Dfield=E4=B8=AD=E7=9A=84tens?= =?UTF-8?q?or=20warning?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/field.py | 6 +++--- fastNLP/core/vocabulary.py | 4 ++-- fastNLP/io/loader/matching.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fastNLP/core/field.py b/fastNLP/core/field.py index 05f987c2..859dfb1f 100644 --- a/fastNLP/core/field.py +++ b/fastNLP/core/field.py @@ -595,7 +595,7 @@ class AutoPadder(Padder): max_len = max(map(len, contents)) tensor = torch.full((len(contents), max_len), fill_value=self.pad_val, dtype=field_ele_dtype) for i, content_i in enumerate(contents): - tensor[i, :len(content_i)] = torch.tensor(content_i) + tensor[i, :len(content_i)] = content_i.clone().detach() elif dim == 2: max_len = max(map(len, contents)) max_word_len = max([max([len(content_ii) for content_ii in content_i]) for @@ -604,7 +604,7 @@ class AutoPadder(Padder): dtype=field_ele_dtype) for i, content_i in enumerate(contents): for j, content_ii in enumerate(content_i): - tensor[i, j, :len(content_ii)] = torch.tensor(content_ii) + tensor[i, j, :len(content_ii)] = content_ii.clone().detach() else: shapes = set([np.shape(content_i) for content_i in contents]) if len(shapes) > 1: @@ -615,7 +615,7 @@ class AutoPadder(Padder): tensor = torch.full([len(contents)] + list(shape), fill_value=self.pad_val, dtype=field_ele_dtype) for i, content_i in enumerate(contents): - tensor[i] = torch.tensor(content_i, dtype=field_ele_dtype) + tensor[i] = content_i.clone().detach().to(field_ele_dtype) else: raise RuntimeError( f"Field:{field_name} has 3 dimensions, every sample should have the same shape.") diff --git a/fastNLP/core/vocabulary.py b/fastNLP/core/vocabulary.py index 52d33a5a..cd4f2c0f 100644 --- a/fastNLP/core/vocabulary.py +++ b/fastNLP/core/vocabulary.py @@ -253,7 +253,7 @@ class Vocabulary(object): if self.unknown is not None: return self.word2idx[self.unknown] else: -
raise ValueError("word {} not in vocabulary".format(w)) + raise ValueError("word `{}` not in vocabulary".format(w)) @_check_build_vocab def index_dataset(self, *datasets, field_name, new_field_name=None): @@ -360,7 +360,7 @@ class Vocabulary(object): try: dataset.apply(construct_vocab) except BaseException as e: - log("When processing the `{}` dataset, the following error occurred:".format(idx)) + logger.error("When processing the `{}` dataset, the following error occurred:".format(idx)) raise e else: raise TypeError("Only DataSet type is allowed.") diff --git a/fastNLP/io/loader/matching.py b/fastNLP/io/loader/matching.py index 7f03ca3e..a21d0845 100644 --- a/fastNLP/io/loader/matching.py +++ b/fastNLP/io/loader/matching.py @@ -41,7 +41,7 @@ class MNLILoader(Loader): ds = DataSet() with open(path, 'r', encoding='utf-8') as f: f.readline() # 跳过header - if path.endswith("test.tsv"): + if path.endswith("test_matched.tsv") or path.endswith('test_mismatched.tsv'): warnings.warn("RTE's test file has no target.") for line in f: line = line.strip() From 0908c736ebc1a2afb9c36c908391943b08a45e95 Mon Sep 17 00:00:00 2001 From: Yige Xu Date: Thu, 29 Aug 2019 16:38:17 +0800 Subject: [PATCH 124/153] fix code in BertModel.from_pretrained and BertEmbedding --- fastNLP/embeddings/bert_embedding.py | 20 +++++++------------- fastNLP/modules/encoder/bert.py | 12 +++++++----- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index e15c15f5..d1a5514a 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -17,8 +17,8 @@ import numpy as np from itertools import chain from ..core.vocabulary import Vocabulary -from ..io.file_utils import _get_embedding_url, cached_path, PRETRAINED_BERT_MODEL_DIR -from ..modules.encoder.bert import _WordPieceBertModel, BertModel, BertTokenizer, _get_bert_dir +from ..io.file_utils import PRETRAINED_BERT_MODEL_DIR +from ..modules.encoder.bert import _WordPieceBertModel, BertModel, BertTokenizer from .contextual_embedding import ContextualEmbedding import warnings from ..core import logger @@ -77,15 +77,12 @@ class BertEmbedding(ContextualEmbedding): " faster speed.") warnings.warn("For Chinese bert, pooled_method should choose from 'first', 'last' in order to achieve" " faster speed.") - - # 根据model_dir_or_name检查是否存在并下载 - model_dir = _get_bert_dir(model_dir_or_name) self._word_sep_index = None if '[SEP]' in vocab: self._word_sep_index = vocab['[SEP]'] - self.model = _WordBertModel(model_dir=model_dir, vocab=vocab, layers=layers, + self.model = _WordBertModel(model_dir_or_name=model_dir_or_name, vocab=vocab, layers=layers, pool_method=pool_method, include_cls_sep=include_cls_sep, pooled_cls=pooled_cls, auto_truncate=auto_truncate, min_freq=2) @@ -170,11 +167,8 @@ class BertWordPieceEncoder(nn.Module): def __init__(self, model_dir_or_name: str = 'en-base-uncased', layers: str = '-1', pooled_cls: bool = False, word_dropout=0, dropout=0, requires_grad: bool = False): super().__init__() - - # 根据model_dir_or_name检查是否存在并下载 - model_dir = _get_bert_dir(model_dir_or_name) - self.model = _WordPieceBertModel(model_dir=model_dir, layers=layers, pooled_cls=pooled_cls) + self.model = _WordPieceBertModel(model_dir_or_name=model_dir_or_name, layers=layers, pooled_cls=pooled_cls) self._sep_index = self.model._sep_index self._wordpiece_pad_index = self.model._wordpiece_pad_index self._wordpiece_unk_index = self.model._wordpiece_unknown_index @@ -269,12 +263,12 @@ class 
BertWordPieceEncoder(nn.Module): class _WordBertModel(nn.Module): - def __init__(self, model_dir: str, vocab: Vocabulary, layers: str = '-1', pool_method: str = 'first', + def __init__(self, model_dir_or_name: str, vocab: Vocabulary, layers: str = '-1', pool_method: str = 'first', include_cls_sep: bool = False, pooled_cls: bool = False, auto_truncate: bool = False, min_freq=2): super().__init__() - self.tokenzier = BertTokenizer.from_pretrained(model_dir) - self.encoder = BertModel.from_pretrained(model_dir) + self.tokenzier = BertTokenizer.from_pretrained(model_dir_or_name) + self.encoder = BertModel.from_pretrained(model_dir_or_name) self._max_position_embeddings = self.encoder.config.max_position_embeddings # 检查encoder_layer_number是否合理 encoder_layer_number = len(self.encoder.encoder.layer) diff --git a/fastNLP/modules/encoder/bert.py b/fastNLP/modules/encoder/bert.py index 89a1b09d..e73a8172 100644 --- a/fastNLP/modules/encoder/bert.py +++ b/fastNLP/modules/encoder/bert.py @@ -143,7 +143,7 @@ def _get_bert_dir(model_dir_or_name: str = 'en-base-uncased'): else: logger.error(f"Cannot recognize BERT dir or name ``{model_dir_or_name}``.") raise ValueError(f"Cannot recognize BERT dir or name ``{model_dir_or_name}``.") - return model_dir + return str(model_dir) class BertLayerNorm(nn.Module): @@ -453,6 +453,9 @@ class BertModel(nn.Module): if state_dict is None: weights_path = _get_file_name_base_on_postfix(pretrained_model_dir, '.bin') state_dict = torch.load(weights_path, map_location='cpu') + else: + logger.error(f'Cannot load parameters through `state_dict` variable.') + raise RuntimeError(f'Cannot load parameters through `state_dict` variable.') old_keys = [] new_keys = [] @@ -493,7 +496,7 @@ class BertModel(nn.Module): logger.warn("Weights from pretrained model not used in {}: {}".format( model.__class__.__name__, unexpected_keys)) - logger.info(f"Load pre-trained BERT parameters from dir {pretrained_model_dir}.") + logger.info(f"Load pre-trained BERT parameters from file {weights_path}.") return model @@ -854,9 +857,8 @@ class _WordPieceBertModel(nn.Module): def __init__(self, model_dir_or_name: str, layers: str = '-1', pooled_cls: bool=False): super().__init__() - self.model_dir = _get_bert_dir(model_dir_or_name) - self.tokenzier = BertTokenizer.from_pretrained(self.model_dir) - self.encoder = BertModel.from_pretrained(self.model_dir) + self.tokenzier = BertTokenizer.from_pretrained(model_dir_or_name) + self.encoder = BertModel.from_pretrained(model_dir_or_name) # 检查encoder_layer_number是否合理 encoder_layer_number = len(self.encoder.encoder.layer) self.layers = list(map(int, layers.split(','))) From 9e6f4ffb8bf29020e7871f06eef4f8e0d32e3774 Mon Sep 17 00:00:00 2001 From: lyhuang18 <42239874+lyhuang18@users.noreply.github.com> Date: Fri, 30 Aug 2019 01:21:59 +0800 Subject: [PATCH 125/153] =?UTF-8?q?datasetloader=E6=94=B9=E6=88=90pipe?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../text_classification/train_awdlstm.py | 23 ++++++++--------- .../text_classification/train_lstm.py | 25 ++++++++----------- .../text_classification/train_lstm_att.py | 25 ++++++++----------- 3 files changed, 32 insertions(+), 41 deletions(-) diff --git a/reproduction/text_classification/train_awdlstm.py b/reproduction/text_classification/train_awdlstm.py index b2a67fdb..7537e6f7 100644 --- a/reproduction/text_classification/train_awdlstm.py +++ b/reproduction/text_classification/train_awdlstm.py @@ -1,11 +1,9 @@ # 这个模型需要在pytorch=0.4下运行,weight_drop不支持1.0 -# 
首先需要加入以下的路径到环境变量,因为当前只对内部测试开放,所以需要手动申明一下路径 -import os -os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/' -os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches' +import sys +sys.path.append('../..') -from fastNLP.io.data_loader import IMDBLoader +from fastNLP.io.pipe.classification import IMDBPipe from fastNLP.embeddings import StaticEmbedding from model.awd_lstm import AWDLSTMSentiment @@ -32,15 +30,14 @@ opt=Config() # load data -dataloader=IMDBLoader() -datainfo=dataloader.process(opt.datapath) +data_bundle=IMDBPipe.process_from_file(opt.datapath) -# print(datainfo.datasets["train"]) -# print(datainfo) +# print(data_bundle.datasets["train"]) +# print(data_bundle) # define model -vocab=datainfo.vocabs['words'] +vocab=data_bundle.vocabs['words'] embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-840b-300', requires_grad=True) model=AWDLSTMSentiment(init_embed=embed, num_classes=opt.num_classes, hidden_dim=opt.hidden_dim, num_layers=opt.num_layers, nfc=opt.nfc, wdrop=opt.wdrop) @@ -52,11 +49,11 @@ optimizer= Adam([param for param in model.parameters() if param.requires_grad==T def train(datainfo, model, optimizer, loss, metrics, opt): - trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss, - metrics=metrics, dev_data=datainfo.datasets['test'], device=0, check_code_level=-1, + trainer = Trainer(data_bundle.datasets['train'], model, optimizer=optimizer, loss=loss, + metrics=metrics, dev_data=data_bundle.datasets['test'], device=0, check_code_level=-1, n_epochs=opt.train_epoch, save_path=opt.save_model_path) trainer.train() if __name__ == "__main__": - train(datainfo, model, optimizer, loss, metrics, opt) + train(data_bundle, model, optimizer, loss, metrics, opt) diff --git a/reproduction/text_classification/train_lstm.py b/reproduction/text_classification/train_lstm.py index 40f77061..a23be0cb 100644 --- a/reproduction/text_classification/train_lstm.py +++ b/reproduction/text_classification/train_lstm.py @@ -1,9 +1,7 @@ -# 首先需要加入以下的路径到环境变量,因为当前只对内部测试开放,所以需要手动申明一下路径 -import os -os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/' -os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches' +import sys +sys.path.append('../..') -from fastNLP.io.data_loader import IMDBLoader +from fastNLP.io.pipe.classification import IMDBPipe from fastNLP.embeddings import StaticEmbedding from model.lstm import BiLSTMSentiment @@ -29,15 +27,14 @@ opt=Config() # load data -dataloader=IMDBLoader() -datainfo=dataloader.process(opt.datapath) +data_bundle=IMDBPipe.process_from_file(opt.datapath) -# print(datainfo.datasets["train"]) -# print(datainfo) +# print(data_bundle.datasets["train"]) +# print(data_bundle) # define model -vocab=datainfo.vocabs['words'] +vocab=data_bundle.vocabs['words'] embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-840b-300', requires_grad=True) model=BiLSTMSentiment(init_embed=embed, num_classes=opt.num_classes, hidden_dim=opt.hidden_dim, num_layers=opt.num_layers, nfc=opt.nfc) @@ -48,12 +45,12 @@ metrics=AccuracyMetric() optimizer= Adam([param for param in model.parameters() if param.requires_grad==True], lr=opt.lr) -def train(datainfo, model, optimizer, loss, metrics, opt): - trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss, - metrics=metrics, dev_data=datainfo.datasets['test'], device=0, check_code_level=-1, +def train(data_bundle, model, optimizer, loss, metrics, opt): + trainer = Trainer(data_bundle.datasets['train'], model, 
optimizer=optimizer, loss=loss, + metrics=metrics, dev_data=data_bundle.datasets['test'], device=0, check_code_level=-1, n_epochs=opt.train_epoch, save_path=opt.save_model_path) trainer.train() if __name__ == "__main__": - train(datainfo, model, optimizer, loss, metrics, opt) \ No newline at end of file + train(data_bundle, model, optimizer, loss, metrics, opt) \ No newline at end of file diff --git a/reproduction/text_classification/train_lstm_att.py b/reproduction/text_classification/train_lstm_att.py index 1052f606..a2b8612d 100644 --- a/reproduction/text_classification/train_lstm_att.py +++ b/reproduction/text_classification/train_lstm_att.py @@ -1,9 +1,7 @@ -# 首先需要加入以下的路径到环境变量,因为当前只对内部测试开放,所以需要手动申明一下路径 -import os -os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/' -os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches' +import sys +sys.path.append('../..') -from fastNLP.io.data_loader import IMDBLoader +from fastNLP.io.pipe.classification import IMDBPipe from fastNLP.embeddings import StaticEmbedding from model.lstm_self_attention import BiLSTM_SELF_ATTENTION @@ -31,15 +29,14 @@ opt=Config() # load data -dataloader=IMDBLoader() -datainfo=dataloader.process(opt.datapath) +data_bundle=IMDBPipe.process_from_file(opt.datapath) -# print(datainfo.datasets["train"]) -# print(datainfo) +# print(data_bundle.datasets["train"]) +# print(data_bundle) # define model -vocab=datainfo.vocabs['words'] +vocab=data_bundle.vocabs['words'] embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-840b-300', requires_grad=True) model=BiLSTM_SELF_ATTENTION(init_embed=embed, num_classes=opt.num_classes, hidden_dim=opt.hidden_dim, num_layers=opt.num_layers, attention_unit=opt.attention_unit, attention_hops=opt.attention_hops, nfc=opt.nfc) @@ -50,12 +47,12 @@ metrics=AccuracyMetric() optimizer= Adam([param for param in model.parameters() if param.requires_grad==True], lr=opt.lr) -def train(datainfo, model, optimizer, loss, metrics, opt): - trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss, - metrics=metrics, dev_data=datainfo.datasets['test'], device=0, check_code_level=-1, +def train(data_bundle, model, optimizer, loss, metrics, opt): + trainer = Trainer(data_bundle.datasets['train'], model, optimizer=optimizer, loss=loss, + metrics=metrics, dev_data=data_bundle.datasets['test'], device=0, check_code_level=-1, n_epochs=opt.train_epoch, save_path=opt.save_model_path) trainer.train() if __name__ == "__main__": - train(datainfo, model, optimizer, loss, metrics, opt) + train(data_bundle, model, optimizer, loss, metrics, opt) From 9529f89abd41ee7ef0d9e2e32596ef9ee1aedb1e Mon Sep 17 00:00:00 2001 From: yh Date: Fri, 30 Aug 2019 19:54:28 +0800 Subject: [PATCH 126/153] =?UTF-8?q?=E5=A2=9E=E5=8A=A0DataBundle=E7=9A=84?= =?UTF-8?q?=E6=96=B9=E6=B3=95=EF=BC=9B=E5=A2=9E=E5=8A=A0BilSTMCRF=E7=9A=84?= =?UTF-8?q?=E6=B3=A8=E9=87=8A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/dataset.py | 14 ++-- fastNLP/io/data_bundle.py | 72 +++++++++++++++- fastNLP/models/sequence_labeling.py | 83 ++++++++----------- .../seqence_labelling/ner/train_ontonote.py | 4 +- 4 files changed, 112 insertions(+), 61 deletions(-) diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 51bcef43..551cf1f8 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -575,18 +575,18 @@ class DataSet(object): """ return len(self) - def rename_field(self, old_name, new_name): + def 
rename_field(self, field_name, new_field_name): """ 将某个field重新命名. - :param str old_name: 原来的field名称。 - :param str new_name: 修改为new_name。 + :param str field_name: 原来的field名称。 + :param str new_field_name: 修改为new_name。 """ - if old_name in self.field_arrays: - self.field_arrays[new_name] = self.field_arrays.pop(old_name) - self.field_arrays[new_name].name = new_name + if field_name in self.field_arrays: + self.field_arrays[new_field_name] = self.field_arrays.pop(field_name) + self.field_arrays[new_field_name].name = new_field_name else: - raise KeyError("DataSet has no field named {}.".format(old_name)) + raise KeyError("DataSet has no field named {}.".format(field_name)) return self def set_target(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True): diff --git a/fastNLP/io/data_bundle.py b/fastNLP/io/data_bundle.py index 969730a3..f30add34 100644 --- a/fastNLP/io/data_bundle.py +++ b/fastNLP/io/data_bundle.py @@ -139,9 +139,44 @@ class DataBundle: dataset.set_target(field_name, flag=flag, use_1st_ins_infer_dim_type=use_1st_ins_infer_dim_type) return self + def set_pad_val(self, field_name, pad_val, ignore_miss_dataset=True): + """ + 将DataBundle中所有的DataSet中名为field_name的Field的padding值设置为pad_val. + + :param str field_name: + :param int pad_val: + :param bool ignore_miss_dataset: 当某个field名称在某个dataset不存在时,如果为True,则直接忽略该DataSet; + 如果为False,则报错 + :return: self + """ + for name, dataset in self.datasets.items(): + if dataset.has_field(field_name=field_name): + dataset.set_pad_val(field_name=field_name, pad_val=pad_val) + elif not ignore_miss_dataset: + raise KeyError(f"{field_name} not found DataSet:{name}.") + return self + + def set_ignore_type(self, *field_names, flag=True, ignore_miss_dataset=True): + """ + 将DataBundle中所有的DataSet中名为*field_names的Field的ignore_type设置为flag状态 + + :param str field_names: + :param bool flag: + :param bool ignore_miss_dataset: 当某个field名称在某个dataset不存在时,如果为True,则直接忽略该DataSet; + 如果为False,则报错 + :return: self + """ + for name, dataset in self.datasets.items(): + for field_name in field_names: + if dataset.has_field(field_name=field_name): + dataset.set_ignore_type(field_name, flag=flag) + elif not ignore_miss_dataset: + raise KeyError(f"{field_name} not found DataSet:{name}.") + return self + def copy_field(self, field_name, new_field_name, ignore_miss_dataset=True): """ - 将DataBundle中所有的field_name复制一份叫new_field_name. + 将DataBundle中所有的DataSet中名为field_name的Field复制一份并命名为叫new_field_name. :param str field_name: :param str new_field_name: @@ -156,9 +191,42 @@ class DataBundle: raise KeyError(f"{field_name} not found DataSet:{name}.") return self + def rename_field(self, field_name, new_field_name, ignore_miss_dataset=True): + """ + 将DataBundle中所有DataSet中名为field_name的field重命名为new_field_name. + + :param str field_name: + :param str new_field_name: + :param bool ignore_miss_dataset: 当某个field名称在某个dataset不存在时,如果为True,则直接忽略该DataSet; + 如果为False,则报错 + :return: self + """ + for name, dataset in self.datasets.items(): + if dataset.has_field(field_name=field_name): + dataset.rename_field(field_name=field_name, new_field_name=new_field_name) + elif not ignore_miss_dataset: + raise KeyError(f"{field_name} not found DataSet:{name}.") + return self + + def delete_field(self, field_name, ignore_miss_dataset=True): + """ + 将DataBundle中所有DataSet中名为field_name的field删除掉. 
+ + :param str field_name: + :param bool ignore_miss_dataset: 当某个field名称在某个dataset不存在时,如果为True,则直接忽略该DataSet; + 如果为False,则报错 + :return: self + """ + for name, dataset in self.datasets.items(): + if dataset.has_field(field_name=field_name): + dataset.delete_field(field_name=field_name) + elif not ignore_miss_dataset: + raise KeyError(f"{field_name} not found DataSet:{name}.") + return self + def apply_field(self, func, field_name:str, new_field_name:str, ignore_miss_dataset=True, **kwargs): """ - 对DataBundle中所有的dataset使用apply方法 + 对DataBundle中所有的dataset使用apply_field方法 :param callable func: input是instance中名为 `field_name` 的field的内容。 :param str field_name: 传入func的是哪个field。 diff --git a/fastNLP/models/sequence_labeling.py b/fastNLP/models/sequence_labeling.py index 0dff21f0..0c573a90 100644 --- a/fastNLP/models/sequence_labeling.py +++ b/fastNLP/models/sequence_labeling.py @@ -4,7 +4,7 @@ __all__ = [ "SeqLabeling", "AdvSeqLabel", - # "BiLSTMCRF" + "BiLSTMCRF" ] import torch @@ -14,7 +14,6 @@ import torch.nn.functional as F from .base_model import BaseModel from ..core.const import Const as C from ..core.utils import seq_len_to_mask -from ..embeddings import embedding from ..embeddings import get_embeddings from ..modules import ConditionalRandomField from ..modules import LSTM @@ -24,18 +23,15 @@ from ..modules.decoder.crf import allowed_transitions class BiLSTMCRF(BaseModel): """ - 结构为BiLSTM + FC + Dropout + CRF. + 结构为embedding + BiLSTM + FC + Dropout + CRF. - .. todo:: - 继续补充文档 - - :param embed: tuple: - :param num_classes: - :param num_layers: - :param hidden_size: - :param dropout: - :param target_vocab: - :param encoding_type: + :param embed: 支持(1)fastNLP的各种Embedding, (2) tuple, 指明num_embedding, dimension, 如(1000, 100) + :param num_classes: 一共多少个类 + :param num_layers: BiLSTM的层数 + :param hidden_size: BiLSTM的hidden_size,实际hidden size为该值的两倍(前向、后向) + :param dropout: dropout的概率,0为不dropout + :param target_vocab: Vocabulary对象,target与index的对应关系 + :param encoding_type: encoding的类型,支持'bioes', 'bmes', 'bio', 'bmeso'等 """ def __init__(self, embed, num_classes, num_layers=1, hidden_size=100, dropout=0.5, target_vocab=None, encoding_type=None): @@ -86,21 +82,20 @@ class SeqLabeling(BaseModel): 一个基础的Sequence labeling的模型。 用于做sequence labeling的基础类。结构包含一层Embedding,一层LSTM(单向,一层),一层FC,以及一层CRF。 - :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray init_embed: Embedding的大小(传入tuple(int, int), - 第一个int为vocab_zie, 第二个int为embed_dim); 如果为Tensor, Embedding, ndarray等则直接使用该值初始化Embedding + :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray embed: Embedding的大小(传入tuple(int, int), + 第一个int为vocab_zie, 第二个int为embed_dim); 如果为Tensor, embedding, ndarray等则直接使用该值初始化Embedding :param int hidden_size: LSTM隐藏层的大小 :param int num_classes: 一共有多少类 """ - def __init__(self, init_embed, hidden_size, num_classes): + def __init__(self, embed, hidden_size, num_classes): super(SeqLabeling, self).__init__() - self.Embedding = embedding.Embedding(init_embed) - self.Rnn = encoder.LSTM(self.Embedding.embedding_dim, hidden_size) - self.Linear = nn.Linear(hidden_size, num_classes) - self.Crf = decoder.ConditionalRandomField(num_classes) - self.mask = None - + self.embedding = get_embeddings(embed) + self.rnn = encoder.LSTM(self.embedding.embedding_dim, hidden_size) + self.fc = nn.Linear(hidden_size, num_classes) + self.crf = decoder.ConditionalRandomField(num_classes) + def forward(self, words, seq_len, target): """ :param torch.LongTensor words: [batch_size, max_len],序列的index @@ -109,17 +104,14 @@ class 
SeqLabeling(BaseModel): :return y: If truth is None, return list of [decode path(list)]. Used in testing and predicting. If truth is not None, return loss, a scalar. Used in training. """ - assert words.shape[0] == seq_len.shape[0] - assert target.shape == words.shape - self.mask = self._make_mask(words, seq_len) - - x = self.Embedding(words) + mask = seq_len_to_mask(seq_len, max_len=words.size(1)) + x = self.embedding(words) # [batch_size, max_len, word_emb_dim] - x, _ = self.Rnn(x, seq_len) + x, _ = self.rnn(x, seq_len) # [batch_size, max_len, hidden_size * direction] - x = self.Linear(x) + x = self.fc(x) # [batch_size, max_len, num_classes] - return {C.LOSS: self._internal_loss(x, target)} + return {C.LOSS: self._internal_loss(x, target, mask)} def predict(self, words, seq_len): """ @@ -129,18 +121,18 @@ class SeqLabeling(BaseModel): :param torch.LongTensor seq_len: [batch_size,] :return: {'pred': xx}, [batch_size, max_len] """ - self.mask = self._make_mask(words, seq_len) + mask = seq_len_to_mask(seq_len, max_len=words.size(1)) - x = self.Embedding(words) + x = self.embedding(words) # [batch_size, max_len, word_emb_dim] - x, _ = self.Rnn(x, seq_len) + x, _ = self.rnn(x, seq_len) # [batch_size, max_len, hidden_size * direction] - x = self.Linear(x) + x = self.fc(x) # [batch_size, max_len, num_classes] - pred = self._decode(x) + pred = self._decode(x, mask) return {C.OUTPUT: pred} - def _internal_loss(self, x, y): + def _internal_loss(self, x, y, mask): """ Negative log likelihood loss. :param x: Tensor, [batch_size, max_len, tag_size] @@ -152,22 +144,15 @@ class SeqLabeling(BaseModel): y = y.long() assert x.shape[:2] == y.shape assert y.shape == self.mask.shape - total_loss = self.Crf(x, y, self.mask) + total_loss = self.crf(x, y, mask) return torch.mean(total_loss) - def _make_mask(self, x, seq_len): - batch_size, max_len = x.size(0), x.size(1) - mask = seq_len_to_mask(seq_len) - mask = mask.view(batch_size, max_len) - mask = mask.to(x).float() - return mask - - def _decode(self, x): + def _decode(self, x, mask): """ :param torch.FloatTensor x: [batch_size, max_len, tag_size] :return prediction: [batch_size, max_len] """ - tag_seq, _ = self.Crf.viterbi_decode(x, self.mask) + tag_seq, _ = self.crf.viterbi_decode(x, mask) return tag_seq @@ -177,7 +162,7 @@ class AdvSeqLabel(nn.Module): 更复杂的Sequence Labelling模型。结构为Embedding, LayerNorm, 双向LSTM(两层),FC,LayerNorm,DropOut,FC,CRF。 - :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray init_embed: Embedding的大小(传入tuple(int, int), + :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray embed: Embedding的大小(传入tuple(int, int), 第一个int为vocab_zie, 第二个int为embed_dim); 如果为Tensor, Embedding, ndarray等则直接使用该值初始化Embedding :param int hidden_size: LSTM的隐层大小 :param int num_classes: 有多少个类 @@ -188,11 +173,11 @@ class AdvSeqLabel(nn.Module): :param str encoding_type: 支持"BIO", "BMES", "BEMSO", 只有在id2words不为None的情况有用。 """ - def __init__(self, init_embed, hidden_size, num_classes, dropout=0.3, id2words=None, encoding_type='bmes'): + def __init__(self, embed, hidden_size, num_classes, dropout=0.3, id2words=None, encoding_type='bmes'): super().__init__() - self.Embedding = embedding.Embedding(init_embed) + self.Embedding = get_embeddings(embed) self.norm1 = torch.nn.LayerNorm(self.Embedding.embedding_dim) self.Rnn = encoder.LSTM(input_size=self.Embedding.embedding_dim, hidden_size=hidden_size, num_layers=2, dropout=dropout, diff --git a/reproduction/seqence_labelling/ner/train_ontonote.py b/reproduction/seqence_labelling/ner/train_ontonote.py 
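After this refactor SeqLabeling and AdvSeqLabel build their embedding through get_embeddings, so a (vocab_size, embed_dim) tuple, an nn.Embedding, a Tensor/ndarray or any fastNLP embedding can be passed in, and the CRF mask is derived from seq_len inside forward/predict instead of being cached on the module. A small sketch with toy sizes (1000-word vocabulary, 5 tags; the tensors are random placeholders):

```python
import torch
from fastNLP.models.sequence_labeling import SeqLabeling, AdvSeqLabel

model = SeqLabeling(embed=(1000, 100), hidden_size=200, num_classes=5)
adv = AdvSeqLabel(embed=(1000, 100), hidden_size=200, num_classes=5, dropout=0.3)

words = torch.randint(1, 1000, (4, 7))         # [batch_size, max_len] word indices
seq_len = torch.tensor([7, 5, 6, 3])           # true lengths; padding is masked via seq_len_to_mask
pred = model.predict(words, seq_len)['pred']   # [batch_size, max_len] Viterbi tag indices
```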
index ee80b6f7..9fd13100 100644 --- a/reproduction/seqence_labelling/ner/train_ontonote.py +++ b/reproduction/seqence_labelling/ner/train_ontonote.py @@ -18,11 +18,9 @@ from fastNLP.io.pipe.conll import OntoNotesNERPipe #######hyper normalize = False -lower = False lr = 0.01 dropout = 0.5 batch_size = 32 -job_embed = False data_name = 'ontonote' #######hyper @@ -41,7 +39,7 @@ def cache(): word_dropout=0.01, dropout=dropout, lower=True, - min_freq=2) + min_freq=1) return data, char_embed, word_embed data, char_embed, word_embed = cache() From 82b5726686dcbac9f9a2032537f53c3eb77f7698 Mon Sep 17 00:00:00 2001 From: yunfan Date: Sat, 24 Aug 2019 13:59:30 +0800 Subject: [PATCH 127/153] update transformer --- fastNLP/modules/encoder/transformer.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/fastNLP/modules/encoder/transformer.py b/fastNLP/modules/encoder/transformer.py index ce9172d5..70b82bde 100644 --- a/fastNLP/modules/encoder/transformer.py +++ b/fastNLP/modules/encoder/transformer.py @@ -32,9 +32,10 @@ class TransformerEncoder(nn.Module): self.norm1 = nn.LayerNorm(model_size) self.ffn = nn.Sequential(nn.Linear(model_size, inner_size), nn.ReLU(), - nn.Linear(inner_size, model_size), - TimestepDropout(dropout), ) + nn.Dropout(dropout), + nn.Linear(inner_size, model_size)) self.norm2 = nn.LayerNorm(model_size) + self.dropout = nn.Dropout(dropout) def forward(self, input, seq_mask=None, atte_mask_out=None): """ @@ -43,17 +44,20 @@ class TransformerEncoder(nn.Module): :param seq_mask: [batch, seq_len] :return: [batch, seq_len, model_size] """ + input = self.norm1(input) attention = self.atte(input, input, input, atte_mask_out) - norm_atte = self.norm1(attention + input) - attention *= seq_mask - output = self.ffn(norm_atte) - output = self.norm2(output + norm_atte) - output *= seq_mask + input = input + self.dropout(attention) + # attention *= seq_mask + input = self.norm2(input) + output = self.ffn(input) + input = input + self.dropout(output) + # output *= seq_mask return output def __init__(self, num_layers, **kargs): super(TransformerEncoder, self).__init__() self.layers = nn.ModuleList([self.SubLayer(**kargs) for _ in range(num_layers)]) + self.norm = nn.LayerNorm(kargs['model_size']) def forward(self, x, seq_mask=None): """ @@ -70,4 +74,4 @@ class TransformerEncoder(nn.Module): seq_mask = seq_mask[:, :, None] for layer in self.layers: output = layer(output, seq_mask, atte_mask_out) - return output + return self.norm(output) From 44af647839fe99f69b9364457ff3636df6367204 Mon Sep 17 00:00:00 2001 From: yunfan Date: Thu, 29 Aug 2019 20:19:13 +0800 Subject: [PATCH 128/153] [update] change data-loader to pipe --- .../text_classification/train_dpcnn.py | 30 +++++-------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/reproduction/text_classification/train_dpcnn.py b/reproduction/text_classification/train_dpcnn.py index f3f4e231..c7f5751c 100644 --- a/reproduction/text_classification/train_dpcnn.py +++ b/reproduction/text_classification/train_dpcnn.py @@ -8,21 +8,18 @@ from fastNLP.core.trainer import Trainer from fastNLP import CrossEntropyLoss, AccuracyMetric from fastNLP.embeddings import StaticEmbedding from reproduction.text_classification.model.dpcnn import DPCNN -from fastNLP.io.data_loader import YelpLoader from fastNLP.core.sampler import BucketSampler from fastNLP.core import LRScheduler from fastNLP.core.const import Const as C from fastNLP.core.vocabulary import VocabularyOption -from fastNLP.core.dist_trainer import 
DistTrainer from utils.util_init import set_rng_seeds from fastNLP import logger import os -# os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/' -# os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches' +from fastNLP.io import YelpFullPipe, YelpPolarityPipe + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # hyper logger.add_file('log', 'INFO') -print(logger.handlers) class Config(): seed = 12345 @@ -50,18 +47,14 @@ class Config(): ops = Config() set_rng_seeds(ops.seed) -# print('RNG SEED: {}'.format(ops.seed)) logger.info('RNG SEED %d'%ops.seed) # 1.task相关信息:利用dataloader载入dataInfo -#datainfo=SSTLoader(fine_grained=True).process(paths=ops.datapath, train_ds=['train']) - @cache_results(ops.model_dir_or_name+'-data-cache') def load_data(): - datainfo = YelpLoader(fine_grained=True, lower=True).process( - paths=ops.datapath, train_ds=['train'], src_vocab_op=ops.src_vocab_op) + datainfo = YelpFullPipe(lower=True, tokenizer='raw').process_from_file(ops.datapath) for ds in datainfo.datasets.values(): ds.apply_field(len, C.INPUT, C.INPUT_LEN) ds.set_input(C.INPUT, C.INPUT_LEN) @@ -79,11 +72,8 @@ print(embedding.embedding.weight.data.mean(), embedding.embedding.weight.data.st # 2.或直接复用fastNLP的模型 -# embedding = StackEmbedding([StaticEmbedding(vocab), CNNCharEmbedding(vocab, 100)]) -datainfo.datasets['train'] = datainfo.datasets['train'][:1000] -datainfo.datasets['test'] = datainfo.datasets['test'][:1000] -# print(datainfo) -# print(datainfo.datasets['train'][0]) +# datainfo.datasets['train'] = datainfo.datasets['train'][:1000] # for debug purpose +# datainfo.datasets['test'] = datainfo.datasets['test'][:1000] logger.info(datainfo) model = DPCNN(init_embed=embedding, num_cls=len(datainfo.vocabs[C.TARGET]), @@ -99,14 +89,7 @@ optimizer = SGD([param for param in model.parameters() if param.requires_grad == callbacks = [] callbacks.append(LRScheduler(CosineAnnealingLR(optimizer, 5))) -# callbacks.append( -# LRScheduler(LambdaLR(optimizer, lambda epoch: ops.lr if epoch < -# ops.train_epoch * 0.8 else ops.lr * 0.1)) -# ) -# callbacks.append( -# FitlogCallback(data=datainfo.datasets, verbose=1) -# ) device = 'cuda:0' if torch.cuda.is_available() else 'cpu' @@ -114,12 +97,15 @@ device = 'cuda:0' if torch.cuda.is_available() else 'cpu' logger.info(device) # 4.定义train方法 +# normal trainer trainer = Trainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss, sampler=BucketSampler(num_buckets=50, batch_size=ops.batch_size), metrics=[metric], use_tqdm=False, save_path='save', dev_data=datainfo.datasets['test'], device=device, check_code_level=-1, batch_size=ops.batch_size, callbacks=callbacks, n_epochs=ops.train_epoch, num_workers=4) + +# distributed trainer # trainer = DistTrainer(datainfo.datasets['train'], model, optimizer=optimizer, loss=loss, # metrics=[metric], # dev_data=datainfo.datasets['test'], device='cuda', From bbda73c14f2352583f1a89bafdd1ff7471543cc4 Mon Sep 17 00:00:00 2001 From: yunfan Date: Fri, 30 Aug 2019 21:48:00 +0800 Subject: [PATCH 129/153] [update] transformer --- fastNLP/modules/encoder/attention.py | 39 +++++++++++--------------- fastNLP/modules/encoder/transformer.py | 17 ++++++----- 2 files changed, 25 insertions(+), 31 deletions(-) diff --git a/fastNLP/modules/encoder/attention.py b/fastNLP/modules/encoder/attention.py index 02bd078a..6a973864 100644 --- a/fastNLP/modules/encoder/attention.py +++ b/fastNLP/modules/encoder/attention.py @@ -30,14 +30,14 @@ class DotAttention(nn.Module): def forward(self, Q, K, V, mask_out=None): 
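reproduction/text_classification/train_dpcnn.py now obtains its DataBundle from the io.pipe API instead of the removed YelpLoader-based data_loader. Stand-alone, the equivalent call is roughly the following (the data path is a placeholder):

```python
from fastNLP.io import YelpFullPipe

# same arguments as in the script above
data_bundle = YelpFullPipe(lower=True, tokenizer='raw').process_from_file('/path/to/yelp_review_full')
print(data_bundle)  # summarizes the resulting DataSets and Vocabularies
```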
""" - :param Q: [batch, seq_len_q, key_size] - :param K: [batch, seq_len_k, key_size] - :param V: [batch, seq_len_k, value_size] - :param mask_out: [batch, 1, seq_len] or [batch, seq_len_q, seq_len_k] + :param Q: [..., seq_len_q, key_size] + :param K: [..., seq_len_k, key_size] + :param V: [..., seq_len_k, value_size] + :param mask_out: [..., 1, seq_len] or [..., seq_len_q, seq_len_k] """ - output = torch.matmul(Q, K.transpose(1, 2)) / self.scale + output = torch.matmul(Q, K.transpose(-1, -2)) / self.scale if mask_out is not None: - output.masked_fill_(mask_out, -1e18) + output.masked_fill_(mask_out, -1e9) output = self.softmax(output) output = self.drop(output) return torch.matmul(output, V) @@ -65,17 +65,16 @@ class MultiHeadAttention(nn.Module): self.q_in = nn.Linear(input_size, in_size) self.k_in = nn.Linear(input_size, in_size) self.v_in = nn.Linear(input_size, in_size) - # follow the paper, do not apply dropout within dot-product self.attention = DotAttention(key_size=key_size, value_size=value_size, dropout=dropout) self.out = nn.Linear(value_size * num_head, input_size) self.reset_parameters() def reset_parameters(self): sqrt = math.sqrt - nn.init.normal_(self.q_in.weight, mean=0, std=sqrt(2.0 / (self.input_size + self.key_size))) - nn.init.normal_(self.k_in.weight, mean=0, std=sqrt(2.0 / (self.input_size + self.key_size))) - nn.init.normal_(self.v_in.weight, mean=0, std=sqrt(2.0 / (self.input_size + self.value_size))) - nn.init.xavier_normal_(self.out.weight) + nn.init.normal_(self.q_in.weight, mean=0, std=sqrt(1.0 / self.input_size)) + nn.init.normal_(self.k_in.weight, mean=0, std=sqrt(1.0 / self.input_size)) + nn.init.normal_(self.v_in.weight, mean=0, std=sqrt(1.0 / self.input_size)) + nn.init.normal_(self.out.weight, mean=0, std=sqrt(1.0 / self.input_size)) def forward(self, Q, K, V, atte_mask_out=None): """ @@ -89,20 +88,16 @@ class MultiHeadAttention(nn.Module): sk = K.size(1) d_k, d_v, n_head = self.key_size, self.value_size, self.num_head # input linear - q = self.q_in(Q).view(batch, sq, n_head, d_k) - k = self.k_in(K).view(batch, sk, n_head, d_k) - v = self.v_in(V).view(batch, sk, n_head, d_v) - - # transpose q, k and v to do batch attention - q = q.permute(2, 0, 1, 3).contiguous().view(-1, sq, d_k) - k = k.permute(2, 0, 1, 3).contiguous().view(-1, sk, d_k) - v = v.permute(2, 0, 1, 3).contiguous().view(-1, sk, d_v) + q = self.q_in(Q).view(batch, sq, n_head, d_k).transpose(1, 2) + k = self.k_in(K).view(batch, sk, n_head, d_k).transpose(1, 2) + v = self.v_in(V).view(batch, sk, n_head, d_v).transpose(1, 2) + if atte_mask_out is not None: - atte_mask_out = atte_mask_out.repeat(n_head, 1, 1) - atte = self.attention(q, k, v, atte_mask_out).view(n_head, batch, sq, d_v) + atte_mask_out = atte_mask_out[:,None,:,:] # [bsz,1,1,len] + atte = self.attention(q, k, v, atte_mask_out).view(batch, n_head, sq, d_v) # concat all heads, do output linear - atte = atte.permute(1, 2, 0, 3).contiguous().view(batch, sq, -1) + atte = atte.transpose(1, 2).contiguous().view(batch, sq, -1) output = self.out(atte) return output diff --git a/fastNLP/modules/encoder/transformer.py b/fastNLP/modules/encoder/transformer.py index 70b82bde..d8a612a0 100644 --- a/fastNLP/modules/encoder/transformer.py +++ b/fastNLP/modules/encoder/transformer.py @@ -5,8 +5,7 @@ __all__ = [ ] from torch import nn -from fastNLP.modules.encoder.attention import MultiHeadAttention -from ..dropout import TimestepDropout +from .attention import MultiHeadAttention class TransformerEncoder(nn.Module): @@ -29,12 +28,12 @@ class 
TransformerEncoder(nn.Module): def __init__(self, model_size, inner_size, key_size, value_size, num_head, dropout=0.1): super(TransformerEncoder.SubLayer, self).__init__() self.atte = MultiHeadAttention(model_size, key_size, value_size, num_head, dropout) - self.norm1 = nn.LayerNorm(model_size) + self.norm1 = nn.LayerNorm(model_size, eps=1e-6) self.ffn = nn.Sequential(nn.Linear(model_size, inner_size), nn.ReLU(), nn.Dropout(dropout), nn.Linear(inner_size, model_size)) - self.norm2 = nn.LayerNorm(model_size) + self.norm2 = nn.LayerNorm(model_size, eps=1e-6) self.dropout = nn.Dropout(dropout) def forward(self, input, seq_mask=None, atte_mask_out=None): @@ -47,17 +46,17 @@ class TransformerEncoder(nn.Module): input = self.norm1(input) attention = self.atte(input, input, input, atte_mask_out) input = input + self.dropout(attention) - # attention *= seq_mask + attention *= seq_mask input = self.norm2(input) output = self.ffn(input) input = input + self.dropout(output) - # output *= seq_mask - return output + input *= seq_mask + return input def __init__(self, num_layers, **kargs): super(TransformerEncoder, self).__init__() self.layers = nn.ModuleList([self.SubLayer(**kargs) for _ in range(num_layers)]) - self.norm = nn.LayerNorm(kargs['model_size']) + self.norm = nn.LayerNorm(kargs['model_size'], eps=1e-6) def forward(self, x, seq_mask=None): """ @@ -70,7 +69,7 @@ class TransformerEncoder(nn.Module): if seq_mask is None: atte_mask_out = None else: - atte_mask_out = (seq_mask < 1)[:, None, :] + atte_mask_out = (seq_mask == 0)[:, None, :] seq_mask = seq_mask[:, :, None] for layer in self.layers: output = layer(output, seq_mask, atte_mask_out) From 4440801dbfc9bea20a86be6ceeb1431f5d020681 Mon Sep 17 00:00:00 2001 From: Yige Xu Date: Sun, 1 Sep 2019 01:19:10 +0800 Subject: [PATCH 130/153] 1. update bert.py and fix a bug in bert_embedding to adapt torch 1.2.0; 2. update models/bert.py and add BertForSentenceMatching model, now a BertEmbedding param should be passed to these five models; 3. create a small bert version for testing and modify test/models/test_bert.py; 4. move small glove and word2vec files to data_for_tests/embedding/small_static_embedding dir and fix relevant test codes; 5. delete some __init__.py files in test dir. 
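The reworked TransformerEncoder above (PATCH 127/129, fastNLP/modules/encoder/transformer.py) now normalizes the input of each sub-layer, applies dropout on the residual branches, re-applies the padding mask inside every layer and finishes with a LayerNorm over the whole stack. A minimal sketch of driving it with toy dimensions:

```python
import torch
from fastNLP.modules.encoder.transformer import TransformerEncoder

encoder = TransformerEncoder(num_layers=2, model_size=64, inner_size=256,
                             key_size=16, value_size=16, num_head=4, dropout=0.1)

x = torch.randn(2, 7, 64)              # [batch, seq_len, model_size]
seq_mask = torch.tensor([[1] * 7,      # 1 = real token, 0 = padding
                         [1] * 4 + [0] * 3])
out = encoder(x, seq_mask)             # [2, 7, 64], final LayerNorm already applied
```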
--- fastNLP/embeddings/bert_embedding.py | 2 +- fastNLP/models/bert.py | 373 ++++++------------ fastNLP/modules/encoder/bert.py | 4 +- test/__init__.py | 3 - .../embedding/small_bert/config.json | 13 + .../small_bert/small_pytorch_model.bin | Bin 0 -> 37965 bytes .../embedding/small_bert/vocab.txt | 20 + .../glove.6B.50d_test.txt | 0 .../small_static_embedding}/word2vec_test.txt | 0 test/embeddings/__init__.py | 0 test/embeddings/test_bert_embedding.py | 11 +- test/embeddings/test_static_embedding.py | 6 +- test/io/test_embed_loader.py | 8 +- test/models/__init__.py | 0 test/models/test_bert.py | 86 ++-- test/modules/__init__.py | 0 test/modules/decoder/__init__.py | 0 17 files changed, 225 insertions(+), 301 deletions(-) delete mode 100644 test/__init__.py create mode 100644 test/data_for_tests/embedding/small_bert/config.json create mode 100644 test/data_for_tests/embedding/small_bert/small_pytorch_model.bin create mode 100644 test/data_for_tests/embedding/small_bert/vocab.txt rename test/data_for_tests/{ => embedding/small_static_embedding}/glove.6B.50d_test.txt (100%) rename test/data_for_tests/{ => embedding/small_static_embedding}/word2vec_test.txt (100%) delete mode 100644 test/embeddings/__init__.py delete mode 100644 test/models/__init__.py delete mode 100644 test/modules/__init__.py delete mode 100644 test/modules/decoder/__init__.py diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index d1a5514a..f6c36623 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -393,7 +393,7 @@ class _WordBertModel(nn.Module): batch_indexes = torch.arange(batch_size).to(words) word_pieces[batch_indexes, word_pieces_lengths + 1] = self._sep_index if self._has_sep_in_vocab: # 但[SEP]在vocab中出现应该才会需要token_ids - sep_mask = word_pieces.eq(self._sep_index) # batch_size x max_len + sep_mask = word_pieces.eq(self._sep_index).long() # batch_size x max_len sep_mask_cumsum = sep_mask.flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1]) token_type_ids = sep_mask_cumsum.fmod(2) if token_type_ids[0, 0].item(): # 如果开头是奇数,则需要flip一下结果,因为需要保证开头为0 diff --git a/fastNLP/models/bert.py b/fastNLP/models/bert.py index 0a89b765..08f16db2 100644 --- a/fastNLP/models/bert.py +++ b/fastNLP/models/bert.py @@ -5,253 +5,145 @@ bert.py is modified from huggingface/pytorch-pretrained-BERT, which is licensed __all__ = [] -import os +import warnings import torch from torch import nn from .base_model import BaseModel from ..core.const import Const -from ..core.utils import seq_len_to_mask +from ..core._logger import logger from ..modules.encoder import BertModel from ..modules.encoder.bert import BertConfig, CONFIG_FILE +from ..embeddings.bert_embedding import BertEmbedding class BertForSequenceClassification(BaseModel): """BERT model for classification. - This module is composed of the BERT model with a linear layer on top of - the pooled output. - Params: - `config`: a BertConfig class instance with the configuration to build a new model. - `num_labels`: the number of classes for the classifier. Default = 2. - Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] - with the word token indices in the vocabulary. Items in the batch should begin with the special "CLS" token. (see the tokens preprocessing logic in the scripts - `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. 
Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] - with indices selected in [0, ..., num_labels]. - Outputs: - if `labels` is not `None`: - Outputs the CrossEntropy classification loss of the output with the labels. - if `labels` is `None`: - Outputs the classification logits of shape [batch_size, num_labels]. - Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - num_labels = 2 - model = BertForSequenceClassification(num_labels, config) - logits = model(input_ids, token_type_ids, input_mask) - ``` """ - def __init__(self, num_labels, config=None, bert_dir=None): + def __init__(self, init_embed: BertEmbedding, num_labels: int=2): super(BertForSequenceClassification, self).__init__() + self.num_labels = num_labels - if bert_dir is not None: - self.bert = BertModel.from_pretrained(bert_dir) - config = BertConfig(os.path.join(bert_dir, CONFIG_FILE)) - else: - if config is None: - config = BertConfig(30522) - self.bert = BertModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, num_labels) - - @classmethod - def from_pretrained(cls, num_labels, pretrained_model_dir): - config = BertConfig(pretrained_model_dir) - model = cls(num_labels=num_labels, config=config, bert_dir=pretrained_model_dir) - return model - - def forward(self, words, seq_len=None, target=None): - if seq_len is None: - seq_len = torch.ones_like(words, dtype=words.dtype, device=words.device) - if len(seq_len.size()) + 1 == len(words.size()): - seq_len = seq_len_to_mask(seq_len, max_len=words.size(-1)) - _, pooled_output = self.bert(words, attention_mask=seq_len, output_all_encoded_layers=False) - pooled_output = self.dropout(pooled_output) - logits = self.classifier(pooled_output) + self.bert = init_embed + self.dropout = nn.Dropout(0.1) + self.classifier = nn.Linear(self.bert.embedding_dim, num_labels) + + if not self.bert.model.include_cls_sep: + warn_msg = "Bert for sequence classification excepts BertEmbedding `include_cls_sep` True, but got False." + logger.warn(warn_msg) + warnings.warn(warn_msg) + + def forward(self, words): + hidden = self.dropout(self.bert(words)) + cls_hidden = hidden[:, 0] + logits = self.classifier(cls_hidden) + + return {Const.OUTPUT: logits} + + def predict(self, words): + logits = self.forward(words)[Const.OUTPUT] + return {Const.OUTPUT: torch.argmax(logits, dim=-1)} + + +class BertForSentenceMatching(BaseModel): + + """BERT model for matching. 
+ """ + def __init__(self, init_embed: BertEmbedding, num_labels: int=2): + super(BertForSentenceMatching, self).__init__() + self.num_labels = num_labels + self.bert = init_embed + self.dropout = nn.Dropout(0.1) + self.classifier = nn.Linear(self.bert.embedding_dim, num_labels) + + if not self.bert.model.include_cls_sep: + error_msg = "Bert for sentence matching excepts BertEmbedding `include_cls_sep` True, but got False." + logger.error(error_msg) + raise RuntimeError(error_msg) - if target is not None: - loss_fct = nn.CrossEntropyLoss() - loss = loss_fct(logits, target) - return {Const.OUTPUT: logits, Const.LOSS: loss} - else: - return {Const.OUTPUT: logits} + def forward(self, words): + hidden = self.dropout(self.bert(words)) + cls_hidden = hidden[:, 0] + logits = self.classifier(cls_hidden) - def predict(self, words, seq_len=None): - logits = self.forward(words, seq_len=seq_len)[Const.OUTPUT] + return {Const.OUTPUT: logits} + + def predict(self, words): + logits = self.forward(words)[Const.OUTPUT] return {Const.OUTPUT: torch.argmax(logits, dim=-1)} class BertForMultipleChoice(BaseModel): """BERT model for multiple choice tasks. - This module is composed of the BERT model with a linear layer on top of - the pooled output. - Params: - `config`: a BertConfig class instance with the configuration to build a new model. - `num_choices`: the number of classes for the classifier. Default = 2. - Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] - with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` - and type 1 corresponds to a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - `labels`: labels for the classification output: torch.LongTensor of shape [batch_size] - with indices selected in [0, ..., num_choices]. - Outputs: - if `labels` is not `None`: - Outputs the CrossEntropy classification loss of the output with the labels. - if `labels` is `None`: - Outputs the classification logits of shape [batch_size, num_labels]. 
- Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]]) - input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]]) - token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]]) - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - num_choices = 2 - model = BertForMultipleChoice(num_choices, config, bert_dir) - logits = model(input_ids, token_type_ids, input_mask) - ``` """ - def __init__(self, num_choices, config=None, bert_dir=None): + def __init__(self, init_embed: BertEmbedding, num_choices=2): super(BertForMultipleChoice, self).__init__() + self.num_choices = num_choices - if bert_dir is not None: - self.bert = BertModel.from_pretrained(bert_dir) - else: - if config is None: - config = BertConfig(30522) - self.bert = BertModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, 1) - - @classmethod - def from_pretrained(cls, num_choices, pretrained_model_dir): - config = BertConfig(pretrained_model_dir) - model = cls(num_choices=num_choices, config=config, bert_dir=pretrained_model_dir) - return model - - def forward(self, words, seq_len1=None, seq_len2=None, target=None): - input_ids, token_type_ids, attention_mask = words, seq_len1, seq_len2 - flat_input_ids = input_ids.view(-1, input_ids.size(-1)) - flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) - flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) - _, pooled_output = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, output_all_encoded_layers=False) + self.bert = init_embed + self.dropout = nn.Dropout(0.1) + self.classifier = nn.Linear(self.bert.embedding_dim, 1) + self.include_cls_sep = init_embed.model.include_cls_sep + + if not self.bert.model.include_cls_sep: + error_msg = "Bert for multiple choice excepts BertEmbedding `include_cls_sep` True, but got False." + logger.error(error_msg) + raise RuntimeError(error_msg) + + def forward(self, words): + """ + :param torch.Tensor words: [batch_size, num_choices, seq_len] + :return: [batch_size, num_labels] + """ + batch_size, num_choices, seq_len = words.size() + + input_ids = words.view(batch_size * num_choices, seq_len) + hidden = self.bert(input_ids) + pooled_output = hidden[:, 0] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, self.num_choices) - if target is not None: - loss_fct = nn.CrossEntropyLoss() - loss = loss_fct(reshaped_logits, target) - return {Const.OUTPUT: reshaped_logits, Const.LOSS: loss} - else: - return {Const.OUTPUT: reshaped_logits} + return {Const.OUTPUT: reshaped_logits} - def predict(self, words, seq_len1=None, seq_len2=None,): - logits = self.forward(words, seq_len1=seq_len1, seq_len2=seq_len2)[Const.OUTPUT] + def predict(self, words): + logits = self.forward(words)[Const.OUTPUT] return {Const.OUTPUT: torch.argmax(logits, dim=-1)} class BertForTokenClassification(BaseModel): """BERT model for token-level classification. - This module is composed of the BERT model with a linear layer on top of - the full hidden state of the last layer. - Params: - `config`: a BertConfig class instance with the configuration to build a new model. - `num_labels`: the number of classes for the classifier. Default = 2. 
- `bert_dir`: a dir which contains the bert parameters within file `pytorch_model.bin` - Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - `labels`: labels for the classification output: torch.LongTensor of shape [batch_size, sequence_length] - with indices selected in [0, ..., num_labels]. - Outputs: - if `labels` is not `None`: - Outputs the CrossEntropy classification loss of the output with the labels. - if `labels` is `None`: - Outputs the classification logits of shape [batch_size, sequence_length, num_labels]. - Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - num_labels = 2 - bert_dir = 'your-bert-file-dir' - model = BertForTokenClassification(num_labels, config, bert_dir) - logits = model(input_ids, token_type_ids, input_mask) - ``` """ - def __init__(self, num_labels, config=None, bert_dir=None): + def __init__(self, init_embed: BertEmbedding, num_labels): super(BertForTokenClassification, self).__init__() + self.num_labels = num_labels - if bert_dir is not None: - self.bert = BertModel.from_pretrained(bert_dir) - else: - if config is None: - config = BertConfig(30522) - self.bert = BertModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, num_labels) - - @classmethod - def from_pretrained(cls, num_labels, pretrained_model_dir): - config = BertConfig(pretrained_model_dir) - model = cls(num_labels=num_labels, config=config, bert_dir=pretrained_model_dir) - return model - - def forward(self, words, seq_len1=None, seq_len2=None, target=None): - sequence_output, _ = self.bert(words, seq_len1, seq_len2, output_all_encoded_layers=False) + self.bert = init_embed + self.dropout = nn.Dropout(0.1) + self.classifier = nn.Linear(self.bert.embedding_dim, num_labels) + self.include_cls_sep = init_embed.model.include_cls_sep + + if self.include_cls_sep: + warn_msg = "Bert for token classification excepts BertEmbedding `include_cls_sep` False, but got True." 
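As the commit message above says, the rewritten heads in fastNLP/models/bert.py now sit on top of a BertEmbedding instead of owning their own BertModel/BertConfig. A hedged sketch (the model directory points at the small test fixture added in this commit, relative to the repository root; a pretrained name such as 'en-base-uncased' works the same way):

```python
from fastNLP import Vocabulary
from fastNLP.embeddings import BertEmbedding
from fastNLP.models.bert import BertForSequenceClassification, BertForTokenClassification

vocab = Vocabulary()
vocab.add_word_lst("this is a toy sentence".split())

# sentence-level heads need the [CLS]/[SEP] positions kept ...
embed_cls = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert',
                          include_cls_sep=True)
clf = BertForSequenceClassification(embed_cls, num_labels=3)   # classifies from the [CLS] position

# ... while the token-level head expects them dropped
embed_tok = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert',
                          include_cls_sep=False)
tagger = BertForTokenClassification(embed_tok, num_labels=7)   # per-token logits [batch, seq_len, 7]
```

Both heads take a words tensor of shape [batch_size, seq_len] holding indices into vocab and return a dict with the logits under Const.OUTPUT ('pred').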
+ warnings.warn(warn_msg) + logger.warn(warn_msg) + + def forward(self, words): + """ + :param torch.Tensor words: [batch_size, seq_len] + :return: [batch_size, seq_len, num_labels] + """ + sequence_output = self.bert(words) + if self.include_cls_sep: + sequence_output = sequence_output[:, 1: -1] # [batch_size, seq_len, embed_dim] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) - if target is not None: - loss_fct = nn.CrossEntropyLoss() - # Only keep active parts of the loss - if seq_len2 is not None: - active_loss = seq_len2.view(-1) == 1 - active_logits = logits.view(-1, self.num_labels)[active_loss] - active_labels = target.view(-1)[active_loss] - loss = loss_fct(active_logits, active_labels) - else: - loss = loss_fct(logits.view(-1, self.num_labels), target.view(-1)) - return {Const.OUTPUT: logits, Const.LOSS: loss} - else: - return {Const.OUTPUT: logits} - - def predict(self, words, seq_len1=None, seq_len2=None): - logits = self.forward(words, seq_len1, seq_len2)[Const.OUTPUT] + return {Const.OUTPUT: logits} + + def predict(self, words): + logits = self.forward(words)[Const.OUTPUT] return {Const.OUTPUT: torch.argmax(logits, dim=-1)} @@ -298,53 +190,24 @@ class BertForQuestionAnswering(BaseModel): start_logits, end_logits = model(input_ids, token_type_ids, input_mask) ``` """ - def __init__(self, config=None, bert_dir=None): + def __init__(self, init_embed: BertEmbedding, num_labels=2): super(BertForQuestionAnswering, self).__init__() - if bert_dir is not None: - self.bert = BertModel.from_pretrained(bert_dir) - else: - if config is None: - config = BertConfig(30522) - self.bert = BertModel(config) - # TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version - # self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.qa_outputs = nn.Linear(config.hidden_size, 2) - - @classmethod - def from_pretrained(cls, pretrained_model_dir): - config = BertConfig(pretrained_model_dir) - model = cls(config=config, bert_dir=pretrained_model_dir) - return model - - def forward(self, words, seq_len1=None, seq_len2=None, target1=None, target2=None): - sequence_output, _ = self.bert(words, seq_len1, seq_len2, output_all_encoded_layers=False) - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) - - if target1 is not None and target2 is not None: - # If we are on multi-GPU, split add a dimension - if len(target1.size()) > 1: - target1 = target1.squeeze(-1) - if len(target2.size()) > 1: - target2 = target2.squeeze(-1) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - target1.clamp_(0, ignored_index) - target2.clamp_(0, ignored_index) - - loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, target1) - end_loss = loss_fct(end_logits, target2) - total_loss = (start_loss + end_loss) / 2 - return {Const.OUTPUTS(0): start_logits, Const.OUTPUTS(1): end_logits, Const.LOSS: total_loss} - else: - return {Const.OUTPUTS(0): start_logits, Const.OUTPUTS(1): end_logits} - - def predict(self, words, seq_len1=None, seq_len2=None): - logits = self.forward(words, seq_len1, seq_len2) - start_logits = logits[Const.OUTPUTS(0)] - end_logits = logits[Const.OUTPUTS(1)] - return {Const.OUTPUTS(0): torch.argmax(start_logits, dim=-1), - Const.OUTPUTS(1): torch.argmax(end_logits, dim=-1)} + + 
self.bert = init_embed + self.num_labels = num_labels + self.qa_outputs = nn.Linear(self.bert.embedding_dim, self.num_labels) + + if not self.bert.model.include_cls_sep: + error_msg = "Bert for multiple choice excepts BertEmbedding `include_cls_sep` True, but got False." + logger.error(error_msg) + raise RuntimeError(error_msg) + + def forward(self, words): + sequence_output = self.bert(words) + logits = self.qa_outputs(sequence_output) # [batch_size, seq_len, num_labels] + + return {Const.OUTPUTS(i): logits[:, :, i] for i in range(self.num_labels)} + + def predict(self, words): + logits = self.forward(words) + return {Const.OUTPUTS(i): torch.argmax(logits[Const.OUTPUTS(i)], dim=-1) for i in range(self.num_labels)} diff --git a/fastNLP/modules/encoder/bert.py b/fastNLP/modules/encoder/bert.py index e73a8172..6f6c4291 100644 --- a/fastNLP/modules/encoder/bert.py +++ b/fastNLP/modules/encoder/bert.py @@ -435,14 +435,14 @@ class BertModel(nn.Module): return encoded_layers, pooled_output @classmethod - def from_pretrained(cls, pretrained_model_dir_or_name, *inputs, **kwargs): + def from_pretrained(cls, model_dir_or_name, *inputs, **kwargs): state_dict = kwargs.get('state_dict', None) kwargs.pop('state_dict', None) kwargs.pop('cache_dir', None) kwargs.pop('from_tf', None) # get model dir from name or dir - pretrained_model_dir = _get_bert_dir(pretrained_model_dir_or_name) + pretrained_model_dir = _get_bert_dir(model_dir_or_name) # Load config config_file = _get_file_name_base_on_postfix(pretrained_model_dir, '.json') diff --git a/test/__init__.py b/test/__init__.py deleted file mode 100644 index c7a5f082..00000000 --- a/test/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -import fastNLP - -__all__ = ["fastNLP"] diff --git a/test/data_for_tests/embedding/small_bert/config.json b/test/data_for_tests/embedding/small_bert/config.json new file mode 100644 index 00000000..3e516872 --- /dev/null +++ b/test/data_for_tests/embedding/small_bert/config.json @@ -0,0 +1,13 @@ +{ + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 16, + "initializer_range": 0.02, + "intermediate_size": 64, + "max_position_embeddings": 32, + "num_attention_heads": 4, + "num_hidden_layers": 2, + "type_vocab_size": 2, + "vocab_size": 20 +} \ No newline at end of file diff --git a/test/data_for_tests/embedding/small_bert/small_pytorch_model.bin b/test/data_for_tests/embedding/small_bert/small_pytorch_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..fe968fb5d64a87b224d0ed9d793e6bf3aeb70971 GIT binary patch literal 37965 zcmd43c|29^_y3PjhL9o3kR*-f@tkXIMMWj0)1*N~R8ly{ku=?foKO)`5}_hVNebs$ zyFo>BZYpV>=TsUs{x+xk{eItlZlB-#_kDc-_<20;wbx!<`&`fS+Sk7Jy4K$3UZs$z zy71dJk5>#NEI#nItGvm%0*21N)vtq>~q_f_J85%a@DVxf|kf^R2o zFE~^zRQ6KzRc!l=u+VwJ&R&X`5i-N>KB2}S62TjC5p%G!CS&Qvd zg`N8M_wC#U$$gmryda^X>=0Ftg<<~U39_K7P{~(=Ys0g5apJkS*g4xdI5|2AmD|71 zobX6tXYZl%W5vQQxaerInv1rAQP4bHR;PEQ8cJ)yl@eX-C$AnYMy=qV8z3WP?!ga3g+92OE3 zI;;H*|Id?Q{ErkajhpG&W4M$SO)U3i@~0v9mMJPB?(;Sc~g;U|YvpYlMTm>nYt@32cpp1qUM;a8WAzq)j4>(W`)rHfd|6A1aT zE{Q}40wLpT^RF&L=Y(-1#A;#Nm$b66@)vUhwe7T9MFcIJYbA^f5=H;@y|a9o{}LjgO^7)% zfIzV@NFbam1DGcf&KC%S|Lak)*neSU&|jlsL5C2I_D(j!kiVo@_%A6I$)pICk%Wnb z;R2yhMk103BLqV6e|<`1B<*5EcF1AxXyYJU{7a0eUt%n26C+xNuv9EuCJ-){A*_%H zR|G5{0{S%L6~4B)6lcuXKX?%ThkGxE1_`RfK+-BIVZ zE{@y{^u%8hoNPZ0UPI%Br#L0JTX&6&pjIqAEfAiO5uB9>&k2O*egB=t{Wg34jiCL@ zU+A!ZCui<%dhypNxb$lj)U|#2KV@Cli-ipW;bmFZS0uu#0%2pv>i_Ece_RjOIs|ZV zaj_9L{Q|iD3*bf@z)cyzEwS*nKzK(6a91L{ClKEMkNoYQzx%%d+Q0vU4!gIvv2zeU 
z{MG%VU)?`$>;8$X`=?^zGlB5Atos)d;Y)$=RY%j4?f;rhG^f+Xk-QTN-wT8vWF#LY!cPL>=Z>z&4oKR?XzhT5o38D-tmexvF~0s1<6D~; z-(?6t#KNBfk(|7oTswtGp7SIk1YOJLb#0GoaJHin{6BR2EA1^-JjUtg&TZ{HgSk#SkW%7x%IhRvVq{D4xgu0w37U^+TAnMLp8KFMsNkj(i(H@-r zugBPb<|d+^+>sqn+H#8o2N#iHTZ>;}8gc%=#5Cr1WMZ0do(#1Y=ZQt8oE3KW1tRv@%MNXU*h@3eqgLUCNiHO%8<#YDGIro2qZM&{TggdfBRA+9A5kXrE znWPNo{*u&{bK4{x!fj=gLpe_@a^tK(nG(^CY5i?NJ}jc9cN< zudMB=j^~d4M^c`xoxQz{gN;bg)4YLuT_!%^BQr|H#-@8?fPSup+;<7P34`G?Vk&Alp(c(JXF9Hb`f4 zo($BV^TeV6&I&|xI4c7Udn* z@y~RQwoZ;*okJAS*5a3lV$N?r(_FG9isW|M^EJ_8&XXZVah_PTgtG!sG-qXqOF2&> zTGk$2&e@KVxZe=lam5w7Pv zv1kKl1)^lm$_P_9Pa@jb9!=$JM`_@{)cg0m)2?6|cT7io&bHj*U6kI|LdLg=bARz= zaBdslW^OCv+roKbQ6^^vqOF{j@nvzIM6|6vn$1~XgTEFD;bCD5xpLEgE}TUTA&X)~1?yAk>c4Sv=KIh4hc5t3pw3D*}(Js!)kP0|YA}VZ;?&hp- z_kTocm#2vP_YQ#^xiXJvPg@I_JjI;*OP;-)+a^y5x0SK%<2LiOK* zZMSJ**WZF|_d}t^--GSbx={1CV0&#;>$hNA+Q*f=|MoR`?S)(I--7LBWSzeS?QGi@ zC%V4{+m}sxzXjWua@~Imwl6UCe+#y+dJX;_v}<37_xLT?ULxrETd=)!Vfb6Hy|7~R zTd=+GWBm7^eS2NX2JaIQe*Gmg6&mMv%d!&+N-mDehapjj?MoT zlr12Heg8A~>y{|A_>bUUDTlCMkd>SQcOi3shN>)Wj%Yyt4eE48y$#W1TOl=Z3dv;> zFvKwvYgSg$ye@J~e&s!AKC~QDHr(LPaLvcG^OwQAfPh-07Q5`!7W{nRI*i*H2X=6j zGzB=|CgnHOa^M(dM#2)ZZ0}fV`b>d+V*3b>8(T6OPNwiIWHVMyeU8JwAHbRK@=*U| z6R4yn!>;=eNyG|Im~K=GDnrZYN%w=Ol^KF>($?TwuWeAS_k*n7^OUCS8-)+FTczfX zrF7@9qvXWeL^$_aBHgt^dPN!Y9;KKwxA$~r^1zBMjJr;XuK8kW zvkHV5AHlmFUUxKO3`Rf2?n#QwM>{k8VEYP& zD93^Kd3D@W)d};gc=%vdGG6&^3Rjo+!jRiWU^JtGJ{g$^l8AvYeMSYztr$vDCp7b? z_|AZVUuKeT*RAO)%M9Xv+LGKEU`Be*G9{<<8HjR}1J`&t*41db)F;IrN4vZvmtTD& zfo5A^z=kl=l5iKbf{){=bCYqY{TjIW?KIh;vH?fU1E@Z6U#j{un1lv|psMK>yypO* z`NRm$e_2))uel9ouC2iHGWaL&IACguCNt)eFH9V`m5!}DKqQ{_Y->|C>{XZx+x!O6N$E3* z;*5Qur|$&UVx8DnL2uHpY$3^RDuu$pKQW?u39$2{VBQcD=Fd;@`1FnrCd`S%`xkXF z#VwpJOFDoCo+rqtL`A0TNdxto?Zon)s<0;_6p;@4B6Yjd8*Z8$qd9RV*bD|Z*Fc-y z^5=aTjdh^-&5=1AyO*EuZq6njwuDmQSNbDUPTHB7j}PA+#f0&v@mBE*v{5%hkD6LY zU%L@1(q7Qe=3d~~@(1Rxn2iJKccUc61>bALqF85nA+tGt#bYZ zpDy2qiUf035StB>LJcfm+bmr*h!2~VmXirl=SYv@MEErNGTxr(3Q_yKA&^X9dWTy> zdggWf@VpWJNKS_V(^F~n?HuY@l8-<7|AE2VHp7atG=9%wb*PBhK_ZN-nR%A|Fi5Es zqe4&MChZ~Yqgi`gddw}o`=pyDxUW#e)r-#K$~nhK$cyPPV|6#|=Ujk+amsM`aX$D& z?4%0U(_G4Sq)AoZWx@1Kfta~{IgKkALY<`D;ZDjr6m4^$I%B#pIgbZJX?QTR;pSDm ztjmJ;lQ_C+jth1=p-vL}z9UH|l=(jUdeW$$8DwLm8hjYyL9clKB=19&Q7zUSbShsH zof~UOVf8cA_Bcsz=jNh@#ZgE#Ghr;>C6e5iy3FHPR~oaeKiq6u30`$$A@$~M>8Rzt z7{4YMom#fjsq5UBWcNgVvf(tGGfkCUvGxNkad(8U5jxm=V}vx`UyUh>yiX?CzsJa4 zNAb~9BaA<3k85t;fyfhEA*jNv>J8hKJ-sG|%0K@~?6%2EulMQ1-fCpw(T0!Mcin7~ zxyKZF6Sq-YZa$IsxIiy=vLHvR6xo&D`_V+)nf3g-Roa+lgZ!;3P^HSl{N^+8;z%T( zT{9Xm&k#P5By?`N1kWZav!A|nhOh=ztXA8KNy9_=i*C8$^OZj6yh{a5wpv5PfNl&t zZp1WZFJ8KB#@xS~1Id0KSnHEWFNF*Tj~kv4IyW2YCM03wOyv1(WnlT-4RG+&Zs?so z2qTwCvH13Vq-LQQes~YbI%>nujy@0ZKE9}R?-@qaoyJ8OtKmaQJ>1WDf6?;Z?8|ZPJ^U7GV~s#mC&4o=!3bLB(p<0Y^j*VTEcqA<2OE{4>DmDtb>$na%~xf6 z6up2LgPU;0BLS0c%_b3EzUUr35+C1uMY`7PBVh;D!Ttwbq4m`*D6R5>&8_2Mrt>No zc2}7_xi0|%HfCV8$ppONHG-E$OKGR8FYt1oi>SZlBYEAlliD_SMhvXu&+n8EqMn2{ zjW`2&kwb9GRXr5JKHR3Yi)beEadDX*tMOKu?aCN2Auk``vy3>Hm$wZ++8ANATm)Tn zZ3#vz$b->@tK?lb7bsPm5B-+bpr+Sz_<7$H4Rf5a>fQ<%^7&wpvZwUwLpE_&M*Qy-z&=qn;M{vt=u2&(-7oXq|^WE@`q+A5|b|%{~5X z!BsM0@iypvH60keV`$T?z}$}*#P{9zg8J02M}<52xbNu;qQ5O1f=jy6AZLV@-rX=` zS}z({UI^o?v%zj_3Zc6aXxWB8c@mTH(7HJVzh5F@Kd-&h{PtYX=bD;1+Iz&YuCKUs%VU|v3c93l$zFDNr zl-?iCpPPLRK3_NwO;f`#Y?d;kvCo=cv3C;&e$xQYL_?K`yO=BIX;8VnifNi3WG@?s^Dt7{q*i9ZRQcN1>+4CV3p&C_*4b7>}L_?paWYh zw}SCAOP1PfI!goXH_|Zb0~a*~WWZ24cKM;hWZd~elFFJu)iXtQg|r2BrNofL6UDIX zb{@6-S`PImMASHHHoM0!g9tj+6Q+0(*}3-|&g>h6GrGKo&I30=QbKQ>t5r%{*K}r0 zw;q7-^q17VF%I)4S;Ek<*NB0K7mV<@hZ6sEDFGukufGG~nH~n+eWn<$bO}vo+{fhe 
z4dii{G8-N9hHMyBgYWy8Q|*u)q-ExII6r(o88^Bwn`E>dJSXnPyu}UV{d0FXzCDXr zq-K-XdL>}oFOAgC90SkdQc23iHRMdc!R!W?L@fQd9<5EiX-e=ZsImKj4w1)*$@v4g zUuze%ZtV=aQa$+x-+qMi-^Rd#@ZEy^BP149)Kr6!yDn4wZ9N>(Go*t2r&6WuM`+cJljvcR3AIE1 zAibh@Pz|l=U?TsF?({KbzxXA!dgNFgB7p4n9Uy{Q;CORnkC>bQ%=bLh{pR z!blS>M)ycB2)^P&@4X)ga=+%mu5XptXtRls{iquq)F{uk6um;LN6{py8n;9 z=EtIF%-WZZk2mX4^Zr+*mrh4RL3a^qsl6bz_sy7z?xvWd;DQ!e!(i7Hdv?;n8TdW+ z3p}hSpcNl?L6t5y|FGvta*_}~x4J>x(^81)cMrpK7UQ*C9uB%{gA)Un;9=!m=rHOf zju|Sz+l!j$*!przRa3|NE5}Kh5&BrRbT~}-^qy9P%!Y(h+Qih>ib=7Eqr*~?pahj60-PAt%=aBqMQnB{gQ=ww)n2t3Y|p3> zyB{l|PJ0qvPhJ>M?y6uNrE1G%RH zj0-~z9wfkoBzwG^SV637r8vo=3^i>+@l(Jbuy5!U z{to>kRtG(soWQ^IC_IYZUpdqv0+~Eci+P&NDuY+_w!bBGUgCo5Z13Z>>U!ch`a4ZK zG#@vgNJHhf^HK4^AUqyWLrK3CZ1VscxN-3*p0zXsdFvfenAHR=pH2arRtMRguELDF z1*FA#E#7jimFkzi!PqC6c%*nA4%v`SeU{Iqxi7!ca{V)qw580Y^R>OywXy=jtXc?T zElFBe{=;7t>^<=jvm0vNfW`%b}@M!cp4N=7_zU%^oG5zdiX@!mc4X;B{c56g?sfh zaCL1FR=6CZW;>_SBcT@vyFr&S8xP|5zTQxBVI9PDIpQpT)_^>q+BD!13qPl3VNd_poJrs{^9LX*_TZBQ2hcZnwyW*4e zLXsMO1)fx%fCQCH)DJ4ei7RUG{>=vZaM4`ask1hXjy8tx>$))0zc(YxzY3#&?#4q& z4zyP3C%M>n2#Lzrj^2y);Qp{gtQ%fVbj=LWrMCiAd;N%4BDt8 z!Rj+C$>qZZ=WeXqV@2#5B2QF~b%KnDK2UaIJZW&;i?F>v^Jc9+7;h7UNnSLZG4jUg z^DVKwb}P=CUsrgyrr$|Z-Wr`0%k((kJDhjtX|aCMqb_ng7TYb#he2czj*0sEq5Ee<*==4)l8 zV5sFTIA}3~KTP@*w{+Wqv3=9fbB7$L7ofm{1LCV3>-NL#ujFdIjeWMSEuAk5huOyc8Jm_wBbSnBJC27eA@L(OZz z0nBpnhNTGW!ZeCOZ_l|^+&)~+E z3{_-HZMsu7C>sLyJs~?o5>U8sJ-8njh>A*+Y5w$0u%+P)%Gr3qdGod8j%E_|Y0yME z>piO9y@Rgj*JT(}C7aq_HZK*L&A@nEJSxo+`GY zBN8vcn4&mTd1i#o;g@jIrAl0;UQgsFG@_x$TzbdX1XII}nHsSko7M9xJ+8kFd#p?Y z;Rs!F!)pPQEB0l6ET2JFkT-a4QvurLSfbW6UG|7c88#2wjB4AOfO@u^WgNlqJ)>Dgt7|0fK`psetj65j@(nDb2ay#&tsy&F9tTX6(tCz1UK<|_i@OeH z%m*9cT_Za*NeQF5JFe5oD~j+$uN~Cq=?8S{q{m3s=acLW2SB@O9USi-M0wV3cwgR# z?EkJrLzi!c+}r2z?yTYbvqK+2>(PU-xpE6Us?Nfj107+*xN~5({s}hM+Ek6LKPFWl z^^{)p&ZMfZt;pQgXVClRC+K@_Imoxx+7a4+;Q_;+`}YYP{{ci&fJ%2R#(cyC3CUP%+)Rgm)qv1X)p*UM4=7mbKuD_`I zRP2F~!OCp9TO91#UrCA?hPj#h2x|M@gAnys==H9K>Q||-CdI?i*}9r%`o}Af?9C#l zCaqy^gJq@6f`Dbx@eG22IU8Akw;wo`B_;HESR%U+0TQex%{zF6&5L zx0R5;v@`s8^AV?fSp)ttm8d*qEP1%`7`ZiQEm<<*91bwQOeap*3DxWGNhkN~4%#_g znM1krLC-7r`S&eyV(UWx~v$j`$%a^T?ANHp8?&kw)AWz1;4F}fE-pQ54{xF z&3RkklzTU}x@Qw!wDDjabD#39_bW-Aymr6@&;YaQHKcW-78@nXMoSHOlA+lj?A%*0 zc)tV8O{(bS)(j(>-I;NY zi%C-BXv!+*K>gg)aOsu_)5SUlHYdlT96p6DyoY$c`6DC`KS!fpZ>ME?RebMv@tA0Q zhz9>DfoG}((Br2%eye^;PL&!!fJOrZEX#xXd$rO>n&Ux3?gM$8rp0(GSmV2jVRVpH zD9}8F%>BDDx2GIi7QGl=|BzAx@i>fm8jELkkH#^*^ZCon5E4h;C7T!I;EHwcaFAQP zG=G>1>(!?Sv|?ki+Rg+so~?oLLzLlipB`w|`x0GreH1IXR8H<^#liQ=;cUu96|6Y( zj+8VRpmFg==v1W-A-BeZd$ucVdTWno=k&u9=@QKJK7uu=nes4tB1*fod~0 zg5XjlcCLGlQ4_l0wcX`VXs*q;pXvwo+0wKAX!i&+?ke6Cm@?O7K;S zguBlINzNi~xXV>R3PxM7&+JCBInhlp@9IlB^!8=SY)%55_vs)~wqV|mX0h8?IksiS zX6U!TjP>t?IN`%6uKuUPM6FJwmX&l5#cF<9N@=0S%K^ zQ=MKJbY*chvbFM5Ghh+=Hy?nhgN&J7Ne^Hh*#_C(qsZZ(1|ZH!=H{~Q_|T}H-l$Cj z%RCnt^;rWQ7AZ>uxqCu+UMo%GR%I?thj3ow5!iJp6rv{Yql+U;_%ZXJ2blSck5z%wA%4_H_wk|`P)jI55n{@>F_mSyZL-s$8B?c$FNcUNCv?j$E!nU6v zGe0lG&rZ7uQ&r5(O()2psk^Y}md!N3nK4h6`QZsZ_cOJ0lF+d2C7XSq)LA> z|M{PW>_kf)n&jw5mF}!(Q`7T^TElVdH6|IsJP~<{hoRL@4a6_6N>h?PRMiy4(aB2s zOnqd3W}M+dx@FFLnCo&A@29PRODbtFCf@{yt?L5cSAE27uXp0LedqCgLnpRq?G@Tx zzmyyt?gIf^>T&ahrLgD1F=#dTM2{CeA^U3TaO&pUh=(tePm0ShCdQ9yzC1{F3Le9% zx;)C2$}yzk3*JdrU}nE~EZzK}D>Yb`1UI`z!|d1FG2u`>e#j6&F4khUzaDq5c}~J4 z(@13CL#*&hz|&!AWWODfHB%l)YwJ8ff2BGbU@L*zuHI-d+Z=jyd4kKG-eQT*S}cs* zj$=y*sec?u`eUY7V@94~uSMbQNPpP*#D~V5(Wf^b+8llv)|EAM3_}$Bq>juz z+H&(dd0ylKPhK4&?=4A>6Q9`QrIZvaL2&U;jcX zR+M2~BE$c5Z9VnfWkVyZr$SQJcI;M`L$$Ng;bWZvW4!+k8u*WdOD`4C@PZZ-61x;u zKdqq2`@Z4lwO4rRjo!Ft^<9`kwm`N26ZjULC^cyP0N;nmv#J$AG 
z9VR>?m@fZ3j=xzl5^HN-lN&o-q1IcUeYM*bw1#h@cf<+|6LOj_zt{;p>xyt(lL}5# z(&cLUAILe9K`Vw%f#kiN=%E|)S;3boJokJ)Y>QRFhR=mCy{j^-@W2|so?i^p=S;-6 zCwoHIqQNBT`WRG-xkvP_*kWf}4WiQ6orFX`#4|4ym`Q&ws7hHh1>(MsL-Q+Kp1N)c zeOW4p)#sj*#4p29EVu}8JrhGsGjPqseb9I%QyNmBOtrF9D;KZKfX}AR7#&#;E5vqi zRig)8;GoW)94W=7jJKGSV}K(LZNyhPMo{FW3Z>lhi5E+@(4MMm;NbqvWN3CWhQ*IS zRl^8!RXqb3`C(Y~Qwj+aEg?E}JU(UAh-bY8W4Z1uY52-w>A4X&D5nl9w|~Nu<>W+O1F*!l3-Dw_tUOu$IPJw9r2QiVx|q2iNFBYLXJ;y~=_$2u{G0J(6kHPljMz zz6MVf7tmYrmT+0aoQ~?@k7{@WuN=5S{1?t6mupr-)*snWZMIx$xXd2nI;F!WoiRAE z@*5eLI)ux6IziyK9iTgY9G(#=;Ux7x!1LyFJknfDSI>52izQCbs*yv2RA1qWk^7~l zv;v0b?7|yx4i)vT;A^)*xFXR59gY*W=Kc$)9lHivP0LA$StnLBcME)^8({mL9D08} zqGb@0AL;TAS$pz3~{ru7YSyCAg7P2&k0{c-}b@ z!v_zA$0cUW=J#A~={uLFwQL|-j@m;HR&U1o>R4D3p^9gM4QN^VRJi0Hg=cM?z~j_7 z+V7hK`#skZR7dW?f+U5iEyjVE^hN`|{^*A4lMUd_Z4LA{UJ0pr>tLR1G^}TfahvBi zl6}%2nw(!rhx#1?@L342Oh&OygU$i3Odrk(D!^)w787sdfL~wiMdH+o1K9^?nGp}+ zYb{`0#TfSJb3AAv4Zq4LnJ8MVn*cP{fZ`Bk2{W+_K$jSYm|M&0Dk<};ukL|B|>66Vj$Uf#SJ(rk|??-UYc@j5BE1ouD zdXfrd(wAZLJqL{Py9tU>NAST~9d=RQEYyuRr{nJR1lMa*QPVyE=XuS6IRRVIYgP`D z#tLF=8Ut$8<+SjNE_k}wP>T<-5F@$YjiD?pFfMM&kcYv z)BGSLQjc0b+zv+JZdg#F#Q&pDZ?=2+I`VpSPx>t50KG9Om(W#fXifj_jJ4(ya^#*q z^iUs(g_BQEuSpHOZ5Ey+XZ%~zRc}9t2A;yKA$H``ATwrhpb6IPkY{U#MUZ1Dr{Qwh z2HJagC7EJ!0VBTcpkCwMA-typIab2p%@tC*=IRAFtf|PTU=6nJ&7^V3F_>SLMl3Ox z=Dywvu%&7oB)zSKbk_`WBiaJyIT}Lg zB~|>fQVF$>oPn*0>dXqy3p8=wPdvUt0$!e{KyyqU-4?kSeXmRDi`yQ!>uouCx;G!R z_imDY&U}U?UVA`!es3rosLyyAt^p-=ZjDm<96zlurgyb3;KK9>EI6r!?;T^%kFUax zv#`dLgKKa@VJ)Pl@t8Ns+E6q{k*|zqpnRwc(^%Av8G3F%@m%3TQ@XZ-jpcC~k}(+i z9_FE$B#Yd8@|qSzWl*2isbHkD1Wed+$k55-r<&Wsi0v~`cbqYEyw4QYVm5(uH@m_? z{{%=GSw`&E-ip1=enb2HXLEh`^p@+`8(S=tRNqIg-u&J8Mqr{fr z7qA|}Btc%Y>HKt|90Vw!rkBn#1Gu9!F(;WM5Bg%z>Dp}B<_Ag zoV*gaXOgR7(Y(V%`>i74>H?fPD3;_;9Zd&Bt)dDFCQQG3;V|jY4p?Y=0oN#up%R}m z`XOIHM%QIyP&m?8M|whd#c;A=^(*dO967G`Ru3B%C_s2s3hr!PM~*L>#>*e5&t5w0 zCB6EKUSB!RhT zvl5^CyoV6m0?eIug~&fmh8Ix-S*z&|T#fcDe(L!EV%7&jrZ^XylsmEVM+)irS&Ojz zkrG?JssZGCx}wEmA$0GSfp4C>krCe3Fp+yE+~^;nW|a!C`t6$`2dN>MNVj1x3q%uGVF0*VfaugEDd9+Btkr#a4Lz6;Lnr zCeD7Sfk!fIaO9QkQq6gqU{n1Bw6=`F0of_EXO1$X$a4qp#$!;w^*lVTJ4EUpmJ{~R zayXh3D?KLm0gc23;JSM*4stmQhNVfcbnOoCof}H+M;YSeO>(HN_zgd(SCbhxE)mtF zD#S&v&~jBFOk7~hdUhH|%rCDaCrLKn++-F89ahG(c9$_weFjeG*GLr28j$^50#iy7 za6#Z<_-vAjD-~a1{GKd+_fj1!SL?w{>^cWMy;kEuxhpWWuRq4s{)ueOu`1;{55Cc? zPc++cKiXf&;cB}lX^*|*py}o{0HX#_?f#r^!aXBT>ADjS>l{FT?`vSRcs*CE7n4*A zceoL@1+Fa7VQYF@;_c^K!6f_^hGq1C^5@&=lm}b5{Qoo*U;jqkwg!-&7gFF$-5PN6 zNg)1fzvHTFW5}at<*4900!=Giak=|(Y;ks?nn9n@+rWl3^mvA~CeKkZYADLZpQ9^# zQB1H)!8dji`o=X0D|nMpGhicn`aJ#rUEU@4>v@n(-x8qHkTk`#V2a0U(3EgBkD{~S zyZa)ndU1%WVg5mk48NfAz3oKbZy|hOn;=@U5v?!zk=AE=G(FuJT)!!>7sutojYt8g zG#`Yo8y;ihkkiC`pdapx>5S=%i%^`wV;^L1L?erPbos=-utKWLTzl>WuExD!y3$$b z-A9or5Cq}(KeK51j=QA#T6e1J@r8bN_Qav{%u&uY0jO0R>YH=d%jEtTF>W2Nd2#~A zw|eoHB@RM|qN!x&`!cw<_A~@F_&`z^AJM!FXZK8_FY@J>tCC)DN;H99@Yw@ZOUAJQ z6(^us`6@gi5{&;`2Dz_NuveJ|t}4}I?@pOPn7E&Kna<~*OP)pwZYeTxFE`MMR<(5S zqxTSJz`|S2wWL@@8H0*X&=KkfK&N&fs~ahz+Go#`2g8=)t4W8UN?(quKMBL&BNb$S zSqxZrvH{g-cRI7!j^5aJ0gD!UlccEqSgv2pcL`W7-R7;pB)SIRip=4#zkW4Y!QC?! 
z9a}G5zV#?HobStA$yvx4%(}r>JU*TA#Zla|_3kv=y$Gk!!SvLa9~9Sf&zz0+;C}8s zk-BBRWW=^EFgrtwd2nefUMm^MSo7*3!t4i``h5@G6PyoM4xE9SGulvAWd@gC&4k)> zrN~oNW6Lsb!PWc9XuAG0e5g!B;m}x+`y+&%ccN4ApRt;Om zm&1#BsVLWIfhtqAG4;qjaDJOdHh(h#g%3{La|BEHHt;&UPw#=w-3BsaGxbPX-XJzo z*8`Sh4Z?l{VsO9wMl=qosk(GD1ZVQxVA4o+?!CqSY}(r0^p@oXW|=i>e0U zh|a69ziKY;p~Q&2@;#V->+XjpU9+IzUOBE`SI)nk$r4S03M&rN!?8p3sN1pQSh99C z$j>|}?RQik{+QmK^yvQpR(te-NiE)J)EbKAr{;j$*eu+3%%A%A_(nH3>$1Cg)}fM~ z1}(C!#4_%k>JaW7?(kC&AzJSg`Rrql0+mnHS(FT;lPsCInp4nY!90F=x3!qNN`Wah z%&9DJSVDKXR*@LbB{XMn0Zg%q1;6<#Nt2r^uK&qEu9XV&#>y1qrgp*YC$w1i%u2GW z@HuJu^AS06rz;e-Dl_k<)uW30CaBmb!RGWPl0NVf&#_AjS8IvIqF7bzY8+2yf92lw z&N+#VAro$#vro1-z>4{n5EscX z*4M-?$10SVTRu`cWjLbg{9SPNvjcpVe@TCQ@`P)nDk1h*9zSd$%l+>I3gDdH7~bY? z*DxWr29rHH<5itRG~T5ONyJ-fJ}w&uWGp9p?=*1lf-ivhtn;+HS^?40&4bfZRGAHT zAJbLS))T|21$4S(H=LjRnjRc`86$JYB0&lGs`bTrr3C%i6jV?f216f>M1|SqkefUM z!yLv#bA2w9%U`2&`+Xy4E_Y+J&Mm@Yf(KA`R}WgY6u{+Wd(il<4Ky7uhxfx&+0Wg- z@|}-eg9*ts;1X~YPl-;$g{SfSPQe>+gv7e4j8uZA$vMz_uEB&Xy2M{S>;Nf>w1BLK zx~TGTA{+9(M0!sA2ov))+3yNzn6K_gQxi1NQAY!Mcn3hv>8n^1o`Nqlv!svSOb2nw zFxcsEo9vr77YS8jx8K#OG`Rek=9Cmv(cyg|JK`*EJ*o~5CNHP^eyXyieY()7&iCQT z`m6lb@M>JftHRGAU*PiYJnR~ff`aj%Nm<=^B9{{&9p`4k_=FLn|F|pb$KM2Nk{07v z%VPMP-2kr*rZ8(PT$q5z;WWqX2%hWZLXWxsz%@_j!;y;VaQvMGZmNw#)iXB%lix_Q zr`^JN!%bmNqY|d=I*0-HS7QX>-l22*0KDTL5DI6)>xg)oe8L*{-E4v75r4uZQ8P7G z-3%ux-{7neW4JMPYgJy|HeCL60_cuC01Gk)@-!aWlCF2o;ZO}H*g4n(YBq#|cf=qV zb*c~37&?Vo=yaAEb7OBoYk%T;&`+9Ont-Z$VW9WO4!?W7gt2kG$#iZE`0Ra)^W^)X z^U)d9eBfOW1o(sNp6%4GDuo@**8m&IS$;KBi>~1X;NjaHmUvsxjZ4=;gVABUq)a`((c`wMz zadT+GH(T@^9|OsztH|&%t02?XjZVD!4EOc4WyhJVp?oE3HaU)6JTB-Mp4~^pa;%-m8P*(J1QjSqbijav#(8BqVg&0Wo%p%qQ;I zd5_Rue4q1X*l%Z(G)hi{!53nr+r@Gq7M_y&1d8dZAv<}FgQb``+l6X)24QxfgnLKo zEacWya<#S{7}9$w`D2R`GrVC3-G5;fweHlDt$nUYroO4d%boVY%Q zj5`(Z?VJw%F<}%;?@<6jRga}(N}tkM&N__h-H+f<;SJvwE5Ontsa4VsKqJ9eorxAmy2X619f+QRWc0rw^0{bw#&@Cn@)nK6X566Xm zNthnO{RZRD_4sbk8058dW|)c>BvOsR&mT79_7nx?A*zHdw-R3lK?$6SA!1UluwuyE8*FDH`uAT z2LhZ+(5&k_xY+y?6rAi---i2~6iXb0)vQ3?$QNiUXUVWkgUFV-h??DP| z*$+K7oK)aF&DZ?c`rhojg>%5{`D&c-a6go7T7wn0VzK#9v~<~z96TgHp4@et3O9Q^ z#-=HHth(GxxE8tqjhDKj-kt8`n89#Jh1q2D$z_!T$7i5tAMX7sqb=Nff~VobtyVB> zErq_`AIVhrNVvE}ld5_RB|6+Qxy-pHptWB_Y_|LG$j z=M zR^V~TfIoV(J(Jbbis20gR2%;Y=lL7Jx!ngz**?Glp3Dkn|g1>fc$$HowZoH$>|AY)d)Cvp2QTh641HuiriY4 zM24?_K(>neFc;UX$EG9JwAL$+Kh1bM&NC{dBaU$O`JRjM{qkyZJJSK{x*Q=lo_#{q zG99MnO(XU`D}s8jA^5=vd@0G_ zevT^J-XsHpRMBw4eY$_d4D=k^gAU!}P-T^Vjaa`6Be~NeV4h+yZJnJ21=A1HmJKJs zDJGF#_&k(7anJ<%N~U=&f^Q$Ff?Ngi zbgiWCr`|<_$YN^tbsDItQ*5n$OII29XWi!Cp)Q6S;O$rqtR8E^%+_y&yGc_pmbVTj ztgr|1!Fp)=(Vf*DH3bYz zDJHrTy-4$=omHn!twx9b2IR%N5b0>M4VY=>Pb_&oVfNR3&@k{EMEk^`=%F0(sM(Ga z3;WSqj(nV6wFwsXt)~U+RzdYn33`4t0lzKYB;@;RC}DdsrV3l3!gGgoh(|2FaodVn zzGfXir)~;+s*4xhJYf^idmAz2O)hOre1t{d2Ms&wh@-k1bNrhHsIPk~<+VH}GtQ~P zGUF^#dQl7asDH+c$34)~J`n;gRN}s+E|}%&${3HDjkoeL( z1-IT$V0P^5M~A(4FfqBK`2MGKj8@8ry&gfZdIHzCIUC9C6_!lV&(~J}X`S;S{~S+H za4syrWeb*W3$bt;pBfflpwBs9AnHIaypMTJ?$5hR=bTAEt?L2kJgSHtH=|%t=TahA zUqlaoPQqsVs&t5LJ$E18xrP5_!3^4B zbCKrlIE#-1rb6U{do<#;I8lk129f5Q@L6bRD@4cGDj6rd6mg??XoU50~V^ zg}cM}G5S3zdm+ji1tmaB&^pkuc!ZAcCo{J!jKOe^323>v!a&y{c=1V;(bY0y6XeEt zA7e|wFkT$a=*~y)l8+D)y^hi0-J&5o(@5+jIY{{VrxGZ7w1?||MPBWcpx5S z;`@MU4a01WH2if?53T0~g7O3%tVxtaR^lwaQu8MQS`0*n9zlyKBH-p5M4g4lQ}${K z4Z3Uy&lNa-UFj967=29r?_UM2^EaRo{=y#LwUCzn5-aAtB046MnfH>2H5X#w;aVj& zx*>?FH{RlP-%UVY6(!L1Da1PGL-2uHgMW96v(86sF@BaQNqM*e@UJaZsx)V-N;b2M z^gZZXW=x%28?0hdD|jWD#t?Km7m7`~;I!8z-u~9RaMRL&wyNZV`w6b+`85tV>Ab)k zeG~TPi~V>egv&-J8Zz!>JMjCTW^@_TvAUTuo4MsJ%<3*$j?=HIQ42*L*}riQN1LYa9Q=2(Bk 
zeiZDQC&TxDv)Q7G_2}6=5ob&1k+A+OrdGF*Unjp08|sTNi<$FQ!ZzM^cpR_;U%CCp8GQ=$V*5L*O4&qQ7Ip;JyG8O!?+oCm zegs&if92=h84rimjp!=Jop9@PXYsyEy72yh07i{V!o8QuNY8@XXw;H~asCv1paQg1 z>cFFG7xl`!2tErvAY`5iUcGY%rC*2gH~aGF(4Z!4G!B6sFF#VcqXU-f3cv|}DJJWO zE0LRUllP}ojTv<6#AD4jawX{HbjfDkNQjm_5ESYQy#T7E$Ls) zt@OM28wfLb&+!?7S=il$)vnHTi`Q<9KCzC}=yJUmxko&~h1n>k@|1ilk0!~1GH|SL zDx-d}6Arm^TxAn!Fy{DZ^OrcIos>3P@>?Ihx$M%1PZ3mGr~was*$-D8oWWt&CeX9_ z3{%hcQZ;b}3^9ttzL`O|DgH1v%@<`xYmY%y{c3Q{t3`Kndm7@N50YO;fZ4Jdm{3j^ zy5x&>JCC8ukv(9&Ukf|BOt2#&AKgxwu{LeXu}N4MHY^+gov(3ZweB1yVWK#QIjS%(u)E$|$=J*$mg>6Vp zEW*hG<27W{+Hq{9b{gOH*$w)pXcn8}vL33en=tnNJ6ba*h(_AoMwPdgIM!ti9l0NH zNA)-8%{omjFAL$@6T*{&(lT0D`BvO5n0 zBa1*ip5smah@j?=r!Y=-F~|=W!IdtuXp?;%kIdkcNM-}b{Ss%$;3O{Cqyk&YWm%cR zZYU~igKrZX=uz89@?~u$@4KEZma_Jgc{mZbIv&C;WujbvrWWuVibFjRl*z%dVZxU-3e25G0D+DMK5$ch0s z0XJ~Ce3cjW=r+oji_%HY2qpSv=&j!a20cCGlj=26Q-l!x#1Gk>N_eO)0y?eou;1(< zy`Q4NtNu8i70h`@9HWD9&73gk7U1$XLYj Date: Sun, 1 Sep 2019 02:00:03 +0800 Subject: [PATCH 131/153] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dsequence=20labeling?= =?UTF-8?q?=20=E6=B5=8B=E8=AF=95=E6=8A=A5=E9=94=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/models/sequence_labeling.py | 41 ++++++++++----------------- test/models/test_sequence_labeling.py | 17 ++++++++++- 2 files changed, 31 insertions(+), 27 deletions(-) diff --git a/fastNLP/models/sequence_labeling.py b/fastNLP/models/sequence_labeling.py index 0c573a90..6e839bea 100644 --- a/fastNLP/models/sequence_labeling.py +++ b/fastNLP/models/sequence_labeling.py @@ -39,14 +39,14 @@ class BiLSTMCRF(BaseModel): self.embed = get_embeddings(embed) if num_layers>1: - self.lstm = LSTM(embed.embedding_dim, num_layers=num_layers, hidden_size=hidden_size, bidirectional=True, + self.lstm = LSTM(self.embed.embedding_dim, num_layers=num_layers, hidden_size=hidden_size, bidirectional=True, batch_first=True, dropout=dropout) else: - self.lstm = LSTM(embed.embedding_dim, num_layers=num_layers, hidden_size=hidden_size, bidirectional=True, + self.lstm = LSTM(self.embed.embedding_dim, num_layers=num_layers, hidden_size=hidden_size, bidirectional=True, batch_first=True) self.dropout = nn.Dropout(dropout) - self.fc = nn.Linear(hidden_size, num_classes) + self.fc = nn.Linear(hidden_size*2, num_classes) trans = None if target_vocab is not None and encoding_type is not None: @@ -56,7 +56,7 @@ class BiLSTMCRF(BaseModel): def _forward(self, words, seq_len=None, target=None): words = self.embed(words) - feats = self.lstm(words, seq_len=seq_len) + feats, _ = self.lstm(words, seq_len=seq_len) feats = self.fc(feats) feats = self.dropout(feats) logits = F.log_softmax(feats, dim=-1) @@ -142,8 +142,6 @@ class SeqLabeling(BaseModel): """ x = x.float() y = y.long() - assert x.shape[:2] == y.shape - assert y.shape == self.mask.shape total_loss = self.crf(x, y, mask) return torch.mean(total_loss) @@ -195,36 +193,29 @@ class AdvSeqLabel(nn.Module): allowed_transitions=allowed_transitions(id2words, encoding_type=encoding_type)) - def _decode(self, x): + def _decode(self, x, mask): """ :param torch.FloatTensor x: [batch_size, max_len, tag_size] + :param torch.ByteTensor mask: [batch_size, max_len] :return torch.LongTensor, [batch_size, max_len] """ - tag_seq, _ = self.Crf.viterbi_decode(x, self.mask) + tag_seq, _ = self.Crf.viterbi_decode(x, mask) return tag_seq - def _internal_loss(self, x, y): + def _internal_loss(self, x, y, mask): """ Negative log likelihood loss. 
:param x: Tensor, [batch_size, max_len, tag_size] :param y: Tensor, [batch_size, max_len] + :param mask: Tensor, [batch_size, max_len] :return loss: a scalar Tensor """ x = x.float() y = y.long() - assert x.shape[:2] == y.shape - assert y.shape == self.mask.shape - total_loss = self.Crf(x, y, self.mask) + total_loss = self.Crf(x, y, mask) return torch.mean(total_loss) - def _make_mask(self, x, seq_len): - batch_size, max_len = x.size(0), x.size(1) - mask = seq_len_to_mask(seq_len) - mask = mask.view(batch_size, max_len) - mask = mask.to(x).float() - return mask - def _forward(self, words, seq_len, target=None): """ :param torch.LongTensor words: [batch_size, mex_len] @@ -236,15 +227,13 @@ class AdvSeqLabel(nn.Module): words = words.long() seq_len = seq_len.long() - self.mask = self._make_mask(words, seq_len) - - # seq_len = seq_len.long() + mask = seq_len_to_mask(seq_len, max_len=words.size(1)) + target = target.long() if target is not None else None if next(self.parameters()).is_cuda: words = words.cuda() - self.mask = self.mask.cuda() - + x = self.Embedding(words) x = self.norm1(x) # [batch_size, max_len, word_emb_dim] @@ -257,9 +246,9 @@ class AdvSeqLabel(nn.Module): x = self.drop(x) x = self.Linear2(x) if target is not None: - return {"loss": self._internal_loss(x, target)} + return {"loss": self._internal_loss(x, target, mask)} else: - return {"pred": self._decode(x)} + return {"pred": self._decode(x, mask)} def forward(self, words, seq_len, target): """ diff --git a/test/models/test_sequence_labeling.py b/test/models/test_sequence_labeling.py index 3a70e381..815d7047 100644 --- a/test/models/test_sequence_labeling.py +++ b/test/models/test_sequence_labeling.py @@ -3,9 +3,24 @@ import unittest from .model_runner import * -from fastNLP.models.sequence_labeling import SeqLabeling, AdvSeqLabel +from fastNLP.models.sequence_labeling import SeqLabeling, AdvSeqLabel, BiLSTMCRF from fastNLP.core.losses import LossInForward +class TestBiLSTM(unittest.TestCase): + def test_case1(self): + # 测试能否正常运行CNN + init_emb = (VOCAB_SIZE, 30) + model = BiLSTMCRF(init_emb, + hidden_size=30, + num_classes=NUM_CLS) + + data = RUNNER.prepare_pos_tagging_data() + data.set_input('target') + loss = LossInForward() + metric = AccuracyMetric(pred=C.OUTPUT, target=C.TARGET, seq_len=C.INPUT_LEN) + RUNNER.run_model(model, data, loss, metric) + + class TesSeqLabel(unittest.TestCase): def test_case1(self): # 测试能否正常运行CNN From 091f24e393f434eba66937af65adcbcd8ea3d3cf Mon Sep 17 00:00:00 2001 From: Yige Xu Date: Sun, 1 Sep 2019 10:15:11 +0800 Subject: [PATCH 132/153] fix some bugs in test code. 
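The fixed tests assume they are invoked from the repository root so that relative paths such as 'test/data_for_tests/...' resolve; a minimal sketch of running one of the touched modules through the test package layout introduced here (the chosen module name is only an example):

    import unittest

    # test/__init__.py and test/models/__init__.py (added in this patch) make the
    # test tree importable as a package; data paths are relative to the repo root.
    suite = unittest.defaultTestLoader.loadTestsFromName("test.models.test_bert")
    unittest.TextTestRunner(verbosity=2).run(suite)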
--- test/__init__.py | 3 +++ test/core/test_utils.py | 17 +++++++++++------ test/models/__init__.py | 0 test/models/test_bert.py | 2 +- 4 files changed, 15 insertions(+), 7 deletions(-) create mode 100644 test/__init__.py create mode 100644 test/models/__init__.py diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 00000000..c7a5f082 --- /dev/null +++ b/test/__init__.py @@ -0,0 +1,3 @@ +import fastNLP + +__all__ = ["fastNLP"] diff --git a/test/core/test_utils.py b/test/core/test_utils.py index 363d5fa1..29645fb1 100644 --- a/test/core/test_utils.py +++ b/test/core/test_utils.py @@ -119,7 +119,8 @@ class TestCache(unittest.TestCase): def test_cache_save(self): try: start_time = time.time() - embed, vocab, d = process_data_1('test/data_for_tests/word2vec_test.txt', 'test/data_for_tests/cws_train') + embed, vocab, d = process_data_1('test/data_for_tests/embedding/small_static_embedding/word2vec_test.txt', + 'test/data_for_tests/cws_train') end_time = time.time() pre_time = end_time - start_time with open('test/demo1.pkl', 'rb') as f: @@ -128,7 +129,8 @@ class TestCache(unittest.TestCase): for i in range(embed.shape[0]): self.assertListEqual(embed[i].tolist(), _embed[i].tolist()) start_time = time.time() - embed, vocab, d = process_data_1('test/data_for_tests/word2vec_test.txt', 'test/data_for_tests/cws_train') + embed, vocab, d = process_data_1('test/data_for_tests/embedding/small_static_embedding/word2vec_test.txt', + 'test/data_for_tests/cws_train') end_time = time.time() read_time = end_time - start_time print("Read using {:.3f}, while prepare using:{:.3f}".format(read_time, pre_time)) @@ -139,7 +141,7 @@ class TestCache(unittest.TestCase): def test_cache_save_overwrite_path(self): try: start_time = time.time() - embed, vocab, d = process_data_1('test/data_for_tests/word2vec_test.txt', 'test/data_for_tests/cws_train', + embed, vocab, d = process_data_1('test/data_for_tests/embedding/small_static_embedding/word2vec_test.txt', 'test/data_for_tests/cws_train', _cache_fp='test/demo_overwrite.pkl') end_time = time.time() pre_time = end_time - start_time @@ -149,7 +151,8 @@ class TestCache(unittest.TestCase): for i in range(embed.shape[0]): self.assertListEqual(embed[i].tolist(), _embed[i].tolist()) start_time = time.time() - embed, vocab, d = process_data_1('test/data_for_tests/word2vec_test.txt', 'test/data_for_tests/cws_train', + embed, vocab, d = process_data_1('test/data_for_tests/embedding/small_static_embedding/word2vec_test.txt', + 'test/data_for_tests/cws_train', _cache_fp='test/demo_overwrite.pkl') end_time = time.time() read_time = end_time - start_time @@ -161,7 +164,8 @@ class TestCache(unittest.TestCase): def test_cache_refresh(self): try: start_time = time.time() - embed, vocab, d = process_data_1('test/data_for_tests/word2vec_test.txt', 'test/data_for_tests/cws_train', + embed, vocab, d = process_data_1('test/data_for_tests/embedding/small_static_embedding/word2vec_test.txt', + 'test/data_for_tests/cws_train', _refresh=True) end_time = time.time() pre_time = end_time - start_time @@ -171,7 +175,8 @@ class TestCache(unittest.TestCase): for i in range(embed.shape[0]): self.assertListEqual(embed[i].tolist(), _embed[i].tolist()) start_time = time.time() - embed, vocab, d = process_data_1('test/data_for_tests/word2vec_test.txt', 'test/data_for_tests/cws_train', + embed, vocab, d = process_data_1('test/data_for_tests/embedding/small_static_embedding/word2vec_test.txt', + 'test/data_for_tests/cws_train', _refresh=True) end_time = time.time() read_time = end_time 
- start_time diff --git a/test/models/__init__.py b/test/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/test/models/test_bert.py b/test/models/test_bert.py index 2b310edf..969a8594 100644 --- a/test/models/test_bert.py +++ b/test/models/test_bert.py @@ -82,7 +82,7 @@ class TestBert(unittest.TestCase): def test_bert_5(self): vocab = Vocabulary().add_word_lst("this is a test [SEP] .".split()) - embed = BertEmbedding(vocab, model_dir_or_name='./../data_for_tests/embedding/small_bert', + embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert', include_cls_sep=True) model = BertForSentenceMatching(embed) From 1c2ee50c47b0b59b81a828838bf531c54fea5181 Mon Sep 17 00:00:00 2001 From: yunfan Date: Sun, 1 Sep 2019 10:31:14 +0800 Subject: [PATCH 133/153] [fix] EchoCallback --- fastNLP/core/callback.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py index dde9a31a..5167b09f 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -1031,12 +1031,11 @@ class EchoCallback(Callback): def __init__(self, name, out=sys.stdout): super(EchoCallback, self).__init__() self.name = name - self.out = out + self.out = out # deprecated def __getattribute__(self, item): if item.startswith('on_'): - logger.info('{}.{} has been called at pid: {}'.format(self.name, item, os.getpid()), - file=self.out) + logger.info('{}.{} has been called at pid: {}'.format(self.name, item, os.getpid())) return super(EchoCallback, self).__getattribute__(item) From b9aa05f6cf371a9ceb99463c445fa000a724fa21 Mon Sep 17 00:00:00 2001 From: Yige Xu Date: Sun, 1 Sep 2019 11:22:42 +0800 Subject: [PATCH 134/153] add testing codes and data for loader and pipe. 
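The added fixtures are meant to be consumed by pointing a Loader or Pipe at the local folder instead of downloading the real corpus, as the new tests below do; a minimal sketch, assuming IMDBLoader and RTEPipe are importable from fastNLP.io as they are in the test code:

    from fastNLP.io import IMDBLoader, RTEPipe  # import location assumed

    # read the tiny IMDB sample shipped with the tests rather than the full dataset
    imdb_bundle = IMDBLoader().load('test/data_for_tests/io/imdb')
    print(imdb_bundle)

    # tokenize the small RTE sample with the whitespace ('raw') tokenizer
    rte_bundle = RTEPipe(tokenizer='raw').process_from_file('test/data_for_tests/io/rte')
    print(rte_bundle)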
--- test/data_for_tests/io/cws_msra/dev.txt | 2 ++ test/data_for_tests/io/cws_msra/test.txt | 2 ++ test/data_for_tests/io/cws_msra/train.txt | 3 +++ test/data_for_tests/io/imdb/dev.txt | 2 ++ test/data_for_tests/io/imdb/test.txt | 2 ++ test/data_for_tests/io/imdb/train.txt | 2 ++ test/data_for_tests/io/rte/dev.tsv | 3 +++ test/data_for_tests/io/rte/test.tsv | 3 +++ test/data_for_tests/io/rte/train.tsv | 4 ++++ test/io/loader/test_classification_loader.py | 8 ++++++++ test/io/loader/test_conll_loader.py | 14 ++++++++++++-- test/io/loader/test_cws_loader.py | 13 ++++++++++++- test/io/loader/test_matching_loader.py | 8 ++++++++ test/io/pipe/test_classification.py | 8 ++++++++ test/io/pipe/test_conll.py | 14 ++++++++++++-- test/io/pipe/test_cws.py | 12 +++++++++++- test/io/pipe/test_matching.py | 8 ++++++++ 17 files changed, 102 insertions(+), 6 deletions(-) create mode 100644 test/data_for_tests/io/cws_msra/dev.txt create mode 100644 test/data_for_tests/io/cws_msra/test.txt create mode 100644 test/data_for_tests/io/cws_msra/train.txt create mode 100644 test/data_for_tests/io/imdb/dev.txt create mode 100644 test/data_for_tests/io/imdb/test.txt create mode 100644 test/data_for_tests/io/imdb/train.txt create mode 100644 test/data_for_tests/io/rte/dev.tsv create mode 100644 test/data_for_tests/io/rte/test.tsv create mode 100644 test/data_for_tests/io/rte/train.tsv diff --git a/test/data_for_tests/io/cws_msra/dev.txt b/test/data_for_tests/io/cws_msra/dev.txt new file mode 100644 index 00000000..9c6b34ee --- /dev/null +++ b/test/data_for_tests/io/cws_msra/dev.txt @@ -0,0 +1,2 @@ +“ 人们 常 说 生活 是 一 部 教科书 , 而 血 与 火 的 战争 更 是 不可多得 的 教科书 , 她 确实 是 名副其实 的 ‘ 我 的 大学 ’ 。 +他 “ 严格要求 自己 , 从 一个 科举 出身 的 进士 成为 一个 伟大 的 民主主义 者 , 进而 成为 一 位 杰出 的 党外 共产主义 战士 , 献身 于 崇高 的 共产主义 事业 。 diff --git a/test/data_for_tests/io/cws_msra/test.txt b/test/data_for_tests/io/cws_msra/test.txt new file mode 100644 index 00000000..8d5c6b3c --- /dev/null +++ b/test/data_for_tests/io/cws_msra/test.txt @@ -0,0 +1,2 @@ +扬帆 远东 做 与 中国 合作 的 先行 +希腊 的 经济 结构 较 特殊 。 diff --git a/test/data_for_tests/io/cws_msra/train.txt b/test/data_for_tests/io/cws_msra/train.txt new file mode 100644 index 00000000..35c2cad0 --- /dev/null +++ b/test/data_for_tests/io/cws_msra/train.txt @@ -0,0 +1,3 @@ +“ 心 静 渐 知 春 似 海 , 花 深 每 觉 影 生 香 。 +“ 吃 屎 的 东西 , 连 一 捆 麦 也 铡 不 动 呀 ? +复旦大学 百年 校庆 。 \ No newline at end of file diff --git a/test/data_for_tests/io/imdb/dev.txt b/test/data_for_tests/io/imdb/dev.txt new file mode 100644 index 00000000..6b548a0c --- /dev/null +++ b/test/data_for_tests/io/imdb/dev.txt @@ -0,0 +1,2 @@ +neg It, at all, you have seen when harry met sally, then avoid this one. It will not only make you bang your head on the table as why can't bollywood even make a good remake; but also annoy you with the so called funny moments in it. The charm of the movie is missing. Ranee looks terrible. Saif tries to act like he is one hell of an actor. The plots that have been picked up from the original, don't look effective either. The part where both of them bring their friends along and they hit a note, it just doesn't look appealing. What can be more disastrous? you wanna waste some money, this is what you can get. Otherwise, put some more bucks, and watch the original. Its too good to miss.. +neg The monster from Enemy Mine somehow made his way into a small mountain community, where he has taken up residence. He's being hunted by a female doctor-turned-vigilante who is out to exterminate him. 
This female assassin, who looks like a refugee from a Motley Crue video, rides around on a motorcycle and tries to save a bunch of kids who have chosen to have a Big Chill weekend right smack dab in the middle of the monster's turf. Decapitations and lots of blood are primarily in place to draw attention away from the story which limps along like a bad version of the Island of Dr. Moreau (and yes, it's worse than the one with Val Kilmer). diff --git a/test/data_for_tests/io/imdb/test.txt b/test/data_for_tests/io/imdb/test.txt new file mode 100644 index 00000000..c9bfae74 --- /dev/null +++ b/test/data_for_tests/io/imdb/test.txt @@ -0,0 +1,2 @@ +neg Alan Rickman & Emma Thompson give good performances with southern/New Orleans accents in this detective flick. It's worth seeing for their scenes- and Rickman's scene with Hal Holbrook. These three actors mannage to entertain us no matter what the movie, it seems. The plot for the movie shows potential, but one gets the impression in watching the film that it was not pulled off as well as it could have been. The fact that it is cluttered by a rather uninteresting subplot and mostly uninteresting kidnappers really muddles things. The movie is worth a view- if for nothing more than entertaining performances by Rickman, Thompson, and Holbrook. +neg I have seen this movie and I did not care for this movie anyhow. I would not think about going to Paris because I do not like this country and its national capital. I do not like to learn french anyhow because I do not understand their language. Why would I go to France when I rather go to Germany or the United Kingdom? Germany and the United Kingdom are the nations I tolerate. Apparently the Olsen Twins do not understand the French language just like me. Therefore I will not bother the France trip no matter what. I might as well stick to the United Kingdom and meet single women and play video games if there is a video arcade. That is all. diff --git a/test/data_for_tests/io/imdb/train.txt b/test/data_for_tests/io/imdb/train.txt new file mode 100644 index 00000000..d6ac6b68 --- /dev/null +++ b/test/data_for_tests/io/imdb/train.txt @@ -0,0 +1,2 @@ +neg I'll try to use words to describe this on....

I saw the original, which was good in its own way, but back then I should have feared a sequel.

And I was 'afraid' when I picked this one up, but now that I've seen it, I have to say, it's even worse then I thought. Why these movies still get money still makes my mind spin.

Let's start with the actors;they aren't all that good, but it has to be said, some make heads turn by being just plain awful. But what can an actor do with a script like this one. It's trying to be a copy of the original only this time the places have changed, any form of story is gone and any attempt of actually coming up with something that hasn't been done before, fails miserably. In a futile attempt to get it up-to-date, they try to make it exciting by making use of the whole 'big-brother' theme , but that has been worn out ages ago and offers nothing but a filler for between the beginning and the end. An attempt was made to try to save the movie by making a ton of references to the '83 original, but it just ended up being plain funny and sometimes a bit sad. In conclusion, if you have nothing , and I mean nothing , to do... go watch it, or play Frisbee... with the DVD.... by yourself. It'll offer you the same amount of fun.. I promise +pos This movie is totally wicked! It's really great to see MJH in a different role than her Sabrina character! The plot is totally cool, and the characters are excellently written. Definitely one of the best movies!! diff --git a/test/data_for_tests/io/rte/dev.tsv b/test/data_for_tests/io/rte/dev.tsv new file mode 100644 index 00000000..725d7542 --- /dev/null +++ b/test/data_for_tests/io/rte/dev.tsv @@ -0,0 +1,3 @@ +index sentence1 sentence2 label +0 Dana Reeve, the widow of the actor Christopher Reeve, has died of lung cancer at age 44, according to the Christopher Reeve Foundation. Christopher Reeve had an accident. not_entailment +1 Yet, we now are discovering that antibiotics are losing their effectiveness against illness. Disease-causing bacteria are mutating faster than we can come up with new antibiotics to fight the new variations. Bacteria is winning the war against antibiotics. entailment diff --git a/test/data_for_tests/io/rte/test.tsv b/test/data_for_tests/io/rte/test.tsv new file mode 100644 index 00000000..aeceb467 --- /dev/null +++ b/test/data_for_tests/io/rte/test.tsv @@ -0,0 +1,3 @@ +index sentence1 sentence2 +0 Mangla was summoned after Madhumita's sister Nidhi Shukla, who was the first witness in the case. Shukla is related to Mangla. +1 Authorities in Brazil say that more than 200 people are being held hostage in a prison in the country's remote, Amazonian-jungle state of Rondonia. Authorities in Brazil hold 200 people as hostage. diff --git a/test/data_for_tests/io/rte/train.tsv b/test/data_for_tests/io/rte/train.tsv new file mode 100644 index 00000000..9f3dab6e --- /dev/null +++ b/test/data_for_tests/io/rte/train.tsv @@ -0,0 +1,4 @@ +index sentence1 sentence2 label +0 No Weapons of Mass Destruction Found in Iraq Yet. Weapons of Mass Destruction Found in Iraq. not_entailment +1 A place of sorrow, after Pope John Paul II died, became a place of celebration, as Roman Catholic faithful gathered in downtown Chicago to mark the installation of new Pope Benedict XVI. Pope Benedict XVI is the new leader of the Roman Catholic Church. entailment +2 Herceptin was already approved to treat the sickest breast cancer patients, and the company said, Monday, it will discuss with federal regulators the possibility of prescribing the drug for more breast cancer patients. Herceptin can be used to treat breast cancer. 
entailment diff --git a/test/io/loader/test_classification_loader.py b/test/io/loader/test_classification_loader.py index 28f08921..1438a014 100644 --- a/test/io/loader/test_classification_loader.py +++ b/test/io/loader/test_classification_loader.py @@ -17,3 +17,11 @@ class TestDownload(unittest.TestCase): for loader in [YelpFullLoader, YelpPolarityLoader, IMDBLoader, SST2Loader, SSTLoader]: data_bundle = loader().load() print(data_bundle) + + +class TestLoad(unittest.TestCase): + + def test_load(self): + for loader in [IMDBLoader]: + data_bundle = loader().load('test/data_for_tests/io/imdb') + print(data_bundle) diff --git a/test/io/loader/test_conll_loader.py b/test/io/loader/test_conll_loader.py index e44b8a2a..861de5a5 100644 --- a/test/io/loader/test_conll_loader.py +++ b/test/io/loader/test_conll_loader.py @@ -1,7 +1,9 @@ import unittest import os -from fastNLP.io.loader.conll import MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader +from fastNLP.io.loader.conll import MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader, \ + Conll2003Loader + class MSRANERTest(unittest.TestCase): @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") @@ -10,12 +12,20 @@ class MSRANERTest(unittest.TestCase): data_bundle = MsraNERLoader().load() print(data_bundle) + class PeopleDailyTest(unittest.TestCase): @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") def test_download(self): PeopleDailyNERLoader().download() + class WeiboNERTest(unittest.TestCase): @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") def test_download(self): - WeiboNERLoader().download() \ No newline at end of file + WeiboNERLoader().download() + + +class TestConll2003Loader(unittest.TestCase): + def test__load(self): + Conll2003Loader()._load('test/data_for_tests/conll_2003_example.txt') + diff --git a/test/io/loader/test_cws_loader.py b/test/io/loader/test_cws_loader.py index 6ad607c3..8b5d4081 100644 --- a/test/io/loader/test_cws_loader.py +++ b/test/io/loader/test_cws_loader.py @@ -10,4 +10,15 @@ class CWSLoaderTest(unittest.TestCase): for dataset_name in dataset_names: with self.subTest(dataset_name=dataset_name): data_bundle = CWSLoader(dataset_name=dataset_name).load() - print(data_bundle) \ No newline at end of file + print(data_bundle) + + +class RunCWSLoaderTest(unittest.TestCase): + def test_cws_loader(self): + dataset_names = ['msra'] + for dataset_name in dataset_names: + with self.subTest(dataset_name=dataset_name): + data_bundle = CWSLoader(dataset_name=dataset_name).load( + f'test/data_for_tests/io/cws_{dataset_name}' + ) + print(data_bundle) diff --git a/test/io/loader/test_matching_loader.py b/test/io/loader/test_matching_loader.py index 5c1a91f1..652cf161 100644 --- a/test/io/loader/test_matching_loader.py +++ b/test/io/loader/test_matching_loader.py @@ -20,3 +20,11 @@ class TestDownload(unittest.TestCase): data_bundle = loader().load() print(data_bundle) + +class TestLoad(unittest.TestCase): + + def test_load(self): + for loader in [RTELoader]: + data_bundle = loader().load('test/data_for_tests/io/rte') + print(data_bundle) + diff --git a/test/io/pipe/test_classification.py b/test/io/pipe/test_classification.py index 39dc71e0..c6e2005e 100644 --- a/test/io/pipe/test_classification.py +++ b/test/io/pipe/test_classification.py @@ -11,3 +11,11 @@ class TestPipe(unittest.TestCase): print(pipe) data_bundle = pipe(tokenizer='raw').process_from_file() print(data_bundle) + + +class TestRunPipe(unittest.TestCase): + + def test_load(self): + for pipe in [IMDBPipe]: + data_bundle = 
pipe(tokenizer='raw').process_from_file('test/data_for_tests/io/imdb') + print(data_bundle) diff --git a/test/io/pipe/test_conll.py b/test/io/pipe/test_conll.py index e8879d71..6f6c4fad 100644 --- a/test/io/pipe/test_conll.py +++ b/test/io/pipe/test_conll.py @@ -1,6 +1,7 @@ import unittest import os -from fastNLP.io import MsraNERPipe, PeopleDailyPipe, WeiboNERPipe +from fastNLP.io import MsraNERPipe, PeopleDailyPipe, WeiboNERPipe, Conll2003Pipe, Conll2003NERPipe + @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") class TestPipe(unittest.TestCase): @@ -9,4 +10,13 @@ class TestPipe(unittest.TestCase): with self.subTest(pipe=pipe): print(pipe) data_bundle = pipe().process_from_file() - print(data_bundle) \ No newline at end of file + print(data_bundle) + + +class TestRunPipe(unittest.TestCase): + def test_conll2003(self): + for pipe in [Conll2003Pipe, Conll2003NERPipe]: + with self.subTest(pipe=pipe): + print(pipe) + data_bundle = pipe().process_from_file('test/data_for_tests/conll_2003_example.txt') + print(data_bundle) diff --git a/test/io/pipe/test_cws.py b/test/io/pipe/test_cws.py index 2fc57ae2..dd901a25 100644 --- a/test/io/pipe/test_cws.py +++ b/test/io/pipe/test_cws.py @@ -3,6 +3,7 @@ import unittest import os from fastNLP.io.pipe.cws import CWSPipe + class CWSPipeTest(unittest.TestCase): @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") def test_process_from_file(self): @@ -10,4 +11,13 @@ class CWSPipeTest(unittest.TestCase): for dataset_name in dataset_names: with self.subTest(dataset_name=dataset_name): data_bundle = CWSPipe(dataset_name=dataset_name).process_from_file() - print(data_bundle) \ No newline at end of file + print(data_bundle) + + +class RunCWSPipeTest(unittest.TestCase): + def test_process_from_file(self): + dataset_names = ['msra'] + for dataset_name in dataset_names: + with self.subTest(dataset_name=dataset_name): + data_bundle = CWSPipe().process_from_file(f'test/data_for_tests/io/cws_{dataset_name}') + print(data_bundle) diff --git a/test/io/pipe/test_matching.py b/test/io/pipe/test_matching.py index c057bb0c..33904e7a 100644 --- a/test/io/pipe/test_matching.py +++ b/test/io/pipe/test_matching.py @@ -24,3 +24,11 @@ class TestBertPipe(unittest.TestCase): print(pipe) data_bundle = pipe(tokenizer='raw').process_from_file() print(data_bundle) + + +class TestRunPipe(unittest.TestCase): + + def test_load(self): + for pipe in [RTEPipe, RTEBertPipe]: + data_bundle = pipe(tokenizer='raw').process_from_file('test/data_for_tests/io/rte') + print(data_bundle) From 1994029ab84fb70ee8d790732006747f5d918a02 Mon Sep 17 00:00:00 2001 From: yh Date: Mon, 2 Sep 2019 15:59:45 +0800 Subject: [PATCH 135/153] =?UTF-8?q?1.=E5=BD=93=E5=89=8D=E6=94=AF=E6=8C=81?= =?UTF-8?q?=E7=9A=84encoding=5Ftype=E9=83=BD=E6=94=AF=E6=8C=81=E4=BB=8Etag?= =?UTF-8?q?=5Fvocab=E4=B8=AD=E8=87=AA=E5=8A=A8=E5=88=A4=E6=96=AD;=E9=81=BF?= =?UTF-8?q?=E5=85=8D=E8=A7=A6=E5=8F=91=E6=97=A0=E6=84=8F=E8=AF=86=E5=AF=BC?= =?UTF-8?q?=E8=87=B4=E7=9A=84metric=20bug;=202.=20=E4=BF=AE=E5=A4=8D?= =?UTF-8?q?=E9=83=A8=E5=88=86inplace=E6=93=8D=E4=BD=9C=E6=97=A0=E6=B3=95?= =?UTF-8?q?=E6=B1=82=E5=AF=BC=E7=9A=84=E9=97=AE=E9=A2=98;=203.Vocabulary?= =?UTF-8?q?=E5=B0=86=E4=B8=80=E4=BA=9B=E5=B1=9E=E6=80=A7=E9=80=9A=E8=BF=87?= =?UTF-8?q?property=E6=9A=B4=E9=9C=B2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/metrics.py | 85 +++++++++++++++++++------ fastNLP/core/vocabulary.py | 70 +++++++++++++-------- fastNLP/io/data_bundle.py | 43 ++++++++++++- 
fastNLP/io/pipe/conll.py | 2 +- fastNLP/models/biaffine_parser.py | 15 +++-- fastNLP/modules/decoder/crf.py | 38 ++++++++---- test/core/test_metrics.py | 41 +++++++++++- test/modules/decoder/test_CRF.py | 100 +++++++++++++++++++++++++++++- 8 files changed, 321 insertions(+), 73 deletions(-) diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index 0dc601a3..b06e5459 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -24,7 +24,7 @@ from .utils import seq_len_to_mask from .vocabulary import Vocabulary from abc import abstractmethod import warnings - +from typing import Union class MetricBase(object): """ @@ -337,15 +337,18 @@ class AccuracyMetric(MetricBase): raise TypeError(f"`seq_lens` in {_get_func_signature(self.evaluate)} must be torch.Tensor," f"got {type(seq_len)}.") - if seq_len is not None: - masks = seq_len_to_mask(seq_len=seq_len) + if seq_len is not None and target.dim()>1: + max_len = target.size(1) + masks = seq_len_to_mask(seq_len=seq_len, max_len=max_len) else: masks = None - if pred.size() == target.size(): + if pred.dim() == target.dim(): pass - elif len(pred.size()) == len(target.size()) + 1: + elif pred.dim() == target.dim() + 1: pred = pred.argmax(dim=-1) + if seq_len is None: + warnings.warn("You are not passing `seq_len` to exclude pad when calculate accuracy.") else: raise RuntimeError(f"In {_get_func_signature(self.evaluate)}, when pred have " f"size:{pred.size()}, target should have size: {pred.size()} or " @@ -493,20 +496,63 @@ def _bio_tag_to_spans(tags, ignore_labels=None): return [(span[0], (span[1][0], span[1][1] + 1)) for span in spans if span[0] not in ignore_labels] -def _check_tag_vocab_and_encoding_type(vocab:Vocabulary, encoding_type:str): +def _get_encoding_type_from_tag_vocab(tag_vocab:Union[Vocabulary, dict])->str: + """ + 给定Vocabulary自动判断是哪种类型的encoding, 支持判断bmes, bioes, bmeso, bio + + :param tag_vocab: 支持传入tag Vocabulary; 或者传入形如{0:"O", 1:"B-tag1"},即index在前,tag在后的dict。 + :return: + """ + tag_set = set() + unk_token = '' + pad_token = '' + if isinstance(tag_vocab, Vocabulary): + unk_token = tag_vocab.unknown + pad_token = tag_vocab.padding + tag_vocab = tag_vocab.idx2word + for idx, tag in tag_vocab.items(): + if tag in (unk_token, pad_token): + continue + tag = tag[:1].lower() + tag_set.add(tag) + + bmes_tag_set = set('bmes') + if tag_set == bmes_tag_set: + return 'bmes' + bio_tag_set = set('bio') + if tag_set == bio_tag_set: + return 'bio' + bmeso_tag_set = set('bmeso') + if tag_set == bmeso_tag_set: + return 'bmeso' + bioes_tag_set = set('bioes') + if tag_set == bioes_tag_set: + return 'bioes' + raise RuntimeError("encoding_type cannot be inferred automatically. 
Only support " + "'bio', 'bmes', 'bmeso', 'bioes' type.") + + +def _check_tag_vocab_and_encoding_type(tag_vocab:Union[Vocabulary, dict], encoding_type:str): """ 检查vocab中的tag是否与encoding_type是匹配的 - :param vocab: target的Vocabulary + :param tag_vocab: 支持传入tag Vocabulary; 或者传入形如{0:"O", 1:"B-tag1"},即index在前,tag在后的dict。 :param encoding_type: bio, bmes, bioes, bmeso :return: """ tag_set = set() - for tag, idx in vocab: - if idx in (vocab.unknown_idx, vocab.padding_idx): + unk_token = '' + pad_token = '' + if isinstance(tag_vocab, Vocabulary): + unk_token = tag_vocab.unknown + pad_token = tag_vocab.padding + tag_vocab = tag_vocab.idx2word + for idx, tag in tag_vocab.items(): + if tag in (unk_token, pad_token): continue tag = tag[:1].lower() tag_set.add(tag) + tags = encoding_type for tag in tag_set: assert tag in tags, f"{tag} is not a valid tag in encoding type:{encoding_type}. Please check your " \ @@ -549,7 +595,7 @@ class SpanFPreRecMetric(MetricBase): :param str pred: 用该key在evaluate()时从传入dict中取出prediction数据。 为None,则使用 `pred` 取数据 :param str target: 用该key在evaluate()时从传入dict中取出target数据。 为None,则使用 `target` 取数据 :param str seq_len: 用该key在evaluate()时从传入dict中取出sequence length数据。为None,则使用 `seq_len` 取数据。 - :param str encoding_type: 目前支持bio, bmes, bmeso, bioes + :param str encoding_type: 目前支持bio, bmes, bmeso, bioes。默认为None,通过tag_vocab自动判断. :param list ignore_labels: str 组成的list. 这个list中的class不会被用于计算。例如在POS tagging时传入['NN'],则不会计算'NN'这 个label :param bool only_gross: 是否只计算总的f1, precision, recall的值;如果为False,不仅返回总的f1, pre, rec, 还会返回每个 @@ -560,18 +606,21 @@ class SpanFPreRecMetric(MetricBase): 常用为beta=0.5, 1, 2. 若为0.5则精确率的权重高于召回率;若为1,则两者平等;若为2,则召回率权重高于精确率。 """ - def __init__(self, tag_vocab, pred=None, target=None, seq_len=None, encoding_type='bio', ignore_labels=None, + def __init__(self, tag_vocab, pred=None, target=None, seq_len=None, encoding_type=None, ignore_labels=None, only_gross=True, f_type='micro', beta=1): - - encoding_type = encoding_type.lower() - + if not isinstance(tag_vocab, Vocabulary): raise TypeError("tag_vocab can only be fastNLP.Vocabulary, not {}.".format(type(tag_vocab))) if f_type not in ('micro', 'macro'): raise ValueError("f_type only supports `micro` or `macro`', got {}.".format(f_type)) - - self.encoding_type = encoding_type - _check_tag_vocab_and_encoding_type(tag_vocab, encoding_type) + + if encoding_type: + encoding_type = encoding_type.lower() + _check_tag_vocab_and_encoding_type(tag_vocab, encoding_type) + self.encoding_type = encoding_type + else: + self.encoding_type = _get_encoding_type_from_tag_vocab(tag_vocab) + if self.encoding_type == 'bmes': self.tag_to_span_func = _bmes_tag_to_spans elif self.encoding_type == 'bio': @@ -581,7 +630,7 @@ class SpanFPreRecMetric(MetricBase): elif self.encoding_type == 'bioes': self.tag_to_span_func = _bioes_tag_to_spans else: - raise ValueError("Only support 'bio', 'bmes', 'bmeso' type.") + raise ValueError("Only support 'bio', 'bmes', 'bmeso', 'bioes' type.") self.ignore_labels = ignore_labels self.f_type = f_type diff --git a/fastNLP/core/vocabulary.py b/fastNLP/core/vocabulary.py index cd4f2c0f..b0f9650a 100644 --- a/fastNLP/core/vocabulary.py +++ b/fastNLP/core/vocabulary.py @@ -39,7 +39,7 @@ def _check_build_vocab(func): @wraps(func) # to solve missing docstring def _wrapper(self, *args, **kwargs): - if self.word2idx is None or self.rebuild is True: + if self._word2idx is None or self.rebuild is True: self.build_vocab() return func(self, *args, **kwargs) @@ -95,12 +95,30 @@ class Vocabulary(object): self.word_count = Counter() 
self.unknown = unknown self.padding = padding - self.word2idx = None - self.idx2word = None + self._word2idx = None + self._idx2word = None self.rebuild = True # 用于承载不需要单独创建entry的词语,具体见from_dataset()方法 self._no_create_word = Counter() - + + @property + @_check_build_vocab + def word2idx(self): + return self._word2idx + + @word2idx.setter + def word2idx(self, value): + self._word2idx = value + + @property + @_check_build_vocab + def idx2word(self): + return self._idx2word + + @idx2word.setter + def idx2word(self, value): + self._word2idx = value + @_check_build_status def update(self, word_lst, no_create_entry=False): """依次增加序列中词在词典中的出现频率 @@ -187,21 +205,21 @@ class Vocabulary(object): 但已经记录在词典中的词, 不会改变对应的 `int` """ - if self.word2idx is None: - self.word2idx = {} + if self._word2idx is None: + self._word2idx = {} if self.padding is not None: - self.word2idx[self.padding] = len(self.word2idx) + self._word2idx[self.padding] = len(self._word2idx) if self.unknown is not None: - self.word2idx[self.unknown] = len(self.word2idx) + self._word2idx[self.unknown] = len(self._word2idx) max_size = min(self.max_size, len(self.word_count)) if self.max_size else None words = self.word_count.most_common(max_size) if self.min_freq is not None: words = filter(lambda kv: kv[1] >= self.min_freq, words) - if self.word2idx is not None: - words = filter(lambda kv: kv[0] not in self.word2idx, words) - start_idx = len(self.word2idx) - self.word2idx.update({w: i + start_idx for i, (w, _) in enumerate(words)}) + if self._word2idx is not None: + words = filter(lambda kv: kv[0] not in self._word2idx, words) + start_idx = len(self._word2idx) + self._word2idx.update({w: i + start_idx for i, (w, _) in enumerate(words)}) self.build_reverse_vocab() self.rebuild = False return self @@ -211,12 +229,12 @@ class Vocabulary(object): 基于 `word to index` dict, 构建 `index to word` dict. 
""" - self.idx2word = {i: w for w, i in self.word2idx.items()} + self._idx2word = {i: w for w, i in self._word2idx.items()} return self @_check_build_vocab def __len__(self): - return len(self.word2idx) + return len(self._word2idx) @_check_build_vocab def __contains__(self, item): @@ -226,7 +244,7 @@ class Vocabulary(object): :param item: the word :return: True or False """ - return item in self.word2idx + return item in self._word2idx def has_word(self, w): """ @@ -248,10 +266,10 @@ class Vocabulary(object): vocab[w] """ - if w in self.word2idx: - return self.word2idx[w] + if w in self._word2idx: + return self._word2idx[w] if self.unknown is not None: - return self.word2idx[self.unknown] + return self._word2idx[self.unknown] else: raise ValueError("word `{}` not in vocabulary".format(w)) @@ -405,7 +423,7 @@ class Vocabulary(object): """ if self.unknown is None: return None - return self.word2idx[self.unknown] + return self._word2idx[self.unknown] @property @_check_build_vocab @@ -415,7 +433,7 @@ class Vocabulary(object): """ if self.padding is None: return None - return self.word2idx[self.padding] + return self._word2idx[self.padding] @_check_build_vocab def to_word(self, idx): @@ -425,7 +443,7 @@ class Vocabulary(object): :param int idx: the index :return str word: the word """ - return self.idx2word[idx] + return self._idx2word[idx] def clear(self): """ @@ -434,8 +452,8 @@ class Vocabulary(object): :return: """ self.word_count.clear() - self.word2idx = None - self.idx2word = None + self._word2idx = None + self._idx2word = None self.rebuild = True self._no_create_word.clear() return self @@ -446,8 +464,8 @@ class Vocabulary(object): """ len(self) # make sure vocab has been built state = self.__dict__.copy() - # no need to pickle idx2word as it can be constructed from word2idx - del state['idx2word'] + # no need to pickle _idx2word as it can be constructed from _word2idx + del state['_idx2word'] return state def __setstate__(self, state): @@ -462,5 +480,5 @@ class Vocabulary(object): @_check_build_vocab def __iter__(self): - for word, index in self.word2idx.items(): + for word, index in self._word2idx.items(): yield word, index diff --git a/fastNLP/io/data_bundle.py b/fastNLP/io/data_bundle.py index f30add34..3e7f39d3 100644 --- a/fastNLP/io/data_bundle.py +++ b/fastNLP/io/data_bundle.py @@ -8,7 +8,7 @@ __all__ = [ from ..core.dataset import DataSet from ..core.vocabulary import Vocabulary - +from typing import Union class DataBundle: """ @@ -191,7 +191,7 @@ class DataBundle: raise KeyError(f"{field_name} not found DataSet:{name}.") return self - def rename_field(self, field_name, new_field_name, ignore_miss_dataset=True): + def rename_field(self, field_name, new_field_name, ignore_miss_dataset=True, rename_vocab=True): """ 将DataBundle中所有DataSet中名为field_name的field重命名为new_field_name. 
@@ -199,6 +199,7 @@ class DataBundle: :param str new_field_name: :param bool ignore_miss_dataset: 当某个field名称在某个dataset不存在时,如果为True,则直接忽略该DataSet; 如果为False,则报错 + :param bool rename_vocab: 如果该field同时也存在于vocabs中,会将该field的名称对应修改 :return: self """ for name, dataset in self.datasets.items(): @@ -206,15 +207,20 @@ class DataBundle: dataset.rename_field(field_name=field_name, new_field_name=new_field_name) elif not ignore_miss_dataset: raise KeyError(f"{field_name} not found DataSet:{name}.") + if rename_vocab: + if field_name in self.vocabs: + self.vocabs[new_field_name] = self.vocabs.pop(field_name) + return self - def delete_field(self, field_name, ignore_miss_dataset=True): + def delete_field(self, field_name, ignore_miss_dataset=True, delete_vocab=True): """ 将DataBundle中所有DataSet中名为field_name的field删除掉. :param str field_name: :param bool ignore_miss_dataset: 当某个field名称在某个dataset不存在时,如果为True,则直接忽略该DataSet; 如果为False,则报错 + :param bool delete_vocab: 如果该field也在vocabs中存在,将该值也一并删除 :return: self """ for name, dataset in self.datasets.items(): @@ -222,8 +228,39 @@ class DataBundle: dataset.delete_field(field_name=field_name) elif not ignore_miss_dataset: raise KeyError(f"{field_name} not found DataSet:{name}.") + if delete_vocab: + if field_name in self.vocabs: + self.vocabs.pop(field_name) return self + def iter_datasets(self)->Union[str, DataSet]: + """ + 迭代data_bundle中的DataSet + + Example:: + + for name, dataset in data_bundle.iter_datasets(): + pass + + :return: + """ + for name, dataset in self.datasets.items(): + yield name, dataset + + def iter_vocabs(self)->Union[str, Vocabulary]: + """ + 迭代data_bundle中的DataSet + + Example: + + for field_name, vocab in data_bundle.iter_vocabs(): + pass + + :return: + """ + for field_name, vocab in self.vocabs.items(): + yield field_name, vocab + def apply_field(self, func, field_name:str, new_field_name:str, ignore_miss_dataset=True, **kwargs): """ 对DataBundle中所有的dataset使用apply_field方法 diff --git a/fastNLP/io/pipe/conll.py b/fastNLP/io/pipe/conll.py index eb7d4909..2edc9008 100644 --- a/fastNLP/io/pipe/conll.py +++ b/fastNLP/io/pipe/conll.py @@ -193,7 +193,7 @@ class OntoNotesNERPipe(_NERPipe): """ 处理OntoNotes的NER数据,处理之后DataSet中的field情况为 - .. csv-table:: Following is a demo layout of DataSet returned by Conll2003Loader + .. 
csv-table:: :header: "raw_words", "words", "target", "seq_len" "[Nadim, Ladki]", "[2, 3]", "[1, 2]", 2 diff --git a/fastNLP/models/biaffine_parser.py b/fastNLP/models/biaffine_parser.py index bead09fc..6b0829bd 100644 --- a/fastNLP/models/biaffine_parser.py +++ b/fastNLP/models/biaffine_parser.py @@ -207,7 +207,7 @@ class ArcBiaffine(nn.Module): output = dep.matmul(self.U) output = output.bmm(head.transpose(-1, -2)) if self.has_bias: - output += head.matmul(self.bias).unsqueeze(1) + output = output + head.matmul(self.bias).unsqueeze(1) return output @@ -234,7 +234,7 @@ class LabelBilinear(nn.Module): :return output: [batch, seq_len, num_cls] 每个元素对应类别的概率图 """ output = self.bilinear(x1, x2) - output += self.lin(torch.cat([x1, x2], dim=2)) + output = output + self.lin(torch.cat([x1, x2], dim=2)) return output @@ -363,7 +363,7 @@ class BiaffineParser(GraphParser): # print('forward {} {}'.format(batch_size, seq_len)) # get sequence mask - mask = seq_len_to_mask(seq_len).long() + mask = seq_len_to_mask(seq_len, max_len=length).long() word = self.word_embedding(words1) # [N,L] -> [N,L,C_0] pos = self.pos_embedding(words2) # [N,L] -> [N,L,C_1] @@ -435,10 +435,10 @@ class BiaffineParser(GraphParser): """ batch_size, length, _ = pred1.shape - mask = seq_len_to_mask(seq_len) + mask = seq_len_to_mask(seq_len, max_len=length) flip_mask = (mask == 0) _arc_pred = pred1.clone() - _arc_pred.masked_fill_(flip_mask.unsqueeze(1), -float('inf')) + _arc_pred = _arc_pred.masked_fill(flip_mask.unsqueeze(1), -float('inf')) arc_logits = F.log_softmax(_arc_pred, dim=2) label_logits = F.log_softmax(pred2, dim=2) batch_index = torch.arange(batch_size, device=arc_logits.device, dtype=torch.long).unsqueeze(1) @@ -446,9 +446,8 @@ class BiaffineParser(GraphParser): arc_loss = arc_logits[batch_index, child_index, target1] label_loss = label_logits[batch_index, child_index, target2] - byte_mask = flip_mask.byte() - arc_loss.masked_fill_(byte_mask, 0) - label_loss.masked_fill_(byte_mask, 0) + arc_loss = arc_loss.masked_fill(flip_mask, 0) + label_loss = label_loss.masked_fill(flip_mask, 0) arc_nll = -arc_loss.mean() label_nll = -label_loss.mean() return arc_nll + label_nll diff --git a/fastNLP/modules/decoder/crf.py b/fastNLP/modules/decoder/crf.py index f63d46e3..c13ea50c 100644 --- a/fastNLP/modules/decoder/crf.py +++ b/fastNLP/modules/decoder/crf.py @@ -10,33 +10,45 @@ from torch import nn from ..utils import initial_parameter from ...core.vocabulary import Vocabulary +from ...core.metrics import _get_encoding_type_from_tag_vocab, _check_tag_vocab_and_encoding_type +from typing import Union - -def allowed_transitions(id2target, encoding_type='bio', include_start_end=False): +def allowed_transitions(tag_vocab:Union[Vocabulary, dict], encoding_type=None, include_start_end=False): """ 别名::class:`fastNLP.modules.allowed_transitions` :class:`fastNLP.modules.decoder.allowed_transitions` 给定一个id到label的映射表,返回所有可以跳转的(from_tag_id, to_tag_id)列表。 - :param dict, ~fastNLP.Vocabulary id2target: key是label的indices,value是str类型的tag或tag-label。value可以是只有tag的, 比如"B", "M"; 也可以是 - "B-NN", "M-NN", tag和label之间一定要用"-"隔开。一般可以通过Vocabulary.idx2word得到id2label。 - :param str encoding_type: 支持"bio", "bmes", "bmeso", "bioes"。 + :param ~fastNLP.Vocabulary,dict tag_vocab: 支持类型为tag或tag-label。只有tag的,比如"B", "M"; 也可以是"B-NN", "M-NN", + tag和label之间一定要用"-"隔开。如果传入dict,格式需要形如{0:"O", 1:"B-tag1"},即index在前,tag在后。 + :param str encoding_type: 支持"bio", "bmes", "bmeso", "bioes"。默认为None,通过vocab自动推断 :param bool include_start_end: 是否包含开始与结尾的转换。比如在bio中,b/o可以在开头,但是i不能在开头; 
为True,返回的结果中会包含(start_idx, b_idx), (start_idx, o_idx), 但是不包含(start_idx, i_idx); start_idx=len(id2label), end_idx=len(id2label)+1。为False, 返回的结果中不含与开始结尾相关的内容 :return: List[Tuple(int, int)]], 内部的Tuple是可以进行跳转的(from_tag_id, to_tag_id)。 """ - if isinstance(id2target, Vocabulary): - id2target = id2target.idx2word - num_tags = len(id2target) + if encoding_type is None: + encoding_type = _get_encoding_type_from_tag_vocab(tag_vocab) + else: + encoding_type = encoding_type.lower() + _check_tag_vocab_and_encoding_type(tag_vocab, encoding_type) + + pad_token = '' + unk_token = '' + + if isinstance(tag_vocab, Vocabulary): + id_label_lst = list(tag_vocab.idx2word.items()) + pad_token = tag_vocab.padding + unk_token = tag_vocab.unknown + else: + id_label_lst = list(tag_vocab.items()) + + num_tags = len(tag_vocab) start_idx = num_tags end_idx = num_tags + 1 - encoding_type = encoding_type.lower() allowed_trans = [] - id_label_lst = list(id2target.items()) if include_start_end: id_label_lst += [(start_idx, 'start'), (end_idx, 'end')] - def split_tag_label(from_label): from_label = from_label.lower() if from_label in ['start', 'end']: @@ -48,11 +60,11 @@ def allowed_transitions(id2target, encoding_type='bio', include_start_end=False) return from_tag, from_label for from_id, from_label in id_label_lst: - if from_label in ['', '']: + if from_label in [pad_token, unk_token]: continue from_tag, from_label = split_tag_label(from_label) for to_id, to_label in id_label_lst: - if to_label in ['', '']: + if to_label in [pad_token, unk_token]: continue to_tag, to_label = split_tag_label(to_label) if _is_transition_allowed(encoding_type, from_tag, from_label, to_tag, to_label): diff --git a/test/core/test_metrics.py b/test/core/test_metrics.py index 5a7c55cf..8a472a62 100644 --- a/test/core/test_metrics.py +++ b/test/core/test_metrics.py @@ -11,6 +11,12 @@ from fastNLP.core.metrics import SpanFPreRecMetric, ExtractiveQAMetric def _generate_tags(encoding_type, number_labels=4): + """ + + :param encoding_type: 例如BIOES, BMES, BIO等 + :param number_labels: 多少个label,大于1 + :return: + """ vocab = {} for i in range(number_labels): label = str(i) @@ -184,7 +190,7 @@ class TestAccuracyMetric(unittest.TestCase): self.assertDictEqual(metric.get_metric(), {'acc': 1.}) -class SpanF1PreRecMetric(unittest.TestCase): +class SpanFPreRecMetricTest(unittest.TestCase): def test_case1(self): from fastNLP.core.metrics import _bmes_tag_to_spans from fastNLP.core.metrics import _bio_tag_to_spans @@ -338,6 +344,39 @@ class SpanF1PreRecMetric(unittest.TestCase): for key, value in expected_metric.items(): self.assertAlmostEqual(value, metric_value[key], places=5) + def test_auto_encoding_type_infer(self): + # 检查是否可以自动check encode的类型 + vocabs = {} + import random + for encoding_type in ['bio', 'bioes', 'bmeso']: + vocab = Vocabulary(unknown=None, padding=None) + for i in range(random.randint(10, 100)): + label = str(random.randint(1, 10)) + for tag in encoding_type: + if tag!='o': + vocab.add_word(f'{tag}-{label}') + else: + vocab.add_word('o') + vocabs[encoding_type] = vocab + for e in ['bio', 'bioes', 'bmeso']: + with self.subTest(e=e): + metric = SpanFPreRecMetric(tag_vocab=vocabs[e]) + assert metric.encoding_type == e + + bmes_vocab = _generate_tags('bmes') + vocab = Vocabulary() + for tag, index in bmes_vocab.items(): + vocab.add_word(tag) + metric = SpanFPreRecMetric(vocab) + assert metric.encoding_type == 'bmes' + + # 一些无法check的情况 + vocab = Vocabulary() + for i in range(10): + vocab.add_word(str(i)) + with self.assertRaises(Exception): + 
metric = SpanFPreRecMetric(vocab) + def test_encoding_type(self): # 检查传入的tag_vocab与encoding_type不符合时,是否会报错 vocabs = {} diff --git a/test/modules/decoder/test_CRF.py b/test/modules/decoder/test_CRF.py index 647af7d3..94b4ab7a 100644 --- a/test/modules/decoder/test_CRF.py +++ b/test/modules/decoder/test_CRF.py @@ -1,6 +1,6 @@ import unittest - +from fastNLP import Vocabulary class TestCRF(unittest.TestCase): def test_case1(self): @@ -14,7 +14,8 @@ class TestCRF(unittest.TestCase): id2label = {0: 'B', 1:'M', 2:'E', 3:'S'} expected_res = {(0, 1), (0, 2), (1, 1), (1, 2), (2, 0), (2, 3), (2, 5), (3, 0), (3, 3), (3, 5), (4, 0), (4, 3)} - self.assertSetEqual(expected_res, set(allowed_transitions(id2label, encoding_type='BMES', include_start_end=True))) + self.assertSetEqual(expected_res, set( + allowed_transitions(id2label, encoding_type='BMES', include_start_end=True))) id2label = {0: 'B', 1: 'I', 2:'O', 3: '', 4:""} allowed_transitions(id2label, include_start_end=True) @@ -37,7 +38,100 @@ class TestCRF(unittest.TestCase): expected_res = {(0, 1), (0, 2), (1, 1), (1, 2), (2, 0), (2, 3), (2, 4), (2, 7), (2, 9), (3, 0), (3, 3), (3, 4), (3, 7), (3, 9), (4, 5), (4, 6), (5, 5), (5, 6), (6, 0), (6, 3), (6, 4), (6, 7), (6, 9), (7, 0), (7, 3), (7, 4), (7, 7), (7, 9), (8, 0), (8, 3), (8, 4), (8, 7)} - self.assertSetEqual(expected_res, set(allowed_transitions(id2label, encoding_type='BMES', include_start_end=True))) + self.assertSetEqual(expected_res, set( + allowed_transitions(id2label, include_start_end=True))) + + def test_case11(self): + # 测试自动推断encoding类型 + from fastNLP.modules.decoder.crf import allowed_transitions + + id2label = {0: 'B', 1: 'I', 2: 'O'} + expected_res = {(0, 0), (0, 1), (0, 2), (0, 4), (1, 0), (1, 1), (1, 2), (1, 4), (2, 0), (2, 2), + (2, 4), (3, 0), (3, 2)} + self.assertSetEqual(expected_res, set(allowed_transitions(id2label, include_start_end=True))) + + id2label = {0: 'B', 1: 'M', 2: 'E', 3: 'S'} + expected_res = {(0, 1), (0, 2), (1, 1), (1, 2), (2, 0), (2, 3), (2, 5), (3, 0), (3, 3), (3, 5), (4, 0), (4, 3)} + self.assertSetEqual(expected_res, set( + allowed_transitions(id2label, include_start_end=True))) + + id2label = {0: 'B', 1: 'I', 2: 'O', 3: '', 4: ""} + allowed_transitions(id2label, include_start_end=True) + + labels = ['O'] + for label in ['X', 'Y']: + for tag in 'BI': + labels.append('{}-{}'.format(tag, label)) + id2label = {idx: label for idx, label in enumerate(labels)} + expected_res = {(0, 0), (0, 1), (0, 3), (0, 6), (1, 0), (1, 1), (1, 2), (1, 3), (1, 6), (2, 0), (2, 1), + (2, 2), (2, 3), (2, 6), (3, 0), (3, 1), (3, 3), (3, 4), (3, 6), (4, 0), (4, 1), (4, 3), + (4, 4), (4, 6), (5, 0), (5, 1), (5, 3)} + self.assertSetEqual(expected_res, set(allowed_transitions(id2label, include_start_end=True))) + + labels = [] + for label in ['X', 'Y']: + for tag in 'BMES': + labels.append('{}-{}'.format(tag, label)) + id2label = {idx: label for idx, label in enumerate(labels)} + expected_res = {(0, 1), (0, 2), (1, 1), (1, 2), (2, 0), (2, 3), (2, 4), (2, 7), (2, 9), (3, 0), (3, 3), (3, 4), + (3, 7), (3, 9), (4, 5), (4, 6), (5, 5), (5, 6), (6, 0), (6, 3), (6, 4), (6, 7), (6, 9), (7, 0), + (7, 3), (7, 4), (7, 7), (7, 9), (8, 0), (8, 3), (8, 4), (8, 7)} + self.assertSetEqual(expected_res, set( + allowed_transitions(id2label, include_start_end=True))) + + def test_case12(self): + # 测试能否通过vocab生成转移矩阵 + from fastNLP.modules.decoder.crf import allowed_transitions + + id2label = {0: 'B', 1: 'I', 2: 'O'} + vocab = Vocabulary(unknown=None, padding=None) + for idx, tag in id2label.items(): + 
vocab.add_word(tag) + expected_res = {(0, 0), (0, 1), (0, 2), (0, 4), (1, 0), (1, 1), (1, 2), (1, 4), (2, 0), (2, 2), + (2, 4), (3, 0), (3, 2)} + self.assertSetEqual(expected_res, set(allowed_transitions(vocab, include_start_end=True))) + + id2label = {0: 'B', 1: 'M', 2: 'E', 3: 'S'} + vocab = Vocabulary(unknown=None, padding=None) + for idx, tag in id2label.items(): + vocab.add_word(tag) + expected_res = {(0, 1), (0, 2), (1, 1), (1, 2), (2, 0), (2, 3), (2, 5), (3, 0), (3, 3), (3, 5), (4, 0), (4, 3)} + self.assertSetEqual(expected_res, set( + allowed_transitions(vocab, include_start_end=True))) + + id2label = {0: 'B', 1: 'I', 2: 'O', 3: '', 4: ""} + vocab = Vocabulary() + for idx, tag in id2label.items(): + vocab.add_word(tag) + allowed_transitions(vocab, include_start_end=True) + + labels = ['O'] + for label in ['X', 'Y']: + for tag in 'BI': + labels.append('{}-{}'.format(tag, label)) + id2label = {idx: label for idx, label in enumerate(labels)} + expected_res = {(0, 0), (0, 1), (0, 3), (0, 6), (1, 0), (1, 1), (1, 2), (1, 3), (1, 6), (2, 0), (2, 1), + (2, 2), (2, 3), (2, 6), (3, 0), (3, 1), (3, 3), (3, 4), (3, 6), (4, 0), (4, 1), (4, 3), + (4, 4), (4, 6), (5, 0), (5, 1), (5, 3)} + vocab = Vocabulary(unknown=None, padding=None) + for idx, tag in id2label.items(): + vocab.add_word(tag) + self.assertSetEqual(expected_res, set(allowed_transitions(vocab, include_start_end=True))) + + labels = [] + for label in ['X', 'Y']: + for tag in 'BMES': + labels.append('{}-{}'.format(tag, label)) + id2label = {idx: label for idx, label in enumerate(labels)} + vocab = Vocabulary(unknown=None, padding=None) + for idx, tag in id2label.items(): + vocab.add_word(tag) + expected_res = {(0, 1), (0, 2), (1, 1), (1, 2), (2, 0), (2, 3), (2, 4), (2, 7), (2, 9), (3, 0), (3, 3), (3, 4), + (3, 7), (3, 9), (4, 5), (4, 6), (5, 5), (5, 6), (6, 0), (6, 3), (6, 4), (6, 7), (6, 9), (7, 0), + (7, 3), (7, 4), (7, 7), (7, 9), (8, 0), (8, 3), (8, 4), (8, 7)} + self.assertSetEqual(expected_res, set( + allowed_transitions(vocab, include_start_end=True))) + def test_case2(self): # 测试CRF能否避免解码出非法跃迁, 使用allennlp做了验证。 From 53f744a87d9d48cc3beaa43a17010a9628261f72 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Mon, 2 Sep 2019 19:43:28 +0800 Subject: [PATCH 136/153] fix some bugs in docs --- docs/source/tutorials/tutorial_9_callback.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/tutorials/tutorial_9_callback.rst b/docs/source/tutorials/tutorial_9_callback.rst index 8e2742bb..dc50aca5 100644 --- a/docs/source/tutorials/tutorial_9_callback.rst +++ b/docs/source/tutorials/tutorial_9_callback.rst @@ -23,7 +23,7 @@ Callback的构建和使用 class LRDecay(fastNLP.Callback): def __init__(self): - super(MyCallback, self).__init__() + super(LRDecay, self).__init__() self.base_lrs = [] self.delta = [] From b3718b10dcda636883f0267b76b264c904e807ff Mon Sep 17 00:00:00 2001 From: Yige Xu Date: Tue, 3 Sep 2019 23:19:18 +0800 Subject: [PATCH 137/153] 1. rename init_embed to embed in models/*; 2. update documents in models/bert.py; 3. update tutorial six. 
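With the rename, models receive their embedding through an `embed` argument, which may be an embedding object or a `(vocab_size, embedding_dim)` tuple handled by get_embeddings; a minimal sketch using the CNN text classifier, with placeholder sizes (3000, 50, 5 classes) that are not taken from the repository:

    from fastNLP.models import CNNText

    # `embed` replaces the former `init_embed` keyword; a (vocab_size, dim) tuple
    # asks get_embeddings to build a randomly initialised embedding internally.
    model = CNNText(embed=(3000, 50), num_classes=5)
    print(model)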
--- docs/source/fastNLP.models.bert.rst | 6 + docs/source/fastNLP.models.rst | 3 +- .../fastNLP.models.sequence_labeling.rst | 2 +- .../tutorials/tutorial_6_seq_labeling.rst | 92 +++---- fastNLP/models/__init__.py | 10 +- fastNLP/models/bert.py | 241 +++++++++++------- fastNLP/models/biaffine_parser.py | 6 +- fastNLP/models/cnn_text_classification.py | 6 +- fastNLP/models/snli.py | 10 +- fastNLP/models/star_transformer.py | 24 +- test/models/test_bert.py | 84 +++++- test/models/test_biaffine_parser.py | 4 +- 12 files changed, 312 insertions(+), 176 deletions(-) create mode 100644 docs/source/fastNLP.models.bert.rst diff --git a/docs/source/fastNLP.models.bert.rst b/docs/source/fastNLP.models.bert.rst new file mode 100644 index 00000000..b0c813f9 --- /dev/null +++ b/docs/source/fastNLP.models.bert.rst @@ -0,0 +1,6 @@ +fastNLP.models.bert +=================== + +.. automodule:: fastNLP.models.bert + :members: BertForSequenceClassification, BertForSentenceMatching, BertForMultipleChoice, BertForTokenClassification, BertForQuestionAnswering + diff --git a/docs/source/fastNLP.models.rst b/docs/source/fastNLP.models.rst index fb782de1..21cf41a7 100644 --- a/docs/source/fastNLP.models.rst +++ b/docs/source/fastNLP.models.rst @@ -2,7 +2,7 @@ fastNLP.models ============== .. automodule:: fastNLP.models - :members: CNNText, SeqLabeling, AdvSeqLabel, ESIM, StarTransEnc, STSeqLabel, STNLICls, STSeqCls, BiaffineParser, GraphParser + :members: CNNText, SeqLabeling, AdvSeqLabel, ESIM, StarTransEnc, STSeqLabel, STNLICls, STSeqCls, BiaffineParser, GraphParser, BertForSequenceClassification, BertForSentenceMatching, BertForMultipleChoice, BertForTokenClassification, BertForQuestionAnswering 子模块 ------ @@ -10,6 +10,7 @@ fastNLP.models .. toctree:: :maxdepth: 1 + fastNLP.models.bert fastNLP.models.biaffine_parser fastNLP.models.cnn_text_classification fastNLP.models.sequence_labeling diff --git a/docs/source/fastNLP.models.sequence_labeling.rst b/docs/source/fastNLP.models.sequence_labeling.rst index f6551f8b..dcd1300e 100644 --- a/docs/source/fastNLP.models.sequence_labeling.rst +++ b/docs/source/fastNLP.models.sequence_labeling.rst @@ -2,5 +2,5 @@ fastNLP.models.sequence_labeling ================================ .. automodule:: fastNLP.models.sequence_labeling - :members: SeqLabeling, AdvSeqLabel + :members: SeqLabeling, AdvSeqLabel, BiLSTMCRF diff --git a/docs/source/tutorials/tutorial_6_seq_labeling.rst b/docs/source/tutorials/tutorial_6_seq_labeling.rst index 09a53cdc..7fcf97b3 100644 --- a/docs/source/tutorials/tutorial_6_seq_labeling.rst +++ b/docs/source/tutorials/tutorial_6_seq_labeling.rst @@ -3,64 +3,52 @@ ===================== 这一部分的内容主要展示如何使用fastNLP 实现序列标注任务。你可以使用fastNLP的各个组件快捷,方便地完成序列标注任务,达到出色的效果。 -在阅读这篇Tutorial前,希望你已经熟悉了fastNLP的基础使用,包括基本数据结构以及数据预处理,embedding的嵌入等,希望你对之前的教程有更进一步的掌握。 -我们将对CoNLL-03的英文数据集进行处理,展示如何完成命名实体标注任务整个训练的过程。 +在阅读这篇Tutorial前,希望你已经熟悉了fastNLP的基础使用,尤其是数据的载入以及模型的构建,通过这个小任务的能让你进一步熟悉fastNLP的使用。 +我们将对基于Weibo的中文社交数据集进行处理,展示如何完成命名实体标注任务的整个过程。 载入数据 =================================== -fastNLP可以方便地载入各种类型的数据。同时,针对常见的数据集,我们已经预先实现了载入方法,其中包含CoNLL-03数据集。 +fastNLP的数据载入主要是由Loader与Pipe两个基类衔接完成的。通过Loader可以方便地载入各种类型的数据。同时,针对常见的数据集,我们已经预先实现了载入方法,其中包含weibo数据集。 在设计dataloader时,以DataSetLoader为基类,可以改写并应用于其他数据集的载入。 .. 
code-block:: python - class Conll2003DataLoader(DataSetLoader): - def __init__(self, task:str='ner', encoding_type:str='bioes'): - assert task in ('ner', 'pos', 'chunk') - index = {'ner':3, 'pos':1, 'chunk':2}[task] - #ConllLoader是fastNLP内置的类 - self._loader = ConllLoader(headers=['raw_words', 'target'], indexes=[0, index]) - self._tag_converters = None - if task in ('ner', 'chunk'): - #iob和iob2bioes会对tag进行统一,标准化 - self._tag_converters = [iob2] - if encoding_type == 'bioes': - self._tag_converters.append(iob2bioes) - - def load(self, path: str): - dataset = self._loader.load(path) - def convert_tag_schema(tags): - for converter in self._tag_converters: - tags = converter(tags) - return tags - if self._tag_converters: - #使用apply实现convert_tag_schema函数,实际上也支持匿名函数 - dataset.apply_field(convert_tag_schema, field_name=Const.TARGET, new_field_name=Const.TARGET) - return dataset - -输出数据格式如: - - {'raw_words': ['on', 'Friday', ':'] type=list, - 'target': ['O', 'O', 'O'] type=list}, + from fastNLP.io import WeiboNERLoader + data_bundle = WeiboNERLoader().load() + + + +载入后的数据如 :: + + {'dev': DataSet( + {{'raw_chars': ['用', '最', '大', '努', '力', '去', '做''人', '生', '。', '哈', '哈', '哈', '哈', '哈', '哈', ' + 'target': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',, 'O', 'O', 'O', 'O', 'O', 'O'] type=list})} + + {'test': DataSet( + {{'raw_chars': ['感', '恩', '大', '回', '馈'] type=list, 'target': ['O', 'O', 'O', 'O', 'O'] type=list})} + + {'train': DataSet( + {'raw_chars': ['国', '安', '老', '球', '迷'] type=list, 'target': ['B-ORG.NAM', 'I-ORG.NAM', 'B-PER.NOM', 'I-PER.NOM', 'I-PER.NOM'] type=list})} + 数据处理 ---------------------------- -我们进一步处理数据。将数据和词表封装在 :class:`~fastNLP.DataBundle` 类中。data是DataBundle的实例。 -我们输入模型的数据包括char embedding,以及word embedding。在数据处理部分,我们尝试完成词表的构建。 -使用fastNLP中的Vocabulary类来构建词表。 +我们进一步处理数据。通过Pipe基类处理Loader载入的数据。 如果你还有印象,应该还能想起,实现自定义数据集的Pipe时,至少要编写process 函数或者process_from_file 函数。前者接受 :class:`~fastNLP.DataBundle` 类的数据,并返回该 :class:`~fastNLP.DataBundle` 。后者接收数据集所在文件夹为参数,读取并处理为 :class:`~fastNLP.DataBundle` 后,通过process 函数处理数据。 +这里我们已经实现通过Loader载入数据,并已返回 :class:`~fastNLP.DataBundle` 类的数据。我们编写process 函数以处理Loader载入后的数据。 .. 
code-block:: python - word_vocab = Vocabulary(min_freq=2) - word_vocab.from_dataset(data.datasets['train'], field_name=Const.INPUT) - word_vocab.index_dataset(*data.datasets.values(),field_name=Const.INPUT, new_field_name=Const.INPUT) + from fastNLP.io import ChineseNERPipe + data_bundle = ChineseNERPipe(encoding_type='bioes', bigram=True).process(data_bundle) -处理后的data对象内部为: +载入后的数据如下 :: - dataset - vocabs - dataset保存了train和test中的数据,并保存为dataset类型 - vocab保存了words,raw-words以及target的词表。 + {'raw_chars': ['用', '最', '大', '努', '力', '去', '做', '值', '得', '的', '事', '人', '生', '。', '哈', '哈', '哈', '哈', '哈', '哈', '我', '在'] type=list, + 'target': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] type=list, + 'chars': [97, 71, 34, 422, 104, 72, 144, 628, 66, 3, 158, 2, 9, 647, 485, 196, 2,19] type=list, + 'bigrams': [5948, 1950, 34840, 98, 8413, 3961, 34841, 631, 34842, 407, 462, 45, 3 1959, 1619, 3, 3, 3, 3, 3, 2663, 29, 90] type=list, + 'seq_len': 30 type=int} 模型构建 -------------------------------- @@ -69,27 +57,23 @@ fastNLP可以方便地载入各种类型的数据。同时,针对常见的数 模型的训练 首先实例化模型,导入所需的char embedding以及word embedding。Embedding的载入可以参考教程。 -也可以查看 :mod:`~fastNLP.modules.encoder.embedding` 使用所需的embedding 载入方法。 -fastNLP将模型的训练过程封装在了 :class:`~fastnlp.trainer` 类中。 +也可以查看 :mod:`~fastNLP.embedding` 使用所需的embedding 载入方法。 +fastNLP将模型的训练过程封装在了 :class:`~fastnlp.Trainer` 类中。 根据不同的任务调整trainer中的参数即可。通常,一个trainer实例需要有:指定的训练数据集,模型,优化器,loss函数,评测指标,以及指定训练的epoch数,batch size等参数。 .. code-block:: python #实例化模型 - model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type) - #定义优化器 - optimizer = Adam(model.parameters(), lr=0.005) + model = CNBiLSTMCRFNER(char_embed, num_classes=len(data_bundle.vocabs['target']), bigram_embed=bigram_embed) #定义评估指标 - Metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type) - #实例化trainer - trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, dev_data=data.datasets['test'], batch_size=10, metrics=Metrics,callbacks=callbacks, n_epochs=100) - #开始训练 - trainer.train() + Metrics=SpanFPreRecMetric(data_bundle.vocabs['target'], encoding_type='bioes') + #实例化trainer并训练 + Trainer(data_bundle.datasets['train'], model, batch_size=20, metrics=Metrics, num_workers=2, dev_data=data_bundle. datasets['dev']).train() + 训练中会保存最优的参数配置。 -训练的结果如下: -.. 
code-block:: python +训练的结果如下 :: Evaluation on DataSet test: SpanFPreRecMetric: f=0.727661, pre=0.732293, rec=0.723088 diff --git a/fastNLP/models/__init__.py b/fastNLP/models/__init__.py index 14314049..a659e1d5 100644 --- a/fastNLP/models/__init__.py +++ b/fastNLP/models/__init__.py @@ -21,12 +21,18 @@ __all__ = [ "STSeqCls", "BiaffineParser", - "GraphParser" + "GraphParser", + + "BertForSequenceClassification", + "BertForSentenceMatching", + "BertForMultipleChoice", + "BertForTokenClassification", + "BertForQuestionAnswering" ] from .base_model import BaseModel from .bert import BertForMultipleChoice, BertForQuestionAnswering, BertForSequenceClassification, \ - BertForTokenClassification + BertForTokenClassification, BertForSentenceMatching from .biaffine_parser import BiaffineParser, GraphParser from .cnn_text_classification import CNNText from .sequence_labeling import SeqLabeling, AdvSeqLabel diff --git a/fastNLP/models/bert.py b/fastNLP/models/bert.py index 08f16db2..4a04bd6d 100644 --- a/fastNLP/models/bert.py +++ b/fastNLP/models/bert.py @@ -1,9 +1,35 @@ -"""undocumented -bert.py is modified from huggingface/pytorch-pretrained-BERT, which is licensed under the Apache License 2.0. +""" +fastNLP提供了BERT应用到五个下游任务的模型代码,可以直接调用。这五个任务分别为 + + - 文本分类任务: :class:`~fastNLP.models.BertForSequenceClassification` + - Matching任务: :class:`~fastNLP.models.BertForSentenceMatching` + - 多选任务: :class:`~fastNLP.models.BertForMultipleChoice` + - 序列标注任务: :class:`~fastNLP.models.BertForTokenClassification` + - 抽取式QA任务: :class:`~fastNLP.models.BertForQuestionAnswering` + +每一个模型必须要传入一个名字为 `embed` 的 :class:`fastNLP.embeddings.BertEmbedding` ,这个参数包含了 +:class:`fastNLP.modules.encoder.BertModel` ,是下游模型的编码器(encoder)。 + +除此以外,还需要传入一个数字,这个数字在不同下游任务模型上的意义如下:: + + 下游任务模型 参数名称 含义 + BertForSequenceClassification num_labels 文本分类类别数目,默认值为2 + BertForSentenceMatching num_labels Matching任务类别数目,默认值为2 + BertForMultipleChoice num_choices 多选任务选项数目,默认值为2 + BertForTokenClassification num_labels 序列标注标签数目,无默认值 + BertForQuestionAnswering num_labels 抽取式QA列数,默认值为2(即第一列为start_span, 第二列为end_span) + +最后还可以传入dropout的大小,默认值为0.1。 """ -__all__ = [] +__all__ = [ + "BertForSequenceClassification", + "BertForSentenceMatching", + "BertForMultipleChoice", + "BertForTokenClassification", + "BertForQuestionAnswering" +] import warnings @@ -13,28 +39,40 @@ from torch import nn from .base_model import BaseModel from ..core.const import Const from ..core._logger import logger -from ..modules.encoder import BertModel -from ..modules.encoder.bert import BertConfig, CONFIG_FILE -from ..embeddings.bert_embedding import BertEmbedding +from ..embeddings import BertEmbedding class BertForSequenceClassification(BaseModel): - """BERT model for classification. """ - def __init__(self, init_embed: BertEmbedding, num_labels: int=2): + 别名: :class:`fastNLP.models.BertForSequenceClassification` + :class:`fastNLP.models.bert.BertForSequenceClassification` + + BERT model for classification. + + :param fastNLP.embeddings.BertEmbedding embed: 下游模型的编码器(encoder). + :param int num_labels: 文本分类类别数目,默认值为2. + :param float dropout: dropout的大小,默认值为0.1. 
+ """ + def __init__(self, embed: BertEmbedding, num_labels: int=2, dropout=0.1): super(BertForSequenceClassification, self).__init__() self.num_labels = num_labels - self.bert = init_embed - self.dropout = nn.Dropout(0.1) + self.bert = embed + self.dropout = nn.Dropout(p=dropout) self.classifier = nn.Linear(self.bert.embedding_dim, num_labels) if not self.bert.model.include_cls_sep: - warn_msg = "Bert for sequence classification excepts BertEmbedding `include_cls_sep` True, but got False." + self.bert.model.include_cls_sep = True + warn_msg = "Bert for sequence classification excepts BertEmbedding `include_cls_sep` True, " \ + "but got False. FastNLP has changed it to True." logger.warn(warn_msg) warnings.warn(warn_msg) def forward(self, words): + """ + :param torch.LongTensor words: [batch_size, seq_len] + :return: { :attr:`fastNLP.Const.OUTPUT` : logits}: torch.Tensor [batch_size, num_labels] + """ hidden = self.dropout(self.bert(words)) cls_hidden = hidden[:, 0] logits = self.classifier(cls_hidden) @@ -42,172 +80,193 @@ class BertForSequenceClassification(BaseModel): return {Const.OUTPUT: logits} def predict(self, words): + """ + :param torch.LongTensor words: [batch_size, seq_len] + :return: { :attr:`fastNLP.Const.OUTPUT` : logits}: torch.LongTensor [batch_size] + """ logits = self.forward(words)[Const.OUTPUT] return {Const.OUTPUT: torch.argmax(logits, dim=-1)} class BertForSentenceMatching(BaseModel): + """ + 别名: :class:`fastNLP.models.BertForSentenceMatching` + :class:`fastNLP.models.bert.BertForSentenceMatching` + + BERT model for sentence matching. - """BERT model for matching. + :param fastNLP.embeddings.BertEmbedding embed: 下游模型的编码器(encoder). + :param int num_labels: Matching任务类别数目,默认值为2. + :param float dropout: dropout的大小,默认值为0.1. """ - def __init__(self, init_embed: BertEmbedding, num_labels: int=2): + def __init__(self, embed: BertEmbedding, num_labels: int=2, dropout=0.1): super(BertForSentenceMatching, self).__init__() self.num_labels = num_labels - self.bert = init_embed - self.dropout = nn.Dropout(0.1) + self.bert = embed + self.dropout = nn.Dropout(p=dropout) self.classifier = nn.Linear(self.bert.embedding_dim, num_labels) if not self.bert.model.include_cls_sep: - error_msg = "Bert for sentence matching excepts BertEmbedding `include_cls_sep` True, but got False." - logger.error(error_msg) - raise RuntimeError(error_msg) + self.bert.model.include_cls_sep = True + warn_msg = "Bert for sentence matching excepts BertEmbedding `include_cls_sep` True, " \ + "but got False. FastNLP has changed it to True." + logger.warn(warn_msg) + warnings.warn(warn_msg) def forward(self, words): - hidden = self.dropout(self.bert(words)) - cls_hidden = hidden[:, 0] + """ + :param torch.LongTensor words: [batch_size, seq_len] + :return: { :attr:`fastNLP.Const.OUTPUT` : logits}: torch.Tensor [batch_size, num_labels] + """ + hidden = self.bert(words) + cls_hidden = self.dropout(hidden[:, 0]) logits = self.classifier(cls_hidden) return {Const.OUTPUT: logits} def predict(self, words): + """ + :param torch.LongTensor words: [batch_size, seq_len] + :return: { :attr:`fastNLP.Const.OUTPUT` : logits}: torch.LongTensor [batch_size] + """ logits = self.forward(words)[Const.OUTPUT] return {Const.OUTPUT: torch.argmax(logits, dim=-1)} class BertForMultipleChoice(BaseModel): - """BERT model for multiple choice tasks. 
""" - def __init__(self, init_embed: BertEmbedding, num_choices=2): + 别名: :class:`fastNLP.models.BertForMultipleChoice` + :class:`fastNLP.models.bert.BertForMultipleChoice` + + BERT model for multiple choice. + + :param fastNLP.embeddings.BertEmbedding embed: 下游模型的编码器(encoder). + :param int num_choices: 多选任务选项数目,默认值为2. + :param float dropout: dropout的大小,默认值为0.1. + """ + def __init__(self, embed: BertEmbedding, num_choices=2, dropout=0.1): super(BertForMultipleChoice, self).__init__() self.num_choices = num_choices - self.bert = init_embed - self.dropout = nn.Dropout(0.1) + self.bert = embed + self.dropout = nn.Dropout(p=dropout) self.classifier = nn.Linear(self.bert.embedding_dim, 1) - self.include_cls_sep = init_embed.model.include_cls_sep if not self.bert.model.include_cls_sep: - error_msg = "Bert for multiple choice excepts BertEmbedding `include_cls_sep` True, but got False." - logger.error(error_msg) - raise RuntimeError(error_msg) + self.bert.model.include_cls_sep = True + warn_msg = "Bert for multiple choice excepts BertEmbedding `include_cls_sep` True, " \ + "but got False. FastNLP has changed it to True." + logger.warn(warn_msg) + warnings.warn(warn_msg) def forward(self, words): """ - :param torch.Tensor words: [batch_size, num_choices, seq_len] - :return: [batch_size, num_labels] + :param torch.LongTensor words: [batch_size, num_choices, seq_len] + :return: { :attr:`fastNLP.Const.OUTPUT` : logits}: torch.LongTensor [batch_size, num_choices] """ batch_size, num_choices, seq_len = words.size() input_ids = words.view(batch_size * num_choices, seq_len) hidden = self.bert(input_ids) - pooled_output = hidden[:, 0] - pooled_output = self.dropout(pooled_output) + pooled_output = self.dropout(hidden[:, 0]) logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, self.num_choices) return {Const.OUTPUT: reshaped_logits} def predict(self, words): + """ + :param torch.LongTensor words: [batch_size, num_choices, seq_len] + :return: { :attr:`fastNLP.Const.OUTPUT` : logits}: torch.LongTensor [batch_size] + """ logits = self.forward(words)[Const.OUTPUT] return {Const.OUTPUT: torch.argmax(logits, dim=-1)} class BertForTokenClassification(BaseModel): - """BERT model for token-level classification. """ - def __init__(self, init_embed: BertEmbedding, num_labels): + 别名: :class:`fastNLP.models.BertForTokenClassification` + :class:`fastNLP.models.bert.BertForTokenClassification` + + BERT model for token classification. + + :param fastNLP.embeddings.BertEmbedding embed: 下游模型的编码器(encoder). + :param int num_labels: 序列标注标签数目,无默认值. + :param float dropout: dropout的大小,默认值为0.1. + """ + def __init__(self, embed: BertEmbedding, num_labels, dropout=0.1): super(BertForTokenClassification, self).__init__() self.num_labels = num_labels - self.bert = init_embed - self.dropout = nn.Dropout(0.1) + self.bert = embed + self.dropout = nn.Dropout(p=dropout) self.classifier = nn.Linear(self.bert.embedding_dim, num_labels) - self.include_cls_sep = init_embed.model.include_cls_sep - if self.include_cls_sep: - warn_msg = "Bert for token classification excepts BertEmbedding `include_cls_sep` False, but got True." - warnings.warn(warn_msg) + if self.bert.model.include_cls_sep: + self.bert.model.include_cls_sep = False + warn_msg = "Bert for token classification excepts BertEmbedding `include_cls_sep` False, " \ + "but got True. FastNLP has changed it to False." 
logger.warn(warn_msg) + warnings.warn(warn_msg) def forward(self, words): """ - :param torch.Tensor words: [batch_size, seq_len] - :return: [batch_size, seq_len, num_labels] + :param torch.LongTensor words: [batch_size, seq_len] + :return: { :attr:`fastNLP.Const.OUTPUT` : logits}: torch.Tensor [batch_size, seq_len, num_labels] """ - sequence_output = self.bert(words) - if self.include_cls_sep: - sequence_output = sequence_output[:, 1: -1] # [batch_size, seq_len, embed_dim] + sequence_output = self.bert(words) # [batch_size, seq_len, embed_dim] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) return {Const.OUTPUT: logits} def predict(self, words): + """ + :param torch.LongTensor words: [batch_size, seq_len] + :return: { :attr:`fastNLP.Const.OUTPUT` : logits}: torch.LongTensor [batch_size, seq_len] + """ logits = self.forward(words)[Const.OUTPUT] return {Const.OUTPUT: torch.argmax(logits, dim=-1)} class BertForQuestionAnswering(BaseModel): - """BERT model for Question Answering (span extraction). - This module is composed of the BERT model with a linear layer on top of - the sequence output that computes start_logits and end_logits - Params: - `config`: a BertConfig class instance with the configuration to build a new model. - `bert_dir`: a dir which contains the bert parameters within file `pytorch_model.bin` - Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - `start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size]. - Positions are clamped to the length of the sequence and position outside of the sequence are not taken - into account for computing the loss. - `end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size]. - Positions are clamped to the length of the sequence and position outside of the sequence are not taken - into account for computing the loss. - Outputs: - if `start_positions` and `end_positions` are not `None`: - Outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions. - if `start_positions` or `end_positions` is `None`: - Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end - position tokens of shape [batch_size, sequence_length]. 
- Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - bert_dir = 'your-bert-file-dir' - model = BertForQuestionAnswering(config, bert_dir) - start_logits, end_logits = model(input_ids, token_type_ids, input_mask) - ``` """ - def __init__(self, init_embed: BertEmbedding, num_labels=2): + 别名: :class:`fastNLP.models.BertForQuestionAnswering` + :class:`fastNLP.models.bert.BertForQuestionAnswering` + + BERT model for classification. + + :param fastNLP.embeddings.BertEmbedding embed: 下游模型的编码器(encoder). + :param int num_labels: 抽取式QA列数,默认值为2(即第一列为start_span, 第二列为end_span). + """ + def __init__(self, embed: BertEmbedding, num_labels=2): super(BertForQuestionAnswering, self).__init__() - self.bert = init_embed + self.bert = embed self.num_labels = num_labels self.qa_outputs = nn.Linear(self.bert.embedding_dim, self.num_labels) if not self.bert.model.include_cls_sep: - error_msg = "Bert for multiple choice excepts BertEmbedding `include_cls_sep` True, but got False." - logger.error(error_msg) - raise RuntimeError(error_msg) + self.bert.model.include_cls_sep = True + warn_msg = "Bert for question answering excepts BertEmbedding `include_cls_sep` True, " \ + "but got False. FastNLP has changed it to True." + logger.warn(warn_msg) + warnings.warn(warn_msg) def forward(self, words): + """ + :param torch.LongTensor words: [batch_size, seq_len] + :return: 一个包含num_labels个logit的dict,每一个logit的形状都是[batch_size, seq_len] + """ sequence_output = self.bert(words) logits = self.qa_outputs(sequence_output) # [batch_size, seq_len, num_labels] return {Const.OUTPUTS(i): logits[:, :, i] for i in range(self.num_labels)} def predict(self, words): + """ + :param torch.LongTensor words: [batch_size, seq_len] + :return: 一个包含num_labels个logit的dict,每一个logit的形状都是[batch_size] + """ logits = self.forward(words) return {Const.OUTPUTS(i): torch.argmax(logits[Const.OUTPUTS(i)], dim=-1) for i in range(self.num_labels)} diff --git a/fastNLP/models/biaffine_parser.py b/fastNLP/models/biaffine_parser.py index 6b0829bd..455d27a7 100644 --- a/fastNLP/models/biaffine_parser.py +++ b/fastNLP/models/biaffine_parser.py @@ -245,7 +245,7 @@ class BiaffineParser(GraphParser): Biaffine Dependency Parser 实现. 论文参考 `Deep Biaffine Attention for Neural Dependency Parsing (Dozat and Manning, 2016) `_ . - :param init_embed: 单词词典, 可以是 tuple, 包括(num_embedings, embedding_dim), 即 + :param embed: 单词词典, 可以是 tuple, 包括(num_embedings, embedding_dim), 即 embedding的大小和每个词的维度. 
也可以传入 nn.Embedding 对象, 此时就以传入的对象作为embedding :param pos_vocab_size: part-of-speech 词典大小 @@ -262,7 +262,7 @@ class BiaffineParser(GraphParser): """ def __init__(self, - init_embed, + embed, pos_vocab_size, pos_emb_dim, num_label, @@ -276,7 +276,7 @@ class BiaffineParser(GraphParser): super(BiaffineParser, self).__init__() rnn_out_size = 2 * rnn_hidden_size word_hid_dim = pos_hid_dim = rnn_hidden_size - self.word_embedding = get_embeddings(init_embed) + self.word_embedding = get_embeddings(embed) word_emb_dim = self.word_embedding.embedding_dim self.pos_embedding = nn.Embedding(num_embeddings=pos_vocab_size, embedding_dim=pos_emb_dim) self.word_fc = nn.Linear(word_emb_dim, word_hid_dim) diff --git a/fastNLP/models/cnn_text_classification.py b/fastNLP/models/cnn_text_classification.py index 37a60c35..4bf9c4d1 100644 --- a/fastNLP/models/cnn_text_classification.py +++ b/fastNLP/models/cnn_text_classification.py @@ -23,7 +23,7 @@ class CNNText(torch.nn.Module): 使用CNN进行文本分类的模型 'Yoon Kim. 2014. Convolution Neural Networks for Sentence Classification.' - :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray init_embed: Embedding的大小(传入tuple(int, int), + :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray embed: Embedding的大小(传入tuple(int, int), 第一个int为vocab_zie, 第二个int为embed_dim); 如果为Tensor, Embedding, ndarray等则直接使用该值初始化Embedding :param int num_classes: 一共有多少类 :param int,tuple(int) out_channels: 输出channel的数量。如果为list,则需要与kernel_sizes的数量保持一致 @@ -31,7 +31,7 @@ class CNNText(torch.nn.Module): :param float dropout: Dropout的大小 """ - def __init__(self, init_embed, + def __init__(self, embed, num_classes, kernel_nums=(30, 40, 50), kernel_sizes=(1, 3, 5), @@ -39,7 +39,7 @@ class CNNText(torch.nn.Module): super(CNNText, self).__init__() # no support for pre-trained embedding currently - self.embed = embedding.Embedding(init_embed) + self.embed = embedding.Embedding(embed) self.conv_pool = encoder.ConvMaxpool( in_channels=self.embed.embedding_dim, out_channels=kernel_nums, diff --git a/fastNLP/models/snli.py b/fastNLP/models/snli.py index 5ca4052d..97a14e9f 100644 --- a/fastNLP/models/snli.py +++ b/fastNLP/models/snli.py @@ -24,21 +24,21 @@ class ESIM(BaseModel): ESIM model的一个PyTorch实现 论文参见: https://arxiv.org/pdf/1609.06038.pdf - :param init_embedding: 初始化的Embedding + :param embed: 初始化的Embedding :param int hidden_size: 隐藏层大小,默认值为Embedding的维度 :param int num_labels: 目标标签种类数量,默认值为3 :param float dropout_rate: dropout的比率,默认值为0.3 :param float dropout_embed: 对Embedding的dropout比率,默认值为0.1 """ - def __init__(self, init_embedding, hidden_size=None, num_labels=3, dropout_rate=0.3, + def __init__(self, embed, hidden_size=None, num_labels=3, dropout_rate=0.3, dropout_embed=0.1): super(ESIM, self).__init__() - if isinstance(init_embedding, TokenEmbedding) or isinstance(init_embedding, Embedding): - self.embedding = init_embedding + if isinstance(embed, TokenEmbedding) or isinstance(embed, Embedding): + self.embedding = embed else: - self.embedding = Embedding(init_embedding) + self.embedding = Embedding(embed) self.dropout_embed = EmbedDropout(p=dropout_embed) if hidden_size is None: hidden_size = self.embedding.embed_size diff --git a/fastNLP/models/star_transformer.py b/fastNLP/models/star_transformer.py index b95d1c25..7fe0d343 100644 --- a/fastNLP/models/star_transformer.py +++ b/fastNLP/models/star_transformer.py @@ -23,7 +23,7 @@ class StarTransEnc(nn.Module): 带word embedding的Star-Transformer Encoder - :param init_embed: 单词词典, 可以是 tuple, 包括(num_embedings, embedding_dim), 即 + :param embed: 
单词词典, 可以是 tuple, 包括(num_embedings, embedding_dim), 即 embedding的大小和每个词的维度. 也可以传入 nn.Embedding 对象, 此时就以传入的对象作为embedding :param hidden_size: 模型中特征维度. @@ -35,7 +35,7 @@ class StarTransEnc(nn.Module): :param dropout: 模型除词嵌入外的dropout概率. """ - def __init__(self, init_embed, + def __init__(self, embed, hidden_size, num_layers, num_head, @@ -44,7 +44,7 @@ class StarTransEnc(nn.Module): emb_dropout, dropout): super(StarTransEnc, self).__init__() - self.embedding = get_embeddings(init_embed) + self.embedding = get_embeddings(embed) emb_dim = self.embedding.embedding_dim self.emb_fc = nn.Linear(emb_dim, hidden_size) # self.emb_drop = nn.Dropout(emb_dropout) @@ -108,7 +108,7 @@ class STSeqLabel(nn.Module): 用于序列标注的Star-Transformer模型 - :param init_embed: 单词词典, 可以是 tuple, 包括(num_embedings, embedding_dim), 即 + :param embed: 单词词典, 可以是 tuple, 包括(num_embedings, embedding_dim), 即 embedding的大小和每个词的维度. 也可以传入 nn.Embedding 对象, 此时就以传入的对象作为embedding :param num_cls: 输出类别个数 @@ -122,7 +122,7 @@ class STSeqLabel(nn.Module): :param dropout: 模型除词嵌入外的dropout概率. Default: 0.1 """ - def __init__(self, init_embed, num_cls, + def __init__(self, embed, num_cls, hidden_size=300, num_layers=4, num_head=8, @@ -132,7 +132,7 @@ class STSeqLabel(nn.Module): emb_dropout=0.1, dropout=0.1, ): super(STSeqLabel, self).__init__() - self.enc = StarTransEnc(init_embed=init_embed, + self.enc = StarTransEnc(embed=embed, hidden_size=hidden_size, num_layers=num_layers, num_head=num_head, @@ -173,7 +173,7 @@ class STSeqCls(nn.Module): 用于分类任务的Star-Transformer - :param init_embed: 单词词典, 可以是 tuple, 包括(num_embedings, embedding_dim), 即 + :param embed: 单词词典, 可以是 tuple, 包括(num_embedings, embedding_dim), 即 embedding的大小和每个词的维度. 也可以传入 nn.Embedding 对象, 此时就以传入的对象作为embedding :param num_cls: 输出类别个数 @@ -187,7 +187,7 @@ class STSeqCls(nn.Module): :param dropout: 模型除词嵌入外的dropout概率. Default: 0.1 """ - def __init__(self, init_embed, num_cls, + def __init__(self, embed, num_cls, hidden_size=300, num_layers=4, num_head=8, @@ -197,7 +197,7 @@ class STSeqCls(nn.Module): emb_dropout=0.1, dropout=0.1, ): super(STSeqCls, self).__init__() - self.enc = StarTransEnc(init_embed=init_embed, + self.enc = StarTransEnc(embed=embed, hidden_size=hidden_size, num_layers=num_layers, num_head=num_head, @@ -238,7 +238,7 @@ class STNLICls(nn.Module): 用于自然语言推断(NLI)的Star-Transformer - :param init_embed: 单词词典, 可以是 tuple, 包括(num_embedings, embedding_dim), 即 + :param embed: 单词词典, 可以是 tuple, 包括(num_embedings, embedding_dim), 即 embedding的大小和每个词的维度. 也可以传入 nn.Embedding 对象, 此时就以传入的对象作为embedding :param num_cls: 输出类别个数 @@ -252,7 +252,7 @@ class STNLICls(nn.Module): :param dropout: 模型除词嵌入外的dropout概率. 
Default: 0.1 """ - def __init__(self, init_embed, num_cls, + def __init__(self, embed, num_cls, hidden_size=300, num_layers=4, num_head=8, @@ -262,7 +262,7 @@ class STNLICls(nn.Module): emb_dropout=0.1, dropout=0.1, ): super(STNLICls, self).__init__() - self.enc = StarTransEnc(init_embed=init_embed, + self.enc = StarTransEnc(embed=embed, hidden_size=hidden_size, num_layers=num_layers, num_head=num_head, diff --git a/test/models/test_bert.py b/test/models/test_bert.py index 969a8594..9cab3a88 100644 --- a/test/models/test_bert.py +++ b/test/models/test_bert.py @@ -23,10 +23,25 @@ class TestBert(unittest.TestCase): self.assertTrue(Const.OUTPUT in pred) self.assertEqual(tuple(pred[Const.OUTPUT].shape), (2, 2)) - pred = model.predict(input_ids) + pred = model(input_ids) self.assertTrue(isinstance(pred, dict)) self.assertTrue(Const.OUTPUT in pred) - self.assertEqual(tuple(pred[Const.OUTPUT].shape), (2,)) + self.assertEqual(tuple(pred[Const.OUTPUT].shape), (2, 2)) + + def test_bert_1_w(self): + vocab = Vocabulary().add_word_lst("this is a test .".split()) + embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert', + include_cls_sep=False) + + with self.assertWarns(Warning): + model = BertForSequenceClassification(embed, 2) + + input_ids = torch.LongTensor([[1, 2, 3], [5, 6, 0]]) + + pred = model.predict(input_ids) + self.assertTrue(isinstance(pred, dict)) + self.assertTrue(Const.OUTPUT in pred) + self.assertEqual(tuple(pred[Const.OUTPUT].shape), (2,)) def test_bert_2(self): @@ -44,6 +59,23 @@ class TestBert(unittest.TestCase): self.assertTrue(Const.OUTPUT in pred) self.assertEqual(tuple(pred[Const.OUTPUT].shape), (1, 2)) + def test_bert_2_w(self): + + vocab = Vocabulary().add_word_lst("this is a test [SEP] .".split()) + embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert', + include_cls_sep=False) + + with self.assertWarns(Warning): + model = BertForMultipleChoice(embed, 2) + + input_ids = torch.LongTensor([[[2, 6, 7], [1, 6, 5]]]) + print(input_ids.size()) + + pred = model.predict(input_ids) + self.assertTrue(isinstance(pred, dict)) + self.assertTrue(Const.OUTPUT in pred) + self.assertEqual(tuple(pred[Const.OUTPUT].shape), (1,)) + def test_bert_3(self): vocab = Vocabulary().add_word_lst("this is a test [SEP] .".split()) @@ -58,6 +90,22 @@ class TestBert(unittest.TestCase): self.assertTrue(Const.OUTPUT in pred) self.assertEqual(tuple(pred[Const.OUTPUT].shape), (2, 3, 7)) + def test_bert_3_w(self): + + vocab = Vocabulary().add_word_lst("this is a test [SEP] .".split()) + embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert', + include_cls_sep=True) + + with self.assertWarns(Warning): + model = BertForTokenClassification(embed, 7) + + input_ids = torch.LongTensor([[1, 2, 3], [6, 5, 0]]) + + pred = model.predict(input_ids) + self.assertTrue(isinstance(pred, dict)) + self.assertTrue(Const.OUTPUT in pred) + self.assertEqual(tuple(pred[Const.OUTPUT].shape), (2, 3)) + def test_bert_4(self): vocab = Vocabulary().add_word_lst("this is a test [SEP] .".split()) @@ -79,6 +127,22 @@ class TestBert(unittest.TestCase): self.assertTrue(isinstance(pred, dict)) self.assertEqual(len(pred), 7) + def test_bert_4_w(self): + + vocab = Vocabulary().add_word_lst("this is a test [SEP] .".split()) + embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert', + include_cls_sep=False) + + with self.assertWarns(Warning): + model = BertForQuestionAnswering(embed) + + input_ids = 
torch.LongTensor([[1, 2, 3], [6, 5, 0]]) + + pred = model.predict(input_ids) + self.assertTrue(isinstance(pred, dict)) + self.assertTrue(Const.OUTPUTS(1) in pred) + self.assertEqual(tuple(pred[Const.OUTPUTS(1)].shape), (2,)) + def test_bert_5(self): vocab = Vocabulary().add_word_lst("this is a test [SEP] .".split()) @@ -93,3 +157,19 @@ class TestBert(unittest.TestCase): self.assertTrue(Const.OUTPUT in pred) self.assertEqual(tuple(pred[Const.OUTPUT].shape), (2, 2)) + def test_bert_5_w(self): + + vocab = Vocabulary().add_word_lst("this is a test [SEP] .".split()) + embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert', + include_cls_sep=False) + + with self.assertWarns(Warning): + model = BertForSentenceMatching(embed) + + input_ids = torch.LongTensor([[1, 2, 3], [6, 5, 0]]) + + pred = model.predict(input_ids) + self.assertTrue(isinstance(pred, dict)) + self.assertTrue(Const.OUTPUT in pred) + self.assertEqual(tuple(pred[Const.OUTPUT].shape), (2,)) + diff --git a/test/models/test_biaffine_parser.py b/test/models/test_biaffine_parser.py index 4f93b994..4b38d816 100644 --- a/test/models/test_biaffine_parser.py +++ b/test/models/test_biaffine_parser.py @@ -27,7 +27,7 @@ def prepare_parser_data(): class TestBiaffineParser(unittest.TestCase): def test_train(self): - model = BiaffineParser(init_embed=(VOCAB_SIZE, 10), + model = BiaffineParser(embed=(VOCAB_SIZE, 10), pos_vocab_size=VOCAB_SIZE, pos_emb_dim=10, rnn_hidden_size=10, arc_mlp_size=10, @@ -37,7 +37,7 @@ class TestBiaffineParser(unittest.TestCase): RUNNER.run_model(model, ds, loss=ParserLoss(), metrics=ParserMetric()) def test_train2(self): - model = BiaffineParser(init_embed=(VOCAB_SIZE, 10), + model = BiaffineParser(embed=(VOCAB_SIZE, 10), pos_vocab_size=VOCAB_SIZE, pos_emb_dim=10, rnn_hidden_size=16, arc_mlp_size=10, From d15ad75d96f3b72fe6b439ef8ce6e4829987ce0f Mon Sep 17 00:00:00 2001 From: Yige Xu Date: Tue, 3 Sep 2019 23:33:10 +0800 Subject: [PATCH 138/153] fix a bug in test code --- test/modules/decoder/test_bert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/modules/decoder/test_bert.py b/test/modules/decoder/test_bert.py index 0fcf01e4..56946f5d 100644 --- a/test/modules/decoder/test_bert.py +++ b/test/modules/decoder/test_bert.py @@ -3,7 +3,7 @@ import unittest import torch -from fastNLP.models.bert import BertModel +from fastNLP.modules.encoder.bert import BertModel class TestBert(unittest.TestCase): From e903db0e70bb4cd9e9b45907fc33db4b4fce9765 Mon Sep 17 00:00:00 2001 From: yh_cc Date: Wed, 4 Sep 2019 12:47:52 +0800 Subject: [PATCH 139/153] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E4=B8=AD=E6=96=87?= =?UTF-8?q?=E5=88=86=E7=B1=BBPipe;=E4=BD=BF=E7=94=A8=E7=9F=A9=E9=98=B5?= =?UTF-8?q?=E5=8A=A0=E9=80=9FBertEmbedding=E9=83=A8=E5=88=86pool=5Fmethod;?= =?UTF-8?q?=E8=B0=83=E6=95=B4=E9=83=A8=E5=88=86=E6=B5=8B=E8=AF=95=E7=94=A8?= =?UTF-8?q?=E4=BE=8B=E5=90=8D=E7=A7=B0;=E4=BF=AE=E5=A4=8Dmetric=E4=B8=AD?= =?UTF-8?q?=E5=AF=B9warning=E7=9A=84=E8=AF=AF=E6=8A=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/losses.py | 4 +- fastNLP/core/metrics.py | 2 +- fastNLP/embeddings/bert_embedding.py | 42 +++++--- fastNLP/io/__init__.py | 4 +- fastNLP/io/data_bundle.py | 15 +-- fastNLP/io/file_utils.py | 7 +- fastNLP/io/loader/__init__.py | 3 +- fastNLP/io/loader/classification.py | 57 +++++++++++ fastNLP/io/pipe/__init__.py | 3 +- fastNLP/io/pipe/classification.py | 101 ++++++++++++++++++- fastNLP/io/pipe/conll.py | 40 ++++++-- 
test/embeddings/test_bert_embedding.py | 6 ++ test/io/loader/test_classification_loader.py | 6 +- test/io/loader/test_conll_loader.py | 6 +- test/io/loader/test_cws_loader.py | 4 +- test/io/loader/test_matching_loader.py | 5 +- test/io/pipe/test_classification.py | 13 ++- test/io/pipe/test_conll.py | 6 +- test/io/pipe/test_cws.py | 4 +- test/io/pipe/test_matching.py | 6 +- 20 files changed, 274 insertions(+), 60 deletions(-) diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index d5549cec..7402a568 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -238,8 +238,8 @@ class CrossEntropyLoss(LossBase): pred = pred.tranpose(-1, pred) pred = pred.reshape(-1, pred.size(-1)) target = target.reshape(-1) - if seq_len is not None: - mask = seq_len_to_mask(seq_len).reshape(-1).eq(0) + if seq_len is not None and target.dim()>1: + mask = seq_len_to_mask(seq_len, max_len=target.size(1)).reshape(-1).eq(0) target = target.masked_fill(mask, self.padding_idx) return F.cross_entropy(input=pred, target=target, diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index b06e5459..c0f14c90 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -347,7 +347,7 @@ class AccuracyMetric(MetricBase): pass elif pred.dim() == target.dim() + 1: pred = pred.argmax(dim=-1) - if seq_len is None: + if seq_len is None and target.dim()>1: warnings.warn("You are not passing `seq_len` to exclude pad when calculate accuracy.") else: raise RuntimeError(f"In {_get_func_signature(self.evaluate)}, when pred have " diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index f6c36623..08615fe0 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -68,7 +68,7 @@ class BertEmbedding(ContextualEmbedding): def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en-base-uncased', layers: str = '-1', pool_method: str = 'first', word_dropout=0, dropout=0, include_cls_sep: bool = False, - pooled_cls=True, requires_grad: bool = False, auto_truncate: bool = False): + pooled_cls=True, requires_grad: bool = True, auto_truncate: bool = False): super(BertEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR: @@ -165,7 +165,7 @@ class BertWordPieceEncoder(nn.Module): """ def __init__(self, model_dir_or_name: str = 'en-base-uncased', layers: str = '-1', pooled_cls: bool = False, - word_dropout=0, dropout=0, requires_grad: bool = False): + word_dropout=0, dropout=0, requires_grad: bool = True): super().__init__() self.model = _WordPieceBertModel(model_dir_or_name=model_dir_or_name, layers=layers, pooled_cls=pooled_cls) @@ -288,7 +288,7 @@ class _WordBertModel(nn.Module): self.auto_truncate = auto_truncate # 将所有vocab中word的wordpiece计算出来, 需要额外考虑[CLS]和[SEP] - logger.info("Start to generating word pieces for word.") + logger.info("Start to generate word pieces for word.") # 第一步统计出需要的word_piece, 然后创建新的embed和word_piece_vocab, 然后填入值 word_piece_dict = {'[CLS]': 1, '[SEP]': 1} # 用到的word_piece以及新增的 found_count = 0 @@ -374,7 +374,8 @@ class _WordBertModel(nn.Module): else: raise RuntimeError( "After split words into word pieces, the lengths of word pieces are longer than the " - f"maximum allowed sequence length:{self._max_position_embeddings} of bert.") + f"maximum allowed sequence length:{self._max_position_embeddings} of bert. 
You can set " + f"`auto_truncate=True` for BertEmbedding to automatically truncate overlong input.") # +2是由于需要加入[CLS]与[SEP] word_pieces = words.new_full((batch_size, min(word_piece_length + 2, self._max_position_embeddings)), @@ -407,15 +408,26 @@ class _WordBertModel(nn.Module): # output_layers = [self.layers] # len(self.layers) x batch_size x real_word_piece_length x hidden_size if self.include_cls_sep: - outputs = bert_outputs[-1].new_zeros(len(self.layers), batch_size, max_word_len + 2, - bert_outputs[-1].size(-1)) s_shift = 1 + outputs = bert_outputs[-1].new_zeros(len(self.layers), batch_size, max_word_len + 2, + bert_outputs[-1].size(-1)) + else: + s_shift = 0 outputs = bert_outputs[-1].new_zeros(len(self.layers), batch_size, max_word_len, bert_outputs[-1].size(-1)) - s_shift = 0 batch_word_pieces_cum_length = batch_word_pieces_length.new_zeros(batch_size, max_word_len + 1) batch_word_pieces_cum_length[:, 1:] = batch_word_pieces_length.cumsum(dim=-1) # batch_size x max_len + + if self.pool_method == 'first': + batch_word_pieces_cum_length = batch_word_pieces_cum_length[:, :seq_len.max()] + batch_word_pieces_cum_length.masked_fill_(batch_word_pieces_cum_length.ge(word_piece_length), 0) + batch_indexes = batch_indexes[:, None].expand((batch_size, batch_word_pieces_cum_length.size(1))) + elif self.pool_method == 'last': + batch_word_pieces_cum_length = batch_word_pieces_cum_length[:, 1:seq_len.max()+1] - 1 + batch_word_pieces_cum_length.masked_fill_(batch_word_pieces_cum_length.ge(word_piece_length), 0) + batch_indexes = batch_indexes[:, None].expand((batch_size, batch_word_pieces_cum_length.size(1))) + for l_index, l in enumerate(self.layers): output_layer = bert_outputs[l] real_word_piece_length = output_layer.size(1) - 2 @@ -426,16 +438,15 @@ class _WordBertModel(nn.Module): output_layer = torch.cat((output_layer, paddings), dim=1).contiguous() # 从word_piece collapse到word的表示 truncate_output_layer = output_layer[:, 1:-1] # 删除[CLS]与[SEP] batch_size x len x hidden_size - outputs_seq_len = seq_len + s_shift if self.pool_method == 'first': - for i in range(batch_size): - i_word_pieces_cum_length = batch_word_pieces_cum_length[i, :seq_len[i]] # 每个word的start位置 - outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[ - i, i_word_pieces_cum_length] # num_layer x batch_size x len x hidden_size + tmp = truncate_output_layer[batch_indexes, batch_word_pieces_cum_length] + tmp = tmp.masked_fill(word_mask[:, :batch_word_pieces_cum_length.size(1), None].eq(0), 0) + outputs[l_index, :, s_shift:batch_word_pieces_cum_length.size(1)+s_shift] = tmp + elif self.pool_method == 'last': - for i in range(batch_size): - i_word_pieces_cum_length = batch_word_pieces_cum_length[i, 1:seq_len[i] + 1] - 1 # 每个word的end - outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[i, i_word_pieces_cum_length] + tmp = truncate_output_layer[batch_indexes, batch_word_pieces_cum_length] + tmp = tmp.masked_fill(word_mask[:, :batch_word_pieces_cum_length.size(1), None].eq(0), 0) + outputs[l_index, :, s_shift:batch_word_pieces_cum_length.size(1)+s_shift] = tmp elif self.pool_method == 'max': for i in range(batch_size): for j in range(seq_len[i]): @@ -452,5 +463,6 @@ class _WordBertModel(nn.Module): else: outputs[l_index, :, 0] = output_layer[:, 0] outputs[l_index, batch_indexes, seq_len + s_shift] = output_layer[batch_indexes, seq_len + s_shift] + # 3. 
最终的embedding结果 return outputs diff --git a/fastNLP/io/__init__.py b/fastNLP/io/__init__.py index 251b7292..6f727f05 100644 --- a/fastNLP/io/__init__.py +++ b/fastNLP/io/__init__.py @@ -24,6 +24,7 @@ __all__ = [ 'IMDBLoader', 'SSTLoader', 'SST2Loader', + "ChnSentiCorpLoader", 'ConllLoader', 'Conll2003Loader', @@ -52,8 +53,9 @@ __all__ = [ "SSTPipe", "SST2Pipe", "IMDBPipe", - "Conll2003Pipe", + "ChnSentiCorpPipe", + "Conll2003Pipe", "Conll2003NERPipe", "OntoNotesNERPipe", "MsraNERPipe", diff --git a/fastNLP/io/data_bundle.py b/fastNLP/io/data_bundle.py index 3e7f39d3..19b48828 100644 --- a/fastNLP/io/data_bundle.py +++ b/fastNLP/io/data_bundle.py @@ -306,12 +306,15 @@ class DataBundle: return self def __repr__(self): - _str = 'In total {} datasets:\n'.format(len(self.datasets)) - for name, dataset in self.datasets.items(): - _str += '\t{} has {} instances.\n'.format(name, len(dataset)) - _str += 'In total {} vocabs:\n'.format(len(self.vocabs)) - for name, vocab in self.vocabs.items(): - _str += '\t{} has {} entries.\n'.format(name, len(vocab)) + _str = '' + if len(self.datasets): + _str += 'In total {} datasets:\n'.format(len(self.datasets)) + for name, dataset in self.datasets.items(): + _str += '\t{} has {} instances.\n'.format(name, len(dataset)) + if len(self.vocabs): + _str += 'In total {} vocabs:\n'.format(len(self.vocabs)) + for name, vocab in self.vocabs.items(): + _str += '\t{} has {} entries.\n'.format(name, len(vocab)) return _str diff --git a/fastNLP/io/file_utils.py b/fastNLP/io/file_utils.py index 8ecdff25..f76bcd26 100644 --- a/fastNLP/io/file_utils.py +++ b/fastNLP/io/file_utils.py @@ -77,6 +77,9 @@ PRETRAIN_STATIC_FILES = { 'cn-tencent': "tencent_cn.zip", 'cn-fasttext': "cc.zh.300.vec.gz", 'cn-sgns-literature-word': 'sgns.literature.word.txt.zip', + 'cn-char-fastnlp-100d': "cn_char_fastnlp_100d.zip", + 'cn-bi-fastnlp-100d': "cn_bi_fastnlp_100d.zip", + "cn-tri-fastnlp-100d": "cn_tri_fastnlp_100d.zip" } DATASET_DIR = { @@ -96,7 +99,9 @@ DATASET_DIR = { "cws-pku": 'cws_pku.zip', "cws-cityu": "cws_cityu.zip", "cws-as": 'cws_as.zip', - "cws-msra": 'cws_msra.zip' + "cws-msra": 'cws_msra.zip', + + "chn-senti-corp":"chn_senti_corp.zip" } PRETRAIN_MAP = {'elmo': PRETRAINED_ELMO_MODEL_DIR, diff --git a/fastNLP/io/loader/__init__.py b/fastNLP/io/loader/__init__.py index 6c23f213..3ad1b47d 100644 --- a/fastNLP/io/loader/__init__.py +++ b/fastNLP/io/loader/__init__.py @@ -52,6 +52,7 @@ __all__ = [ 'IMDBLoader', 'SSTLoader', 'SST2Loader', + "ChnSentiCorpLoader", 'ConllLoader', 'Conll2003Loader', @@ -73,7 +74,7 @@ __all__ = [ "QNLILoader", "RTELoader" ] -from .classification import YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader +from .classification import YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, ChnSentiCorpLoader from .conll import ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader from .csv import CSVLoader from .cws import CWSLoader diff --git a/fastNLP/io/loader/classification.py b/fastNLP/io/loader/classification.py index ec00d2b4..4ebd58e1 100644 --- a/fastNLP/io/loader/classification.py +++ b/fastNLP/io/loader/classification.py @@ -7,6 +7,7 @@ __all__ = [ "IMDBLoader", "SSTLoader", "SST2Loader", + "ChnSentiCorpLoader" ] import glob @@ -346,3 +347,59 @@ class SST2Loader(Loader): """ output_dir = self._get_dataset_path(dataset_name='sst-2') return output_dir + + +class ChnSentiCorpLoader(Loader): + """ + 支持读取的数据的格式为,第一行为标题(具体内容会被忽略),之后一行为一个sample,第一个制表符之前被认为是label,第 + 
一个制表符及之后认为是句子 + + Example:: + + label raw_chars + 1 這間酒店環境和服務態度亦算不錯,但房間空間太小~~ + 1 <荐书> 推荐所有喜欢<红楼>的红迷们一定要收藏这本书,要知道... + 0 商品的不足暂时还没发现,京东的订单处理速度实在.......周二就打包完成,周五才发货... + + 读取后的DataSet具有以下的field + + .. csv-table:: + :header: "raw_chars", "target" + + "這間酒店環境和服務態度亦算不錯,但房間空間太小~~", "1" + "<荐书> 推荐所有喜欢<红楼>...", "1" + "..." + + """ + def __init__(self): + super().__init__() + + def _load(self, path:str): + """ + 从path中读取数据 + + :param path: + :return: + """ + ds = DataSet() + with open(path, 'r', encoding='utf-8') as f: + f.readline() + for line in f: + line = line.strip() + tab_index = line.index('\t') + if tab_index!=-1: + target = line[:tab_index] + raw_chars = line[tab_index+1:] + if raw_chars: + ds.append(Instance(raw_chars=raw_chars, target=target)) + return ds + + def download(self)->str: + """ + 自动下载数据,该数据取自https://github.com/pengming617/bert_classification/tree/master/data,在 + https://arxiv.org/pdf/1904.09223.pdf与https://arxiv.org/pdf/1906.08101.pdf有使用 + + :return: + """ + output_dir = self._get_dataset_path('chn-senti-corp') + return output_dir diff --git a/fastNLP/io/pipe/__init__.py b/fastNLP/io/pipe/__init__.py index 048e4cfe..943709e7 100644 --- a/fastNLP/io/pipe/__init__.py +++ b/fastNLP/io/pipe/__init__.py @@ -17,6 +17,7 @@ __all__ = [ "SSTPipe", "SST2Pipe", "IMDBPipe", + "ChnSentiCorpPipe", "Conll2003NERPipe", "OntoNotesNERPipe", @@ -39,7 +40,7 @@ __all__ = [ "MNLIPipe", ] -from .classification import YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe +from .classification import YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe from .conll import Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, WeiboNERPipe, PeopleDailyPipe from .matching import MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, \ MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe diff --git a/fastNLP/io/pipe/classification.py b/fastNLP/io/pipe/classification.py index 30c591a4..d1c7aa0e 100644 --- a/fastNLP/io/pipe/classification.py +++ b/fastNLP/io/pipe/classification.py @@ -5,7 +5,8 @@ __all__ = [ "YelpPolarityPipe", "SSTPipe", "SST2Pipe", - 'IMDBPipe' + 'IMDBPipe', + "ChnSentiCorpPipe" ] import re @@ -13,18 +14,18 @@ import re from nltk import Tree from .pipe import Pipe -from .utils import get_tokenizer, _indexize, _add_words_field, _drop_empty_instance +from .utils import get_tokenizer, _indexize, _add_words_field, _drop_empty_instance, _add_chars_field from ..data_bundle import DataBundle from ..loader.classification import IMDBLoader, YelpFullLoader, SSTLoader, SST2Loader, YelpPolarityLoader from ...core.const import Const from ...core.dataset import DataSet from ...core.instance import Instance from ...core.vocabulary import Vocabulary +from ..loader.classification import ChnSentiCorpLoader nonalpnum = re.compile('[^0-9a-zA-Z?!\']+') - class _CLSPipe(Pipe): """ 分类问题的基类,负责对classification的数据进行tokenize操作。默认是对raw_words列操作,然后生成words列 @@ -457,3 +458,97 @@ class IMDBPipe(_CLSPipe): data_bundle = self.process(data_bundle) return data_bundle + + +class ChnSentiCorpPipe(Pipe): + """ + 处理之后的DataSet有以下的结构 + + .. csv-table:: + :header: "raw_chars", "chars", "target", "seq_len" + + "這間酒店環境和服務態度亦算不錯,但房間空間太小~~", "[2, 3, 4, 5, ...]", 1, 31 + "<荐书> 推荐所有喜欢<红楼>...", "[10, 21, ....]", 1, 25 + "..." + + 其中chars, seq_len是input,target是target + + :param bool bigrams: 是否增加一列bigrams. bigrams的构成是['复', '旦', '大', '学', ...]->["复旦", "旦大", ...]。如果 + 设置为True,返回的DataSet将有一列名为bigrams, 且已经转换为了index并设置为input,对应的vocab可以通过 + data_bundle.get_vocab('bigrams')获取. 
+ :param bool trigrams: 是否增加一列trigrams. trigrams的构成是 ['复', '旦', '大', '学', ...]->["复旦大", "旦大学", ...] + 。如果设置为True,返回的DataSet将有一列名为trigrams, 且已经转换为了index并设置为input,对应的vocab可以通过 + data_bundle.get_vocab('trigrams')获取. + """ + def __init__(self, bigrams=False, trigrams=False): + super().__init__() + + self.bigrams = bigrams + self.trigrams = trigrams + + def _tokenize(self, data_bundle): + """ + 将DataSet中的"复旦大学"拆分为["复", "旦", "大", "学"]. 未来可以通过扩展这个函数实现分词。 + + :param data_bundle: + :return: + """ + data_bundle.apply_field(list, field_name=Const.CHAR_INPUT, new_field_name=Const.CHAR_INPUT) + return data_bundle + + def process(self, data_bundle:DataBundle): + """ + 可以处理的DataSet应该具备以下的field + + .. csv-table:: + :header: "raw_chars", "target" + + "這間酒店環境和服務態度亦算不錯,但房間空間太小~~", "1" + "<荐书> 推荐所有喜欢<红楼>...", "1" + "..." + + :param data_bundle: + :return: + """ + _add_chars_field(data_bundle, lower=False) + + data_bundle = self._tokenize(data_bundle) + + input_field_names = [Const.CHAR_INPUT] + if self.bigrams: + for name, dataset in data_bundle.iter_datasets(): + dataset.apply_field(lambda chars: [c1 + c2 for c1, c2 in zip(chars, chars[1:] + [''])], + field_name=Const.CHAR_INPUT, new_field_name='bigrams') + input_field_names.append('bigrams') + if self.trigrams: + for name, dataset in data_bundle.iter_datasets(): + dataset.apply_field(lambda chars: [c1 + c2 + c3 for c1, c2, c3 in + zip(chars, chars[1:] + [''], chars[2:] + [''] * 2)], + field_name=Const.CHAR_INPUT, new_field_name='trigrams') + input_field_names.append('trigrams') + + # index + _indexize(data_bundle, input_field_names, Const.TARGET) + + input_fields = [Const.TARGET, Const.INPUT_LEN] + input_field_names + target_fields = [Const.TARGET] + + for name, dataset in data_bundle.datasets.items(): + dataset.add_seq_len(Const.CHAR_INPUT) + + data_bundle.set_input(*input_fields) + data_bundle.set_target(*target_fields) + + return data_bundle + + def process_from_file(self, paths=None): + """ + + :param paths: 支持路径类型参见 :class:`fastNLP.io.loader.Loader` 的load函数。 + :return: DataBundle + """ + # 读取数据 + data_bundle = ChnSentiCorpLoader().load(paths) + data_bundle = self.process(data_bundle) + + return data_bundle \ No newline at end of file diff --git a/fastNLP/io/pipe/conll.py b/fastNLP/io/pipe/conll.py index 2edc9008..a96b259a 100644 --- a/fastNLP/io/pipe/conll.py +++ b/fastNLP/io/pipe/conll.py @@ -222,14 +222,23 @@ class _CNNERPipe(Pipe): target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target, seq_len。 :param: str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。 + :param bool bigrams: 是否增加一列bigrams. bigrams的构成是['复', '旦', '大', '学', ...]->["复旦", "旦大", ...]。如果 + 设置为True,返回的DataSet将有一列名为bigrams, 且已经转换为了index并设置为input,对应的vocab可以通过 + data_bundle.get_vocab('bigrams')获取. + :param bool trigrams: 是否增加一列trigrams. trigrams的构成是 ['复', '旦', '大', '学', ...]->["复旦大", "旦大学", ...] + 。如果设置为True,返回的DataSet将有一列名为trigrams, 且已经转换为了index并设置为input,对应的vocab可以通过 + data_bundle.get_vocab('trigrams')获取. 
""" - def __init__(self, encoding_type: str = 'bio'): + def __init__(self, encoding_type: str = 'bio', bigrams=False, trigrams=False): if encoding_type == 'bio': self.convert_tag = iob2 else: self.convert_tag = lambda words: iob2bioes(iob2(words)) - + + self.bigrams = bigrams + self.trigrams = trigrams + def process(self, data_bundle: DataBundle) -> DataBundle: """ 支持的DataSet的field为 @@ -241,11 +250,11 @@ class _CNNERPipe(Pipe): "[青, 岛, 海, 牛, 队, 和, ...]", "[B-ORG, I-ORG, I-ORG, ...]" "[...]", "[...]" - raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 - target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。 + raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int], + 是转换为index的target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。 - :param ~fastNLP.DataBundle data_bundle: 传入的DataBundle中的DataSet必须包含raw_words和ner两个field,且两个field的内容均为List[str]。 - 在传入DataBundle基础上原位修改。 + :param ~fastNLP.DataBundle data_bundle: 传入的DataBundle中的DataSet必须包含raw_words和ner两个field,且两个field + 的内容均为List[str]。在传入DataBundle基础上原位修改。 :return: DataBundle """ # 转换tag @@ -253,11 +262,24 @@ class _CNNERPipe(Pipe): dataset.apply_field(self.convert_tag, field_name=Const.TARGET, new_field_name=Const.TARGET) _add_chars_field(data_bundle, lower=False) - + + input_field_names = [Const.CHAR_INPUT] + if self.bigrams: + for name, dataset in data_bundle.datasets.items(): + dataset.apply_field(lambda chars: [c1 + c2 for c1, c2 in zip(chars, chars[1:] + [''])], + field_name=Const.CHAR_INPUT, new_field_name='bigrams') + input_field_names.append('bigrams') + if self.trigrams: + for name, dataset in data_bundle.datasets.items(): + dataset.apply_field(lambda chars: [c1 + c2 + c3 for c1, c2, c3 in + zip(chars, chars[1:] + [''], chars[2:] + [''] * 2)], + field_name=Const.CHAR_INPUT, new_field_name='trigrams') + input_field_names.append('trigrams') + # index - _indexize(data_bundle, input_field_names=Const.CHAR_INPUT, target_field_names=Const.TARGET) + _indexize(data_bundle, input_field_names, Const.TARGET) - input_fields = [Const.TARGET, Const.CHAR_INPUT, Const.INPUT_LEN] + input_fields = [Const.TARGET, Const.INPUT_LEN] + input_field_names target_fields = [Const.TARGET, Const.INPUT_LEN] for name, dataset in data_bundle.datasets.items(): diff --git a/test/embeddings/test_bert_embedding.py b/test/embeddings/test_bert_embedding.py index 46ad74c3..6a4a0ffa 100644 --- a/test/embeddings/test_bert_embedding.py +++ b/test/embeddings/test_bert_embedding.py @@ -13,6 +13,12 @@ class TestDownload(unittest.TestCase): words = torch.LongTensor([[2, 3, 4, 0]]) print(embed(words).size()) + for pool_method in ['first', 'last', 'max', 'avg']: + for include_cls_sep in [True, False]: + embed = BertEmbedding(vocab, model_dir_or_name='en', pool_method=pool_method, + include_cls_sep=include_cls_sep) + print(embed(words).size()) + def test_word_drop(self): vocab = Vocabulary().add_word_lst("This is a test .".split()) embed = BertEmbedding(vocab, model_dir_or_name='en', dropout=0.1, word_dropout=0.2) diff --git a/test/io/loader/test_classification_loader.py b/test/io/loader/test_classification_loader.py index 1438a014..f099c1b2 100644 --- a/test/io/loader/test_classification_loader.py +++ b/test/io/loader/test_classification_loader.py @@ -5,22 +5,22 @@ from fastNLP.io.loader.classification import YelpPolarityLoader from fastNLP.io.loader.classification import IMDBLoader from fastNLP.io.loader.classification import SST2Loader from fastNLP.io.loader.classification import 
SSTLoader +from fastNLP.io.loader.classification import ChnSentiCorpLoader import os @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") class TestDownload(unittest.TestCase): def test_download(self): - for loader in [YelpFullLoader, YelpPolarityLoader, IMDBLoader, SST2Loader, SSTLoader]: + for loader in [YelpFullLoader, YelpPolarityLoader, IMDBLoader, SST2Loader, SSTLoader, ChnSentiCorpLoader]: loader().download() def test_load(self): - for loader in [YelpFullLoader, YelpPolarityLoader, IMDBLoader, SST2Loader, SSTLoader]: + for loader in [YelpFullLoader, YelpPolarityLoader, IMDBLoader, SST2Loader, SSTLoader, ChnSentiCorpLoader]: data_bundle = loader().load() print(data_bundle) class TestLoad(unittest.TestCase): - def test_load(self): for loader in [IMDBLoader]: data_bundle = loader().load('test/data_for_tests/io/imdb') diff --git a/test/io/loader/test_conll_loader.py b/test/io/loader/test_conll_loader.py index 861de5a5..31859a6b 100644 --- a/test/io/loader/test_conll_loader.py +++ b/test/io/loader/test_conll_loader.py @@ -5,7 +5,7 @@ from fastNLP.io.loader.conll import MsraNERLoader, PeopleDailyNERLoader, WeiboNE Conll2003Loader -class MSRANERTest(unittest.TestCase): +class TestMSRANER(unittest.TestCase): @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") def test_download(self): MsraNERLoader().download(re_download=False) @@ -13,13 +13,13 @@ class MSRANERTest(unittest.TestCase): print(data_bundle) -class PeopleDailyTest(unittest.TestCase): +class TestPeopleDaily(unittest.TestCase): @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") def test_download(self): PeopleDailyNERLoader().download() -class WeiboNERTest(unittest.TestCase): +class TestWeiboNER(unittest.TestCase): @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") def test_download(self): WeiboNERLoader().download() diff --git a/test/io/loader/test_cws_loader.py b/test/io/loader/test_cws_loader.py index 8b5d4081..55e48910 100644 --- a/test/io/loader/test_cws_loader.py +++ b/test/io/loader/test_cws_loader.py @@ -3,7 +3,7 @@ import os from fastNLP.io.loader import CWSLoader -class CWSLoaderTest(unittest.TestCase): +class TestCWSLoader(unittest.TestCase): @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") def test_download(self): dataset_names = ['pku', 'cityu', 'as', 'msra'] @@ -13,7 +13,7 @@ class CWSLoaderTest(unittest.TestCase): print(data_bundle) -class RunCWSLoaderTest(unittest.TestCase): +class TestRunCWSLoader(unittest.TestCase): def test_cws_loader(self): dataset_names = ['msra'] for dataset_name in dataset_names: diff --git a/test/io/loader/test_matching_loader.py b/test/io/loader/test_matching_loader.py index 652cf161..cb1334e0 100644 --- a/test/io/loader/test_matching_loader.py +++ b/test/io/loader/test_matching_loader.py @@ -8,7 +8,7 @@ from fastNLP.io.loader.matching import MNLILoader import os @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") -class TestDownload(unittest.TestCase): +class TestMatchingDownload(unittest.TestCase): def test_download(self): for loader in [RTELoader, QNLILoader, SNLILoader, MNLILoader]: loader().download() @@ -21,8 +21,7 @@ class TestDownload(unittest.TestCase): print(data_bundle) -class TestLoad(unittest.TestCase): - +class TestMatchingLoad(unittest.TestCase): def test_load(self): for loader in [RTELoader]: data_bundle = loader().load('test/data_for_tests/io/rte') diff --git a/test/io/pipe/test_classification.py b/test/io/pipe/test_classification.py index c6e2005e..45c276a3 100644 --- a/test/io/pipe/test_classification.py +++ 
b/test/io/pipe/test_classification.py @@ -2,9 +2,10 @@ import unittest import os from fastNLP.io.pipe.classification import SSTPipe, SST2Pipe, IMDBPipe, YelpFullPipe, YelpPolarityPipe +from fastNLP.io.pipe.classification import ChnSentiCorpPipe @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") -class TestPipe(unittest.TestCase): +class TestClassificationPipe(unittest.TestCase): def test_process_from_file(self): for pipe in [YelpPolarityPipe, SST2Pipe, IMDBPipe, YelpFullPipe, SSTPipe]: with self.subTest(pipe=pipe): @@ -14,8 +15,16 @@ class TestPipe(unittest.TestCase): class TestRunPipe(unittest.TestCase): - def test_load(self): for pipe in [IMDBPipe]: data_bundle = pipe(tokenizer='raw').process_from_file('test/data_for_tests/io/imdb') print(data_bundle) + + +@unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") +class TestCNClassificationPipe(unittest.TestCase): + def test_process_from_file(self): + for pipe in [ChnSentiCorpPipe]: + with self.subTest(pipe=pipe): + data_bundle = pipe(bigrams=True, trigrams=True).process_from_file() + print(data_bundle) \ No newline at end of file diff --git a/test/io/pipe/test_conll.py b/test/io/pipe/test_conll.py index 6f6c4fad..4ecd7969 100644 --- a/test/io/pipe/test_conll.py +++ b/test/io/pipe/test_conll.py @@ -4,12 +4,14 @@ from fastNLP.io import MsraNERPipe, PeopleDailyPipe, WeiboNERPipe, Conll2003Pipe @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") -class TestPipe(unittest.TestCase): +class TestConllPipe(unittest.TestCase): def test_process_from_file(self): for pipe in [MsraNERPipe, PeopleDailyPipe, WeiboNERPipe]: with self.subTest(pipe=pipe): print(pipe) - data_bundle = pipe().process_from_file() + data_bundle = pipe(bigrams=True, trigrams=True).process_from_file() + print(data_bundle) + data_bundle = pipe(encoding_type='bioes').process_from_file() print(data_bundle) diff --git a/test/io/pipe/test_cws.py b/test/io/pipe/test_cws.py index dd901a25..063b6d9a 100644 --- a/test/io/pipe/test_cws.py +++ b/test/io/pipe/test_cws.py @@ -4,7 +4,7 @@ import os from fastNLP.io.pipe.cws import CWSPipe -class CWSPipeTest(unittest.TestCase): +class TestCWSPipe(unittest.TestCase): @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") def test_process_from_file(self): dataset_names = ['pku', 'cityu', 'as', 'msra'] @@ -14,7 +14,7 @@ class CWSPipeTest(unittest.TestCase): print(data_bundle) -class RunCWSPipeTest(unittest.TestCase): +class TestRunCWSPipe(unittest.TestCase): def test_process_from_file(self): dataset_names = ['msra'] for dataset_name in dataset_names: diff --git a/test/io/pipe/test_matching.py b/test/io/pipe/test_matching.py index 33904e7a..932d8289 100644 --- a/test/io/pipe/test_matching.py +++ b/test/io/pipe/test_matching.py @@ -7,7 +7,7 @@ from fastNLP.io.pipe.matching import SNLIBertPipe, RTEBertPipe, QNLIBertPipe, MN @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") -class TestPipe(unittest.TestCase): +class TestMatchingPipe(unittest.TestCase): def test_process_from_file(self): for pipe in [SNLIPipe, RTEPipe, QNLIPipe, MNLIPipe]: with self.subTest(pipe=pipe): @@ -17,7 +17,7 @@ class TestPipe(unittest.TestCase): @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") -class TestBertPipe(unittest.TestCase): +class TestMatchingBertPipe(unittest.TestCase): def test_process_from_file(self): for pipe in [SNLIBertPipe, RTEBertPipe, QNLIBertPipe, MNLIBertPipe]: with self.subTest(pipe=pipe): @@ -26,7 +26,7 @@ class TestBertPipe(unittest.TestCase): print(data_bundle) -class TestRunPipe(unittest.TestCase): +class 
TestRunMatchingPipe(unittest.TestCase): def test_load(self): for pipe in [RTEPipe, RTEBertPipe]: From 113ef8b11a34ca72fd0a1b6a1496dd42e272b94d Mon Sep 17 00:00:00 2001 From: ChenXin Date: Wed, 4 Sep 2019 14:31:45 +0800 Subject: [PATCH 140/153] add code to detect the defined location automatically --- fastNLP/__init__.py | 4 ++++ fastNLP/doc_utils.py | 21 +++++++++++++++++++++ fastNLP/embeddings/__init__.py | 4 ++++ fastNLP/io/__init__.py | 4 ++++ fastNLP/models/__init__.py | 4 ++++ fastNLP/modules/__init__.py | 4 ++++ 6 files changed, 41 insertions(+) create mode 100644 fastNLP/doc_utils.py diff --git a/fastNLP/__init__.py b/fastNLP/__init__.py index 19efac31..aceaf47f 100644 --- a/fastNLP/__init__.py +++ b/fastNLP/__init__.py @@ -70,3 +70,7 @@ from . import models from . import modules from .core import * from .io import loader, pipe + +import sys +from .doc_utils import doc_process +doc_process(sys.modules[__name__]) \ No newline at end of file diff --git a/fastNLP/doc_utils.py b/fastNLP/doc_utils.py new file mode 100644 index 00000000..924b7a6a --- /dev/null +++ b/fastNLP/doc_utils.py @@ -0,0 +1,21 @@ +import inspect +import sys + + +def doc_process(m): + for name, obj in inspect.getmembers(m): + if inspect.isclass(obj) or inspect.isfunction(obj): + if obj.__module__ != m.__name__: + if obj.__doc__ is None: + print(name, obj.__doc__) + else: + module_name = obj.__module__ + while 1: + defined_m = sys.modules[module_name] + if "undocumented" not in defined_m.__doc__ and name in defined_m.__all__: + obj.__doc__ = r"定义在 :class:`" + module_name + "." + name + "`\n" + obj.__doc__ + break + module_name = ".".join(module_name.split('.')[:-1]) + if module_name == m.__name__: + print(name, ": not found defined doc.") + break diff --git a/fastNLP/embeddings/__init__.py b/fastNLP/embeddings/__init__.py index 8a970e25..ea99154e 100644 --- a/fastNLP/embeddings/__init__.py +++ b/fastNLP/embeddings/__init__.py @@ -25,3 +25,7 @@ from .bert_embedding import BertEmbedding, BertWordPieceEncoder from .char_embedding import CNNCharEmbedding, LSTMCharEmbedding from .stack_embedding import StackEmbedding from .utils import get_embeddings + +import sys +from ..doc_utils import doc_process +doc_process(sys.modules[__name__]) \ No newline at end of file diff --git a/fastNLP/io/__init__.py b/fastNLP/io/__init__.py index 6f727f05..c8b3dfaa 100644 --- a/fastNLP/io/__init__.py +++ b/fastNLP/io/__init__.py @@ -88,3 +88,7 @@ from .model_io import ModelLoader, ModelSaver from .loader import * from .pipe import * + +import sys +from ..doc_utils import doc_process +doc_process(sys.modules[__name__]) \ No newline at end of file diff --git a/fastNLP/models/__init__.py b/fastNLP/models/__init__.py index a659e1d5..62adbf69 100644 --- a/fastNLP/models/__init__.py +++ b/fastNLP/models/__init__.py @@ -38,3 +38,7 @@ from .cnn_text_classification import CNNText from .sequence_labeling import SeqLabeling, AdvSeqLabel from .snli import ESIM from .star_transformer import StarTransEnc, STSeqCls, STNLICls, STSeqLabel + +import sys +from ..doc_utils import doc_process +doc_process(sys.modules[__name__]) \ No newline at end of file diff --git a/fastNLP/modules/__init__.py b/fastNLP/modules/__init__.py index 7959e454..769dc42a 100644 --- a/fastNLP/modules/__init__.py +++ b/fastNLP/modules/__init__.py @@ -54,3 +54,7 @@ from . 
import encoder from .decoder import * from .dropout import TimestepDropout from .encoder import * + +import sys +from ..doc_utils import doc_process +doc_process(sys.modules[__name__]) From 3651d61f41c267ef4801dc53e5ac359f8b71606f Mon Sep 17 00:00:00 2001 From: ChenXin Date: Wed, 4 Sep 2019 14:47:45 +0800 Subject: [PATCH 141/153] delete the alias in files. --- fastNLP/embeddings/bert_embedding.py | 2 -- fastNLP/embeddings/char_embedding.py | 4 ---- fastNLP/embeddings/elmo_embedding.py | 2 -- fastNLP/embeddings/embedding.py | 2 -- fastNLP/embeddings/stack_embedding.py | 2 -- fastNLP/embeddings/static_embedding.py | 2 -- fastNLP/modules/decoder/crf.py | 2 -- fastNLP/modules/decoder/mlp.py | 2 -- fastNLP/modules/decoder/utils.py | 2 -- fastNLP/modules/encoder/attention.py | 1 - fastNLP/modules/encoder/bert.py | 2 -- fastNLP/modules/encoder/char_encoder.py | 6 ------ fastNLP/modules/encoder/conv_maxpool.py | 2 -- fastNLP/modules/encoder/lstm.py | 2 -- fastNLP/modules/encoder/pooling.py | 8 -------- fastNLP/modules/encoder/star_transformer.py | 3 --- fastNLP/modules/encoder/transformer.py | 3 --- fastNLP/modules/encoder/variational_rnn.py | 6 ------ reproduction/text_classification/data/sstloader.py | 8 ++++---- reproduction/text_classification/model/awdlstm_module.py | 2 -- 20 files changed, 4 insertions(+), 59 deletions(-) diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index 08615fe0..17f6769d 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -26,8 +26,6 @@ from ..core import logger class BertEmbedding(ContextualEmbedding): """ - 别名::class:`fastNLP.embeddings.BertEmbedding` :class:`fastNLP.embeddings.bert_embedding.BertEmbedding` - 使用BERT对words进行编码的Embedding。建议将输入的words长度限制在430以内,而不要使用512(根据预训练模型参数,可能有变化)。这是由于 预训练的bert模型长度限制为512个token,而因为输入的word是未进行word piece分割的(word piece的分割有BertEmbedding在输入word 时切分),在分割之后长度可能会超过最大长度限制。 diff --git a/fastNLP/embeddings/char_embedding.py b/fastNLP/embeddings/char_embedding.py index 379d4eee..59109206 100644 --- a/fastNLP/embeddings/char_embedding.py +++ b/fastNLP/embeddings/char_embedding.py @@ -24,8 +24,6 @@ from ..core import logger class CNNCharEmbedding(TokenEmbedding): """ - 别名::class:`fastNLP.embeddings.CNNCharEmbedding` :class:`fastNLP.embeddings.char_embedding.CNNCharEmbedding` - 使用CNN生成character embedding。CNN的结构为, embed(x) -> Dropout(x) -> CNN(x) -> activation(x) -> pool -> fc -> Dropout. 不同的kernel大小的fitler结果是concat起来然后通过一层fully connected layer, 然后输出word的表示。 @@ -179,8 +177,6 @@ class CNNCharEmbedding(TokenEmbedding): class LSTMCharEmbedding(TokenEmbedding): """ - 别名::class:`fastNLP.embeddings.LSTMCharEmbedding` :class:`fastNLP.embeddings.char_embedding.LSTMCharEmbedding` - 使用LSTM的方式对character进行encode. 
embed(x) -> Dropout(x) -> LSTM(x) -> activation(x) -> pool -> Dropout Example:: diff --git a/fastNLP/embeddings/elmo_embedding.py b/fastNLP/embeddings/elmo_embedding.py index d82344e4..0ec0caa0 100644 --- a/fastNLP/embeddings/elmo_embedding.py +++ b/fastNLP/embeddings/elmo_embedding.py @@ -22,8 +22,6 @@ from ..core import logger class ElmoEmbedding(ContextualEmbedding): """ - 别名::class:`fastNLP.embeddings.ElmoEmbedding` :class:`fastNLP.embeddings.elmo_embedding.ElmoEmbedding` - 使用ELMo的embedding。初始化之后,只需要传入words就可以得到对应的embedding。当前支持的使用名称初始化的模型有以下的这些(待补充) Example:: diff --git a/fastNLP/embeddings/embedding.py b/fastNLP/embeddings/embedding.py index 5e7b9803..255b0823 100644 --- a/fastNLP/embeddings/embedding.py +++ b/fastNLP/embeddings/embedding.py @@ -17,8 +17,6 @@ from .utils import get_embeddings class Embedding(nn.Module): """ - 别名::class:`fastNLP.embeddings.Embedding` :class:`fastNLP.embeddings.embedding.Embedding` - 词向量嵌入,支持输入多种方式初始化. 可以通过self.num_embeddings获取词表大小; self.embedding_dim获取embedding的维度. Example:: diff --git a/fastNLP/embeddings/stack_embedding.py b/fastNLP/embeddings/stack_embedding.py index 14781945..e83a275c 100644 --- a/fastNLP/embeddings/stack_embedding.py +++ b/fastNLP/embeddings/stack_embedding.py @@ -17,8 +17,6 @@ from .embedding import TokenEmbedding class StackEmbedding(TokenEmbedding): """ - 别名::class:`fastNLP.embeddings.StackEmbedding` :class:`fastNLP.embeddings.stack_embedding.StackEmbedding` - 支持将多个embedding集合成一个embedding。 Example:: diff --git a/fastNLP/embeddings/static_embedding.py b/fastNLP/embeddings/static_embedding.py index c768f32f..8249aa11 100644 --- a/fastNLP/embeddings/static_embedding.py +++ b/fastNLP/embeddings/static_embedding.py @@ -24,8 +24,6 @@ from ..core import logger class StaticEmbedding(TokenEmbedding): """ - 别名::class:`fastNLP.embeddings.StaticEmbedding` :class:`fastNLP.embeddings.static_embedding.StaticEmbedding` - StaticEmbedding组件. 
给定预训练embedding的名称或路径,根据vocab从embedding中抽取相应的数据(只会将出现在vocab中的词抽取出来, 如果没有找到,则会随机初始化一个值(但如果该word是被标记为no_create_entry的话,则不会单独创建一个值,而是会被指向unk的index))。 当前支持自动下载的预训练vector有以下的几种(待补充); diff --git a/fastNLP/modules/decoder/crf.py b/fastNLP/modules/decoder/crf.py index c13ea50c..e2a751f8 100644 --- a/fastNLP/modules/decoder/crf.py +++ b/fastNLP/modules/decoder/crf.py @@ -15,8 +15,6 @@ from typing import Union def allowed_transitions(tag_vocab:Union[Vocabulary, dict], encoding_type=None, include_start_end=False): """ - 别名::class:`fastNLP.modules.allowed_transitions` :class:`fastNLP.modules.decoder.allowed_transitions` - 给定一个id到label的映射表,返回所有可以跳转的(from_tag_id, to_tag_id)列表。 :param ~fastNLP.Vocabulary,dict tag_vocab: 支持类型为tag或tag-label。只有tag的,比如"B", "M"; 也可以是"B-NN", "M-NN", diff --git a/fastNLP/modules/decoder/mlp.py b/fastNLP/modules/decoder/mlp.py index f6e687a7..3e594de1 100644 --- a/fastNLP/modules/decoder/mlp.py +++ b/fastNLP/modules/decoder/mlp.py @@ -12,8 +12,6 @@ from ..utils import initial_parameter class MLP(nn.Module): """ - 别名::class:`fastNLP.modules.MLP` :class:`fastNLP.modules.decoder.MLP` - 多层感知器 :param List[int] size_layer: 一个int的列表,用来定义MLP的层数,列表中的数字为每一层是hidden数目。MLP的层数为 len(size_layer) - 1 diff --git a/fastNLP/modules/decoder/utils.py b/fastNLP/modules/decoder/utils.py index 118b1414..e0d2af68 100644 --- a/fastNLP/modules/decoder/utils.py +++ b/fastNLP/modules/decoder/utils.py @@ -8,8 +8,6 @@ import torch def viterbi_decode(logits, transitions, mask=None, unpad=False): r""" - 别名::class:`fastNLP.modules.viterbi_decode` :class:`fastNLP.modules.decoder.viterbi_decode` - 给定一个特征矩阵以及转移分数矩阵,计算出最佳的路径以及对应的分数 :param torch.FloatTensor logits: batch_size x max_len x num_tags,特征矩阵。 diff --git a/fastNLP/modules/encoder/attention.py b/fastNLP/modules/encoder/attention.py index 6a973864..0d832653 100644 --- a/fastNLP/modules/encoder/attention.py +++ b/fastNLP/modules/encoder/attention.py @@ -45,7 +45,6 @@ class DotAttention(nn.Module): class MultiHeadAttention(nn.Module): """ - 别名::class:`fastNLP.modules.MultiHeadAttention` :class:`fastNLP.modules.encoder.MultiHeadAttention` :param input_size: int, 输入维度的大小。同时也是输出维度的大小。 :param key_size: int, 每个head的维度大小。 diff --git a/fastNLP/modules/encoder/bert.py b/fastNLP/modules/encoder/bert.py index 6f6c4291..12379718 100644 --- a/fastNLP/modules/encoder/bert.py +++ b/fastNLP/modules/encoder/bert.py @@ -348,8 +348,6 @@ class BertPooler(nn.Module): class BertModel(nn.Module): """ - 别名::class:`fastNLP.modules.BertModel` :class:`fastNLP.modules.encoder.BertModel` - BERT(Bidirectional Embedding Representations from Transformers). 用预训练权重矩阵来建立BERT模型:: diff --git a/fastNLP/modules/encoder/char_encoder.py b/fastNLP/modules/encoder/char_encoder.py index e40bd0dd..dc73f447 100644 --- a/fastNLP/modules/encoder/char_encoder.py +++ b/fastNLP/modules/encoder/char_encoder.py @@ -13,8 +13,6 @@ from ..utils import initial_parameter # from torch.nn.init import xavier_uniform class ConvolutionCharEncoder(nn.Module): """ - 别名::class:`fastNLP.modules.ConvolutionCharEncoder` :class:`fastNLP.modules.encoder.ConvolutionCharEncoder` - char级别的卷积编码器. :param int char_emb_size: char级别embedding的维度. Default: 50 @@ -60,11 +58,7 @@ class ConvolutionCharEncoder(nn.Module): class LSTMCharEncoder(nn.Module): """ - 别名::class:`fastNLP.modules.LSTMCharEncoder` :class:`fastNLP.modules.encoder.LSTMCharEncoder` - char级别基于LSTM的encoder. 
- - """ def __init__(self, char_emb_size=50, hidden_size=None, initial_method=None): diff --git a/fastNLP/modules/encoder/conv_maxpool.py b/fastNLP/modules/encoder/conv_maxpool.py index 68415189..bf629eba 100644 --- a/fastNLP/modules/encoder/conv_maxpool.py +++ b/fastNLP/modules/encoder/conv_maxpool.py @@ -10,8 +10,6 @@ import torch.nn.functional as F class ConvMaxpool(nn.Module): """ - 别名::class:`fastNLP.modules.ConvMaxpool` :class:`fastNLP.modules.encoder.ConvMaxpool` - 集合了Convolution和Max-Pooling于一体的层。给定一个batch_size x max_len x input_size的输入,返回batch_size x sum(output_channels) 大小的matrix。在内部,是先使用CNN给输入做卷积,然后经过activation激活层,在通过在长度(max_len) 这一维进行max_pooling。最后得到每个sample的一个向量表示。 diff --git a/fastNLP/modules/encoder/lstm.py b/fastNLP/modules/encoder/lstm.py index 1f3eae6d..1dd1f0df 100644 --- a/fastNLP/modules/encoder/lstm.py +++ b/fastNLP/modules/encoder/lstm.py @@ -14,8 +14,6 @@ import torch.nn.utils.rnn as rnn class LSTM(nn.Module): """ - 别名::class:`fastNLP.modules.LSTM` :class:`fastNLP.modules.encoder.LSTM` - LSTM 模块, 轻量封装的Pytorch LSTM. 在提供seq_len的情况下,将自动使用pack_padded_sequence; 同时默认将forget gate的bias初始化 为1; 且可以应对DataParallel中LSTM的使用问题。 diff --git a/fastNLP/modules/encoder/pooling.py b/fastNLP/modules/encoder/pooling.py index b1272284..c248601d 100644 --- a/fastNLP/modules/encoder/pooling.py +++ b/fastNLP/modules/encoder/pooling.py @@ -12,8 +12,6 @@ import torch.nn as nn class MaxPool(nn.Module): """ - 别名::class:`fastNLP.modules.MaxPool` :class:`fastNLP.modules.encoder.MaxPool` - Max-pooling模块。 :param stride: 窗口移动大小,默认为kernel_size @@ -61,8 +59,6 @@ class MaxPool(nn.Module): class MaxPoolWithMask(nn.Module): """ - 别名::class:`fastNLP.modules.MaxPoolWithMask` :class:`fastNLP.modules.encoder.MaxPoolWithMask` - 带mask矩阵的max pooling。在做max-pooling的时候不会考虑mask值为0的位置。 """ @@ -101,8 +97,6 @@ class KMaxPool(nn.Module): class AvgPool(nn.Module): """ - 别名::class:`fastNLP.modules.AvgPool` :class:`fastNLP.modules.encoder.AvgPool` - 给定形如[batch_size, max_len, hidden_size]的输入,在最后一维进行avg pooling. 输出为[batch_size, hidden_size] """ @@ -128,8 +122,6 @@ class AvgPool(nn.Module): class AvgPoolWithMask(nn.Module): """ - 别名::class:`fastNLP.modules.AvgPoolWithMask` :class:`fastNLP.modules.encoder.AvgPoolWithMask` - 给定形如[batch_size, max_len, hidden_size]的输入,在最后一维进行avg pooling. 
输出为[batch_size, hidden_size], pooling 的时候只会考虑mask为1的位置 """ diff --git a/fastNLP/modules/encoder/star_transformer.py b/fastNLP/modules/encoder/star_transformer.py index 02d7a6a0..bb47d9b5 100644 --- a/fastNLP/modules/encoder/star_transformer.py +++ b/fastNLP/modules/encoder/star_transformer.py @@ -14,9 +14,6 @@ from torch.nn import functional as F class StarTransformer(nn.Module): """ - 别名::class:`fastNLP.modules.StarTransformer` :class:`fastNLP.modules.encoder.StarTransformer` - - Star-Transformer 的encoder部分。 输入3d的文本输入, 返回相同长度的文本编码 paper: https://arxiv.org/abs/1902.09113 diff --git a/fastNLP/modules/encoder/transformer.py b/fastNLP/modules/encoder/transformer.py index d8a612a0..d29a10c3 100644 --- a/fastNLP/modules/encoder/transformer.py +++ b/fastNLP/modules/encoder/transformer.py @@ -10,9 +10,6 @@ from .attention import MultiHeadAttention class TransformerEncoder(nn.Module): """ - 别名::class:`fastNLP.modules.TransformerEncoder` :class:`fastNLP.modules.encoder.TransformerEncoder` - - transformer的encoder模块,不包含embedding层 :param int num_layers: transformer的层数 diff --git a/fastNLP/modules/encoder/variational_rnn.py b/fastNLP/modules/encoder/variational_rnn.py index 933555c8..17e2ad23 100644 --- a/fastNLP/modules/encoder/variational_rnn.py +++ b/fastNLP/modules/encoder/variational_rnn.py @@ -223,8 +223,6 @@ class VarRNNBase(nn.Module): class VarLSTM(VarRNNBase): """ - 别名::class:`fastNLP.modules.VarLSTM` :class:`fastNLP.modules.encoder.VarLSTM` - Variational Dropout LSTM. :param input_size: 输入 `x` 的特征维度 @@ -248,8 +246,6 @@ class VarLSTM(VarRNNBase): class VarRNN(VarRNNBase): """ - 别名::class:`fastNLP.modules.VarRNN` :class:`fastNLP.modules.encoder.VarRNN` - Variational Dropout RNN. :param input_size: 输入 `x` 的特征维度 @@ -273,8 +269,6 @@ class VarRNN(VarRNNBase): class VarGRU(VarRNNBase): """ - 别名::class:`fastNLP.modules.VarGRU` :class:`fastNLP.modules.encoder.VarGRU` - Variational Dropout GRU. :param input_size: 输入 `x` 的特征维度 diff --git a/reproduction/text_classification/data/sstloader.py b/reproduction/text_classification/data/sstloader.py index b635a14a..4e860279 100644 --- a/reproduction/text_classification/data/sstloader.py +++ b/reproduction/text_classification/data/sstloader.py @@ -11,11 +11,7 @@ from reproduction.utils import check_dataloader_paths, get_tokenizer class SSTLoader(DataSetLoader): - URL = 'https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip' - DATA_DIR = 'sst/' - """ - 别名::class:`fastNLP.io.SSTLoader` :class:`fastNLP.io.dataset_loader.SSTLoader` 读取SST数据集, DataSet包含fields:: words: list(str) 需要分类的文本 target: str 文本的标签 @@ -23,6 +19,10 @@ class SSTLoader(DataSetLoader): :param subtree: 是否将数据展开为子树,扩充数据量. Default: ``False`` :param fine_grained: 是否使用SST-5标准,若 ``False`` , 使用SST-2。Default: ``False`` """ + + URL = 'https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip' + DATA_DIR = 'sst/' + def __init__(self, subtree=False, fine_grained=False): self.subtree = subtree tag_v = {'0': 'very negative', '1': 'negative', '2': 'neutral', diff --git a/reproduction/text_classification/model/awdlstm_module.py b/reproduction/text_classification/model/awdlstm_module.py index 87bfe730..a586ed2d 100644 --- a/reproduction/text_classification/model/awdlstm_module.py +++ b/reproduction/text_classification/model/awdlstm_module.py @@ -17,8 +17,6 @@ from .weight_drop import WeightDrop class LSTM(nn.Module): """ - 别名::class:`fastNLP.modules.LSTM` :class:`fastNLP.modules.encoder.lstm.LSTM` - LSTM 模块, 轻量封装的Pytorch LSTM. 
在提供seq_len的情况下,将自动使用pack_padded_sequence; 同时默认将forget gate的bias初始化 为1; 且可以应对DataParallel中LSTM的使用问题。 From 4caacadeae607ebd0699d05457213321874fb786 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Wed, 4 Sep 2019 14:51:50 +0800 Subject: [PATCH 142/153] delete the alias in files. --- fastNLP/core/batch.py | 2 -- fastNLP/core/callback.py | 23 +++-------------------- fastNLP/core/dataset.py | 2 -- fastNLP/core/field.py | 6 ------ fastNLP/core/instance.py | 2 -- fastNLP/core/losses.py | 12 ------------ fastNLP/core/metrics.py | 7 ------- fastNLP/core/optimizer.py | 5 ----- fastNLP/core/sampler.py | 9 --------- fastNLP/core/tester.py | 2 -- fastNLP/core/trainer.py | 2 -- fastNLP/core/utils.py | 2 -- fastNLP/core/vocabulary.py | 2 -- fastNLP/io/embed_loader.py | 2 -- fastNLP/io/loader/classification.py | 6 ------ fastNLP/io/loader/conll.py | 2 -- fastNLP/io/loader/csv.py | 2 -- fastNLP/io/loader/json.py | 2 -- fastNLP/io/model_io.py | 4 ---- fastNLP/io/pipe/classification.py | 2 -- fastNLP/io/pipe/pipe.py | 4 +++- fastNLP/models/bert.py | 15 --------------- fastNLP/models/biaffine_parser.py | 8 -------- fastNLP/models/cnn_text_classification.py | 2 -- fastNLP/models/sequence_labeling.py | 4 ---- fastNLP/models/snli.py | 2 -- fastNLP/models/star_transformer.py | 8 -------- fastNLP/modules/decoder/crf.py | 5 +---- 28 files changed, 7 insertions(+), 137 deletions(-) diff --git a/fastNLP/core/batch.py b/fastNLP/core/batch.py index ff710b30..ad07341a 100644 --- a/fastNLP/core/batch.py +++ b/fastNLP/core/batch.py @@ -145,8 +145,6 @@ class BatchIter: class DataSetIter(BatchIter): """ - 别名::class:`fastNLP.DataSetIter` :class:`fastNLP.core.batch.DataSetIter` - DataSetIter 用于从 `DataSet` 中按一定的顺序, 依次按 ``batch_size`` 的大小将数据取出, 组成 `x` 和 `y`:: diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py index 5167b09f..3cdc0f8d 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -96,8 +96,6 @@ except: class Callback(object): """ - 别名::class:`fastNLP.Callback` :class:`fastNLP.core.callback.Callback` - Callback是fastNLP中被设计用于增强 :class:`~fastNLP.Trainer` 的类。 如果Callback被传递给了 Trainer , 则 Trainer 会在对应的阶段调用Callback的函数, 具体调用时机可以通过 :doc:`trainer 模块` 查看。 @@ -436,8 +434,6 @@ class DistCallbackManager(CallbackManager): class GradientClipCallback(Callback): """ - 别名::class:`fastNLP.GradientClipCallback` :class:`fastNLP.core.callback.GradientClipCallback` - 每次backward前,将parameter的gradient clip到某个范围。 :param None,torch.Tensor,List[torch.Tensor] parameters: 一般通过model.parameters()获得。 @@ -481,8 +477,6 @@ class GradientClipCallback(Callback): class EarlyStopCallback(Callback): """ - 别名::class:`fastNLP.EarlyStopCallback` :class:`fastNLP.core.callback.EarlyStopCallback` - 多少个epoch没有变好就停止训练,相关类 :class:`EarlyStopError` :param int patience: epoch的数量 @@ -512,12 +506,10 @@ class EarlyStopCallback(Callback): class FitlogCallback(Callback): """ - 别名: :class:`fastNLP.FitlogCallback` :class:`fastNLP.core.callback.FitlogCallback` - 该callback可将loss和progress写入到fitlog中; 如果Trainer有dev的数据,将自动把dev的结果写入到log中; 同时还支持传入 - 一个(或多个)test数据集进行测试(只有在trainer具有dev时才能使用),每次在dev上evaluate之后会在这些数据集上验证一下。 - 并将验证结果写入到fitlog中。这些数据集的结果是根据dev上最好的结果报道的,即如果dev在第3个epoch取得了最佳,则 - fitlog中记录的关于这些数据集的结果就是来自第三个epoch的结果。 + 一个(或多个)test数据集进行测试(只有在trainer具有dev时才能使用),每次在dev上evaluate之后会在这些数据集上验证一下。 + 并将验证结果写入到fitlog中。这些数据集的结果是根据dev上最好的结果报道的,即如果dev在第3个epoch取得了最佳,则 + fitlog中记录的关于这些数据集的结果就是来自第三个epoch的结果。 :param ~fastNLP.DataSet,Dict[~fastNLP.DataSet] data: 传入DataSet对象,会使用多个Trainer中的metric对数据进行验证。如果需要 
传入多个DataSet请通过dict的方式传入,dict的key将作为对应dataset的name传递给fitlog。data的结果的名称以'data'开头。 @@ -611,8 +603,6 @@ class FitlogCallback(Callback): class EvaluateCallback(Callback): """ - 别名: :class:`fastNLP.EvaluateCallback` :class:`fastNLP.core.callback.EvaluateCallback` - 该callback用于扩展Trainer训练过程中只能对dev数据进行验证的问题。 :param ~fastNLP.DataSet,Dict[~fastNLP.DataSet] data: 传入DataSet对象,会使用多个Trainer中的metric对数据进行验证。如果需要传入多个 @@ -673,8 +663,6 @@ class EvaluateCallback(Callback): class LRScheduler(Callback): """ - 别名::class:`fastNLP.LRScheduler` :class:`fastNLP.core.callback.LRScheduler` - 对PyTorch LR Scheduler的包装以使得其可以被Trainer所使用 :param torch.optim.lr_scheduler._LRScheduler lr_scheduler: PyTorch的lr_scheduler @@ -695,7 +683,6 @@ class LRScheduler(Callback): class ControlC(Callback): """ - 别名::class:`fastNLP.ControlC` :class:`fastNLP.core.callback.ControlC` :param bool quit_all: 若为True,则检测到control+C 直接退出程序;否则只退出Trainer """ @@ -732,8 +719,6 @@ class SmoothValue(object): class LRFinder(Callback): """ - 别名::class:`fastNLP.LRFinder` :class:`fastNLP.core.callback.LRFinder` - 用第一个 epoch 找最佳的学习率,从第二个epoch开始应用它 :param float start_lr: 学习率下界 @@ -804,8 +789,6 @@ class LRFinder(Callback): class TensorboardCallback(Callback): """ - 别名::class:`fastNLP.TensorboardCallback` :class:`fastNLP.core.callback.TensorboardCallback` - 接受以下一个或多个字符串作为参数: - "model" - "loss" diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 551cf1f8..441f9907 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -304,8 +304,6 @@ from ._logger import logger class DataSet(object): """ - 别名::class:`fastNLP.DataSet` :class:`fastNLP.core.dataset.DataSet` - fastNLP的数据容器,详细的使用方法见文档 :doc:`fastNLP.core.dataset` :param data: 如果为dict类型,则每个key的value应该为等长的list; 如果为list, diff --git a/fastNLP/core/field.py b/fastNLP/core/field.py index 859dfb1f..468c248d 100644 --- a/fastNLP/core/field.py +++ b/fastNLP/core/field.py @@ -464,8 +464,6 @@ def _get_ele_type_and_dim(cell: Any, dim=0): class Padder: """ - 别名::class:`fastNLP.Padder` :class:`fastNLP.core.field.Padder` - 所有padder都需要继承这个类,并覆盖__call__方法。 用于对batch进行padding操作。传入的element是inplace的,即直接修改element可能导致数据变化,建议inplace修改之前deepcopy一份。 @@ -534,8 +532,6 @@ class Padder: class AutoPadder(Padder): """ - 别名::class:`fastNLP.AutoPadder` :class:`fastNLP.core.field.AutoPadder` - 根据contents的数据自动判定是否需要做padding。 1 如果元素类型(元素类型是指field中最里层元素的数据类型, 可以通过FieldArray.dtype查看,比如['This', 'is', ...]的元素类 @@ -628,8 +624,6 @@ class AutoPadder(Padder): class EngChar2DPadder(Padder): """ - 别名::class:`fastNLP.EngChar2DPadder` :class:`fastNLP.core.field.EngChar2DPadder` - 用于为英语执行character级别的2D padding操作。对应的field内容应该类似[['T', 'h', 'i', 's'], ['a'], ['d', 'e', 'm', 'o']], 但这个Padder只能处理index为int的情况。 diff --git a/fastNLP/core/instance.py b/fastNLP/core/instance.py index 9a5d9edf..2285e4a4 100644 --- a/fastNLP/core/instance.py +++ b/fastNLP/core/instance.py @@ -10,8 +10,6 @@ __all__ = [ class Instance(object): """ - 别名::class:`fastNLP.Instance` :class:`fastNLP.core.instance.Instance` - Instance是fastNLP中对应一个sample的类。每个sample在fastNLP中是一个Instance对象。 Instance一般与 :class:`~fastNLP.DataSet` 一起使用, Instance的初始化如下面的Example所示:: diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index 7402a568..b2f5ce0a 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -167,8 +167,6 @@ class LossBase(object): class LossFunc(LossBase): """ - 别名::class:`fastNLP.LossFunc` :class:`fastNLP.core.losses.LossFunc` - 提供给用户使用自定义损失函数的类 :param func: 用户自行定义的损失函数,应当为一个函数或者callable(func)为True的ojbect @@ -200,8 +198,6 @@ class LossFunc(LossBase): class 
CrossEntropyLoss(LossBase): """ - 别名::class:`fastNLP.CrossEntropyLoss` :class:`fastNLP.core.losses.CrossEntropyLoss` - 交叉熵损失函数 :param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred` @@ -248,8 +244,6 @@ class CrossEntropyLoss(LossBase): class L1Loss(LossBase): """ - 别名::class:`fastNLP.L1Loss` :class:`fastNLP.core.losses.L1Loss` - L1损失函数 :param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred` @@ -270,8 +264,6 @@ class L1Loss(LossBase): class BCELoss(LossBase): """ - 别名::class:`fastNLP.BCELoss` :class:`fastNLP.core.losses.BCELoss` - 二分类交叉熵损失函数 :param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred` @@ -291,8 +283,6 @@ class BCELoss(LossBase): class NLLLoss(LossBase): """ - 别名::class:`fastNLP.NLLLoss` :class:`fastNLP.core.losses.NLLLoss` - 负对数似然损失函数 :param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred` @@ -315,8 +305,6 @@ class NLLLoss(LossBase): class LossInForward(LossBase): """ - 别名::class:`fastNLP.LossInForward` :class:`fastNLP.core.losses.LossInForward` - 从forward()函数返回结果中获取loss :param str loss_key: 在forward函数中loss的键名,默认为loss diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index c0f14c90..2dc6d9d8 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -294,9 +294,6 @@ class MetricBase(object): class AccuracyMetric(MetricBase): """ - - 别名::class:`fastNLP.AccuracyMetric` :class:`fastNLP.core.metrics.AccuracyMetric` - 准确率Metric(其它的Metric参见 :doc:`fastNLP.core.metrics` ) :param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred` @@ -565,8 +562,6 @@ def _check_tag_vocab_and_encoding_type(tag_vocab:Union[Vocabulary, dict], encodi class SpanFPreRecMetric(MetricBase): r""" - 别名::class:`fastNLP.SpanFPreRecMetric` :class:`fastNLP.core.metrics.SpanFPreRecMetric` - 在序列标注问题中,以span的方式计算F, pre, rec. 比如中文Part of speech中,会以character的方式进行标注,句子 `中国在亚洲` 对应的POS可能为(以BMES为例) ['B-NN', 'E-NN', 'S-DET', 'B-NN', 'E-NN']。该metric就是为类似情况下的F1计算。 @@ -832,8 +827,6 @@ def _pred_topk(y_prob, k=1): class ExtractiveQAMetric(MetricBase): r""" - 别名::class:`fastNLP.ExtractiveQAMetric` :class:`fastNLP.core.metrics.ExtractiveQAMetric` - 抽取式QA(如SQuAD)的metric. :param pred1: 参数映射表中 `pred1` 的映射关系,None表示映射关系为 `pred1` -> `pred1` diff --git a/fastNLP/core/optimizer.py b/fastNLP/core/optimizer.py index e95047b4..c30c7e34 100644 --- a/fastNLP/core/optimizer.py +++ b/fastNLP/core/optimizer.py @@ -17,7 +17,6 @@ from torch.optim.optimizer import Optimizer as TorchOptimizer class Optimizer(object): """ - 别名::class:`fastNLP.Optimizer` :class:`fastNLP.core.optimizer.Optimizer` :param model_params: a generator. E.g. ``model.parameters()`` for PyTorch models. :param kwargs: additional parameters. @@ -60,7 +59,6 @@ class NullOptimizer(Optimizer): class SGD(Optimizer): """ - 别名::class:`fastNLP.SGD` :class:`fastNLP.core.optimizer.SGD` :param float lr: learning rate. Default: 0.01 :param float momentum: momentum. Default: 0 @@ -82,7 +80,6 @@ class SGD(Optimizer): class Adam(Optimizer): """ - 别名::class:`fastNLP.Adam` :class:`fastNLP.core.optimizer.Adam` :param float lr: learning rate :param float weight_decay: @@ -105,8 +102,6 @@ class Adam(Optimizer): class AdamW(TorchOptimizer): r""" - 别名::class:`fastNLP.AdamW` :class:`fastNLP.core.optimizer.AdamW` - 对AdamW的实现,该实现应该会在pytorch更高版本中出现,https://github.com/pytorch/pytorch/pull/21250。这里提前加入 .. 
todo:: diff --git a/fastNLP/core/sampler.py b/fastNLP/core/sampler.py index 9ca04fa0..d0df9129 100644 --- a/fastNLP/core/sampler.py +++ b/fastNLP/core/sampler.py @@ -15,9 +15,6 @@ import numpy as np class Sampler(object): """ - 别名::class:`fastNLP.Sampler` :class:`fastNLP.core.sampler.Sampler` - - `Sampler` 类的基类. 规定以何种顺序取出data中的元素 子类必须实现 ``__call__`` 方法. 输入 `DataSet` 对象, 返回其中元素的下标序列 @@ -33,8 +30,6 @@ class Sampler(object): class SequentialSampler(Sampler): """ - 别名::class:`fastNLP.SequentialSampler` :class:`fastNLP.core.sampler.SequentialSampler` - 顺序取出元素的 `Sampler` """ @@ -45,8 +40,6 @@ class SequentialSampler(Sampler): class RandomSampler(Sampler): """ - 别名::class:`fastNLP.RandomSampler` :class:`fastNLP.core.sampler.RandomSampler` - 随机化取元素的 `Sampler` """ @@ -57,8 +50,6 @@ class RandomSampler(Sampler): class BucketSampler(Sampler): """ - 别名::class:`fastNLP.BucketSampler` :class:`fastNLP.core.sampler.BucketSampler` - 带Bucket的 `Random Sampler`. 可以随机地取出长度相似的元素 :param int num_buckets: bucket的数量 diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index e549df81..344e24a8 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -65,8 +65,6 @@ __all__ = [ class Tester(object): """ - 别名::class:`fastNLP.Tester` :class:`fastNLP.core.tester.Tester` - Tester是在提供数据,模型以及metric的情况下进行性能测试的类。需要传入模型,数据以及metric进行验证。 :param ~fastNLP.DataSet data: 需要测试的数据集 diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index a47f108b..9f262fb5 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -357,8 +357,6 @@ from ._logger import logger class Trainer(object): """ - 别名::class:`fastNLP.Trainer` :class:`fastNLP.core.trainer.Trainer` - Trainer在fastNLP中用于组织单任务的训练过程,可以避免用户在不同训练任务中重复撰写 (1) epoch循环; (2) 将数据分成不同的Batch; diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index fcb2a07b..814e0bd5 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -66,8 +66,6 @@ def _prepare_cache_filepath(filepath): def cache_results(_cache_fp, _refresh=False, _verbose=1): """ - 别名::class:`fastNLP.cache_results` :class:`fastNLP.core.uitls.cache_results` - cache_results是fastNLP中用于cache数据的装饰器。通过下面的例子看一下如何使用:: import time diff --git a/fastNLP/core/vocabulary.py b/fastNLP/core/vocabulary.py index b0f9650a..d4ff6077 100644 --- a/fastNLP/core/vocabulary.py +++ b/fastNLP/core/vocabulary.py @@ -66,8 +66,6 @@ def _check_build_status(func): class Vocabulary(object): """ - 别名::class:`fastNLP.Vocabulary` :class:`fastNLP.core.vocabulary.Vocabulary` - 用于构建, 存储和使用 `str` 到 `int` 的一一映射:: vocab = Vocabulary() diff --git a/fastNLP/io/embed_loader.py b/fastNLP/io/embed_loader.py index a157901f..73a7a1de 100644 --- a/fastNLP/io/embed_loader.py +++ b/fastNLP/io/embed_loader.py @@ -33,8 +33,6 @@ class EmbeddingOption(Option): class EmbedLoader: """ - 别名::class:`fastNLP.io.EmbedLoader` :class:`fastNLP.io.embed_loader.EmbedLoader` - 用于读取预训练的embedding, 读取结果可直接载入为模型参数。 """ diff --git a/fastNLP/io/loader/classification.py b/fastNLP/io/loader/classification.py index 4ebd58e1..9efcf5d2 100644 --- a/fastNLP/io/loader/classification.py +++ b/fastNLP/io/loader/classification.py @@ -24,8 +24,6 @@ from ...core.instance import Instance class YelpLoader(Loader): """ - 别名::class:`fastNLP.io.YelpLoader` :class:`fastNLP.io.loader.YelpLoader` - 原始数据中内容应该为, 每一行为一个sample,第一个逗号之前为target,第一个逗号之后为文本内容。 Example:: @@ -164,8 +162,6 @@ class YelpPolarityLoader(YelpLoader): class IMDBLoader(Loader): """ - 别名::class:`fastNLP.io.IMDBLoader` :class:`fastNLP.io.loader.IMDBLoader` - IMDBLoader读取后的数据将具有以下两列内容: raw_words: str, 
需要分类的文本; target: str, 文本的标签 DataSet具备以下的结构: @@ -244,8 +240,6 @@ class IMDBLoader(Loader): class SSTLoader(Loader): """ - 别名::class:`fastNLP.io.SSTLoader` :class:`fastNLP.io.loader.SSTLoader` - 读取之后的DataSet具有以下的结构 .. csv-table:: 下面是使用SSTLoader读取的DataSet所具备的field diff --git a/fastNLP/io/loader/conll.py b/fastNLP/io/loader/conll.py index 1bd1b448..f30b031f 100644 --- a/fastNLP/io/loader/conll.py +++ b/fastNLP/io/loader/conll.py @@ -27,8 +27,6 @@ from ...core.instance import Instance class ConllLoader(Loader): """ - 别名::class:`fastNLP.io.ConllLoader` :class:`fastNLP.io.loader.ConllLoader` - ConllLoader支持读取的数据格式: 以空行隔开两个sample,除了分割行,每一行用空格或者制表符隔开不同的元素。如下例所示: Example:: diff --git a/fastNLP/io/loader/csv.py b/fastNLP/io/loader/csv.py index 0d6e35fa..aaf38c00 100644 --- a/fastNLP/io/loader/csv.py +++ b/fastNLP/io/loader/csv.py @@ -12,8 +12,6 @@ from ...core.instance import Instance class CSVLoader(Loader): """ - 别名::class:`fastNLP.io.CSVLoader` :class:`fastNLP.io.loader.CSVLoader` - 读取CSV格式的数据集, 返回 ``DataSet`` 。 :param List[str] headers: CSV文件的文件头.定义每一列的属性名称,即返回的DataSet中`field`的名称 diff --git a/fastNLP/io/loader/json.py b/fastNLP/io/loader/json.py index 012dee5a..671769fe 100644 --- a/fastNLP/io/loader/json.py +++ b/fastNLP/io/loader/json.py @@ -12,8 +12,6 @@ from ...core.instance import Instance class JsonLoader(Loader): """ - 别名::class:`fastNLP.io.JsonLoader` :class:`fastNLP.io.loader.JsonLoader` - 读取json格式数据.数据必须按行存储,每行是一个包含各类属性的json对象 :param dict fields: 需要读入的json属性名称, 和读入后在DataSet中存储的field_name diff --git a/fastNLP/io/model_io.py b/fastNLP/io/model_io.py index a1899f51..9da921df 100644 --- a/fastNLP/io/model_io.py +++ b/fastNLP/io/model_io.py @@ -11,8 +11,6 @@ import torch class ModelLoader: """ - 别名::class:`fastNLP.io.ModelLoader` :class:`fastNLP.io.model_io.ModelLoader` - 用于读取模型 """ @@ -41,8 +39,6 @@ class ModelLoader: class ModelSaver(object): """ - 别名::class:`fastNLP.io.ModelSaver` :class:`fastNLP.io.model_io.ModelSaver` - 用于保存模型 Example:: diff --git a/fastNLP/io/pipe/classification.py b/fastNLP/io/pipe/classification.py index d1c7aa0e..3834a570 100644 --- a/fastNLP/io/pipe/classification.py +++ b/fastNLP/io/pipe/classification.py @@ -228,8 +228,6 @@ class YelpPolarityPipe(_CLSPipe): class SSTPipe(_CLSPipe): """ - 别名::class:`fastNLP.io.SSTPipe` :class:`fastNLP.io.pipe.SSTPipe` - 经过该Pipe之后,DataSet中具备的field如下所示 .. csv-table:: 下面是使用SSTPipe处理后的DataSet所具备的field diff --git a/fastNLP/io/pipe/pipe.py b/fastNLP/io/pipe/pipe.py index 12d9c1cb..db65ece6 100644 --- a/fastNLP/io/pipe/pipe.py +++ b/fastNLP/io/pipe/pipe.py @@ -9,7 +9,9 @@ from .. import DataBundle class Pipe: """ - 别名::class:`fastNLP.io.Pipe` :class:`fastNLP.io.pipe.Pipe` + .. todo:: + doc + """ def process(self, data_bundle: DataBundle) -> DataBundle: """ diff --git a/fastNLP/models/bert.py b/fastNLP/models/bert.py index 4a04bd6d..85c3af8c 100644 --- a/fastNLP/models/bert.py +++ b/fastNLP/models/bert.py @@ -44,9 +44,6 @@ from ..embeddings import BertEmbedding class BertForSequenceClassification(BaseModel): """ - 别名: :class:`fastNLP.models.BertForSequenceClassification` - :class:`fastNLP.models.bert.BertForSequenceClassification` - BERT model for classification. :param fastNLP.embeddings.BertEmbedding embed: 下游模型的编码器(encoder). @@ -90,9 +87,6 @@ class BertForSequenceClassification(BaseModel): class BertForSentenceMatching(BaseModel): """ - 别名: :class:`fastNLP.models.BertForSentenceMatching` - :class:`fastNLP.models.bert.BertForSentenceMatching` - BERT model for sentence matching. 
:param fastNLP.embeddings.BertEmbedding embed: 下游模型的编码器(encoder). @@ -135,9 +129,6 @@ class BertForSentenceMatching(BaseModel): class BertForMultipleChoice(BaseModel): """ - 别名: :class:`fastNLP.models.BertForMultipleChoice` - :class:`fastNLP.models.bert.BertForMultipleChoice` - BERT model for multiple choice. :param fastNLP.embeddings.BertEmbedding embed: 下游模型的编码器(encoder). @@ -185,9 +176,6 @@ class BertForMultipleChoice(BaseModel): class BertForTokenClassification(BaseModel): """ - 别名: :class:`fastNLP.models.BertForTokenClassification` - :class:`fastNLP.models.bert.BertForTokenClassification` - BERT model for token classification. :param fastNLP.embeddings.BertEmbedding embed: 下游模型的编码器(encoder). @@ -231,9 +219,6 @@ class BertForTokenClassification(BaseModel): class BertForQuestionAnswering(BaseModel): """ - 别名: :class:`fastNLP.models.BertForQuestionAnswering` - :class:`fastNLP.models.bert.BertForQuestionAnswering` - BERT model for classification. :param fastNLP.embeddings.BertEmbedding embed: 下游模型的编码器(encoder). diff --git a/fastNLP/models/biaffine_parser.py b/fastNLP/models/biaffine_parser.py index 455d27a7..5d094472 100644 --- a/fastNLP/models/biaffine_parser.py +++ b/fastNLP/models/biaffine_parser.py @@ -130,8 +130,6 @@ def _find_cycle(vertices, edges): class GraphParser(BaseModel): """ - 别名::class:`fastNLP.models.GraphParser` :class:`fastNLP.models.baffine_parser.GraphParser` - 基于图的parser base class, 支持贪婪解码和最大生成树解码 """ @@ -240,8 +238,6 @@ class LabelBilinear(nn.Module): class BiaffineParser(GraphParser): """ - 别名::class:`fastNLP.models.BiaffineParser` :class:`fastNLP.models.baffine_parser.BiaffineParser` - Biaffine Dependency Parser 实现. 论文参考 `Deep Biaffine Attention for Neural Dependency Parsing (Dozat and Manning, 2016) `_ . @@ -475,8 +471,6 @@ class BiaffineParser(GraphParser): class ParserLoss(LossFunc): """ - 别名::class:`fastNLP.models.ParserLoss` :class:`fastNLP.models.baffine_parser.ParserLoss` - 计算parser的loss :param pred1: [batch_size, seq_len, seq_len] 边预测logits @@ -500,8 +494,6 @@ class ParserLoss(LossFunc): class ParserMetric(MetricBase): """ - 别名::class:`fastNLP.models.ParserMetric` :class:`fastNLP.models.baffine_parser.ParserMetric` - 评估parser的性能 :param pred1: 边预测logits diff --git a/fastNLP/models/cnn_text_classification.py b/fastNLP/models/cnn_text_classification.py index 4bf9c4d1..65c20a55 100644 --- a/fastNLP/models/cnn_text_classification.py +++ b/fastNLP/models/cnn_text_classification.py @@ -18,8 +18,6 @@ from ..modules import encoder class CNNText(torch.nn.Module): """ - 别名::class:`fastNLP.models.CNNText` :class:`fastNLP.models.cnn_text_classification.CNNText` - 使用CNN进行文本分类的模型 'Yoon Kim. 2014. Convolution Neural Networks for Sentence Classification.' 
diff --git a/fastNLP/models/sequence_labeling.py b/fastNLP/models/sequence_labeling.py index 6e839bea..d5bc250b 100644 --- a/fastNLP/models/sequence_labeling.py +++ b/fastNLP/models/sequence_labeling.py @@ -77,8 +77,6 @@ class BiLSTMCRF(BaseModel): class SeqLabeling(BaseModel): """ - 别名::class:`fastNLP.models.SeqLabeling` :class:`fastNLP.models.sequence_labeling.SeqLabeling` - 一个基础的Sequence labeling的模型。 用于做sequence labeling的基础类。结构包含一层Embedding,一层LSTM(单向,一层),一层FC,以及一层CRF。 @@ -156,8 +154,6 @@ class SeqLabeling(BaseModel): class AdvSeqLabel(nn.Module): """ - 别名::class:`fastNLP.models.AdvSeqLabel` :class:`fastNLP.models.sequence_labeling.AdvSeqLabel` - 更复杂的Sequence Labelling模型。结构为Embedding, LayerNorm, 双向LSTM(两层),FC,LayerNorm,DropOut,FC,CRF。 :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray embed: Embedding的大小(传入tuple(int, int), diff --git a/fastNLP/models/snli.py b/fastNLP/models/snli.py index 97a14e9f..07303ddc 100644 --- a/fastNLP/models/snli.py +++ b/fastNLP/models/snli.py @@ -19,8 +19,6 @@ from ..embeddings.embedding import TokenEmbedding, Embedding class ESIM(BaseModel): """ - 别名::class:`fastNLP.models.ESIM` :class:`fastNLP.models.snli.ESIM` - ESIM model的一个PyTorch实现 论文参见: https://arxiv.org/pdf/1609.06038.pdf diff --git a/fastNLP/models/star_transformer.py b/fastNLP/models/star_transformer.py index 7fe0d343..e4d5af84 100644 --- a/fastNLP/models/star_transformer.py +++ b/fastNLP/models/star_transformer.py @@ -19,8 +19,6 @@ from ..core.const import Const class StarTransEnc(nn.Module): """ - 别名::class:`fastNLP.models.StarTransEnc` :class:`fastNLP.models.star_transformer.StarTransEnc` - 带word embedding的Star-Transformer Encoder :param embed: 单词词典, 可以是 tuple, 包括(num_embedings, embedding_dim), 即 @@ -104,8 +102,6 @@ class _NLICls(nn.Module): class STSeqLabel(nn.Module): """ - 别名::class:`fastNLP.models.STSeqLabel` :class:`fastNLP.models.star_transformer.STSeqLabel` - 用于序列标注的Star-Transformer模型 :param embed: 单词词典, 可以是 tuple, 包括(num_embedings, embedding_dim), 即 @@ -169,8 +165,6 @@ class STSeqLabel(nn.Module): class STSeqCls(nn.Module): """ - 别名::class:`fastNLP.models.STSeqCls` :class:`fastNLP.models.star_transformer.STSeqCls` - 用于分类任务的Star-Transformer :param embed: 单词词典, 可以是 tuple, 包括(num_embedings, embedding_dim), 即 @@ -234,8 +228,6 @@ class STSeqCls(nn.Module): class STNLICls(nn.Module): """ - 别名::class:`fastNLP.models.STNLICls` :class:`fastNLP.models.star_transformer.STNLICls` - 用于自然语言推断(NLI)的Star-Transformer :param embed: 单词词典, 可以是 tuple, 包括(num_embedings, embedding_dim), 即 diff --git a/fastNLP/modules/decoder/crf.py b/fastNLP/modules/decoder/crf.py index e2a751f8..aeb73d76 100644 --- a/fastNLP/modules/decoder/crf.py +++ b/fastNLP/modules/decoder/crf.py @@ -166,10 +166,7 @@ def _is_transition_allowed(encoding_type, from_tag, from_label, to_tag, to_label class ConditionalRandomField(nn.Module): """ - 别名::class:`fastNLP.modules.ConditionalRandomField` :class:`fastNLP.modules.decoder.ConditionalRandomField` - - 条件随机场。 - 提供forward()以及viterbi_decode()两个方法,分别用于训练与inference。 + 条件随机场。提供forward()以及viterbi_decode()两个方法,分别用于训练与inference。 :param int num_tags: 标签的数量 :param bool include_start_end_trans: 是否考虑各个tag作为开始以及结尾的分数。 From a2e31584883abb68e4d7354ca0c95fc250e35605 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Wed, 4 Sep 2019 15:50:01 +0800 Subject: [PATCH 143/153] update the auto alias tool --- fastNLP/doc_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fastNLP/doc_utils.py b/fastNLP/doc_utils.py index 924b7a6a..5801dd53 100644 --- a/fastNLP/doc_utils.py +++ 
b/fastNLP/doc_utils.py @@ -13,7 +13,8 @@ def doc_process(m): while 1: defined_m = sys.modules[module_name] if "undocumented" not in defined_m.__doc__ and name in defined_m.__all__: - obj.__doc__ = r"定义在 :class:`" + module_name + "." + name + "`\n" + obj.__doc__ + obj.__doc__ = r"别名 :class:`" + m.__name__ + "." + name + "`" \ + + " :class:`" + module_name + "." + name + "`\n" + obj.__doc__ break module_name = ".".join(module_name.split('.')[:-1]) if module_name == m.__name__: From b1fe5f5321a1953b41c544c92d074becde003194 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Wed, 4 Sep 2019 16:53:31 +0800 Subject: [PATCH 144/153] split the class's doc & __init__'s doc (core part) --- fastNLP/core/batch.py | 36 ++++++------ fastNLP/core/callback.py | 115 +++++++++++++++++++++---------------- fastNLP/core/dataset.py | 21 +++---- fastNLP/core/field.py | 12 ++-- fastNLP/core/instance.py | 3 +- fastNLP/core/losses.py | 23 ++++---- fastNLP/core/metrics.py | 66 ++++++++++----------- fastNLP/core/optimizer.py | 56 +++++++++++------- fastNLP/core/predictor.py | 6 +- fastNLP/core/sampler.py | 12 ++-- fastNLP/core/tester.py | 49 ++++++++-------- fastNLP/core/trainer.py | 98 +++++++++++++++---------------- fastNLP/core/vocabulary.py | 28 ++++----- 13 files changed, 286 insertions(+), 239 deletions(-) diff --git a/fastNLP/core/batch.py b/fastNLP/core/batch.py index ad07341a..b14b21de 100644 --- a/fastNLP/core/batch.py +++ b/fastNLP/core/batch.py @@ -9,15 +9,16 @@ __all__ = [ ] import atexit +from numbers import Number import numpy as np import torch import torch.utils.data -from numbers import Number -from .sampler import SequentialSampler -from .dataset import DataSet from ._logger import logger +from .dataset import DataSet +from .sampler import SequentialSampler + _python_is_exit = False @@ -153,23 +154,26 @@ class DataSetIter(BatchIter): for batch_x, batch_y in batch: # do stuff ... - :param dataset: :class:`~fastNLP.DataSet` 对象, 数据集 - :param int batch_size: 取出的batch大小 - :param sampler: 规定使用的 :class:`~fastNLP.Sampler` 方式. 若为 ``None`` , 使用 :class:`~fastNLP.SequentialSampler`. - - Default: ``None`` - :param bool as_numpy: 若为 ``True`` , 输出batch为 numpy.array. 否则为 :class:`torch.Tensor`. - - Default: ``False`` - :param int num_workers: 使用多少个进程来预处理数据 - :param bool pin_memory: 是否将产生的tensor使用pin memory, 可能会加快速度。 - :param bool drop_last: 如果最后一个batch没有batch_size这么多sample,就扔掉最后一个 - :param timeout: - :param worker_init_fn: 在每个worker启动时调用该函数,会传入一个值,该值是worker的index。 """ def __init__(self, dataset, batch_size=1, sampler=None, as_numpy=False, num_workers=0, pin_memory=False, drop_last=False, timeout=0, worker_init_fn=None): + """ + + :param dataset: :class:`~fastNLP.DataSet` 对象, 数据集 + :param int batch_size: 取出的batch大小 + :param sampler: 规定使用的 :class:`~fastNLP.Sampler` 方式. 若为 ``None`` , 使用 :class:`~fastNLP.SequentialSampler`. + + Default: ``None`` + :param bool as_numpy: 若为 ``True`` , 输出batch为 numpy.array. 否则为 :class:`torch.Tensor`. 
+ + Default: ``False`` + :param int num_workers: 使用多少个进程来预处理数据 + :param bool pin_memory: 是否将产生的tensor使用pin memory, 可能会加快速度。 + :param bool drop_last: 如果最后一个batch没有batch_size这么多sample,就扔掉最后一个 + :param timeout: + :param worker_init_fn: 在每个worker启动时调用该函数,会传入一个值,该值是worker的index。 + """ super().__init__() assert isinstance(dataset, DataSet) if not isinstance(sampler, torch.utils.data.Sampler): diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py index 3cdc0f8d..fe198acc 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -317,9 +317,11 @@ def _transfer(func): class CallbackManager(Callback): + """ + 内部使用的Callback管理类 + """ def __init__(self, env, callbacks=None): """ - 内部使用的Callback管理类 :param dict env: The key is the name of the Trainer attribute(str). The value is the attribute itself. :param List[Callback] callbacks: @@ -435,23 +437,23 @@ class DistCallbackManager(CallbackManager): class GradientClipCallback(Callback): """ 每次backward前,将parameter的gradient clip到某个范围。 - - :param None,torch.Tensor,List[torch.Tensor] parameters: 一般通过model.parameters()获得。 - 如果为None则默认对Trainer的model中所有参数进行clip - :param float clip_value: 将gradient 限制到[-clip_value, clip_value]。clip_value应该为正数 - :param str clip_type: 支持'norm', 'value' - 两种:: - - 1 'norm', 将gradient的norm rescale到[-clip_value, clip_value] - - 2 'value', 将gradient限制在[-clip_value, clip_value], - 小于-clip_value的gradient被赋值为-clip_value; - 大于clip_value的gradient被赋值为clip_value. - """ def __init__(self, parameters=None, clip_value=1, clip_type='norm'): + """ + :param None,torch.Tensor,List[torch.Tensor] parameters: 一般通过model.parameters()获得。 + 如果为None则默认对Trainer的model中所有参数进行clip + :param float clip_value: 将gradient 限制到[-clip_value, clip_value]。clip_value应该为正数 + :param str clip_type: 支持'norm', 'value' + 两种:: + + 1 'norm', 将gradient的norm rescale到[-clip_value, clip_value] + + 2 'value', 将gradient限制在[-clip_value, clip_value], + 小于-clip_value的gradient被赋值为-clip_value; + 大于clip_value的gradient被赋值为clip_value. 
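A minimal usage sketch for the two clip modes described above; the Trainer wiring via a callbacks argument is assumed to follow the usual fastNLP pattern and is shown only as a hypothetical::

    from fastNLP import GradientClipCallback

    # 'norm': rescale the gradient norm into [-clip_value, clip_value]
    norm_clip = GradientClipCallback(clip_value=5, clip_type='norm')
    # 'value': clamp every gradient element into [-clip_value, clip_value]
    value_clip = GradientClipCallback(clip_value=5, clip_type='value')

    # trainer = Trainer(..., callbacks=[norm_clip])  # hypothetical wiring
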
+ """ super().__init__() from torch import nn @@ -477,12 +479,14 @@ class GradientClipCallback(Callback): class EarlyStopCallback(Callback): """ - 多少个epoch没有变好就停止训练,相关类 :class:`EarlyStopError` - - :param int patience: epoch的数量 + 多少个epoch没有变好就停止训练,相关类 :class:`~fastNLP.core.callback.EarlyStopError` """ def __init__(self, patience): + """ + + :param int patience: epoch的数量 + """ super(EarlyStopCallback, self).__init__() self.patience = patience self.wait = 0 @@ -510,17 +514,19 @@ class FitlogCallback(Callback): 一个(或多个)test数据集进行测试(只有在trainer具有dev时才能使用),每次在dev上evaluate之后会在这些数据集上验证一下。 并将验证结果写入到fitlog中。这些数据集的结果是根据dev上最好的结果报道的,即如果dev在第3个epoch取得了最佳,则 fitlog中记录的关于这些数据集的结果就是来自第三个epoch的结果。 - - :param ~fastNLP.DataSet,Dict[~fastNLP.DataSet] data: 传入DataSet对象,会使用多个Trainer中的metric对数据进行验证。如果需要 - 传入多个DataSet请通过dict的方式传入,dict的key将作为对应dataset的name传递给fitlog。data的结果的名称以'data'开头。 - :param ~fastNLP.Tester,Dict[~fastNLP.Tester] tester: Tester对象,将在on_valid_end时调用。tester的结果的名称以'tester'开头 - :param int log_loss_every: 多少个step记录一次loss(记录的是这几个batch的loss平均值),如果数据集较大建议将该值设置得 - 大一些,不然会导致log文件巨大。默认为0, 即不要记录loss。 - :param int verbose: 是否在终端打印evaluation的结果,0不打印。 - :param bool log_exception: fitlog是否记录发生的exception信息 """ def __init__(self, data=None, tester=None, log_loss_every=0, verbose=0, log_exception=False): + """ + + :param ~fastNLP.DataSet,Dict[~fastNLP.DataSet] data: 传入DataSet对象,会使用多个Trainer中的metric对数据进行验证。如果需要 + 传入多个DataSet请通过dict的方式传入,dict的key将作为对应dataset的name传递给fitlog。data的结果的名称以'data'开头。 + :param ~fastNLP.Tester,Dict[~fastNLP.Tester] tester: Tester对象,将在on_valid_end时调用。tester的结果的名称以'tester'开头 + :param int log_loss_every: 多少个step记录一次loss(记录的是这几个batch的loss平均值),如果数据集较大建议将该值设置得 + 大一些,不然会导致log文件巨大。默认为0, 即不要记录loss。 + :param int verbose: 是否在终端打印evaluation的结果,0不打印。 + :param bool log_exception: fitlog是否记录发生的exception信息 + """ super().__init__() self.datasets = {} self.testers = {} @@ -604,13 +610,14 @@ class FitlogCallback(Callback): class EvaluateCallback(Callback): """ 该callback用于扩展Trainer训练过程中只能对dev数据进行验证的问题。 - - :param ~fastNLP.DataSet,Dict[~fastNLP.DataSet] data: 传入DataSet对象,会使用多个Trainer中的metric对数据进行验证。如果需要传入多个 - DataSet请通过dict的方式传入。 - :param ~fastNLP.Tester,Dict[~fastNLP.DataSet] tester: Tester对象,将在on_valid_end时调用。 """ def __init__(self, data=None, tester=None): + """ + :param ~fastNLP.DataSet,Dict[~fastNLP.DataSet] data: 传入DataSet对象,会使用多个Trainer中的metric对数据进行验证。如果需要传入多个 + DataSet请通过dict的方式传入。 + :param ~fastNLP.Tester,Dict[~fastNLP.DataSet] tester: Tester对象,将在on_valid_end时调用。 + """ super().__init__() self.datasets = {} self.testers = {} @@ -664,12 +671,12 @@ class EvaluateCallback(Callback): class LRScheduler(Callback): """ 对PyTorch LR Scheduler的包装以使得其可以被Trainer所使用 - - :param torch.optim.lr_scheduler._LRScheduler lr_scheduler: PyTorch的lr_scheduler """ def __init__(self, lr_scheduler): - + """ + :param torch.optim.lr_scheduler._LRScheduler lr_scheduler: PyTorch的lr_scheduler + """ super(LRScheduler, self).__init__() import torch.optim if isinstance(lr_scheduler, torch.optim.lr_scheduler._LRScheduler): @@ -683,12 +690,13 @@ class LRScheduler(Callback): class ControlC(Callback): """ - - :param bool quit_all: 若为True,则检测到control+C 直接退出程序;否则只退出Trainer + 检测到 control+C 时的反馈 """ def __init__(self, quit_all): - + """ + :param bool quit_all: 若为True,则检测到control+C 直接退出程序;否则只退出Trainer + """ super(ControlC, self).__init__() if type(quit_all) != bool: raise ValueError("In KeyBoardInterrupt, quit_all arguemnt must be a bool.") @@ -720,13 +728,14 @@ class SmoothValue(object): class LRFinder(Callback): """ 用第一个 epoch 找最佳的学习率,从第二个epoch开始应用它 - 
- :param float start_lr: 学习率下界 - :param float end_lr: 学习率上界 """ def __init__(self, start_lr=1e-6, end_lr=10): + """ + :param float start_lr: 学习率下界 + :param float end_lr: 学习率上界 + """ super(LRFinder, self).__init__() self.start_lr, self.end_lr = start_lr, end_lr @@ -864,13 +873,15 @@ class TensorboardCallback(Callback): class WarmupCallback(Callback): """ 按一定的周期调节Learning rate的大小。 - - :param int,float warmup: 如果warmup为int,则在该step之前,learning rate根据schedule的策略变化; 如果warmup为float, - 如0.1, 则前10%的step是按照schedule策略调整learning rate。 - :param str schedule: 以哪种方式调整。linear: 前warmup的step上升到指定的learning rate(从Trainer中的optimizer处获取的), 后 - warmup的step下降到0; constant前warmup的step上升到指定learning rate,后面的step保持learning rate. """ def __init__(self, warmup=0.1, schedule='constant'): + """ + + :param int,float warmup: 如果warmup为int,则在该step之前,learning rate根据schedule的策略变化; 如果warmup为float, + 如0.1, 则前10%的step是按照schedule策略调整learning rate。 + :param str schedule: 以哪种方式调整。linear: 前warmup的step上升到指定的learning rate(从Trainer中的optimizer处获取的), 后 + warmup的step下降到0; constant前warmup的step上升到指定learning rate,后面的step保持learning rate. + """ super().__init__() self.warmup = max(warmup, 0.) @@ -920,13 +931,15 @@ class SaveModelCallback(Callback): -epoch:1_step:40_{metric_key}:{evaluate_performance}.pt -2019-07-03-15-10-00 -epoch:0_step:20_{metric_key}:{evaluate_performance}.pt # metric是给定的metric_key, evaluate_perfomance是性能 - - :param str save_dir: 将模型存放在哪个目录下,会在该目录下创建以时间戳命名的目录,并存放模型 - :param int top: 保存dev表现top多少模型。-1为保存所有模型。 - :param bool only_param: 是否只保存模型d饿权重。 - :param save_on_exception: 发生exception时,是否保存一份发生exception的模型。模型名称为epoch:x_step:x_Exception:{exception_name}. """ def __init__(self, save_dir, top=3, only_param=False, save_on_exception=False): + """ + + :param str save_dir: 将模型存放在哪个目录下,会在该目录下创建以时间戳命名的目录,并存放模型 + :param int top: 保存dev表现top多少模型。-1为保存所有模型。 + :param bool only_param: 是否只保存模型d饿权重。 + :param save_on_exception: 发生exception时,是否保存一份发生exception的模型。模型名称为epoch:x_step:x_Exception:{exception_name}. 
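A minimal construction sketch for the parameters documented above; the save directory is only a placeholder::

    from fastNLP.core.callback import SaveModelCallback

    # keep the 3 best checkpoints under ./save and also dump a model on a crash
    save_cb = SaveModelCallback('./save', top=3, only_param=False, save_on_exception=True)
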
+ """ super().__init__() if not os.path.isdir(save_dir): @@ -992,11 +1005,13 @@ class SaveModelCallback(Callback): class CallbackException(BaseException): """ 当需要通过callback跳出训练的时候可以通过抛出CallbackException并在on_exception中捕获这个值。 - - :param str msg: Exception的信息。 """ def __init__(self, msg): + """ + + :param str msg: Exception的信息。 + """ super(CallbackException, self).__init__(msg) diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 441f9907..ebdc780f 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -288,30 +288,31 @@ __all__ = [ ] import _pickle as pickle -import warnings +from copy import deepcopy import numpy as np -from copy import deepcopy +from ._logger import logger +from .const import Const +from .field import AppendToTargetOrInputException from .field import AutoPadder from .field import FieldArray +from .field import SetInputOrTargetException from .instance import Instance from .utils import _get_func_signature -from .field import AppendToTargetOrInputException -from .field import SetInputOrTargetException -from .const import Const -from ._logger import logger + class DataSet(object): """ fastNLP的数据容器,详细的使用方法见文档 :doc:`fastNLP.core.dataset` - - :param data: 如果为dict类型,则每个key的value应该为等长的list; 如果为list, - 每个元素应该为具有相同field的 :class:`~fastNLP.Instance` 。 - """ def __init__(self, data=None): + """ + + :param data: 如果为dict类型,则每个key的value应该为等长的list; 如果为list, + 每个元素应该为具有相同field的 :class:`~fastNLP.Instance` 。 + """ self.field_arrays = {} if data is not None: if isinstance(data, dict): diff --git a/fastNLP/core/field.py b/fastNLP/core/field.py index 468c248d..82fcc523 100644 --- a/fastNLP/core/field.py +++ b/fastNLP/core/field.py @@ -468,18 +468,18 @@ class Padder: 用于对batch进行padding操作。传入的element是inplace的,即直接修改element可能导致数据变化,建议inplace修改之前deepcopy一份。 .. 
py:function:: __call__(self, contents, field_name, field_ele_dtype): + + """ + + def __init__(self, pad_val=0, **kwargs): + """ - 传入的是List内容。假设有以下的DataSet。 - :param List[Any] contents: 传入的element是inplace的,即直接修改element可能导致数据变化,建议inplace修改之前 deepcopy一份。 :param str, field_name: field的名称。 :param np.int64,np.float64,np.str,None, field_ele_dtype: 该field的内层元素的类型。如果该field的ignore_type为True,该这个值为None。 :return: np.array([padded_element]) - - """ - - def __init__(self, pad_val=0, **kwargs): + """ self.pad_val = pad_val def set_pad_val(self, pad_val): diff --git a/fastNLP/core/instance.py b/fastNLP/core/instance.py index 2285e4a4..9460b5e4 100644 --- a/fastNLP/core/instance.py +++ b/fastNLP/core/instance.py @@ -37,7 +37,8 @@ class Instance(object): def items(self): """ 返回一个迭代器,迭代器返回两个内容,第一个内容是field_name, 第二个内容是field_value - :return: + + :return: 一个迭代器 """ return self.fields.items() diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index b2f5ce0a..9b32babb 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -20,7 +20,6 @@ from collections import defaultdict import torch import torch.nn.functional as F -from ..core.const import Const from .utils import _CheckError from .utils import _CheckRes from .utils import _build_args @@ -28,7 +27,7 @@ from .utils import _check_arg_dict_list from .utils import _check_function_or_method from .utils import _get_func_signature from .utils import seq_len_to_mask -import warnings +from ..core.const import Const class LossBase(object): @@ -284,15 +283,17 @@ class BCELoss(LossBase): class NLLLoss(LossBase): """ 负对数似然损失函数 - - :param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred` - :param target: 参数映射表中 `target` 的映射关系,None表示映射关系为 `target` -> `target` - :param ignore_idx: ignore的index,在计算loss时将忽略target中标号为ignore_idx的内容, 可以通过该值代替 - 传入seq_len. - :param str reduction: 支持 `mean` ,`sum` 和 `none` . """ def __init__(self, pred=None, target=None, ignore_idx=-100, reduction='mean'): + """ + + :param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred` + :param target: 参数映射表中 `target` 的映射关系,None表示映射关系为 `target` -> `target` + :param ignore_idx: ignore的index,在计算loss时将忽略target中标号为ignore_idx的内容, 可以通过该值代替 + 传入seq_len. + :param str reduction: 支持 `mean` ,`sum` 和 `none` . 
+ """ super(NLLLoss, self).__init__() self._init_param_map(pred=pred, target=target) assert reduction in ('mean', 'sum', 'none') @@ -306,11 +307,13 @@ class NLLLoss(LossBase): class LossInForward(LossBase): """ 从forward()函数返回结果中获取loss - - :param str loss_key: 在forward函数中loss的键名,默认为loss """ def __init__(self, loss_key=Const.LOSS): + """ + + :param str loss_key: 在forward函数中loss的键名,默认为loss + """ super().__init__() if not isinstance(loss_key, str): raise TypeError(f"Only str allowed for loss_key, got {type(loss_key)}.") diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index 2dc6d9d8..ec1a1864 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -10,7 +10,10 @@ __all__ = [ ] import inspect +import warnings +from abc import abstractmethod from collections import defaultdict +from typing import Union import numpy as np import torch @@ -22,9 +25,7 @@ from .utils import _check_arg_dict_list from .utils import _get_func_signature from .utils import seq_len_to_mask from .vocabulary import Vocabulary -from abc import abstractmethod -import warnings -from typing import Union + class MetricBase(object): """ @@ -295,13 +296,15 @@ class MetricBase(object): class AccuracyMetric(MetricBase): """ 准确率Metric(其它的Metric参见 :doc:`fastNLP.core.metrics` ) - - :param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred` - :param target: 参数映射表中 `target` 的映射关系,None表示映射关系为 `target` -> `target` - :param seq_len: 参数映射表中 `seq_len` 的映射关系,None表示映射关系为 `seq_len` -> `seq_len` """ def __init__(self, pred=None, target=None, seq_len=None): + """ + + :param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred` + :param target: 参数映射表中 `target` 的映射关系,None表示映射关系为 `target` -> `target` + :param seq_len: 参数映射表中 `seq_len` 的映射关系,None表示映射关系为 `seq_len` -> `seq_len` + """ super().__init__() @@ -584,25 +587,23 @@ class SpanFPreRecMetric(MetricBase): 'rec-label':xxx, ... } - - :param tag_vocab: 标签的 :class:`~fastNLP.Vocabulary` 。支持的标签为"B"(没有label);或"B-xxx"(xxx为某种label,比如POS中的NN), - 在解码时,会将相同xxx的认为是同一个label,比如['B-NN', 'E-NN']会被合并为一个'NN'. - :param str pred: 用该key在evaluate()时从传入dict中取出prediction数据。 为None,则使用 `pred` 取数据 - :param str target: 用该key在evaluate()时从传入dict中取出target数据。 为None,则使用 `target` 取数据 - :param str seq_len: 用该key在evaluate()时从传入dict中取出sequence length数据。为None,则使用 `seq_len` 取数据。 - :param str encoding_type: 目前支持bio, bmes, bmeso, bioes。默认为None,通过tag_vocab自动判断. - :param list ignore_labels: str 组成的list. 这个list中的class不会被用于计算。例如在POS tagging时传入['NN'],则不会计算'NN'这 - 个label - :param bool only_gross: 是否只计算总的f1, precision, recall的值;如果为False,不仅返回总的f1, pre, rec, 还会返回每个 - label的f1, pre, rec - :param str f_type: `micro` 或 `macro` . `micro` :通过先计算总体的TP,FN和FP的数量,再计算f, precision, recall; `macro` : - 分布计算每个类别的f, precision, recall,然后做平均(各类别f的权重相同) - :param float beta: f_beta分数, :math:`f_{beta} = \frac{(1 + {beta}^{2})*(pre*rec)}{({beta}^{2}*pre + rec)}` . - 常用为beta=0.5, 1, 2. 若为0.5则精确率的权重高于召回率;若为1,则两者平等;若为2,则召回率权重高于精确率。 """ def __init__(self, tag_vocab, pred=None, target=None, seq_len=None, encoding_type=None, ignore_labels=None, only_gross=True, f_type='micro', beta=1): + r""" + + :param tag_vocab: 标签的 :class:`~fastNLP.Vocabulary` 。支持的标签为"B"(没有label);或"B-xxx"(xxx为某种label,比如POS中的NN), + 在解码时,会将相同xxx的认为是同一个label,比如['B-NN', 'E-NN']会被合并为一个'NN'. 
+ :param str pred: 用该key在evaluate()时从传入dict中取出prediction数据。 为None,则使用 `pred` 取数据 + :param str target: 用该key在evaluate()时从传入dict中取出target数据。 为None,则使用 `target` 取数据 + :param str seq_len: 用该key在evaluate()时从传入dict中取出sequence length数据。为None,则使用 `seq_len` 取数据。 + :param str encoding_type: 目前支持bio, bmes, bmeso, bioes。默认为None,通过tag_vocab自动判断. + :param list ignore_labels: str 组成的list. 这个list中的class不会被用于计算。例如在POS tagging时传入['NN'],则不会计算'NN'个label + :param bool only_gross: 是否只计算总的f1, precision, recall的值;如果为False,不仅返回总的f1, pre, rec, 还会返回每个label的f1, pre, rec + :param str f_type: `micro` 或 `macro` . `micro` :通过先计算总体的TP,FN和FP的数量,再计算f, precision, recall; `macro` : 分布计算每个类别的f, precision, recall,然后做平均(各类别f的权重相同) + :param float beta: f_beta分数, :math:`f_{beta} = \frac{(1 + {beta}^{2})*(pre*rec)}{({beta}^{2}*pre + rec)}` . 常用为 `beta=0.5, 1, 2` 若为0.5则精确率的权重高于召回率;若为1,则两者平等;若为2,则召回率权重高于精确率。 + """ if not isinstance(tag_vocab, Vocabulary): raise TypeError("tag_vocab can only be fastNLP.Vocabulary, not {}.".format(type(tag_vocab))) @@ -829,20 +830,21 @@ class ExtractiveQAMetric(MetricBase): r""" 抽取式QA(如SQuAD)的metric. - :param pred1: 参数映射表中 `pred1` 的映射关系,None表示映射关系为 `pred1` -> `pred1` - :param pred2: 参数映射表中 `pred2` 的映射关系,None表示映射关系为 `pred2` -> `pred2` - :param target1: 参数映射表中 `target1` 的映射关系,None表示映射关系为 `target1` -> `target1` - :param target2: 参数映射表中 `target2` 的映射关系,None表示映射关系为 `target2` -> `target2` - :param float beta: f_beta分数, :math:`f_{beta} = \frac{(1 + {beta}^{2})*(pre*rec)}{({beta}^{2}*pre + rec)}` . - 常用为beta=0.5, 1, 2. 若为0.5则精确率的权重高于召回率;若为1,则两者平等;若为2,则召回率权重高于精确率。 - :param bool right_open: right_open为true表示start跟end指针指向一个左闭右开区间,为false表示指向一个左闭右闭区间。 - :param bool print_predict_stat: True则输出预测答案是否为空与正确答案是否为空的统计信息, False则不输出 - """ def __init__(self, pred1=None, pred2=None, target1=None, target2=None, beta=1, right_open=True, print_predict_stat=False): - + r""" + + :param pred1: 参数映射表中 `pred1` 的映射关系,None表示映射关系为 `pred1` -> `pred1` + :param pred2: 参数映射表中 `pred2` 的映射关系,None表示映射关系为 `pred2` -> `pred2` + :param target1: 参数映射表中 `target1` 的映射关系,None表示映射关系为 `target1` -> `target1` + :param target2: 参数映射表中 `target2` 的映射关系,None表示映射关系为 `target2` -> `target2` + :param float beta: f_beta分数, :math:`f_{beta} = \frac{(1 + {beta}^{2})*(pre*rec)}{({beta}^{2}*pre + rec)}` . + 常用为beta=0.5, 1, 2. 若为0.5则精确率的权重高于召回率;若为1,则两者平等;若为2,则召回率权重高于精确率。 + :param bool right_open: right_open为true表示start跟end指针指向一个左闭右开区间,为false表示指向一个左闭右闭区间。 + :param bool print_predict_stat: True则输出预测答案是否为空与正确答案是否为空的统计信息, False则不输出 + """ super(ExtractiveQAMetric, self).__init__() self._init_param_map(pred1=pred1, pred2=pred2, target1=target1, target2=target2) diff --git a/fastNLP/core/optimizer.py b/fastNLP/core/optimizer.py index c30c7e34..5e7c1cba 100644 --- a/fastNLP/core/optimizer.py +++ b/fastNLP/core/optimizer.py @@ -9,20 +9,23 @@ __all__ = [ "AdamW" ] -import torch import math + import torch from torch.optim.optimizer import Optimizer as TorchOptimizer class Optimizer(object): """ - - :param model_params: a generator. E.g. ``model.parameters()`` for PyTorch models. - :param kwargs: additional parameters. + Optimizer """ def __init__(self, model_params, **kwargs): + """ + + :param model_params: a generator. E.g. ``model.parameters()`` for PyTorch models. + :param kwargs: additional parameters. 
+ """ if model_params is not None and not hasattr(model_params, "__next__"): raise RuntimeError("model parameters should be a generator, rather than {}.".format(type(model_params))) self.model_params = model_params @@ -59,13 +62,15 @@ class NullOptimizer(Optimizer): class SGD(Optimizer): """ - - :param float lr: learning rate. Default: 0.01 - :param float momentum: momentum. Default: 0 - :param model_params: a generator. E.g. ``model.parameters()`` for PyTorch models. + SGD """ def __init__(self, lr=0.001, momentum=0, model_params=None): + """ + :param float lr: learning rate. Default: 0.01 + :param float momentum: momentum. Default: 0 + :param model_params: a generator. E.g. ``model.parameters()`` for PyTorch models. + """ if not isinstance(lr, float): raise TypeError("learning rate has to be float.") super(SGD, self).__init__(model_params, lr=lr, momentum=momentum) @@ -81,12 +86,17 @@ class SGD(Optimizer): class Adam(Optimizer): """ - :param float lr: learning rate - :param float weight_decay: - :param model_params: a generator. E.g. ``model.parameters()`` for PyTorch models. """ def __init__(self, lr=0.001, weight_decay=0, betas=(0.9, 0.999), eps=1e-8, amsgrad=False, model_params=None): + """ + + :param float lr: learning rate + :param float weight_decay: + :param eps: + :param amsgrad: + :param model_params: a generator. E.g. ``model.parameters()`` for PyTorch models. + """ if not isinstance(lr, float): raise TypeError("learning rate has to be float.") super(Adam, self).__init__(model_params, lr=lr, betas=betas, eps=eps, amsgrad=amsgrad, @@ -110,17 +120,6 @@ class AdamW(TorchOptimizer): The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_. The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_. - :param params (iterable): iterable of parameters to optimize or dicts defining - parameter groups - :param lr (float, optional): learning rate (default: 1e-3) - :param betas (Tuple[float, float], optional): coefficients used for computing - running averages of gradient and its square (default: (0.9, 0.99)) - :param eps (float, optional): term added to the denominator to improve - numerical stability (default: 1e-8) - :param weight_decay (float, optional): weight decay coefficient (default: 1e-2) - algorithm from the paper `On the Convergence of Adam and Beyond`_ - (default: False) - .. _Adam\: A Method for Stochastic Optimization: https://arxiv.org/abs/1412.6980 .. 
_Decoupled Weight Decay Regularization: @@ -131,6 +130,19 @@ class AdamW(TorchOptimizer): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False): + """ + + :param params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + :param lr (float, optional): learning rate (default: 1e-3) + :param betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.9, 0.99)) + :param eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + :param weight_decay (float, optional): weight decay coefficient (default: 1e-2) + algorithm from the paper `On the Convergence of Adam and Beyond`_ + (default: False) + """ if not 0.0 <= lr: raise ValueError("Invalid learning rate: {}".format(lr)) if not 0.0 <= eps: diff --git a/fastNLP/core/predictor.py b/fastNLP/core/predictor.py index c6b8fc90..e4112d5f 100644 --- a/fastNLP/core/predictor.py +++ b/fastNLP/core/predictor.py @@ -20,11 +20,13 @@ class Predictor(object): 与测试器(Tester)不同的是,predictor不关心模型性能的评价指标,只做inference。 这是一个fastNLP调用的高级模型包装器。它与Trainer、Tester不共享任何操作。 - - :param torch.nn.Module network: 用来完成预测任务的模型 """ def __init__(self, network): + """ + + :param torch.nn.Module network: 用来完成预测任务的模型 + """ if not isinstance(network, torch.nn.Module): raise ValueError( "Only fastNLP.models.BaseModel or torch.nn,Module is allowed, not {}".format(type(network))) diff --git a/fastNLP/core/sampler.py b/fastNLP/core/sampler.py index d0df9129..6e025688 100644 --- a/fastNLP/core/sampler.py +++ b/fastNLP/core/sampler.py @@ -51,14 +51,16 @@ class RandomSampler(Sampler): class BucketSampler(Sampler): """ 带Bucket的 `Random Sampler`. 可以随机地取出长度相似的元素 - - :param int num_buckets: bucket的数量 - :param int batch_size: batch的大小. 默认为None,Trainer在调用BucketSampler时,会将该值正确设置,如果是非Trainer场景使用,需 - 要显示传递该值 - :param str seq_len_field_name: 对应序列长度的 `field` 的名字 """ def __init__(self, num_buckets=10, batch_size=None, seq_len_field_name='seq_len'): + """ + + :param int num_buckets: bucket的数量 + :param int batch_size: batch的大小. 默认为None,Trainer在调用BucketSampler时,会将该值正确设置,如果是非Trainer场景使用,需 + 要显示传递该值 + :param str seq_len_field_name: 对应序列长度的 `field` 的名字 + """ self.num_buckets = num_buckets self.batch_size = batch_size self.seq_len_field_name = seq_len_field_name diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index 344e24a8..d1d5d41e 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -66,30 +66,32 @@ __all__ = [ class Tester(object): """ Tester是在提供数据,模型以及metric的情况下进行性能测试的类。需要传入模型,数据以及metric进行验证。 - - :param ~fastNLP.DataSet data: 需要测试的数据集 - :param torch.nn.module model: 使用的模型 - :param ~fastNLP.core.metrics.MetricBase,List[~fastNLP.core.metrics.MetricBase] metrics: 测试时使用的metrics - :param int batch_size: evaluation时使用的batch_size有多大。 - :param str,int,torch.device,list(int) device: 将模型load到哪个设备。默认为None,即Trainer不对模型 - 的计算位置进行管理。支持以下的输入: - - 1. str: ['cpu', 'cuda', 'cuda:0', 'cuda:1', ...] 依次为'cpu'中, 可见的第一个GPU中,可见的第一个GPU中,可见的第二个GPU中; - - 2. torch.device:将模型装载到torch.device上。 - - 3. int: 将使用device_id为该值的gpu进行训练 - - 4. list(int):如果多于1个device,将使用torch.nn.DataParallel包裹model, 并使用传入的device。 - - 5. None. 
为None则不对模型进行任何处理,如果传入的model为torch.nn.DataParallel该值必须为None。 - - 如果模型是通过predict()进行预测的话,那么将不能使用多卡(DataParallel)进行验证,只会使用第一张卡上的模型。 - :param int verbose: 如果为0不输出任何信息; 如果为1,打印出验证结果。 - :param bool use_tqdm: 是否使用tqdm来显示测试进度; 如果为False,则不会显示任何内容。 """ def __init__(self, data, model, metrics, batch_size=16, num_workers=0, device=None, verbose=1, use_tqdm=True): + """ + + :param ~fastNLP.DataSet data: 需要测试的数据集 + :param torch.nn.module model: 使用的模型 + :param ~fastNLP.core.metrics.MetricBase,List[~fastNLP.core.metrics.MetricBase] metrics: 测试时使用的metrics + :param int batch_size: evaluation时使用的batch_size有多大。 + :param str,int,torch.device,list(int) device: 将模型load到哪个设备。默认为None,即Trainer不对模型 + 的计算位置进行管理。支持以下的输入: + + 1. str: ['cpu', 'cuda', 'cuda:0', 'cuda:1', ...] 依次为'cpu'中, 可见的第一个GPU中,可见的第一个GPU中,可见的第二个GPU中; + + 2. torch.device:将模型装载到torch.device上。 + + 3. int: 将使用device_id为该值的gpu进行训练 + + 4. list(int):如果多于1个device,将使用torch.nn.DataParallel包裹model, 并使用传入的device。 + + 5. None. 为None则不对模型进行任何处理,如果传入的model为torch.nn.DataParallel该值必须为None。 + + 如果模型是通过predict()进行预测的话,那么将不能使用多卡(DataParallel)进行验证,只会使用第一张卡上的模型。 + :param int verbose: 如果为0不输出任何信息; 如果为1,打印出验证结果。 + :param bool use_tqdm: 是否使用tqdm来显示测试进度; 如果为False,则不会显示任何内容。 + """ super(Tester, self).__init__() if not isinstance(model, nn.Module): @@ -137,10 +139,9 @@ class Tester(object): self._predict_func_wrapper = self._model.forward def test(self): - """开始进行验证,并返回验证结果。 + r"""开始进行验证,并返回验证结果。 - :return Dict[Dict] : dict的二层嵌套结构,dict的第一层是metric的名称; 第二层是这个metric的指标。 - 一个AccuracyMetric的例子为{'AccuracyMetric': {'acc': 1.0}}。 + :return Dict[Dict]: dict的二层嵌套结构,dict的第一层是metric的名称; 第二层是这个metric的指标。一个AccuracyMetric的例子为{'AccuracyMetric': {'acc': 1.0}}。 """ # turn on the testing mode; clean up the history self._model_device = _get_model_device(self._model) diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 9f262fb5..a2c3b1f7 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -365,54 +365,6 @@ class Trainer(object): (5) 保存获得更好验证性能的模型等。 详细的介绍参见 :doc:`fastNLP.core.trainer` - - :param train_data: 训练集, :class:`~fastNLP.DataSet` 类型。 - :param nn.modules model: 待训练的模型 - :param optimizer: `torch.optim.Optimizer` 优化器。如果为None,则Trainer使用默认的Adam(model.parameters(), lr=4e-3)这个优化器 - :param int batch_size: 训练和验证的时候的batch大小。 - :param loss: 使用的 :class:`~fastNLP.core.losses.LossBase` 对象。当为None时,默认使用 :class:`~fastNLP.LossInForward` - :param sampler: Batch数据生成的顺序, :class:`~fastNLP.Sampler` 类型。如果为None,默认使用 :class:`~fastNLP.RandomSampler` - :param drop_last: 如果最后一个batch没有正好为batch_size这么多数据,就扔掉最后一个batch - :param num_workers: int, 有多少个线程来进行数据pad处理。 - :param update_every: int, 多少步更新一次梯度。用于希望累计梯度的场景,比如需要128的batch_size, 但是直接设为128 - 会导致内存不足,通过设置batch_size=32, update_every=4达到目的。当optimizer为None时,该参数无效。 - :param int n_epochs: 需要优化迭代多少次。 - :param int print_every: 多少次反向传播更新tqdm显示的loss; 如果use_tqdm=False, 则多少次反向传播打印loss。 - :param dev_data: 用于做验证的DataSet, :class:`~fastNLP.DataSet` 类型。 - :param metrics: 验证的评估函数。可以只使用一个 :class:`Metric` , - 也可以使用多个 :class:`Metric` ,通过列表传入。 - 如验证时取得了更好的验证结果(如果有多个Metric,以列表中第一个Metric为准),且save_path不为None, - 则保存当前模型。Metric种类详见 :doc:`metrics模块 ` 。仅在传入dev_data时有效。 - :param str,None metric_key: :class:`Metric` 有时会有多个指标, - 比如 :class:`~fastNLP.core.metrics.SpanFPreRecMetric` 中包含了'f', 'pre', 'rec'。此时需 - 要指定以哪个指标为准。另外有些指标是越小效果越好,比如语言模型的困惑度,这种情况下,在key前面增加一个'-'来表 - 明验证时,值越小越好(比如: "-ppl")。仅在传入dev_data时有效。 - :param int validate_every: 多少个step在验证集上验证一次; 如果为-1,则每个epoch结束验证一次。仅在传入dev_data时有效。 - :param str,None save_path: 
将模型保存路径,如果路径不存在,将自动创建文件夹。如果为None,则不保存模型。如果dev_data为None,则保存 - 最后一次迭代的模型。保存的时候不仅保存了参数,还保存了模型结构。即便使用DataParallel,这里也只保存模型。 - :param bool use_tqdm: 是否使用tqdm来显示训练进度; 如果为False,则将loss打印在终端中。 - :param str,int,torch.device,list(int) device: 将模型load到哪个设备。默认为None,即Trainer不对模型 - 的计算位置进行管理。支持以下的输入: - - 1. str: ['cpu', 'cuda', 'cuda:0', 'cuda:1', ...] 依次为'cpu'中, 可见的第一个GPU中, 可见的第一个GPU中, - 可见的第二个GPU中; - - 2. torch.device:将模型装载到torch.device上。 - - 3. int: 将使用device_id为该值的gpu进行训练 - - 4. list(int):如果多于1个device,将使用torch.nn.DataParallel包裹model, 并使用传入的device。 - - 5. None. 为None则不对模型进行任何处理,如果传入的model为torch.nn.DataParallel该值必须为None。 - - 已知可能会出现的问题:Adagrad优化器可能无法正常使用这个参数,请手动管理模型位置。 - - :param list(callbacks) callbacks: 用于在train过程中起调节作用的回调函数。比如early stop,negative sampling等可以 - 通过callback机制实现。 可使用的callback参见 :doc:`callback模块 ` - :param int check_code_level: 模型检查等级. -1: 不进行检查; 0: 仅出现错误时停止; 1: 如果有field没有被使用, - 报告警告信息; 2: 有任何field没有被使用都报错. 检查的原理是通过使用很小的batch(默认2个sample)来运行代码,但是 - 这个过程理论上不会修改任何参数,只是会检查能否运行。但如果(1)模型中存在将batch_size写为某个固定值的情况; - (2)模型中存在累加前向计算次数的,可能会多计算1次。以上情况建议将check_code_level设置为-1。 """ def __init__(self, train_data, model, optimizer=None, loss=None, @@ -421,6 +373,56 @@ class Trainer(object): dev_data=None, metrics=None, metric_key=None, validate_every=-1, save_path=None, use_tqdm=True, device=None, callbacks=None, check_code_level=0, **kwargs): + """ + + :param train_data: 训练集, :class:`~fastNLP.DataSet` 类型。 + :param nn.modules model: 待训练的模型 + :param optimizer: `torch.optim.Optimizer` 优化器。如果为None,则Trainer使用默认的Adam(model.parameters(), lr=4e-3)这个优化器 + :param int batch_size: 训练和验证的时候的batch大小。 + :param loss: 使用的 :class:`~fastNLP.core.losses.LossBase` 对象。当为None时,默认使用 :class:`~fastNLP.LossInForward` + :param sampler: Batch数据生成的顺序, :class:`~fastNLP.Sampler` 类型。如果为None,默认使用 :class:`~fastNLP.RandomSampler` + :param drop_last: 如果最后一个batch没有正好为batch_size这么多数据,就扔掉最后一个batch + :param num_workers: int, 有多少个线程来进行数据pad处理。 + :param update_every: int, 多少步更新一次梯度。用于希望累计梯度的场景,比如需要128的batch_size, 但是直接设为128 + 会导致内存不足,通过设置batch_size=32, update_every=4达到目的。当optimizer为None时,该参数无效。 + :param int n_epochs: 需要优化迭代多少次。 + :param int print_every: 多少次反向传播更新tqdm显示的loss; 如果use_tqdm=False, 则多少次反向传播打印loss。 + :param dev_data: 用于做验证的DataSet, :class:`~fastNLP.DataSet` 类型。 + :param metrics: 验证的评估函数。可以只使用一个 :class:`Metric` , + 也可以使用多个 :class:`Metric` ,通过列表传入。 + 如验证时取得了更好的验证结果(如果有多个Metric,以列表中第一个Metric为准),且save_path不为None, + 则保存当前模型。Metric种类详见 :doc:`metrics模块 ` 。仅在传入dev_data时有效。 + :param str,None metric_key: :class:`Metric` 有时会有多个指标, + 比如 :class:`~fastNLP.core.metrics.SpanFPreRecMetric` 中包含了'f', 'pre', 'rec'。此时需 + 要指定以哪个指标为准。另外有些指标是越小效果越好,比如语言模型的困惑度,这种情况下,在key前面增加一个'-'来表 + 明验证时,值越小越好(比如: "-ppl")。仅在传入dev_data时有效。 + :param int validate_every: 多少个step在验证集上验证一次; 如果为-1,则每个epoch结束验证一次。仅在传入dev_data时有效。 + :param str,None save_path: 将模型保存路径,如果路径不存在,将自动创建文件夹。如果为None,则不保存模型。如果dev_data为None,则保存 + 最后一次迭代的模型。保存的时候不仅保存了参数,还保存了模型结构。即便使用DataParallel,这里也只保存模型。 + :param bool use_tqdm: 是否使用tqdm来显示训练进度; 如果为False,则将loss打印在终端中。 + :param str,int,torch.device,list(int) device: 将模型load到哪个设备。默认为None,即Trainer不对模型 + 的计算位置进行管理。支持以下的输入: + + 1. str: ['cpu', 'cuda', 'cuda:0', 'cuda:1', ...] 依次为'cpu'中, 可见的第一个GPU中, 可见的第一个GPU中, + 可见的第二个GPU中; + + 2. torch.device:将模型装载到torch.device上。 + + 3. int: 将使用device_id为该值的gpu进行训练 + + 4. list(int):如果多于1个device,将使用torch.nn.DataParallel包裹model, 并使用传入的device。 + + 5. None. 
为None则不对模型进行任何处理,如果传入的model为torch.nn.DataParallel该值必须为None。 + + 已知可能会出现的问题:Adagrad优化器可能无法正常使用这个参数,请手动管理模型位置。 + + :param list(callbacks) callbacks: 用于在train过程中起调节作用的回调函数。比如early stop,negative sampling等可以 + 通过callback机制实现。 可使用的callback参见 :doc:`callback模块 ` + :param int check_code_level: 模型检查等级. -1: 不进行检查; 0: 仅出现错误时停止; 1: 如果有field没有被使用, + 报告警告信息; 2: 有任何field没有被使用都报错. 检查的原理是通过使用很小的batch(默认2个sample)来运行代码,但是 + 这个过程理论上不会修改任何参数,只是会检查能否运行。但如果(1)模型中存在将batch_size写为某个固定值的情况; + (2)模型中存在累加前向计算次数的,可能会多计算1次。以上情况建议将check_code_level设置为-1。 + """ super(Trainer, self).__init__() if not isinstance(model, nn.Module): raise TypeError(f"The type of model must be torch.nn.Module, got {type(model)}.") diff --git a/fastNLP/core/vocabulary.py b/fastNLP/core/vocabulary.py index d4ff6077..6d530eb6 100644 --- a/fastNLP/core/vocabulary.py +++ b/fastNLP/core/vocabulary.py @@ -73,21 +73,23 @@ class Vocabulary(object): vocab.update(word_list) vocab["word"] # str to int vocab.to_word(5) # int to str - - :param int max_size: `Vocabulary` 的最大大小, 即能存储词的最大数量 - 若为 ``None`` , 则不限制大小. Default: ``None`` - :param int min_freq: 能被记录下的词在文本中的最小出现频率, 应大于或等于 1. - 若小于该频率, 词语将被视为 `unknown`. 若为 ``None`` , 所有文本中的词都被记录. Default: ``None`` - :param str optional padding: padding的字符. 如果设置为 ``None`` , - 则vocabulary中不考虑padding, 也不计入词表大小,为 ``None`` 的情况多在为label建立Vocabulary的情况. - Default: '' - :param str optional unknown: unknown的字符,所有未被记录的词在转为 `int` 时将被视为unknown. - 如果设置为 ``None`` ,则vocabulary中不考虑unknow, 也不计入词表大小. - 为 ``None`` 的情况多在为label建立Vocabulary的情况. - Default: '' """ def __init__(self, max_size=None, min_freq=None, padding='', unknown=''): + """ + + :param int max_size: `Vocabulary` 的最大大小, 即能存储词的最大数量 + 若为 ``None`` , 则不限制大小. Default: ``None`` + :param int min_freq: 能被记录下的词在文本中的最小出现频率, 应大于或等于 1. + 若小于该频率, 词语将被视为 `unknown`. 若为 ``None`` , 所有文本中的词都被记录. Default: ``None`` + :param str optional padding: padding的字符. 如果设置为 ``None`` , + 则vocabulary中不考虑padding, 也不计入词表大小,为 ``None`` 的情况多在为label建立Vocabulary的情况. + Default: '' + :param str optional unknown: unknown的字符,所有未被记录的词在转为 `int` 时将被视为unknown. + 如果设置为 ``None`` ,则vocabulary中不考虑unknow, 也不计入词表大小. + 为 ``None`` 的情况多在为label建立Vocabulary的情况. + Default: '' + """ self.max_size = max_size self.min_freq = min_freq self.word_count = Counter() @@ -402,7 +404,7 @@ class Vocabulary(object): def to_index(self, w): """ - 将词转为数字. 若词不再词典中被记录, 将视为 unknown, 若 ``unknown=None`` , 将抛出``ValueError``:: + 将词转为数字. 若词不再词典中被记录, 将视为 unknown, 若 ``unknown=None`` , 将抛出 ``ValueError`` :: index = vocab.to_index('abc') # equals to From 60a535db08be4621e8b2f52bb83caad81c693075 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Wed, 4 Sep 2019 17:15:21 +0800 Subject: [PATCH 145/153] fix a little error in doc. TODO: fix the bug doc of the class which inherit the class from outer space --- fastNLP/core/metrics.py | 1 + fastNLP/core/optimizer.py | 11 +++++------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index ec1a1864..72380fd6 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -152,6 +152,7 @@ class MetricBase(object): def get_metric_name(self): """ 返回metric的名称 + :return: """ return self._metric_name diff --git a/fastNLP/core/optimizer.py b/fastNLP/core/optimizer.py index 5e7c1cba..b782cfa6 100644 --- a/fastNLP/core/optimizer.py +++ b/fastNLP/core/optimizer.py @@ -120,12 +120,11 @@ class AdamW(TorchOptimizer): The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_. 
The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_. - .. _Adam\: A Method for Stochastic Optimization: - https://arxiv.org/abs/1412.6980 - .. _Decoupled Weight Decay Regularization: - https://arxiv.org/abs/1711.05101 - .. _On the Convergence of Adam and Beyond: - https://openreview.net/forum?id=ryQu7f-RZ + .. _Adam\: A Method for Stochastic Optimization: https://arxiv.org/abs/1412.6980 + + .. _Decoupled Weight Decay Regularization: https://arxiv.org/abs/1711.05101 + + .. _On the Convergence of Adam and Beyond: https://openreview.net/forum?id=ryQu7f-RZ """ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, From 14d048f3406fd05a79e9e61b8e05c410bf8882f0 Mon Sep 17 00:00:00 2001 From: yh Date: Thu, 5 Sep 2019 00:21:46 +0800 Subject: [PATCH 146/153] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dbert=20embedding?= =?UTF-8?q?=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/embeddings/bert_embedding.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index 17f6769d..05351cbd 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -420,11 +420,11 @@ class _WordBertModel(nn.Module): if self.pool_method == 'first': batch_word_pieces_cum_length = batch_word_pieces_cum_length[:, :seq_len.max()] batch_word_pieces_cum_length.masked_fill_(batch_word_pieces_cum_length.ge(word_piece_length), 0) - batch_indexes = batch_indexes[:, None].expand((batch_size, batch_word_pieces_cum_length.size(1))) + _batch_indexes = batch_indexes[:, None].expand((batch_size, batch_word_pieces_cum_length.size(1))) elif self.pool_method == 'last': batch_word_pieces_cum_length = batch_word_pieces_cum_length[:, 1:seq_len.max()+1] - 1 batch_word_pieces_cum_length.masked_fill_(batch_word_pieces_cum_length.ge(word_piece_length), 0) - batch_indexes = batch_indexes[:, None].expand((batch_size, batch_word_pieces_cum_length.size(1))) + _batch_indexes = batch_indexes[:, None].expand((batch_size, batch_word_pieces_cum_length.size(1))) for l_index, l in enumerate(self.layers): output_layer = bert_outputs[l] @@ -437,12 +437,12 @@ class _WordBertModel(nn.Module): # 从word_piece collapse到word的表示 truncate_output_layer = output_layer[:, 1:-1] # 删除[CLS]与[SEP] batch_size x len x hidden_size if self.pool_method == 'first': - tmp = truncate_output_layer[batch_indexes, batch_word_pieces_cum_length] + tmp = truncate_output_layer[_batch_indexes, batch_word_pieces_cum_length] tmp = tmp.masked_fill(word_mask[:, :batch_word_pieces_cum_length.size(1), None].eq(0), 0) outputs[l_index, :, s_shift:batch_word_pieces_cum_length.size(1)+s_shift] = tmp elif self.pool_method == 'last': - tmp = truncate_output_layer[batch_indexes, batch_word_pieces_cum_length] + tmp = truncate_output_layer[_batch_indexes, batch_word_pieces_cum_length] tmp = tmp.masked_fill(word_mask[:, :batch_word_pieces_cum_length.size(1), None].eq(0), 0) outputs[l_index, :, s_shift:batch_word_pieces_cum_length.size(1)+s_shift] = tmp elif self.pool_method == 'max': From 880e3ad96953bb2ac6ed19b1a54efc06835dfc04 Mon Sep 17 00:00:00 2001 From: Yige Xu Date: Thu, 5 Sep 2019 01:26:22 +0800 Subject: [PATCH 147/153] 1. add mini_elmo.pkl and test codes for testing ElmoEmbedding; 2. 
update bert testing codes --- fastNLP/embeddings/elmo_embedding.py | 5 +- fastNLP/models/bert.py | 2 +- .../embedding/small_elmo/char.dic | 229 ++++++++++++++++++ .../elmo_1x16_16_32cnn_1xhighway_options.json | 29 +++ .../small_elmo/elmo_mini_for_testing.pkl | Bin 0 -> 37695 bytes test/embeddings/test_bert_embedding.py | 7 +- test/embeddings/test_elmo_embedding.py | 15 ++ 7 files changed, 282 insertions(+), 5 deletions(-) create mode 100644 test/data_for_tests/embedding/small_elmo/char.dic create mode 100644 test/data_for_tests/embedding/small_elmo/elmo_1x16_16_32cnn_1xhighway_options.json create mode 100644 test/data_for_tests/embedding/small_elmo/elmo_mini_for_testing.pkl diff --git a/fastNLP/embeddings/elmo_embedding.py b/fastNLP/embeddings/elmo_embedding.py index 0ec0caa0..d19a3577 100644 --- a/fastNLP/embeddings/elmo_embedding.py +++ b/fastNLP/embeddings/elmo_embedding.py @@ -69,6 +69,7 @@ class ElmoEmbedding(ContextualEmbedding): else: raise ValueError(f"Cannot recognize {model_dir_or_name}.") self.model = _ElmoModel(model_dir, vocab, cache_word_reprs=cache_word_reprs) + num_layers = self.model.encoder.num_layers if layers == 'mix': self.layer_weights = nn.Parameter(torch.zeros(self.model.config['lstm']['n_layers'] + 1), @@ -78,9 +79,9 @@ class ElmoEmbedding(ContextualEmbedding): self._embed_size = self.model.config['lstm']['projection_dim'] * 2 else: layers = list(map(int, layers.split(','))) - assert len(layers) > 0, "Must choose one output" + assert len(layers) > 0, "Must choose at least one output, but got None." for layer in layers: - assert 0 <= layer <= 2, "Layer index should be in range [0, 2]." + assert 0 <= layer <= num_layers, f"Layer index should be in range [0, {num_layers}], but got {layer}." self.layers = layers self._get_outputs = self._get_layer_outputs self._embed_size = len(self.layers) * self.model.config['lstm']['projection_dim'] * 2 diff --git a/fastNLP/models/bert.py b/fastNLP/models/bert.py index 85c3af8c..30ed0cd8 100644 --- a/fastNLP/models/bert.py +++ b/fastNLP/models/bert.py @@ -241,7 +241,7 @@ class BertForQuestionAnswering(BaseModel): def forward(self, words): """ :param torch.LongTensor words: [batch_size, seq_len] - :return: 一个包含num_labels个logit的dict,每一个logit的形状都是[batch_size, seq_len] + :return: 一个包含num_labels个logit的dict,每一个logit的形状都是[batch_size, seq_len + 2] """ sequence_output = self.bert(words) logits = self.qa_outputs(sequence_output) # [batch_size, seq_len, num_labels] diff --git a/test/data_for_tests/embedding/small_elmo/char.dic b/test/data_for_tests/embedding/small_elmo/char.dic new file mode 100644 index 00000000..74285f34 --- /dev/null +++ b/test/data_for_tests/embedding/small_elmo/char.dic @@ -0,0 +1,229 @@ +! 33 +" 34 +# 35 +$ 36 +% 37 +& 38 +' 39 +( 40 +) 41 +* 42 ++ 43 +, 44 +- 45 +. 46 +/ 47 +0 48 +1 49 +2 50 +3 51 +4 52 +5 53 +6 54 +7 55 +8 56 +9 57 +: 58 +; 59 +< 60 += 61 +> 62 +? 
63 +@ 64 +A 65 +B 66 +C 67 +D 68 +E 69 +F 70 +G 71 +H 72 +I 73 +J 74 +K 75 +L 76 +M 77 +N 78 +O 79 +P 80 +Q 81 +R 82 +S 83 +T 84 +U 85 +V 86 +W 87 +X 88 +Y 89 +Z 90 +[ 91 +\ 92 +] 93 +^ 94 +_ 95 +` 96 +a 97 +b 98 +c 99 +d 100 +e 101 +f 102 +g 103 +h 104 +i 105 +j 106 +k 107 +l 108 +m 109 +n 110 +o 111 +p 112 +q 113 +r 114 +s 115 +t 116 +u 117 +v 118 +w 119 +x 120 +y 121 +z 122 +{ 123 +| 124 +} 125 +~ 126 + 127 +€ 128 + 129 +‚ 130 +ƒ 131 +„ 132 +† 134 +‡ 135 +ˆ 136 +‰ 137 +Š 138 +‹ 139 +Œ 140 + 141 +Ž 142 + 143 + 144 +‘ 145 +’ 146 +“ 147 +” 148 +• 149 +– 150 +— 151 +˜ 152 +™ 153 +š 154 +› 155 +œ 156 + 157 +ž 158 +Ÿ 159 +  160 +¡ 161 +¢ 162 +£ 163 +¤ 164 +¥ 165 +¦ 166 +§ 167 +¨ 168 +© 169 +ª 170 +« 171 +¬ 172 +­ 173 +® 174 +¯ 175 +° 176 +± 177 +² 178 +³ 179 +´ 180 +µ 181 +¶ 182 +· 183 +¸ 184 +¹ 185 +º 186 +» 187 +¼ 188 +½ 189 +¾ 190 +¿ 191 +À 192 +Á 193 + 194 +à 195 +Ä 196 +Å 197 +Æ 198 +Ç 199 +È 200 +É 201 +Ê 202 +Ë 203 +Ì 204 +Í 205 +Î 206 +Ï 207 +Ð 208 +Ñ 209 +Ò 210 +Ó 211 +Ô 212 +Õ 213 +Ö 214 +× 215 +Ø 216 +Ù 217 +Ú 218 +Û 219 +Ü 220 +Ý 221 +Þ 222 +ß 223 +à 224 +á 225 +â 226 +ã 227 +ä 228 +å 229 +æ 230 +ç 231 +è 232 +é 233 +ê 234 +ë 235 +ì 236 +í 237 +î 238 +ï 239 +ð 240 +ñ 241 +ò 242 +ó 243 +ô 244 +õ 245 +ö 246 +÷ 247 +ø 248 +ù 249 +ú 250 +û 251 +ü 252 +ý 253 +þ 254 +ÿ 255 + 256 + 257 + 258 + 259 + 260 + 1 + -1 diff --git a/test/data_for_tests/embedding/small_elmo/elmo_1x16_16_32cnn_1xhighway_options.json b/test/data_for_tests/embedding/small_elmo/elmo_1x16_16_32cnn_1xhighway_options.json new file mode 100644 index 00000000..9c02ef72 --- /dev/null +++ b/test/data_for_tests/embedding/small_elmo/elmo_1x16_16_32cnn_1xhighway_options.json @@ -0,0 +1,29 @@ +{ + "lstm": { + "use_skip_connections": true, + "projection_dim": 16, + "cell_clip": 3, + "proj_clip": 3, + "dim": 16, + "n_layers": 1 + }, + "char_cnn": { + "activation": "relu", + "filters": [ + [ + 1, + 16 + ], + [ + 2, + 16 + ] + ], + "n_highway": 1, + "embedding": { + "dim": 4 + }, + "n_characters": 262, + "max_characters_per_token": 50 + } +} diff --git a/test/data_for_tests/embedding/small_elmo/elmo_mini_for_testing.pkl b/test/data_for_tests/embedding/small_elmo/elmo_mini_for_testing.pkl new file mode 100644 index 0000000000000000000000000000000000000000..4c72f3d51b692e456b55db006d0f65ccf3a0e8a0 GIT binary patch literal 37695 zcmc$_d00P>E(4BBbek);^ii zpoCNsD)T&xgx?#_{d~U9{kfmt_5JUAu1iJ`?2qjN$O3InAh~a@7!Xh}rE&?8cy#ED-L~ul0ggk_P z@z@X&z!7y3ij4dV^EWj-Vm*i7kB1uU?-#)X4VQ2WTkRj_zj{WHUxWllK;NAssOK?> z$6(Edu#LW9t9^ofH~EM87#jtJghodA1P6uq`-TNYY%<#D9~2N6!4dL{*bwFyXyg+a z5fmIQ;S=V+Dl#Z|wNHe9Nce^@pD0raj&krsk$*);2cg78a&9mK?F)%km42oM zf5|Kz!I5$2$o`|?;Ss(O{{LHWxj>#=fg*u_$2GGt<%|jZ7oPmT@y7na8}|oKA%dgm z&QbE1_z#NuJKm}w-*ArdKcZQhm{^!{R073a?0JhD{~K&4N7aRYCuhPRAhigNx;sb1 zWAc9m;WomA0%+dO_GJYPI_AfBGUz*hUv$BbQ0CgicliWFze*>=a z^;`FU;{$sC1<=&knxp@hDh>XvO2a>ZQ~m%>jo=u$bEf?>;NNxmFQ+s9XFwBEb8C*t zUx23n1~mHvX#NM#B7$S-&av{)<*D?4t?=)Fzl~@8&u|vzR;C=Ae;LpAU&gchWjyK+ z5{lqpcMk2L_urBJxApA*8O+?+%#`Esm-VLq+j@?F);8l0;>-w+lRIZtWDIAvhd2+_ zXT5)f?`q!&Uyk$dIGLm8!K3nVTw*wLf5+rsxyRLoKior>H}to#=gf=YxCQ)hmYM{b#z?KY$zkFJYI(aF+j`9~r*NLyY%|U!ZT8k6%a#XT@(z z_~>~k{6+O&zv@q#{VRf3{-Y0A*jSoaa(sC{p!_QruHyNCs|de~P$I|AMd(*DT>Zxh z{3AGP+&KXr@_+I7+Yl1P%i#YCp1^-*Zf(U&MM3}a=CyzPi^nyIv+fu3U%v$ZVZT0t z6XMR<@OuyYv#3Ape}_uwKQp(n=A{A-kGULA@UY*^cXGmig-XO9rjZexD0j}rzsYR! 
zm(0=s$kx)>%-Wo@iN`jU$9D5yG7BVewz!D72>gmT+p5u9!Aob7*;*~I9#%(4H- z+``7j*qjshFPV4zOXm1rGAI0Dx-){4=*~&<7|+wQPatnM+~~V0+~|)b!;Su}@8o}E zZ)t39ZOlpe70_Zl;ZuL{w|CJ@*vZNMwSb&I%=boc_PKNR|5m!s{{-4!N$PF2EB!?~m4b zg6IDc{NNwn1reM>?wrHFHU59{{w;9ff8lJ+^Y5a6X}tJf8ZY@JaOoeWWf7cmcg_)B zdaDR@{vAd~^#Xah?AV{kl=;iy{~b`A<1w5Qe+DK08vGLtoXQx^$$uI#;hc)$RQ(zG zokIVzEa!9#=gc4an18Yc=WGn8`p@{-e;PM3=A4V+)cg|8KNkrV#iW?Uxe&)RF&w_jfTR^q=!Aeib^x|4I9u14RCl_B-kP ztw{PE&SHO?_bXh)|AY1?J|+G}`{nnN|4I8@21)&m_G>ej{!g0euZlwEZ?s=!jqHEY zes7j?|4IA3TaWQKvgaxF*YkQji$Z)Z0%y87j>?@#9`?(ULWLQ~ymAiW8&2Y8X6yjC zXM*473y^`0rc9;~my-WD4D;3YP;sVH$sL`sxYukP@#=Ym=J!UUV+LQ)E`@Mth_8b{ z*DuVJJsI$P&LHaPP{4h@eei6%4purSgng=ou>GO~c6@xX^#9X;0)O-%_Bfxq{iql2 zwl{+Koo0wVHv!pKcfpq&e-Pm>N7dt0;HAh$&@MfKR%jlDrvnkN%T)+YX?MX#VF%>0 zWD>69#!`C6kHL++5NIzbf-Q%Y@%OSeNZPEAQw<)$^!6tx_@_L`&C{gz+^vUAi?y)v z>t?1~y&mxm-D)^!Ktq=85E`{|M)YTW+$E?@6ejE9wueHvdAlcCUSNV}JNcvUpK@Vc zh#;1IW`aN0_n`Xl8=$GH1F>aKKy&P3bjs2Ix0%Xfhc70i?5iHW>N*P==uhavJ`XT@ z^98DB^AWktd_?e>2I(>z2QRi5W2*B8h`a9r0ofijWquBtv40h+EtkPge7g`;r9tGj z&Z8z>8bpQbmQtGqwu1h3GhF?w9o}WAVo5nQY^vRjY`vnP{nQvTF5??Se-VblL)pmG zuM`z}%!M=!Gu$k!N67m}$X>65>3s2->g;N!-u7-m!OfA#P;3XByq1U*Pc)!=JvWhr z^mlYd{WzMmeGkMI%z&k42IRDwFsVJRjU~O5@!PgCX6hw3=K??(04y zzV{Q*u(CQXtg@x(qX;WQU>(!HnwOrN|*H!@$L zS({|=tCL!oU-~`FSZoP(16xpa!5lDk{fTB9P9eoXmr#|+WRkW{4R6j(MT3+Yek3>v z=Qr__1s8R(BKin#$4aA?U>z)Go{h}im5_b?8j6_op@MV~Qc!aq4sO{8e4G0lRzCAZ zntI;Ms=@DwU)dPT897+e<>0#nzH!A@t2 zQQeY*z|GXfNyTD#ncNg2_j4?Dc*z}jQzn3u-}f-PKN=C;sr)$eYzd=s(}XNpkqbWx zZ=iW4GDJcy4mccR%!swYfho>l{L=t^n_UAt9z8{8R@9)aV%f;9#S}Nri05ivSjq@F zj3x8@o}&i4C&>Aj7J2IUiJBuEht@2(kH+OMfkLHIb_shVp?u>GnBgW$8ZKN#%0E+> zgf+DS+fY%-N3a+FK&|k6ij;M|k%g%QR%#K! z)}eirb9kggZ{A>i2lWCf`^9j=VG&ep{~8v5H^M&T4KylpAX9_OmC%N1}tHM0Nz~>nlX+K1_UD83;q!Yfn>0)=8LiE1+8@j0Y0E`Et5jWZp z?;kHm-ilX&X0;BqMYe!kDPc@YTTuKgo(~l2z(`s%YRp#W-uDq8YR{+P_w_#!-<%E` zfeniQAODGR>vi#z_)*Y%bqTu9?nJL=bfHGH!LI8nKPYI*Ph89xf3u2PtE1jO$5ZwjpR1wZ$twHJD5fbOs(SiV*W!_95q#&ymXs|2A@tL z&2nqdhYST$=lY7;`eZ3&6hB5jSrS-QNDp@jn-gWV3P@hl!`!u!!iBeExwC?$@tjNF z(WOa7;KCNeO9ek%ICa9jmr*F=>ubl!1H#rviBCZmu-aDc;L>6Rz{86TbOyf@4`_>Gzqw-kijugpBsfRbEQ*e0hFm$D;B99miG}0_d z&KH~E@(D>mCGe4zW2R6mI&Y)ZpA^X@KT%ZNbPOq4w<68Fdv=eTuA*&fqaZhs&Ags7 zfcUcHxQp&@LmMXX;_87Re!Em2Mey?0%9e?goP`l&MrpuowJw;(Q6l!<(RO+R-LT}) zIP#@s9q{=^HN5-w1@vN?U_h^tSu6AeU9F+fue0>8C%izHWWHL&1o8>M>^TPH7^Oor z&unD|o=qUnHtOKy4SML#V{shqc#oPr;}&E8MhCt9NHcN@bHQ|`7UBEVU*tWp*Q+3D zjxv51vJo49JB(DVHz0qz*=W~xH4^$x8uz?EhV})Bpc(BHN;tWTDLr3~dIPvf*QjVe2QbMCk|CyDRw!I-HYpeKC0}3xTH!nqZ`yMaCP#b1*)6 zzf{bfbQ@kePQ;=H7wz7T5yI;!SMu7&4g^>2fsw>4be_KjvD^3=8((cwEW3{6IGB^- z4?W5iL_^Dih{{kiRsLlQoV~4zl=AnX z2g8{lV#|jg>F{b2ksADZmkIh(Gzav;G^nlbD{+$aJF5E19A?#PL6XfJLIvmNK|qiZ z8sR$(iB^@UAa*(&6PSQhyL!3TYGR>&J8u=QE#ayjM)Ha7a3XOCE_fDlC7x-*Y^#l2 z$C6rjxJm>Ja+}bjmmis`j3@AY(gvh_RTQ5IO9N$VPg1$xiySX0pi*{CA~_?P%^*u1FItUNSWFZf2d1Tu>1D$Np!vPCFqpiF> z)O&X-ik7N;Z8QXry%c+Fd3+hnco`uNGI|->JD4`XPHsaKQVBEKG6dCM&%T!v3 zlPjS|QPIVvhRh5nk=!D1~#?cOtp;Jj78|L*7Tzn8mZ!;dTDiz}!Cp*&g!v z!5CtvxbiWiHf5ug%k0Qlvka(qL-^CD6V#mQ2X=EfA1Ph4Of>epKT|R89PnK^4Uw+# zgstC;N{i!&WI-D`@?|}mG$PL3q~u7wbN`9tj)hafAMCNs6a!)yV?_eC$0Dr?dor=p zhvD9r!B_MbfXs71-?(RxYX1OwKmH{MeEUJgo!f>M*C~@ZJ6s{ZGM?xfZ6%t4SLzSU zdCu$?RmI2LF9K6hK)qX1z^t?pB9^P?llnn7NDN;9fy`o@oTx-r_THq5?|(r`t*_x} zjv0{AA_y6*Lsl>3NX7DlRE_^W=2c2F+SaH5o0uD#CFn@nGZd^h4X565By(+K2s3pXO5JM+=}Qx!YmEt!mKvp``zHa3 zJr0Je_BYg~)^giVFDAqG*BGyW>8N#i4*p;jhgJ3Sn8!4=8YG` zDAkbgRh)!_juPaD;TX_eW4`rbfZ~U3qvEj^{ z%_S%-d_QV=-^8`rau%IUS7+W#%ZADDgbBVa2}ooGHn^BbHCTGX6lwqoO*je_E@f~j zGnrgm+kj$D4MAC`m7RTB4|>#JjczsNVy&ejWQBhbwat1zY_)HL@RW3>A#f?GWp|?; 
zQ?=2yyZzJ=3x1q6q<}B;Z1yua9=zr^Aou;>q4A9dx-H#+t~*V`<|kC}xah}lxpfx) zHk6G%cwC~K)+*wz+eJ*@I*bl(=!RwcZXvzsVo-1l#@2Az?&ZOGc#=UjLEH1;WJn2| zPH@F8yT@XmswY&zup}C{AsMV;JbD&pPintEU@Q(V!0Ll9VDGf6NXa9L3cCFj*{6;{ zPU|wjv2X^y(yB`y*xBQKtJHD*dv(f%x4-D7K1OHadm!)qAmv!F9N+FQhDVK-pl6ha zYS!wIp0BFp>%9=FX!!-^p+GN;GyjYd_@kgNcRe!wt_>YwHY8j`2Hq^Ggf*Tf*l_kG z(7e(L)T#5}5%3KiKH!D4mFJ-=Wiv?ATuUZm>pHx1nIo=rsKH#Wn^_Q99%LEI3v1mxkBJzs5iT|oGFaRC*O{t*TrbU~F$8pJ-T zL#tmlF=N7IxnmBifwb%%TzK;=$T!>KZ_zG{@!scPysVqMQ!pKRZDfi2xKQ%sdO4b7 z^NgD7x|0Z1$Kp57I&DK2o@TyHde3|}>V!!BJ^0nS*AR1PGq{gmg+BQeQNm{zLAA0L z*q)L`vx8nk;rIkXp528LlXsxb*M@jY@I7dH9*hs1nS*PZx>=* z0Y@Jpv@joIB{?C|7T<@|PhUshuk#V>d0aGkvH|p5F+ipF4WT7`4d|FnCre_IsHnJO zO#4QHat%}QkvdQE^{N6ss-uO*W;^m~*9^FR#}K+-OXJ9JMYPuR6qV2D;q2+V$wGY> zd@ued8WZ<|3ZJzDtTvZ2EoTByPv~6I{3@lBfLD+u= zX)L11=o1lie#Lp@AgDl;vzL(mH^S)B(INCM_aimNVHM~G>?7*+_Nax~Nj@v`k-<@U zGDAoNfBy6W*}4T`Z6uH(v4^z98KMyMQW?UZrm zij5LW?ZyNXDo1Z=-3)8_~ChaP*R{K#_;Pf{0i=9Jd{ddQaa6$GjYT*l=e9 zg?>=Cmrugk`}&!iF=gaRts5$HSO9(T46|NzBT|{$My4%YjiYK=P?aVgj(0<~YdX-I40$A*JdP}1(#lQTc7dT^oMfCj{cyjdHU8u^ zlN{B^WNcdGh}X#+=3;^_ez;&9@qFWhJor`7_dW9Xd(jj!xZ@6rPJ9FdSe%?$NyFk% zemw56E4fi8N<17FK&Adi1kG`1jQbq))_5H8=vYDC=}bfck27IW|72pKy^;8IisEUl z4wUKf9n|Qm$v7!{1jHh4qOWsv88x?TyA}~0C`;79KUYT){=rIwdU^e&v31~*d5*$9 zeAKke1!Uo}O{jTUGo{2=f}V*O6Wf`6+=%y2(cXm{(XGTZOgU(v-DoRvUQ>c>Us$2j z!LcB-xCL6Xg|P0aRw&_{V0X=Ul#xN2jDNvTP_hmoOI=stbB+d3IqNREH}C+S^3LOn z&w3C&6PC%2_J$kg320DHn9!eCsIgNfE_#nBYrZZBy>3h9cE5m|DMqBtY&nGMOkjFj z`S3315JpJMlni;+LE_44WZk|C_HNE*KJ1{#-b4Ik%aALqr&S1%v_yqs&d?U1O}@YW z#+a0eW4CSEyD#VuGvm1N34mKLJyj-umghkXhHR(Rn*?3 zWK#F|pq<>lI7W3*F>|AN2IKx?6`FNtB0kofi#rrM86k5AG~(h)W@d9i_j)AyT5uZL z*G_^IJw2r0vke>)l}WSuNoG-gBFGG%1i?02T(#;MiuJBRdx$T-7;ud#`8l7E)TNM{ zSBL~{3^7M1o@#sR4XTTT@qvp7zwk(;dPV^)OWO-0J3Z`#6JJoe3Z9Hb-z2PbZ8kMA z)`kp-sBtr%>yS)oNyLr(fr|OG@j<~L{L-Qcn#86NI^-F;XrWH2TYHgjHkNquQGW7X zycBIzZiM1cU+fzvjz2{VGkuvBkf@$cMna3A_}LheqW6V+Mm+-`yO#|?=i{+{vKDz> zyMrqEq=N75Ho$IK(quH+9W-Tq$cWBP>c%@8vPR(zrSyCuvLCaAd`!qgeCCmOs`6NT z>&0|3Q-TZWO1&U+eg z6)5baG|=^LkcV#}T)ACgH^(uJS{^P;cx`cN#vNg@@8nEUXC*{-I84R`Wo4*oqcPg6 za15qQo{br&P`uuADPl91lQ&XhLAvn+>d}^hQ;wzZ;pY`}A;lDhXq^WGy*{{TEl8Zg z&57J+YhqWR4Bl5tD4(I(j9vLOd|>-wTeaO+P}vVXazIxAe;TT%RwfJMPdDW;_lqN( zPyA+g#YrFQ?zYEXqMy;S^XVwdWGlYhq>2uIThXvrb31I$>VlQn1Pj+^Qa4`Eq^?;A zD_9DUw%ca-={H04Luv$`5_O!Htxe*tbaDUc188!q(eZ3S;xJZ-Tv;uH*U(L{qm9A= z=Hq~q8HNs@A>dTp&b%0ofQeSAFz?3~rhcL+Ru=E4HuL)LB_E!Ho$zaTKRp3La?Q!g z6I03V6B|jx9wq#uLK~{5^rJ;zA4AjmIoyR2I)q=(hy;#(&wMf*0(E~YvS{Z3`ozBv z%XzgyrJ^)G8#l9IbVwhUCG19v*BOJdbOKBc>_Fqb?uNi0&5UZ7Dz^4dMn#TFAM4r_*tG~ZmCb?Ob5)>m=WMb$;s`o?)(e+94#O-5NoL&-Es~Lc3q)-n zQgy}7_?6xus%=q1FS2)H!AuFPTRaAdd#2fqmr}$(d`eKsWeqA-M;E6y_aM`h>13I; zEgpA15Kn#CfqZf%V%Qmrr9EO$XY6CDqCyC(&DZB@7SDyzu5=_Pl7V%^%PG%!#dw8L zE>*PP4xIbA4BZYBAep0zl>WIubR_OO+;kX4t7t1qL30<>eo?B57OsuMVor&0mm`65yNb|(3#w#e-8RPeYyo3tymV3O8ZpguTdO zfeQ#;xrjD-nLywDUUEB252k3a+ia$RYb346mz!H!`j8 za)Aj{I4wfES}RbH-Z*6Jo{GdXm*Plko^G#akvB$zc79_oF=lInaV^K5h~{Xa8`L8t zy_11B&0M4vQ30n<2w+)faePdBBDR)M!(yZNm^&iNNOtrN>?J3Kr=7IMH|NjCBv*x4 z?u#cv442Dq@rK)ZW**Sn_klxc6{PrYhw^I)SZ`J*GumAP?{>d}K^HUfHbD$@Z=Zzq zvdby4C^_?2hgv z(<|RHW(yOc`pI0leYMlh_8#y1kmN7~M65*DR6R*PufNr9x0kG5vJIa*I~iTIP$h1I z0pQXvNaR&lgWv-fbcTBi-D&6bfNnfRQaP^RmM{awAJ>K<>4P{z_!>-l;zHad7vpP0 z506&YG4yg_q@!7nHd_L6UGN3=>urJ~!lHI+v@awymmxLdDde7xIF8FUz|Th~5&2i| z8M@a28`n(jdxq684{JZ|DEomgETu(UOUw-l2JAN!f>+cP1d zb}sH7J`J(K`qTzBX;Rl|h0h-mCdW3$Q)jJ2n6&#fX!_Dquxe{PQge6)^F^jI2F@Aq zKKdm!QTj5kE)XWl`G|ClJAqO+O5l*molqL@faTl7aH`ZHcv-`fDOZ;4ZW#l9Ul(%k z;%CfDNnso{AssF+X9=X5AQ630B9m7NuQZHpk9v5b<2P2JV66zo-Qxpw>Fqv9aCL&6 
z*;Np06-i41#vX_*iIt2CxVORix|u8 zCFr8=PpVRT0-o3*k5A1$&h$CVCPvSuV9gahFkZqMSKZQw^O^!U&B_MK0#eYyt0|zB zyOGScy@Gc1#4#TQ`SAPnQ*ggt5rVItf;+q(=YcM3();iU!)*D2jKu|j#Myepl& zYu?W&mQ>=)Mmx~L%0A#cIDwLn?xpNw4}qz~Fgji@iUnL+kocR6C5R{p3cLPH`r$&v*l!wgon6YEi+utyF%9Ihe{w;QU|_JX`M@6%aTUk3GDM z6qk$vlg>EY9QYngWdj(mbRjhOI-e1#^&~4Y5IOm1m9>FIF@Baam1(MPM~9%$ZjstS zR4D5Pw=T59`y?&cM<-H66E$%_>sMa?J`&s(m7|t(Ik3QECCTDWvzt^YL3#T3!47UR z$vQU`XO*o+0_tDUuH%MCs9l{{Cwrky2g8}rNr#xVg?o{FjV_T@@8_NvV3?NeMNHAW zF>otz4$S$!1-^_m!egI`Vhh@U9ExRNc;FDacVsppx0+!_VIJ^X%8*z2S5Z!%|Z?AXH}P@DUFtY4B2NpgJP_Z)LewhIBfrI0$9B|y9br18ZHVN%&)OIlS@ zA%CzDR8+*#W)|Rm+8NX#xDPQO^zccUd??&hZMW2Q3z9gdfepm=qZFU*%*JooL_>W5 zY~3Wt!ydUq6syGvr*lsY@(Ra1$p?gH+^<40ig zn{3=uE{HCAN1zRAW66%Wp_qvRv|vpdrEm5W@*^&S>x1$5-kUB6V^z>Q`G+vlkj=C-U(vXL3&ilIpQmcPyy4iXFx#>@Q^@!nQ7U5JEaICkhgXVa!Obd3X6Vsu@+~6C#z<3^MBxgM zK6Z+U+P@yfPFhKBNDJV{vua`8mNhU@MvSP`S|hy)>nVYs&tS`}$>`hI*?94u188~D zDqF3fT9hI%gIOX{&dj%*1ggD-s3B#D@u7^-Zjmd{@F9$_ULR3@>JT?7Sq-Wd7F~+17U<%vcTpg!)Crrzo0%)Mr?9|^0%WmOoZ6mS0ir)HGk(vMv8?%I zJmYjQbl>l_yQ4S?{6+Toc^5xEw|4{4UtUQ$EY2o5ns?9|T8s>I{6yY9BS5C7Li;Qy zeE8NUD3N)GnkR+eG~xuxM>NQwt|vL`*9pQ)_oMH}X_CDtgS*&77H+@G=Y3;8h(2}p zf$FO`;&^`zWAoetN(N%>20r-USsVd;`F;X2el-S845UNbjjPOUI0{==UW7dvdhky0 zAu_hR4=1wsgTvZloWEF$WM5#wUH=J+j~7G26UQT`M{f~xx81Jt(GD~+;|eoTei`>J z|6~-fst3L8%ApQr@WauCEb?A^5#=0{L)>l0(e&wI%-vlI*zXdrhjm3ADb|#LxMDK( zvdIjmtk=R*a{bV^uVD9GClJTo_=3Kz@dxRhCg2?V3VHL*z&q|wAl6G4;D93sU~qQ| z(mvS^9p;0ev(1sw&dr9}(<8`M;SP$`QMbLc;1sB4(`2Ba*X}c4Db(@K=65a%ke|D@ zll5-~kxs=lV)eKS?mw3xkrs*I|Hc5De7Z{si3pHeVdvoO%??H@LJY4bC*g6J1@RH} z#lv#q#QH@DBY!XpS_&o+{uB>N*jb1~NC$#pmN2Q#{|u!uAMN~#b|N+3?I8bcGF~Qn znpwI-l&BkB;hyM-BtOPT+im1Kjq>^CfST4)vTlkx^Uj9>&%4Qxd+{oC$FkTfjYdh; zs}Rfkj*50LldrBqTgo{S?qu3C}Wu1@6sj1O=0dXIz?(vY;=W%QObp$QLOA-t*|4lP~D z)hQojXl=e6T;ljBunQLu~Z;MqX=V;(8j^y2j>_o956b=(ie$I;?hrD(Q{ z1-a{V34&6@h$|Zca#Gis<|U;hWP&~Q?R_d7e7_VceU3xybS=Dr_+!T-kGP7vTkRb5 zg^7;tS-T336%v^J3f-BK1z%EP(H&E7bm~qrrF3mCiXV%P7P=_Ls$E+!B7q7;c} zxGUou;Ei8g^dk1ZZv*i0^-ga@v4&2#u-6@hZmWTjA1n#+xJ~TupJYDD@5SZ|&f&Kw zSP+veC0E~ClLNkKkb1WaYhNkHhk8ax$AmLv{ssl~SCg@dL_Ok{th2>%A?by z4&om=`NS{I8b3=DW4~K(#}iM<(-Imd@R6Httj@{~QiaR0NAwf+c4-?)$h?MiuC%lA zYmhzP)EqXzHjM0>pU>`G+Q%wL-J}D~s?w|C^662zY`VzMnr?Hhz{=10*yStN)8|kk z+sca5=fx@Zsk-K{7Ln3gQ+h;vC4Jo6i`7!7 zqa8Xqyt}Ob`8+&Y!G`LHU`IDYimBklKvT|LXXkj%syL{ z#6C}n#n({~*%9~+JDd~7JzXc+qMc7@p+jnH`g;lc2mNblv80Q%e!y2&@Z}!--DEyJ zUTXsVY|Dc^>ilE@!FcSRIgIQ9ZhNK`UKcZ^JfGMQqLmd#X>gjie_#+Aln>z?OE` zuq~V4lAC*VXveShI7_(!kEt?m9Nea8zxuf#+kSW!edy+9HYzio{0yH-Cg_XV&)D#q zZOfR?#yPE~r_;+=l~2m-=8Y`vx@#Hx;6OiJJGu{-t9Ovwde_*RhlNn9{)}!;>B2z) zM@_Wdz?Pv4o zX4V1MPJcw*W)m5!gCFsuT^01X8(6&6ll^)N(?bVV z(udmf+34wS*@nCes(tg|XN(PoH%U zdBx^Owvq9r&20DjN;37j4trPWG<$d2LiWs)Qo6Nd44bk31RnEw37xI4$WAj%A#UA5 z_IZx!wEsaJI%<6YYks~BJFnn<&*&0sJmtNLwQf5zf6xz za}4PhYHE%1&}&>>HHA&9nb5f0dK>-R>;d_5sE{_BTuJ*nu4jkDRax(OU)eDV^H`TB z9<0BID{XK(g5I>Zmt7GPL}YWTsUHuMsO5PfgtM)eywT?;?aATnv_qA2g32N6x>nHM zeM$rV`nUkk()Ob544T<;`$b@Evkr2W2-rVsieu|Gx6q9GMOxx*KCXRzhwgEX#exTR zvv;0nktBgncq7|{>3ij5+_u%s%7cpJjIRUzbEBaBVBTHyuF;s573l?~;}_U0YZ-cI zDWMZbt?7&d%h&{d1-jLKjQxjsU9@`BF}!M#VB?K%uK47m^F*3@L?+ctq6Ad-$g z-AZaz28sEChi z;3&~PtEE_GTLC+|XeNF=u!#=$Iz{wGgBZ!6mt$?51V$^k=0B@N=dEiL%{D zJOYzgy&z9oy26a!Gi18@G zG>-M7?$H??!`Lj~5l*lAMh8aMvmMtru_rqP$bRG1Y>ozhqsPx~HrdLYjq;VTKeqKI zR%c$&x=WIYAG?FhkDSRa(`sOE+T^gCaV9RkB*40mnQWC>39F&lz;d6}upU#Tu}DWQ zM*MH+zMMC(Q7e?~5r2g|qkGx7{G)W({p4wl!%;QO(Vt4Uv!`f1{dAf+ z#JgX}9%ZR@nXHn_1H3?;kInIUNo<^@@y0-Yx_y2K+wGT3kDR_m4l0$fKNlZnt2W!P z13JsFO20T;A9|eLxokQ8EPoz-NjRLfZCk~T5xfskvEJBYzZ7lKb%DO#UY#{rrbwqOlWjbd?n-0p9kiTZ7JYd2TbwZZl)Y15 
z$CmI(v$hM)(&^{C===LKXmPh`?5L`-MU$ z@W%chc#rrkw(mwW-5kAw7U(Xg-AotYRd=Kt&)%8EUij|Ler}eq-zTrf*6{r#BFFFH zmD5Y`b|ooxz=+R2Yfme!{$dI1_H>Zdojiy2>8@pf7u;P#PvY6A zWR3#a<8p=Q@4ic&7~W4?c6+i<4}4)0T8oJH`(Rubc^zju#o_VhgG?OXJ50tnV&e-( zU}~!xZSe3GX*S=^rn|`53!Gj=JKVOR$w%J4(?7nE-}Mr$rTmaJ%Z#TT?Z=SJ;8NB< zuaMk0=FB#g?O>N{3)-jNi^M|r3)pe;%J{X*QgSi5jkd0uN3^Q}SG_c0w<(sB&c{dC zn!|;-edf|f>-6ZuuYZusC+q3+uBYjZkB6}K>I7Q*h$3Bl?G$}> zTN3v4DaS<*zLChy8T4mIWBk0}96cgYLw~w^hMuq2{HyfR&J1?>!eS!wr~?(ee}PAz zn2=YWO6bc=^4RQ`3HU-~4-Rk}!LAn%&>9QO*ohM#k=?kOo|QVpuHKMBE2+HW^)jW| zk&Maqa_)8b?$9R^byrD*qg_~Nn?7=oo!sbC)<*V|o=4|A!>BtBe`-F<18jdC!-mFx9sHl3diE1yrlm*~KSNrxcw zpca4>xoG&wt+m}oW7+mu7eZm-v* zY1EosQf`1x_|}u2nNF;<-#I!uriHkFAI2ZPZzGPX33$Pna(3;3v5hC@9)}sL$J6b> zV_63YCpyF|0ejUrvgXe#p*<;to%vJAUhqI7>!1A-Un%^8ZZu2L(z}ft!)8ab-?uqq z-+LMKg#LS2Q1lr6CVD&0m&&9yzVgm=n`K#}2bJs#dM*9Bnq~Lo$+22D20&n(ex?s+4T1dAcd&9r7(4r?8qH29rynkvXy2XOLQZW^r!8;oAv3e8 z@%_`nboL5KdX~N(%_PmHJEa0>l;XyYdfTxJ>s9DW)vsx_gj?)|c74|2susKAg)#lU zJQmDLo}r$2;FAfNVarBmI~;jC>M3&EMR z!ekCBEVzvJ94%lazo)X^0o`)8R6Wj)c`MJxe7;IJ5~uJuvFCJmk}bQbdw@MO zr-$5*wIbuMEX1F-9>gWRtes59bb5x`JeDyThHCt*ugT2{h~e01{o%i&-%-80EeOXU5H}?g>uMFmpIx? z%+y|gRFq!hA4c<;Euf87D%%HedpdIICieynvbm`dd)lWs#iIrac?H+dYnW;_xPZXnkA%ADg*PqH>M`4 zSCL*jY5W(TfpfCVmc@s0&bBgA-64rDY4iFaGlbC>(NHwM>mJ%@EJ0o+NRqd@Jxs;q zGw7{~IjURq0}bFPCSw`KvHxlZ;OyCC<){6y@VhZt7*USH4pfqN4kz*MY2|oS_c~-g zYeI{J^PujTCuxlvi?=6Lk_X8NcxB^BQtuW=`qv3zb-CFjz_^IydN>gW`8YgboGh+0 z7Dw`z1MvJu&tax)KJH#)gLmyxBnK{jLbpOZ2pds`HyZK2P3J1$1AQwYWRDe2`IL;m zzRJV0BcdesFh642ipgP1Phv5Wga;nw!V?>Qvdiu&@Rb*G-S33q`1WjE)yKl6=@;P` zwHaUi`~zY|^theUCg{@na3W^Cm2B00fI>HpLZfOiI+iF!VuQ-KvYRqU{}Xc}Iwl?O z(l;j84OECr@B$*?W`U3Syaut^3(0{yN#vz*Js3&vB-Wj~(Cl4PQR84BdfWbiioR%y zG(sx~sGQ=}VHz;bx*oL(Na4Dh*2L<kN6}<8!lc*z`1l1mOFHYO75s8lRuus zkvDSjdyPnPYu{edY8j3ruf<`$C05*%zFV>FDQ6sIxe6Z|HNbN!G_m_j4g8kbhhC2C zCeo9<3GL}dOwN@4f2^JPH;4(t8{_nx)RS?m08o^{qa``P=mKkxVZwaH_{ zw-CJUDXtSTB=p5*5+xNyEet2(I!zuCj&LPHACHpwu0!NeP#OteWI?@7^+LaA4DnM? zpw-JfNbKyRH1rhL%j>z1y#EQm;2z8b8G1=wl8C95!Hz49*t>5EeHX4xj;nM1mrxV3 zyi1*&d&6Saa51>6IO4LFuWZ~>6Z+tp427jiB;(#8AhofW@?M)pztE%kDR02yA7L|N zKB2GMXP7XSfUyk|=;_}&Wcel?lD%yqlkBZe1Jm8v=)#A%)Nvx|WU5%Vy6fN>Vgo6@ zufew14rt&YRC7A|!42l*Tw^;-owEk_`iT(b*Rj}@&*@Q`@nxaRq_q9XZi5v)d;mIVft%cSwQSwVS5wu!P z;f7f{MELu4aO59{k-7~iUYCRk^ZqizvG+i?U5aE>TOu!Ol;Pbfv42viPKL!!Lc6sj z<;mOO_~H5JS*XHXZm(dx+GgN+9}%LXa}EAiLNIWWq083QGwvqGAXV0! 
zjvVO4>NoLlIkz08PZRh(Vg}ibEdsfUxh$_TpMT-F8A*D+gtb?0f;7oawrS)MbbObi z6J^w4FhG(?zbZ**FVv;SEah;`hH3PPyE^eZQiv066zJ%Y3gqQv3fyh=P^dti+|P-J z$vjbV)XSJKyT)NzzA74y6hjCvj3xFTfTxzvh=v};O<(otz?Bp{FRxARDX7q@=~@h& zG$wJfqab(hAmcfCB28pkaeJCE(Y!s5D2pD0z=^^jG7yAbn`}V0Lzp_A8pc^SbcwKB zC!`GR!OaI2V}Qv|c-CS_%TipRdJG|Qu@c=?cNkt|kE8k9L}`z&6gpZ*^Tm9kVIrq9 zocs6>o`5{HfBgWGZpXpxRVUE!mO16s#gv!mKVbZF`21fPq&eE+ z<${0g4yF4z+ABpZ7Q~|6-f^_*w+_p;sld+_vSecP9$Z)GhoKQ$;bHY}j9!(1Rn2?& zJ(aqcz4;wGXi~~d7`lX&_w=cpTnbES&}373~A!P}1(q|8#8DtG2pm4h>|Ab0#_ckt zat{P}(^HN-EXsg0i5ir*aVC3e!WK}wUkb&-b#P@pml4zbhn{VdSnHG5G0;#H0tJ^@ zy^`mkqw|~}{I7@cdSFH!_e*iP4J)$QRsn{D3SjWmBK*W&s;=%;}vfbFyfV%c0E>CD}^PSc~;-?6BWb*zYDziaK}09N8Sqi!h=?L3g=a z{2W}QszaPi?*p;8j&Gf%sPLu|m@@W_%b=Y_3(p~p<#Os(BhH|pe+EJ}gfrK-n9wpK zajd>`0_&pv@%g1TEVc4L`M27{A#O66e^8gqD^sEq-VLGZni5vhA`^<5l<=Ld2rcWG zgLUQ&7|fTXj_vEwOV|+~rx?>IzUkn2P?pAyC1U6&b5!?Kr&_az(B@VWW{)moeM`qt zN2y;-fZ8i~c}j~k*+s*Zn;oEf`VG$c@EC6FHzAAT%fP<)6C_w#f@)R`Q*KwziYc4Y zS@YxYbHOB)a-T>$ zKd)yZ#lExca(d*Aj|m-;HK36S3;C)Qp7^Wt9sJ;Qz=Z=*Skb3OL>`*M0m*MLsML)& zM5d730aIdj{XEvqm`F2@Nzu-?g{VDXLiB3;v3<1#Y`(V+opVEqI_mFWE=HRY_PjBc$t%(X zHig}jxf$h$4T;C6b13ZJ4Xz%?SU=ZEbgjHTnf|K)?S?O5q)(+iZ=V@sr6*5D9b1_T zVH(6!tDn(PD`4_o<)ERn4DDF!2fRx<^y_*#(sU#pR!UfqI|t>6#pn!(PRRgX;!=U( z@+$C4kfoohr7+3Q4VM^;(bn}sq->1{#KjbV#oHLBa@=`{?Kp}{Jno{V=mb*sWewD( z=i;TElgRT1LmKsUB8eOP0xzs9IFH3~Y*F-wG~0=Er>_jn`YuLzK7RHFGpA8@-AwrE zdlK_d1@`lei0wIV)X6Sq1J~(bw_ZK?y)c9ui#5ohOL(6cx;DnrUTb*vAJgy`7eN2t6k2j^#xoeQeoM7K4973HXIgHlHIEWOI;j4z(W8jpj zG`&QRnvH4zvvE6)yQxgqMYNz}_$0y3gOVhq`ydm!;xRu~c`AB&zd`%^>Xc{VYM1qRzpSI6-(S`P-I&vk&Nzq#duAo@6;H_i8CKt*Hii*DqA&-*1HEcT$|U zY7lxKa`VEy3N*X75Uy`X0^S^f-QXb}z2N&2C5^vBYGV!_YyS_c%)|r%I^{U*vIVa9 zeS|fJw(#l`_q((mfv1trAYN06BuG!An+hv%O`-*t+u(R27QuXlua}W`;FFBvLF0x2a#lvVGZTZvBYU?RHeo{qPoVw_V2Q!G2^G z@X`P1VSF6RWw2K~0H1AY)O?yYiTIYy1|}-ti(+favvCnr<=+%^hGYx0JpX}Sh8Hf5 zL+pIo#AZi67euCQXK##5B~5BmsN6eLhZ(SzS#pR`j*O*eZ6s~5Ul83zh zwGeb}4;r?Z(0s+4p!cT-qPaXmruR=6dd#i$fnfq)&d>B>sF2N>c@vg7#j)2m7?VTO zUgPk;)yT`R;-ir@Nt5nnhV33;?*v&ATd)PD#irrm_tqpyUKb<#n{bf*!G5hUA-sGS zMpCr`{^aY^o0$^iu;C|Ma#xvnF1g6=ihK?~s#U1+Pa)d6#)A5RG|h|UVUX5(^tRKc zog>yTDd0VR6dOlXB_z?!trGldUt;JXGrZ;XA8vo3LErrOiG#*Uz+5;1u06AG#-TcB zPSK@>en!N#H2_p!nSocJDMmi+WEQB)(O2;XRPsqW?C5=nzvOSDWKkg`GxCI2H=E7Y z4u{4G(}1UE!?&5HNC#_@SRIe$%&rF&(B>YGzBk9w;3w;#v)G6U-Za2Q9F>Mfg=F~c zcmUi2g{Z|bb*SoSVy;Wx0+mq>5;1p>Ev3L{DG1SqV^&n$MUQ++RfQ2B9$iv0nI@!O zN4ZLE;7#Ze>`YOn@w{sMGPjTOBYr^F(3_0m_B?pDOqP67SPvWK>Jr`l=~Q_14Ggo^ zoGmksgyl)`*`XU-TKUWEOGYaf8+oHfF=LQDWSYX_S0%(((O0Vzz1Oa#QK`Uwu za*PjQ&fX4?UAP53CvqA~ln5F1&SB%GO`{`Yg%I$d7@|g|kfc>nu=}|=oh^D7dupU{ z%eXp>{_hiha$gFWJAOcoRW$slS0M{Z>XFxDAQ;rCU>#r069~)Q#zjXh$UG@A@@(ZT z6!q68>E5lFxH1LqGoPS5!v>QTjmW<3zp*jO9D`rJVJ@GZNG*+W;PclET;XUyzBFCO ztGNOcX&Ay#9nSx})SSMYDND+WGVoV2S7VqP)9gQ1bp4hgJRN97(#2lmj17sC#F+Z>qX6EIm{obb*B!c+$_T6c}x8#`|R=3gM@bG(^5yYz_e-AN?m3Xh5Q z%Y-uferB2=YU@KYzBg<|PLTYcI!4d&} zk~c-SQ3+a^Do@*9mtbYsOK@~d=F4Wv(<3&|KwlwonsVI^d5yy&=9$EXRB@ zkS8yzCy+qdR)*It0ea6;kjHbe&!~KdZz{y-RPhP4qij4>e>)8!vqG79S2OUhZ!SDu zC4qsqJ%Vb-TF_M2B6*)o!AM1cUR@JvPaH;sges*-2- z6iN1kzgU%4$Sm&ahabL5bk+MuSY#$l4ffu{zdb$BU2zB8jg*jQ_k%w);~RK-w6V>d z!c^j>B5{nAf;R@^=*w19S_k8~+Vv^cHk;GNck7wv7%A%H*9v{hu7Jwvhgh@sHKsYK zLX?F%Imq#?Mz$BgnI+@N!C^6CXUw9nrWE}(+=|&&lUW~$PQ2NsLNiu`LC2s9eJlPI zJAY^Lv-fb<{;kFi6mVQ8c7UDItwjB=jwd@@vr+q@EDinT13AKXu+UPHyezRG=A+Yz zb+-=w*HMddZSxu4%CSne+YCkxN|^TQQY`ZOj4=tybZFl(*kAS(&vE(Olnrv!YJ)Cy zaMmL?rH-MR$syPyTL^m6zhTWob%@>_(i_<{B}1PQKfc;Dv_G)Ie(ZO#(J zwY`ktos+5JBMVw|wi`#Y^O-k;zd=~J4c`wsV_H}aEAD4bBkVu2C7&meaMcf3 zxJHz?Cai~|B@-baxEVSmML_LIDYV-dP-cu{g`H5N6IfFcDbZe8pYQ}Di&wHt)N5Sd 
zEJJk)ji|rWGn}=P^GYo|C(t`P0ta<|V|m_Au-z1nBkj3pwrdvSPWaxhkL5vF}LpYncM@Ng@oblxXDk*5yqed5xPR< zAu2yV4Z{8z@cDEv82s(WB}Td=WZZIeEziPFo1($}qc2!oT*@1tfj>?CCq~;U&vdxn8Wc^}Vq{4BPmjsDXu%^sw8LD@%5F0l;Gnc-K z(8%*q{NIc8VNPfk9BqAq%;IE7(zIqv^@G^+$|iJdnlWv>ew;}qy3~697pzIp2cMh9 z)JV;Ml)jOJp0Q*c(2^qy?oK5_idS&shBMG3)WPjlSPD5f zk@MoEJr#n%E3#BcQkMijTgP#p-hi)04-7w>izb|ZomV!!LM~#M)p>iLzmX}1Hp7Wz z>8r_ft*ZuosGkilr-;yNnN}qFILBQWb7q#B8Pk5wclx=Z72GQRu=P z2`Pkq5?q~IO4!xn+C<(wKDPo-aus-WP^_P! zNn+hJ(3T&|j-Rf9ms~!e`!|4wqZ(xY6f<(fONS0|xtA~DEm(Z(CU_=aXS&~AfyT-I zv5p&xaYckYkyA)uCDW~G@#iVj^HeKyljHhTJwD9n%1z&gl#~ZZu~8{ZypJ5~*zGqYgo*{A>0y6VGuPq6Mx251B7r<4Kax zPFC10ljCt%(ej{ZR>#SbsR`0Tx9|5cYnu#7ey2tcCdd*||541SnFO1+n?tg#C9#NC zp}bRj?A!{BX|28}si}E~KYxgjKi?EdLdimIED96bs0220um{&FC~X!J zqL;>9z`6`qnCfu}jy($mPrVsPX6jJA0tI^C=@xidSYWtiubyY*p1B#+$x9R+n+-x+cx8>SbHBd^->pkb*lQIof#ie}bCxw`@_R`)TFm3gRN z#W5fsaek(mkMYHsQg&s7C0U@WNlNxiCqF@fR$BBT8y?_sdn z666^O?8pC1U_87tAV>TS#Mzw1?Dug3xxfI-asC4{Jg&gqTt)Kx-81~V;vH=5&4=y} z|3SR@6dDm`17_RJXs4ALBrKK2%We)hsxggz{W+CX72HOTy2n7||3c)Kv&>U0ft)Az zk*p0v{fbk#mc57>;>%!7>L5lA5ay$b2npU+i^3QBA#HXME8~)l#mj3UqOh8=JmrTX z2h{1fKQg3ORh169tYL+s^09SuJvIzAV0Moz2IkI2y}#Aq>LgFI2ix)P(R{S^uV6Zc zM^Jf*9NpcnL|!l-+2_X;srR2Th_IF5a)@nMrMnH6eBl@v`P$@f;}q%@77EhmPhwx! z9|)0ZL6?&ka83Ua_+WS)^Na%E_1$q~XXPby4PFGDV|{FD*gbf#W+GXfRt=F$Gz3G5qT&tZ#7R;~3YgretW{Vm|PO%b{mdjGD8xeTN zO`h~O$k6DS3iReu9vOeR3xD_eK&GxHZQ7lISw$0wXTAjF#o1u+hrf*DHWLUq`wTYr zn9zR-mZZ`97JE6w9abhhMQs}{do-X!nbs5t9R1Ds>}kXEKcwk&vq_}MVi;WO`WY=Q ztLpyO87|LiW7G5&L3zP_Ca3Evn0Q`6Cg?aArbolMX}QRLm`=xY?`xWkQJhSm8SG_D z^=g2Uea57$?>H=%rkMTv5xdT*4K=DIX~s@TOew5|*>^tRkE_Nc&+HgRmds>NSBsIP zhEUcx@C@V2arfd67?H{DC0NgsAR7LAu&2)*dDasd@gOy-v1dQrb5JFfiNEl&HMB6XtD?7+Ek3#)o7 zS^LjdVD6bwoYY@{I-8={u(dZZWYmhPPL!a{z1PvQE(|TrHsOc$lc~Ms7YO}N3GJ_R zLe)`k#?f>!p$p_R)?AnEx;pYSEtZu=$1&ZWh*i~4z@(@Vx@P(>#ax{J50y^I0G_Ih{@M6_9 z%=*dIZ5t)1sz?^os3-)fR$MMT*b8hnaQm02dVy%9B^4{k1i7?_xUYISsDl|jx2+I1 zubl)RTVA4$@ex6v-yvLf=``cpSPFIX#?f&+Et1;B)sq@u!E=yBH&JzJ*Bg&KpS@70 z62s2DZAQtp0T}Qyru)Z)XynHy?B*RZu%SwsbUQ3VXJJ=(={kjoZ5%~Mnb-QXIejXL37tyj)crtlE(0A}E{5Y79)v`PFvk|` zMAyj!%z@K?kbSI*&R|I){%aVK9XZFZqgQvOn;ZQV|I0>ye0ZTew+cNPMhJNka8S zHdGdH@a{V1K+HWHYcV3jUn?+hKc8jxxq)}MGTjmJ0iLb8f-}~Kq4QJ)`Y%?QtoAV> zyDlBZZ=xEcr`HzQLzYxcI1Pqx>A{1l$;2W{iPf%v*Z^^i=x6crzbv zC7p%uT+e??q&4YNIth_4zgM=YSrVR8gWW$XDdN1|7+04V(XIveLE-9FE)Qu**Quz| zf;Llf+f9n-WNu+cj78`-B~2oIX*=#-R*1h(PJrx_NiaV4F6$Y7hn=(MCB7ChqJ{}g z@XPNPR-L-cT6;-=#kMsV5u;E0QesiTOO{sO{)0Q)Ea@_xKDM?@lUg5>1dWZeVAkp^ zJn(HgElS%9&vd7f`!c_|`}2q30=IwsCwms>4QrF3seAG922-En;1e!Rx1Oy+zNsj!lB!`3jvvCr^)F%B$Th}X^cyY>eF+xH=%U59OY%i*tsFPyvj9@J-ZYu2=5 zOfQ#NF4l~ME@Dg{`rd-b{}_IR&LHa;ZpzjTe_}bcfVxf-!KqJgLgSO|OjVp3%iFP* zzaY<$=*W!6R81wiZq_Jx95klGw?3lEr%5E%GaBw+*P>O1`ux(_u29!@iaE*gjsMP7 zqCNYTL7~DA)Ojb$aZA6$*g|#kOiGRL*3W`qS5f47|6(JfZ?L1UmO)itJwM$}nB?|r z(0b8}=)LDARM~a2wZ{zT!@deIek@NNH;U81mHXHY!pRu1KL8d~Dv%@J?|_K97LMge za{Z#|82PW=e%^p030W`29$54cy3Dx!+IlJGL7fx{Xuc16=1*aHk~OiI84sU4y)h{I zJG?IA_EzOrv2<$()2Wmxh+Lt8bG^h!M^hBXzR+d<^Z9|YlexaFw>8lk$b-$lMexP@ zi_Db#*RV5)(|SD?!s)wKR7{}&ro0YASwDm|k*9FBR2F)T+Cs%CJ>vMkTmb(+ULv`y z$^Xmq|HpUuf4Ii~$pi6%Z3Vr0au|GE3zmcou;Fdupqx07?lqqcwqsxL_52a$?4sLX zuyH@kp0QU@oMB8HLNWwa&jQH3hbvIz&^o5CsS#{hBM`NXZS zsegUQz-l3O?(bBVvDBv{_69V#*n^fW0vhW&na;V;&1_zoj)C2suvTggIdkCzs!d1* z?N4uUuWr784C^s%hwGq$M55`9C_Fjj47R0?)P8gkdAFhj>Rufus~Zw=zV&Z*QR_VX zec~wl@8AuHFe=4|lW*gaJS95tR}4b>U!mHZR;c>sOZ;qJqJxht;l?riozD>agn#hB zWHD8D$-&vfvoPzDHM~gM2{}RC?2U!fq3GczDxw{LI`8#ireh^O?M@=QGondllLJPm z-)0u;EMg-k_>;E_HiF16YpmK7gVS!Y*przE1G9F)p5nJ`)_7suJ~s(_U3p~Hl%3?K zo)hcb{fJfeoE5-9RMkd~ZgWpy?sgl} 
literal 0
HcmV?d00001

diff --git a/test/embeddings/test_bert_embedding.py b/test/embeddings/test_bert_embedding.py
index 6a4a0ffa..71511458 100644
--- a/test/embeddings/test_bert_embedding.py
+++ b/test/embeddings/test_bert_embedding.py
@@ -29,8 +29,11 @@ class TestDownload(unittest.TestCase):
 
 class TestBertEmbedding(unittest.TestCase):
     def test_bert_embedding_1(self):
-        vocab = Vocabulary().add_word_lst("this is a test .".split())
-        embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert')
+        vocab = Vocabulary().add_word_lst("this is a test . [SEP]".split())
+        embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert', word_dropout=0.1)
+        requires_grad = embed.requires_grad
+        embed.requires_grad = not requires_grad
+        embed.train()
         words = torch.LongTensor([[2, 3, 4, 0]])
         result = embed(words)
         self.assertEqual(result.size(), (1, 4, 16))
diff --git a/test/embeddings/test_elmo_embedding.py b/test/embeddings/test_elmo_embedding.py
index a087f0a4..bfb31659 100644
--- a/test/embeddings/test_elmo_embedding.py
+++ b/test/embeddings/test_elmo_embedding.py
@@ -18,4 +18,19 @@ class TestDownload(unittest.TestCase):
         # 首先保证所有权重可以加载;上传权重;验证可以下载
 
 
+class TestRunElmo(unittest.TestCase):
+    def test_elmo_embedding(self):
+        vocab = Vocabulary().add_word_lst("This is a test .".split())
+        elmo_embed = ElmoEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_elmo', layers='0,1')
+        words = torch.LongTensor([[0, 1, 2]])
+        hidden = elmo_embed(words)
+        print(hidden.size())
+
+    def test_elmo_embedding_layer_assertion(self):
+        vocab = Vocabulary().add_word_lst("This is a test .".split())
+        try:
+            elmo_embed = ElmoEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_elmo',
+                                       layers='0,1,2')
+        except AssertionError as e:
+            print(e)

From b5a7db0b669f6956a98300799e060977f8a45a55 Mon Sep 17 00:00:00 2001
From: ChenXin
Date: Thu, 5 Sep 2019 14:31:38 +0800
Subject: [PATCH 148/153] delete the output part in dot-utils

---
 fastNLP/doc_utils.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/fastNLP/doc_utils.py b/fastNLP/doc_utils.py
index 5801dd53..5f293d3f 100644
--- a/fastNLP/doc_utils.py
+++ b/fastNLP/doc_utils.py
@@ -1,3 +1,7 @@
+"""undocumented"""
+
+__all__ = []
+
 import inspect
 import sys
 
@@ -7,7 +11,8 @@ def doc_process(m):
         if inspect.isclass(obj) or
inspect.isfunction(obj): if obj.__module__ != m.__name__: if obj.__doc__ is None: - print(name, obj.__doc__) + # print(name, obj.__doc__) + pass else: module_name = obj.__module__ while 1: @@ -18,5 +23,5 @@ def doc_process(m): break module_name = ".".join(module_name.split('.')[:-1]) if module_name == m.__name__: - print(name, ": not found defined doc.") + # print(name, ": not found defined doc.") break From 5b7e9b6572ff980c9b536b3b8a8b5ea526bd2ad6 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Thu, 5 Sep 2019 14:32:37 +0800 Subject: [PATCH 149/153] update the ChnSentiCorpPipe in docs --- docs/source/fastNLP.io.loader.rst | 2 +- docs/source/fastNLP.io.pipe.rst | 2 +- docs/source/fastNLP.io.rst | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/fastNLP.io.loader.rst b/docs/source/fastNLP.io.loader.rst index 060b5450..c1af6c0c 100644 --- a/docs/source/fastNLP.io.loader.rst +++ b/docs/source/fastNLP.io.loader.rst @@ -2,6 +2,6 @@ fastNLP.io.loader ================= .. automodule:: fastNLP.io.loader - :members: Loader, YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader, MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader, CSVLoader, JsonLoader, CWSLoader, MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader + :members: Loader, YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, ChnSentiCorpLoader, ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader, MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader, CSVLoader, JsonLoader, CWSLoader, MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader :inherited-members: diff --git a/docs/source/fastNLP.io.pipe.rst b/docs/source/fastNLP.io.pipe.rst index d35d2ddc..3ef9b5a8 100644 --- a/docs/source/fastNLP.io.pipe.rst +++ b/docs/source/fastNLP.io.pipe.rst @@ -2,6 +2,6 @@ fastNLP.io.pipe =============== .. automodule:: fastNLP.io.pipe - :members: Pipe, CWSPipe, YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, WeiboNERPipe, PeopleDailyPipe, Conll2003Pipe, MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe + :members: Pipe, CWSPipe, YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, WeiboNERPipe, PeopleDailyPipe, Conll2003Pipe, MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe :inherited-members: diff --git a/docs/source/fastNLP.io.rst b/docs/source/fastNLP.io.rst index 96df9d6c..7118039d 100644 --- a/docs/source/fastNLP.io.rst +++ b/docs/source/fastNLP.io.rst @@ -2,7 +2,7 @@ fastNLP.io ========== .. 
automodule:: fastNLP.io - :members: DataBundle, EmbedLoader, Loader, YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader, MsraNERLoader, WeiboNERLoader, PeopleDailyNERLoader, CSVLoader, JsonLoader, CWSLoader, MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader, Pipe, YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, Conll2003Pipe, Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, PeopleDailyPipe, WeiboNERPipe, CWSPipe, MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe, ModelLoader, ModelSaver + :members: DataBundle, EmbedLoader, Loader, YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, ChnSentiCorpLoader, ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader, MsraNERLoader, WeiboNERLoader, PeopleDailyNERLoader, CSVLoader, JsonLoader, CWSLoader, MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader, Pipe, YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, Conll2003Pipe, Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, PeopleDailyPipe, WeiboNERPipe, CWSPipe, MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe, ModelLoader, ModelSaver :inherited-members: 子模块 From f004a070b4606fa509f6d55ea70a8ac9a82766af Mon Sep 17 00:00:00 2001 From: ChenXin Date: Thu, 5 Sep 2019 15:13:08 +0800 Subject: [PATCH 150/153] update the doc tool --- docs/count.py | 47 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 40 insertions(+), 7 deletions(-) diff --git a/docs/count.py b/docs/count.py index 6a5d256b..7118216a 100644 --- a/docs/count.py +++ b/docs/count.py @@ -23,6 +23,13 @@ def _colored_string(string: str, color: str or int) -> str: return "\033[%dm%s\033[0m" % (color, string) +def gr(string, flag): + if flag: + return _colored_string(string, "green") + else: + return _colored_string(string, "red") + + def find_all_modules(): modules = {} children = {} @@ -79,20 +86,46 @@ def create_rst_file(modules, name, children): def check_file(m, name): + names = name.split('.') + test_name = "test." 
+ ".".join(names[1:-1]) + ".test_" + names[-1] + try: + __import__(test_name) + tm = sys.modules[test_name] + except ModuleNotFoundError: + tm = None + tested = tm is not None + funcs = {} + classes = {} for item, obj in inspect.getmembers(m): - if inspect.isclass(obj) and obj.__module__ == name: - print(obj) - if inspect.isfunction(obj) and obj.__module__ == name: - print("FUNC", obj) + if inspect.isclass(obj) and obj.__module__ == name and not obj.__name__.startswith('_'): + this = (obj.__doc__ is not None, tested and obj.__name__ in dir(tm), {}) + for i in dir(obj): + func = getattr(obj, i) + if inspect.isfunction(func) and not i.startswith('_'): + this[2][i] = (func.__doc__ is not None, False) + classes[obj.__name__] = this + if inspect.isfunction(obj) and obj.__module__ == name and not obj.__name__.startswith('_'): + this = (obj.__doc__ is not None, tested and obj.__name__ in dir(tm)) # docs + funcs[obj.__name__] = this + return funcs, classes -def check_files(modules): +def check_files(modules, out=sys.stdout): for name in sorted(modules.keys()): - if name == 'fastNLP.core.utils': - check_file(modules[name], name) + print(name, file=out) + funcs, classes = check_file(modules[name], name) + for f in funcs: + print("%-30s \t %s \t %s" % (f, gr("文档", funcs[f][0]), gr("测试", funcs[f][1])), file=out) + for c in classes: + print("%-30s \t %s \t %s" % (c, gr("文档", classes[c][0]), gr("测试", classes[c][1])), file=out) + methods = classes[c][2] + for f in methods: + print(" %-28s \t %s" % (f, gr("文档", methods[f][0])), file=out) + print(file=out) def main(): + sys.path.append("..") print(_colored_string('Getting modules...', "Blue")) modules, to_doc, children = find_all_modules() print(_colored_string('Done!', "Green")) From 2fbc1d78518d6f75080da8bdab6ddaecd5d3cd87 Mon Sep 17 00:00:00 2001 From: unknown <793736331@qq.com> Date: Sat, 7 Sep 2019 15:22:43 +0800 Subject: [PATCH 151/153] change the print format for dataset and instance --- fastNLP/core/dataset.py | 101 ++++++++++++++-------------- fastNLP/core/instance.py | 20 +++--- fastNLP/core/utils.py | 138 +++++++++++++++++++++++++++------------ 3 files changed, 155 insertions(+), 104 deletions(-) diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index ebdc780f..36852b93 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -300,13 +300,14 @@ from .field import FieldArray from .field import SetInputOrTargetException from .instance import Instance from .utils import _get_func_signature +from .utils import pretty_table_printer class DataSet(object): """ fastNLP的数据容器,详细的使用方法见文档 :doc:`fastNLP.core.dataset` """ - + def __init__(self, data=None): """ @@ -326,26 +327,26 @@ class DataSet(object): for ins in data: assert isinstance(ins, Instance), "Must be Instance type, not {}.".format(type(ins)) self.append(ins) - + else: raise ValueError("data only be dict or list type.") - + def __contains__(self, item): return item in self.field_arrays - + def __iter__(self): def iter_func(): for idx in range(len(self)): yield self[idx] - + return iter_func() - + def _inner_iter(self): class Iter_ptr: def __init__(self, dataset, idx): self.dataset = dataset self.idx = idx - + def __getitem__(self, item): assert item in self.dataset.field_arrays, "no such field:{} in Instance {}".format(item, self.dataset[ self.idx]) @@ -358,13 +359,13 @@ class DataSet(object): def __repr__(self): return self.dataset[self.idx].__repr__() - + def inner_iter_func(): for idx in range(len(self)): yield Iter_ptr(self, idx) - + return inner_iter_func() - + def 
__getitem__(self, idx): """给定int的index,返回一个Instance; 给定slice,返回包含这个slice内容的新的DataSet。 @@ -397,20 +398,20 @@ class DataSet(object): return dataset else: raise KeyError("Unrecognized type {} for idx in __getitem__ method".format(type(idx))) - + def __getattr__(self, item): # Not tested. Don't use !! if item == "field_arrays": raise AttributeError if isinstance(item, str) and item in self.field_arrays: return self.field_arrays[item] - + def __setstate__(self, state): self.__dict__ = state - + def __getstate__(self): return self.__dict__ - + def __len__(self): """Fetch the length of the dataset. @@ -420,16 +421,10 @@ class DataSet(object): return 0 field = iter(self.field_arrays.values()).__next__() return len(field) - - def __inner_repr__(self): - if len(self) < 20: - return ",\n".join([ins.__repr__() for ins in self]) - else: - return self[:5].__inner_repr__() + "\n...\n" + self[-5:].__inner_repr__() - + def __repr__(self): - return "DataSet(" + self.__inner_repr__() + ")" - + return str(pretty_table_printer(self)) + def append(self, instance): """ 将一个instance对象append到DataSet后面。 @@ -454,7 +449,7 @@ class DataSet(object): except AppendToTargetOrInputException as e: logger.error(f"Cannot append to field:{name}.") raise e - + def add_fieldarray(self, field_name, fieldarray): """ 将fieldarray添加到DataSet中. @@ -469,7 +464,7 @@ class DataSet(object): raise RuntimeError(f"The field to add must have the same size as dataset. " f"Dataset size {len(self)} != field size {len(fieldarray)}") self.field_arrays[field_name] = fieldarray - + def add_field(self, field_name, fields, padder=AutoPadder(), is_input=False, is_target=False, ignore_type=False): """ 新增一个field @@ -481,14 +476,14 @@ class DataSet(object): :param bool is_target: 新加入的field是否是target :param bool ignore_type: 是否忽略对新加入的field的类型检查 """ - + if len(self.field_arrays) != 0: if len(self) != len(fields): raise RuntimeError(f"The field to add must have the same size as dataset. " f"Dataset size {len(self)} != field size {len(fields)}") self.field_arrays[field_name] = FieldArray(field_name, fields, is_target=is_target, is_input=is_input, padder=padder, ignore_type=ignore_type) - + def delete_instance(self, index): """ 删除第index个instance @@ -504,7 +499,7 @@ class DataSet(object): for field in self.field_arrays.values(): field.pop(index) return self - + def delete_field(self, field_name): """ 删除名为field_name的field @@ -538,7 +533,7 @@ class DataSet(object): if isinstance(field_name, str): return field_name in self.field_arrays return False - + def get_field(self, field_name): """ 获取field_name这个field @@ -549,7 +544,7 @@ class DataSet(object): if field_name not in self.field_arrays: raise KeyError("Field name {} not found in DataSet".format(field_name)) return self.field_arrays[field_name] - + def get_all_fields(self): """ 返回一个dict,key为field_name, value为对应的 :class:`~fastNLP.FieldArray` @@ -557,7 +552,7 @@ class DataSet(object): :return dict: 返回如上所述的字典 """ return self.field_arrays - + def get_field_names(self) -> list: """ 返回一个list,包含所有 field 的名字 @@ -565,7 +560,7 @@ class DataSet(object): :return list: 返回如上所述的列表 """ return sorted(self.field_arrays.keys()) - + def get_length(self): """ 获取DataSet的元素数量 @@ -573,7 +568,7 @@ class DataSet(object): :return: int: DataSet中Instance的个数。 """ return len(self) - + def rename_field(self, field_name, new_field_name): """ 将某个field重新命名. 
@@ -587,7 +582,7 @@ class DataSet(object): else: raise KeyError("DataSet has no field named {}.".format(field_name)) return self - + def set_target(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True): """ 将field_names的field设置为target @@ -614,7 +609,7 @@ class DataSet(object): else: raise KeyError("{} is not a valid field name.".format(name)) return self - + def set_input(self, *field_names, flag=True, use_1st_ins_infer_dim_type=True): """ 将field_names的field设置为input:: @@ -638,7 +633,7 @@ class DataSet(object): else: raise KeyError("{} is not a valid field name.".format(name)) return self - + def set_ignore_type(self, *field_names, flag=True): """ 将field设置为忽略类型状态。当某个field被设置了ignore_type, 则在被设置为target或者input时将不进行类型检查, @@ -655,7 +650,7 @@ class DataSet(object): else: raise KeyError("{} is not a valid field name.".format(name)) return self - + def set_padder(self, field_name, padder): """ 为field_name设置padder:: @@ -671,7 +666,7 @@ class DataSet(object): raise KeyError("There is no field named {}.".format(field_name)) self.field_arrays[field_name].set_padder(padder) return self - + def set_pad_val(self, field_name, pad_val): """ 为某个field设置对应的pad_val. @@ -683,7 +678,7 @@ class DataSet(object): raise KeyError("There is no field named {}.".format(field_name)) self.field_arrays[field_name].set_pad_val(pad_val) return self - + def get_input_name(self): """ 返回所有is_input被设置为True的field名称 @@ -691,7 +686,7 @@ class DataSet(object): :return list: 里面的元素为被设置为input的field名称 """ return [name for name, field in self.field_arrays.items() if field.is_input] - + def get_target_name(self): """ 返回所有is_target被设置为True的field名称 @@ -699,7 +694,7 @@ class DataSet(object): :return list: 里面的元素为被设置为target的field名称 """ return [name for name, field in self.field_arrays.items() if field.is_target] - + def apply_field(self, func, field_name, new_field_name=None, **kwargs): """ 将DataSet中的每个instance中的名为 `field_name` 的field传给func,并获取它的返回值。 @@ -728,16 +723,16 @@ class DataSet(object): results.append(func(ins[field_name])) except Exception as e: if idx != -1: - logger.error("Exception happens at the `{}`th(from 1) instance.".format(idx+1)) + logger.error("Exception happens at the `{}`th(from 1) instance.".format(idx + 1)) raise e if not (new_field_name is None) and len(list(filter(lambda x: x is not None, results))) == 0: # all None raise ValueError("{} always return None.".format(_get_func_signature(func=func))) - + if new_field_name is not None: self._add_apply_field(results, new_field_name, kwargs) - + return results - + def _add_apply_field(self, results, new_field_name, kwargs): """ 将results作为加入到新的field中,field名称为new_field_name @@ -769,7 +764,7 @@ class DataSet(object): self.add_field(field_name=new_field_name, fields=results, is_input=extra_param.get("is_input", None), is_target=extra_param.get("is_target", None), ignore_type=extra_param.get("ignore_type", False)) - + def apply(self, func, new_field_name=None, **kwargs): """ 将DataSet中每个instance传入到func中,并获取它的返回值. 
@@ -801,13 +796,13 @@ class DataSet(object): # results = [func(ins) for ins in self._inner_iter()] if not (new_field_name is None) and len(list(filter(lambda x: x is not None, results))) == 0: # all None raise ValueError("{} always return None.".format(_get_func_signature(func=func))) - + if new_field_name is not None: self._add_apply_field(results, new_field_name, kwargs) - + return results - def add_seq_len(self, field_name:str, new_field_name=Const.INPUT_LEN): + def add_seq_len(self, field_name: str, new_field_name=Const.INPUT_LEN): """ 将使用len()直接对field_name中每个元素作用,将其结果作为seqence length, 并放入seq_len这个field。 @@ -844,7 +839,7 @@ class DataSet(object): return dataset else: return DataSet() - + def split(self, ratio, shuffle=True): """ 将DataSet按照ratio的比例拆分,返回两个DataSet @@ -870,9 +865,9 @@ class DataSet(object): for field_name in self.field_arrays: train_set.field_arrays[field_name].to(self.field_arrays[field_name]) dev_set.field_arrays[field_name].to(self.field_arrays[field_name]) - + return train_set, dev_set - + def save(self, path): """ 保存DataSet. @@ -881,7 +876,7 @@ class DataSet(object): """ with open(path, 'wb') as f: pickle.dump(self, f) - + @staticmethod def load(path): r""" diff --git a/fastNLP/core/instance.py b/fastNLP/core/instance.py index 9460b5e4..3cf7ab45 100644 --- a/fastNLP/core/instance.py +++ b/fastNLP/core/instance.py @@ -3,10 +3,13 @@ instance 模块实现了Instance 类在fastNLP中对应sample。一个sample可 便于理解的例子可以参考文档 :doc:`fastNLP.core.dataset` 中的表格 """ + __all__ = [ "Instance" ] +from .utils import pretty_table_printer + class Instance(object): """ @@ -20,11 +23,11 @@ class Instance(object): >>>ins.add_field("field_3", [3, 3, 3]) >>>ins = Instance(**{'x1': 1, 'x2':np.zeros((3, 4))}) """ - + def __init__(self, **fields): - + self.fields = fields - + def add_field(self, field_name, field): """ 向Instance中增加一个field @@ -41,18 +44,15 @@ class Instance(object): :return: 一个迭代器 """ return self.fields.items() - + def __getitem__(self, name): if name in self.fields: return self.fields[name] else: raise KeyError("{} not found".format(name)) - + def __setitem__(self, name, field): return self.add_field(name, field) - + def __repr__(self): - s = '\'' - return "{" + ",\n".join( - "\'" + field_name + "\': " + str(self.fields[field_name]) + \ - f" type={(str(type(self.fields[field_name]))).split(s)[1]}" for field_name in self.fields) + "}" + return str(pretty_table_printer(self)) diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index 814e0bd5..dd2afab7 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -1,6 +1,7 @@ """ utils模块实现了 fastNLP 内部和外部所需的很多工具。其中用户可以使用的是 :func:`cache_results` 修饰器。 """ + __all__ = [ "cache_results", "seq_len_to_mask", @@ -12,12 +13,12 @@ import inspect import os import warnings from collections import Counter, namedtuple - import numpy as np import torch import torch.nn as nn from typing import List from ._logger import logger +from prettytable import PrettyTable _CheckRes = namedtuple('_CheckRes', ['missing', 'unused', 'duplicated', 'required', 'all_needed', 'varargs']) @@ -25,27 +26,27 @@ _CheckRes = namedtuple('_CheckRes', ['missing', 'unused', 'duplicated', 'require class Option(dict): """a dict can treat keys as attributes""" - + def __getattr__(self, item): try: return self.__getitem__(item) except KeyError: raise AttributeError(item) - + def __setattr__(self, key, value): if key.startswith('__') and key.endswith('__'): raise AttributeError(key) self.__setitem__(key, value) - + def __delattr__(self, item): try: self.pop(item) except KeyError: raise 
AttributeError(item) - + def __getstate__(self): return self - + def __setstate__(self, state): self.update(state) @@ -112,13 +113,13 @@ def cache_results(_cache_fp, _refresh=False, _verbose=1): :param int _verbose: 是否打印cache的信息。 :return: """ - + def wrapper_(func): signature = inspect.signature(func) for key, _ in signature.parameters.items(): if key in ('_cache_fp', '_refresh', '_verbose'): raise RuntimeError("The function decorated by cache_results cannot have keyword `{}`.".format(key)) - + def wrapper(*args, **kwargs): if '_cache_fp' in kwargs: cache_filepath = kwargs.pop('_cache_fp') @@ -136,7 +137,7 @@ def cache_results(_cache_fp, _refresh=False, _verbose=1): else: verbose = _verbose refresh_flag = True - + if cache_filepath is not None and refresh is False: # load data if os.path.exists(cache_filepath): @@ -145,7 +146,7 @@ def cache_results(_cache_fp, _refresh=False, _verbose=1): if verbose == 1: logger.info("Read cache from {}.".format(cache_filepath)) refresh_flag = False - + if refresh_flag: results = func(*args, **kwargs) if cache_filepath is not None: @@ -155,11 +156,11 @@ def cache_results(_cache_fp, _refresh=False, _verbose=1): with open(cache_filepath, 'wb') as f: _pickle.dump(results, f) logger.info("Save cache to {}.".format(cache_filepath)) - + return results - + return wrapper - + return wrapper_ @@ -187,6 +188,7 @@ def _save_model(model, model_name, save_dir, only_param=False): torch.save(model, model_path) model.to(_model_device) + def _move_model_to_device(model, device): """ 将model移动到device @@ -211,7 +213,7 @@ def _move_model_to_device(model, device): """ # if isinstance(model, torch.nn.parallel.DistributedDataParallel): # raise RuntimeError("model of `torch.nn.parallel.DistributedDataParallel` is not supported right now.") - + if device is None: if isinstance(model, torch.nn.DataParallel): model.cuda() @@ -220,10 +222,10 @@ def _move_model_to_device(model, device): if not torch.cuda.is_available() and ( device != 'cpu' or (isinstance(device, torch.device) and device.type != 'cpu')): raise ValueError("There is no usable gpu. set `device` as `cpu` or `None`.") - + if isinstance(model, torch.nn.DataParallel): raise RuntimeError("When model is `torch.nn.DataParallel`, the device has to be `None`.") - + if isinstance(device, int): assert device > -1, "device can only be non-negative integer" assert torch.cuda.device_count() > device, "Only has {} gpus, cannot use device {}.".format( @@ -267,7 +269,7 @@ def _get_model_device(model): """ # TODO 这个函数存在一定的风险,因为同一个模型可能存在某些parameter不在显卡中,比如BertEmbedding. 或者跨显卡 assert isinstance(model, nn.Module) - + parameters = list(model.parameters()) if len(parameters) == 0: return None @@ -427,10 +429,10 @@ def _move_dict_value_to_device(*args, device: torch.device, non_blocking=False): """ if not torch.cuda.is_available(): return - + if not isinstance(device, torch.device): raise TypeError(f"device must be `torch.device`, got `{type(device)}`") - + for arg in args: if isinstance(arg, dict): for key, value in arg.items(): @@ -445,10 +447,10 @@ class _CheckError(Exception): _CheckError. Used in losses.LossBase, metrics.MetricBase. 
""" - + def __init__(self, check_res: _CheckRes, func_signature: str): errs = [f'Problems occurred when calling `{func_signature}`'] - + if check_res.varargs: errs.append(f"\tvarargs: {check_res.varargs}(Does not support pass positional arguments, please delete it)") if check_res.missing: @@ -457,9 +459,9 @@ class _CheckError(Exception): errs.append(f"\tduplicated param: {check_res.duplicated}") if check_res.unused: errs.append(f"\tunused param: {check_res.unused}") - + Exception.__init__(self, '\n'.join(errs)) - + self.check_res = check_res self.func_signature = func_signature @@ -479,7 +481,7 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re # if check_res.varargs: # errs.append(f"\tvarargs: *{check_res.varargs}") # suggestions.append(f"Does not support pass positional arguments, please delete *{check_res.varargs}.") - + if check_res.unused: for _unused in check_res.unused: if _unused in target_dict: @@ -490,7 +492,7 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re unuseds.append(f"\tunused field: {_unused_field}") if _unused_param: unuseds.append(f"\tunused param: {_unused_param}") # output from predict or forward - + module_name = func_signature.split('.')[0] if check_res.missing: errs.append(f"\tmissing param: {check_res.missing}") @@ -511,7 +513,7 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re mapped_missing.append(_miss) else: unmapped_missing.append(_miss) - + for _miss in mapped_missing + unmapped_missing: if _miss in dataset: suggestions.append(f"Set `{_miss}` as target.") @@ -524,17 +526,17 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re else: _tmp = f'Provide `{_miss}` in DataSet or output of {prev_func_signature}.' suggestions.append(_tmp) - + if check_res.duplicated: errs.append(f"\tduplicated param: {check_res.duplicated}.") suggestions.append(f"Delete {check_res.duplicated} in the output of " f"{prev_func_signature} or do not set {check_res.duplicated} as targets. ") - + if len(errs) > 0: errs.extend(unuseds) elif check_level == STRICT_CHECK_LEVEL: errs.extend(unuseds) - + if len(errs) > 0: errs.insert(0, f'Problems occurred when calling {func_signature}') sugg_str = "" @@ -561,11 +563,11 @@ def _check_loss_evaluate(prev_func_signature: str, func_signature: str, check_re def _check_forward_error(forward_func, batch_x, dataset, check_level): check_res = _check_arg_dict_list(forward_func, batch_x) func_signature = _get_func_signature(forward_func) - + errs = [] suggestions = [] _unused = [] - + # if check_res.varargs: # errs.append(f"\tvarargs: {check_res.varargs}") # suggestions.append(f"Does not support pass positional arguments, please delete *{check_res.varargs}.") @@ -586,14 +588,14 @@ def _check_forward_error(forward_func, batch_x, dataset, check_level): # _tmp += f"Or you might find it in `unused field:`, you can use DataSet.rename_field() to " \ # f"rename the field in `unused field:`." 
suggestions.append(_tmp) - + if check_res.unused: _unused = [f"\tunused field: {check_res.unused}"] if len(errs) > 0: errs.extend(_unused) elif check_level == STRICT_CHECK_LEVEL: errs.extend(_unused) - + if len(errs) > 0: errs.insert(0, f'Problems occurred when calling {func_signature}') sugg_str = "" @@ -641,7 +643,7 @@ def seq_len_to_mask(seq_len, max_len=None): max_len = int(max_len) if max_len else int(seq_len.max()) broad_cast_seq_len = np.tile(np.arange(max_len), (len(seq_len), 1)) mask = broad_cast_seq_len < seq_len.reshape(-1, 1) - + elif isinstance(seq_len, torch.Tensor): assert seq_len.dim() == 1, f"seq_len can only have one dimension, got {seq_len.dim() == 1}." batch_size = seq_len.size(0) @@ -650,7 +652,7 @@ def seq_len_to_mask(seq_len, max_len=None): mask = broad_cast_seq_len.lt(seq_len.unsqueeze(1)) else: raise TypeError("Only support 1-d numpy.ndarray or 1-d torch.Tensor.") - + return mask @@ -658,24 +660,25 @@ class _pseudo_tqdm: """ 当无法引入tqdm,或者Trainer中设置use_tqdm为false的时候,用该方法打印数据 """ + def __init__(self, **kwargs): self.logger = logger - + def write(self, info): self.logger.info(info) - + def set_postfix_str(self, info): self.logger.info(info) - + def __getattr__(self, item): def pass_func(*args, **kwargs): pass - + return pass_func - + def __enter__(self): return self - + def __exit__(self, exc_type, exc_val, exc_tb): del self @@ -749,3 +752,56 @@ def get_seq_len(words, pad_value=0): """ mask = words.ne(pad_value) return mask.sum(dim=-1) + + +def pretty_table_printer(dataset_or_ins) -> PrettyTable: + """ + :param dataset_or_ins: 传入一个dataSet或者instance + ins = Instance(field_1=[1, 1, 1], field_2=[2, 2, 2], field_3=["a", "b", "c"]) + +-----------+-----------+-----------------+ + | field_1 | field_2 | field_3 | + +-----------+-----------+-----------------+ + | [1, 1, 1] | [2, 2, 2] | ['a', 'b', 'c'] | + +-----------+-----------+-----------------+ + :return: 以 pretty table的形式返回根据terminal大小进行自动截断 + """ + x = PrettyTable() + try: + sz = os.get_terminal_size() + column = sz.columns + row = sz.lines + except OSError: + column = 144 + row = 11 + if type(dataset_or_ins).__name__ == "DataSet": + x.field_names = list(dataset_or_ins.field_arrays.keys()) + c_size = len(x.field_names) + for ins in dataset_or_ins: + x.add_row([sub_column(ins[k], column, c_size, k) for k in x.field_names]) + row -= 1 + if row < 0: + x.add_row(["..." for _ in range(c_size)]) + break + elif type(dataset_or_ins).__name__ == "Instance": + x.field_names = list(dataset_or_ins.fields.keys()) + c_size = len(x.field_names) + x.add_row([sub_column(dataset_or_ins[k], column, c_size, k) for k in x.field_names]) + + else: + raise Exception("only accept DataSet and Instance") + return x + + +def sub_column(string: str, c: int, c_size: int, title: str) -> str: + """ + :param string: 要被截断的字符串 + :param c: 命令行列数 + :param c_size: instance或dataset field数 + :param title: 列名 + :return: 对一个过长的列进行截断的结果 + """ + avg = max(int(c / c_size), len(title)) + string = str(string) + if len(string) > avg: + string = string[:(avg - 3)] + "..." 
+ return string From 8c8e22cc9baa08a1c8ee9ba887717db41cce57b5 Mon Sep 17 00:00:00 2001 From: yh_cc Date: Sat, 7 Sep 2019 18:47:03 +0800 Subject: [PATCH 152/153] =?UTF-8?q?DataSet=E4=B8=AD=E5=A2=9E=E5=8A=A0print?= =?UTF-8?q?=5Ffield=5Fmeta=E6=96=B9=E6=B3=95=EF=BC=8C=E4=BD=BF=E5=BE=97?= =?UTF-8?q?=E5=85=B6=E5=8F=AF=E4=BB=A5=E8=8E=B7=E5=8F=96field=E7=9A=84inpu?= =?UTF-8?q?t=E5=92=8Ctarget=E4=BF=A1=E6=81=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/dataset.py | 57 +++++++++++++++++++++++++++++++++++++++ fastNLP/core/field.py | 7 +++-- requirements.txt | 1 + test/core/test_dataset.py | 15 ++++++++++- 4 files changed, 77 insertions(+), 3 deletions(-) diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 36852b93..2b548f22 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -301,6 +301,7 @@ from .field import SetInputOrTargetException from .instance import Instance from .utils import _get_func_signature from .utils import pretty_table_printer +from prettytable import PrettyTable class DataSet(object): @@ -425,6 +426,62 @@ class DataSet(object): def __repr__(self): return str(pretty_table_printer(self)) + def print_field_meta(self): + """ + 输出当前field的meta信息, 形似下列的输出 + + +-------------+-------+-------+ + | field_names | x | y | + +-------------+-------+-------+ + | is_input | True | False | + | is_target | False | False | + | ignore_type | False | | + | pad_value | 0 | | + +-------------+-------+-------+ + + field_names: DataSet中field的名称 + is_input: field是否为input + is_target: field是否为target + ignore_type: 是否忽略该field的type, 一般仅在该field至少为input或target时才有意义 + pad_value: 该field的pad的值,仅在该field为input或target时有意义 + + :return: + """ + if len(self.field_arrays)>0: + field_names = ['field_names'] + is_inputs = ['is_input'] + is_targets = ['is_target'] + pad_values = ['pad_value'] + ignore_types = ['ignore_type'] + + for name, field_array in self.field_arrays.items(): + field_names.append(name) + if field_array.is_input: + is_inputs.append(True) + else: + is_inputs.append(False) + if field_array.is_target: + is_targets.append(True) + else: + is_targets.append(False) + + if (field_array.is_input or field_array.is_target) and field_array.padder is not None: + pad_values.append(field_array.padder.get_pad_val()) + else: + pad_values.append(' ') + + if field_array._ignore_type: + ignore_types.append(True) + elif field_array.is_input or field_array.is_target: + ignore_types.append(False) + else: + ignore_types.append(' ') + table = PrettyTable(field_names=field_names) + fields = [is_inputs, is_targets, ignore_types, pad_values] + for field in fields: + table.add_row(field) + logger.info(table) + def append(self, instance): """ 将一个instance对象append到DataSet后面。 diff --git a/fastNLP/core/field.py b/fastNLP/core/field.py index 82fcc523..1835bafa 100644 --- a/fastNLP/core/field.py +++ b/fastNLP/core/field.py @@ -53,7 +53,7 @@ class FieldArray: self.content = _content self._ignore_type = ignore_type # 根据input的情况设置input,target等 - self._cell_ndim = None # 多少维度 + self._cell_ndim = None # 多少维度, 如果value是1, dim为0; 如果value是[1, 2], dim=2 self.dtype = None # 最内层的element都是什么类型的 self._use_1st_ins_infer_dim_type = bool(use_1st_ins_infer_dim_type) self._is_input = False @@ -484,7 +484,10 @@ class Padder: def set_pad_val(self, pad_val): self.pad_val = pad_val - + + def get_pad_val(self): + return self.pad_val + @abstractmethod def __call__(self, contents, field_name, field_ele_dtype, dim: int): """ diff --git a/requirements.txt 
b/requirements.txt index f71e2223..bdd4a9e1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ tqdm>=4.28.1 nltk>=3.4.1 requests spacy +prettytable>=0.7.2 \ No newline at end of file diff --git a/test/core/test_dataset.py b/test/core/test_dataset.py index 059d52d2..9820eff6 100644 --- a/test/core/test_dataset.py +++ b/test/core/test_dataset.py @@ -229,4 +229,17 @@ class TestDataSetIter(unittest.TestCase): def test__repr__(self): ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10}) for iter in ds: - self.assertEqual(iter.__repr__(), "{'x': [1, 2, 3, 4] type=list,\n'y': [5, 6] type=list}") + self.assertEqual(iter.__repr__(), """+--------------+--------+ +| x | y | ++--------------+--------+ +| [1, 2, 3, 4] | [5, 6] | ++--------------+--------+""") + + +class TestDataSetFieldMeta(unittest.TestCase): + def test_print_field_meta(self): + ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10}) + ds.print_field_meta() + + ds.set_input('x') + ds.print_field_meta() From 1caa83d0cafbb5df6470627fab8dea86b56df36a Mon Sep 17 00:00:00 2001 From: ZikaiGuo <634500098@qq.com> Date: Sun, 8 Sep 2019 14:54:31 +0200 Subject: [PATCH 153/153] Update transformer.py --- fastNLP/modules/encoder/transformer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fastNLP/modules/encoder/transformer.py b/fastNLP/modules/encoder/transformer.py index d29a10c3..3d97c306 100644 --- a/fastNLP/modules/encoder/transformer.py +++ b/fastNLP/modules/encoder/transformer.py @@ -40,6 +40,8 @@ class TransformerEncoder(nn.Module): :param seq_mask: [batch, seq_len] :return: [batch, seq_len, model_size] """ + if seq_mask is None: # 防止后续乘法时出错 + seq_mask = 1 input = self.norm1(input) attention = self.atte(input, input, input, atte_mask_out) input = input + self.dropout(attention)
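
Patches 151 and 152 above change how fastNLP objects print: DataSet and Instance now render through prettytable, and DataSet gains print_field_meta(). The snippet below is not part of the patch series; it is a minimal usage sketch of those additions, assuming fastNLP is importable with the prettytable dependency added in requirements.txt, and the x/y field names simply mirror the updated test case.

    # Usage sketch for the new table-style printing (illustrative, not patch content).
    from fastNLP import DataSet

    ds = DataSet({"x": [[1, 2, 3, 4]] * 10, "y": [[5, 6]] * 10})
    print(ds[0])           # an Instance now renders as a bordered prettytable row
    print(ds)              # a long DataSet is truncated to fit the terminal size

    ds.set_input('x')
    ds.print_field_meta()  # logs a table of is_input / is_target / ignore_type / pad_value per field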
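
The utils diff above also shows the unchanged helpers get_seq_len() and seq_len_to_mask() in full. As a further illustration only (the sample tensor values are made up, and the import path follows the file shown in the diff), the two compose as follows for a padded batch:

    # Illustrative only: recover lengths from a padded batch, then build a mask.
    import torch
    from fastNLP.core.utils import get_seq_len, seq_len_to_mask

    words = torch.LongTensor([[2, 3, 4, 0],    # 0 is the padding value
                              [5, 6, 0, 0]])
    seq_len = get_seq_len(words, pad_value=0)               # tensor([3, 2])
    mask = seq_len_to_mask(seq_len, max_len=words.size(1))  # (2, 4) mask, True where not padding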