
1. Add a no_create_entry_dataset option to Vocabulary.from_dataset, used to pass in the dev and test sets (see the usage sketch below)
2. Adjust the various Embedding implementations to ensure that words seen only in dev/test and missing from the pretrained vectors use the unk representation
3. Add a dropout_word option to Embedding so that input words can be randomly dropped
4. Plus several other small bug fixes
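
A rough end-to-end sketch of how the new options fit together (the toy data and the `en-glove-6b-50` name are only illustrative; the import paths are those of the modules touched in this commit):

from fastNLP import Vocabulary, DataSet, Instance
from fastNLP.modules.encoder.embedding import Embedding, StaticEmbedding

# toy data; real pipelines would use the loaders under reproduction/ or fastNLP.io
train_data, dev_data = DataSet(), DataSet()
train_data.append(Instance(words=['the', 'cat', 'sat']))
dev_data.append(Instance(words=['the', 'dog', 'sat']))

vocab = Vocabulary()
# words that only appear in dev/test get no entry of their own: at embedding time
# they fall back to unk unless the pretrained file happens to contain them
vocab.from_dataset(train_data, field_name='words', no_create_entry_dataset=[dev_data])
vocab.index_dataset(train_data, dev_data, field_name='words')

static_embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50')
# dropout_word randomly replaces input indices with unk during training,
# so the unk vector itself gets trained
embed = Embedding(static_embed, dropout=0.1, dropout_word=0.05)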
tags/v0.4.10
yh_cc 6 years ago
parent commit 8f7ed07441
10 changed files with 340 additions and 107 deletions
  1. fastNLP/core/field.py (+5 −5)
  2. fastNLP/core/vocabulary.py (+56 −14)
  3. fastNLP/modules/encoder/_bert.py (+76 −46)
  4. fastNLP/modules/encoder/_elmo.py (+10 −1)
  5. fastNLP/modules/encoder/bert.py (+13 −15)
  6. fastNLP/modules/encoder/embedding.py (+129 −16)
  7. fastNLP/modules/encoder/lstm.py (+2 −2)
  8. reproduction/seqence_labelling/ner/data/Conll2003Loader.py (+5 −4)
  9. reproduction/seqence_labelling/ner/data/OntoNoteLoader.py (+26 −4)
  10. test/core/test_vocabulary.py (+18 −0)

fastNLP/core/field.py (+5 −5)

@@ -242,7 +242,7 @@ class FieldArray:
new_contents.append(cell.split(sep))
except Exception as e:
print(f"Exception happens when process value in index {index}.")
print(e)
raise e
return self._after_process(new_contents, inplace=inplace)

def int(self, inplace:bool=True):
@@ -282,7 +282,7 @@ class FieldArray:
new_contents.append(float(cell))
except Exception as e:
print(f"Exception happens when process value in index {index}.")
print(e)
raise e
return self._after_process(new_contents, inplace=inplace)

def bool(self, inplace=True):
@@ -302,7 +302,7 @@ class FieldArray:
new_contents.append(bool(cell))
except Exception as e:
print(f"Exception happens when process value in index {index}.")
print(e)
raise e

return self._after_process(new_contents, inplace=inplace)

@@ -323,7 +323,7 @@ class FieldArray:
new_contents.append(cell.lower())
except Exception as e:
print(f"Exception happens when process value in index {index}.")
print(e)
raise e
return self._after_process(new_contents, inplace=inplace)

def upper(self, inplace=True):
@@ -343,7 +343,7 @@ class FieldArray:
new_contents.append(cell.upper())
except Exception as e:
print(f"Exception happens when process value in index {index}.")
print(e)
raise e
return self._after_process(new_contents, inplace=inplace)

def value_count(self):


fastNLP/core/vocabulary.py (+56 −14)

@@ -4,10 +4,11 @@ __all__ = [
]

from functools import wraps
from collections import Counter
from collections import Counter, defaultdict
from .dataset import DataSet
from .utils import Option

from functools import partial
import numpy as np

class VocabularyOption(Option):
def __init__(self,
@@ -89,7 +90,9 @@ class Vocabulary(object):
self.word2idx = None
self.idx2word = None
self.rebuild = True
# holds words that should not get their own entry; see the from_dataset() method for details
self._no_create_word = defaultdict(int)

@_check_build_status
def update(self, word_lst):
"""依次增加序列中词在词典中的出现频率
@@ -240,8 +243,12 @@ class Vocabulary(object):
raise e
else:
raise RuntimeError("Only DataSet type is allowed.")
def from_dataset(self, *datasets, field_name):

@property
def _no_create_word_length(self):
return len(self._no_create_word)

def from_dataset(self, *datasets, field_name, no_create_entry_dataset=None):
"""
Build the vocabulary from the words in the corresponding field(s) of the dataset(s)::

@@ -253,6 +260,13 @@ class Vocabulary(object):
the field(s) used to build the vocabulary; one or more fields are supported.
If multiple DataSets are passed, every DataSet must contain these fields.
Currently supported field structures: ``str`` , ``list(str)`` , ``list(list(str))``
:param no_create_entry_dataset: accepts a DataSet, a List[DataSet] or None (default). Use this option when the downstream model
will use pretrained embeddings (glove, word2vec, elmo, bert) and finetune them. Building the vocabulary from the train data alone
means words appearing only in dev/test cannot fully benefit from the pretrained embedding, so taking dev and test into account when
building the vocabulary generally gives better final results. If a word appears in train but not in the pretrained model, the
embedding initializes it from unk but as its own vector, so after finetuning it may end up with a better representation; a word
that appears only in dev or test, however, must not get its own vector and should point to the value of the unk vector instead.
A token found only in no_create_entry_dataset is therefore first looked up in the pretrained vocabulary: if found, that
representation is used; otherwise the token is represented by unk.
:return self:
"""
if isinstance(field_name, str):
@@ -260,19 +274,28 @@ class Vocabulary(object):
elif not isinstance(field_name, list):
raise TypeError('invalid argument field_name: {}'.format(field_name))
def construct_vocab(ins):
def construct_vocab(ins, no_create_entry=False):
for fn in field_name:
field = ins[fn]
if isinstance(field, str):
if no_create_entry and field not in self.word_count:
self._no_create_word[field] += 1
self.add_word(field)
elif isinstance(field, list):
if not isinstance(field[0], list):
self.add_word_lst(field)
elif isinstance(field, (list, np.ndarray)):
if not isinstance(field[0], (list, np.ndarray)):
for word in field:
if no_create_entry and word not in self.word_count:
self._no_create_word[word] += 1
self.add_word(word)
else:
if isinstance(field[0][0], list):
if isinstance(field[0][0], (list, np.ndarray)):
raise RuntimeError("Only support field with 2 dimensions.")
[self.add_word_lst(w) for w in field]
for words in field:
for word in words:
if no_create_entry and word not in self.word_count:
self._no_create_word[word] += 1
self.add_word(word)

for idx, dataset in enumerate(datasets):
if isinstance(dataset, DataSet):
try:
@@ -281,9 +304,27 @@ class Vocabulary(object):
print("When processing the `{}` dataset, the following error occurred.".format(idx))
raise e
else:
raise RuntimeError("Only DataSet type is allowed.")
raise TypeError("Only DataSet type is allowed.")

if no_create_entry_dataset is not None:
partial_construct_vocab = partial(construct_vocab, no_create_entry=True)
if isinstance(no_create_entry_dataset, DataSet):
no_create_entry_dataset.apply(partial_construct_vocab)
elif isinstance(no_create_entry_dataset, list):
for dataset in no_create_entry_dataset:
if not isinstance(dataset, DataSet):
raise TypeError("Only DataSet type is allowed.")
dataset.apply(partial_construct_vocab)
return self

def _is_word_no_create_entry(self, word):
"""
Return whether this word is one that should not get its own entry; see the from_dataset documentation for details.
:param word: str
:return: bool
"""
return word in self._no_create_word

def to_index(self, w):
"""
Convert a word to its index. A word not recorded in the vocabulary is treated as unknown; if ``unknown=None``, this raises
@@ -338,6 +379,7 @@ class Vocabulary(object):
self.word2idx = None
self.idx2word = None
self.rebuild = True
self._no_create_word.clear()
def __getstate__(self):
"""Use to prepare data for pickle.

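To make the new bookkeeping concrete, a small sketch of what from_dataset now records (toy data; assumes the API added above):

from fastNLP import Vocabulary, DataSet, Instance

train, dev = DataSet(), DataSet()
train.append(Instance(chars=['a', 'b', 'a']))
dev.append(Instance(chars=['a', 'c']))

vocab = Vocabulary()
vocab.from_dataset(train, field_name='chars', no_create_entry_dataset=dev)

vocab._is_word_no_create_entry('a')  # False: 'a' already occurs in train
vocab._is_word_no_create_entry('c')  # True: 'c' only occurs in dev
# 'c' still receives an index, but the Embedding classes in this commit redirect
# that index to unk unless a pretrained vector for 'c' is found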

fastNLP/modules/encoder/_bert.py (+76 −46)

@@ -21,6 +21,7 @@ import os

import torch
from torch import nn
import glob

CONFIG_FILE = 'bert_config.json'
MODEL_WEIGHTS = 'pytorch_model.bin'
@@ -346,7 +347,12 @@ class BertModel(nn.Module):
# Instantiate model.
model = cls(*inputs, **config, **kwargs)
if state_dict is None:
weights_path = os.path.join(pretrained_model_dir, MODEL_WEIGHTS)
files = glob.glob(os.path.join(pretrained_model_dir, '*.bin'))
if len(files)==0:
raise FileNotFoundError(f"There is no *.bin file in {pretrained_model_dir}")
elif len(files)>1:
raise FileExistsError(f"There are multiple *.bin files in {pretrained_model_dir}")
weights_path = files[0]
state_dict = torch.load(weights_path)

old_keys = []
@@ -390,16 +396,6 @@ class BertModel(nn.Module):
return model












def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text = text.strip()
@@ -671,6 +667,16 @@ class BertTokenizer(object):
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
self.max_len = max_len if max_len is not None else int(1e12)

def _reinit_on_new_vocab(self, vocab):
"""
After loading bert, the vocab may get rearranged. Call this function after such a rearrangement to re-initialize everything that depends on the vocab.

:param vocab:
:return:
"""
self.vocab = vocab
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

def tokenize(self, text):
split_tokens = []
if self.do_basic_tokenize:
@@ -706,6 +712,8 @@ class BertTokenizer(object):
index = 0
if os.path.isdir(vocab_path):
vocab_file = os.path.join(vocab_path, VOCAB_NAME)
else:
vocab_file = vocab_path
with open(vocab_file, "w", encoding="utf-8") as writer:
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
if index != token_index:
@@ -751,11 +759,44 @@ class _WordBertModel(nn.Module):

assert pool_method in ('avg', 'max', 'first', 'last')
self.pool_method = pool_method

self.include_cls_sep = include_cls_sep

# compute the word pieces for every word in vocab; [CLS] and [SEP] need extra handling
print("Start to generating word pieces for word.")
# step 1: collect the needed word pieces, then create the new embed and word_piece vocab and fill in the values
word_piece_dict = {'[CLS]':1, '[SEP]':1} # word pieces in use, plus newly added ones
found_count = 0
for word, index in vocab:
if index == vocab.padding_idx: # pad is a special symbol
word = '[PAD]'
elif index == vocab.unknown_idx:
word = '[UNK]'
word_pieces = self.tokenzier.wordpiece_tokenizer.tokenize(word)
if len(word_pieces)==1:
if not vocab._is_word_no_create_entry(word): # a word from train that was not found
if index!=vocab.unknown_idx and word_pieces[0]=='[UNK]': # i.e. this word is not in the original wordpiece vocab
word_piece_dict[word] = 1 # add a new entry
continue
for word_piece in word_pieces:
word_piece_dict[word_piece] = 1
found_count += 1
original_embed = self.encoder.embeddings.word_embeddings.weight.data
# special tokens need special handling
embed = nn.Embedding(len(word_piece_dict), original_embed.size(1)) # the new embed
new_word_piece_vocab = collections.OrderedDict()
for index, token in enumerate(['[PAD]', '[UNK]']):
word_piece_dict.pop(token, None)
embed.weight.data[index] = original_embed[self.tokenzier.vocab[token]]
new_word_piece_vocab[token] = index
for token in word_piece_dict.keys():
if token in self.tokenzier.vocab:
embed.weight.data[len(new_word_piece_vocab)] = original_embed[self.tokenzier.vocab[token]]
else:
embed.weight.data[len(new_word_piece_vocab)] = original_embed[self.tokenzier.vocab['[UNK]']]
new_word_piece_vocab[token] = len(new_word_piece_vocab)
self.tokenzier._reinit_on_new_vocab(new_word_piece_vocab)
self.encoder.embeddings.word_embeddings = embed

word_to_wordpieces = []
word_pieces_lengths = []
for word, index in vocab:
@@ -767,12 +808,11 @@ class _WordBertModel(nn.Module):
word_pieces = self.tokenzier.convert_tokens_to_ids(word_pieces)
word_to_wordpieces.append(word_pieces)
word_pieces_lengths.append(len(word_pieces))
self._cls_index = len(vocab)
self._sep_index = len(vocab) + 1
print("Found(Or seg into word pieces) {} words out of {}.".format(found_count, len(vocab)))
self._cls_index = self.tokenzier.vocab['[CLS]']
self._sep_index = self.tokenzier.vocab['[SEP]']
self._pad_index = vocab.padding_idx
self._wordpiece_pad_index = self.tokenzier.convert_tokens_to_ids(['[PAD]'])[0] # needed when generating word_pieces
word_to_wordpieces.append(self.tokenzier.convert_tokens_to_ids(['[CLS]']))
word_to_wordpieces.append(self.tokenzier.convert_tokens_to_ids(['[SEP]']))
self._wordpiece_pad_index = self.tokenzier.vocab['[PAD]'] # needed when generating word_pieces
self.word_to_wordpieces = np.array(word_to_wordpieces)
self.word_pieces_lengths = nn.Parameter(torch.LongTensor(word_pieces_lengths), requires_grad=False)
print("Successfully generate word pieces.")
@@ -850,7 +890,7 @@ class _WordPieceBertModel(nn.Module):
This module computes results directly at the word-piece level.

"""
def __init__(self, model_dir:str, vocab:Vocabulary, layers:str='-1'):
def __init__(self, model_dir:str, layers:str='-1'):
super().__init__()

self.tokenzier = BertTokenizer.from_pretrained(model_dir)
@@ -866,44 +906,34 @@ class _WordPieceBertModel(nn.Module):
assert layer<encoder_layer_number, f"The layer index:{layer} is out of scope for " \
f"a bert model with {encoder_layer_number} layers."

# compute the word pieces for every word in vocab; [CLS] and [SEP] need extra handling
print("Start to generating word pieces for word.")
self.word_to_wordpieces = []
self.word_pieces_length = []
for word, index in vocab:
if index == vocab.padding_idx: # pad is a special symbol
word = '[PAD]'
elif index == vocab.unknown_idx:
word = '[UNK]'
word_pieces = self.tokenzier.wordpiece_tokenizer.tokenize(word)
word_pieces = self.tokenzier.convert_tokens_to_ids(word_pieces)
self.word_to_wordpieces.append(word_pieces)
self.word_pieces_length.append(len(word_pieces))
self._cls_index = len(vocab)
self._sep_index = len(vocab) + 1
self._pad_index = vocab.padding_idx
self._wordpiece_pad_index = self.tokenzier.convert_tokens_to_ids(['[PAD]'])[0] # needed when generating word_pieces
self.word_to_wordpieces.append(self.tokenzier.convert_tokens_to_ids(['[CLS]']))
self.word_to_wordpieces.append(self.tokenzier.convert_tokens_to_ids(['[SEP]']))
self.word_to_wordpieces = np.array(self.word_to_wordpieces, dtype=int)
print("Successfully generate word pieces.")
self._cls_index = self.tokenzier.vocab['[CLS]']
self._sep_index = self.tokenzier.vocab['[SEP]']
self._wordpiece_pad_index = self.tokenzier.vocab['[PAD]'] # needed when generating word_pieces

def index_dataset(self, *datasets):
def index_dataset(self, *datasets, field_name):
"""
Use bert's tokenizer to add the word_pieces and word_pieces_seq_len columns to the datasets and set them as input. The added word_pieces
already contain [CLS] and [SEP], and the pad value of the word_pieces column is set to bert's pad value.
Use bert's tokenizer to generate a new word_pieces column, add it to the datasets and set it as input. If the sequence does not already
start with [CLS] and end with [SEP], they are added, and the pad value of the word_pieces column is set to bert's pad value.

:param datasets: DataSet objects
:param field_name: which column to index on
:return:
"""
def convert_words_to_word_pieces(words):
word_pieces = list(chain(*self.word_to_wordpieces[words].tolist()))
word_pieces = [self._cls_index] + word_pieces + [self._sep_index]
word_pieces = []
for word in words:
tokens = self.tokenzier.wordpiece_tokenizer.tokenize(word)
word_piece_ids = self.tokenzier.convert_tokens_to_ids(tokens)
word_pieces.extend(word_piece_ids)
if word_pieces[0]!=self._cls_index:
word_pieces.insert(0, self._cls_index)
if word_pieces[-1]!=self._sep_index:
word_pieces.append(self._sep_index)
return word_pieces

for index, dataset in enumerate(datasets):
try:
dataset.apply_field(convert_words_to_word_pieces, field_name='words', new_field_name='word_pieces',
dataset.apply_field(convert_words_to_word_pieces, field_name=field_name, new_field_name='word_pieces',
is_input=True)
dataset.set_pad_val('word_pieces', self._wordpiece_pad_index)
except Exception as e:
@@ -919,7 +949,7 @@ class _WordPieceBertModel(nn.Module):
"""
batch_size, max_len = word_pieces.size()

attn_masks = word_pieces.ne(self._pad_index)
attn_masks = word_pieces.ne(self._wordpiece_pad_index)
bert_outputs, _ = self.encoder(word_pieces, token_type_ids=token_type_ids, attention_mask=attn_masks,
output_all_encoded_layers=True)
# output_layers = [self.layers] # len(self.layers) x batch_size x max_word_piece_length x hidden_size

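The embedding rebuild in `_WordBertModel` is the densest part of this diff; stripped down, the idea is roughly the following (a simplified sketch, not the exact fastNLP code — `needed_pieces` stands for the unique word pieces collected from the vocab, `bert_vocab` for the tokenizer's token-to-id dict):

import collections
from torch import nn

def rebuild_wordpiece_embed(original_embed, bert_vocab, needed_pieces):
    """Keep only [PAD], [UNK] and the word pieces actually needed; pieces that are
    missing from the original BERT vocab are initialized from the [UNK] row."""
    pieces = [p for p in needed_pieces if p not in ('[PAD]', '[UNK]')]
    embed = nn.Embedding(len(pieces) + 2, original_embed.size(1))
    new_vocab = collections.OrderedDict()
    for token in ['[PAD]', '[UNK]'] + pieces:
        src = bert_vocab.get(token, bert_vocab['[UNK]'])
        embed.weight.data[len(new_vocab)] = original_embed[src]
        new_vocab[token] = len(new_vocab)
    return embed, new_vocab

# the model then swaps in the smaller embedding and re-initializes the tokenizer:
# self.encoder.embeddings.word_embeddings = embed
# self.tokenzier._reinit_on_new_vocab(new_vocab)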

fastNLP/modules/encoder/_elmo.py (+10 −1)

@@ -430,6 +430,8 @@ class LstmTokenEmbedder(nn.Module):
def forward(self, words, chars):
embs = []
if self.word_emb_layer is not None:
if hasattr(self, 'words_to_words'):
words = self.words_to_words[words]
word_emb = self.word_emb_layer(words)
embs.append(word_emb)

@@ -487,6 +489,8 @@ class ConvTokenEmbedder(nn.Module):
def forward(self, words, chars):
embs = []
if self.word_emb_layer is not None:
if hasattr(self, 'words_to_words'):
words = self.words_to_words[words]
word_emb = self.word_emb_layer(words)
embs.append(word_emb)

@@ -704,7 +708,12 @@ class _ElmoModel(nn.Module):
self.token_embedder = LstmTokenEmbedder(
config, word_emb_layer, char_emb_layer)
self.token_embedder.load_state_dict(token_embedder_states, strict=False)

if config['token_embedder']['word_dim'] > 0 and vocab._no_create_word_length > 0: # remap so that indices coming from dev/test point to unk
words_to_words = nn.Parameter(torch.arange(len(vocab)).long(), requires_grad=False)
for word, idx in vocab:
if vocab._is_word_no_create_entry(word):
words_to_words[idx] = vocab.unknown_idx
setattr(self.token_embedder, 'words_to_words', words_to_words)
self.output_dim = config['encoder']['projection_dim']

if config['encoder']['name'].lower() == 'elmo':

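The same idx-to-unk redirection appears in `_ElmoModel` here and in `StaticEmbedding` further below; in isolation it is just an index remapping buffer applied before the lookup (hedged sketch; `StaticEmbedding` additionally skips words that were actually found in the pretrained file):

import torch
from torch import nn

def build_words_to_words(vocab):
    """Identity mapping over the vocab, except that no-create-entry words
    (those seen only in dev/test) are redirected to the unk index."""
    mapping = torch.arange(len(vocab)).long()
    for word, idx in vocab:
        if vocab._is_word_no_create_entry(word):
            mapping[idx] = vocab.unknown_idx
    return nn.Parameter(mapping, requires_grad=False)

# at forward time: words = self.words_to_words[words]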

fastNLP/modules/encoder/bert.py (+13 −15)

@@ -2,21 +2,19 @@
import os
from torch import nn
import torch
from ...core.vocabulary import Vocabulary
from ...io.file_utils import _get_base_url, cached_path
from ._bert import _WordPieceBertModel, BertModel


class BertWordPieceEncoder(nn.Module):
"""
A Bert encoder driven by a vocabulary: pass in a vocab, then call index_datasets to generate the word-piece representation for the vocabulary
Loads a bert model; afterwards, call index_datasets to generate the word_pieces column in the dataset(s)

:param fastNLP.Vocabulary vocab: the vocabulary
:param str model_dir_or_name: directory containing the model, or the model name. Defaults to ``en-base-uncased``
:param str layers: which layers form the final representation; layer indices separated by ',', negative numbers index from the last layer
:param bool requires_grad: whether gradients are required.
"""
def __init__(self, vocab:Vocabulary, model_dir_or_name:str='en-base', layers:str='-1',
def __init__(self, model_dir_or_name:str='en-base-uncased', layers:str='-1',
requires_grad:bool=False):
super().__init__()
PRETRAIN_URL = _get_base_url('bert')
@@ -44,7 +42,7 @@ class BertWordPieceEncoder(nn.Module):
else:
raise ValueError(f"Cannot recognize {model_dir_or_name}.")

self.model = _WordPieceBertModel(model_dir=model_dir, vocab=vocab, layers=layers)
self.model = _WordPieceBertModel(model_dir=model_dir, layers=layers)
self._embed_size = len(self.model.layers) * self.model.encoder.hidden_size
self.requires_grad = requires_grad

@@ -69,27 +67,27 @@ class BertWordPieceEncoder(nn.Module):
def embed_size(self):
return self._embed_size

def index_datasets(self, *datasets):
def index_datasets(self, *datasets, field_name):
"""
Index the datasets into word pieces based on their 'words' column.

Example::
Use bert's tokenizer to generate a new word_pieces column, add it to the datasets and set it as input. If the sequence does not already
start with [CLS] and end with [SEP], they are added, and the pad value of the word_pieces column is set to bert's pad value.

:param datasets:
:param datasets: DataSet objects
:param field_name: str, which column to index on
:return:
"""
self.model.index_dataset(*datasets)
self.model.index_dataset(*datasets, field_name=field_name)


def forward(self, words, token_type_ids=None):
def forward(self, word_pieces, token_type_ids=None):
"""
Compute the bert embedding of the words. Before computing, [CLS] is added at the start and [SEP] at the end of each sentence, and
include_cls_sep decides whether these two representations are removed afterwards.
Compute the bert embedding of the input. The word_pieces passed in should already include the [CLS] and [SEP] tags.

:param word_pieces: batch_size x max_len
:param token_type_ids: batch_size x max_len, used to distinguish the first sentence from the second
:return: torch.FloatTensor. batch_size x max_len x (768*len(self.layers))
"""
outputs = self.model(words, token_type_ids)
outputs = self.model(word_pieces, token_type_ids)
outputs = torch.cat([*outputs], dim=-1)

return outputs
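
With the vocab argument removed, the encoder is driven entirely by a dataset column; a rough usage sketch (the model name and toy data are illustrative):

from fastNLP import DataSet, Instance
from fastNLP.modules.encoder.bert import BertWordPieceEncoder

data = DataSet()
data.append(Instance(words=['hello', 'world']))

encoder = BertWordPieceEncoder(model_dir_or_name='en-base-uncased', layers='-1')
# tokenizes the 'words' column into a new 'word_pieces' input column,
# adding [CLS]/[SEP] where missing and using BERT's [PAD] id as pad value
encoder.index_datasets(data, field_name='words')

# later, inside a model's forward:
# reps = encoder(word_pieces)  # [batch, wp_len, 768 * len(layers)]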

fastNLP/modules/encoder/embedding.py (+129 −16)

@@ -13,7 +13,7 @@ from .lstm import LSTM
from ...core.vocabulary import Vocabulary
from abc import abstractmethod
import torch
from ...io import EmbedLoader
import numpy as np
import torch.nn.functional as F
import os
from ._elmo import _ElmoModel
@@ -21,6 +21,7 @@ from ...io.file_utils import cached_path, _get_base_url
from ._bert import _WordBertModel
from typing import List

import warnings
from ...core.dataset import DataSet
from ...core.batch import DataSetIter
from ...core.sampler import SequentialSampler
@@ -33,13 +34,15 @@ class Embedding(nn.Module):

Embedding component. Use self.num_embeddings to get the vocabulary size and self.embedding_dim to get the embedding dimension
def __init__(self, init_embed, dropout=0.0):
def __init__(self, init_embed, dropout=0.0, dropout_word=0, unk_index=None):
"""

:param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray init_embed: size of the Embedding (for a tuple(int, int),
the first int is vocab_size and the second is embed_dim); if a Tensor, Embedding, ndarray etc. is passed, it is used directly to initialize the Embedding;
a TokenEmbedding object may also be passed
:param float dropout: dropout applied to the output of the Embedding.
:param float dropout_word: randomly replace this fraction of input words with the unk index, so that the unk token gets enough training
:param int unk_index: the index words are replaced with when dropped; not needed when init_embed is a TokenEmbedding.
"""
super(Embedding, self).__init__()

@@ -48,20 +51,32 @@ class Embedding(nn.Module):
self.dropout = nn.Dropout(dropout)
if not isinstance(self.embed, TokenEmbedding):
self._embed_size = self.embed.weight.size(1)
if dropout_word > 0 and not isinstance(unk_index, int):
raise ValueError("When drop word is set, you need to pass in the unk_index.")
else:
self._embed_size = self.embed.embed_size
unk_index = self.embed.get_word_vocab().unknown_idx
self.unk_index = unk_index
self.dropout_word = dropout_word

def forward(self, x):
"""
:param torch.LongTensor x: [batch, seq_len]
:return: torch.Tensor : [batch, seq_len, embed_dim]
"""
if self.dropout_word>0 and self.training:
mask = torch.ones_like(x).float() * self.dropout_word
mask = torch.bernoulli(mask).byte() # the larger dropout_word is, the more positions are set to 1
x = x.masked_fill(mask, self.unk_index)
x = self.embed(x)
return self.dropout(x)

@property
def num_embedding(self)->int:
return len(self)
if isinstance(self.embed, nn.Embedding):
return self.embed.weight.size(0)
else:
return self.embed.num_embedding

def __len__(self):
return len(self.embed)
@@ -95,7 +110,7 @@ class Embedding(nn.Module):
@property
def size(self):
if isinstance(self.embed, TokenEmbedding):
return torch.Size(self.embed._word_vocab, self.embed.embed_size)
return self.embed.size
else:
return self.embed.weight.size()

@@ -131,6 +146,10 @@ class TokenEmbedding(nn.Module):
def embed_size(self) -> int:
return self._embed_size

@property
def num_embedding(self) -> int:
return len(self._word_vocab)

def get_word_vocab(self):
"""
Return the vocabulary of the embedding.
@@ -141,7 +160,7 @@ class TokenEmbedding(nn.Module):

@property
def size(self):
return torch.Size(self.embed._word_vocab, self._embed_size)
return torch.Size(self.num_embedding, self._embed_size)


class StaticEmbedding(TokenEmbedding):
@@ -159,11 +178,12 @@ class StaticEmbedding(TokenEmbedding):
:param model_dir_or_name: a pretrained static embedding can be specified in two ways: either pass the path of the embedding file, or pass the name
of the embedding. Currently supported embeddings include {`en` or `en-glove-840b-300` : glove.840B.300d, `en-glove-6b-50` : glove.6B.50d,
`en-word2vec-300` : GoogleNews-vectors-negative300}. In the second case the cache is checked for the model and it is downloaded automatically if absent.
:param requires_grad: whether gradients are required
:param requires_grad: whether gradients are required. Defaults to True
:param init_method: how to initialize values that were not found. Any method from torch.nn.init.* can be used; defaults to torch.nn.init.xavier_uniform_.
The method is called with a tensor object.

"""

def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', requires_grad: bool=False):
def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', requires_grad: bool=True, init_method=None):
super(StaticEmbedding, self).__init__(vocab)

# first define which static embeddings are downloadable; this will probably need a dedicated server,
@@ -190,15 +210,105 @@ class StaticEmbedding(TokenEmbedding):
raise ValueError(f"Cannot recognize {model_dir_or_name}.")

# load the embedding
embedding = EmbedLoader.load_with_vocab(model_path, vocab=vocab)
embedding = torch.tensor(embedding)
embedding, hit_flags = self._load_with_vocab(model_path, vocab=vocab, init_method=init_method)
self.embedding = nn.Embedding(num_embeddings=embedding.shape[0], embedding_dim=embedding.shape[1],
padding_idx=vocab.padding_idx,
max_norm=None, norm_type=2, scale_grad_by_freq=False,
sparse=False, _weight=embedding)
if vocab._no_create_word_length > 0: # remap so that indices coming from dev/test point to unk
words_to_words = nn.Parameter(torch.arange(len(vocab)).long(), requires_grad=False)
for word, idx in vocab:
if vocab._is_word_no_create_entry(word) and not hit_flags[idx]:
words_to_words[idx] = vocab.unknown_idx
self.words_to_words = words_to_words
self._embed_size = self.embedding.weight.size(1)
self.requires_grad = requires_grad

@property
def requires_grad(self):
"""
Whether the Embedding's parameters may be optimized. True: all parameters are optimized; False: no parameters are; None: some are, some are not
:return:
"""
requires_grads = set([param.requires_grad for name, param in self.named_parameters()
if 'words_to_words' not in name])
if len(requires_grads) == 1:
return requires_grads.pop()
else:
return None

@requires_grad.setter
def requires_grad(self, value):
for name, param in self.named_parameters():
if 'words_to_words' in name:
continue
param.requires_grad = value

def _load_with_vocab(self, embed_filepath, vocab, dtype=np.float32, padding='<pad>', unknown='<unk>', normalize=True,
error='ignore', init_method=None):
"""
Extract the embeddings of the words in vocab from the pretrained vectors at embed_filepath. Whether embed_filepath is in
word2vec format (only two elements on the first line) or glove format is detected automatically.

:param str embed_filepath: path of the pretrained embedding.
:param vocab: the vocabulary, of type :class:`~fastNLP.Vocabulary`; only embeddings of words appearing in vocab are read.
Words not found in the pretrained file are initialized with init_method so that the whole Embedding stays on a similar scale.
:param dtype: dtype of the embedding that is read in
:param str padding: the padding token of the vocabulary
:param str unknown: the unknown token of the vocabulary
:param bool normalize: whether to normalize every vector to norm 1
:param str error: `ignore` or `strict`; with `ignore`, errors are skipped automatically; with `strict`, errors are raised.
The usual failure cases are empty lines in the file or inconsistent dimensions.
:param init_method: how to initialize values that were not found. Any method from torch.nn.init.* can be used; defaults to torch.nn.init.xavier_uniform_
:return torch.tensor: shape [len(vocab), dimension], where dimension is determined by the pretrained embedding.
"""
assert isinstance(vocab, Vocabulary), "Only fastNLP.Vocabulary is supported."
if not os.path.exists(embed_filepath):
raise FileNotFoundError("`{}` does not exist.".format(embed_filepath))
if init_method is None:
init_method = nn.init.xavier_uniform_
with open(embed_filepath, 'r', encoding='utf-8') as f:
found_count = 0
line = f.readline().strip()
parts = line.split()
start_idx = 0
if len(parts) == 2:
dim = int(parts[1])
start_idx += 1
else:
dim = len(parts) - 1
f.seek(0)
matrix = torch.zeros(len(vocab), dim)
init_method(matrix)
hit_flags = np.zeros(len(vocab), dtype=bool)
for idx, line in enumerate(f, start_idx):
try:
parts = line.strip().split()
word = ''.join(parts[:-dim])
nums = parts[-dim:]
# align unk and pad
if word == padding and vocab.padding is not None:
word = vocab.padding
elif word == unknown and vocab.unknown is not None:
word = vocab.unknown
if word in vocab:
index = vocab.to_index(word)
matrix[index] = torch.from_numpy(np.fromstring(' '.join(nums), sep=' ', dtype=dtype, count=dim))
found_count += 1
hit_flags[index] = True
except Exception as e:
if error == 'ignore':
warnings.warn("Error occurred at the {} line.".format(idx))
else:
print("Error occurred at the {} line.".format(idx))
raise e
print("Found {} out of {} words in the pre-training embedding.".format(found_count, len(vocab)))

if normalize:
matrix /= (torch.norm(matrix, dim=1, keepdim=True) + 1e-12)

return matrix, hit_flags

def forward(self, words):
"""
Takes word indices as input
@@ -206,6 +316,8 @@ class StaticEmbedding(TokenEmbedding):
:param words: torch.LongTensor, [batch_size, max_len]
:return: torch.FloatTensor, [batch_size, max_len, embed_size]
"""
if hasattr(self, 'words_to_words'):
words = self.words_to_words[words]
return self.embedding(words)


@@ -382,7 +494,7 @@ class ElmoEmbedding(ContextualEmbedding):
:return:
"""
requires_grads = set([param.requires_grad for name, param in self.named_parameters()
if 'words_to_chars_embedding' not in name])
if 'words_to_chars_embedding' not in name and 'words_to_words' not in name])
if len(requires_grads) == 1:
return requires_grads.pop()
else:
@@ -391,7 +503,7 @@ class ElmoEmbedding(ContextualEmbedding):
@requires_grad.setter
def requires_grad(self, value):
for name, param in self.named_parameters():
if 'words_to_chars_embedding' in name: # these must not be included in requires_grad
if 'words_to_chars_embedding' in name or 'words_to_words' in name: # these must not be included in requires_grad
continue
param.requires_grad = value

@@ -501,7 +613,8 @@ def _construct_char_vocab_from_vocab(vocab:Vocabulary, min_freq:int=1):
"""
char_vocab = Vocabulary(min_freq=min_freq)
for word, index in vocab:
char_vocab.add_word_lst(list(word))
if not vocab._is_word_no_create_entry(word):
char_vocab.add_word_lst(list(word))
return char_vocab


@@ -566,7 +679,7 @@ class CNNCharEmbedding(TokenEmbedding):
requires_grad=False)
self.word_lengths = nn.Parameter(torch.zeros(len(vocab)).long(), requires_grad=False)
for word, index in vocab:
# if index!=vocab.padding_idx: # if it is pad it would simply be pad_value.  Changed to not special-case pad, so every <pad> shares the same embed
# if index!=vocab.padding_idx: # if it is pad it would simply be pad_value. Changed to not special-case pad, so every <pad> shares the same embed
self.words_to_chars_embedding[index, :len(word)] = \
torch.LongTensor([self.char_vocab.to_index(c) for c in word])
self.word_lengths[index] = len(word)
@@ -638,7 +751,7 @@ class CNNCharEmbedding(TokenEmbedding):
if 'words_to_chars_embedding' in name or 'word_lengths' in name: # these must not be reset
continue
if param.data.dim()>1:
nn.init.xavier_normal_(param, 1)
nn.init.xavier_uniform_(param, 1)
else:
nn.init.uniform_(param, -1, 1)


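Isolated from the module, the word-dropout step in Embedding.forward amounts to the following (hedged sketch of the same masking logic, with made-up indices):

import torch

def drop_words(word_indices, dropout_word, unk_index):
    """Randomly replace a fraction dropout_word of the indices with unk,
    as Embedding.forward does while self.training is True."""
    mask = torch.full_like(word_indices, dropout_word, dtype=torch.float)
    mask = torch.bernoulli(mask).bool()  # 1 with probability dropout_word
    return word_indices.masked_fill(mask, unk_index)

x = torch.tensor([[2, 3, 4, 5]])
drop_words(x, dropout_word=0.5, unk_index=1)  # roughly half the ids become 1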

fastNLP/modules/encoder/lstm.py (+2 −2)

@@ -55,8 +55,8 @@ class LSTM(nn.Module):

:param x: [batch, seq_len, input_size] the input sequence
:param seq_len: [batch, ] sequence lengths; if ``None``, all inputs are treated as equally long. Default: ``None``
:param h0: [batch, hidden_size] initial hidden state; if ``None``, set to an all-ones vector. Default: ``None``
:param c0: [batch, hidden_size] initial cell state; if ``None``, set to an all-ones vector. Default: ``None``
:param h0: [batch, hidden_size] initial hidden state; if ``None``, set to an all-zeros vector. Default: ``None``
:param c0: [batch, hidden_size] initial cell state; if ``None``, set to an all-zeros vector. Default: ``None``
:return (output, ht) or output: if ``get_hidden=True``, [batch, seq_len, hidden_size*num_direction] output sequence
and [batch, hidden_size*num_direction] the hidden state at the last time step.
"""


reproduction/seqence_labelling/ner/data/Conll2003Loader.py (+5 −4)

@@ -58,19 +58,20 @@ class Conll2003DataLoader(DataSetLoader):
dataset = self.load(path)
dataset.apply_field(lambda words: words, field_name='raw_words', new_field_name=Const.INPUT)
if lower:
dataset.apply_field(lambda words:[word.lower() for word in words], field_name=Const.INPUT,
new_field_name=Const.INPUT)
dataset.words.lower()
data.datasets[name] = dataset

# construct vocab
word_vocab = Vocabulary(min_freq=2) if word_vocab_opt is None else Vocabulary(**word_vocab_opt)
word_vocab.from_dataset(*data.datasets.values(), field_name=Const.INPUT)
word_vocab.from_dataset(data.datasets['train'], field_name=Const.INPUT,
no_create_entry_dataset=[dataset for name, dataset in data.datasets.items() if name!='train'])
word_vocab.index_dataset(*data.datasets.values(), field_name=Const.INPUT, new_field_name=Const.INPUT)
data.vocabs[Const.INPUT] = word_vocab

# cap words
cap_word_vocab = Vocabulary()
cap_word_vocab.from_dataset(*data.datasets.values(), field_name='raw_words')
cap_word_vocab.from_dataset(data.datasets['train'], field_name='raw_words',
no_create_entry_dataset=[dataset for name, dataset in data.datasets.items() if name!='train'])
cap_word_vocab.index_dataset(*data.datasets.values(), field_name='raw_words', new_field_name='cap_words')
input_fields.append('cap_words')
data.vocabs['cap_words'] = cap_word_vocab


reproduction/seqence_labelling/ner/data/OntoNoteLoader.py (+26 −4)

@@ -49,6 +49,28 @@ class OntoNoteNERDataLoader(DataSetLoader):
bio_tags.append(bio_label)
return self.encoding_method(bio_tags)

def convert_word(words):
converted_words = []
for word in words:
word = word.replace('/.', '.') # some trailing periods come in the form /.
if not word.startswith('-'):
converted_words.append(word)
continue
# the following symbols were escaped; convert them back
tfrs = {'-LRB-':'(',
'-RRB-': ')',
'-LSB-': '[',
'-RSB-': ']',
'-LCB-': '{',
'-RCB-': '}'
}
if word in tfrs:
converted_words.append(tfrs[word])
else:
converted_words.append(word)
return converted_words

dataset.apply_field(convert_word, field_name='raw_words', new_field_name='raw_words')
dataset.apply_field(convert_to_bio, field_name='target', new_field_name='target')

return dataset
@@ -81,14 +103,14 @@ class OntoNoteNERDataLoader(DataSetLoader):
dataset = self.load(path)
dataset.apply_field(lambda words: words, field_name='raw_words', new_field_name=Const.INPUT)
if lower:
dataset.apply_field(lambda words:[word.lower() for word in words], field_name=Const.INPUT,
new_field_name=Const.INPUT)
dataset.words.lower()
data.datasets[name] = dataset

# construct vocab
word_vocab = Vocabulary(min_freq=2) if word_vocab_opt is None else Vocabulary(**word_vocab_opt)
word_vocab.from_dataset(*data.datasets.values(), field_name=Const.INPUT)
word_vocab.index_dataset(*data.datasets.values(), field_name='raw_words', new_field_name=Const.INPUT)
word_vocab.from_dataset(data.datasets['train'], field_name=Const.INPUT,
no_create_entry_dataset=[dataset for name, dataset in data.datasets.items() if name!='train'])
word_vocab.index_dataset(*data.datasets.values(), field_name=Const.INPUT, new_field_name=Const.INPUT)
data.vocabs[Const.INPUT] = word_vocab

# cap words


test/core/test_vocabulary.py (+18 −0)

@@ -70,6 +70,24 @@ class TestAdd(unittest.TestCase):
self.assertEqual(vocab.to_index(chr(start_char + i)), i + 2)
vocab.index_dataset(dataset, field_name='char')

def test_from_dataset_no_entry(self):
# test that no_create_entry is set correctly
dataset = DataSet()
start_char = 65
num_samples = 10
test_dataset = DataSet()
for i in range(num_samples):
char = [chr(start_char + i)] * 6
ins = Instance(char=char)
dataset.append(ins)
ins = Instance(char=[c+c for c in char])
test_dataset.append(ins)
vocab = Vocabulary()
vocab.from_dataset(dataset, field_name='char', no_create_entry_dataset=test_dataset)
vocab.index_dataset(dataset, field_name='char')
for i in range(num_samples):
self.assertEqual(True, vocab._is_word_no_create_entry(chr(start_char + i)+chr(start_char + i)))


class TestIndexing(unittest.TestCase):
def test_len(self):

