@@ -0,0 +1 @@ | |||
from .bert_tokenizer import BertTokenizer |
@@ -0,0 +1,378 @@ | |||
""" | |||
bert_tokenizer.py is modified from huggingface/pytorch-pretrained-BERT, which is licensed under the Apache License 2.0. | |||
""" | |||
import collections | |||
import os | |||
import unicodedata | |||
from io import open | |||
PRETRAINED_VOCAB_ARCHIVE_MAP = { | |||
'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", | |||
'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", | |||
'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", | |||
'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", | |||
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", | |||
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", | |||
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", | |||
} | |||
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = { | |||
'bert-base-uncased': 512, | |||
'bert-large-uncased': 512, | |||
'bert-base-cased': 512, | |||
'bert-large-cased': 512, | |||
'bert-base-multilingual-uncased': 512, | |||
'bert-base-multilingual-cased': 512, | |||
'bert-base-chinese': 512, | |||
} | |||
VOCAB_NAME = 'vocab.txt' | |||
def load_vocab(vocab_file): | |||
"""Loads a vocabulary file into a dictionary.""" | |||
vocab = collections.OrderedDict() | |||
index = 0 | |||
with open(vocab_file, "r", encoding="utf-8") as reader: | |||
while True: | |||
token = reader.readline() | |||
if not token: | |||
break | |||
token = token.strip() | |||
vocab[token] = index | |||
index += 1 | |||
return vocab | |||
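# Illustrative note (not part of the original file): load_vocab expects a plain-text file with exactly one
# wordpiece per line; the line number becomes the token id. A hypothetical vocab.txt might look like:
#     [PAD]
#     [UNK]
#     [CLS]
#     [SEP]
#     un
#     ##aff
#     ##able
# >>> vocab = load_vocab("vocab.txt")        # path is an assumption
# >>> vocab["[PAD]"], vocab["##able"]
# (0, 6)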
def whitespace_tokenize(text): | |||
"""Runs basic whitespace cleaning and splitting on a piece of text.""" | |||
text = text.strip() | |||
if not text: | |||
return [] | |||
tokens = text.split() | |||
return tokens | |||
class BertTokenizer(object): | |||
"""Runs end-to-end tokenization: punctuation splitting + wordpiece""" | |||
def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokenize=True, | |||
never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): | |||
"""Constructs a BertTokenizer. | |||
Args: | |||
vocab_file: Path to a one-wordpiece-per-line vocabulary file | |||
do_lower_case: Whether to lower case the input | |||
Only has an effect when do_basic_tokenize=True
do_basic_tokenize: Whether to do basic tokenization before wordpiece. | |||
max_len: An artificial maximum length to truncate tokenized sequences to; | |||
Effective maximum length is always the minimum of this | |||
value (if specified) and the underlying BERT model's | |||
sequence length. | |||
never_split: List of tokens which will never be split during tokenization. | |||
Only has an effect when do_basic_tokenize=True
""" | |||
if not os.path.isfile(vocab_file): | |||
raise ValueError( | |||
"Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " | |||
"model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) | |||
self.vocab = load_vocab(vocab_file) | |||
self.ids_to_tokens = collections.OrderedDict( | |||
[(ids, tok) for tok, ids in self.vocab.items()]) | |||
self.do_basic_tokenize = do_basic_tokenize | |||
if do_basic_tokenize: | |||
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, | |||
never_split=never_split) | |||
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) | |||
self.max_len = max_len if max_len is not None else int(1e12) | |||
def tokenize(self, text): | |||
split_tokens = [] | |||
if self.do_basic_tokenize: | |||
for token in self.basic_tokenizer.tokenize(text): | |||
for sub_token in self.wordpiece_tokenizer.tokenize(token): | |||
split_tokens.append(sub_token) | |||
else: | |||
split_tokens = self.wordpiece_tokenizer.tokenize(text) | |||
return split_tokens | |||
def convert_tokens_to_ids(self, tokens): | |||
"""Converts a sequence of tokens into ids using the vocab.""" | |||
ids = [] | |||
for token in tokens: | |||
ids.append(self.vocab[token]) | |||
if len(ids) > self.max_len: | |||
print( | |||
"WARNING!\n\"" | |||
"Token indices sequence length is longer than the specified maximum " | |||
"sequence length for this BERT model ({} > {}). Running this" | |||
" sequence through BERT will result in indexing errors".format(len(ids), self.max_len) | |||
) | |||
return ids | |||
def convert_ids_to_tokens(self, ids): | |||
"""Converts a sequence of ids in wordpiece tokens using the vocab.""" | |||
tokens = [] | |||
for i in ids: | |||
tokens.append(self.ids_to_tokens[i]) | |||
return tokens | |||
def save_vocabulary(self, vocab_path): | |||
"""Save the tokenizer vocabulary to a directory or file.""" | |||
index = 0 | |||
if os.path.isdir(vocab_path): | |||
vocab_file = os.path.join(vocab_path, VOCAB_NAME) | |||
with open(vocab_file, "w", encoding="utf-8") as writer: | |||
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): | |||
if index != token_index: | |||
print("Saving vocabulary to {}: vocabulary indices are not consecutive." | |||
" Please check that the vocabulary is not corrupted!".format(vocab_file)) | |||
index = token_index | |||
writer.write(token + u'\n') | |||
index += 1 | |||
return vocab_file | |||
@classmethod | |||
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): | |||
""" | |||
Instantiate a BertTokenizer from a pre-trained model name or a local vocabulary file/directory.
Note: this trimmed-down version does not download vocabularies, so a local path should be given.
""" | |||
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: | |||
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path] | |||
if '-cased' in pretrained_model_name_or_path and kwargs.get('do_lower_case', True): | |||
print("The pre-trained model you are loading is a cased model but you have not set " | |||
"`do_lower_case` to False. We are setting `do_lower_case=False` for you but " | |||
"you may want to check this behavior.") | |||
kwargs['do_lower_case'] = False | |||
elif '-cased' not in pretrained_model_name_or_path and not kwargs.get('do_lower_case', True): | |||
print("The pre-trained model you are loading is an uncased model but you have set " | |||
"`do_lower_case` to False. We are setting `do_lower_case=True` for you " | |||
"but you may want to check this behavior.") | |||
kwargs['do_lower_case'] = True | |||
else: | |||
vocab_file = pretrained_model_name_or_path | |||
if os.path.isdir(vocab_file): | |||
vocab_file = os.path.join(vocab_file, VOCAB_NAME) | |||
# redirect to the cache, if necessary | |||
resolved_vocab_file = vocab_file | |||
print("loading vocabulary file {}".format(vocab_file)) | |||
if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP: | |||
# if we're using a pretrained model, ensure the tokenizer won't index sequences longer
# than the number of positional embeddings | |||
max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path] | |||
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) | |||
# Instantiate tokenizer. | |||
tokenizer = cls(resolved_vocab_file, *inputs, **kwargs) | |||
return tokenizer | |||
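# Usage sketch (illustrative, not from the original source). This trimmed-down from_pretrained does not
# download anything, so a local directory containing vocab.txt is assumed; the path and the exact output
# depend on the vocabulary file.
# >>> tokenizer = BertTokenizer.from_pretrained("/path/to/bert-base-uncased/")
# >>> tokenizer.tokenize("He was unaffable.")
# ['he', 'was', 'un', '##aff', '##able', '.']
# >>> ids = tokenizer.convert_tokens_to_ids(['[CLS]'] + tokenizer.tokenize("He was unaffable.") + ['[SEP]'])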
class BasicTokenizer(object): | |||
"""Runs basic tokenization (punctuation splitting, lower casing, etc.).""" | |||
def __init__(self, | |||
do_lower_case=True, | |||
never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): | |||
"""Constructs a BasicTokenizer. | |||
Args: | |||
do_lower_case: Whether to lower case the input. | |||
""" | |||
self.do_lower_case = do_lower_case | |||
self.never_split = never_split | |||
def tokenize(self, text): | |||
"""Tokenizes a piece of text.""" | |||
text = self._clean_text(text) | |||
# This was added on November 1st, 2018 for the multilingual and Chinese | |||
# models. This is also applied to the English models now, but it doesn't | |||
# matter since the English models were not trained on any Chinese data | |||
# and generally don't have any Chinese data in them (there are Chinese | |||
# characters in the vocabulary because Wikipedia does have some Chinese | |||
# words in the English Wikipedia.). | |||
text = self._tokenize_chinese_chars(text) | |||
orig_tokens = whitespace_tokenize(text) | |||
split_tokens = [] | |||
for token in orig_tokens: | |||
if self.do_lower_case and token not in self.never_split: | |||
token = token.lower() | |||
token = self._run_strip_accents(token) | |||
split_tokens.extend(self._run_split_on_punc(token)) | |||
output_tokens = whitespace_tokenize(" ".join(split_tokens)) | |||
return output_tokens | |||
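# Illustrative example (not part of the original file) of basic tokenization with the default settings
# (lower casing, accent stripping, punctuation splitting):
# >>> bt = BasicTokenizer()
# >>> bt.tokenize(u"Héllo, I'm 8 years old!")
# ['hello', ',', 'i', "'", 'm', '8', 'years', 'old', '!']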
def _run_strip_accents(self, text): | |||
"""Strips accents from a piece of text.""" | |||
text = unicodedata.normalize("NFD", text) | |||
output = [] | |||
for char in text: | |||
cat = unicodedata.category(char) | |||
if cat == "Mn": | |||
continue | |||
output.append(char) | |||
return "".join(output) | |||
def _run_split_on_punc(self, text): | |||
"""Splits punctuation on a piece of text.""" | |||
if text in self.never_split: | |||
return [text] | |||
chars = list(text) | |||
i = 0 | |||
start_new_word = True | |||
output = [] | |||
while i < len(chars): | |||
char = chars[i] | |||
if _is_punctuation(char): | |||
output.append([char]) | |||
start_new_word = True | |||
else: | |||
if start_new_word: | |||
output.append([]) | |||
start_new_word = False | |||
output[-1].append(char) | |||
i += 1 | |||
return ["".join(x) for x in output] | |||
def _tokenize_chinese_chars(self, text): | |||
"""Adds whitespace around any CJK character.""" | |||
output = [] | |||
for char in text: | |||
cp = ord(char) | |||
if self._is_chinese_char(cp): | |||
output.append(" ") | |||
output.append(char) | |||
output.append(" ") | |||
else: | |||
output.append(char) | |||
return "".join(output) | |||
def _is_chinese_char(self, cp): | |||
"""Checks whether CP is the codepoint of a CJK character.""" | |||
# This defines a "chinese character" as anything in the CJK Unicode block: | |||
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) | |||
# | |||
# Note that the CJK Unicode block is NOT all Japanese and Korean characters, | |||
# despite its name. The modern Korean Hangul alphabet is a different block, | |||
# as is Japanese Hiragana and Katakana. Those alphabets are used to write | |||
# space-separated words, so they are not treated specially and handled | |||
like all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or # | |||
(cp >= 0x3400 and cp <= 0x4DBF) or # | |||
(cp >= 0x20000 and cp <= 0x2A6DF) or # | |||
(cp >= 0x2A700 and cp <= 0x2B73F) or # | |||
(cp >= 0x2B740 and cp <= 0x2B81F) or # | |||
(cp >= 0x2B820 and cp <= 0x2CEAF) or | |||
(cp >= 0xF900 and cp <= 0xFAFF) or # | |||
(cp >= 0x2F800 and cp <= 0x2FA1F)): # | |||
return True | |||
return False | |||
def _clean_text(self, text): | |||
"""Performs invalid character removal and whitespace cleanup on text.""" | |||
output = [] | |||
for char in text: | |||
cp = ord(char) | |||
if cp == 0 or cp == 0xfffd or _is_control(char): | |||
continue | |||
if _is_whitespace(char): | |||
output.append(" ") | |||
else: | |||
output.append(char) | |||
return "".join(output) | |||
class WordpieceTokenizer(object): | |||
"""Runs WordPiece tokenization.""" | |||
def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): | |||
self.vocab = vocab | |||
self.unk_token = unk_token | |||
self.max_input_chars_per_word = max_input_chars_per_word | |||
def tokenize(self, text): | |||
"""Tokenizes a piece of text into its word pieces. | |||
This uses a greedy longest-match-first algorithm to perform tokenization | |||
using the given vocabulary. | |||
For example: | |||
input = "unaffable" | |||
output = ["un", "##aff", "##able"] | |||
Args: | |||
text: A single token or whitespace separated tokens. This should have | |||
already been passed through `BasicTokenizer`. | |||
Returns: | |||
A list of wordpiece tokens. | |||
""" | |||
output_tokens = [] | |||
for token in whitespace_tokenize(text): | |||
chars = list(token) | |||
if len(chars) > self.max_input_chars_per_word: | |||
output_tokens.append(self.unk_token) | |||
continue | |||
is_bad = False | |||
start = 0 | |||
sub_tokens = [] | |||
while start < len(chars): | |||
end = len(chars) | |||
cur_substr = None | |||
while start < end: | |||
substr = "".join(chars[start:end]) | |||
if start > 0: | |||
substr = "##" + substr | |||
if substr in self.vocab: | |||
cur_substr = substr | |||
break | |||
end -= 1 | |||
if cur_substr is None: | |||
is_bad = True | |||
break | |||
sub_tokens.append(cur_substr) | |||
start = end | |||
if is_bad: | |||
output_tokens.append(self.unk_token) | |||
else: | |||
output_tokens.extend(sub_tokens) | |||
return output_tokens | |||
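# Worked example (illustrative, not in the original file) of the greedy longest-match-first strategy,
# assuming a toy vocabulary that contains "un", "##aff" and "##able" but not "unaffable":
# >>> wp = WordpieceTokenizer(vocab={"un": 0, "##aff": 1, "##able": 2, "[UNK]": 3})
# >>> wp.tokenize("unaffable")
# ['un', '##aff', '##able']
# >>> wp.tokenize("xyzzy")   # no piece matches at the first position, so the whole token becomes [UNK]
# ['[UNK]']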
def _is_whitespace(char): | |||
"""Checks whether `chars` is a whitespace character.""" | |||
# \t, \n, and \r are technically control characters but we treat them
# as whitespace since they are generally considered as such. | |||
if char == " " or char == "\t" or char == "\n" or char == "\r": | |||
return True | |||
cat = unicodedata.category(char) | |||
if cat == "Zs": | |||
return True | |||
return False | |||
def _is_control(char): | |||
"""Checks whether `chars` is a control character.""" | |||
# These are technically control characters but we count them as whitespace | |||
# characters. | |||
if char == "\t" or char == "\n" or char == "\r": | |||
return False | |||
cat = unicodedata.category(char) | |||
if cat.startswith("C"): | |||
return True | |||
return False | |||
def _is_punctuation(char): | |||
"""Checks whether `chars` is a punctuation character.""" | |||
cp = ord(char) | |||
# We treat all non-letter/number ASCII as punctuation. | |||
# Characters such as "^", "$", and "`" are not in the Unicode | |||
# Punctuation class but we treat them as punctuation anyways, for | |||
# consistency. | |||
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or | |||
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): | |||
return True | |||
cat = unicodedata.category(char) | |||
if cat.startswith("P"): | |||
return True | |||
return False | |||
@@ -13,7 +13,7 @@ from fastNLP.core.utils import get_func_signature | |||
class LossBase(object): | |||
"""Base class for all losses. | |||
"""所有loss的基类. | |||
""" | |||
def __init__(self): | |||
@@ -24,10 +24,10 @@ class LossBase(object): | |||
raise NotImplementedError | |||
def _init_param_map(self, key_map=None, **kwargs): | |||
"""Check the validity of key_map and other param map. Add these into self.param_map | |||
"""检查key_map和其他参数map,并将这些映射关系添加到self.param_map | |||
:param key_map: dict | |||
:param kwargs: | |||
:param dict key_map: 表示key的映射关系 | |||
:param kwargs: key word args里面的每一个的键-值对都会被构造成映射关系 | |||
:return: None | |||
""" | |||
value_counter = defaultdict(set) | |||
@@ -87,9 +87,9 @@ class LossBase(object): | |||
def __call__(self, pred_dict, target_dict, check=False): | |||
""" | |||
:param dict pred_dict: the dict returned by the model's forward function
:param dict target_dict: the dict built from the key-value pairs in DataSet.batch_y
:param bool check: whether to re-check the mappings every time they are applied; defaults to False
:return: | |||
""" | |||
fast_param = self._fast_param_map(pred_dict, target_dict) | |||
@@ -162,15 +162,25 @@ class LossBase(object): | |||
class LossFunc(LossBase): | |||
"""A wrapper of user-provided loss function. | |||
"""提供给用户使用自定义损失函数的类 | |||
""" | |||
def __init__(self, func, key_map=None, **kwargs): | |||
""" | |||
:param func: a user-defined loss function; it should be a function or any object for which callable(func) is True
:param dict key_map: the parameter mapping table. Keys are the loss function's parameter names and values are the
corresponding parameter names in the model output / DataSet. At training time, fastNLP's trainer looks in the
model's return dict and in the DataSet fields marked target=True for a parameter whose name is the value, and
passes it to func as the parameter whose name is the key.
:param kwargs: parameter mappings can also be given as keyword arguments instead of key_map
Example:: | |||
>>> func = torch.nn.CrossEntropyLoss() | |||
>>> loss_func = LossFunc(func, input="pred", target="label") | |||
>>> # This builds a loss object whose loss is computed by func: a parameter named `pred` is looked up in the
>>> # model's return dict or in the DataSet fields with target=True and passed to func as `input`, and a
>>> # parameter named `label` is looked up in the same way and passed to func as `target`.
""" | |||
super(LossFunc, self).__init__() | |||
_check_function_or_method(func) | |||
@@ -186,7 +196,17 @@ class LossFunc(LossBase): | |||
class CrossEntropyLoss(LossBase): | |||
"""交叉熵损失函数""" | |||
def __init__(self, pred=None, target=None, padding_idx=-100): | |||
""" | |||
:param pred: the mapping for `pred` in the parameter map; None means the mapping `pred`->`pred`
:param target: the mapping for `target` in the parameter map; None means the mapping `target`->`target`
:param padding_idx: the padding index; entries of target equal to padding_idx are ignored when computing the loss
Example:: | |||
>>> loss = CrossEntropyLoss(pred='pred', target='label', padding_idx=0) | |||
""" | |||
# TODO some checking is needed: when F.cross_entropy gets a pred of shape (16, 10, 4), target should in principle
# TODO be of shape (16, 10), but in practice a shape of (16, 4) is required
super(CrossEntropyLoss, self).__init__() | |||
@@ -199,7 +219,12 @@ class CrossEntropyLoss(LossBase): | |||
class L1Loss(LossBase): | |||
"""L1损失函数""" | |||
def __init__(self, pred=None, target=None): | |||
""" | |||
:param pred: the mapping for `pred` in the parameter map; None means the mapping `pred`->`pred`
:param target: the mapping for `target` in the parameter map; None means the mapping `target`->`target`
""" | |||
super(L1Loss, self).__init__() | |||
self._init_param_map(pred=pred, target=target) | |||
@@ -208,7 +233,12 @@ class L1Loss(LossBase): | |||
class BCELoss(LossBase): | |||
"""二分类交叉熵损失函数""" | |||
def __init__(self, pred=None, target=None): | |||
""" | |||
:param pred: the mapping for `pred` in the parameter map; None means the mapping `pred`->`pred`
:param target: the mapping for `target` in the parameter map; None means the mapping `target`->`target`
""" | |||
super(BCELoss, self).__init__() | |||
self._init_param_map(pred=pred, target=target) | |||
@@ -217,7 +247,12 @@ class BCELoss(LossBase): | |||
class NLLLoss(LossBase): | |||
"""负对数似然损失函数""" | |||
def __init__(self, pred=None, target=None): | |||
""" | |||
:param pred: the mapping for `pred` in the parameter map; None means the mapping `pred`->`pred`
:param target: the mapping for `target` in the parameter map; None means the mapping `target`->`target`
""" | |||
super(NLLLoss, self).__init__() | |||
self._init_param_map(pred=pred, target=target) | |||
@@ -226,7 +261,11 @@ class NLLLoss(LossBase): | |||
class LossInForward(LossBase): | |||
"""Forward函数中计算得到的损失函数结果""" | |||
def __init__(self, loss_key='loss'): | |||
""" | |||
:param str loss_key: the key under which the loss is stored in the forward return dict; defaults to 'loss'
""" | |||
super().__init__() | |||
if not isinstance(loss_key, str): | |||
raise TypeError(f"Only str allowed for loss_key, got {type(loss_key)}.") | |||
@@ -234,13 +273,14 @@ class LossInForward(LossBase): | |||
def get_loss(self, **kwargs): | |||
if self.loss_key not in kwargs: | |||
check_res = CheckRes( | |||
missing=[self.loss_key + f"(assign to `{self.loss_key}` in `{self.__class__.__name__}`"], | |||
unused=[], | |||
duplicated=[], | |||
required=[], | |||
all_needed=[], | |||
varargs=[] | |||
) | |||
raise CheckError(check_res=check_res, func_signature=get_func_signature(self.get_loss)) | |||
return kwargs[self.loss_key] | |||
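# Usage sketch (illustrative, not part of the original file): LossInForward is intended for models whose
# forward() already returns the loss under some key; the model below is purely hypothetical.
# >>> class MyModel(torch.nn.Module):
# ...     def forward(self, words, target):
# ...         pred = self.encode(words)                       # hypothetical helper
# ...         loss = self.compute_loss(pred, target)          # hypothetical helper
# ...         return {"pred": pred, "loss": loss}
# >>> loss = LossInForward(loss_key="loss")                   # the trainer then simply reads kwargs["loss"]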
@@ -2,361 +2,290 @@ | |||
bert.py is modified from huggingface/pytorch-pretrained-BERT, which is licensed under the Apache License 2.0. | |||
""" | |||
import copy | |||
import json | |||
import math | |||
import os | |||
import torch | |||
from torch import nn | |||
CONFIG_FILE = 'bert_config.json' | |||
MODEL_WEIGHTS = 'pytorch_model.bin' | |||
def gelu(x): | |||
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) | |||
def swish(x): | |||
return x * torch.sigmoid(x) | |||
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} | |||
class BertLayerNorm(nn.Module): | |||
def __init__(self, hidden_size, eps=1e-12): | |||
super(BertLayerNorm, self).__init__() | |||
self.weight = nn.Parameter(torch.ones(hidden_size)) | |||
self.bias = nn.Parameter(torch.zeros(hidden_size)) | |||
self.variance_epsilon = eps | |||
def forward(self, x): | |||
u = x.mean(-1, keepdim=True) | |||
s = (x - u).pow(2).mean(-1, keepdim=True) | |||
x = (x - u) / torch.sqrt(s + self.variance_epsilon) | |||
return self.weight * x + self.bias | |||
class BertEmbeddings(nn.Module): | |||
def __init__(self, vocab_size, hidden_size, max_position_embeddings, type_vocab_size, hidden_dropout_prob): | |||
super(BertEmbeddings, self).__init__() | |||
self.word_embeddings = nn.Embedding(vocab_size, hidden_size) | |||
self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size) | |||
self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size) | |||
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load | |||
# any TensorFlow checkpoint file | |||
self.LayerNorm = BertLayerNorm(hidden_size, eps=1e-12) | |||
self.dropout = nn.Dropout(hidden_dropout_prob) | |||
def forward(self, input_ids, token_type_ids=None): | |||
seq_length = input_ids.size(1) | |||
position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) | |||
position_ids = position_ids.unsqueeze(0).expand_as(input_ids) | |||
if token_type_ids is None: | |||
token_type_ids = torch.zeros_like(input_ids) | |||
words_embeddings = self.word_embeddings(input_ids) | |||
position_embeddings = self.position_embeddings(position_ids) | |||
token_type_embeddings = self.token_type_embeddings(token_type_ids) | |||
embeddings = words_embeddings + position_embeddings + token_type_embeddings | |||
embeddings = self.LayerNorm(embeddings) | |||
embeddings = self.dropout(embeddings) | |||
return embeddings | |||
class BertSelfAttention(nn.Module): | |||
def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob): | |||
super(BertSelfAttention, self).__init__() | |||
if hidden_size % num_attention_heads != 0: | |||
raise ValueError( | |||
"The hidden size (%d) is not a multiple of the number of attention " | |||
"heads (%d)" % (hidden_size, num_attention_heads)) | |||
self.num_attention_heads = num_attention_heads | |||
self.attention_head_size = int(hidden_size / num_attention_heads) | |||
self.all_head_size = self.num_attention_heads * self.attention_head_size | |||
self.query = nn.Linear(hidden_size, self.all_head_size) | |||
self.key = nn.Linear(hidden_size, self.all_head_size) | |||
self.value = nn.Linear(hidden_size, self.all_head_size) | |||
self.dropout = nn.Dropout(attention_probs_dropout_prob) | |||
def transpose_for_scores(self, x): | |||
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) | |||
x = x.view(*new_x_shape) | |||
return x.permute(0, 2, 1, 3) | |||
def forward(self, hidden_states, attention_mask): | |||
mixed_query_layer = self.query(hidden_states) | |||
mixed_key_layer = self.key(hidden_states) | |||
mixed_value_layer = self.value(hidden_states) | |||
query_layer = self.transpose_for_scores(mixed_query_layer) | |||
key_layer = self.transpose_for_scores(mixed_key_layer) | |||
value_layer = self.transpose_for_scores(mixed_value_layer) | |||
# Take the dot product between "query" and "key" to get the raw attention scores. | |||
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) | |||
attention_scores = attention_scores / math.sqrt(self.attention_head_size) | |||
# Apply the attention mask (precomputed for all layers in the BertModel forward() function)
attention_scores = attention_scores + attention_mask | |||
# Normalize the attention scores to probabilities. | |||
attention_probs = nn.Softmax(dim=-1)(attention_scores) | |||
# This is actually dropping out entire tokens to attend to, which might | |||
# seem a bit unusual, but is taken from the original Transformer paper. | |||
attention_probs = self.dropout(attention_probs) | |||
context_layer = torch.matmul(attention_probs, value_layer) | |||
context_layer = context_layer.permute(0, 2, 1, 3).contiguous() | |||
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) | |||
context_layer = context_layer.view(*new_context_layer_shape) | |||
return context_layer | |||
class BertSelfOutput(nn.Module): | |||
def __init__(self, hidden_size, hidden_dropout_prob): | |||
super(BertSelfOutput, self).__init__() | |||
self.dense = nn.Linear(hidden_size, hidden_size) | |||
self.LayerNorm = BertLayerNorm(hidden_size, eps=1e-12) | |||
self.dropout = nn.Dropout(hidden_dropout_prob) | |||
def forward(self, hidden_states, input_tensor): | |||
hidden_states = self.dense(hidden_states) | |||
hidden_states = self.dropout(hidden_states) | |||
hidden_states = self.LayerNorm(hidden_states + input_tensor) | |||
return hidden_states | |||
class BertAttention(nn.Module): | |||
def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob, hidden_dropout_prob): | |||
super(BertAttention, self).__init__() | |||
self.self = BertSelfAttention(hidden_size, num_attention_heads, attention_probs_dropout_prob) | |||
self.output = BertSelfOutput(hidden_size, hidden_dropout_prob) | |||
def forward(self, input_tensor, attention_mask): | |||
self_output = self.self(input_tensor, attention_mask) | |||
attention_output = self.output(self_output, input_tensor) | |||
return attention_output | |||
class BertIntermediate(nn.Module): | |||
def __init__(self, hidden_size, intermediate_size, hidden_act): | |||
super(BertIntermediate, self).__init__() | |||
self.dense = nn.Linear(hidden_size, intermediate_size) | |||
self.intermediate_act_fn = ACT2FN[hidden_act] \ | |||
if isinstance(hidden_act, str) else hidden_act | |||
def forward(self, hidden_states): | |||
hidden_states = self.dense(hidden_states) | |||
hidden_states = self.intermediate_act_fn(hidden_states) | |||
return hidden_states | |||
class BertOutput(nn.Module): | |||
def __init__(self, hidden_size, intermediate_size, hidden_dropout_prob): | |||
super(BertOutput, self).__init__() | |||
self.dense = nn.Linear(intermediate_size, hidden_size) | |||
self.LayerNorm = BertLayerNorm(hidden_size, eps=1e-12) | |||
self.dropout = nn.Dropout(hidden_dropout_prob) | |||
def forward(self, hidden_states, input_tensor): | |||
hidden_states = self.dense(hidden_states) | |||
hidden_states = self.dropout(hidden_states) | |||
hidden_states = self.LayerNorm(hidden_states + input_tensor) | |||
return hidden_states | |||
class BertLayer(nn.Module): | |||
def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob, hidden_dropout_prob, | |||
intermediate_size, hidden_act): | |||
super(BertLayer, self).__init__() | |||
self.attention = BertAttention(hidden_size, num_attention_heads, attention_probs_dropout_prob, | |||
hidden_dropout_prob) | |||
self.intermediate = BertIntermediate(hidden_size, intermediate_size, hidden_act) | |||
self.output = BertOutput(hidden_size, intermediate_size, hidden_dropout_prob) | |||
def forward(self, hidden_states, attention_mask): | |||
attention_output = self.attention(hidden_states, attention_mask) | |||
intermediate_output = self.intermediate(attention_output) | |||
layer_output = self.output(intermediate_output, attention_output) | |||
return layer_output | |||
class BertEncoder(nn.Module): | |||
def __init__(self, num_hidden_layers, hidden_size, num_attention_heads, attention_probs_dropout_prob, | |||
hidden_dropout_prob, | |||
intermediate_size, hidden_act): | |||
super(BertEncoder, self).__init__() | |||
layer = BertLayer(hidden_size, num_attention_heads, attention_probs_dropout_prob, hidden_dropout_prob, | |||
intermediate_size, hidden_act) | |||
self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(num_hidden_layers)]) | |||
def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True): | |||
all_encoder_layers = [] | |||
for layer_module in self.layer: | |||
hidden_states = layer_module(hidden_states, attention_mask) | |||
if output_all_encoded_layers: | |||
all_encoder_layers.append(hidden_states) | |||
if not output_all_encoded_layers: | |||
all_encoder_layers.append(hidden_states) | |||
return all_encoder_layers | |||
class BertPooler(nn.Module): | |||
def __init__(self, hidden_size): | |||
super(BertPooler, self).__init__() | |||
self.dense = nn.Linear(hidden_size, hidden_size) | |||
self.activation = nn.Tanh() | |||
def forward(self, hidden_states): | |||
# We "pool" the model by simply taking the hidden state corresponding | |||
# to the first token. | |||
first_token_tensor = hidden_states[:, 0] | |||
pooled_output = self.dense(first_token_tensor) | |||
pooled_output = self.activation(pooled_output) | |||
return pooled_output | |||
class BertModel(nn.Module): | |||
"""Bidirectional Embedding Representations from Transformers. | |||
If you want to use pre-trained weights, please download from the following sources provided by pytorch-pretrained-BERT. | |||
sources:: | |||
'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz", | |||
'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz", | |||
'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz", | |||
'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz", | |||
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz", | |||
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz", | |||
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz", | |||
Construct a BERT model with pre-trained weights:: | |||
model = BertModel.from_pretrained("path/to/weights/directory")
"""
def __init__(self, vocab_size, | |||
hidden_size=768, | |||
num_hidden_layers=12, | |||
num_attention_heads=12, | |||
intermediate_size=3072, | |||
hidden_act="gelu", | |||
hidden_dropout_prob=0.1, | |||
attention_probs_dropout_prob=0.1, | |||
max_position_embeddings=512, | |||
type_vocab_size=2, | |||
initializer_range=0.02, **kwargs): | |||
super(BertModel, self).__init__() | |||
self.embeddings = BertEmbeddings(vocab_size, hidden_size, max_position_embeddings, | |||
type_vocab_size, hidden_dropout_prob) | |||
self.encoder = BertEncoder(num_hidden_layers, hidden_size, num_attention_heads, | |||
attention_probs_dropout_prob, hidden_dropout_prob, intermediate_size, | |||
hidden_act) | |||
self.pooler = BertPooler(hidden_size) | |||
self.initializer_range = initializer_range | |||
self.apply(self.init_bert_weights) | |||
def init_bert_weights(self, module): | |||
if isinstance(module, (nn.Linear, nn.Embedding)): | |||
# Slightly different from the TF version which uses truncated_normal for initialization | |||
# cf https://github.com/pytorch/pytorch/pull/5617 | |||
module.weight.data.normal_(mean=0.0, std=self.initializer_range) | |||
elif isinstance(module, BertLayerNorm): | |||
module.bias.data.zero_() | |||
module.weight.data.fill_(1.0) | |||
if isinstance(module, nn.Linear) and module.bias is not None: | |||
module.bias.data.zero_() | |||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True): | |||
if attention_mask is None: | |||
attention_mask = torch.ones_like(input_ids) | |||
if token_type_ids is None: | |||
token_type_ids = torch.zeros_like(input_ids) | |||
# We create a 3D attention mask from a 2D tensor mask. | |||
# Sizes are [batch_size, 1, 1, to_seq_length] | |||
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] | |||
# this attention mask is more simple than the triangular masking of causal attention | |||
# used in OpenAI GPT, we just need to prepare the broadcast dimension here. | |||
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) | |||
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for | |||
# masked positions, this operation will create a tensor which is 0.0 for | |||
# positions we want to attend and -10000.0 for masked positions. | |||
# Since we are adding it to the raw scores before the softmax, this is | |||
# effectively the same as removing these entirely. | |||
extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility | |||
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 | |||
embedding_output = self.embeddings(input_ids, token_type_ids) | |||
encoded_layers = self.encoder(embedding_output, | |||
extended_attention_mask, | |||
output_all_encoded_layers=output_all_encoded_layers) | |||
sequence_output = encoded_layers[-1] | |||
pooled_output = self.pooler(sequence_output) | |||
if not output_all_encoded_layers: | |||
encoded_layers = encoded_layers[-1] | |||
return encoded_layers, pooled_output | |||
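# Shape sketch (illustrative, not part of the original file), assuming the default hyper-parameters:
# >>> model = BertModel(vocab_size=30522)
# >>> input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])     # [batch_size=2, seq_len=3]
# >>> all_layers, pooled = model(input_ids)
# >>> len(all_layers), all_layers[-1].shape, pooled.shape
# (12, torch.Size([2, 3, 768]), torch.Size([2, 768]))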
@classmethod | |||
def from_pretrained(cls, pretrained_model_dir, state_dict=None, *inputs, **kwargs): | |||
# Load config | |||
config_file = os.path.join(pretrained_model_dir, CONFIG_FILE) | |||
config = json.load(open(config_file, "r")) | |||
# config = BertConfig.from_json_file(config_file) | |||
# logger.info("Model config {}".format(config)) | |||
# Instantiate model. | |||
model = cls(*inputs, **config, **kwargs) | |||
if state_dict is None: | |||
weights_path = os.path.join(pretrained_model_dir, MODEL_WEIGHTS) | |||
state_dict = torch.load(weights_path) | |||
old_keys = [] | |||
new_keys = [] | |||
for key in state_dict.keys(): | |||
new_key = None | |||
if 'gamma' in key: | |||
new_key = key.replace('gamma', 'weight') | |||
if 'beta' in key: | |||
new_key = key.replace('beta', 'bias') | |||
if new_key: | |||
old_keys.append(key) | |||
new_keys.append(new_key) | |||
for old_key, new_key in zip(old_keys, new_keys): | |||
state_dict[new_key] = state_dict.pop(old_key) | |||
missing_keys = [] | |||
unexpected_keys = [] | |||
error_msgs = [] | |||
# copy state_dict so _load_from_state_dict can modify it | |||
metadata = getattr(state_dict, '_metadata', None) | |||
state_dict = state_dict.copy() | |||
if metadata is not None: | |||
state_dict._metadata = metadata | |||
def load(module, prefix=''): | |||
local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) | |||
module._load_from_state_dict( | |||
state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) | |||
for name, child in module._modules.items(): | |||
if child is not None: | |||
load(child, prefix + name + '.') | |||
load(model, prefix='' if hasattr(model, 'bert') else 'bert.') | |||
if len(missing_keys) > 0: | |||
print("Weights of {} not initialized from pretrained model: {}".format( | |||
model.__class__.__name__, missing_keys)) | |||
if len(unexpected_keys) > 0: | |||
print("Weights from pretrained model not used in {}: {}".format( | |||
model.__class__.__name__, unexpected_keys)) | |||
return model | |||
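# Loading sketch (illustrative, not from the original source): pretrained_model_dir is expected to contain
# CONFIG_FILE ('bert_config.json') and MODEL_WEIGHTS ('pytorch_model.bin'); the path below is an assumption.
# >>> model = BertModel.from_pretrained("/path/to/uncased_L-12_H-768_A-12/")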
from .base_model import BaseModel
from fastNLP.modules.encoder import BertModel
class BertForSequenceClassification(BaseModel):
"""BERT model for classification.
This module is composed of the BERT model with a linear layer on top of
the pooled output.
Params:
`config`: a BertConfig class instance with the configuration to build a new model.
`num_labels`: the number of classes for the classifier. Default = 2.
`bert_dir`: a dir which contains the bert parameters within file `pytorch_model.bin`
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary. Items in the batch should begin with the special "CLS" token. (see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
with indices selected in [0, ..., num_labels].
Outputs:
if `labels` is not `None`:
Outputs the CrossEntropy classification loss of the output with the labels.
if `labels` is `None`:
Outputs the classification logits of shape [batch_size, num_labels].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
num_labels = 2
bert_dir = 'your-bert-file-dir'
model = BertForSequenceClassification(config, num_labels, bert_dir)
logits = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config, num_labels, bert_dir):
super(BertForSequenceClassification, self).__init__() | |||
self.num_labels = num_labels | |||
self.bert = BertModel.from_pretrained(bert_dir) | |||
self.dropout = nn.Dropout(config.hidden_dropout_prob) | |||
self.classifier = nn.Linear(config.hidden_size, num_labels) | |||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): | |||
_, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) | |||
pooled_output = self.dropout(pooled_output) | |||
logits = self.classifier(pooled_output) | |||
if labels is not None: | |||
loss_fct = nn.CrossEntropyLoss() | |||
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) | |||
return {"pred": logits, "loss": loss} | |||
else: | |||
return {"pred": logits} | |||
def predict(self, input_ids, token_type_ids=None, attention_mask=None): | |||
logits = self.forward(input_ids, token_type_ids, attention_mask)["pred"]
return {"pred": torch.argmax(logits, dim=-1)} | |||
class BertForMultipleChoice(BaseModel): | |||
"""BERT model for multiple choice tasks. | |||
This module is composed of the BERT model with a linear layer on top of | |||
the pooled output. | |||
Params: | |||
`config`: a BertConfig class instance with the configuration to build a new model. | |||
`num_choices`: the number of classes for the classifier. Default = 2.
`bert_dir`: a dir which contains the bert parameters within file `pytorch_model.bin`
Inputs: | |||
`input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] | |||
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts | |||
`extract_features.py`, `run_classifier.py` and `run_squad.py`) | |||
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] | |||
with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` | |||
and type 1 corresponds to a `sentence B` token (see BERT paper for more details). | |||
`attention_mask`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] with indices | |||
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max | |||
input sequence length in the current batch. It's the mask that we typically use for attention when | |||
a batch has varying length sentences. | |||
`labels`: labels for the classification output: torch.LongTensor of shape [batch_size] | |||
with indices selected in [0, ..., num_choices]. | |||
Outputs: | |||
if `labels` is not `None`: | |||
Outputs the CrossEntropy classification loss of the output with the labels. | |||
if `labels` is `None`: | |||
Outputs the classification logits of shape [batch_size, num_labels]. | |||
Example usage: | |||
```python | |||
# Already been converted into WordPiece token ids | |||
input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]]) | |||
input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]]) | |||
token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]]) | |||
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, | |||
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) | |||
num_choices = 2 | |||
model = BertForMultipleChoice(config, num_choices, bert_dir) | |||
logits = model(input_ids, token_type_ids, input_mask) | |||
``` | |||
""" | |||
def __init__(self, config, num_choices, bert_dir): | |||
super(BertForMultipleChoice, self).__init__() | |||
self.num_choices = num_choices | |||
self.bert = BertModel.from_pretrained(bert_dir) | |||
self.dropout = nn.Dropout(config.hidden_dropout_prob) | |||
self.classifier = nn.Linear(config.hidden_size, 1) | |||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): | |||
flat_input_ids = input_ids.view(-1, input_ids.size(-1)) | |||
flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) | |||
flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) | |||
_, pooled_output = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, output_all_encoded_layers=False) | |||
pooled_output = self.dropout(pooled_output) | |||
logits = self.classifier(pooled_output) | |||
reshaped_logits = logits.view(-1, self.num_choices) | |||
if labels is not None: | |||
loss_fct = nn.CrossEntropyLoss() | |||
loss = loss_fct(reshaped_logits, labels) | |||
return {"pred": reshaped_logits, "loss": loss} | |||
else: | |||
return {"pred": reshaped_logits} | |||
def predict(self, input_ids, token_type_ids=None, attention_mask=None): | |||
logits = self.forward(input_ids, token_type_ids, attention_mask)["pred"] | |||
return {"pred": torch.argmax(logits, dim=-1)} | |||
class BertForTokenClassification(BaseModel): | |||
"""BERT model for token-level classification. | |||
This module is composed of the BERT model with a linear layer on top of | |||
the full hidden state of the last layer. | |||
Params: | |||
`config`: a BertConfig class instance with the configuration to build a new model. | |||
`num_labels`: the number of classes for the classifier. Default = 2. | |||
`bert_dir`: a dir which contains the bert parameters within file `pytorch_model.bin` | |||
Inputs: | |||
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] | |||
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts | |||
`extract_features.py`, `run_classifier.py` and `run_squad.py`) | |||
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token | |||
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to | |||
a `sentence B` token (see BERT paper for more details). | |||
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices | |||
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max | |||
input sequence length in the current batch. It's the mask that we typically use for attention when | |||
a batch has varying length sentences. | |||
`labels`: labels for the classification output: torch.LongTensor of shape [batch_size, sequence_length] | |||
with indices selected in [0, ..., num_labels]. | |||
Outputs: | |||
if `labels` is not `None`: | |||
Outputs the CrossEntropy classification loss of the output with the labels. | |||
if `labels` is `None`: | |||
Outputs the classification logits of shape [batch_size, sequence_length, num_labels]. | |||
Example usage: | |||
```python | |||
# Already been converted into WordPiece token ids | |||
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) | |||
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) | |||
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) | |||
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, | |||
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) | |||
num_labels = 2 | |||
bert_dir = 'your-bert-file-dir' | |||
model = BertForTokenClassification(config, num_labels, bert_dir) | |||
logits = model(input_ids, token_type_ids, input_mask) | |||
``` | |||
""" | |||
def __init__(self, config, num_labels, bert_dir): | |||
super(BertForTokenClassification, self).__init__() | |||
self.num_labels = num_labels | |||
self.bert = BertModel.from_pretrained(bert_dir) | |||
self.dropout = nn.Dropout(config.hidden_dropout_prob) | |||
self.classifier = nn.Linear(config.hidden_size, num_labels) | |||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): | |||
sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) | |||
sequence_output = self.dropout(sequence_output) | |||
logits = self.classifier(sequence_output) | |||
if labels is not None: | |||
loss_fct = nn.CrossEntropyLoss() | |||
# Only keep active parts of the loss | |||
if attention_mask is not None: | |||
active_loss = attention_mask.view(-1) == 1 | |||
active_logits = logits.view(-1, self.num_labels)[active_loss] | |||
active_labels = labels.view(-1)[active_loss] | |||
loss = loss_fct(active_logits, active_labels) | |||
else: | |||
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) | |||
return {"pred": logits, "loss": loss} | |||
else: | |||
return {"pred": logits} | |||
def predict(self, input_ids, token_type_ids=None, attention_mask=None): | |||
logits = self.forward(input_ids, token_type_ids, attention_mask)["pred"] | |||
return {"pred": torch.argmax(logits, dim=-1)} | |||
class BertForQuestionAnswering(BaseModel): | |||
"""BERT model for Question Answering (span extraction). | |||
This module is composed of the BERT model with a linear layer on top of | |||
the sequence output that computes start_logits and end_logits | |||
Params: | |||
`config`: a BertConfig class instance with the configuration to build a new model. | |||
`bert_dir`: a dir which contains the bert parameters within file `pytorch_model.bin` | |||
Inputs: | |||
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] | |||
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts | |||
`extract_features.py`, `run_classifier.py` and `run_squad.py`) | |||
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token | |||
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to | |||
a `sentence B` token (see BERT paper for more details). | |||
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices | |||
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max | |||
input sequence length in the current batch. It's the mask that we typically use for attention when | |||
a batch has varying length sentences. | |||
`start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size]. | |||
Positions are clamped to the length of the sequence and position outside of the sequence are not taken | |||
into account for computing the loss. | |||
`end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size]. | |||
Positions are clamped to the length of the sequence and position outside of the sequence are not taken | |||
into account for computing the loss. | |||
Outputs: | |||
if `start_positions` and `end_positions` are not `None`: | |||
Outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions. | |||
if `start_positions` or `end_positions` is `None`: | |||
Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end | |||
position tokens of shape [batch_size, sequence_length]. | |||
Example usage: | |||
```python | |||
# Already been converted into WordPiece token ids | |||
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) | |||
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) | |||
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) | |||
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, | |||
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) | |||
bert_dir = 'your-bert-file-dir' | |||
model = BertForQuestionAnswering(config, bert_dir) | |||
start_logits, end_logits = model(input_ids, token_type_ids, input_mask) | |||
``` | |||
""" | |||
def __init__(self, config, bert_dir): | |||
super(BertForQuestionAnswering, self).__init__() | |||
self.bert = BertModel.from_pretrained(bert_dir) | |||
# TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version | |||
# self.dropout = nn.Dropout(config.hidden_dropout_prob) | |||
self.qa_outputs = nn.Linear(config.hidden_size, 2) | |||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, end_positions=None): | |||
sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) | |||
logits = self.qa_outputs(sequence_output) | |||
start_logits, end_logits = logits.split(1, dim=-1) | |||
start_logits = start_logits.squeeze(-1) | |||
end_logits = end_logits.squeeze(-1) | |||
if start_positions is not None and end_positions is not None: | |||
# If we are on multi-GPU, split add a dimension | |||
if len(start_positions.size()) > 1: | |||
start_positions = start_positions.squeeze(-1) | |||
if len(end_positions.size()) > 1: | |||
end_positions = end_positions.squeeze(-1) | |||
# sometimes the start/end positions are outside our model inputs, we ignore these terms | |||
ignored_index = start_logits.size(1) | |||
start_positions.clamp_(0, ignored_index) | |||
end_positions.clamp_(0, ignored_index) | |||
loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index) | |||
start_loss = loss_fct(start_logits, start_positions) | |||
end_loss = loss_fct(end_logits, end_positions) | |||
total_loss = (start_loss + end_loss) / 2 | |||
return {"loss": total_loss} | |||
else: | |||
return {"pred1": start_logits, "pred2": end_logits} | |||
def predict(self, input_ids, token_type_ids=None, attention_mask=None, **kwargs): | |||
logits = self.forward(input_ids, token_type_ids, attention_mask) | |||
start_logits = logits["pred1"] | |||
end_logits = logits["pred2"] | |||
return {"pred1": torch.argmax(start_logits, dim=-1), "pred2": torch.argmax(end_logits, dim=-1)} |
@@ -5,38 +5,45 @@ from fastNLP.models.base_model import BaseModel | |||
from fastNLP.modules import decoder as Decoder | |||
from fastNLP.modules import encoder as Encoder | |||
from fastNLP.modules import aggregator as Aggregator | |||
from fastNLP.modules.utils import seq_mask | |||
my_inf = 10e12 | |||
class ESIM(BaseModel): | |||
""" | |||
PyTorch Network for SNLI task using ESIM model. | |||
"""ESIM模型的一个PyTorch实现。 | |||
ESIM模型的论文: Enhanced LSTM for Natural Language Inference (arXiv: 1609.06038) | |||
""" | |||
def __init__(self, vocab_size, embed_dim, hidden_size, dropout=0.0, num_classes=3, init_embedding=None): | |||
""" | |||
:param int vocab_size: vocabulary size
:param int embed_dim: dimension of the word embeddings
:param int hidden_size: hidden size of the LSTM
:param float dropout: dropout rate, 0 by default
:param int num_classes: number of target labels, 3 by default
:param numpy.array init_embedding: initial embedding matrix of shape (vocab_size, embed_dim); None (the default) means the embeddings are randomly initialized
""" | |||
super(ESIM, self).__init__() | |||
self.vocab_size = kwargs["vocab_size"] | |||
self.embed_dim = kwargs["embed_dim"] | |||
self.hidden_size = kwargs["hidden_size"] | |||
self.batch_first = kwargs["batch_first"] | |||
self.dropout = kwargs["dropout"] | |||
self.n_labels = kwargs["num_classes"] | |||
self.gpu = kwargs["gpu"] and torch.cuda.is_available() | |||
self.vocab_size = vocab_size | |||
self.embed_dim = embed_dim | |||
self.hidden_size = hidden_size | |||
self.dropout = dropout | |||
self.n_labels = num_classes | |||
self.drop = nn.Dropout(self.dropout) | |||
self.embedding = Encoder.Embedding( | |||
self.vocab_size, self.embed_dim, dropout=self.dropout, | |||
init_emb=kwargs["init_embedding"] if "inin_embedding" in kwargs.keys() else None, | |||
init_emb=init_embedding, | |||
) | |||
self.embedding_layer = Encoder.Linear(self.embed_dim, self.hidden_size) | |||
self.encoder = Encoder.LSTM( | |||
input_size=self.embed_dim, hidden_size=self.hidden_size, num_layers=1, bias=True, | |||
batch_first=True, bidirectional=True | |||
) | |||
self.bi_attention = Aggregator.BiAttention() | |||
@@ -47,24 +54,34 @@ class ESIM(BaseModel): | |||
self.decoder = Encoder.LSTM( | |||
input_size=self.hidden_size, hidden_size=self.hidden_size, num_layers=1, bias=True, | |||
batch_first=True, bidirectional=True | |||
) | |||
self.output = Decoder.MLP([4 * self.hidden_size, self.hidden_size, self.n_labels], 'tanh', dropout=self.dropout) | |||
def forward(self, words1, words2, seq_len1=None, seq_len2=None): | |||
""" Forward function | |||
:param torch.Tensor words1: [batch size(B), premise seq len(PL)] token indices of the premise
:param torch.Tensor words2: [B, hypothesis seq len(HL)] token indices of the hypothesis
:param torch.LongTensor seq_len1: [B] lengths of the premise sequences
:param torch.LongTensor seq_len2: [B] lengths of the hypothesis sequences
:return: dict prediction: [B, n_labels(N)] classification result
""" | |||
premise0 = self.embedding_layer(self.embedding(words1)) | |||
hypothesis0 = self.embedding_layer(self.embedding(words2)) | |||
if seq_len1 is not None: | |||
seq_len1 = seq_mask(seq_len1, premise0.size(1)) | |||
else: | |||
seq_len1 = torch.ones(premise0.size(0), premise0.size(1)) | |||
seq_len1 = (seq_len1.long()).to(device=premise0.device) | |||
if seq_len2 is not None: | |||
seq_len2 = seq_mask(seq_len2, hypothesis0.size(1)) | |||
else: | |||
seq_len2 = torch.ones(hypothesis0.size(0), hypothesis0.size(1)) | |||
seq_len2 = (seq_len2.long()).to(device=hypothesis0.device) | |||
_BP, _PSL, _HP = premise0.size() | |||
_BH, _HSL, _HH = hypothesis0.size() | |||
_BPL, _PLL = seq_len1.size() | |||
@@ -109,6 +126,14 @@ class ESIM(BaseModel): | |||
return {'pred': prediction} | |||
def predict(self, words1, words2, seq_len1, seq_len2): | |||
""" Predict function | |||
:param torch.Tensor words1: [batch size(B), premise seq len(PL)] token indices of the premise
:param torch.Tensor words2: [B, hypothesis seq len(HL)] token indices of the hypothesis
:param torch.LongTensor seq_len1: [B] lengths of the premise sequences
:param torch.LongTensor seq_len2: [B] lengths of the hypothesis sequences
:return: dict prediction: [B] predicted label indices
""" | |||
prediction = self.forward(words1, words2, seq_len1, seq_len2)['pred'] | |||
return {'pred': torch.argmax(prediction, dim=-1)} | |||
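# Usage sketch (illustrative, not from the original source); all sizes below are assumptions:
# >>> model = ESIM(vocab_size=10000, embed_dim=100, hidden_size=100, dropout=0.3, num_classes=3)
# >>> words1 = torch.randint(0, 10000, (32, 20))    # premise:    [B=32, PL=20]
# >>> words2 = torch.randint(0, 10000, (32, 15))    # hypothesis: [B=32, HL=15]
# >>> seq_len1 = torch.full((32,), 20, dtype=torch.long)
# >>> seq_len2 = torch.full((32,), 15, dtype=torch.long)
# >>> model(words1, words2, seq_len1, seq_len2)["pred"].shape
# torch.Size([32, 3])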
@@ -1,10 +1,10 @@ | |||
from .pooling import MaxPool | |||
from .pooling import MaxPoolWithMask | |||
from .pooling import AvgPool | |||
from .pooling import MeanPoolWithMask | |||
from .pooling import KMaxPool | |||
from .attention import Attention | |||
from .attention import BiAttention | |||
from .attention import SelfAttention | |||
@@ -7,6 +7,8 @@ from torch import nn | |||
from fastNLP.modules.dropout import TimestepDropout | |||
from fastNLP.modules.utils import mask_softmax | |||
from fastNLP.modules.utils import initial_parameter | |||
class Attention(torch.nn.Module): | |||
def __init__(self, normalize=False): | |||
@@ -168,3 +170,60 @@ class BiAttention(nn.Module): | |||
out_x2 = torch.bmm(attention_b_t, in_x1) # [batch_size, x2_seq_len, hidden_size] | |||
return out_x1, out_x2 | |||
class SelfAttention(nn.Module): | |||
"""Self Attention Module. | |||
:param int input_size: hidden dimension of the input tensor
:param int attention_unit: hidden size of the intermediate attention layer
:param int attention_hops: number of attention hops, i.e. how many attention distributions are computed
:param float drop: dropout probability, default 0.5
:param str initial_method: parameter initialization method
""" | |||
def __init__(self, input_size, attention_unit=300, attention_hops=10, drop=0.5, initial_method=None,): | |||
super(SelfAttention, self).__init__() | |||
self.attention_hops = attention_hops | |||
self.ws1 = nn.Linear(input_size, attention_unit, bias=False) | |||
self.ws2 = nn.Linear(attention_unit, attention_hops, bias=False) | |||
self.I = torch.eye(attention_hops, requires_grad=False) | |||
self.I_origin = self.I | |||
self.drop = nn.Dropout(drop) | |||
self.tanh = nn.Tanh() | |||
initial_parameter(self, initial_method) | |||
def _penalization(self, attention): | |||
""" | |||
compute the penalization term for attention module | |||
""" | |||
baz = attention.size(0) | |||
size = self.I.size() | |||
if len(size) != 3 or size[0] != baz: | |||
self.I = self.I_origin.expand(baz, -1, -1) | |||
self.I = self.I.to(device=attention.device) | |||
attention_t = torch.transpose(attention, 1, 2).contiguous() | |||
mat = torch.bmm(attention, attention_t) - self.I[:attention.size(0)] | |||
ret = (torch.sum(torch.sum((mat ** 2), 2), 1).squeeze() + 1e-10) ** 0.5 | |||
return torch.sum(ret) / size[0] | |||
def forward(self, input, input_origin): | |||
""" | |||
:param torch.Tensor input: [baz, senLen, h_dim] the matrix to apply attention over
:param torch.Tensor input_origin: [baz, senLen] the original token indices, including padding positions
:return torch.Tensor output1: [baz, multi-head, h_dim] the input matrix after the attention operation
:return torch.Tensor output2: [1] the attention penalty term, a scalar
""" | |||
input = input.contiguous() | |||
size = input.size() # [bsz, len, nhid] | |||
input_origin = input_origin.expand(self.attention_hops, -1, -1) # [hops,baz, len] | |||
input_origin = input_origin.transpose(0, 1).contiguous() # [baz, hops,len] | |||
y1 = self.tanh(self.ws1(self.drop(input))) # [baz,len,dim] -->[bsz,len, attention-unit] | |||
attention = self.ws2(y1).transpose(1, 2).contiguous() | |||
# [bsz,len, attention-unit]--> [bsz, len, hop]--> [baz,hop,len] | |||
attention = attention + (-999999 * (input_origin == 0).float()) # remove the weight on padding token. | |||
attention = F.softmax(attention, 2) # [baz ,hop, len] | |||
return torch.bmm(attention, input), self._penalization(attention) # output1 --> [baz ,hop ,nhid] | |||
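# --- illustration (not part of the diff) ---------------------------------
# A small usage sketch of the SelfAttention module added above, assuming the
# shapes documented in its forward(): `input` is the encoded sequence
# [batch, seq_len, hidden] and `input_origin` holds the raw token ids
# [batch, seq_len] with 0 marking padding. All values below are illustrative.
import torch

attn = SelfAttention(input_size=64, attention_unit=32, attention_hops=4)
encoded = torch.randn(2, 7, 64)                   # [batch, seq_len, hidden]
token_ids = torch.tensor([[5, 8, 2, 0, 0, 0, 0],  # 0 = padding
                          [3, 9, 4, 7, 1, 6, 2]])
output, penalty = attn(encoded, token_ids)
# output:  [2, 4, 64] -> [batch, hops, hidden]
# penalty: scalar penalization term, usually added to the training loss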
@@ -1,36 +0,0 @@ | |||
# python: 3.6 | |||
# encoding: utf-8 | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
class AvgPool(nn.Module): | |||
"""1-d average pooling module.""" | |||
def __init__(self, stride=None, padding=0): | |||
super(AvgPool, self).__init__() | |||
self.stride = stride | |||
self.padding = padding | |||
def forward(self, x): | |||
# [N,C,L] -> [N,C] | |||
kernel_size = x.size(2) | |||
x = F.max_pool1d( | |||
input=x, | |||
kernel_size=kernel_size, | |||
stride=self.stride, | |||
padding=self.padding) | |||
return x.squeeze(dim=-1) | |||
class MeanPoolWithMask(nn.Module): | |||
def __init__(self): | |||
super(MeanPoolWithMask, self).__init__() | |||
self.inf = 10e12 | |||
def forward(self, tensor, mask, dim=0): | |||
masks = mask.view(mask.size(0), mask.size(1), -1).float() | |||
return torch.sum(tensor * masks, dim=dim) / torch.sum(masks, dim=1) | |||
@@ -1,20 +0,0 @@ | |||
# python: 3.6 | |||
# encoding: utf-8 | |||
import torch | |||
import torch.nn as nn | |||
# import torch.nn.functional as F | |||
class KMaxPool(nn.Module): | |||
"""K max-pooling module.""" | |||
def __init__(self, k=1): | |||
super(KMaxPool, self).__init__() | |||
self.k = k | |||
def forward(self, x): | |||
# [N,C,L] -> [N,C*k] | |||
x, index = torch.topk(x, self.k, dim=-1, sorted=False) | |||
x = torch.reshape(x, (x.size(0), -1)) | |||
return x |
@@ -1,38 +0,0 @@ | |||
# python: 3.6 | |||
# encoding: utf-8 | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
class MaxPool(nn.Module): | |||
"""1-d max-pooling module.""" | |||
def __init__(self, stride=None, padding=0, dilation=1): | |||
super(MaxPool, self).__init__() | |||
self.stride = stride | |||
self.padding = padding | |||
self.dilation = dilation | |||
def forward(self, x): | |||
x = torch.transpose(x, 1, 2) # [N,L,C] -> [N,C,L] | |||
kernel_size = x.size(2) | |||
x = F.max_pool1d( # [N,L,C] -> [N,C,1] | |||
input=x, | |||
kernel_size=kernel_size, | |||
stride=self.stride, | |||
padding=self.padding, | |||
dilation=self.dilation) | |||
return x.squeeze(dim=-1) # [N,C,1] -> [N,C] | |||
class MaxPoolWithMask(nn.Module): | |||
def __init__(self): | |||
super(MaxPoolWithMask, self).__init__() | |||
self.inf = 10e12 | |||
def forward(self, tensor, mask, dim=0): | |||
masks = mask.view(mask.size(0), mask.size(1), -1) | |||
masks = masks.expand(-1, -1, tensor.size(2)).float() | |||
return torch.max(tensor + masks.le(0.5).float() * -self.inf, dim=dim) |
@@ -0,0 +1,133 @@ | |||
# python: 3.6 | |||
# encoding: utf-8 | |||
import torch | |||
import torch.nn as nn | |||
class MaxPool(nn.Module): | |||
"""Max-pooling模块。""" | |||
def __init__( | |||
self, stride=None, padding=0, dilation=1, dimension=1, kernel_size=None, | |||
return_indices=False, ceil_mode=False | |||
): | |||
""" | |||
:param stride: stride of the pooling window, defaults to kernel_size
:param padding: amount of implicit padding, default 0
:param dilation: spacing between elements within the pooling window
:param dimension: dimensionality of the max pooling; 1, 2 and 3 dimensions are supported
:param kernel_size: size of the pooling window, defaults to the last k dimensions of the tensor, where k equals dimension
:param return_indices: whether to also return the indices of the maximal values
:param ceil_mode: whether to use ceil instead of floor when computing the output size
""" | |||
super(MaxPool, self).__init__() | |||
assert (1 <= dimension) and (dimension <= 3) | |||
self.dimension = dimension | |||
self.stride = stride | |||
self.padding = padding | |||
self.dilation = dilation | |||
self.kernel_size = kernel_size | |||
self.return_indices = return_indices | |||
self.ceil_mode = ceil_mode | |||
def forward(self, x): | |||
if self.dimension == 1: | |||
pooling = nn.MaxPool1d( | |||
stride=self.stride, padding=self.padding, dilation=self.dilation, | |||
kernel_size=self.kernel_size if self.kernel_size is not None else x.size(-1), | |||
return_indices=self.return_indices, ceil_mode=self.ceil_mode | |||
) | |||
x = torch.transpose(x, 1, 2) # [N,L,C] -> [N,C,L] | |||
elif self.dimension == 2: | |||
pooling = nn.MaxPool2d( | |||
stride=self.stride, padding=self.padding, dilation=self.dilation, | |||
kernel_size=self.kernel_size if self.kernel_size is not None else (x.size(-2), x.size(-1)), | |||
return_indices=self.return_indices, ceil_mode=self.ceil_mode | |||
) | |||
else: | |||
pooling = nn.MaxPool3d(
stride=self.stride, padding=self.padding, dilation=self.dilation, | |||
kernel_size=self.kernel_size if self.kernel_size is not None else (x.size(-3), x.size(-2), x.size(-1)), | |||
return_indices=self.return_indices, ceil_mode=self.ceil_mode | |||
) | |||
x = pooling(x) | |||
return x.squeeze(dim=-1) # [N,C,1] -> [N,C] | |||
class MaxPoolWithMask(nn.Module): | |||
"""带mask矩阵的1维max pooling""" | |||
def __init__(self): | |||
super(MaxPoolWithMask, self).__init__() | |||
self.inf = 10e12 | |||
def forward(self, tensor, mask, dim=1): | |||
""" | |||
:param torch.FloatTensor tensor: [batch_size, seq_len, channels] input tensor
:param torch.LongTensor mask: [batch_size, seq_len] 0/1 mask matrix
:param int dim: dimension along which to take the max
:return: [batch_size, channels] max-pooled result with padding positions ignored
""" | |||
masks = mask.view(mask.size(0), mask.size(1), -1) | |||
masks = masks.expand(-1, -1, tensor.size(2)).float() | |||
return torch.max(tensor + masks.le(0.5).float() * -self.inf, dim=dim)[0] | |||
class KMaxPool(nn.Module): | |||
"""K max-pooling module.""" | |||
def __init__(self, k=1): | |||
super(KMaxPool, self).__init__() | |||
self.k = k | |||
def forward(self, x): | |||
""" | |||
:param torch.Tensor x: [N, C, L] input tensor
:return: torch.Tensor x: [N, C*k] result after k-max pooling
""" | |||
x, index = torch.topk(x, self.k, dim=-1, sorted=False) | |||
x = torch.reshape(x, (x.size(0), -1)) | |||
return x | |||
class AvgPool(nn.Module): | |||
"""1-d average pooling module.""" | |||
def __init__(self, stride=None, padding=0): | |||
super(AvgPool, self).__init__() | |||
self.stride = stride | |||
self.padding = padding | |||
def forward(self, x): | |||
""" | |||
:param torch.Tensor x: [N, C, L] input tensor
:return: torch.Tensor x: [N, C] result after average pooling
""" | |||
# [N,C,L] -> [N,C] | |||
kernel_size = x.size(2) | |||
pooling = nn.AvgPool1d( | |||
kernel_size=kernel_size, | |||
stride=self.stride, | |||
padding=self.padding) | |||
x = pooling(x) | |||
return x.squeeze(dim=-1) | |||
class MeanPoolWithMask(nn.Module): | |||
def __init__(self): | |||
super(MeanPoolWithMask, self).__init__() | |||
self.inf = 10e12 | |||
def forward(self, tensor, mask, dim=1): | |||
""" | |||
:param torch.FloatTensor tensor: [batch_size, seq_len, channels] input tensor
:param torch.LongTensor mask: [batch_size, seq_len] 0/1 mask matrix
:param int dim: dimension along which to take the mean
:return: [batch_size, channels] masked mean over the real (non-padding) tokens
""" | |||
masks = mask.view(mask.size(0), mask.size(1), -1).float()
return torch.sum(tensor * masks, dim=dim) / torch.sum(masks, dim=1)
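# --- illustration (not part of the diff) ---------------------------------
# A brief usage sketch of the masked pooling modules defined above; the mask is
# the usual [batch, seq_len] 0/1 matrix in which 1 marks real tokens. Values
# below are illustrative only.
import torch

tensor = torch.randn(2, 5, 8)             # [batch, seq_len, channels]
mask = torch.tensor([[1, 1, 1, 0, 0],     # first sequence has length 3
                     [1, 1, 1, 1, 1]])    # second sequence has length 5
max_pooled = MaxPoolWithMask()(tensor, mask)    # [2, 8], padding ignored
mean_pooled = MeanPoolWithMask()(tensor, mask)  # [2, 8], mean over real tokens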
@@ -1,68 +0,0 @@ | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
from torch.autograd import Variable | |||
from fastNLP.modules.utils import initial_parameter | |||
class SelfAttention(nn.Module): | |||
"""Self Attention Module. | |||
:param int input_size: | |||
:param int attention_unit: | |||
:param int attention_hops: | |||
:param float drop: | |||
:param str initial_method: | |||
:param bool use_cuda: | |||
""" | |||
def __init__(self, input_size, attention_unit=350, attention_hops=10, drop=0.5, initial_method=None, | |||
use_cuda=False): | |||
super(SelfAttention, self).__init__() | |||
self.attention_hops = attention_hops | |||
self.ws1 = nn.Linear(input_size, attention_unit, bias=False) | |||
self.ws2 = nn.Linear(attention_unit, attention_hops, bias=False) | |||
if use_cuda: | |||
self.I = Variable(torch.eye(attention_hops).cuda(), requires_grad=False) | |||
else: | |||
self.I = Variable(torch.eye(attention_hops), requires_grad=False) | |||
self.I_origin = self.I | |||
self.drop = nn.Dropout(drop) | |||
self.tanh = nn.Tanh() | |||
initial_parameter(self, initial_method) | |||
def penalization(self, attention): | |||
""" | |||
compute the penalization term for attention module | |||
""" | |||
baz = attention.size(0) | |||
size = self.I.size() | |||
if len(size) != 3 or size[0] != baz: | |||
self.I = self.I_origin.expand(baz, -1, -1) | |||
attentionT = torch.transpose(attention, 1, 2).contiguous() | |||
mat = torch.bmm(attention, attentionT) - self.I[:attention.size(0)] | |||
ret = (torch.sum(torch.sum((mat ** 2), 2), 1).squeeze() + 1e-10) ** 0.5 | |||
return torch.sum(ret) / size[0] | |||
def forward(self, input, input_origin): | |||
""" | |||
:param input: the matrix to do attention. [baz, senLen, h_dim] | |||
:param inp: then token index include pad token( 0 ) [baz , senLen] | |||
:return output1: the input matrix after attention operation [baz, multi-head , h_dim] | |||
:return output2: the attention penalty term, a scalar [1] | |||
""" | |||
input = input.contiguous() | |||
size = input.size() # [bsz, len, nhid] | |||
input_origin = input_origin.expand(self.attention_hops, -1, -1) # [hops,baz, len] | |||
input_origin = input_origin.transpose(0, 1).contiguous() # [baz, hops,len] | |||
y1 = self.tanh(self.ws1(self.drop(input))) # [baz,len,dim] -->[bsz,len, attention-unit] | |||
attention = self.ws2(y1).transpose(1, 2).contiguous() | |||
# [bsz,len, attention-unit]--> [bsz, len, hop]--> [baz,hop,len] | |||
attention = attention + (-999999 * (input_origin == 0).float()) # remove the weight on padding token. | |||
attention = F.softmax(attention, 2) # [baz ,hop, len] | |||
return torch.bmm(attention, input), self.penalization(attention) # output1 --> [baz ,hop ,nhid] |
@@ -7,17 +7,33 @@ from fastNLP.modules.utils import initial_parameter | |||
class MLP(nn.Module): | |||
"""Multilayer Perceptrons as a decoder | |||
:param list size_layer: list of int, define the size of MLP layers. The number of layers is len(size_layer) - 1.
:param str or list activation: str or function or a list, the activation function for hidden layers. | |||
:param str or function output_activation : str or function, the activation function for output layer | |||
:param str initial_method: the name of initialization method. | |||
:param float dropout: the probability of dropout. | |||
:param list size_layer: a list of ints defining the MLP layers; each entry is the hidden size of that layer. The number of layers is len(size_layer) - 1.
:param str or list activation:
    a string, a function, or a list of strings/functions defining the activation of each hidden layer; supported strings are relu, tanh and sigmoid. Default: relu.
:param str or function output_activation: a string or function defining the activation of the output layer; default None, meaning the output layer has no activation.
:param str initial_method: parameter initialization method
:param float dropout: dropout probability, default 0
.. note::
    The activation functions of the hidden layers are defined by activation; a single str/function or a list of str/functions can be passed.
    If a single str/function is given, every hidden layer uses that activation;
    if a list is given, each hidden layer uses the corresponding element, and the list length must equal the number of hidden layers.
    The activation of the output layer is defined by output_activation; it defaults to None, in which case the output layer has no activation.
Examples:: | |||
>>> net1 = MLP([5, 10, 5]) | |||
>>> net2 = MLP([5, 10, 5], 'tanh') | |||
>>> net3 = MLP([5, 6, 7, 8, 5], 'tanh') | |||
>>> net4 = MLP([5, 6, 7, 8, 5], 'relu', output_activation='tanh') | |||
>>> net5 = MLP([5, 6, 7, 8, 5], ['tanh', 'relu', 'tanh'], 'tanh') | |||
>>> for net in [net1, net2, net3, net4, net5]:
...     x = torch.randn(5, 5)
...     y = net(x)
...     print(x)
...     print(y)
""" | |||
def __init__(self, size_layer, activation='relu', output_activation=None, initial_method=None, dropout=0.0): | |||
@@ -63,6 +79,10 @@ class MLP(nn.Module): | |||
initial_parameter(self, initial_method) | |||
def forward(self, x): | |||
""" | |||
:param torch.Tensor x: the input to the MLP
:return: torch.Tensor : the output of the MLP
""" | |||
for layer, func in zip(self.hiddens, self.hidden_active): | |||
x = self.dropout(func(layer(x))) | |||
x = self.output(x) | |||
@@ -3,9 +3,11 @@ from .conv_maxpool import ConvMaxpool | |||
from .embedding import Embedding | |||
from .linear import Linear | |||
from .lstm import LSTM | |||
from .bert import BertModel | |||
__all__ = ["LSTM", | |||
"Embedding", | |||
"Linear", | |||
"Conv", | |||
"ConvMaxpool"] | |||
"ConvMaxpool", | |||
"BertModel"] |
@@ -0,0 +1,362 @@ | |||
""" | |||
bert.py is modified from huggingface/pytorch-pretrained-BERT, which is licensed under the Apache License 2.0. | |||
""" | |||
import copy | |||
import json | |||
import math | |||
import os | |||
import torch | |||
from torch import nn | |||
CONFIG_FILE = 'bert_config.json' | |||
MODEL_WEIGHTS = 'pytorch_model.bin' | |||
def gelu(x): | |||
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) | |||
def swish(x): | |||
return x * torch.sigmoid(x) | |||
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} | |||
class BertLayerNorm(nn.Module): | |||
def __init__(self, hidden_size, eps=1e-12): | |||
super(BertLayerNorm, self).__init__() | |||
self.weight = nn.Parameter(torch.ones(hidden_size)) | |||
self.bias = nn.Parameter(torch.zeros(hidden_size)) | |||
self.variance_epsilon = eps | |||
def forward(self, x): | |||
u = x.mean(-1, keepdim=True) | |||
s = (x - u).pow(2).mean(-1, keepdim=True) | |||
x = (x - u) / torch.sqrt(s + self.variance_epsilon) | |||
return self.weight * x + self.bias | |||
class BertEmbeddings(nn.Module): | |||
def __init__(self, vocab_size, hidden_size, max_position_embeddings, type_vocab_size, hidden_dropout_prob): | |||
super(BertEmbeddings, self).__init__() | |||
self.word_embeddings = nn.Embedding(vocab_size, hidden_size) | |||
self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size) | |||
self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size) | |||
# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load | |||
# any TensorFlow checkpoint file | |||
self.LayerNorm = BertLayerNorm(hidden_size, eps=1e-12) | |||
self.dropout = nn.Dropout(hidden_dropout_prob) | |||
def forward(self, input_ids, token_type_ids=None): | |||
seq_length = input_ids.size(1) | |||
position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) | |||
position_ids = position_ids.unsqueeze(0).expand_as(input_ids) | |||
if token_type_ids is None: | |||
token_type_ids = torch.zeros_like(input_ids) | |||
words_embeddings = self.word_embeddings(input_ids) | |||
position_embeddings = self.position_embeddings(position_ids) | |||
token_type_embeddings = self.token_type_embeddings(token_type_ids) | |||
embeddings = words_embeddings + position_embeddings + token_type_embeddings | |||
embeddings = self.LayerNorm(embeddings) | |||
embeddings = self.dropout(embeddings) | |||
return embeddings | |||
class BertSelfAttention(nn.Module): | |||
def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob): | |||
super(BertSelfAttention, self).__init__() | |||
if hidden_size % num_attention_heads != 0: | |||
raise ValueError( | |||
"The hidden size (%d) is not a multiple of the number of attention " | |||
"heads (%d)" % (hidden_size, num_attention_heads)) | |||
self.num_attention_heads = num_attention_heads | |||
self.attention_head_size = int(hidden_size / num_attention_heads) | |||
self.all_head_size = self.num_attention_heads * self.attention_head_size | |||
self.query = nn.Linear(hidden_size, self.all_head_size) | |||
self.key = nn.Linear(hidden_size, self.all_head_size) | |||
self.value = nn.Linear(hidden_size, self.all_head_size) | |||
self.dropout = nn.Dropout(attention_probs_dropout_prob) | |||
def transpose_for_scores(self, x): | |||
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) | |||
x = x.view(*new_x_shape) | |||
return x.permute(0, 2, 1, 3) | |||
def forward(self, hidden_states, attention_mask): | |||
mixed_query_layer = self.query(hidden_states) | |||
mixed_key_layer = self.key(hidden_states) | |||
mixed_value_layer = self.value(hidden_states) | |||
query_layer = self.transpose_for_scores(mixed_query_layer) | |||
key_layer = self.transpose_for_scores(mixed_key_layer) | |||
value_layer = self.transpose_for_scores(mixed_value_layer) | |||
# Take the dot product between "query" and "key" to get the raw attention scores. | |||
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) | |||
attention_scores = attention_scores / math.sqrt(self.attention_head_size) | |||
# Apply the attention mask is (precomputed for all layers in BertModel forward() function) | |||
attention_scores = attention_scores + attention_mask | |||
# Normalize the attention scores to probabilities. | |||
attention_probs = nn.Softmax(dim=-1)(attention_scores) | |||
# This is actually dropping out entire tokens to attend to, which might | |||
# seem a bit unusual, but is taken from the original Transformer paper. | |||
attention_probs = self.dropout(attention_probs) | |||
context_layer = torch.matmul(attention_probs, value_layer) | |||
context_layer = context_layer.permute(0, 2, 1, 3).contiguous() | |||
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) | |||
context_layer = context_layer.view(*new_context_layer_shape) | |||
return context_layer | |||
class BertSelfOutput(nn.Module): | |||
def __init__(self, hidden_size, hidden_dropout_prob): | |||
super(BertSelfOutput, self).__init__() | |||
self.dense = nn.Linear(hidden_size, hidden_size) | |||
self.LayerNorm = BertLayerNorm(hidden_size, eps=1e-12) | |||
self.dropout = nn.Dropout(hidden_dropout_prob) | |||
def forward(self, hidden_states, input_tensor): | |||
hidden_states = self.dense(hidden_states) | |||
hidden_states = self.dropout(hidden_states) | |||
hidden_states = self.LayerNorm(hidden_states + input_tensor) | |||
return hidden_states | |||
class BertAttention(nn.Module): | |||
def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob, hidden_dropout_prob): | |||
super(BertAttention, self).__init__() | |||
self.self = BertSelfAttention(hidden_size, num_attention_heads, attention_probs_dropout_prob) | |||
self.output = BertSelfOutput(hidden_size, hidden_dropout_prob) | |||
def forward(self, input_tensor, attention_mask): | |||
self_output = self.self(input_tensor, attention_mask) | |||
attention_output = self.output(self_output, input_tensor) | |||
return attention_output | |||
class BertIntermediate(nn.Module): | |||
def __init__(self, hidden_size, intermediate_size, hidden_act): | |||
super(BertIntermediate, self).__init__() | |||
self.dense = nn.Linear(hidden_size, intermediate_size) | |||
self.intermediate_act_fn = ACT2FN[hidden_act] \ | |||
if isinstance(hidden_act, str) else hidden_act | |||
def forward(self, hidden_states): | |||
hidden_states = self.dense(hidden_states) | |||
hidden_states = self.intermediate_act_fn(hidden_states) | |||
return hidden_states | |||
class BertOutput(nn.Module): | |||
def __init__(self, hidden_size, intermediate_size, hidden_dropout_prob): | |||
super(BertOutput, self).__init__() | |||
self.dense = nn.Linear(intermediate_size, hidden_size) | |||
self.LayerNorm = BertLayerNorm(hidden_size, eps=1e-12) | |||
self.dropout = nn.Dropout(hidden_dropout_prob) | |||
def forward(self, hidden_states, input_tensor): | |||
hidden_states = self.dense(hidden_states) | |||
hidden_states = self.dropout(hidden_states) | |||
hidden_states = self.LayerNorm(hidden_states + input_tensor) | |||
return hidden_states | |||
class BertLayer(nn.Module): | |||
def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob, hidden_dropout_prob, | |||
intermediate_size, hidden_act): | |||
super(BertLayer, self).__init__() | |||
self.attention = BertAttention(hidden_size, num_attention_heads, attention_probs_dropout_prob, | |||
hidden_dropout_prob) | |||
self.intermediate = BertIntermediate(hidden_size, intermediate_size, hidden_act) | |||
self.output = BertOutput(hidden_size, intermediate_size, hidden_dropout_prob) | |||
def forward(self, hidden_states, attention_mask): | |||
attention_output = self.attention(hidden_states, attention_mask) | |||
intermediate_output = self.intermediate(attention_output) | |||
layer_output = self.output(intermediate_output, attention_output) | |||
return layer_output | |||
class BertEncoder(nn.Module): | |||
def __init__(self, num_hidden_layers, hidden_size, num_attention_heads, attention_probs_dropout_prob, | |||
hidden_dropout_prob, | |||
intermediate_size, hidden_act): | |||
super(BertEncoder, self).__init__() | |||
layer = BertLayer(hidden_size, num_attention_heads, attention_probs_dropout_prob, hidden_dropout_prob, | |||
intermediate_size, hidden_act) | |||
self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(num_hidden_layers)]) | |||
def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True): | |||
all_encoder_layers = [] | |||
for layer_module in self.layer: | |||
hidden_states = layer_module(hidden_states, attention_mask) | |||
if output_all_encoded_layers: | |||
all_encoder_layers.append(hidden_states) | |||
if not output_all_encoded_layers: | |||
all_encoder_layers.append(hidden_states) | |||
return all_encoder_layers | |||
class BertPooler(nn.Module): | |||
def __init__(self, hidden_size): | |||
super(BertPooler, self).__init__() | |||
self.dense = nn.Linear(hidden_size, hidden_size) | |||
self.activation = nn.Tanh() | |||
def forward(self, hidden_states): | |||
# We "pool" the model by simply taking the hidden state corresponding | |||
# to the first token. | |||
first_token_tensor = hidden_states[:, 0] | |||
pooled_output = self.dense(first_token_tensor) | |||
pooled_output = self.activation(pooled_output) | |||
return pooled_output | |||
class BertModel(nn.Module): | |||
"""Bidirectional Embedding Representations from Transformers. | |||
If you want to use pre-trained weights, please download from the following sources provided by pytorch-pretrained-BERT. | |||
sources:: | |||
'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz", | |||
'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz", | |||
'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz", | |||
'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz", | |||
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz", | |||
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz", | |||
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz", | |||
Construct a BERT model with pre-trained weights:: | |||
model = BertModel.from_pretrained("path/to/weights/directory") | |||
""" | |||
def __init__(self, vocab_size, | |||
hidden_size=768, | |||
num_hidden_layers=12, | |||
num_attention_heads=12, | |||
intermediate_size=3072, | |||
hidden_act="gelu", | |||
hidden_dropout_prob=0.1, | |||
attention_probs_dropout_prob=0.1, | |||
max_position_embeddings=512, | |||
type_vocab_size=2, | |||
initializer_range=0.02, **kwargs): | |||
super(BertModel, self).__init__() | |||
self.embeddings = BertEmbeddings(vocab_size, hidden_size, max_position_embeddings, | |||
type_vocab_size, hidden_dropout_prob) | |||
self.encoder = BertEncoder(num_hidden_layers, hidden_size, num_attention_heads, | |||
attention_probs_dropout_prob, hidden_dropout_prob, intermediate_size, | |||
hidden_act) | |||
self.pooler = BertPooler(hidden_size) | |||
self.initializer_range = initializer_range | |||
self.apply(self.init_bert_weights) | |||
def init_bert_weights(self, module): | |||
if isinstance(module, (nn.Linear, nn.Embedding)): | |||
# Slightly different from the TF version which uses truncated_normal for initialization | |||
# cf https://github.com/pytorch/pytorch/pull/5617 | |||
module.weight.data.normal_(mean=0.0, std=self.initializer_range) | |||
elif isinstance(module, BertLayerNorm): | |||
module.bias.data.zero_() | |||
module.weight.data.fill_(1.0) | |||
if isinstance(module, nn.Linear) and module.bias is not None: | |||
module.bias.data.zero_() | |||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True): | |||
if attention_mask is None: | |||
attention_mask = torch.ones_like(input_ids) | |||
if token_type_ids is None: | |||
token_type_ids = torch.zeros_like(input_ids) | |||
# We create a 3D attention mask from a 2D tensor mask. | |||
# Sizes are [batch_size, 1, 1, to_seq_length] | |||
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] | |||
# this attention mask is more simple than the triangular masking of causal attention | |||
# used in OpenAI GPT, we just need to prepare the broadcast dimension here. | |||
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) | |||
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for | |||
# masked positions, this operation will create a tensor which is 0.0 for | |||
# positions we want to attend and -10000.0 for masked positions. | |||
# Since we are adding it to the raw scores before the softmax, this is | |||
# effectively the same as removing these entirely. | |||
extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility | |||
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 | |||
embedding_output = self.embeddings(input_ids, token_type_ids) | |||
encoded_layers = self.encoder(embedding_output, | |||
extended_attention_mask, | |||
output_all_encoded_layers=output_all_encoded_layers) | |||
sequence_output = encoded_layers[-1] | |||
pooled_output = self.pooler(sequence_output) | |||
if not output_all_encoded_layers: | |||
encoded_layers = encoded_layers[-1] | |||
return encoded_layers, pooled_output | |||
@classmethod | |||
def from_pretrained(cls, pretrained_model_dir, state_dict=None, *inputs, **kwargs): | |||
# Load config | |||
config_file = os.path.join(pretrained_model_dir, CONFIG_FILE) | |||
config = json.load(open(config_file, "r")) | |||
# config = BertConfig.from_json_file(config_file) | |||
# logger.info("Model config {}".format(config)) | |||
# Instantiate model. | |||
model = cls(*inputs, **config, **kwargs) | |||
if state_dict is None: | |||
weights_path = os.path.join(pretrained_model_dir, MODEL_WEIGHTS) | |||
state_dict = torch.load(weights_path) | |||
old_keys = [] | |||
new_keys = [] | |||
for key in state_dict.keys(): | |||
new_key = None | |||
if 'gamma' in key: | |||
new_key = key.replace('gamma', 'weight') | |||
if 'beta' in key: | |||
new_key = key.replace('beta', 'bias') | |||
if new_key: | |||
old_keys.append(key) | |||
new_keys.append(new_key) | |||
for old_key, new_key in zip(old_keys, new_keys): | |||
state_dict[new_key] = state_dict.pop(old_key) | |||
missing_keys = [] | |||
unexpected_keys = [] | |||
error_msgs = [] | |||
# copy state_dict so _load_from_state_dict can modify it | |||
metadata = getattr(state_dict, '_metadata', None) | |||
state_dict = state_dict.copy() | |||
if metadata is not None: | |||
state_dict._metadata = metadata | |||
def load(module, prefix=''): | |||
local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) | |||
module._load_from_state_dict( | |||
state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) | |||
for name, child in module._modules.items(): | |||
if child is not None: | |||
load(child, prefix + name + '.') | |||
load(model, prefix='' if hasattr(model, 'bert') else 'bert.') | |||
if len(missing_keys) > 0: | |||
print("Weights of {} not initialized from pretrained model: {}".format( | |||
model.__class__.__name__, missing_keys)) | |||
if len(unexpected_keys) > 0: | |||
print("Weights from pretrained model not used in {}: {}".format( | |||
model.__class__.__name__, unexpected_keys)) | |||
return model |
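# --- illustration (not part of the diff) ---------------------------------
# A minimal usage sketch of the BertModel defined above. The directory path is
# a placeholder; from_pretrained() expects it to contain the CONFIG_FILE
# ('bert_config.json') and MODEL_WEIGHTS ('pytorch_model.bin') defined above.
import torch

model = BertModel.from_pretrained("path/to/weights/directory")
input_ids = torch.tensor([[31, 51, 99], [15, 5, 0]])   # [batch, seq_len], made-up ids
attention_mask = torch.tensor([[1, 1, 1], [1, 1, 0]])  # 1 = real token, 0 = padding
encoded_layers, pooled = model(input_ids,
                               attention_mask=attention_mask,
                               output_all_encoded_layers=False)
# encoded_layers: [2, 3, hidden_size] -- last encoder layer only
# pooled:         [2, hidden_size]    -- tanh-pooled first-token representation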