- import hetu as ht
- import numpy as np
-
- '''
- Bert Module Architecture & Input/Output Tensor Size
-
- BertModel Inputs:
- input_ids: [batch_size, seq_len], word token indices in the vocabulary
-
- BertModel Outputs:
- sequence_output: [batch_size, seq_len, hidden_size] (from BertEncoder)
- pooled_output: [batch_size, hidden_size] (from BertPooler)
-
- BertModel:
- --[batch_size, seq_len]--
- BertEmbeddings:
- Embedding(word/position/token_type)
- LayerNorm
- Dropout
- --[batch_size, seq_len, hidden_size]--
-
- --[batch_size, seq_len, hidden_size]--
- BertEncoder:
- BertLayer(num_hidden_layers):
- BertAttention:
- BertSelfAttention
- --[batch_size, seq_len, hidden_size]--
- BertSelfOutput:
- Linear
- Dropout
- Add & LayerNorm
-
- --[batch_size, seq_len, hidden_size]--
- BertIntermediate:
- Linear + Act(gelu)
- --[batch_size, seq_len, intermediate_size]--
- BertOutput:
- Linear
- Dropout
- Add & LayerNorm
- --[batch_size, seq_len, hidden_size]--
-
- --[batch_size, seq_len, hidden_size]--
- BertPooler:
- (Slice, select [cls])
- --[batch_size, hidden_size]--
- Linear + Act(Tanh)
- --[batch_size, hidden_size]--
-
- Bert
- '''
-
-
- '''
- BertEmbeddings:
- --------------------------------------------------------------------------------------------------'''
- class BertEmbeddings(object):
- """Construct the embeddings from word, position and token_type embeddings.
- """
- def __init__(self, config):
- self.seq_len = config.max_position_embeddings
- self.batch_size = config.batch_size
-
- self.word_embeddings = Embedding(config.vocab_size, config.hidden_size, "word_embeddings")
- self.position_embeddings = Embedding(config.max_position_embeddings, config.hidden_size, 'position_embeddings')
- self.token_type_embeddings = Embedding(config.type_vocab_size, config.hidden_size, 'token_type_embeddings')
-
- self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
- self.dropout = Dropout(config.hidden_dropout_prob)
-
- def __call__(self, input_ids, token_type_ids):
- '''
- inputs:
- input_ids: [batch_size, seq_len]
- token_type_ids: [batch_size, seq_len]
-
- outputs:
- embeddings: [batch_size, seq_len, hidden_size]
- '''
- seq_length = self.seq_len
- batch_size = self.batch_size
- position_ids = ht.Variable('position_ids', value=np.arange(seq_length).reshape((1, -1)).repeat(batch_size, axis=0), dtype=np.int64, trainable=False, ctx=input_ids.ctx)
-
-
- '''Embedding Size
- inputs_id:[batch_size, seq_len], embedding_table:[vocab_size, hidden_size]
- position_ids:[batch_size, seq_len], embedding_table:[seq_len, hidden_size]
- token_type_ids:[batch_size, seq_len], embedding_table:[type_vocab_size, hidden_size]
- --> embeddings: [batch_size, seq_len, hidden_size]
- '''
- words_embeddings = self.word_embeddings(input_ids)
- position_embeddings = self.position_embeddings(position_ids)
- token_type_embeddings = self.token_type_embeddings(token_type_ids)
-
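- # sum the word, position and token-type embeddings element-wise, then apply LayerNorm and dropout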
- embeddings = words_embeddings + position_embeddings + token_type_embeddings
- embeddings = self.LayerNorm(embeddings)
- embeddings = self.dropout(embeddings)
- return embeddings
- '''-----------------------------------------------------------------------------------------------'''
-
-
- '''
- BertEncoder & BertLayer:
- --------------------------------------------------------------------------------------------------'''
- class BertEncoder(object):
- def __init__(self, config):
- self.output_hidden_states = config.output_hidden_states
- self.layer = [BertLayer(config) for _ in range(config.num_hidden_layers)]
-
- def __call__(self, hidden_states, attention_mask=None):
- '''
- inputs:
- hidden_states: [batch_size, seq_len, hidden_size]
- attention_mask: [batch_size, 1, 1, seq_len], additive mask broadcast inside self-attention
- outputs:
- hidden_states: [batch_size, seq_len, hidden_size] (only the last layer's hidden states are returned)
- '''
-
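- # run the BertLayer stack sequentially; each layer consumes the previous layer's output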
- for i, layer_module in enumerate(self.layer):
- hidden_states = layer_module(hidden_states, attention_mask)
- return hidden_states # last-layer hidden state
-
- class BertLayer(object):
- def __init__(self, config):
- self.attention = BertAttention(config)
- self.intermediate = BertIntermediate(config)
- self.output = BertOutput(config)
-
- def __call__(self, hidden_states, attention_mask):
- '''
- inputs:
- hidden_states: [batch_size, seq_len, hidden_size]
- attention_mask: [batch_size, 1, 1, seq_len]
- outputs:
- layer_output: [batch_size, seq_len, hidden_size]
- '''
- attention_output = self.attention(hidden_states, attention_mask)
- intermediate_output = self.intermediate(attention_output)
- layer_output = self.output(intermediate_output, attention_output)
- return layer_output
- '''-----------------------------------------------------------------------------------------------'''
-
-
- '''
- BertAttention & BertSelfAttention & BertSelfOutput
- --------------------------------------------------------------------------------------------------'''
- class BertAttention(object):
- def __init__(self, config):
- self.self = BertSelfAttention(config)
- self.output = BertSelfOutput(config)
-
- def __call__(self, input_tensor, attention_mask):
- '''
- inputs:
- input_tensor: [batch_size, seq_len, hidden_size]
- attention_mask: [batch_size, 1, 1, seq_len]
- outputs:
- attention_output: [batch_size, seq_len, hidden_size]
- '''
- self_output = self.self(input_tensor, attention_mask)
- attention_output = self.output(self_output, input_tensor)
- return attention_output
-
- class BertSelfAttention(object):
- def __init__(self, config):
- if config.hidden_size % config.num_attention_heads != 0:
- raise ValueError(
- "The hidden size (%d) is not a multiple of the number of attention "
- "heads (%d)" % (config.hidden_size, config.num_attention_heads))
- self.num_attention_heads = config.num_attention_heads
- self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
- self.all_head_size = self.num_attention_heads * self.attention_head_size #all_head_size == hidden_size
- self.hidden_size = config.hidden_size
- self.seq_len = config.max_position_embeddings
- self.batch_size = config.batch_size
-
- linear_input_shape = [self.batch_size, self.seq_len, self.hidden_size]
- self.query = Linear(config.hidden_size, self.all_head_size, input_shape=linear_input_shape)
- self.key = Linear(config.hidden_size, self.all_head_size, input_shape=linear_input_shape)
- self.value = Linear(config.hidden_size, self.all_head_size, input_shape=linear_input_shape)
-
- self.dropout = Dropout(config.attention_probs_dropout_prob)
-
- def transpose_for_scores(self, input_tensor):
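- # [batch_size, seq_len, hidden_size] -> [batch_size, seq_len, num_heads, head_size] -> [batch_size, num_heads, seq_len, head_size]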
- output_tensor = ht.array_reshape_op(
- input_tensor, [self.batch_size, self.seq_len, self.num_attention_heads, self.attention_head_size])
- output_tensor = ht.transpose_op(output_tensor, [0, 2, 1, 3])
- return output_tensor
-
- def __call__(self, hidden_states, attention_mask):
- '''
- inputs:
- hidden_states: [batch_size, seq_len, hidden_size]
- attention_mask: [batch_size, 1, 1, seq_len]
- outputs:
- context_layer: [batch_size, seq_len, hidden_size]
- '''
-
- # linear transformation
- mixed_query_layer = self.query(hidden_states) # [batch_size, seq_len, hidden_size]
- mixed_key_layer = self.key(hidden_states) # [batch_size, seq_len, hidden_size]
- mixed_value_layer = self.value(hidden_states) # [batch_size, seq_len, hidden_size]
-
- # transpose
- query_layer = self.transpose_for_scores(mixed_query_layer) # [batch_size, num_heads, seq_len, head_size]
- key_layer = self.transpose_for_scores(mixed_key_layer) # [batch_size, num_heads, seq_len, head_size]
- value_layer = self.transpose_for_scores(mixed_value_layer) # [batch_size, num_heads, seq_len, head_size]
-
- # attention scores: the keys are pre-scaled by 1/sqrt(head_size), so Q·K^T below yields the scaled dot-product scores
- key_layer_scaled = key_layer * (1.0 / np.sqrt(float(self.attention_head_size)))
- attention_scores = ht.batch_matmul_op(query_layer, key_layer_scaled, trans_B=True) # [batch_size, num_heads, seq_len, seq_len]
-
- # Apply the additive attention mask (precomputed once in BertModel.__call__ and shared by all layers)
- attention_scores = attention_scores + ht.broadcastto_op(attention_mask, attention_scores) # [batch_size, num_heads, seq_len, seq_len]
-
- # Normalize the attention scores to probabilities.
- attention_probs = ht.softmax_op(attention_scores)
-
- # This is actually dropping out entire tokens to attend to, which might
- # seem a bit unusual, but is taken from the original Transformer paper.
- attention_probs = self.dropout(attention_probs)
-
- context_layer = ht.batch_matmul_op(attention_probs, value_layer) # [batch_size, num_heads, seq_len, head_size]
- context_layer = ht.transpose_op(context_layer, [0, 2, 1, 3]) # [batch_size, seq_len, num_heads, head_size]
- context_layer = ht.array_reshape_op(context_layer, [-1, self.seq_len, self.all_head_size]) # [batch_size, seq_len, hidden_size]
- return context_layer
-
- class BertSelfOutput(object):
- def __init__(self, config):
- linear_input_shape = [config.batch_size, config.max_position_embeddings, config.hidden_size]
- self.dense = Linear(config.hidden_size, config.hidden_size, input_shape=linear_input_shape)
- self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
- self.dropout = Dropout(config.hidden_dropout_prob)
-
- def __call__(self, hidden_states, input_tensor):
- '''
- inputs:
- hidden_states: [batch_size, seq_len, hidden_size]
- input_tensor: [batch_size, seq_len, hidden_size]
- outputs:
- hidden_states: [batch_size, seq_len, hidden_size]
- '''
- hidden_states = self.dense(hidden_states)
- hidden_states = self.dropout(hidden_states)
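- # residual connection around the self-attention block, followed by layer normalization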
- hidden_states = self.LayerNorm(hidden_states + input_tensor)
- return hidden_states
- '''-----------------------------------------------------------------------------------------------'''
-
-
- '''
- BertIntermediate & BertOutput (2-layer FeedForward)
- --------------------------------------------------------------------------------------------------'''
- class BertIntermediate(object):
- def __init__(self, config):
- if config.hidden_act == "relu":
- self.intermediate_act_fn = ht.relu_op
- elif config.hidden_act == "gelu":
- self.intermediate_act_fn = ht.gelu_op
- print("Gelu activation is not implemented yet.")
- assert(False)
- linear_input_shape = [config.batch_size, config.max_position_embeddings, config.hidden_size]
- self.dense = Linear(config.hidden_size, config.intermediate_size, activation = self.intermediate_act_fn, input_shape=linear_input_shape)
-
- def __call__(self, hidden_states):
- '''
- inputs:
- hidden_states: [batch_size, seq_len, hidden_size]
- outputs:
- hidden_states: [batch_size, seq_len, intermediate_size]
- '''
- hidden_states = self.dense(hidden_states)
- return hidden_states
-
- class BertOutput(object):
- def __init__(self, config):
- linear_input_shape = [config.batch_size, config.max_position_embeddings, config.intermediate_size]
- self.dense = Linear(config.intermediate_size, config.hidden_size, input_shape=linear_input_shape)
- self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
- self.dropout = Dropout(config.hidden_dropout_prob)
-
- def __call__(self, hidden_states, input_tensor):
- '''
- inputs:
- hidden_states: [batch_size, seq_len, intermediate_size]
- input_tensor: [batch_size, seq_len, hidden_size] (residual connection from the attention output)
- outputs:
- hidden_states: [batch_size, seq_len, hidden_size]
- '''
- hidden_states = self.dense(hidden_states)
- hidden_states = self.dropout(hidden_states)
- hidden_states = self.LayerNorm(hidden_states + input_tensor)
- return hidden_states
- '''-----------------------------------------------------------------------------------------------'''
-
-
- '''
- BertPooler
- --------------------------------------------------------------------------------------------------'''
- class BertPooler(object):
- def __init__(self, config):
- self.dense = Linear(config.hidden_size, config.hidden_size, activation = ht.tanh_op)
- self.batch_size = config.batch_size
- self.hidden_size = config.hidden_size
- def __call__(self, hidden_states):
- '''
- inputs:
- hidden_states: [batch_size, seq_len, hidden_size]
- outputs:
- pooled_output: [batch_size, hidden_size]
- '''
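- # slice out the hidden state of the first ([CLS]) token: [batch_size, 1, hidden_size] -> [batch_size, hidden_size]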
- first_token_tensor = ht.slice_op(hidden_states,(0,0,0),(self.batch_size,1,self.hidden_size))
- first_token_tensor = ht.array_reshape_op(first_token_tensor, [self.batch_size, self.hidden_size])
- pooled_output = self.dense(first_token_tensor)
- return pooled_output
- '''-----------------------------------------------------------------------------------------------'''
-
- '''
- Bert Downstream Heads
- --------------------------------------------------------------------------------------------------'''
- class BertPredictionHeadTransform(object):
- def __init__(self, config):
- if config.hidden_act == "relu":
- self.hidden_act = ht.relu_op
- elif config.hidden_act == "gelu":
- self.hidden_act = ht.gelu_op
- print("Gelu activation is not implemented yet.")
- assert(False)
- linear_input_shape = [config.batch_size, config.max_position_embeddings, config.hidden_size]
- self.dense_act = Linear(config.hidden_size, config.hidden_size, activation=self.hidden_act, input_shape=linear_input_shape)
- self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
-
- def __call__(self, hidden_states):
- '''
- inputs:
- hidden_states: [batch_size, seq_len, hidden_size]
- outputs:
- hidden_states: [batch_size, seq_len, hidden_size]
- '''
- hidden_states = self.dense_act(hidden_states)
- hidden_states = self.LayerNorm(hidden_states)
- return hidden_states
-
- class BertLMPredictionHead(object):
- def __init__(self, config, bert_model_embedding_weights):
- '''
- bert_model_embedding_weights: [vocab_size, hidden_size]
- '''
- self.transform = BertPredictionHeadTransform(config)
-
- linear_input_shape = [config.batch_size, config.max_position_embeddings, config.hidden_size]
- self.decoder = Linear(config.hidden_size, config.vocab_size, bias_initializer=ht.init.zeros,input_shape=linear_input_shape)
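- # tie the decoder weights to the transposed word-embedding matrix so the LM head reuses the input embeddings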
- self.decoder.weights = ht.transpose_op(bert_model_embedding_weights)
-
- def __call__(self, hidden_states):
- '''
- inputs:
- hidden_states: [batch_size, seq_len, hidden_size]
- outputs:
- hidden_states: [batch_size, seq_len, vocab_size]
- '''
- hidden_states = self.transform(hidden_states)
- hidden_states = self.decoder(hidden_states)
- return hidden_states
-
-
- class BertOnlyMLMHead(object):
- def __init__(self, config, bert_model_embedding_weights):
- self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
-
- def __call__(self, sequence_output):
- '''
- inputs:
- sequence_output: [batch_size, seq_len, hidden_size]
- outputs:
- prediction_scores: [batch_size, seq_len, vocab_size]
- '''
- prediction_scores = self.predictions(sequence_output)
- return prediction_scores
-
-
- class BertOnlyNSPHead(object):
- def __init__(self, config):
- self.seq_relationship = Linear(config.hidden_size, 2)
-
- def __call__(self, pooled_output):
- '''
- inputs:
- pooled_output: [batch_size, hidden_size]
- outputs:
- seq_relationship_score: [batch_size, 2]
- '''
- seq_relationship_score = self.seq_relationship(pooled_output)
- return seq_relationship_score
-
-
- class BertPreTrainingHeads(object):
- def __init__(self, config, bert_model_embedding_weights):
- self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
- self.seq_relationship = Linear(config.hidden_size, 2)
-
- def __call__(self, sequence_output, pooled_output):
- '''
- inputs:
- sequence_output: [batch_size, seq_len, hidden_size]
- pooled_output: [batch_size, hidden_size]
- outputs:
- prediction_scores: [batch_size, seq_len, vocab_size]
- seq_relationship_score: [batch_size, 2]
- '''
- prediction_scores = self.predictions(sequence_output)
- seq_relationship_score = self.seq_relationship(pooled_output)
- return prediction_scores, seq_relationship_score
-
- '''-----------------------------------------------------------------------------------------------'''
-
-
- '''
- BertModel:
- --------------------------------------------------------------------------------------------------'''
- class BertModel(object):
- """BERT model ("Bidirectional Embedding Representations from a Transformer").
-
- Params:
- config: a BertConfig class instance with the configuration to build a new model
-
- Inputs:
- `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
- with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
- `extract_features.py`, `run_classifier.py` and `run_squad.py`)
- `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
- types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
- a `sentence B` token (see BERT paper for more details).
- `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
- selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
- input sequence length in the current batch. It's the mask that we typically use for attention when
- a batch has varying length sentences.
-
- Outputs: Tuple of (sequence_output, pooled_output)
- `sequence_output`: the full sequence of hidden-states from the last encoder layer, a tensor of
- shape [batch_size, sequence_length, hidden_size].
- `pooled_output`: a tensor of shape [batch_size, hidden_size], obtained by passing the hidden state
- of the first token (`[CLS]`) through a Linear layer with Tanh activation; it is the representation
- used for the Next-Sentence task (see BERT's paper).
-
- Example usage:
- ```python
- # Already been converted into WordPiece token ids
- input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
- input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
- token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
- config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
- num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
-
- model = modeling.BertModel(config=config)
- sequence_output, pooled_output = model(input_ids, token_type_ids, input_mask)
- ```
- """
- def __init__(self, config):
- self.embeddings = BertEmbeddings(config)
- self.encoder = BertEncoder(config)
- self.pooler = BertPooler(config)
- self.batch_size=config.batch_size
- self.seq_len=config.max_position_embeddings
-
- def __call__(self, input_ids, token_type_ids, attention_mask):
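- # reshape the [batch_size, seq_len] padding mask to [batch_size, 1, 1, seq_len] so it broadcasts over heads and query positions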
- extended_attention_mask = ht.array_reshape_op(attention_mask, [self.batch_size, 1, 1, self.seq_len])
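- # map mask values 1 -> 0 and 0 -> -10000 so that padded positions are suppressed after the softmax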
- extended_attention_mask = (extended_attention_mask+(-1.0)) * 10000.0
-
- embedding_output = self.embeddings(input_ids, token_type_ids)
- sequence_output = self.encoder(embedding_output, extended_attention_mask)
- pooled_output = self.pooler(sequence_output)
-
- return sequence_output, pooled_output
-
- '''-----------------------------------------------------------------------------------------------'''
-
-
- '''
- BertForPreTraining:
- --------------------------------------------------------------------------------------------------'''
- class BertForPreTraining(object):
- """BERT model with pre-training heads.
- This module comprises the BERT model followed by the two pre-training heads:
- - the masked language modeling head, and
- - the next sentence classification head.
-
- Params:
- config: a BertConfig class instance with the configuration to build a new model.
-
- Inputs:
- `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
- with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
- `extract_features.py`, `run_classifier.py` and `run_squad.py`)
- `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
- types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
- a `sentence B` token (see BERT paper for more details).
- `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
- selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
- input sequence length in the current batch. It's the mask that we typically use for attention when
- a batch has varying length sentences.
- `masked_lm_labels`: optional masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
- with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
- is only computed for the labels set in [0, ..., vocab_size]
- `next_sentence_label`: optional next sentence classification loss: torch.LongTensor of shape [batch_size]
- with indices selected in [0, 1].
- 0 => next sentence is the continuation, 1 => next sentence is a random sentence.
-
- Outputs:
- if `masked_lm_labels` and `next_sentence_label` are not `None`:
- Outputs a list comprising the two sets of logits below together with the masked language modeling
- loss and the next sentence classification loss.
- if `masked_lm_labels` or `next_sentence_label` is `None`:
- Outputs a list comprising
- - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
- - the next sentence classification logits of shape [batch_size, 2].
-
- Example usage:
- ```python
- # Already been converted into WordPiece token ids
- input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
- input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
- token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
- config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
- num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
-
- model = BertForPreTraining(config)
- masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
- ```
- """
-
- def __init__(self, config):
- self.bert = BertModel(config)
- self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight)
-
- self.vocab_size=config.vocab_size
-
- def __call__(self, input_ids, token_type_ids, attention_mask, masked_lm_labels=None, next_sentence_label=None):
- sequence_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask)
- prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
-
- return_op = [prediction_scores, seq_relationship_score]
- if masked_lm_labels is not None and next_sentence_label is not None:
- '''
- masked_lm_labels: [batch_size, seq_len], token indices; positions set to -1 are ignored by the loss
- prediction_scores: [batch_size, seq_len, vocab_size]
- next_sentence_label: [batch_size], indices in {0, 1}
- seq_relationship_score: [batch_size, 2]
-
- masked_lm_loss: [batch_size*seq_len]
- next_sentence_loss: [batch_size]
- '''
-
- masked_lm_loss = ht.softmaxcrossentropy_sparse_op(prediction_scores, masked_lm_labels, ignored_index=-1)
- next_sentence_loss = ht.softmaxcrossentropy_sparse_op(seq_relationship_score, next_sentence_label, ignored_index=-1)
-
- return_op += [masked_lm_loss, next_sentence_loss]
- return return_op
-
-
- class BertForMaskedLM(object):
- """BERT model with the masked language modeling head.
- This module comprises the BERT model followed by the masked language modeling head.
-
- Params:
- config: a BertConfig class instance with the configuration to build a new model.
-
- Inputs:
- `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
- with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
- `extract_features.py`, `run_classifier.py` and `run_squad.py`)
- `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
- types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
- a `sentence B` token (see BERT paper for more details).
- `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
- selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
- input sequence length in the current batch. It's the mask that we typically use for attention when
- a batch has varying length sentences.
- `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
- with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
- is only computed for the labels set in [0, ..., vocab_size]
-
- Outputs:
- if `masked_lm_labels` is not `None`:
- Outputs a list with the masked language modeling logits and the masked language modeling loss.
- if `masked_lm_labels` is `None`:
- Outputs a list with only the masked language modeling logits of shape [batch_size, sequence_length, vocab_size].
-
- Example usage:
- ```python
- # Already been converted into WordPiece token ids
- input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
- input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
- token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
- config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
- num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
-
- model = BertForMaskedLM(config)
- masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask)
- ```
- """
- def __init__(self, config):
- self.bert = BertModel(config)
- self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight)
- self.vocab_size=config.vocab_size
-
- def __call__(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None):
- sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask)
- prediction_scores = self.cls(sequence_output)
-
- return_op = [prediction_scores]
- if masked_lm_labels is not None:
- '''
- masked_lm_labels: [batch_size, seq_len], token indices; positions set to -1 are ignored by the loss
- prediction_scores: [batch_size, seq_len, vocab_size]
-
- masked_lm_loss: [batch_size*seq_len]
- '''
- masked_lm_loss = ht.softmaxcrossentropy_sparse_op(prediction_scores, masked_lm_labels, ignored_index=-1)
- return_op += [masked_lm_loss]
-
- return return_op
-
-
- class BertForNextSentencePrediction(object):
- """BERT model with next sentence prediction head.
- This module comprises the BERT model followed by the next sentence classification head.
-
- Params:
- config: a BertConfig class instance with the configuration to build a new model.
-
- Inputs:
- `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
- with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
- `extract_features.py`, `run_classifier.py` and `run_squad.py`)
- `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
- types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
- a `sentence B` token (see BERT paper for more details).
- `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
- selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
- input sequence length in the current batch. It's the mask that we typically use for attention when
- a batch has varying length sentences.
- `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size]
- with indices selected in [0, 1].
- 0 => next sentence is the continuation, 1 => next sentence is a random sentence.
-
- Outputs:
- if `next_sentence_label` is not `None`:
- Outputs a list with the next sentence classification logits and the next sentence classification loss.
- if `next_sentence_label` is `None`:
- Outputs a list with only the next sentence classification logits of shape [batch_size, 2].
-
- Example usage:
- ```python
- # Already been converted into WordPiece token ids
- input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
- input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
- token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
- config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
- num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
-
- model = BertForNextSentencePrediction(config)
- seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
- ```
- """
- def __init__(self, config):
- self.bert = BertModel(config)
- self.cls = BertOnlyNSPHead(config)
-
- def __call__(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None):
- _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask)
- seq_relationship_score = self.cls(pooled_output)
-
- return_op = [seq_relationship_score]
- if next_sentence_label is not None:
- '''
- next_sentence_label: [batch_size], indices in {0, 1}
- seq_relationship_score: [batch_size, 2]
-
- next_sentence_loss: [batch_size]
- '''
- next_sentence_loss = ht.softmaxcrossentropy_sparse_op(seq_relationship_score, next_sentence_label, ignored_index=-1)
- return_op += [next_sentence_loss]
-
- return return_op
-
- '''-----------------------------------------------------------------------------------------------'''
-
-
-
- '''
- Bert Layer utils (Embedding & BertLayerNorm & Dropout & Linear)
- --------------------------------------------------------------------------------------------------'''
- class Embedding(object):
- def __init__(self, num_embeddings, embedding_dim, embedding_name=None, initializer=ht.init.xavier_normal):
- self.weight = initializer(name=embedding_name, shape=(num_embeddings, embedding_dim))
- def __call__(self, input_tensor):
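- # gather the rows of the embedding table indexed by input_tensor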
- return ht.embedding_lookup_op(self.weight, input_tensor)
-
- class BertLayerNorm(object):
- def __init__(self, hidden_size, eps=1e-12):
- self.eps=eps
- self.scale = ht.init.ones(name='layer_norm_scale', shape=(hidden_size, ))
- self.bias = ht.init.zeros(name='layer_norm_bias', shape=(hidden_size, ))
- def __call__(self, input_tensor):
- return ht.layer_normalization_op(input_tensor, self.scale, self.bias, eps=self.eps)
-
- class Dropout(object):
- def __init__(self, dropout_prob=None):
- self.dropout_prob = dropout_prob
- def __call__(self, input_tensor):
- if self.dropout_prob is None or self.dropout_prob == 0.0:
- return input_tensor
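- # note: dropout_op is given the keep probability here (1 - dropout_prob)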
- output = ht.dropout_op(input_tensor, 1.0 - self.dropout_prob)
- return output
-
- class Linear(object):
- def __init__(self, in_features, out_features, bias=True, activation=None, kernel_initializer=ht.init.xavier_normal, bias_initializer=ht.init.zeros, input_shape=None):
- self.bias_flag = bias
- self.activation = activation
- self.weights = kernel_initializer(name='dense_weights', shape=(in_features, out_features))
- if self.bias_flag:
- self.bias = bias_initializer(name='dense_bias', shape=(out_features,))
- self.input_shape=input_shape
- self.in_features = in_features
- self.out_features = out_features
- if self.input_shape is not None and self.input_shape[-1] != in_features:
- raise ValueError("Specified in_features (%d) is not equal to input_shape[-1] (%d)." % (in_features, self.input_shape[-1]))
- def __call__(self, input_tensor):
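- # inputs with more than two dimensions are flattened to [-1, in_features] for the matmul and reshaped back afterwards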
- if self.input_shape is not None and len(self.input_shape)!=2:
- input_tensor = ht.array_reshape_op(input_tensor, [-1, self.in_features])
- outputs = ht.matmul_op(input_tensor, self.weights)
- if self.bias_flag:
- outputs = outputs + ht.broadcastto_op(self.bias, outputs)
- if self.activation is not None:
- outputs = self.activation(outputs)
- if self.input_shape is not None and len(self.input_shape)!=2:
- outputs = ht.array_reshape_op(outputs, self.input_shape[:-1]+[self.out_features])
- return outputs
- '''-----------------------------------------------------------------------------------------------'''