import hetu as ht
import numpy as np

'''
Bert Module Architecture & Input/Output Tensor Size

BertModel Inputs:
    input_ids: [batch_size, seq_len], word token indices in the vocabulary
BertModel Outputs:
    sequence_output: [batch_size, seq_len, hidden_size] (from BertEncoder)
    pooled_output:   [batch_size, hidden_size]          (from BertPooler)

BertModel:
    --[batch_size, seq_len]--
    BertEmbeddings:
        Embedding (word / position / token_type)
        LayerNorm
        Dropout
    --[batch_size, seq_len, hidden_size]--

    --[batch_size, seq_len, hidden_size]--
    BertEncoder:
        BertLayer (x num_hidden_layers):
            BertAttention:
                BertSelfAttention
                --[batch_size, seq_len, hidden_size]--
                BertSelfOutput:
                    Linear
                    Dropout
                    Add & LayerNorm
            --[batch_size, seq_len, hidden_size]--
            BertIntermediate:
                Linear + Act(gelu)
            --[batch_size, seq_len, intermediate_size]--
            BertOutput:
                Linear
                Dropout
                Add & LayerNorm
            --[batch_size, seq_len, hidden_size]--
    --[batch_size, seq_len, hidden_size]--

    BertPooler:
        (Slice, select [CLS])
        --[batch_size, hidden_size]--
        Linear + Act(Tanh)
        --[batch_size, hidden_size]--
'''

'''
BertEmbeddings:
--------------------------------------------------------------------------------------------------'''


class BertEmbeddings(object):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        self.seq_len = config.max_position_embeddings
        self.batch_size = config.batch_size
        self.word_embeddings = Embedding(config.vocab_size, config.hidden_size, "word_embeddings")
        self.position_embeddings = Embedding(config.max_position_embeddings, config.hidden_size, 'position_embeddings')
        self.token_type_embeddings = Embedding(config.type_vocab_size, config.hidden_size, 'token_type_embeddings')

        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = Dropout(config.hidden_dropout_prob)

    def __call__(self, input_ids, token_type_ids):
        '''
        inputs:
            input_ids:      [batch_size, seq_len]
            token_type_ids: [batch_size, seq_len]
        outputs:
            embeddings:     [batch_size, seq_len, hidden_size]
        '''
        seq_length = self.seq_len
        batch_size = self.batch_size
        position_ids = ht.Variable(
            'position_ids',
            value=np.arange(seq_length).reshape((1, -1)).repeat(batch_size, axis=0),
            dtype=np.long, trainable=False, ctx=input_ids.ctx)

        '''Embedding sizes
        input_ids:      [batch_size, seq_len], embedding_table: [vocab_size, hidden_size]
        position_ids:   [batch_size, seq_len], embedding_table: [seq_len, hidden_size]
        token_type_ids: [batch_size, seq_len], embedding_table: [type_vocab_size, hidden_size]
        --> embeddings: [batch_size, seq_len, hidden_size]
        '''
        words_embeddings = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = words_embeddings + position_embeddings + token_type_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
'''-----------------------------------------------------------------------------------------------'''
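
# NOTE: numpy reference (illustration only; not used by the model). The hetu graph built by
# BertEmbeddings is easier to follow as plain numpy: three embedding lookups (word / position /
# token type) each produce a [batch_size, seq_len, hidden_size] tensor, and the three are summed
# before LayerNorm and dropout. The helper below is a sketch of that data flow; the embedding
# tables passed in are stand-ins, not trained weights.
def _numpy_embeddings_reference(input_ids, token_type_ids, word_table, pos_table, type_table):
    '''
    input_ids, token_type_ids: int arrays of shape [batch_size, seq_len]
    word_table: [vocab_size, hidden_size], pos_table: [max_position, hidden_size],
    type_table: [type_vocab_size, hidden_size]
    returns: [batch_size, seq_len, hidden_size] (before LayerNorm / dropout)
    '''
    batch_size, seq_len = input_ids.shape
    # Same construction as the `position_ids` Variable above: 0..seq_len-1, repeated per example.
    position_ids = np.arange(seq_len).reshape((1, -1)).repeat(batch_size, axis=0)
    return word_table[input_ids] + pos_table[position_ids] + type_table[token_type_ids]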

'''
BertEncoder & BertLayer:
--------------------------------------------------------------------------------------------------'''


class BertEncoder(object):
    def __init__(self, config):
        self.output_hidden_states = config.output_hidden_states
        self.layer = [BertLayer(config) for _ in range(config.num_hidden_layers)]

    def __call__(self, hidden_states, attention_mask=None):
        '''
        inputs:
            hidden_states:  [batch_size, seq_len, hidden_size]
            attention_mask: [batch_size, 1, 1, seq_len], additive mask broadcast over heads and query positions
        outputs:
            hidden_states:  [batch_size, seq_len, hidden_size]
        note:
            all_hidden_states (num_hidden_layers * [batch_size, seq_len, hidden_size]) is not
            collected in this implementation; only the last layer's hidden states are returned.
        '''
        for i, layer_module in enumerate(self.layer):
            hidden_states = layer_module(hidden_states, attention_mask)
        return hidden_states  # last-layer hidden states


class BertLayer(object):
    def __init__(self, config):
        self.attention = BertAttention(config)
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)

    def __call__(self, hidden_states, attention_mask):
        '''
        inputs:
            hidden_states:  [batch_size, seq_len, hidden_size]
            attention_mask: [batch_size, 1, 1, seq_len]
        outputs:
            layer_output:   [batch_size, seq_len, hidden_size]
        '''
        attention_output = self.attention(hidden_states, attention_mask)
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output
'''-----------------------------------------------------------------------------------------------'''

'''
BertAttention & BertSelfAttention & BertSelfOutput
--------------------------------------------------------------------------------------------------'''


class BertAttention(object):
    def __init__(self, config):
        self.self = BertSelfAttention(config)
        self.output = BertSelfOutput(config)

    def __call__(self, input_tensor, attention_mask):
        '''
        inputs:
            input_tensor:   [batch_size, seq_len, hidden_size]
            attention_mask: [batch_size, 1, 1, seq_len]
        outputs:
            attention_output: [batch_size, seq_len, hidden_size]
        '''
        self_output = self.self(input_tensor, attention_mask)
        attention_output = self.output(self_output, input_tensor)
        return attention_output


class BertSelfAttention(object):
    def __init__(self, config):
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (config.hidden_size, config.num_attention_heads))
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size  # all_head_size == hidden_size
        self.hidden_size = config.hidden_size
        self.seq_len = config.max_position_embeddings
        self.batch_size = config.batch_size

        linear_input_shape = [self.batch_size, self.seq_len, self.hidden_size]
        self.query = Linear(config.hidden_size, self.all_head_size, input_shape=linear_input_shape)
        self.key = Linear(config.hidden_size, self.all_head_size, input_shape=linear_input_shape)
        self.value = Linear(config.hidden_size, self.all_head_size, input_shape=linear_input_shape)

        self.dropout = Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, input_tensor):
        # [batch_size, seq_len, hidden_size] -> [batch_size, num_heads, seq_len, head_size]
        output_tensor = ht.array_reshape_op(
            input_tensor,
            [self.batch_size, self.seq_len, self.num_attention_heads, self.attention_head_size])
        output_tensor = ht.transpose_op(output_tensor, [0, 2, 1, 3])
        return output_tensor

    def __call__(self, hidden_states, attention_mask):
        '''
        inputs:
            hidden_states:  [batch_size, seq_len, hidden_size]
            attention_mask: [batch_size, 1, 1, seq_len]
        outputs:
            context_layer:  [batch_size, seq_len, hidden_size]
        '''
        # linear transformation
        mixed_query_layer = self.query(hidden_states)   # [batch_size, seq_len, hidden_size]
        mixed_key_layer = self.key(hidden_states)       # [batch_size, seq_len, hidden_size]
        mixed_value_layer = self.value(hidden_states)   # [batch_size, seq_len, hidden_size]

        # transpose
        query_layer = self.transpose_for_scores(mixed_query_layer)  # [batch_size, num_heads, seq_len, head_size]
        key_layer = self.transpose_for_scores(mixed_key_layer)      # [batch_size, num_heads, seq_len, head_size]
        value_layer = self.transpose_for_scores(mixed_value_layer)  # [batch_size, num_heads, seq_len, head_size]

        # scaled dot-product scores
        key_layer_scaled = key_layer * (1.0 / np.sqrt(float(self.attention_head_size)))
        attention_scores = ht.batch_matmul_op(query_layer, key_layer_scaled, trans_B=True)  # [batch_size, num_heads, seq_len, seq_len]

        # Apply the attention mask (precomputed once for all layers in BertModel.__call__).
        attention_scores = attention_scores + ht.broadcastto_op(attention_mask, attention_scores)  # [batch_size, num_heads, seq_len, seq_len]

        # Normalize the attention scores to probabilities.
        attention_probs = ht.softmax_op(attention_scores)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        context_layer = ht.batch_matmul_op(attention_probs, value_layer)  # [batch_size, num_heads, seq_len, head_size]
        context_layer = ht.transpose_op(context_layer, [0, 2, 1, 3])      # [batch_size, seq_len, num_heads, head_size]
        context_layer = ht.array_reshape_op(context_layer, [-1, self.seq_len, self.all_head_size])  # [batch_size, seq_len, hidden_size]

        return context_layer


class BertSelfOutput(object):
    def __init__(self, config):
        linear_input_shape = [config.batch_size, config.max_position_embeddings, config.hidden_size]
        self.dense = Linear(config.hidden_size, config.hidden_size, input_shape=linear_input_shape)
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = Dropout(config.hidden_dropout_prob)

    def __call__(self, hidden_states, input_tensor):
        '''
        inputs:
            hidden_states: [batch_size, seq_len, hidden_size]
            input_tensor:  [batch_size, seq_len, hidden_size]
        outputs:
            hidden_states: [batch_size, seq_len, hidden_size]
        '''
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states
'''-----------------------------------------------------------------------------------------------'''
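
# NOTE: numpy reference (illustration only; not used by the model). This mirrors BertSelfAttention
# step by step: split the hidden dimension into heads (as in `transpose_for_scores`), scale K by
# 1/sqrt(head_size), take Q @ K^T, add the broadcast additive mask (0 for kept tokens, -10000 for
# padding), softmax over the key axis, and merge the heads back. The learned Q/K/V projections and
# dropout are omitted for brevity.
def _numpy_self_attention_reference(query, key, value, attention_mask, num_heads):
    '''
    query/key/value: [batch_size, seq_len, hidden_size] (already projected)
    attention_mask:  [batch_size, 1, 1, seq_len] additive mask
    returns:         [batch_size, seq_len, hidden_size]
    '''
    batch_size, seq_len, hidden_size = query.shape
    head_size = hidden_size // num_heads

    def split_heads(x):  # [B, S, H] -> [B, num_heads, S, head_size]
        return x.reshape(batch_size, seq_len, num_heads, head_size).transpose(0, 2, 1, 3)

    q, k, v = split_heads(query), split_heads(key), split_heads(value)
    scores = q @ (k * (1.0 / np.sqrt(float(head_size)))).transpose(0, 1, 3, 2)  # [B, heads, S, S]
    scores = scores + attention_mask                     # padded key positions get -10000
    probs = np.exp(scores - scores.max(axis=-1, keepdims=True))
    probs = probs / probs.sum(axis=-1, keepdims=True)    # softmax over the key axis
    context = probs @ v                                  # [B, heads, S, head_size]
    return context.transpose(0, 2, 1, 3).reshape(batch_size, seq_len, hidden_size)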

'''
BertIntermediate & BertOutput (2-layer FeedForward)
--------------------------------------------------------------------------------------------------'''


class BertIntermediate(object):
    def __init__(self, config):
        if config.hidden_act == "relu":
            self.intermediate_act_fn = ht.relu_op
        elif config.hidden_act == "gelu":
            self.intermediate_act_fn = ht.gelu_op
            print("Gelu activation is not implemented yet.")
            assert False
        linear_input_shape = [config.batch_size, config.max_position_embeddings, config.hidden_size]
        self.dense = Linear(config.hidden_size, config.intermediate_size,
                            activation=self.intermediate_act_fn, input_shape=linear_input_shape)

    def __call__(self, hidden_states):
        '''
        inputs:
            hidden_states: [batch_size, seq_len, hidden_size]
        outputs:
            hidden_states: [batch_size, seq_len, intermediate_size]
        '''
        hidden_states = self.dense(hidden_states)
        return hidden_states


class BertOutput(object):
    def __init__(self, config):
        linear_input_shape = [config.batch_size, config.max_position_embeddings, config.intermediate_size]
        self.dense = Linear(config.intermediate_size, config.hidden_size, input_shape=linear_input_shape)
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = Dropout(config.hidden_dropout_prob)

    def __call__(self, hidden_states, input_tensor):
        '''
        inputs:
            hidden_states: [batch_size, seq_len, intermediate_size]
            input_tensor:  [batch_size, seq_len, hidden_size]
        outputs:
            hidden_states: [batch_size, seq_len, hidden_size]
        '''
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states
'''-----------------------------------------------------------------------------------------------'''
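
# NOTE: numpy reference (illustration only; not used by the model). BertIntermediate followed by
# BertOutput is the standard position-wise feed-forward sub-layer: expand hidden_size ->
# intermediate_size with an activation, project back, then add the residual and apply LayerNorm.
# Dropout is omitted, relu stands in for the activation, and the LayerNorm scale/bias are dropped
# to keep the sketch short.
def _numpy_feed_forward_reference(hidden_states, w1, b1, w2, b2, eps=1e-12):
    '''
    hidden_states: [batch_size, seq_len, hidden_size]
    w1: [hidden_size, intermediate_size], w2: [intermediate_size, hidden_size]
    returns: [batch_size, seq_len, hidden_size]
    '''
    intermediate = np.maximum(hidden_states @ w1 + b1, 0.0)   # BertIntermediate (Linear + relu)
    projected = intermediate @ w2 + b2                        # BertOutput dense
    out = projected + hidden_states                           # residual connection
    mean = out.mean(axis=-1, keepdims=True)                   # LayerNorm over the hidden axis
    var = out.var(axis=-1, keepdims=True)
    return (out - mean) / np.sqrt(var + eps)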

'''
BertPooler
--------------------------------------------------------------------------------------------------'''


class BertPooler(object):
    def __init__(self, config):
        self.dense = Linear(config.hidden_size, config.hidden_size, activation=ht.tanh_op)
        self.batch_size = config.batch_size
        self.hidden_size = config.hidden_size

    def __call__(self, hidden_states):
        '''
        inputs:
            hidden_states: [batch_size, seq_len, hidden_size]
        outputs:
            pooled_output: [batch_size, hidden_size]
        '''
        first_token_tensor = ht.slice_op(hidden_states, (0, 0, 0), (self.batch_size, 1, self.hidden_size))
        first_token_tensor = ht.array_reshape_op(first_token_tensor, [self.batch_size, self.hidden_size])
        pooled_output = self.dense(first_token_tensor)
        return pooled_output
'''-----------------------------------------------------------------------------------------------'''
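
# NOTE: numpy reference (illustration only; not used by the model). The slice_op / array_reshape_op
# pair in BertPooler simply selects the hidden state of the first ([CLS]) token before the
# tanh-activated dense layer. The weight/bias arguments here are stand-ins.
def _numpy_pooler_reference(hidden_states, w, b):
    '''
    hidden_states: [batch_size, seq_len, hidden_size]
    w: [hidden_size, hidden_size], b: [hidden_size]
    returns: [batch_size, hidden_size]
    '''
    first_token = hidden_states[:, 0, :]     # equivalent of slice_op + reshape above
    return np.tanh(first_token @ w + b)      # Linear + tanh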

'''
Bert Downstream Heads
--------------------------------------------------------------------------------------------------'''


class BertPredictionHeadTransform(object):
    def __init__(self, config):
        if config.hidden_act == "relu":
            self.hidden_act = ht.relu_op
        elif config.hidden_act == "gelu":
            self.hidden_act = ht.gelu_op
            print("Gelu activation is not implemented yet.")
            assert False
        linear_input_shape = [config.batch_size, config.max_position_embeddings, config.hidden_size]
        self.dense_act = Linear(config.hidden_size, config.hidden_size,
                                activation=self.hidden_act, input_shape=linear_input_shape)
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)

    def __call__(self, hidden_states):
        '''
        inputs:
            hidden_states: [batch_size, seq_len, hidden_size]
        outputs:
            hidden_states: [batch_size, seq_len, hidden_size]
        '''
        hidden_states = self.dense_act(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


class BertLMPredictionHead(object):
    def __init__(self, config, bert_model_embedding_weights):
        '''
        bert_model_embedding_weights: [vocab_size, hidden_size]
        '''
        self.transform = BertPredictionHeadTransform(config)
        linear_input_shape = [config.batch_size, config.max_position_embeddings, config.hidden_size]
        self.decoder = Linear(config.hidden_size, config.vocab_size,
                              bias_initializer=ht.init.zeros, input_shape=linear_input_shape)
        self.decoder.weights = ht.transpose_op(bert_model_embedding_weights)

    def __call__(self, hidden_states):
        '''
        inputs:
            hidden_states: [batch_size, seq_len, hidden_size]
        outputs:
            hidden_states: [batch_size, seq_len, vocab_size]
        '''
        hidden_states = self.transform(hidden_states)
        hidden_states = self.decoder(hidden_states)
        return hidden_states


class BertOnlyMLMHead(object):
    def __init__(self, config, bert_model_embedding_weights):
        self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)

    def __call__(self, sequence_output):
        '''
        inputs:
            sequence_output: [batch_size, seq_len, hidden_size]
        outputs:
            prediction_scores: [batch_size, seq_len, vocab_size]
        '''
        prediction_scores = self.predictions(sequence_output)
        return prediction_scores


class BertOnlyNSPHead(object):
    def __init__(self, config):
        self.seq_relationship = Linear(config.hidden_size, 2)

    def __call__(self, pooled_output):
        '''
        inputs:
            pooled_output: [batch_size, hidden_size]
        outputs:
            seq_relationship_score: [batch_size, 2]
        '''
        seq_relationship_score = self.seq_relationship(pooled_output)
        return seq_relationship_score


class BertPreTrainingHeads(object):
    def __init__(self, config, bert_model_embedding_weights):
        self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
        self.seq_relationship = Linear(config.hidden_size, 2)

    def __call__(self, sequence_output, pooled_output):
        '''
        inputs:
            sequence_output: [batch_size, seq_len, hidden_size]
            pooled_output:   [batch_size, hidden_size]
        outputs:
            prediction_scores:      [batch_size, seq_len, vocab_size]
            seq_relationship_score: [batch_size, 2]
        '''
        prediction_scores = self.predictions(sequence_output)
        seq_relationship_score = self.seq_relationship(pooled_output)
        return prediction_scores, seq_relationship_score
'''-----------------------------------------------------------------------------------------------'''
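
# NOTE: numpy reference (illustration only; not used by the model). BertLMPredictionHead ties the
# decoder to the word embedding table: the table is [vocab_size, hidden_size], and its transpose
# ([hidden_size, vocab_size]) is installed as the decoder weight, so the MLM logits are a dot
# product of each transformed hidden state with every word vector, plus a per-word bias.
def _numpy_tied_decoder_reference(transformed_hidden, word_embedding_table, bias):
    '''
    transformed_hidden:   [batch_size, seq_len, hidden_size] (output of BertPredictionHeadTransform)
    word_embedding_table: [vocab_size, hidden_size]
    bias:                 [vocab_size]
    returns prediction_scores: [batch_size, seq_len, vocab_size]
    '''
    return transformed_hidden @ word_embedding_table.T + bias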

'''
BertModel:
--------------------------------------------------------------------------------------------------'''


class BertModel(object):
    """BERT model ("Bidirectional Encoder Representations from Transformers").

    Params:
        config: a BertConfig class instance with the configuration to build a new model

    Inputs:
        `input_ids`: a tensor of shape [batch_size, sequence_length] with the word token indices
            in the vocabulary (see the token preprocessing logic in the scripts `extract_features.py`,
            `run_classifier.py` and `run_squad.py`).
        `token_type_ids`: an optional tensor of shape [batch_size, sequence_length] with the token type
            indices selected in [0, 1]. Type 0 corresponds to a `sentence A` token and type 1 corresponds
            to a `sentence B` token (see the BERT paper for more details).
        `attention_mask`: an optional tensor of shape [batch_size, sequence_length] with indices selected
            in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max input
            sequence length in the current batch. It's the mask that we typically use for attention when
            a batch has varying length sentences.

    Outputs: tuple of (sequence_output, pooled_output)
        `sequence_output`: the full sequence of hidden states of the last encoder layer, of shape
            [batch_size, sequence_length, hidden_size].
        `pooled_output`: a tensor of shape [batch_size, hidden_size] which is the output of a classifier
            pretrained on top of the hidden state associated with the first token of the input (`[CLS]`)
            to train on the Next-Sentence task (see BERT's paper).

    Example usage (schematic):
    ```python
    # input_ids / input_mask / token_type_ids: int tensors of shape [batch_size, seq_len],
    # already converted into WordPiece token ids.
    model = BertModel(config)
    sequence_output, pooled_output = model(input_ids, token_type_ids, input_mask)
    ```
    """

    def __init__(self, config):
        self.embeddings = BertEmbeddings(config)
        self.encoder = BertEncoder(config)
        self.pooler = BertPooler(config)
        self.batch_size = config.batch_size
        self.seq_len = config.max_position_embeddings

    def __call__(self, input_ids, token_type_ids, attention_mask):
        # Broadcastable additive mask for self-attention: 0.0 at kept positions, -10000.0 at padding.
        extended_attention_mask = ht.array_reshape_op(attention_mask, [self.batch_size, 1, 1, self.seq_len])
        extended_attention_mask = (extended_attention_mask + (-1.0)) * 10000.0

        embedding_output = self.embeddings(input_ids, token_type_ids)
        sequence_output = self.encoder(embedding_output, extended_attention_mask)
        pooled_output = self.pooler(sequence_output)
        return sequence_output, pooled_output
'''-----------------------------------------------------------------------------------------------'''
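
# NOTE: numpy reference (illustration only; not used by the model). The mask arithmetic in
# BertModel.__call__ turns a {0, 1} padding mask of shape [batch_size, seq_len] into an additive
# bias of shape [batch_size, 1, 1, seq_len]: kept positions (1) map to 0.0 and padded positions (0)
# map to -10000.0, which is added to the attention scores before softmax so padded keys receive
# (almost) zero probability.
def _numpy_extended_attention_mask_reference(attention_mask):
    '''
    attention_mask: [batch_size, seq_len] with values in {0, 1}
    returns:        [batch_size, 1, 1, seq_len] additive mask
    '''
    batch_size, seq_len = attention_mask.shape
    extended = attention_mask.reshape(batch_size, 1, 1, seq_len).astype(np.float32)
    return (extended - 1.0) * 10000.0        # same as (mask + (-1.0)) * 10000.0 above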

'''
BertForPreTraining:
--------------------------------------------------------------------------------------------------'''


class BertForPreTraining(object):
    """BERT model with pre-training heads.
    This module comprises the BERT model followed by the two pre-training heads:
        - the masked language modeling head, and
        - the next sentence classification head.

    Params:
        config: a BertConfig class instance with the configuration to build a new model.

    Inputs:
        `input_ids`: a tensor of shape [batch_size, sequence_length] with the word token indices
            in the vocabulary (see the token preprocessing logic in the scripts `extract_features.py`,
            `run_classifier.py` and `run_squad.py`).
        `token_type_ids`: an optional tensor of shape [batch_size, sequence_length] with the token type
            indices selected in [0, 1]. Type 0 corresponds to a `sentence A` token and type 1 corresponds
            to a `sentence B` token (see the BERT paper for more details).
        `attention_mask`: an optional tensor of shape [batch_size, sequence_length] with indices selected
            in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max input
            sequence length in the current batch. It's the mask that we typically use for attention when
            a batch has varying length sentences.
        `masked_lm_labels`: optional masked language modeling labels: a tensor of shape
            [batch_size, sequence_length] with indices selected in [-1, 0, ..., vocab_size]. All labels
            set to -1 are ignored (unmasked positions); the loss is only computed for the labels set in
            [0, ..., vocab_size].
        `next_sentence_label`: optional next sentence classification labels: a tensor of shape
            [batch_size] with indices selected in [0, 1]. 0 => next sentence is the continuation,
            1 => next sentence is a random sentence.

    Outputs:
        if `masked_lm_labels` and `next_sentence_label` are not `None`:
            [prediction_scores, seq_relationship_score, masked_lm_loss, next_sentence_loss].
        otherwise:
            [prediction_scores, seq_relationship_score], i.e.
            - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
            - the next sentence classification logits of shape [batch_size, 2].

    Example usage (schematic):
    ```python
    # input_ids / input_mask / token_type_ids: int tensors of shape [batch_size, seq_len],
    # already converted into WordPiece token ids.
    model = BertForPreTraining(config)
    masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
    ```
    """

    def __init__(self, config):
        self.bert = BertModel(config)
        self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight)
        self.vocab_size = config.vocab_size

    def __call__(self, input_ids, token_type_ids, attention_mask, masked_lm_labels=None, next_sentence_label=None):
        sequence_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask)
        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)

        return_op = [prediction_scores, seq_relationship_score]
        if masked_lm_labels is not None and next_sentence_label is not None:
            '''
            prediction_scores:      [batch_size, seq_len, vocab_size]
            masked_lm_labels:       [batch_size, seq_len] token indices, -1 at unmasked positions (ignored)
            seq_relationship_score: [batch_size, 2]
            next_sentence_label:    [batch_size] indices in [0, 1]
            masked_lm_loss:         [batch_size*seq_len]
            next_sentence_loss:     [batch_size]
            '''
            masked_lm_loss = ht.softmaxcrossentropy_sparse_op(
                prediction_scores, masked_lm_labels, ignored_index=-1)
            next_sentence_loss = ht.softmaxcrossentropy_sparse_op(
                seq_relationship_score, next_sentence_label, ignored_index=-1)
            return_op += [masked_lm_loss, next_sentence_loss]

        return return_op

class BertForMaskedLM(object):
    """BERT model with the masked language modeling head.
    This module comprises the BERT model followed by the masked language modeling head.

    Params:
        config: a BertConfig class instance with the configuration to build a new model.

    Inputs:
        `input_ids`: a tensor of shape [batch_size, sequence_length] with the word token indices
            in the vocabulary (see the token preprocessing logic in the scripts `extract_features.py`,
            `run_classifier.py` and `run_squad.py`).
        `token_type_ids`: an optional tensor of shape [batch_size, sequence_length] with the token type
            indices selected in [0, 1]. Type 0 corresponds to a `sentence A` token and type 1 corresponds
            to a `sentence B` token (see the BERT paper for more details).
        `attention_mask`: an optional tensor of shape [batch_size, sequence_length] with indices selected
            in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max input
            sequence length in the current batch. It's the mask that we typically use for attention when
            a batch has varying length sentences.
        `masked_lm_labels`: masked language modeling labels: a tensor of shape
            [batch_size, sequence_length] with indices selected in [-1, 0, ..., vocab_size]. All labels
            set to -1 are ignored (unmasked positions); the loss is only computed for the labels set in
            [0, ..., vocab_size].

    Outputs:
        if `masked_lm_labels` is not `None`:
            [prediction_scores, masked_lm_loss].
        if `masked_lm_labels` is `None`:
            [prediction_scores], the masked language modeling logits of shape
            [batch_size, sequence_length, vocab_size].

    Example usage (schematic):
    ```python
    # input_ids / input_mask / token_type_ids: int tensors of shape [batch_size, seq_len],
    # already converted into WordPiece token ids.
    model = BertForMaskedLM(config)
    masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask)
    ```
    """

    def __init__(self, config):
        self.bert = BertModel(config)
        self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight)
        self.vocab_size = config.vocab_size

    def __call__(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None):
        sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask)
        prediction_scores = self.cls(sequence_output)

        return_op = [prediction_scores]
        if masked_lm_labels is not None:
            '''
            prediction_scores: [batch_size, seq_len, vocab_size]
            masked_lm_labels:  [batch_size, seq_len] token indices, -1 at unmasked positions (ignored)
            masked_lm_loss:    [batch_size*seq_len]
            '''
            masked_lm_loss = ht.softmaxcrossentropy_sparse_op(
                prediction_scores, masked_lm_labels, ignored_index=-1)
            return_op += [masked_lm_loss]

        return return_op


class BertForNextSentencePrediction(object):
    """BERT model with the next sentence prediction head.
    This module comprises the BERT model followed by the next sentence classification head.

    Params:
        config: a BertConfig class instance with the configuration to build a new model.

    Inputs:
        `input_ids`: a tensor of shape [batch_size, sequence_length] with the word token indices
            in the vocabulary (see the token preprocessing logic in the scripts `extract_features.py`,
            `run_classifier.py` and `run_squad.py`).
        `token_type_ids`: an optional tensor of shape [batch_size, sequence_length] with the token type
            indices selected in [0, 1]. Type 0 corresponds to a `sentence A` token and type 1 corresponds
            to a `sentence B` token (see the BERT paper for more details).
        `attention_mask`: an optional tensor of shape [batch_size, sequence_length] with indices selected
            in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max input
            sequence length in the current batch. It's the mask that we typically use for attention when
            a batch has varying length sentences.
        `next_sentence_label`: next sentence classification labels: a tensor of shape [batch_size]
            with indices selected in [0, 1]. 0 => next sentence is the continuation, 1 => next sentence
            is a random sentence.

    Outputs:
        if `next_sentence_label` is not `None`:
            [seq_relationship_score, next_sentence_loss].
        if `next_sentence_label` is `None`:
            [seq_relationship_score], the next sentence classification logits of shape [batch_size, 2].

    Example usage (schematic):
    ```python
    # input_ids / input_mask / token_type_ids: int tensors of shape [batch_size, seq_len],
    # already converted into WordPiece token ids.
    model = BertForNextSentencePrediction(config)
    seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
    ```
    """

    def __init__(self, config):
        self.bert = BertModel(config)
        self.cls = BertOnlyNSPHead(config)

    def __call__(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None):
        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask)
        seq_relationship_score = self.cls(pooled_output)

        return_op = [seq_relationship_score]
        if next_sentence_label is not None:
            '''
            seq_relationship_score: [batch_size, 2]
            next_sentence_label:    [batch_size] indices in [0, 1]
            next_sentence_loss:     [batch_size]
            '''
            next_sentence_loss = ht.softmaxcrossentropy_sparse_op(
                seq_relationship_score, next_sentence_label, ignored_index=-1)
            return_op += [next_sentence_loss]

        return return_op
'''-----------------------------------------------------------------------------------------------'''
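
# NOTE: numpy reference (illustration only; not used by the model). This sketches the ASSUMED
# semantics of ht.softmaxcrossentropy_sparse_op as used above: integer class labels, with positions
# whose label equals `ignored_index` (here -1, i.e. tokens that were not masked) contributing no
# loss. It is a readable reference under that assumption, not the hetu op itself.
def _numpy_sparse_cross_entropy_reference(logits, labels, ignored_index=-1):
    '''
    logits: [batch_size, seq_len, vocab_size]
    labels: [batch_size, seq_len] integer labels, ignored_index where no prediction is required
    returns: per-position losses of shape [batch_size*seq_len] (0 at ignored positions)
    '''
    flat_logits = logits.reshape(-1, logits.shape[-1])
    flat_labels = labels.reshape(-1)
    shifted = flat_logits - flat_logits.max(axis=-1, keepdims=True)
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=-1, keepdims=True))
    idx = np.nonzero(flat_labels != ignored_index)[0]
    losses = np.zeros(flat_labels.shape[0], dtype=flat_logits.dtype)
    losses[idx] = -log_probs[idx, flat_labels[idx]]
    return losses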

'''
Bert Layer utils (Embedding & BertLayerNorm & Dropout & Linear)
--------------------------------------------------------------------------------------------------'''


class Embedding(object):
    def __init__(self, num_embeddings, embedding_dim, embedding_name=None, initializer=ht.init.xavier_normal):
        self.weight = initializer(name=embedding_name, shape=(num_embeddings, embedding_dim))

    def __call__(self, input_tensor):
        return ht.embedding_lookup_op(self.weight, input_tensor)


class BertLayerNorm(object):
    def __init__(self, hidden_size, eps=1e-12):
        self.eps = eps
        self.scale = ht.init.ones(name='layer_norm_scale', shape=(hidden_size, ))
        self.bias = ht.init.zeros(name='layer_norm_bias', shape=(hidden_size, ))

    def __call__(self, input_tensor):
        return ht.layer_normalization_op(input_tensor, self.scale, self.bias, eps=self.eps)


class Dropout(object):
    def __init__(self, dropout_prob=None):
        self.dropout_prob = dropout_prob

    def __call__(self, input_tensor):
        if self.dropout_prob is None or self.dropout_prob == 0.0:
            return input_tensor
        output = ht.dropout_op(input_tensor, 1.0 - self.dropout_prob)
        return output


class Linear(object):
    def __init__(self, in_features, out_features, bias=True, activation=None,
                 kernel_initializer=ht.init.xavier_normal, bias_initializer=ht.init.zeros,
                 input_shape=None):
        self.bias_flag = bias
        self.activation = activation
        self.weights = kernel_initializer(name='dense_weights', shape=(in_features, out_features))
        if self.bias_flag:
            self.bias = bias_initializer(name='dense_bias', shape=(out_features,))
        self.input_shape = input_shape
        self.in_features = in_features
        self.out_features = out_features
        if self.input_shape is not None and self.input_shape[-1] != in_features:
            print("Specified in_features is not equal to input_shape[-1].")
            assert False

    def __call__(self, input_tensor):
        if self.input_shape is not None and len(self.input_shape) != 2:
            input_tensor = ht.array_reshape_op(input_tensor, [-1, self.in_features])
        outputs = ht.matmul_op(input_tensor, self.weights)
        if self.bias_flag:
            outputs = outputs + ht.broadcastto_op(self.bias, outputs)
        if self.activation is not None:
            outputs = self.activation(outputs)
        if self.input_shape is not None and len(self.input_shape) != 2:
            outputs = ht.array_reshape_op(outputs, self.input_shape[:-1] + [self.out_features])
        return outputs
'''-----------------------------------------------------------------------------------------------'''
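
# NOTE: numpy reference (illustration only; not used by the model). For 3-D inputs, the Linear
# class above flattens [batch_size, seq_len, in_features] to [batch_size*seq_len, in_features],
# applies a single matmul plus broadcast bias, and reshapes back, which is equivalent to applying
# the same dense layer independently at every sequence position.
def _numpy_linear_3d_reference(input_tensor, weights, bias):
    '''
    input_tensor: [batch_size, seq_len, in_features]
    weights:      [in_features, out_features], bias: [out_features]
    returns:      [batch_size, seq_len, out_features]
    '''
    batch_size, seq_len, in_features = input_tensor.shape
    flat = input_tensor.reshape(-1, in_features)               # [batch_size*seq_len, in_features]
    out = flat @ weights + bias                                # matmul_op + broadcast bias
    return out.reshape(batch_size, seq_len, weights.shape[1])  # back to 3-D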