|
|
|
|
|
|
import hetu as ht |
|
|
|
import numpy as np |
|
|
|
|
|
|
|
''' |
|
|
|
Bert Module Architecture & Input/Output Tensor Size |
|
|
|
|
|
|
|
BertModel Inputs:
    input_ids: [batch_size, seq_len], word token indices in the vocabulary
    token_type_ids: [batch_size, seq_len], segment (sentence A / sentence B) indices
    attention_mask: [batch_size, seq_len], 1 for real tokens and 0 for padding
|
|
|
|
|
|
|
BertModel Outputs: |
|
|
|
sequence_output: [batch_size, seq_len, hidden_size] (from BertEncoder) |
|
|
|
pooled_output: [batch_size, hidden_size] (from BertPooler) |
|
|
|
|
|
|
|
BertModel: |
|
|
|
--[batch_size, seq_len]-- |
|
|
|
BertEmbeddings: |
|
|
|
Embedding(word/position/token_type) |
|
|
|
LayerNorm |
|
|
|
Dropout |
|
|
|
--[batch_size, seq_len, hidden_size]-- |
|
|
|
|
|
|
|
--[batch_size, seq_len, hidden_size]-- |
|
|
|
BertEncoder: |
|
|
|
BertLayer(num_hidden_layers): |
|
|
|
BertAttention: |
|
|
|
BertSelfAttention |
|
|
|
--[batch_size, seq_len, hidden_size]-- |
|
|
|
BertSelfOutput: |
|
|
|
Linear |
|
|
|
Dropout |
|
|
|
Add & LayerNorm |
|
|
|
|
|
|
|
--[batch_size, seq_len, hidden_size]-- |
|
|
|
BertIntermediate: |
|
|
|
Linear + Act(gelu)
|
|
|
--[batch_size, seq_len, intermediate_size]-- |
|
|
|
BertOutput: |
|
|
|
Linear |
|
|
|
Dropout |
|
|
|
Add & LayerNorm |
|
|
|
--[batch_size, seq_len, hidden_size]-- |
|
|
|
|
|
|
|
--[batch_size, seq_len, hidden_size]-- |
|
|
|
BertPooler: |
|
|
|
(Slice, select [cls]) |
|
|
|
--[batch_size, hidden_size]-- |
|
|
|
Linear + Act(Tanh) |
|
|
|
--[batch_size, hidden_size]-- |
|
|
|
|
|
|
|
Bert |
|
|
|
''' |
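'''
Concrete input example (a minimal illustration; the token ids below are arbitrary stand-ins for real
WordPiece ids, not produced by any tokenizer):

```python
import numpy as np

batch_size, seq_len = 2, 4
input_ids      = np.array([[101, 7592, 2088, 102],
                           [101, 2054,  102,   0]])                # [batch_size, seq_len]
token_type_ids = np.zeros((batch_size, seq_len), dtype=np.int64)   # all tokens belong to sentence A
attention_mask = (input_ids != 0).astype(np.float32)               # 1 = real token, 0 = padding
```
'''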
|
|
|
|
|
|
|
|
|
|
|
''' |
|
|
|
BertEmbeddings: |
|
|
|
--------------------------------------------------------------------------------------------------''' |
|
|
|
class BertEmbeddings(object): |
|
|
|
"""Construct the embeddings from word, position and token_type embeddings. |
|
|
|
""" |
|
|
|
def __init__(self, config): |
|
|
|
self.seq_len = config.max_position_embeddings |
|
|
|
self.batch_size = config.batch_size |
|
|
|
|
|
|
|
self.word_embeddings = Embedding(config.vocab_size, config.hidden_size, "word_embeddings") |
|
|
|
self.position_embeddings = Embedding(config.max_position_embeddings, config.hidden_size, 'position_embeddings') |
|
|
|
self.token_type_embeddings = Embedding(config.type_vocab_size, config.hidden_size, 'token_type_embeddings') |
|
|
|
|
|
|
|
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) |
|
|
|
self.dropout = Dropout(config.hidden_dropout_prob) |
|
|
|
|
|
|
|
def __call__(self, input_ids, token_type_ids): |
|
|
|
''' |
|
|
|
inputs: |
|
|
|
input_ids: [batch_size, seq_len] |
|
|
|
token_type_ids: [batch_size, seq_len] |
|
|
|
|
|
|
|
outputs: |
|
|
|
embeddings: [batch_size, seq_len, hidden_size] |
|
|
|
''' |
|
|
|
seq_length= self.seq_len |
|
|
|
batch_size = self.batch_size |
|
|
|
position_ids = ht.Variable('position_ids', value=np.arange(seq_length).reshape((1, -1)).repeat(batch_size, axis=0), dtype=np.int64, trainable=False, ctx=input_ids.ctx)
|
|
|
|
|
|
|
|
|
|
|
'''Embedding Size |
|
|
|
inputs_id:[batch_size, seq_len], embedding_table:[vocab_size, hidden_size] |
|
|
|
position_ids:[batch_size, seq_len], embedding_table:[seq_len, hidden_size] |
|
|
|
token_type_ids:[batch_size, seq_len], embedding_table:[type_vocab_size, hidden_size]
|
|
|
--> embeddings: [batch_size, seq_len, hidden_size] |
|
|
|
''' |
|
|
|
words_embeddings = self.word_embeddings(input_ids) |
|
|
|
position_embeddings = self.position_embeddings(position_ids) |
|
|
|
token_type_embeddings = self.token_type_embeddings(token_type_ids) |
|
|
|
|
|
|
|
embeddings = words_embeddings + position_embeddings + token_type_embeddings |
|
|
|
embeddings = self.LayerNorm(embeddings) |
|
|
|
embeddings = self.dropout(embeddings) |
|
|
|
return embeddings |
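'''
Position-id sketch (numpy illustration of the value assigned to the `position_ids` Variable above):

```python
import numpy as np

batch_size, seq_len = 2, 4
position_ids = np.arange(seq_len).reshape((1, -1)).repeat(batch_size, axis=0)
# position_ids == [[0, 1, 2, 3],
#                  [0, 1, 2, 3]]   shape: [batch_size, seq_len]
```
'''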
|
|
|
'''-----------------------------------------------------------------------------------------------''' |
|
|
|
|
|
|
|
|
|
|
|
''' |
|
|
|
BertEncoder & BertLayer: |
|
|
|
--------------------------------------------------------------------------------------------------''' |
|
|
|
class BertEncoder(object): |
|
|
|
def __init__(self, config): |
|
|
|
self.output_hidden_states = config.output_hidden_states |
|
|
|
self.layer = [BertLayer(config) for _ in range(config.num_hidden_layers)] |
|
|
|
|
|
|
|
def __call__(self, hidden_states, attention_mask=None): |
|
|
|
''' |
|
|
|
inputs: |
|
|
|
hidden_states: [batch_size, seq_len, hidden_size] |
|
|
|
attention_mask: [batch_size, num_heads, seq_len, seq_len] |
|
|
|
outputs: |
|
|
|
hidden_states: [batch_size, seq_len, hidden_size] |
|
|
|
all_hidden_states: not returned by this implementation (only the last layer's hidden states are returned)
|
|
|
''' |
|
|
|
|
|
|
|
for i, layer_module in enumerate(self.layer): |
|
|
|
hidden_states = layer_module(hidden_states, attention_mask) |
|
|
|
return hidden_states # last-layer hidden state |
|
|
|
|
|
|
|
class BertLayer(object): |
|
|
|
def __init__(self, config): |
|
|
|
self.attention = BertAttention(config) |
|
|
|
self.intermediate = BertIntermediate(config) |
|
|
|
self.output = BertOutput(config) |
|
|
|
|
|
|
|
def __call__(self, hidden_states, attention_mask): |
|
|
|
''' |
|
|
|
inputs: |
|
|
|
hidden_states: [batch_size, seq_len, hidden_size] |
|
|
|
attention_mask: [batch_size, num_heads, seq_len, seq_len] |
|
|
|
outputs: |
|
|
|
layer_output: [batch_size, seq_len, hidden_size] |
|
|
|
''' |
|
|
|
attention_output = self.attention(hidden_states, attention_mask) |
|
|
|
intermediate_output = self.intermediate(attention_output) |
|
|
|
layer_output = self.output(intermediate_output, attention_output) |
|
|
|
return layer_output |
|
|
|
'''-----------------------------------------------------------------------------------------------''' |
|
|
|
|
|
|
|
|
|
|
|
''' |
|
|
|
BertAttention & BertSelfAttention & BertSelfOutput |
|
|
|
--------------------------------------------------------------------------------------------------''' |
|
|
|
class BertAttention(object): |
|
|
|
def __init__(self, config): |
|
|
|
self.self = BertSelfAttention(config) |
|
|
|
self.output = BertSelfOutput(config) |
|
|
|
|
|
|
|
def __call__(self, input_tensor, attention_mask): |
|
|
|
''' |
|
|
|
inputs: |
|
|
|
input_tensor: [batch_size, seq_len, hidden_size] |
|
|
|
attention_mask: [batch_size, num_heads, seq_len, seq_len] |
|
|
|
outputs: |
|
|
|
attention_output: [batch_size, seq_len, hidden_size] |
|
|
|
''' |
|
|
|
self_output = self.self(input_tensor, attention_mask) |
|
|
|
attention_output = self.output(self_output, input_tensor) |
|
|
|
return attention_output |
|
|
|
|
|
|
|
class BertSelfAttention(object): |
|
|
|
def __init__(self, config): |
|
|
|
if config.hidden_size % config.num_attention_heads != 0: |
|
|
|
raise ValueError( |
|
|
|
"The hidden size (%d) is not a multiple of the number of attention " |
|
|
|
"heads (%d)" % (config.hidden_size, config.num_attention_heads)) |
|
|
|
self.num_attention_heads = config.num_attention_heads |
|
|
|
self.attention_head_size = int(config.hidden_size / config.num_attention_heads) |
|
|
|
self.all_head_size = self.num_attention_heads * self.attention_head_size #all_head_size == hidden_size |
|
|
|
self.hidden_size = config.hidden_size |
|
|
|
self.seq_len = config.max_position_embeddings |
|
|
|
self.batch_size = config.batch_size |
|
|
|
|
|
|
|
linear_input_shape = [self.batch_size, self.seq_len, self.hidden_size] |
|
|
|
self.query = Linear(config.hidden_size, self.all_head_size, input_shape=linear_input_shape) |
|
|
|
self.key = Linear(config.hidden_size, self.all_head_size, input_shape=linear_input_shape) |
|
|
|
self.value = Linear(config.hidden_size, self.all_head_size, input_shape=linear_input_shape) |
|
|
|
|
|
|
|
self.dropout = Dropout(config.attention_probs_dropout_prob) |
|
|
|
|
|
|
|
def transpose_for_scores(self, input_tensor): |
|
|
|
output_tensor = ht.array_reshape_op( |
|
|
|
input_tensor, [self.batch_size, self.seq_len, self.num_attention_heads, self.attention_head_size]) |
|
|
|
output_tensor = ht.transpose_op(output_tensor, [0, 2, 1, 3]) |
|
|
|
return output_tensor |
|
|
|
|
|
|
|
def __call__(self, hidden_states, attention_mask): |
|
|
|
''' |
|
|
|
inputs: |
|
|
|
hidden_states: [batch_size, seq_len, hidden_size] |
|
|
|
attention_mask: [batch_size, 1, 1, seq_len] |
|
|
|
outputs: |
|
|
|
context_layer: [batch_size, seq_len, hidden_size] |
|
|
|
''' |
|
|
|
|
|
|
|
# linear transformation |
|
|
|
mixed_query_layer = self.query(hidden_states) # [batch_size, seq_len, hidden_size] |
|
|
|
mixed_key_layer = self.key(hidden_states) # [batch_size, seq_len, hidden_size] |
|
|
|
mixed_value_layer = self.value(hidden_states) # [batch_size, seq_len, hidden_size] |
|
|
|
|
|
|
|
# transpose |
|
|
|
query_layer = self.transpose_for_scores(mixed_query_layer) # [batch_size, num_heads, seq_len, head_size] |
|
|
|
key_layer = self.transpose_for_scores(mixed_key_layer) # [batch_size, num_heads, seq_len, head_size] |
|
|
|
value_layer = self.transpose_for_scores(mixed_value_layer) # [batch_size, num_heads, seq_len, head_size] |
|
|
|
|
|
|
|
# score |
|
|
|
key_layer_scaled = key_layer * (1.0 / np.sqrt(float(self.attention_head_size))) |
|
|
|
attention_scores = ht.batch_matmul_op(query_layer, key_layer_scaled, trans_B=True) # [batch_size, num_heads, seq_len, seq_len] |
|
|
|
|
|
|
|
# Apply the attention mask (precomputed for all layers in the BertModel __call__ function)
|
|
|
attention_scores = attention_scores + ht.broadcastto_op(attention_mask, attention_scores) # [batch_size, num_heads, seq_len, seq_len] |
|
|
|
|
|
|
|
# Normalize the attention scores to probabilities. |
|
|
|
attention_probs = ht.softmax_op(attention_scores) |
|
|
|
|
|
|
|
# This is actually dropping out entire tokens to attend to, which might |
|
|
|
# seem a bit unusual, but is taken from the original Transformer paper. |
|
|
|
attention_probs = self.dropout(attention_probs) |
|
|
|
|
|
|
|
context_layer = ht.batch_matmul_op(attention_probs, value_layer) # [batch_size, num_heads, seq_len, head_size] |
|
|
|
context_layer = ht.transpose_op(context_layer, [0, 2, 1, 3]) # [batch_size, seq_len, num_heads, head_size] |
|
|
|
context_layer = ht.array_reshape_op(context_layer, [-1, self.seq_len, self.all_head_size]) # [batch_size, seq_len, hidden_size] |
|
|
|
return context_layer |
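'''
Shape/math sketch of the self-attention above (numpy stands in for the Hetu ops; this mirrors
transpose_for_scores plus the scaled dot-product, additive mask and softmax, not the exact kernels):

```python
import numpy as np

batch_size, seq_len, num_heads, head_size = 2, 4, 3, 5
hidden_size = num_heads * head_size

def split_heads(x):  # [batch, seq, hidden] -> [batch, heads, seq, head_size]
    return x.reshape(batch_size, seq_len, num_heads, head_size).transpose(0, 2, 1, 3)

q = split_heads(np.random.randn(batch_size, seq_len, hidden_size))
k = split_heads(np.random.randn(batch_size, seq_len, hidden_size))
v = split_heads(np.random.randn(batch_size, seq_len, hidden_size))
mask = np.zeros((batch_size, 1, 1, seq_len))            # 0 for real tokens, -10000 for padding

scores = q @ k.transpose(0, 1, 3, 2) / np.sqrt(head_size) + mask   # [batch, heads, seq, seq]
probs = np.exp(scores) / np.exp(scores).sum(-1, keepdims=True)     # softmax over the last axis
context = (probs @ v).transpose(0, 2, 1, 3).reshape(batch_size, seq_len, hidden_size)
```
'''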
|
|
|
|
|
|
|
class BertSelfOutput(object): |
|
|
|
def __init__(self, config): |
|
|
|
linear_input_shape = [config.batch_size, config.max_position_embeddings, config.hidden_size] |
|
|
|
self.dense = Linear(config.hidden_size, config.hidden_size, input_shape=linear_input_shape) |
|
|
|
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) |
|
|
|
self.dropout = Dropout(config.hidden_dropout_prob) |
|
|
|
|
|
|
|
def __call__(self, hidden_states, input_tensor): |
|
|
|
''' |
|
|
|
inputs: |
|
|
|
hidden_states: [batch_size, seq_len, hidden_size] |
|
|
|
input_tensor: [batch_size, seq_len, hidden_size] |
|
|
|
outputs: |
|
|
|
hidden_states: [batch_size, seq_len, hidden_size] |
|
|
|
''' |
|
|
|
hidden_states = self.dense(hidden_states) |
|
|
|
hidden_states = self.dropout(hidden_states) |
|
|
|
hidden_states = self.LayerNorm(hidden_states + input_tensor) |
|
|
|
return hidden_states |
|
|
|
'''-----------------------------------------------------------------------------------------------''' |
|
|
|
|
|
|
|
|
|
|
|
''' |
|
|
|
BertIntermediate & BertOutput (2-layer FeedForward) |
|
|
|
--------------------------------------------------------------------------------------------------''' |
|
|
|
class BertIntermediate(object): |
|
|
|
def __init__(self, config): |
|
|
|
if config.hidden_act == "relu": |
|
|
|
self.intermediate_act_fn = ht.relu_op |
|
|
|
elif config.hidden_act == "gelu":
    # GeLU is not available in this Hetu build yet.
    raise NotImplementedError("Gelu activation is not implemented yet.")
else:
    raise ValueError("Unsupported hidden activation: %s" % config.hidden_act)
|
|
|
linear_input_shape = [config.batch_size, config.max_position_embeddings, config.hidden_size] |
|
|
|
self.dense = Linear(config.hidden_size, config.intermediate_size, activation = self.intermediate_act_fn, input_shape=linear_input_shape) |
|
|
|
|
|
|
|
def __call__(self, hidden_states): |
|
|
|
''' |
|
|
|
inputs: |
|
|
|
hidden_states: [batch_size, seq_len, hidden_size] |
|
|
|
outputs: |
|
|
|
hidden_states: [batch_size, seq_len, intermediate_size] |
|
|
|
''' |
|
|
|
hidden_states = self.dense(hidden_states) |
|
|
|
return hidden_states |
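'''
GeLU reference (a numpy sketch of the activation the "gelu" branch above would need; this is the
standard tanh approximation, not a Hetu op):

```python
import numpy as np

def gelu(x):
    return 0.5 * x * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * x ** 3)))
```
'''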
|
|
|
|
|
|
|
class BertOutput(object): |
|
|
|
def __init__(self, config): |
|
|
|
linear_input_shape = [config.batch_size, config.max_position_embeddings, config.intermediate_size] |
|
|
|
self.dense = Linear(config.intermediate_size, config.hidden_size, input_shape=linear_input_shape) |
|
|
|
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) |
|
|
|
self.dropout = Dropout(config.hidden_dropout_prob) |
|
|
|
|
|
|
|
def __call__(self, hidden_states, input_tensor): |
|
|
|
''' |
|
|
|
inputs: |
|
|
|
hidden_states: [batch_size, seq_len, intermediate_size] |
|
|
|
outputs: |
|
|
|
hidden_states: [batch_size, seq_len, hidden_size] |
|
|
|
''' |
|
|
|
hidden_states = self.dense(hidden_states) |
|
|
|
hidden_states = self.dropout(hidden_states) |
|
|
|
hidden_states = self.LayerNorm(hidden_states + input_tensor) |
|
|
|
return hidden_states |
|
|
|
'''-----------------------------------------------------------------------------------------------''' |
|
|
|
|
|
|
|
|
|
|
|
''' |
|
|
|
BertPooler |
|
|
|
--------------------------------------------------------------------------------------------------''' |
|
|
|
class BertPooler(object): |
|
|
|
def __init__(self, config): |
|
|
|
self.dense = Linear(config.hidden_size, config.hidden_size, activation = ht.tanh_op) |
|
|
|
self.batch_size = config.batch_size |
|
|
|
self.hidden_size = config.hidden_size |
|
|
|
def __call__(self, hidden_states): |
|
|
|
''' |
|
|
|
inputs: |
|
|
|
hidden_states: [batch_size, seq_len, hidden_size] |
|
|
|
outputs: |
|
|
|
pooled_output: [batch_size, hidden_size] |
|
|
|
''' |
|
|
|
first_token_tensor = ht.slice_op(hidden_states,(0,0,0),(self.batch_size,1,self.hidden_size)) |
|
|
|
first_token_tensor = ht.array_reshape_op(first_token_tensor, [self.batch_size, self.hidden_size]) |
|
|
|
pooled_output = self.dense(first_token_tensor) |
|
|
|
return pooled_output |
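'''
Pooler slice sketch (numpy equivalent of the slice_op + array_reshape_op above):

```python
import numpy as np

batch_size, seq_len, hidden_size = 2, 4, 8
hidden_states = np.random.randn(batch_size, seq_len, hidden_size)
first_token_tensor = hidden_states[:, 0, :]   # [batch_size, hidden_size], the [CLS] position
```
'''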
|
|
|
'''-----------------------------------------------------------------------------------------------''' |
|
|
|
|
|
|
|
''' |
|
|
|
Bert Downstream Heads |
|
|
|
--------------------------------------------------------------------------------------------------''' |
|
|
|
class BertPredictionHeadTransform(object): |
|
|
|
def __init__(self, config): |
|
|
|
if config.hidden_act == "relu": |
|
|
|
self.hidden_act = ht.relu_op |
|
|
|
elif config.hidden_act == "gelu":
    # GeLU is not available in this Hetu build yet.
    raise NotImplementedError("Gelu activation is not implemented yet.")
else:
    raise ValueError("Unsupported hidden activation: %s" % config.hidden_act)
|
|
|
linear_input_shape = [config.batch_size, config.max_position_embeddings, config.hidden_size] |
|
|
|
self.dense_act = Linear(config.hidden_size, config.hidden_size, activation=self.hidden_act, input_shape=linear_input_shape) |
|
|
|
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) |
|
|
|
|
|
|
|
def __call__(self, hidden_states): |
|
|
|
''' |
|
|
|
inputs: |
|
|
|
hidden_states: [batch_size, seq_len, hidden_size] |
|
|
|
outputs: |
|
|
|
hidden_states: [batch_size, seq_len, hidden_size] |
|
|
|
''' |
|
|
|
hidden_states = self.dense_act(hidden_states) |
|
|
|
hidden_states = self.LayerNorm(hidden_states) |
|
|
|
return hidden_states |
|
|
|
|
|
|
|
class BertLMPredictionHead(object): |
|
|
|
def __init__(self, config, bert_model_embedding_weights): |
|
|
|
''' |
|
|
|
bert_model_embedding_weights: [vocab_size, hidden_size] |
|
|
|
''' |
|
|
|
self.transform = BertPredictionHeadTransform(config) |
|
|
|
|
|
|
|
linear_input_shape = [config.batch_size, config.max_position_embeddings, config.hidden_size] |
|
|
|
self.decoder = Linear(config.hidden_size, config.vocab_size, bias_initializer=ht.init.zeros,input_shape=linear_input_shape) |
|
|
|
self.decoder.weights = ht.transpose_op(bert_model_embedding_weights) |
|
|
|
|
|
|
|
def __call__(self, hidden_states): |
|
|
|
''' |
|
|
|
inputs: |
|
|
|
hidden_states: [batch_size, seq_len, hidden_size] |
|
|
|
outputs: |
|
|
|
hidden_states: [batch_size, seq_len, vocab_size] |
|
|
|
''' |
|
|
|
hidden_states = self.transform(hidden_states) |
|
|
|
hidden_states = self.decoder(hidden_states) |
|
|
|
return hidden_states |
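'''
Weight-tying sketch (numpy view of the decoder above, whose weights are the transposed word-embedding
table, so MLM logits are hidden states projected back onto the vocabulary):

```python
import numpy as np

vocab_size, hidden_size = 10, 4
embedding_table = np.random.randn(vocab_size, hidden_size)   # shared with BertEmbeddings
hidden_states = np.random.randn(2, 3, hidden_size)           # [batch_size, seq_len, hidden_size]
logits = hidden_states @ embedding_table.T                   # [batch_size, seq_len, vocab_size]
```
'''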
|
|
|
|
|
|
|
|
|
|
|
class BertOnlyMLMHead(object): |
|
|
|
def __init__(self, config, bert_model_embedding_weights): |
|
|
|
self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) |
|
|
|
|
|
|
|
def __call__(self, sequence_output): |
|
|
|
''' |
|
|
|
inputs: |
|
|
|
sequence_output: [batch_size, seq_len, hidden_size] |
|
|
|
outputs: |
|
|
|
prediction_scores: [batch_size, seq_len, vocab_size] |
|
|
|
''' |
|
|
|
prediction_scores = self.predictions(sequence_output) |
|
|
|
return prediction_scores |
|
|
|
|
|
|
|
|
|
|
|
class BertOnlyNSPHead(object): |
|
|
|
def __init__(self, config): |
|
|
|
self.seq_relationship = Linear(config.hidden_size, 2) |
|
|
|
|
|
|
|
def __call__(self, pooled_output): |
|
|
|
''' |
|
|
|
inputs: |
|
|
|
pooled_output: [batch_size, hidden_size] |
|
|
|
outputs: |
|
|
|
seq_relationship_score: [batch_size, 2] |
|
|
|
''' |
|
|
|
seq_relationship_score = self.seq_relationship(pooled_output) |
|
|
|
return seq_relationship_score |
|
|
|
|
|
|
|
|
|
|
|
class BertPreTrainingHeads(object): |
|
|
|
def __init__(self, config, bert_model_embedding_weights): |
|
|
|
self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) |
|
|
|
self.seq_relationship = Linear(config.hidden_size, 2) |
|
|
|
|
|
|
|
def __call__(self, sequence_output, pooled_output): |
|
|
|
''' |
|
|
|
inputs: |
|
|
|
sequence_output: [batch_size, seq_len, hidden_size] |
|
|
|
pooled_output: [batch_size, hidden_size] |
|
|
|
outputs: |
|
|
|
prediction_scores: [batch_size, seq_len, vocab_size] |
|
|
|
seq_relationship_score: [batch_size, 2] |
|
|
|
''' |
|
|
|
prediction_scores = self.predictions(sequence_output) |
|
|
|
seq_relationship_score = self.seq_relationship(pooled_output) |
|
|
|
return prediction_scores, seq_relationship_score |
|
|
|
|
|
|
|
'''-----------------------------------------------------------------------------------------------''' |
|
|
|
|
|
|
|
|
|
|
|
''' |
|
|
|
BertModel: |
|
|
|
--------------------------------------------------------------------------------------------------''' |
|
|
|
class BertModel(object):
    """BERT model ("Bidirectional Encoder Representations from Transformers").
|
|
|
|
|
|
|
Params: |
|
|
|
config: a BertConfig class instance with the configuration to build a new model |
|
|
|
|
|
|
|
Inputs: |
|
|
|
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] |
|
|
|
with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
|
|
|
`extract_features.py`, `run_classifier.py` and `run_squad.py`) |
|
|
|
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token |
|
|
|
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to |
|
|
|
a `sentence B` token (see BERT paper for more details). |
|
|
|
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices |
|
|
|
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max |
|
|
|
input sequence length in the current batch. It's the mask that we typically use for attention when |
|
|
|
a batch has varying length sentences. |
|
|
|
|
|
|
|
Outputs: Tuple of (sequence_output, pooled_output)
    `sequence_output`: the full sequence of hidden-states of the last encoder layer,
        a float tensor of shape [batch_size, sequence_length, hidden_size].
    `pooled_output`: a float tensor of shape [batch_size, hidden_size] which is the output of a
        classifier pretrained on top of the hidden state associated with the first token of the
        input (`[CLS]`), used for the Next-Sentence task (see BERT's paper).
|
|
|
|
|
|
|
Example usage: |
|
|
|
```python |
|
|
|
# Already been converted into WordPiece token ids |
|
|
|
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) |
|
|
|
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) |
|
|
|
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) |
|
|
|
|
|
|
|
config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, |
|
|
|
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) |
|
|
|
|
|
|
|
model = modeling.BertModel(config=config) |
|
|
|
sequence_output, pooled_output = model(input_ids, token_type_ids, input_mask)
|
|
|
``` |
|
|
|
""" |
|
|
|
def __init__(self, config): |
|
|
|
self.embeddings = BertEmbeddings(config) |
|
|
|
self.encoder = BertEncoder(config) |
|
|
|
self.pooler = BertPooler(config) |
|
|
|
self.batch_size=config.batch_size |
|
|
|
self.seq_len=config.max_position_embeddings |
|
|
|
|
|
|
|
def __call__(self, input_ids, token_type_ids, attention_mask): |
|
|
|
extended_attention_mask = ht.array_reshape_op(attention_mask, [self.batch_size, 1, 1, self.seq_len]) |
|
|
|
extended_attention_mask = (extended_attention_mask+(-1.0)) * 10000.0 |
|
|
|
|
|
|
|
embedding_output = self.embeddings(input_ids, token_type_ids) |
|
|
|
sequence_output = self.encoder(embedding_output, extended_attention_mask) |
|
|
|
pooled_output = self.pooler(sequence_output) |
|
|
|
|
|
|
|
return sequence_output, pooled_output |
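'''
Additive-mask sketch (numpy illustration of the (mask - 1) * 10000 conversion above):

```python
import numpy as np

attention_mask = np.array([[1., 1., 1., 0.]])   # 1 = real token, 0 = padding
extended = attention_mask.reshape(1, 1, 1, -1)
extended = (extended + (-1.0)) * 10000.0        # real tokens -> 0, padding -> -10000
# added to the raw attention scores, padded positions receive ~0 weight after softmax
```
'''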
|
|
|
|
|
|
|
'''-----------------------------------------------------------------------------------------------''' |
|
|
|
|
|
|
|
|
|
|
|
''' |
|
|
|
BertForPreTraining: |
|
|
|
--------------------------------------------------------------------------------------------------''' |
|
|
|
class BertForPreTraining(object): |
|
|
|
"""BERT model with pre-training heads. |
|
|
|
This module comprises the BERT model followed by the two pre-training heads: |
|
|
|
- the masked language modeling head, and |
|
|
|
- the next sentence classification head. |
|
|
|
|
|
|
|
Params: |
|
|
|
config: a BertConfig class instance with the configuration to build a new model. |
|
|
|
|
|
|
|
Inputs: |
|
|
|
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] |
|
|
|
with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
|
|
|
`extract_features.py`, `run_classifier.py` and `run_squad.py`) |
|
|
|
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token |
|
|
|
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to |
|
|
|
a `sentence B` token (see BERT paper for more details). |
|
|
|
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices |
|
|
|
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max |
|
|
|
input sequence length in the current batch. It's the mask that we typically use for attention when |
|
|
|
a batch has varying length sentences. |
|
|
|
`masked_lm_labels`: optional masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] |
|
|
|
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss |
|
|
|
is only computed for the labels set in [0, ..., vocab_size] |
|
|
|
`next_sentence_label`: optional next sentence classification loss: torch.LongTensor of shape [batch_size] |
|
|
|
with indices selected in [0, 1]. |
|
|
|
0 => next sentence is the continuation, 1 => next sentence is a random sentence. |
|
|
|
|
|
|
|
Outputs:
    if `masked_lm_labels` and `next_sentence_label` are not `None`:
        Outputs a list [prediction_scores, seq_relationship_score, masked_lm_loss, next_sentence_loss],
        i.e. the two logits tensors below followed by the masked language modeling loss and the
        next sentence classification loss.
    otherwise:
        Outputs a list comprising
        - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
        - the next sentence classification logits of shape [batch_size, 2].
|
|
|
|
|
|
|
Example usage: |
|
|
|
```python |
|
|
|
# Already been converted into WordPiece token ids |
|
|
|
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) |
|
|
|
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) |
|
|
|
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) |
|
|
|
|
|
|
|
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, |
|
|
|
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) |
|
|
|
|
|
|
|
model = BertForPreTraining(config) |
|
|
|
masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask) |
|
|
|
``` |
|
|
|
""" |
|
|
|
|
|
|
|
def __init__(self, config): |
|
|
|
self.bert = BertModel(config) |
|
|
|
self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight) |
|
|
|
|
|
|
|
self.vocab_size=config.vocab_size |
|
|
|
|
|
|
|
def __call__(self, input_ids, token_type_ids, attention_mask, masked_lm_labels=None, next_sentence_label=None): |
|
|
|
sequence_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask) |
|
|
|
prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) |
|
|
|
|
|
|
|
return_op = [prediction_scores, seq_relationship_score] |
|
|
|
if masked_lm_labels is not None and next_sentence_label is not None: |
|
|
|
''' |
|
|
|
masked_lm_labels: [batch_size, seq_len], target token indices; positions without an MLM target are set to -1 and ignored
|
|
|
prediction_scores: [batch_size, seq_len, vocab_size] |
|
|
|
next_sentence_label: [batch_size], 0 => next sentence is the continuation, 1 => random sentence
|
|
|
seq_relationship_score: [batch_size, 2] |
|
|
|
|
|
|
|
masked_lm_loss: [batch_size*seq_len] |
|
|
|
next_sentence_loss: [batch_size] |
|
|
|
''' |
|
|
|
|
|
|
|
masked_lm_loss = ht.softmaxcrossentropy_sparse_op(prediction_scores, masked_lm_labels, ignored_index=-1) |
|
|
|
next_sentence_loss = ht.softmaxcrossentropy_sparse_op(seq_relationship_score, next_sentence_label, ignored_index=-1) |
|
|
|
|
|
|
|
return_op += [masked_lm_loss, next_sentence_loss] |
|
|
|
return return_op |
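'''
Loss-masking sketch (numpy illustration of the ignored_index=-1 convention, assuming the sparse
cross-entropy op takes integer labels and skips positions labelled -1):

```python
import numpy as np

batch_size, seq_len, vocab_size = 2, 3, 5
logits = np.random.randn(batch_size, seq_len, vocab_size)
labels = np.array([[1, -1, 4],
                   [-1, -1, 0]])                 # -1 marks positions without an MLM target
probs = np.exp(logits) / np.exp(logits).sum(-1, keepdims=True)
picked = probs[np.arange(batch_size)[:, None], np.arange(seq_len), np.clip(labels, 0, None)]
loss = -np.log(picked) * (labels != -1)          # ignored positions contribute zero loss
```
'''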
|
|
|
|
|
|
|
|
|
|
|
class BertForMaskedLM(object): |
|
|
|
"""BERT model with the masked language modeling head. |
|
|
|
This module comprises the BERT model followed by the masked language modeling head. |
|
|
|
|
|
|
|
Params: |
|
|
|
config: a BertConfig class instance with the configuration to build a new model. |
|
|
|
|
|
|
|
Inputs: |
|
|
|
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] |
|
|
|
with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
|
|
|
`extract_features.py`, `run_classifier.py` and `run_squad.py`) |
|
|
|
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token |
|
|
|
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to |
|
|
|
a `sentence B` token (see BERT paper for more details). |
|
|
|
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices |
|
|
|
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max |
|
|
|
input sequence length in the current batch. It's the mask that we typically use for attention when |
|
|
|
a batch has varying length sentences. |
|
|
|
`masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] |
|
|
|
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss |
|
|
|
is only computed for the labels set in [0, ..., vocab_size] |
|
|
|
|
|
|
|
Outputs:
    if `masked_lm_labels` is not `None`:
        Outputs a list [prediction_scores, masked_lm_loss], i.e. the logits below plus the masked
        language modeling loss.
    if `masked_lm_labels` is `None`:
        Outputs a list containing only the masked language modeling logits of shape
        [batch_size, sequence_length, vocab_size].
|
|
|
|
|
|
|
Example usage: |
|
|
|
```python |
|
|
|
# Already been converted into WordPiece token ids |
|
|
|
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) |
|
|
|
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) |
|
|
|
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) |
|
|
|
|
|
|
|
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, |
|
|
|
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) |
|
|
|
|
|
|
|
model = BertForMaskedLM(config) |
|
|
|
masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask) |
|
|
|
``` |
|
|
|
""" |
|
|
|
def __init__(self, config): |
|
|
|
self.bert = BertModel(config) |
|
|
|
self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight) |
|
|
|
self.vocab_size=config.vocab_size |
|
|
|
|
|
|
|
def __call__(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None): |
|
|
|
sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask) |
|
|
|
prediction_scores = self.cls(sequence_output) |
|
|
|
|
|
|
|
return_op = [prediction_scores] |
|
|
|
if masked_lm_labels is not None: |
|
|
|
''' |
|
|
|
masked_lm_labels: [batch_size, seq_len], target token indices; positions without an MLM target are set to -1 and ignored
|
|
|
prediction_scores: [batch_size, seq_len, vocab_size] |
|
|
|
|
|
|
|
masked_lm_loss: [batch_size*seq_len] |
|
|
|
''' |
|
|
|
masked_lm_loss = ht.softmaxcrossentropy_sparse_op(prediction_scores, masked_lm_labels, ignored_index=-1) |
|
|
|
return_op += [masked_lm_loss] |
|
|
|
|
|
|
|
return return_op |
|
|
|
|
|
|
|
|
|
|
|
class BertForNextSentencePrediction(object): |
|
|
|
"""BERT model with next sentence prediction head. |
|
|
|
This module comprises the BERT model followed by the next sentence classification head. |
|
|
|
|
|
|
|
Params: |
|
|
|
config: a BertConfig class instance with the configuration to build a new model. |
|
|
|
|
|
|
|
Inputs: |
|
|
|
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] |
|
|
|
with the word token indices in the vocabulary (see the tokens preprocessing logic in the scripts
|
|
|
`extract_features.py`, `run_classifier.py` and `run_squad.py`) |
|
|
|
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token |
|
|
|
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to |
|
|
|
a `sentence B` token (see BERT paper for more details). |
|
|
|
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices |
|
|
|
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max |
|
|
|
input sequence length in the current batch. It's the mask that we typically use for attention when |
|
|
|
a batch has varying length sentences. |
|
|
|
`next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size] |
|
|
|
with indices selected in [0, 1]. |
|
|
|
0 => next sentence is the continuation, 1 => next sentence is a random sentence. |
|
|
|
|
|
|
|
Outputs:
    if `next_sentence_label` is not `None`:
        Outputs a list [seq_relationship_score, next_sentence_loss], i.e. the logits below plus the
        next sentence classification loss.
    if `next_sentence_label` is `None`:
        Outputs a list containing only the next sentence classification logits of shape [batch_size, 2].
|
|
|
|
|
|
|
Example usage: |
|
|
|
```python |
|
|
|
# Already been converted into WordPiece token ids |
|
|
|
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) |
|
|
|
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) |
|
|
|
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) |
|
|
|
|
|
|
|
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, |
|
|
|
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) |
|
|
|
|
|
|
|
model = BertForNextSentencePrediction(config) |
|
|
|
seq_relationship_logits = model(input_ids, token_type_ids, input_mask) |
|
|
|
``` |
|
|
|
""" |
|
|
|
def __init__(self, config): |
|
|
|
self.bert = BertModel(config) |
|
|
|
self.cls = BertOnlyNSPHead(config) |
|
|
|
|
|
|
|
def __call__(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None): |
|
|
|
_, pooled_output = self.bert(input_ids, token_type_ids, attention_mask) |
|
|
|
seq_relationship_score = self.cls(pooled_output) |
|
|
|
|
|
|
|
return_op = [seq_relationship_score] |
|
|
|
if next_sentence_label is not None: |
|
|
|
''' |
|
|
|
next_sentence_label: [batch_size], 0 => next sentence is the continuation, 1 => random sentence
|
|
|
seq_relationship_score: [batch_size, 2] |
|
|
|
|
|
|
|
next_sentence_loss: [batch_size] |
|
|
|
''' |
|
|
|
next_sentence_loss = ht.softmaxcrossentropy_sparse_op(seq_relationship_score, next_sentence_label, ignored_index=-1) |
|
|
|
return_op += [next_sentence_loss] |
|
|
|
|
|
|
|
return return_op |
|
|
|
|
|
|
|
'''-----------------------------------------------------------------------------------------------''' |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
''' |
|
|
|
Bert Layer utils (Embedding & BertLayerNorm & Dropout & Linear)
|
|
|
--------------------------------------------------------------------------------------------------''' |
|
|
|
class Embedding(object): |
|
|
|
def __init__(self, num_embeddings, embedding_dim, embedding_name=None, initializer=ht.init.xavier_normal): |
|
|
|
self.weight = initializer(name=embedding_name, shape=(num_embeddings, embedding_dim)) |
|
|
|
def __call__(self, input_tensor): |
|
|
|
return ht.embedding_lookup_op(self.weight, input_tensor) |
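'''
Lookup sketch (numpy view of embedding_lookup_op as a row gather from the embedding table):

```python
import numpy as np

vocab_size, hidden_size = 10, 4
table = np.random.randn(vocab_size, hidden_size)
input_ids = np.array([[1, 3, 3], [0, 2, 9]])   # [batch_size, seq_len]
embeddings = table[input_ids]                  # [batch_size, seq_len, hidden_size]
```
'''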
|
|
|
|
|
|
|
class BertLayerNorm(object): |
|
|
|
def __init__(self, hidden_size, eps=1e-12): |
|
|
|
self.eps=eps |
|
|
|
self.scale = ht.init.ones(name='layer_norm_scale', shape=(hidden_size, )) |
|
|
|
self.bias = ht.init.zeros(name='layer_norm_bias', shape=(hidden_size, )) |
|
|
|
def __call__(self, input_tensor): |
|
|
|
return ht.layer_normalization_op(input_tensor, self.scale, self.bias, eps=self.eps) |
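'''
LayerNorm sketch (numpy view of the normalization applied here, assuming layer_normalization_op
normalizes over the last axis and then applies `scale` and `bias`):

```python
import numpy as np

eps = 1e-12
x = np.random.randn(2, 4, 8)
mean = x.mean(-1, keepdims=True)
var = x.var(-1, keepdims=True)
y = (x - mean) / np.sqrt(var + eps)   # then y * scale + bias
```
'''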
|
|
|
|
|
|
|
class Dropout(object): |
|
|
|
def __init__(self, dropout_prob=None): |
|
|
|
self.dropout_prob = dropout_prob |
|
|
|
def __call__(self, input_tensor): |
|
|
|
if self.dropout_prob is None or self.dropout_prob == 0.0: |
|
|
|
return input_tensor |
|
|
|
output = ht.dropout_op(input_tensor, 1.0 - self.dropout_prob) |
|
|
|
return output |
|
|
|
|
|
|
|
class Linear(object): |
|
|
|
def __init__(self, in_features, out_features, bias=True, activation=None, kernel_initializer=ht.init.xavier_normal, bias_initializer=ht.init.zeros, input_shape=None): |
|
|
|
self.bias_flag = bias |
|
|
|
self.activation = activation |
|
|
|
self.weights = kernel_initializer(name='dense_weights', shape=(in_features, out_features)) |
|
|
|
if self.bias_flag: |
|
|
|
self.bias = bias_initializer(name='dense_bias', shape=(out_features,)) |
|
|
|
self.input_shape=input_shape |
|
|
|
self.in_features = in_features |
|
|
|
self.out_features = out_features |
|
|
|
if self.input_shape is not None and self.input_shape[-1] != in_features:
    raise ValueError("Specified in_features is not equal to input_shape[-1].")
|
|
|
def __call__(self, input_tensor): |
|
|
|
if self.input_shape is not None and len(self.input_shape)!=2: |
|
|
|
input_tensor = ht.array_reshape_op(input_tensor, [-1, self.in_features]) |
|
|
|
outputs = ht.matmul_op(input_tensor, self.weights) |
|
|
|
if self.bias_flag: |
|
|
|
outputs = outputs + ht.broadcastto_op(self.bias, outputs) |
|
|
|
if self.activation is not None: |
|
|
|
outputs = self.activation(outputs) |
|
|
|
if self.input_shape is not None and len(self.input_shape)!=2: |
|
|
|
outputs = ht.array_reshape_op(outputs, self.input_shape[:-1]+[self.out_features]) |
|
|
|
return outputs |
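'''
3-D Linear sketch (numpy view of the reshape -> matmul -> bias -> reshape path above):

```python
import numpy as np

batch_size, seq_len, in_features, out_features = 2, 4, 8, 16
x = np.random.randn(batch_size, seq_len, in_features)
w = np.random.randn(in_features, out_features)
b = np.zeros(out_features)
y = (x.reshape(-1, in_features) @ w + b).reshape(batch_size, seq_len, out_features)
```
'''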
|
|
|
'''-----------------------------------------------------------------------------------------------''' |