From d177a8239e3b7f2ae67d5d5fad40335d37e66721 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=A2=81=E5=AE=87=E8=BD=A9?= <942738126@qq.com> Date: Mon, 8 Nov 2021 21:37:28 +0800 Subject: [PATCH] Remove the BERT example from examples/nlp/bert --- examples/nlp/bert/bert_config.py | 62 --- examples/nlp/bert/hetu_bert.py | 749 --------------------------- examples/nlp/bert/load_data.py | 76 --- examples/nlp/bert/processBertData.py | 293 ----------- examples/nlp/bert/train_hetu_bert.py | 87 ---- 5 files changed, 1267 deletions(-) delete mode 100644 examples/nlp/bert/bert_config.py delete mode 100644 examples/nlp/bert/hetu_bert.py delete mode 100644 examples/nlp/bert/load_data.py delete mode 100644 examples/nlp/bert/processBertData.py delete mode 100644 examples/nlp/bert/train_hetu_bert.py diff --git a/examples/nlp/bert/bert_config.py b/examples/nlp/bert/bert_config.py deleted file mode 100644 index 6707662..0000000 --- a/examples/nlp/bert/bert_config.py +++ /dev/null @@ -1,62 +0,0 @@ -''' -BERT Config: ---------------------------------------------------------------------------------------------------''' -class BertConfig(object): - """Configuration class to store the configuration of a `BertModel`. - """ - def __init__(self, - vocab_size, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="relu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - output_hidden_states=False, - batch_size=100, - ): - """Constructs BertConfig. - - Args: - vocab_size: Vocabulary size of `input_ids` in `BertModel`. - hidden_size: Size of the encoder layers and the pooler layer. - num_hidden_layers: Number of hidden layers in the Transformer encoder. - num_attention_heads: Number of attention heads for each attention layer in - the Transformer encoder. - intermediate_size: The size of the "intermediate" (i.e., feed-forward) - layer in the Transformer encoder. - hidden_act: The non-linear activation function (function or string) in the - encoder and pooler. If string, "gelu", "relu" and "swish" are supported. - hidden_dropout_prob: The dropout probability for all fully connected - layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob: The dropout ratio for the attention - probabilities. - max_position_embeddings: The maximum sequence length that this model might - ever be used with. Typically set this to something large just in case - (e.g., 512 or 1024 or 2048). - type_vocab_size: The vocabulary size of the `token_type_ids` passed into - `BertModel`. - initializer_range: The stddev of the truncated_normal_initializer for - initializing all weight matrices.
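        Example usage (a sketch; the values mirror those used in `train_hetu_bert.py`):

        ```python
        from bert_config import BertConfig

        # A BERT-base sized configuration. batch_size is part of the config
        # because the model builds its reshape ops with static shapes.
        config = BertConfig(vocab_size=30522,
                            hidden_size=768,
                            num_hidden_layers=12,
                            num_attention_heads=12,
                            intermediate_size=3072,
                            max_position_embeddings=512,
                            batch_size=6)
        ```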
- """ - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.output_hidden_states = output_hidden_states - self.batch_size = batch_size - - -'''-----------------------------------------------------------------------------------------------''' diff --git a/examples/nlp/bert/hetu_bert.py b/examples/nlp/bert/hetu_bert.py deleted file mode 100644 index 67343fc..0000000 --- a/examples/nlp/bert/hetu_bert.py +++ /dev/null @@ -1,749 +0,0 @@ -import hetu as ht -import numpy as np - -''' -Bert Module Architecture & Input/Output Tensor Size - -BertModel Inputs: - input_ids: [batch_size, seq_len], word token indices in the vocabulary - -BertModel Outputs: - sequence_output: [batch_size, seq_len, hidden_size] (from BertEncoder) - pooled_output: [batch_size, hidden_size] (from BertPooler) - -BertModel: - --[batch_size, seq_len]-- - BertEmbeddings: - Embedding(word/position/token_type) - LayerNorm - Dropout - --[batch_size, seq_len, hidden_size]-- - - --[batch_size, seq_len, hidden_size]-- - BertEncoder: - BertLayer(num_hidden_layers): - BertAttention: - BertSelfAttention - --[batch_size, seq_len, hidden_size]-- - BertSelfOutput: - Linear - Dropout - Add & LayerNorm - - --[batch_size, seq_len, hidden_size]-- - BertIntermediate: - Linear + Act(gule) - --[batch_size, seq_len, intermediate_size]-- - BertOutput: - Linear - Dropout - Add & LayerNorm - --[batch_size, seq_len, hidden_size]-- - - --[batch_size, seq_len, hidden_size]-- - BertPooler: - (Slice, select [cls]) - --[batch_size, hidden_size]-- - Linear + Act(Tanh) - --[batch_size, hidden_size]-- - -Bert -''' - - -''' -BertEmbeddings: ---------------------------------------------------------------------------------------------------''' -class BertEmbeddings(object): - """Construct the embeddings from word, position and token_type embeddings. 
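    As a plain-NumPy shape sketch (not the Hetu ops used below), the three lookups are summed elementwise:

    ```python
    import numpy as np

    # Toy sizes; the real values come from BertConfig.
    batch_size, seq_len, hidden_size = 2, 4, 8
    vocab_size, type_vocab_size, max_positions = 30522, 2, 512

    word_table = np.random.randn(vocab_size, hidden_size)
    position_table = np.random.randn(max_positions, hidden_size)
    token_type_table = np.random.randn(type_vocab_size, hidden_size)

    input_ids = np.random.randint(0, vocab_size, (batch_size, seq_len))
    token_type_ids = np.zeros((batch_size, seq_len), dtype=np.int64)
    position_ids = np.arange(seq_len).reshape(1, -1).repeat(batch_size, axis=0)

    embeddings = (word_table[input_ids]
                  + position_table[position_ids]
                  + token_type_table[token_type_ids])
    assert embeddings.shape == (batch_size, seq_len, hidden_size)
    # LayerNorm and dropout are then applied to this sum.
    ```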
- """ - def __init__(self, config): - self.seq_len = config.max_position_embeddings - self.batch_size = config.batch_size - - self.word_embeddings = Embedding(config.vocab_size, config.hidden_size, "word_embeddings") - self.position_embeddings = Embedding(config.max_position_embeddings, config.hidden_size, 'position_embeddings') - self.token_type_embeddings = Embedding(config.type_vocab_size, config.hidden_size, 'token_type_embeddings') - - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) - self.dropout = Dropout(config.hidden_dropout_prob) - - def __call__(self, input_ids, token_type_ids): - ''' - inputs: - input_ids: [batch_size, seq_len] - token_type_ids: [batch_size, seq_len] - - outputs: - embeddings: [batch_size, seq_len, hidden_size] - ''' - seq_length= self.seq_len - batch_size = self.batch_size - position_ids = ht.Variable('position_ids', value=np.arange(seq_length).reshape((1,-1)).repeat(batch_size,axis=0), dtype=np.long, trainable=False, ctx=input_ids.ctx) - - - '''Embedding Size - inputs_id:[batch_size, seq_len], embedding_table:[vocab_size, hidden_size] - position_ids:[batch_size, seq_len], embedding_table:[seq_len, hidden_size] - token_type_ids:[batch_size, seq_len], embedding_tabel:[type_vocab_size, hidden_size] - --> embeddings: [batch_size, seq_len, hidden_size] - ''' - words_embeddings = self.word_embeddings(input_ids) - position_embeddings = self.position_embeddings(position_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) - - embeddings = words_embeddings + position_embeddings + token_type_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings) - return embeddings -'''-----------------------------------------------------------------------------------------------''' - - -''' -BertEncoder & BertLayer: ---------------------------------------------------------------------------------------------------''' -class BertEncoder(object): - def __init__(self, config): - self.output_hidden_states = config.output_hidden_states - self.layer = [BertLayer(config) for _ in range(config.num_hidden_layers)] - - def __call__(self, hidden_states, attention_mask=None): - ''' - inputs: - hidden_states: [batch_size, seq_len, hidden_size] - attention_mask: [batch_size, num_heads, seq_len, seq_len] - outputs: - hidden_states: [batch_size, seq_len, hidden_size] - all_hidden_states: optional, num_hidden_layers * [batch_size, seq_len, hidden_size] - ''' - - for i, layer_module in enumerate(self.layer): - hidden_states = layer_module(hidden_states, attention_mask) - return hidden_states # last-layer hidden state - -class BertLayer(object): - def __init__(self, config): - self.attention = BertAttention(config) - self.intermediate = BertIntermediate(config) - self.output = BertOutput(config) - - def __call__(self, hidden_states, attention_mask): - ''' - inputs: - hidden_states: [batch_size, seq_len, hidden_size] - attention_mask: [batch_size, num_heads, seq_len, seq_len] - outputs: - layer_output: [batch_size, seq_len, hidden_size] - ''' - attention_output = self.attention(hidden_states, attention_mask) - intermediate_output = self.intermediate(attention_output) - layer_output = self.output(intermediate_output, attention_output) - return layer_output -'''-----------------------------------------------------------------------------------------------''' - - -''' -BertAttention & BertSelfAttention & BertSelfOutput ---------------------------------------------------------------------------------------------------''' -class 
BertAttention(object): - def __init__(self, config): - self.self = BertSelfAttention(config) - self.output = BertSelfOutput(config) - - def __call__(self, input_tensor, attention_mask): - ''' - inputs: - input_tensor: [batch_size, seq_len, hidden_size] - attention_mask: [batch_size, num_heads, seq_len, seq_len] - outputs: - attention_output: [batch_size, seq_len, hidden_size] - ''' - self_output = self.self(input_tensor, attention_mask) - attention_output = self.output(self_output, input_tensor) - return attention_output - -class BertSelfAttention(object): - def __init__(self, config): - if config.hidden_size % config.num_attention_heads != 0: - raise ValueError( - "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (config.hidden_size, config.num_attention_heads)) - self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size #all_head_size == hidden_size - self.hidden_size = config.hidden_size - self.seq_len = config.max_position_embeddings - self.batch_size = config.batch_size - - linear_input_shape = [self.batch_size, self.seq_len, self.hidden_size] - self.query = Linear(config.hidden_size, self.all_head_size, input_shape=linear_input_shape) - self.key = Linear(config.hidden_size, self.all_head_size, input_shape=linear_input_shape) - self.value = Linear(config.hidden_size, self.all_head_size, input_shape=linear_input_shape) - - self.dropout = Dropout(config.attention_probs_dropout_prob) - - def transpose_for_scores(self, input_tensor): - output_tensor = ht.array_reshape_op( - input_tensor, [self.batch_size, self.seq_len, self.num_attention_heads, self.attention_head_size]) - output_tensor = ht.transpose_op(output_tensor, [0, 2, 1, 3]) - return output_tensor - - def __call__(self, hidden_states, attention_mask): - ''' - inputs: - hidden_states: [batch_size, seq_len, hidden_size] - attention_mask: [batch_size, 1, 1, seq_len] - outputs: - context_layer: [batch_size, seq_len, hidden_size] - ''' - - # linear transformation - mixed_query_layer = self.query(hidden_states) # [batch_size, seq_len, hidden_size] - mixed_key_layer = self.key(hidden_states) # [batch_size, seq_len, hidden_size] - mixed_value_layer = self.value(hidden_states) # [batch_size, seq_len, hidden_size] - - # transpose - query_layer = self.transpose_for_scores(mixed_query_layer) # [batch_size, num_heads, seq_len, head_size] - key_layer = self.transpose_for_scores(mixed_key_layer) # [batch_size, num_heads, seq_len, head_size] - value_layer = self.transpose_for_scores(mixed_value_layer) # [batch_size, num_heads, seq_len, head_size] - - # score - key_layer_scaled = key_layer * (1.0 / np.sqrt(float(self.attention_head_size))) - attention_scores = ht.batch_matmul_op(query_layer, key_layer_scaled, trans_B=True) # [batch_size, num_heads, seq_len, seq_len] - - # Apply the attention mask is (precomputed for all layers in BertModel forward() function) - attention_scores = attention_scores + ht.broadcastto_op(attention_mask, attention_scores) # [batch_size, num_heads, seq_len, seq_len] - - # Normalize the attention scores to probabilities. - attention_probs = ht.softmax_op(attention_scores) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. 
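        The score, mask, and softmax steps above can be written out for a single head as a plain-NumPy sketch (shapes only, not the Hetu ops used here):

        ```python
        import numpy as np

        def toy_attention(q, k, v, additive_mask):
            # q, k, v: [seq_len, head_size]; additive_mask: [seq_len],
            # 0.0 for real tokens and -10000.0 for padding (see BertModel.__call__).
            scores = q @ k.T / np.sqrt(k.shape[-1]) + additive_mask   # [seq_len, seq_len]
            scores = scores - scores.max(axis=-1, keepdims=True)      # numerical stability
            probs = np.exp(scores)
            probs = probs / probs.sum(axis=-1, keepdims=True)         # softmax over the keys
            return probs @ v                                          # [seq_len, head_size]

        q = k = v = np.random.randn(4, 16)
        mask = np.array([0.0, 0.0, 0.0, -10000.0])                    # last position is padding
        assert toy_attention(q, k, v, mask).shape == (4, 16)
        ```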
- attention_probs = self.dropout(attention_probs) - - context_layer = ht.batch_matmul_op(attention_probs, value_layer) # [batch_size, num_heads, seq_len, head_size] - context_layer = ht.transpose_op(context_layer, [0, 2, 1, 3]) # [batch_size, seq_len, num_heads, head_size] - context_layer = ht.array_reshape_op(context_layer, [-1, self.seq_len, self.all_head_size]) # [batch_size, seq_len, hidden_size] - return context_layer - -class BertSelfOutput(object): - def __init__(self, config): - linear_input_shape = [config.batch_size, config.max_position_embeddings, config.hidden_size] - self.dense = Linear(config.hidden_size, config.hidden_size, input_shape=linear_input_shape) - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) - self.dropout = Dropout(config.hidden_dropout_prob) - - def __call__(self, hidden_states, input_tensor): - ''' - inputs: - hidden_states: [batch_size, seq_len, hidden_size] - input_tensor: [batch_size, seq_len, hidden_size] - outputs: - hidden_states: [batch_size, seq_len, hidden_size] - ''' - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states -'''-----------------------------------------------------------------------------------------------''' - - -''' -BertIntermediate & BertOutput (2-layer FeedForward) ---------------------------------------------------------------------------------------------------''' -class BertIntermediate(object): - def __init__(self, config): - if config.hidden_act == "relu": - self.intermediate_act_fn = ht.relu_op - elif config.hidden_act == "gelu": - self.intermediate_act_fn = ht.gelu_op - print("Gelu activation is not implemented yet.") - assert(False) - linear_input_shape = [config.batch_size, config.max_position_embeddings, config.hidden_size] - self.dense = Linear(config.hidden_size, config.intermediate_size, activation = self.intermediate_act_fn, input_shape=linear_input_shape) - - def __call__(self, hidden_states): - ''' - inputs: - hidden_states: [batch_size, seq_len, hidden_size] - outputs: - hidden_states: [batch_size, seq_len, intermediate_size] - ''' - hidden_states = self.dense(hidden_states) - return hidden_states - -class BertOutput(object): - def __init__(self, config): - linear_input_shape = [config.batch_size, config.max_position_embeddings, config.intermediate_size] - self.dense = Linear(config.intermediate_size, config.hidden_size, input_shape=linear_input_shape) - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) - self.dropout = Dropout(config.hidden_dropout_prob) - - def __call__(self, hidden_states, input_tensor): - ''' - inputs: - hidden_states: [batch_size, seq_len, intermediate_size] - outputs: - hidden_states: [batch_size, seq_len, hidden_size] - ''' - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states -'''-----------------------------------------------------------------------------------------------''' - - -''' -BertPooler ---------------------------------------------------------------------------------------------------''' -class BertPooler(object): - def __init__(self, config): - self.dense = Linear(config.hidden_size, config.hidden_size, activation = ht.tanh_op) - self.batch_size = config.batch_size - self.hidden_size = config.hidden_size - def __call__(self, hidden_states): - ''' - inputs: - hidden_states: [batch_size, seq_len, hidden_size] - 
outputs: - pooled_output: [batch_size, hidden_size] - ''' - first_token_tensor = ht.slice_op(hidden_states,(0,0,0),(self.batch_size,1,self.hidden_size)) - first_token_tensor = ht.array_reshape_op(first_token_tensor, [self.batch_size, self.hidden_size]) - pooled_output = self.dense(first_token_tensor) - return pooled_output -'''-----------------------------------------------------------------------------------------------''' - -''' -Bert Downstream Heads ---------------------------------------------------------------------------------------------------''' -class BertPredictionHeadTransform(object): - def __init__(self, config): - if config.hidden_act == "relu": - self.hidden_act = ht.relu_op - elif config.hidden_act == "gelu": - self.hidden_act = ht.gelu_op - print("Gelu activation is not implemented yet.") - assert(False) - linear_input_shape = [config.batch_size, config.max_position_embeddings, config.hidden_size] - self.dense_act = Linear(config.hidden_size, config.hidden_size, activation=self.hidden_act, input_shape=linear_input_shape) - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) - - def __call__(self, hidden_states): - ''' - inputs: - hidden_states: [batch_size, seq_len, hidden_size] - outputs: - hidden_states: [batch_size, seq_len, hidden_size] - ''' - hidden_states = self.dense_act(hidden_states) - hidden_states = self.LayerNorm(hidden_states) - return hidden_states - -class BertLMPredictionHead(object): - def __init__(self, config, bert_model_embedding_weights): - ''' - bert_model_embedding_weights: [vocab_size, hidden_size] - ''' - self.transform = BertPredictionHeadTransform(config) - - linear_input_shape = [config.batch_size, config.max_position_embeddings, config.hidden_size] - self.decoder = Linear(config.hidden_size, config.vocab_size, bias_initializer=ht.init.zeros,input_shape=linear_input_shape) - self.decoder.weights = ht.transpose_op(bert_model_embedding_weights) - - def __call__(self, hidden_states): - ''' - inputs: - hidden_states: [batch_size, seq_len, hidden_size] - outputs: - hidden_states: [batch_size, seq_len, vocab_size] - ''' - hidden_states = self.transform(hidden_states) - hidden_states = self.decoder(hidden_states) - return hidden_states - - -class BertOnlyMLMHead(object): - def __init__(self, config, bert_model_embedding_weights): - self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) - - def __call__(self, sequence_output): - ''' - inputs: - sequence_output: [batch_size, seq_len, hidden_size] - outputs: - prediction_scores: [batch_size, seq_len, vocab_size] - ''' - prediction_scores = self.predictions(sequence_output) - return prediction_scores - - -class BertOnlyNSPHead(object): - def __init__(self, config): - self.seq_relationship = Linear(config.hidden_size, 2) - - def __call__(self, pooled_output): - ''' - inputs: - pooled_output: [batch_size, hidden_size] - outputs: - seq_relationship_score: [batch_size, 2] - ''' - seq_relationship_score = self.seq_relationship(pooled_output) - return seq_relationship_score - - -class BertPreTrainingHeads(object): - def __init__(self, config, bert_model_embedding_weights): - self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) - self.seq_relationship = Linear(config.hidden_size, 2) - - def __call__(self, sequence_output, pooled_output): - ''' - inputs: - sequence_output: [batch_size, seq_len, hidden_size] - pooled_output: [batch_size, hidden_size] - outputs: - prediction_scores: [batch_size, seq_len, vocab_size] - seq_relationship_score: [batch_size, 2] 
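        `prediction_scores` come from `BertLMPredictionHead`, whose decoder reuses the transposed word-embedding table (weight tying); as a NumPy shape sketch with toy sizes:

        ```python
        import numpy as np

        vocab_size, hidden_size, batch_size, seq_len = 1000, 8, 2, 4
        embedding_table = np.random.randn(vocab_size, hidden_size)    # word_embeddings.weight
        sequence_output = np.random.randn(batch_size, seq_len, hidden_size)

        decoder_weight = embedding_table.T                            # [hidden_size, vocab_size]
        prediction_scores = sequence_output @ decoder_weight          # [batch, seq, vocab]
        assert prediction_scores.shape == (batch_size, seq_len, vocab_size)
        ```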
- ''' - prediction_scores = self.predictions(sequence_output) - seq_relationship_score = self.seq_relationship(pooled_output) - return prediction_scores, seq_relationship_score - -'''-----------------------------------------------------------------------------------------------''' - - -''' -BertModel: ---------------------------------------------------------------------------------------------------''' -class BertModel(object): - """BERT model ("Bidirectional Embedding Representations from a Transformer"). - - Params: - config: a BertConfig class instance with the configuration to build a new model - - Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - - Outputs: Tuple of (encoded_layers, pooled_output) - `encoded_layers`: controled by `output_all_encoded_layers` argument: - - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end - of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each - encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size], - - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding - to the last attention block of shape [batch_size, sequence_length, hidden_size], - `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a - classifier pretrained on top of the hidden state associated to the first character of the - input (`CLS`) to train on the Next-Sentence task (see BERT's paper). 
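    The `attention_mask` above is a 0/1 mask derived from the real sequence lengths; inside `BertModel.__call__` it is turned into an additive bias on the attention scores. A small sketch:

    ```python
    import numpy as np

    seq_len = 3
    lengths = [3, 2]                 # real (non-padding) token counts per sequence
    attention_mask = np.array([[1] * n + [0] * (seq_len - n) for n in lengths],
                              dtype=np.float32)
    # -> [[1, 1, 1], [1, 1, 0]], the input_mask layout used in the example below

    additive_mask = (attention_mask - 1.0) * 10000.0
    # -> [[0, 0, 0], [0, 0, -10000]]; added to the attention scores before softmax
    ```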
- - Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - - config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - - model = modeling.BertModel(config=config) - all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) - ``` - """ - def __init__(self, config): - self.embeddings = BertEmbeddings(config) - self.encoder = BertEncoder(config) - self.pooler = BertPooler(config) - self.batch_size=config.batch_size - self.seq_len=config.max_position_embeddings - - def __call__(self, input_ids, token_type_ids, attention_mask): - extended_attention_mask = ht.array_reshape_op(attention_mask, [self.batch_size, 1, 1, self.seq_len]) - extended_attention_mask = (extended_attention_mask+(-1.0)) * 10000.0 - - embedding_output = self.embeddings(input_ids, token_type_ids) - sequence_output = self.encoder(embedding_output, extended_attention_mask) - pooled_output = self.pooler(sequence_output) - - return sequence_output, pooled_output - -'''-----------------------------------------------------------------------------------------------''' - - -''' -BertForPreTraining: ---------------------------------------------------------------------------------------------------''' -class BertForPreTraining(object): - """BERT model with pre-training heads. - This module comprises the BERT model followed by the two pre-training heads: - - the masked language modeling head, and - - the next sentence classification head. - - Params: - config: a BertConfig class instance with the configuration to build a new model. - - Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - `masked_lm_labels`: optional masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] - with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss - is only computed for the labels set in [0, ..., vocab_size] - `next_sentence_label`: optional next sentence classification loss: torch.LongTensor of shape [batch_size] - with indices selected in [0, 1]. - 0 => next sentence is the continuation, 1 => next sentence is a random sentence. - - Outputs: - if `masked_lm_labels` and `next_sentence_label` are not `None`: - Outputs the total_loss which is the sum of the masked language modeling loss and the next - sentence classification loss. 
- if `masked_lm_labels` or `next_sentence_label` is `None`: - Outputs a tuple comprising - - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and - - the next sentence classification logits of shape [batch_size, 2]. - - Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - - model = BertForPreTraining(config) - masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask) - ``` - """ - - def __init__(self, config): - self.bert = BertModel(config) - self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight) - - self.vocab_size=config.vocab_size - - def __call__(self, input_ids, token_type_ids, attention_mask, masked_lm_labels=None, next_sentence_label=None): - sequence_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask) - prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) - - return_op = [prediction_scores, seq_relationship_score] - if masked_lm_labels is not None and next_sentence_label is not None: - ''' - masked_lm_labels: [batch_size, seq_len, vocab_size], one hot form, masked places are filled with 0 - prediction_scores: [batch_size, seq_len, vocab_size] - next_sentence_label: [batch_size, 2], one hot form, masked places are filled with 0 - seq_relationship_score: [batch_size, 2] - - masked_lm_loss: [batch_size*seq_len] - next_sentence_loss: [batch_size] - ''' - - masked_lm_loss = ht.softmaxcrossentropy_sparse_op(prediction_scores, masked_lm_labels, ignored_index=-1) - next_sentence_loss = ht.softmaxcrossentropy_sparse_op(seq_relationship_score, next_sentence_label, ignored_index=-1) - - return_op += [masked_lm_loss, next_sentence_loss] - return return_op - - -class BertForMaskedLM(object): - """BERT model with the masked language modeling head. - This module comprises the BERT model followed by the masked language modeling head. - - Params: - config: a BertConfig class instance with the configuration to build a new model. - - Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] - with indices selected in [-1, 0, ..., vocab_size]. 
All labels set to -1 are ignored (masked), the loss - is only computed for the labels set in [0, ..., vocab_size] - - Outputs: - if `masked_lm_labels` is not `None`: - Outputs the masked language modeling loss. - if `masked_lm_labels` is `None`: - Outputs the masked language modeling logits of shape [batch_size, sequence_length, vocab_size]. - - Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - - model = BertForMaskedLM(config) - masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask) - ``` - """ - def __init__(self, config): - self.bert = BertModel(config) - self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight) - self.vocab_size=config.vocab_size - - def __call__(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None): - sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask) - prediction_scores = self.cls(sequence_output) - - return_op = [prediction_scores] - if masked_lm_labels is not None: - ''' - masked_lm_labels: [batch_size, seq_len, vocab_size], one hot form, masked places are filled with 0 - prediction_scores: [batch_size, seq_len, vocab_size] - - masked_lm_loss: [batch_size*seq_len] - ''' - masked_lm_loss = ht.softmaxcrossentropy_sparse_op(prediction_scores, masked_lm_labels, ignored_index=-1) - return_op += [masked_lm_loss] - - return return_op - - -class BertForNextSentencePrediction(object): - """BERT model with next sentence prediction head. - This module comprises the BERT model followed by the next sentence classification head. - - Params: - config: a BertConfig class instance with the configuration to build a new model. - - Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size] - with indices selected in [0, 1]. - 0 => next sentence is the continuation, 1 => next sentence is a random sentence. - - Outputs: - if `next_sentence_label` is not `None`: - Outputs the total_loss which is the sum of the masked language modeling loss and the next - sentence classification loss. - if `next_sentence_label` is `None`: - Outputs the next sentence classification logits of shape [batch_size, 2]. 
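    The pre-training losses in this file are computed with `ht.softmaxcrossentropy_sparse_op(..., ignored_index=-1)`; assuming that op skips positions whose label is -1, its effect is roughly the following NumPy sketch (illustrative, not the actual implementation):

    ```python
    import numpy as np

    def sparse_ce_ignore(logits, labels, ignored_index=-1):
        # logits: [N, num_classes]; labels: [N] integer class ids, or ignored_index.
        shifted = logits - logits.max(axis=-1, keepdims=True)
        log_probs = shifted - np.log(np.exp(shifted).sum(axis=-1, keepdims=True))
        losses = np.zeros(labels.shape[0])
        keep = labels != ignored_index
        losses[keep] = -log_probs[np.flatnonzero(keep), labels[keep]]
        return losses                                   # 0.0 at ignored positions

    logits = np.random.randn(4, 30522)                  # four flattened token positions
    labels = np.array([-1, 17, -1, 4096])               # only two positions were masked
    print(sparse_ce_ignore(logits, labels))
    ```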
- - Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - - model = BertForNextSentencePrediction(config) - seq_relationship_logits = model(input_ids, token_type_ids, input_mask) - ``` - """ - def __init__(self, config): - self.bert = BertModel(config) - self.cls = BertOnlyNSPHead(config) - - def __call__(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None): - _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask) - seq_relationship_score = self.cls(pooled_output) - - return_op = [seq_relationship_score] - if next_sentence_label is not None: - ''' - next_sentence_label: [batch_size, 2], one hot form, masked places are filled with 0 - seq_relationship_score: [batch_size, 2] - - next_sentence_loss: [batch_size] - ''' - next_sentence_loss = ht.softmaxcrossentropy_sparse_op(seq_relationship_score, next_sentence_label, ignored_index=-1) - return_op += [next_sentence_loss] - - return return_op - -'''-----------------------------------------------------------------------------------------------''' - - - -''' -Bert Layer utils (Embedding & BerLayerNorm & Dropout & Linear) ---------------------------------------------------------------------------------------------------''' -class Embedding(object): - def __init__(self, num_embeddings, embedding_dim, embedding_name=None, initializer=ht.init.xavier_normal): - self.weight = initializer(name=embedding_name, shape=(num_embeddings, embedding_dim)) - def __call__(self, input_tensor): - return ht.embedding_lookup_op(self.weight, input_tensor) - -class BertLayerNorm(object): - def __init__(self, hidden_size, eps=1e-12): - self.eps=eps - self.scale = ht.init.ones(name='layer_norm_scale', shape=(hidden_size, )) - self.bias = ht.init.zeros(name='layer_norm_bias', shape=(hidden_size, )) - def __call__(self, input_tensor): - return ht.layer_normalization_op(input_tensor, self.scale, self.bias, eps=self.eps) - -class Dropout(object): - def __init__(self, dropout_prob=None): - self.dropout_prob = dropout_prob - def __call__(self, input_tensor): - if self.dropout_prob is None or self.dropout_prob == 0.0: - return input_tensor - output = ht.dropout_op(input_tensor, 1.0 - self.dropout_prob) - return output - -class Linear(object): - def __init__(self, in_features, out_features, bias=True, activation=None, kernel_initializer=ht.init.xavier_normal, bias_initializer=ht.init.zeros, input_shape=None): - self.bias_flag = bias - self.activation = activation - self.weights = kernel_initializer(name='dense_weights', shape=(in_features, out_features)) - if self.bias_flag: - self.bias = bias_initializer(name='dense_bias', shape=(out_features,)) - self.input_shape=input_shape - self.in_features = in_features - self.out_features = out_features - if self.input_shape is not None and self.input_shape[-1]!=in_features: - print("Specified in_features is not equal to input_shape[-1].") - assert(False) - def __call__(self, input_tensor): - if self.input_shape is not None and len(self.input_shape)!=2: - input_tensor = ht.array_reshape_op(input_tensor, [-1, self.in_features]) - outputs = ht.matmul_op(input_tensor, self.weights) - if self.bias_flag: - outputs = outputs + 
ht.broadcastto_op(self.bias, outputs) - if self.activation is not None: - outputs = self.activation(outputs) - if self.input_shape is not None and len(self.input_shape)!=2: - outputs = ht.array_reshape_op(outputs, self.input_shape[:-1]+[self.out_features]) - return outputs -'''-----------------------------------------------------------------------------------------------''' diff --git a/examples/nlp/bert/load_data.py b/examples/nlp/bert/load_data.py deleted file mode 100644 index 499df1a..0000000 --- a/examples/nlp/bert/load_data.py +++ /dev/null @@ -1,76 +0,0 @@ -import numpy as np - -class DataLoader(object): - def __init__(self, dataset='bookcorpus', doc_num=16000, save_gap=200, batch_size = 1024): - self.data_names = ['input_ids','token_type_ids','attention_mask','masked_lm_labels','next_sentence_label'] - self.data = {'input_ids':[], - 'token_type_ids':[], - 'attention_mask':[], - 'masked_lm_labels':[], - 'next_sentence_label':[]} - self.batch_size=batch_size - self.batch_data = {'input_ids':[], - 'token_type_ids':[], - 'attention_mask':[], - 'masked_lm_labels':[], - 'next_sentence_label':[]} - self.cur_batch_data = {'input_ids':[], - 'token_type_ids':[], - 'attention_mask':[], - 'masked_lm_labels':[], - 'next_sentence_label':[]} - self.load_data(dataset=dataset, doc_num=doc_num, save_gap=save_gap) - - - def load_data(self, dataset='bookcorpus', doc_num=16000, save_gap=200): - print('Loading preprocessed dataset %s...'%dataset) - data_dir = './preprocessed_data/%s/'%dataset - - for i in range(0,doc_num,save_gap): - start, end = i, i+save_gap-1 - if end > doc_num-1: - end = doc_num-1 - range_name = '_%d_%d.npy'%(start,end) - print(start,end) - for data_name in self.data_names: - #print(data_dir+data_name+range_name) - self.data[data_name].append(np.load(data_dir+data_name+range_name)) - - for data_name in self.data_names: - self.data[data_name] = np.concatenate(self.data[data_name],axis=0) - - self.data_len = self.data['input_ids'].shape[0] - print(self.data['input_ids'].shape) - - print('Successfully loaded dataset %s!'%dataset) - - - def make_epoch_data(self): - batch_data = [] - - for i in range(0, self.data_len, self.batch_size): - start = i - end = start + self.batch_size - if end > self.data_len: - end = self.data_len - if end-start != self.batch_size: - break - for data_name in self.data_names: - self.batch_data[data_name].append(self.data[data_name][start:end]) - - self.batch_num = len(self.batch_data['input_ids']) - - def get_batch(self, idx): - if idx >= self.batch_num: - assert False - for data_name in self.data_names: - self.cur_batch_data[data_name] = self.batch_data[data_name][idx] - - return self.cur_batch_data.copy() - - def align(self, arr, length): - ori_len = len(arr) - if length > ori_len: - return arr + [0] * (length - ori_len) - else: - return arr[:length] diff --git a/examples/nlp/bert/processBertData.py b/examples/nlp/bert/processBertData.py deleted file mode 100644 index a331d82..0000000 --- a/examples/nlp/bert/processBertData.py +++ /dev/null @@ -1,293 +0,0 @@ -from datasets import load_dataset -import random -import hetu -import os -import numpy as np - -''' Usage example: - In dir Hetu/examples/nlp/bert/: python processBertData.py -''' - -# https://the-eye.eu/public/AI/pile_preliminary_components/books1.tar.gz - - -class TrainingInstance(object): - """A single training instance (sentence pair).""" - - def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels, - is_random_next): - self.tokens = tokens - self.segment_ids = segment_ids - 
self.is_random_next = is_random_next - self.masked_lm_positions = masked_lm_positions - self.masked_lm_labels = masked_lm_labels - - def __str__(self): - s = "" - s += "tokens: %s\n" % (" ".join( - [str(x) for x in self.tokens])) - s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids])) - s += "is_random_next: %s\n" % self.is_random_next - s += "masked_lm_positions: %s\n" % (" ".join( - [str(x) for x in self.masked_lm_positions])) - s += "masked_lm_labels: %s\n" % (" ".join( - [str(x) for x in self.masked_lm_labels])) - s += "\n" - return s - - def __repr__(self): - return self.__str__() - - -def create_masked_lm_predictions(tokens, masked_lm_prob, - max_predictions_per_seq, vocab_words, rng): - - """Creates the predictions for the masked LM objective.""" - cand_indexes = [] - for (i, token) in enumerate(tokens): - if token == "[CLS]" or token == "[SEP]": - continue - cand_indexes.append(i) - rng.shuffle(cand_indexes) - output_tokens = list(tokens) - num_to_predict = min(max_predictions_per_seq, - max(1, int(round(len(tokens) * masked_lm_prob)))) - masked_lms = [] - for index in cand_indexes: - if len(masked_lms) >= num_to_predict: - break - masked_token = None - # replace with [MASK] at 80%. - if rng.random() < 0.8: - masked_token = "[MASK]" - else: - # keep original at 10%. - if rng.random() < 0.5: - masked_token = tokens[index] - # replace with random word at 10%. - else: - masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)] - output_tokens[index] = masked_token - masked_lms.append([index, tokens[index]]) - - masked_lms.sort(key = lambda x: x[0]) - masked_lm_positions = [] - masked_lm_labels = [] - - for p in masked_lms: - masked_lm_positions.append(p[0]) - masked_lm_labels.append(p[1]) - - return (output_tokens, masked_lm_positions, masked_lm_labels) - - -def create_data_from_document(all_document, doc_id, max_seq_length, short_seq_prob, masked_lm_prob, max_predictions_per_seq, vocab_words, rng): - """ Create Training example for input document """ - document = all_document[doc_id] - max_num_tokens = max_seq_length - 3 # [CLS], [SEP], [SEP] - target_seq_length = max_num_tokens - # generate short sequence at the probility of short_seq_prob - # In order to minimize the mismatch between pre-training and fine-tuning. 
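    A quick usage sketch of `create_masked_lm_predictions` defined above (toy tokens and vocabulary, fixed seed):

    ```python
    import random

    demo_rng = random.Random(12345)
    demo_tokens = ["[CLS]", "the", "cat", "sat", "[SEP]", "it", "was", "happy", "[SEP]"]
    demo_vocab = ["the", "cat", "sat", "dog", "ran", "it", "was", "happy"]

    out_tokens, positions, labels = create_masked_lm_predictions(
        demo_tokens, masked_lm_prob=0.15, max_predictions_per_seq=20,
        vocab_words=demo_vocab, rng=demo_rng)
    # `positions` are the indices chosen for prediction ([CLS]/[SEP] are never chosen),
    # `labels` keep the original tokens at those indices, and each chosen position in
    # `out_tokens` is [MASK] 80% of the time, left unchanged 10%, or a random word 10%.
    ```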
- if rng.random() < short_seq_prob: - target_seq_length = rng.randint(2, max_num_tokens) - instances = [] - current_chunk = [] - current_length = 0 - i = 0 - while i < len(document): - segment = document[i] - current_chunk.append(segment) - current_length += len(segment) - if i == len(document) - 1 or current_length >= target_seq_length: - if current_chunk: - # create sentence A - a_end = 1 - if len(current_chunk) >= 2: - a_end = rng.randint(1, len(current_chunk) - 1) - tokens_a = [] - for j in range(a_end): - tokens_a.extend([current_chunk[j]]) - tokens_b = [] - # Random next - is_random_next = False - if len(current_chunk) == 1 or rng.random() < 0.5: - is_random_next = True - target_b_length = target_seq_length - len(tokens_a) - for _ in range(10): - random_document_index = rng.randint(0, len(all_document) - 1) - if random_document_index != doc_id: - break - #If picked random document is the same as the current document - if random_document_index == doc_id: - is_random_next = False - random_document = all_document[random_document_index] - random_start = rng.randint(0, len(random_document) - 1) - for j in range(random_start, len(random_document)): - tokens_b.extend([random_document[j]]) - if len(tokens_b) >= target_b_length: - break - # We didn't actually use these segments so we "put them back" so - # they don't go to waste. - num_unused_segments = len(current_chunk) - a_end - i -= num_unused_segments - # Actual next - else: - is_random_next = False - for j in range(a_end, len(current_chunk)): - tokens_b.extend([current_chunk[j]]) - truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng) - assert len(tokens_a) >= 1 - assert len(tokens_b) >= 1 - - tokens = [] - segment_ids = [] - tokens.append("[CLS]") - segment_ids.append(0) - for token in tokens_a: - tokens.append(token) - segment_ids.append(0) - - tokens.append("[SEP]") - segment_ids.append(0) - - for token in tokens_b: - tokens.append(token) - segment_ids.append(1) - tokens.append("[SEP]") - segment_ids.append(1) - - (tokens, masked_lm_positions, masked_lm_labels) = create_masked_lm_predictions( - tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng) - - instance = TrainingInstance( - tokens=tokens, - segment_ids=segment_ids, - is_random_next=is_random_next, - masked_lm_positions=masked_lm_positions, - masked_lm_labels=masked_lm_labels) - instances.append(instance) - current_chunk = [] - current_length = 0 - i += 1 - - return instances - -def convert_instances_to_data(instances, tokenizer, max_seq_length): - - num_instances = len(instances) - input_ids_list = np.zeros([num_instances, max_seq_length], dtype="int32") - input_mask_list = np.zeros([num_instances, max_seq_length], dtype="int32") - segment_ids_list = np.zeros([num_instances, max_seq_length], dtype="int32") - masked_lm_labels = np.full([num_instances, max_seq_length],-1, dtype="int32") - next_sentence_labels_list = np.zeros(num_instances, dtype="int32") - - for (idx, instance) in enumerate(instances): - input_ids = tokenizer.convert_tokens_to_ids(instance.tokens) - input_mask = [1] * len(input_ids) - segment_ids = list(instance.segment_ids) - assert len(input_ids) <= max_seq_length - - padding_zero_list = [0]*int(max_seq_length - len(input_ids)) - input_ids += padding_zero_list - input_mask += padding_zero_list - segment_ids += padding_zero_list - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - - masked_lm_positions = list(instance.masked_lm_positions) - masked_lm_ids = 
tokenizer.convert_tokens_to_ids(instance.masked_lm_labels) - - input_ids_list[idx][:] = input_ids - input_mask_list[idx][:] = input_mask - segment_ids_list[idx][:] = segment_ids - masked_lm_labels[idx][masked_lm_positions] = masked_lm_ids - next_sentence_labels_list[idx] = 1 if instance.is_random_next else 0 - - return input_ids_list, input_mask_list, segment_ids_list, masked_lm_labels, next_sentence_labels_list - -def create_pretrain_data(dataset, tokenizer, max_seq_length, short_seq_prob, masked_lm_prob, max_predictions_per_seq, rng): - documents, all_data = [], [[],[],[],[],[]] - vocab_words = list(tokenizer.vocab.keys()) - - save_path='./preprocessed_data/bookcorpus/' - if not os.path.exists(save_path): - os.makedirs(save_path) - - for i in range(dataset['train'].shape[0]): - tokens = tokenizer.tokenize(dataset['train'][i]['text']) - documents.append(tokens) - instance = create_data_from_document(documents, i,\ - max_seq_length, short_seq_prob, masked_lm_prob, - max_predictions_per_seq, vocab_words, rng) - data = convert_instances_to_data(instance, tokenizer, max_seq_length) - print(i, len(tokens), len(instance)) - for j in range(5): - all_data[j].append(data[j]) - - save_gap=200 - if (i+1)%save_gap==0 and i: - input_ids_list, input_mask_list, segment_ids_list, masked_lm_labels, next_sentence_labels_list = [np.concatenate(all_data[j],axis=0) for j in range(5)] - print('Saving data from %d to %d: doc_num = %d, input_ids_shape ='%(i+1-save_gap,i, i+1), input_ids_list.shape) - save_data(input_ids_list, input_mask_list, segment_ids_list, masked_lm_labels, next_sentence_labels_list, name='_%d_%d'%(i+1-save_gap,i)) - all_data = [[],[],[],[],[]] - if i == dataset['train'].shape[0]-1: - input_ids_list, input_mask_list, segment_ids_list, masked_lm_labels, next_sentence_labels_list = [np.concatenate(all_data[j],axis=0) for j in range(5)] - print('Saving data from %d to %d: doc_num = %d, input_ids_shape ='%(save_gap*int(i/save_gap),i, i+1), input_ids_list.shape) - save_data(input_ids_list, input_mask_list, segment_ids_list, masked_lm_labels, next_sentence_labels_list, name='_%d_%d'%(save_gap*int(i/save_gap),i)) - -def save_data(input_ids_list, input_mask_list, segment_ids_list, masked_lm_labels, next_sentence_labels_list,name=''): - save_path='./preprocessed_data/bookcorpus/' - np.save(save_path+'input_ids'+name,np.array(input_ids_list)) - np.save(save_path+'token_type_ids'+name,np.array(segment_ids_list)) - np.save(save_path+'attention_mask'+name,np.array(input_mask_list)) - np.save(save_path+'masked_lm_labels'+name,np.array(masked_lm_labels)) - np.save(save_path+'next_sentence_label'+name,np.array(next_sentence_labels_list)) - -def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng): - """Truncates a pair of sequences to a maximum sequence length.""" - while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_num_tokens: - break - - trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b - assert len(trunc_tokens) >= 1 - - #add more randomness and avoid biases. 
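    The shards written by `save_data` above use the `<field>_<start>_<end>.npy` naming that `DataLoader` in load_data.py reads back; a loading sketch (paths illustrative, assuming a shard named `_0_199` exists):

    ```python
    import numpy as np

    base = './preprocessed_data/bookcorpus/'
    input_ids = np.load(base + 'input_ids_0_199.npy')
    masked_lm_labels = np.load(base + 'masked_lm_labels_0_199.npy')
    assert input_ids.shape == masked_lm_labels.shape   # both [num_instances, max_seq_length]
    ```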
- if rng.random() < 0.5: - del trunc_tokens[0] - else: - trunc_tokens.pop() - -def show_dataset_detail(dataset): - print(dataset.shape) - print(dataset.column_names) - print(dataset['train'].features) - print(dataset['train'][0]['text']) - -if __name__ == "__main__": - max_seq_length = 512 - do_lower_case = True - short_seq_prob = 0.1 - masked_lm_prob = 0.15 - max_predictions_per_seq = 20 - - vocab_path = "./datasets/bert-base-uncased-vocab.txt" - dataset = load_dataset('../bookcorpus', cache_dir = "./cached_data") - - print("total number of documents {} ".format(dataset['train'].shape[0])) - random_seed = 123 - rng = random.Random(random_seed) - tokenizer = hetu.BertTokenizer(vocab_file=vocab_path, do_lower_case = do_lower_case) - - print("vocab_size =",len(tokenizer.vocab)) - print("max_seq_len =", max_seq_length) - - create_pretrain_data(dataset, tokenizer, max_seq_length, short_seq_prob, masked_lm_prob, max_predictions_per_seq, rng) - - - - - diff --git a/examples/nlp/bert/train_hetu_bert.py b/examples/nlp/bert/train_hetu_bert.py deleted file mode 100644 index e116cf2..0000000 --- a/examples/nlp/bert/train_hetu_bert.py +++ /dev/null @@ -1,87 +0,0 @@ -from tqdm import tqdm -import os -import math -import logging -import hetu as ht -from hetu_bert import BertForPreTraining -from bert_config import BertConfig -from load_data import DataLoader -import numpy as np -import time - -''' Usage example: - In dir Hetu/examples/nlp/bert/: python train_hetu_bert.py -''' - -device_id=6 -executor_ctx = ht.gpu(device_id) - -num_epochs = 1 -lr = 1e-4 - -config = BertConfig(vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - max_position_embeddings=512, - #attention_probs_dropout_prob=0.0, - #hidden_dropout_prob=0.0, - batch_size=6) - -model = BertForPreTraining(config=config) - -batch_size = config.batch_size -seq_len = config.max_position_embeddings -vocab_size = config.vocab_size - -dataloader = DataLoader(dataset='bookcorpus', doc_num=200, save_gap=200, batch_size = batch_size) -data_names = ['input_ids','token_type_ids','attention_mask','masked_lm_labels','next_sentence_label'] - -input_ids = ht.Variable(name='input_ids', trainable=False) -token_type_ids = ht.Variable(name='token_type_ids', trainable=False) -attention_mask = ht.Variable(name='attention_mask', trainable=False) - -masked_lm_labels = ht.Variable(name='masked_lm_labels_one_hot', trainable=False) -next_sentence_label = ht.Variable(name='next_sentence_label_one_hot', trainable=False) - -loss_position_sum = ht.Variable(name='loss_position_sum', trainable=False) - -_,_, masked_lm_loss, next_sentence_loss = model(input_ids, token_type_ids, attention_mask, masked_lm_labels, next_sentence_label) - -masked_lm_loss_mean = ht.div_op(ht.reduce_sum_op(masked_lm_loss, [0,1]), loss_position_sum) -next_sentence_loss_mean = ht.reduce_mean_op(next_sentence_loss, [0]) - -loss = masked_lm_loss_mean + next_sentence_loss_mean -#opt = optimizer.AdamOptimizer(learning_rate=lr, beta1=0.9, beta2=0.999, epsilon=1e-8) -opt = ht.optim.SGDOptimizer(learning_rate=lr) -train_op = opt.minimize(loss) - -executor = ht.Executor([masked_lm_loss_mean, next_sentence_loss_mean, loss, train_op],ctx=executor_ctx,dynamic_memory=True) - - -dataloader.make_epoch_data() -for ep in range(num_epochs): - for i in range(dataloader.batch_num): - batch_data = dataloader.get_batch(i) - - feed_dict = { - input_ids: batch_data['input_ids'], - token_type_ids: batch_data['token_type_ids'], - attention_mask: 
batch_data['attention_mask'], - masked_lm_labels: batch_data['masked_lm_labels'], - next_sentence_label: batch_data['next_sentence_label'], - loss_position_sum: np.array([np.where(batch_data['masked_lm_labels'].reshape(-1)!=-1)[0].shape[0]]), - } - - start_time = time.time() - results = executor.run(feed_dict = feed_dict) - end_time = time.time() - - masked_lm_loss_mean_out = results[0].asnumpy() - next_sentence_loss_mean_out = results[1].asnumpy() - loss_out = results[2].asnumpy() - - print('[Epoch %d] (Iteration %d): Loss = %.3f, MLM_loss = %.3f, NSP_loss = %.6f, Time = %.3f'%(ep,i,loss_out, masked_lm_loss_mean_out, next_sentence_loss_mean_out, end_time-start_time)) - -
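For reference, `loss_position_sum` in the feed dict above simply counts the masked positions, so the summed MLM loss becomes a mean over masked tokens only; in isolation:

```python
import numpy as np

masked_lm_labels = np.array([[-1,  5, -1],
                             [-1, -1,  7]])        # -1 marks unmasked positions
loss_position_sum = np.array(
    [np.where(masked_lm_labels.reshape(-1) != -1)[0].shape[0]])
print(loss_position_sum)    # [2] -> masked_lm_loss_mean = sum(masked_lm_loss) / 2
```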