From d177a8239e3b7f2ae67d5d5fad40335d37e66721 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=A2=81=E5=AE=87=E8=BD=A9?= <942738126@qq.com> Date: Mon, 8 Nov 2021 21:37:28 +0800 Subject: [PATCH] Remove the BERT example from examples/nlp/bert --- examples/nlp/bert/bert_config.py | 62 --- examples/nlp/bert/hetu_bert.py | 749 --------------------------- examples/nlp/bert/load_data.py | 76 --- examples/nlp/bert/processBertData.py | 293 ----------- examples/nlp/bert/train_hetu_bert.py | 87 ---- 5 files changed, 1267 deletions(-) delete mode 100644 examples/nlp/bert/bert_config.py delete mode 100644 examples/nlp/bert/hetu_bert.py delete mode 100644 examples/nlp/bert/load_data.py delete mode 100644 examples/nlp/bert/processBertData.py delete mode 100644 examples/nlp/bert/train_hetu_bert.py diff --git a/examples/nlp/bert/bert_config.py b/examples/nlp/bert/bert_config.py deleted file mode 100644 index 6707662..0000000 --- a/examples/nlp/bert/bert_config.py +++ /dev/null @@ -1,62 +0,0 @@ -''' -BERT Config: ---------------------------------------------------------------------------------------------------''' -class BertConfig(object): - """Configuration class to store the configuration of a `BertModel`. - """ - def __init__(self, - vocab_size, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="relu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - output_hidden_states=False, - batch_size=100, - ): - """Constructs BertConfig. - - Args: - vocab_size: Vocabulary size of `input_ids` in `BertModel`. - hidden_size: Size of the encoder layers and the pooler layer. - num_hidden_layers: Number of hidden layers in the Transformer encoder. - num_attention_heads: Number of attention heads for each attention layer in - the Transformer encoder. - intermediate_size: The size of the "intermediate" (i.e., feed-forward) - layer in the Transformer encoder. - hidden_act: The non-linear activation function (function or string) in the - encoder and pooler. If string, "gelu", "relu" and "swish" are supported. - hidden_dropout_prob: The dropout probability for all fully connected - layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob: The dropout ratio for the attention - probabilities. - max_position_embeddings: The maximum sequence length that this model might - ever be used with. Typically set this to something large just in case - (e.g., 512 or 1024 or 2048). - type_vocab_size: The vocabulary size of the `token_type_ids` passed into - `BertModel`. - initializer_range: The stddev of the truncated_normal_initializer for - initializing all weight matrices.
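        Example usage (a sketch; the values mirror those used in `train_hetu_bert.py`):

        ```python
        from bert_config import BertConfig

        # A BERT-base sized configuration. batch_size is part of the config
        # because the model builds its reshape ops with static shapes.
        config = BertConfig(vocab_size=30522,
                            hidden_size=768,
                            num_hidden_layers=12,
                            num_attention_heads=12,
                            intermediate_size=3072,
                            max_position_embeddings=512,
                            batch_size=6)
        ```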
- """ - - self.vocab_size = vocab_size - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.output_hidden_states = output_hidden_states - self.batch_size = batch_size - - -'''-----------------------------------------------------------------------------------------------''' diff --git a/examples/nlp/bert/hetu_bert.py b/examples/nlp/bert/hetu_bert.py deleted file mode 100644 index 67343fc..0000000 --- a/examples/nlp/bert/hetu_bert.py +++ /dev/null @@ -1,749 +0,0 @@ -import hetu as ht -import numpy as np - -''' -Bert Module Architecture & Input/Output Tensor Size - -BertModel Inputs: - input_ids: [batch_size, seq_len], word token indices in the vocabulary - -BertModel Outputs: - sequence_output: [batch_size, seq_len, hidden_size] (from BertEncoder) - pooled_output: [batch_size, hidden_size] (from BertPooler) - -BertModel: - --[batch_size, seq_len]-- - BertEmbeddings: - Embedding(word/position/token_type) - LayerNorm - Dropout - --[batch_size, seq_len, hidden_size]-- - - --[batch_size, seq_len, hidden_size]-- - BertEncoder: - BertLayer(num_hidden_layers): - BertAttention: - BertSelfAttention - --[batch_size, seq_len, hidden_size]-- - BertSelfOutput: - Linear - Dropout - Add & LayerNorm - - --[batch_size, seq_len, hidden_size]-- - BertIntermediate: - Linear + Act(gule) - --[batch_size, seq_len, intermediate_size]-- - BertOutput: - Linear - Dropout - Add & LayerNorm - --[batch_size, seq_len, hidden_size]-- - - --[batch_size, seq_len, hidden_size]-- - BertPooler: - (Slice, select [cls]) - --[batch_size, hidden_size]-- - Linear + Act(Tanh) - --[batch_size, hidden_size]-- - -Bert -''' - - -''' -BertEmbeddings: ---------------------------------------------------------------------------------------------------''' -class BertEmbeddings(object): - """Construct the embeddings from word, position and token_type embeddings. 
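    As a plain-NumPy shape sketch (not the Hetu ops used below), the three lookups are summed elementwise:

    ```python
    import numpy as np

    # Toy sizes; the real values come from BertConfig.
    batch_size, seq_len, hidden_size = 2, 4, 8
    vocab_size, type_vocab_size, max_positions = 30522, 2, 512

    word_table = np.random.randn(vocab_size, hidden_size)
    position_table = np.random.randn(max_positions, hidden_size)
    token_type_table = np.random.randn(type_vocab_size, hidden_size)

    input_ids = np.random.randint(0, vocab_size, (batch_size, seq_len))
    token_type_ids = np.zeros((batch_size, seq_len), dtype=np.int64)
    position_ids = np.arange(seq_len).reshape(1, -1).repeat(batch_size, axis=0)

    embeddings = (word_table[input_ids]
                  + position_table[position_ids]
                  + token_type_table[token_type_ids])
    assert embeddings.shape == (batch_size, seq_len, hidden_size)
    # LayerNorm and dropout are then applied to this sum.
    ```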
- """ - def __init__(self, config): - self.seq_len = config.max_position_embeddings - self.batch_size = config.batch_size - - self.word_embeddings = Embedding(config.vocab_size, config.hidden_size, "word_embeddings") - self.position_embeddings = Embedding(config.max_position_embeddings, config.hidden_size, 'position_embeddings') - self.token_type_embeddings = Embedding(config.type_vocab_size, config.hidden_size, 'token_type_embeddings') - - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) - self.dropout = Dropout(config.hidden_dropout_prob) - - def __call__(self, input_ids, token_type_ids): - ''' - inputs: - input_ids: [batch_size, seq_len] - token_type_ids: [batch_size, seq_len] - - outputs: - embeddings: [batch_size, seq_len, hidden_size] - ''' - seq_length= self.seq_len - batch_size = self.batch_size - position_ids = ht.Variable('position_ids', value=np.arange(seq_length).reshape((1,-1)).repeat(batch_size,axis=0), dtype=np.long, trainable=False, ctx=input_ids.ctx) - - - '''Embedding Size - inputs_id:[batch_size, seq_len], embedding_table:[vocab_size, hidden_size] - position_ids:[batch_size, seq_len], embedding_table:[seq_len, hidden_size] - token_type_ids:[batch_size, seq_len], embedding_tabel:[type_vocab_size, hidden_size] - --> embeddings: [batch_size, seq_len, hidden_size] - ''' - words_embeddings = self.word_embeddings(input_ids) - position_embeddings = self.position_embeddings(position_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) - - embeddings = words_embeddings + position_embeddings + token_type_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings) - return embeddings -'''-----------------------------------------------------------------------------------------------''' - - -''' -BertEncoder & BertLayer: ---------------------------------------------------------------------------------------------------''' -class BertEncoder(object): - def __init__(self, config): - self.output_hidden_states = config.output_hidden_states - self.layer = [BertLayer(config) for _ in range(config.num_hidden_layers)] - - def __call__(self, hidden_states, attention_mask=None): - ''' - inputs: - hidden_states: [batch_size, seq_len, hidden_size] - attention_mask: [batch_size, num_heads, seq_len, seq_len] - outputs: - hidden_states: [batch_size, seq_len, hidden_size] - all_hidden_states: optional, num_hidden_layers * [batch_size, seq_len, hidden_size] - ''' - - for i, layer_module in enumerate(self.layer): - hidden_states = layer_module(hidden_states, attention_mask) - return hidden_states # last-layer hidden state - -class BertLayer(object): - def __init__(self, config): - self.attention = BertAttention(config) - self.intermediate = BertIntermediate(config) - self.output = BertOutput(config) - - def __call__(self, hidden_states, attention_mask): - ''' - inputs: - hidden_states: [batch_size, seq_len, hidden_size] - attention_mask: [batch_size, num_heads, seq_len, seq_len] - outputs: - layer_output: [batch_size, seq_len, hidden_size] - ''' - attention_output = self.attention(hidden_states, attention_mask) - intermediate_output = self.intermediate(attention_output) - layer_output = self.output(intermediate_output, attention_output) - return layer_output -'''-----------------------------------------------------------------------------------------------''' - - -''' -BertAttention & BertSelfAttention & BertSelfOutput ---------------------------------------------------------------------------------------------------''' -class 
BertAttention(object): - def __init__(self, config): - self.self = BertSelfAttention(config) - self.output = BertSelfOutput(config) - - def __call__(self, input_tensor, attention_mask): - ''' - inputs: - input_tensor: [batch_size, seq_len, hidden_size] - attention_mask: [batch_size, num_heads, seq_len, seq_len] - outputs: - attention_output: [batch_size, seq_len, hidden_size] - ''' - self_output = self.self(input_tensor, attention_mask) - attention_output = self.output(self_output, input_tensor) - return attention_output - -class BertSelfAttention(object): - def __init__(self, config): - if config.hidden_size % config.num_attention_heads != 0: - raise ValueError( - "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (config.hidden_size, config.num_attention_heads)) - self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size #all_head_size == hidden_size - self.hidden_size = config.hidden_size - self.seq_len = config.max_position_embeddings - self.batch_size = config.batch_size - - linear_input_shape = [self.batch_size, self.seq_len, self.hidden_size] - self.query = Linear(config.hidden_size, self.all_head_size, input_shape=linear_input_shape) - self.key = Linear(config.hidden_size, self.all_head_size, input_shape=linear_input_shape) - self.value = Linear(config.hidden_size, self.all_head_size, input_shape=linear_input_shape) - - self.dropout = Dropout(config.attention_probs_dropout_prob) - - def transpose_for_scores(self, input_tensor): - output_tensor = ht.array_reshape_op( - input_tensor, [self.batch_size, self.seq_len, self.num_attention_heads, self.attention_head_size]) - output_tensor = ht.transpose_op(output_tensor, [0, 2, 1, 3]) - return output_tensor - - def __call__(self, hidden_states, attention_mask): - ''' - inputs: - hidden_states: [batch_size, seq_len, hidden_size] - attention_mask: [batch_size, 1, 1, seq_len] - outputs: - context_layer: [batch_size, seq_len, hidden_size] - ''' - - # linear transformation - mixed_query_layer = self.query(hidden_states) # [batch_size, seq_len, hidden_size] - mixed_key_layer = self.key(hidden_states) # [batch_size, seq_len, hidden_size] - mixed_value_layer = self.value(hidden_states) # [batch_size, seq_len, hidden_size] - - # transpose - query_layer = self.transpose_for_scores(mixed_query_layer) # [batch_size, num_heads, seq_len, head_size] - key_layer = self.transpose_for_scores(mixed_key_layer) # [batch_size, num_heads, seq_len, head_size] - value_layer = self.transpose_for_scores(mixed_value_layer) # [batch_size, num_heads, seq_len, head_size] - - # score - key_layer_scaled = key_layer * (1.0 / np.sqrt(float(self.attention_head_size))) - attention_scores = ht.batch_matmul_op(query_layer, key_layer_scaled, trans_B=True) # [batch_size, num_heads, seq_len, seq_len] - - # Apply the attention mask is (precomputed for all layers in BertModel forward() function) - attention_scores = attention_scores + ht.broadcastto_op(attention_mask, attention_scores) # [batch_size, num_heads, seq_len, seq_len] - - # Normalize the attention scores to probabilities. - attention_probs = ht.softmax_op(attention_scores) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. 
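        The score, mask, and softmax steps above can be written out for a single head as a plain-NumPy sketch (shapes only, not the Hetu ops used here):

        ```python
        import numpy as np

        def toy_attention(q, k, v, additive_mask):
            # q, k, v: [seq_len, head_size]; additive_mask: [seq_len],
            # 0.0 for real tokens and -10000.0 for padding (see BertModel.__call__).
            scores = q @ k.T / np.sqrt(k.shape[-1]) + additive_mask   # [seq_len, seq_len]
            scores = scores - scores.max(axis=-1, keepdims=True)      # numerical stability
            probs = np.exp(scores)
            probs = probs / probs.sum(axis=-1, keepdims=True)         # softmax over the keys
            return probs @ v                                          # [seq_len, head_size]

        q = k = v = np.random.randn(4, 16)
        mask = np.array([0.0, 0.0, 0.0, -10000.0])                    # last position is padding
        assert toy_attention(q, k, v, mask).shape == (4, 16)
        ```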
- attention_probs = self.dropout(attention_probs) - - context_layer = ht.batch_matmul_op(attention_probs, value_layer) # [batch_size, num_heads, seq_len, head_size] - context_layer = ht.transpose_op(context_layer, [0, 2, 1, 3]) # [batch_size, seq_len, num_heads, head_size] - context_layer = ht.array_reshape_op(context_layer, [-1, self.seq_len, self.all_head_size]) # [batch_size, seq_len, hidden_size] - return context_layer - -class BertSelfOutput(object): - def __init__(self, config): - linear_input_shape = [config.batch_size, config.max_position_embeddings, config.hidden_size] - self.dense = Linear(config.hidden_size, config.hidden_size, input_shape=linear_input_shape) - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) - self.dropout = Dropout(config.hidden_dropout_prob) - - def __call__(self, hidden_states, input_tensor): - ''' - inputs: - hidden_states: [batch_size, seq_len, hidden_size] - input_tensor: [batch_size, seq_len, hidden_size] - outputs: - hidden_states: [batch_size, seq_len, hidden_size] - ''' - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states -'''-----------------------------------------------------------------------------------------------''' - - -''' -BertIntermediate & BertOutput (2-layer FeedForward) ---------------------------------------------------------------------------------------------------''' -class BertIntermediate(object): - def __init__(self, config): - if config.hidden_act == "relu": - self.intermediate_act_fn = ht.relu_op - elif config.hidden_act == "gelu": - self.intermediate_act_fn = ht.gelu_op - print("Gelu activation is not implemented yet.") - assert(False) - linear_input_shape = [config.batch_size, config.max_position_embeddings, config.hidden_size] - self.dense = Linear(config.hidden_size, config.intermediate_size, activation = self.intermediate_act_fn, input_shape=linear_input_shape) - - def __call__(self, hidden_states): - ''' - inputs: - hidden_states: [batch_size, seq_len, hidden_size] - outputs: - hidden_states: [batch_size, seq_len, intermediate_size] - ''' - hidden_states = self.dense(hidden_states) - return hidden_states - -class BertOutput(object): - def __init__(self, config): - linear_input_shape = [config.batch_size, config.max_position_embeddings, config.intermediate_size] - self.dense = Linear(config.intermediate_size, config.hidden_size, input_shape=linear_input_shape) - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) - self.dropout = Dropout(config.hidden_dropout_prob) - - def __call__(self, hidden_states, input_tensor): - ''' - inputs: - hidden_states: [batch_size, seq_len, intermediate_size] - outputs: - hidden_states: [batch_size, seq_len, hidden_size] - ''' - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states -'''-----------------------------------------------------------------------------------------------''' - - -''' -BertPooler ---------------------------------------------------------------------------------------------------''' -class BertPooler(object): - def __init__(self, config): - self.dense = Linear(config.hidden_size, config.hidden_size, activation = ht.tanh_op) - self.batch_size = config.batch_size - self.hidden_size = config.hidden_size - def __call__(self, hidden_states): - ''' - inputs: - hidden_states: [batch_size, seq_len, hidden_size] - 
outputs: - pooled_output: [batch_size, hidden_size] - ''' - first_token_tensor = ht.slice_op(hidden_states,(0,0,0),(self.batch_size,1,self.hidden_size)) - first_token_tensor = ht.array_reshape_op(first_token_tensor, [self.batch_size, self.hidden_size]) - pooled_output = self.dense(first_token_tensor) - return pooled_output -'''-----------------------------------------------------------------------------------------------''' - -''' -Bert Downstream Heads ---------------------------------------------------------------------------------------------------''' -class BertPredictionHeadTransform(object): - def __init__(self, config): - if config.hidden_act == "relu": - self.hidden_act = ht.relu_op - elif config.hidden_act == "gelu": - self.hidden_act = ht.gelu_op - print("Gelu activation is not implemented yet.") - assert(False) - linear_input_shape = [config.batch_size, config.max_position_embeddings, config.hidden_size] - self.dense_act = Linear(config.hidden_size, config.hidden_size, activation=self.hidden_act, input_shape=linear_input_shape) - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) - - def __call__(self, hidden_states): - ''' - inputs: - hidden_states: [batch_size, seq_len, hidden_size] - outputs: - hidden_states: [batch_size, seq_len, hidden_size] - ''' - hidden_states = self.dense_act(hidden_states) - hidden_states = self.LayerNorm(hidden_states) - return hidden_states - -class BertLMPredictionHead(object): - def __init__(self, config, bert_model_embedding_weights): - ''' - bert_model_embedding_weights: [vocab_size, hidden_size] - ''' - self.transform = BertPredictionHeadTransform(config) - - linear_input_shape = [config.batch_size, config.max_position_embeddings, config.hidden_size] - self.decoder = Linear(config.hidden_size, config.vocab_size, bias_initializer=ht.init.zeros,input_shape=linear_input_shape) - self.decoder.weights = ht.transpose_op(bert_model_embedding_weights) - - def __call__(self, hidden_states): - ''' - inputs: - hidden_states: [batch_size, seq_len, hidden_size] - outputs: - hidden_states: [batch_size, seq_len, vocab_size] - ''' - hidden_states = self.transform(hidden_states) - hidden_states = self.decoder(hidden_states) - return hidden_states - - -class BertOnlyMLMHead(object): - def __init__(self, config, bert_model_embedding_weights): - self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) - - def __call__(self, sequence_output): - ''' - inputs: - sequence_output: [batch_size, seq_len, hidden_size] - outputs: - prediction_scores: [batch_size, seq_len, vocab_size] - ''' - prediction_scores = self.predictions(sequence_output) - return prediction_scores - - -class BertOnlyNSPHead(object): - def __init__(self, config): - self.seq_relationship = Linear(config.hidden_size, 2) - - def __call__(self, pooled_output): - ''' - inputs: - pooled_output: [batch_size, hidden_size] - outputs: - seq_relationship_score: [batch_size, 2] - ''' - seq_relationship_score = self.seq_relationship(pooled_output) - return seq_relationship_score - - -class BertPreTrainingHeads(object): - def __init__(self, config, bert_model_embedding_weights): - self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) - self.seq_relationship = Linear(config.hidden_size, 2) - - def __call__(self, sequence_output, pooled_output): - ''' - inputs: - sequence_output: [batch_size, seq_len, hidden_size] - pooled_output: [batch_size, hidden_size] - outputs: - prediction_scores: [batch_size, seq_len, vocab_size] - seq_relationship_score: [batch_size, 2] 
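        `prediction_scores` come from `BertLMPredictionHead`, whose decoder reuses the transposed word-embedding table (weight tying); as a NumPy shape sketch with toy sizes:

        ```python
        import numpy as np

        vocab_size, hidden_size, batch_size, seq_len = 1000, 8, 2, 4
        embedding_table = np.random.randn(vocab_size, hidden_size)    # word_embeddings.weight
        sequence_output = np.random.randn(batch_size, seq_len, hidden_size)

        decoder_weight = embedding_table.T                            # [hidden_size, vocab_size]
        prediction_scores = sequence_output @ decoder_weight          # [batch, seq, vocab]
        assert prediction_scores.shape == (batch_size, seq_len, vocab_size)
        ```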
- ''' - prediction_scores = self.predictions(sequence_output) - seq_relationship_score = self.seq_relationship(pooled_output) - return prediction_scores, seq_relationship_score - -'''-----------------------------------------------------------------------------------------------''' - - -''' -BertModel: ---------------------------------------------------------------------------------------------------''' -class BertModel(object): - """BERT model ("Bidirectional Embedding Representations from a Transformer"). - - Params: - config: a BertConfig class instance with the configuration to build a new model - - Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - - Outputs: Tuple of (encoded_layers, pooled_output) - `encoded_layers`: controled by `output_all_encoded_layers` argument: - - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end - of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each - encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size], - - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding - to the last attention block of shape [batch_size, sequence_length, hidden_size], - `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a - classifier pretrained on top of the hidden state associated to the first character of the - input (`CLS`) to train on the Next-Sentence task (see BERT's paper). 
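    The `attention_mask` above is a 0/1 mask derived from the real sequence lengths; inside `BertModel.__call__` it is turned into an additive bias on the attention scores. A small sketch:

    ```python
    import numpy as np

    seq_len = 3
    lengths = [3, 2]                 # real (non-padding) token counts per sequence
    attention_mask = np.array([[1] * n + [0] * (seq_len - n) for n in lengths],
                              dtype=np.float32)
    # -> [[1, 1, 1], [1, 1, 0]], the input_mask layout used in the example below

    additive_mask = (attention_mask - 1.0) * 10000.0
    # -> [[0, 0, 0], [0, 0, -10000]]; added to the attention scores before softmax
    ```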
- - Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - - config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - - model = modeling.BertModel(config=config) - all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) - ``` - """ - def __init__(self, config): - self.embeddings = BertEmbeddings(config) - self.encoder = BertEncoder(config) - self.pooler = BertPooler(config) - self.batch_size=config.batch_size - self.seq_len=config.max_position_embeddings - - def __call__(self, input_ids, token_type_ids, attention_mask): - extended_attention_mask = ht.array_reshape_op(attention_mask, [self.batch_size, 1, 1, self.seq_len]) - extended_attention_mask = (extended_attention_mask+(-1.0)) * 10000.0 - - embedding_output = self.embeddings(input_ids, token_type_ids) - sequence_output = self.encoder(embedding_output, extended_attention_mask) - pooled_output = self.pooler(sequence_output) - - return sequence_output, pooled_output - -'''-----------------------------------------------------------------------------------------------''' - - -''' -BertForPreTraining: ---------------------------------------------------------------------------------------------------''' -class BertForPreTraining(object): - """BERT model with pre-training heads. - This module comprises the BERT model followed by the two pre-training heads: - - the masked language modeling head, and - - the next sentence classification head. - - Params: - config: a BertConfig class instance with the configuration to build a new model. - - Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - `masked_lm_labels`: optional masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] - with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss - is only computed for the labels set in [0, ..., vocab_size] - `next_sentence_label`: optional next sentence classification loss: torch.LongTensor of shape [batch_size] - with indices selected in [0, 1]. - 0 => next sentence is the continuation, 1 => next sentence is a random sentence. - - Outputs: - if `masked_lm_labels` and `next_sentence_label` are not `None`: - Outputs the total_loss which is the sum of the masked language modeling loss and the next - sentence classification loss. 
- if `masked_lm_labels` or `next_sentence_label` is `None`: - Outputs a tuple comprising - - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and - - the next sentence classification logits of shape [batch_size, 2]. - - Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - - model = BertForPreTraining(config) - masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask) - ``` - """ - - def __init__(self, config): - self.bert = BertModel(config) - self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight) - - self.vocab_size=config.vocab_size - - def __call__(self, input_ids, token_type_ids, attention_mask, masked_lm_labels=None, next_sentence_label=None): - sequence_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask) - prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) - - return_op = [prediction_scores, seq_relationship_score] - if masked_lm_labels is not None and next_sentence_label is not None: - ''' - masked_lm_labels: [batch_size, seq_len, vocab_size], one hot form, masked places are filled with 0 - prediction_scores: [batch_size, seq_len, vocab_size] - next_sentence_label: [batch_size, 2], one hot form, masked places are filled with 0 - seq_relationship_score: [batch_size, 2] - - masked_lm_loss: [batch_size*seq_len] - next_sentence_loss: [batch_size] - ''' - - masked_lm_loss = ht.softmaxcrossentropy_sparse_op(prediction_scores, masked_lm_labels, ignored_index=-1) - next_sentence_loss = ht.softmaxcrossentropy_sparse_op(seq_relationship_score, next_sentence_label, ignored_index=-1) - - return_op += [masked_lm_loss, next_sentence_loss] - return return_op - - -class BertForMaskedLM(object): - """BERT model with the masked language modeling head. - This module comprises the BERT model followed by the masked language modeling head. - - Params: - config: a BertConfig class instance with the configuration to build a new model. - - Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] - with indices selected in [-1, 0, ..., vocab_size]. 
All labels set to -1 are ignored (masked), the loss - is only computed for the labels set in [0, ..., vocab_size] - - Outputs: - if `masked_lm_labels` is not `None`: - Outputs the masked language modeling loss. - if `masked_lm_labels` is `None`: - Outputs the masked language modeling logits of shape [batch_size, sequence_length, vocab_size]. - - Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - - model = BertForMaskedLM(config) - masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask) - ``` - """ - def __init__(self, config): - self.bert = BertModel(config) - self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight) - self.vocab_size=config.vocab_size - - def __call__(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None): - sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask) - prediction_scores = self.cls(sequence_output) - - return_op = [prediction_scores] - if masked_lm_labels is not None: - ''' - masked_lm_labels: [batch_size, seq_len, vocab_size], one hot form, masked places are filled with 0 - prediction_scores: [batch_size, seq_len, vocab_size] - - masked_lm_loss: [batch_size*seq_len] - ''' - masked_lm_loss = ht.softmaxcrossentropy_sparse_op(prediction_scores, masked_lm_labels, ignored_index=-1) - return_op += [masked_lm_loss] - - return return_op - - -class BertForNextSentencePrediction(object): - """BERT model with next sentence prediction head. - This module comprises the BERT model followed by the next sentence classification head. - - Params: - config: a BertConfig class instance with the configuration to build a new model. - - Inputs: - `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] - with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts - `extract_features.py`, `run_classifier.py` and `run_squad.py`) - `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token - types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to - a `sentence B` token (see BERT paper for more details). - `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices - selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max - input sequence length in the current batch. It's the mask that we typically use for attention when - a batch has varying length sentences. - `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size] - with indices selected in [0, 1]. - 0 => next sentence is the continuation, 1 => next sentence is a random sentence. - - Outputs: - if `next_sentence_label` is not `None`: - Outputs the total_loss which is the sum of the masked language modeling loss and the next - sentence classification loss. - if `next_sentence_label` is `None`: - Outputs the next sentence classification logits of shape [batch_size, 2]. 
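    The pre-training losses in this file are computed with `ht.softmaxcrossentropy_sparse_op(..., ignored_index=-1)`; assuming that op skips positions whose label is -1, its effect is roughly the following NumPy sketch (illustrative, not the actual implementation):

    ```python
    import numpy as np

    def sparse_ce_ignore(logits, labels, ignored_index=-1):
        # logits: [N, num_classes]; labels: [N] integer class ids, or ignored_index.
        shifted = logits - logits.max(axis=-1, keepdims=True)
        log_probs = shifted - np.log(np.exp(shifted).sum(axis=-1, keepdims=True))
        losses = np.zeros(labels.shape[0])
        keep = labels != ignored_index
        losses[keep] = -log_probs[np.flatnonzero(keep), labels[keep]]
        return losses                                   # 0.0 at ignored positions

    logits = np.random.randn(4, 30522)                  # four flattened token positions
    labels = np.array([-1, 17, -1, 4096])               # only two positions were masked
    print(sparse_ce_ignore(logits, labels))
    ```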
- - Example usage: - ```python - # Already been converted into WordPiece token ids - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - - config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) - - model = BertForNextSentencePrediction(config) - seq_relationship_logits = model(input_ids, token_type_ids, input_mask) - ``` - """ - def __init__(self, config): - self.bert = BertModel(config) - self.cls = BertOnlyNSPHead(config) - - def __call__(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None): - _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask) - seq_relationship_score = self.cls(pooled_output) - - return_op = [seq_relationship_score] - if next_sentence_label is not None: - ''' - next_sentence_label: [batch_size, 2], one hot form, masked places are filled with 0 - seq_relationship_score: [batch_size, 2] - - next_sentence_loss: [batch_size] - ''' - next_sentence_loss = ht.softmaxcrossentropy_sparse_op(seq_relationship_score, next_sentence_label, ignored_index=-1) - return_op += [next_sentence_loss] - - return return_op - -'''-----------------------------------------------------------------------------------------------''' - - - -''' -Bert Layer utils (Embedding & BerLayerNorm & Dropout & Linear) ---------------------------------------------------------------------------------------------------''' -class Embedding(object): - def __init__(self, num_embeddings, embedding_dim, embedding_name=None, initializer=ht.init.xavier_normal): - self.weight = initializer(name=embedding_name, shape=(num_embeddings, embedding_dim)) - def __call__(self, input_tensor): - return ht.embedding_lookup_op(self.weight, input_tensor) - -class BertLayerNorm(object): - def __init__(self, hidden_size, eps=1e-12): - self.eps=eps - self.scale = ht.init.ones(name='layer_norm_scale', shape=(hidden_size, )) - self.bias = ht.init.zeros(name='layer_norm_bias', shape=(hidden_size, )) - def __call__(self, input_tensor): - return ht.layer_normalization_op(input_tensor, self.scale, self.bias, eps=self.eps) - -class Dropout(object): - def __init__(self, dropout_prob=None): - self.dropout_prob = dropout_prob - def __call__(self, input_tensor): - if self.dropout_prob is None or self.dropout_prob == 0.0: - return input_tensor - output = ht.dropout_op(input_tensor, 1.0 - self.dropout_prob) - return output - -class Linear(object): - def __init__(self, in_features, out_features, bias=True, activation=None, kernel_initializer=ht.init.xavier_normal, bias_initializer=ht.init.zeros, input_shape=None): - self.bias_flag = bias - self.activation = activation - self.weights = kernel_initializer(name='dense_weights', shape=(in_features, out_features)) - if self.bias_flag: - self.bias = bias_initializer(name='dense_bias', shape=(out_features,)) - self.input_shape=input_shape - self.in_features = in_features - self.out_features = out_features - if self.input_shape is not None and self.input_shape[-1]!=in_features: - print("Specified in_features is not equal to input_shape[-1].") - assert(False) - def __call__(self, input_tensor): - if self.input_shape is not None and len(self.input_shape)!=2: - input_tensor = ht.array_reshape_op(input_tensor, [-1, self.in_features]) - outputs = ht.matmul_op(input_tensor, self.weights) - if self.bias_flag: - outputs = outputs + 
ht.broadcastto_op(self.bias, outputs) - if self.activation is not None: - outputs = self.activation(outputs) - if self.input_shape is not None and len(self.input_shape)!=2: - outputs = ht.array_reshape_op(outputs, self.input_shape[:-1]+[self.out_features]) - return outputs -'''-----------------------------------------------------------------------------------------------''' diff --git a/examples/nlp/bert/load_data.py b/examples/nlp/bert/load_data.py deleted file mode 100644 index 499df1a..0000000 --- a/examples/nlp/bert/load_data.py +++ /dev/null @@ -1,76 +0,0 @@ -import numpy as np - -class DataLoader(object): - def __init__(self, dataset='bookcorpus', doc_num=16000, save_gap=200, batch_size = 1024): - self.data_names = ['input_ids','token_type_ids','attention_mask','masked_lm_labels','next_sentence_label'] - self.data = {'input_ids':[], - 'token_type_ids':[], - 'attention_mask':[], - 'masked_lm_labels':[], - 'next_sentence_label':[]} - self.batch_size=batch_size - self.batch_data = {'input_ids':[], - 'token_type_ids':[], - 'attention_mask':[], - 'masked_lm_labels':[], - 'next_sentence_label':[]} - self.cur_batch_data = {'input_ids':[], - 'token_type_ids':[], - 'attention_mask':[], - 'masked_lm_labels':[], - 'next_sentence_label':[]} - self.load_data(dataset=dataset, doc_num=doc_num, save_gap=save_gap) - - - def load_data(self, dataset='bookcorpus', doc_num=16000, save_gap=200): - print('Loading preprocessed dataset %s...'%dataset) - data_dir = './preprocessed_data/%s/'%dataset - - for i in range(0,doc_num,save_gap): - start, end = i, i+save_gap-1 - if end > doc_num-1: - end = doc_num-1 - range_name = '_%d_%d.npy'%(start,end) - print(start,end) - for data_name in self.data_names: - #print(data_dir+data_name+range_name) - self.data[data_name].append(np.load(data_dir+data_name+range_name)) - - for data_name in self.data_names: - self.data[data_name] = np.concatenate(self.data[data_name],axis=0) - - self.data_len = self.data['input_ids'].shape[0] - print(self.data['input_ids'].shape) - - print('Successfully loaded dataset %s!'%dataset) - - - def make_epoch_data(self): - batch_data = [] - - for i in range(0, self.data_len, self.batch_size): - start = i - end = start + self.batch_size - if end > self.data_len: - end = self.data_len - if end-start != self.batch_size: - break - for data_name in self.data_names: - self.batch_data[data_name].append(self.data[data_name][start:end]) - - self.batch_num = len(self.batch_data['input_ids']) - - def get_batch(self, idx): - if idx >= self.batch_num: - assert False - for data_name in self.data_names: - self.cur_batch_data[data_name] = self.batch_data[data_name][idx] - - return self.cur_batch_data.copy() - - def align(self, arr, length): - ori_len = len(arr) - if length > ori_len: - return arr + [0] * (length - ori_len) - else: - return arr[:length] diff --git a/examples/nlp/bert/processBertData.py b/examples/nlp/bert/processBertData.py deleted file mode 100644 index a331d82..0000000 --- a/examples/nlp/bert/processBertData.py +++ /dev/null @@ -1,293 +0,0 @@ -from datasets import load_dataset -import random -import hetu -import os -import numpy as np - -''' Usage example: - In dir Hetu/examples/nlp/bert/: python processBertData.py -''' - -# https://the-eye.eu/public/AI/pile_preliminary_components/books1.tar.gz - - -class TrainingInstance(object): - """A single training instance (sentence pair).""" - - def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels, - is_random_next): - self.tokens = tokens - self.segment_ids = segment_ids - 
self.is_random_next = is_random_next - self.masked_lm_positions = masked_lm_positions - self.masked_lm_labels = masked_lm_labels - - def __str__(self): - s = "" - s += "tokens: %s\n" % (" ".join( - [str(x) for x in self.tokens])) - s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids])) - s += "is_random_next: %s\n" % self.is_random_next - s += "masked_lm_positions: %s\n" % (" ".join( - [str(x) for x in self.masked_lm_positions])) - s += "masked_lm_labels: %s\n" % (" ".join( - [str(x) for x in self.masked_lm_labels])) - s += "\n" - return s - - def __repr__(self): - return self.__str__() - - -def create_masked_lm_predictions(tokens, masked_lm_prob, - max_predictions_per_seq, vocab_words, rng): - - """Creates the predictions for the masked LM objective.""" - cand_indexes = [] - for (i, token) in enumerate(tokens): - if token == "[CLS]" or token == "[SEP]": - continue - cand_indexes.append(i) - rng.shuffle(cand_indexes) - output_tokens = list(tokens) - num_to_predict = min(max_predictions_per_seq, - max(1, int(round(len(tokens) * masked_lm_prob)))) - masked_lms = [] - for index in cand_indexes: - if len(masked_lms) >= num_to_predict: - break - masked_token = None - # replace with [MASK] at 80%. - if rng.random() < 0.8: - masked_token = "[MASK]" - else: - # keep original at 10%. - if rng.random() < 0.5: - masked_token = tokens[index] - # replace with random word at 10%. - else: - masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)] - output_tokens[index] = masked_token - masked_lms.append([index, tokens[index]]) - - masked_lms.sort(key = lambda x: x[0]) - masked_lm_positions = [] - masked_lm_labels = [] - - for p in masked_lms: - masked_lm_positions.append(p[0]) - masked_lm_labels.append(p[1]) - - return (output_tokens, masked_lm_positions, masked_lm_labels) - - -def create_data_from_document(all_document, doc_id, max_seq_length, short_seq_prob, masked_lm_prob, max_predictions_per_seq, vocab_words, rng): - """ Create Training example for input document """ - document = all_document[doc_id] - max_num_tokens = max_seq_length - 3 # [CLS], [SEP], [SEP] - target_seq_length = max_num_tokens - # generate short sequence at the probility of short_seq_prob - # In order to minimize the mismatch between pre-training and fine-tuning. 
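    A quick usage sketch of `create_masked_lm_predictions` defined above (toy tokens and vocabulary, fixed seed):

    ```python
    import random

    demo_rng = random.Random(12345)
    demo_tokens = ["[CLS]", "the", "cat", "sat", "[SEP]", "it", "was", "happy", "[SEP]"]
    demo_vocab = ["the", "cat", "sat", "dog", "ran", "it", "was", "happy"]

    out_tokens, positions, labels = create_masked_lm_predictions(
        demo_tokens, masked_lm_prob=0.15, max_predictions_per_seq=20,
        vocab_words=demo_vocab, rng=demo_rng)
    # `positions` are the indices chosen for prediction ([CLS]/[SEP] are never chosen),
    # `labels` keep the original tokens at those indices, and each chosen position in
    # `out_tokens` is [MASK] 80% of the time, left unchanged 10%, or a random word 10%.
    ```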
- if rng.random() < short_seq_prob: - target_seq_length = rng.randint(2, max_num_tokens) - instances = [] - current_chunk = [] - current_length = 0 - i = 0 - while i < len(document): - segment = document[i] - current_chunk.append(segment) - current_length += len(segment) - if i == len(document) - 1 or current_length >= target_seq_length: - if current_chunk: - # create sentence A - a_end = 1 - if len(current_chunk) >= 2: - a_end = rng.randint(1, len(current_chunk) - 1) - tokens_a = [] - for j in range(a_end): - tokens_a.extend([current_chunk[j]]) - tokens_b = [] - # Random next - is_random_next = False - if len(current_chunk) == 1 or rng.random() < 0.5: - is_random_next = True - target_b_length = target_seq_length - len(tokens_a) - for _ in range(10): - random_document_index = rng.randint(0, len(all_document) - 1) - if random_document_index != doc_id: - break - #If picked random document is the same as the current document - if random_document_index == doc_id: - is_random_next = False - random_document = all_document[random_document_index] - random_start = rng.randint(0, len(random_document) - 1) - for j in range(random_start, len(random_document)): - tokens_b.extend([random_document[j]]) - if len(tokens_b) >= target_b_length: - break - # We didn't actually use these segments so we "put them back" so - # they don't go to waste. - num_unused_segments = len(current_chunk) - a_end - i -= num_unused_segments - # Actual next - else: - is_random_next = False - for j in range(a_end, len(current_chunk)): - tokens_b.extend([current_chunk[j]]) - truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng) - assert len(tokens_a) >= 1 - assert len(tokens_b) >= 1 - - tokens = [] - segment_ids = [] - tokens.append("[CLS]") - segment_ids.append(0) - for token in tokens_a: - tokens.append(token) - segment_ids.append(0) - - tokens.append("[SEP]") - segment_ids.append(0) - - for token in tokens_b: - tokens.append(token) - segment_ids.append(1) - tokens.append("[SEP]") - segment_ids.append(1) - - (tokens, masked_lm_positions, masked_lm_labels) = create_masked_lm_predictions( - tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng) - - instance = TrainingInstance( - tokens=tokens, - segment_ids=segment_ids, - is_random_next=is_random_next, - masked_lm_positions=masked_lm_positions, - masked_lm_labels=masked_lm_labels) - instances.append(instance) - current_chunk = [] - current_length = 0 - i += 1 - - return instances - -def convert_instances_to_data(instances, tokenizer, max_seq_length): - - num_instances = len(instances) - input_ids_list = np.zeros([num_instances, max_seq_length], dtype="int32") - input_mask_list = np.zeros([num_instances, max_seq_length], dtype="int32") - segment_ids_list = np.zeros([num_instances, max_seq_length], dtype="int32") - masked_lm_labels = np.full([num_instances, max_seq_length],-1, dtype="int32") - next_sentence_labels_list = np.zeros(num_instances, dtype="int32") - - for (idx, instance) in enumerate(instances): - input_ids = tokenizer.convert_tokens_to_ids(instance.tokens) - input_mask = [1] * len(input_ids) - segment_ids = list(instance.segment_ids) - assert len(input_ids) <= max_seq_length - - padding_zero_list = [0]*int(max_seq_length - len(input_ids)) - input_ids += padding_zero_list - input_mask += padding_zero_list - segment_ids += padding_zero_list - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - - masked_lm_positions = list(instance.masked_lm_positions) - masked_lm_ids = 
tokenizer.convert_tokens_to_ids(instance.masked_lm_labels) - - input_ids_list[idx][:] = input_ids - input_mask_list[idx][:] = input_mask - segment_ids_list[idx][:] = segment_ids - masked_lm_labels[idx][masked_lm_positions] = masked_lm_ids - next_sentence_labels_list[idx] = 1 if instance.is_random_next else 0 - - return input_ids_list, input_mask_list, segment_ids_list, masked_lm_labels, next_sentence_labels_list - -def create_pretrain_data(dataset, tokenizer, max_seq_length, short_seq_prob, masked_lm_prob, max_predictions_per_seq, rng): - documents, all_data = [], [[],[],[],[],[]] - vocab_words = list(tokenizer.vocab.keys()) - - save_path='./preprocessed_data/bookcorpus/' - if not os.path.exists(save_path): - os.makedirs(save_path) - - for i in range(dataset['train'].shape[0]): - tokens = tokenizer.tokenize(dataset['train'][i]['text']) - documents.append(tokens) - instance = create_data_from_document(documents, i,\ - max_seq_length, short_seq_prob, masked_lm_prob, - max_predictions_per_seq, vocab_words, rng) - data = convert_instances_to_data(instance, tokenizer, max_seq_length) - print(i, len(tokens), len(instance)) - for j in range(5): - all_data[j].append(data[j]) - - save_gap=200 - if (i+1)%save_gap==0 and i: - input_ids_list, input_mask_list, segment_ids_list, masked_lm_labels, next_sentence_labels_list = [np.concatenate(all_data[j],axis=0) for j in range(5)] - print('Saving data from %d to %d: doc_num = %d, input_ids_shape ='%(i+1-save_gap,i, i+1), input_ids_list.shape) - save_data(input_ids_list, input_mask_list, segment_ids_list, masked_lm_labels, next_sentence_labels_list, name='_%d_%d'%(i+1-save_gap,i)) - all_data = [[],[],[],[],[]] - if i == dataset['train'].shape[0]-1: - input_ids_list, input_mask_list, segment_ids_list, masked_lm_labels, next_sentence_labels_list = [np.concatenate(all_data[j],axis=0) for j in range(5)] - print('Saving data from %d to %d: doc_num = %d, input_ids_shape ='%(save_gap*int(i/save_gap),i, i+1), input_ids_list.shape) - save_data(input_ids_list, input_mask_list, segment_ids_list, masked_lm_labels, next_sentence_labels_list, name='_%d_%d'%(save_gap*int(i/save_gap),i)) - -def save_data(input_ids_list, input_mask_list, segment_ids_list, masked_lm_labels, next_sentence_labels_list,name=''): - save_path='./preprocessed_data/bookcorpus/' - np.save(save_path+'input_ids'+name,np.array(input_ids_list)) - np.save(save_path+'token_type_ids'+name,np.array(segment_ids_list)) - np.save(save_path+'attention_mask'+name,np.array(input_mask_list)) - np.save(save_path+'masked_lm_labels'+name,np.array(masked_lm_labels)) - np.save(save_path+'next_sentence_label'+name,np.array(next_sentence_labels_list)) - -def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng): - """Truncates a pair of sequences to a maximum sequence length.""" - while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_num_tokens: - break - - trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b - assert len(trunc_tokens) >= 1 - - #add more randomness and avoid biases. 
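    The shards written by `save_data` above use the `<field>_<start>_<end>.npy` naming that `DataLoader` in load_data.py reads back; a loading sketch (paths illustrative, assuming a shard named `_0_199` exists):

    ```python
    import numpy as np

    base = './preprocessed_data/bookcorpus/'
    input_ids = np.load(base + 'input_ids_0_199.npy')
    masked_lm_labels = np.load(base + 'masked_lm_labels_0_199.npy')
    assert input_ids.shape == masked_lm_labels.shape   # both [num_instances, max_seq_length]
    ```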
- if rng.random() < 0.5: - del trunc_tokens[0] - else: - trunc_tokens.pop() - -def show_dataset_detail(dataset): - print(dataset.shape) - print(dataset.column_names) - print(dataset['train'].features) - print(dataset['train'][0]['text']) - -if __name__ == "__main__": - max_seq_length = 512 - do_lower_case = True - short_seq_prob = 0.1 - masked_lm_prob = 0.15 - max_predictions_per_seq = 20 - - vocab_path = "./datasets/bert-base-uncased-vocab.txt" - dataset = load_dataset('../bookcorpus', cache_dir = "./cached_data") - - print("total number of documents {} ".format(dataset['train'].shape[0])) - random_seed = 123 - rng = random.Random(random_seed) - tokenizer = hetu.BertTokenizer(vocab_file=vocab_path, do_lower_case = do_lower_case) - - print("vocab_size =",len(tokenizer.vocab)) - print("max_seq_len =", max_seq_length) - - create_pretrain_data(dataset, tokenizer, max_seq_length, short_seq_prob, masked_lm_prob, max_predictions_per_seq, rng) - - - - - diff --git a/examples/nlp/bert/train_hetu_bert.py b/examples/nlp/bert/train_hetu_bert.py deleted file mode 100644 index e116cf2..0000000 --- a/examples/nlp/bert/train_hetu_bert.py +++ /dev/null @@ -1,87 +0,0 @@ -from tqdm import tqdm -import os -import math -import logging -import hetu as ht -from hetu_bert import BertForPreTraining -from bert_config import BertConfig -from load_data import DataLoader -import numpy as np -import time - -''' Usage example: - In dir Hetu/examples/nlp/bert/: python train_hetu_bert.py -''' - -device_id=6 -executor_ctx = ht.gpu(device_id) - -num_epochs = 1 -lr = 1e-4 - -config = BertConfig(vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - max_position_embeddings=512, - #attention_probs_dropout_prob=0.0, - #hidden_dropout_prob=0.0, - batch_size=6) - -model = BertForPreTraining(config=config) - -batch_size = config.batch_size -seq_len = config.max_position_embeddings -vocab_size = config.vocab_size - -dataloader = DataLoader(dataset='bookcorpus', doc_num=200, save_gap=200, batch_size = batch_size) -data_names = ['input_ids','token_type_ids','attention_mask','masked_lm_labels','next_sentence_label'] - -input_ids = ht.Variable(name='input_ids', trainable=False) -token_type_ids = ht.Variable(name='token_type_ids', trainable=False) -attention_mask = ht.Variable(name='attention_mask', trainable=False) - -masked_lm_labels = ht.Variable(name='masked_lm_labels_one_hot', trainable=False) -next_sentence_label = ht.Variable(name='next_sentence_label_one_hot', trainable=False) - -loss_position_sum = ht.Variable(name='loss_position_sum', trainable=False) - -_,_, masked_lm_loss, next_sentence_loss = model(input_ids, token_type_ids, attention_mask, masked_lm_labels, next_sentence_label) - -masked_lm_loss_mean = ht.div_op(ht.reduce_sum_op(masked_lm_loss, [0,1]), loss_position_sum) -next_sentence_loss_mean = ht.reduce_mean_op(next_sentence_loss, [0]) - -loss = masked_lm_loss_mean + next_sentence_loss_mean -#opt = optimizer.AdamOptimizer(learning_rate=lr, beta1=0.9, beta2=0.999, epsilon=1e-8) -opt = ht.optim.SGDOptimizer(learning_rate=lr) -train_op = opt.minimize(loss) - -executor = ht.Executor([masked_lm_loss_mean, next_sentence_loss_mean, loss, train_op],ctx=executor_ctx,dynamic_memory=True) - - -dataloader.make_epoch_data() -for ep in range(num_epochs): - for i in range(dataloader.batch_num): - batch_data = dataloader.get_batch(i) - - feed_dict = { - input_ids: batch_data['input_ids'], - token_type_ids: batch_data['token_type_ids'], - attention_mask: 
batch_data['attention_mask'], - masked_lm_labels: batch_data['masked_lm_labels'], - next_sentence_label: batch_data['next_sentence_label'], - loss_position_sum: np.array([np.where(batch_data['masked_lm_labels'].reshape(-1)!=-1)[0].shape[0]]), - } - - start_time = time.time() - results = executor.run(feed_dict = feed_dict) - end_time = time.time() - - masked_lm_loss_mean_out = results[0].asnumpy() - next_sentence_loss_mean_out = results[1].asnumpy() - loss_out = results[2].asnumpy() - - print('[Epoch %d] (Iteration %d): Loss = %.3f, MLM_loss = %.3f, NSP_loss = %.6f, Time = %.3f'%(ep,i,loss_out, masked_lm_loss_mean_out, next_sentence_loss_mean_out, end_time-start_time)) - -
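For reference, `loss_position_sum` in the feed dict above simply counts the masked positions, so the summed MLM loss becomes a mean over masked tokens only; in isolation:

```python
import numpy as np

masked_lm_labels = np.array([[-1,  5, -1],
                             [-1, -1,  7]])        # -1 marks unmasked positions
loss_position_sum = np.array(
    [np.where(masked_lm_labels.reshape(-1) != -1)[0].shape[0]])
print(loss_position_sum)    # [2] -> masked_lm_loss_mean = sum(masked_lm_loss) / 2
```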