
log message

pull/1/head
梁宇轩 4 years ago
parent commit d177a8239e
5 changed files with 0 additions and 1267 deletions
  1. +0 -62  examples/nlp/bert/bert_config.py
  2. +0 -749 examples/nlp/bert/hetu_bert.py
  3. +0 -76  examples/nlp/bert/load_data.py
  4. +0 -293 examples/nlp/bert/processBertData.py
  5. +0 -87  examples/nlp/bert/train_hetu_bert.py

+ 0  - 62  examples/nlp/bert/bert_config.py

@@ -1,62 +0,0 @@
'''
BERT Config:
--------------------------------------------------------------------------------------------------'''
class BertConfig(object):
"""Configuration class to store the configuration of a `BertModel`.
"""
def __init__(self,
vocab_size,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="relu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
output_hidden_states=False,
batch_size=100,
):
"""Constructs BertConfig.

Args:
vocab_size: Vocabulary size of `input_ids` in `BertModel`.
hidden_size: Size of the encoder layers and the pooler layer.
num_hidden_layers: Number of hidden layers in the Transformer encoder.
num_attention_heads: Number of attention heads for each attention layer in
the Transformer encoder.
intermediate_size: The size of the "intermediate" (i.e., feed-forward)
layer in the Transformer encoder.
hidden_act: The non-linear activation function (function or string) in the
encoder and pooler. If a string, only "relu" is currently supported here
("gelu" is not implemented yet).
hidden_dropout_prob: The dropout probability for all fully connected
layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob: The dropout ratio for the attention
probabilities.
max_position_embeddings: The maximum sequence length that this model might
ever be used with. Typically set this to something large just in case
(e.g., 512 or 1024 or 2048).
type_vocab_size: The vocabulary size of the `token_type_ids` passed into
`BertModel`.
initializer_range: The stdev of the truncated_normal_initializer for
initializing all weight matrices.
output_hidden_states: Whether to also return the hidden states of all encoder layers.
batch_size: The batch size used to build the static-shape computation graph.
"""

self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.intermediate_size = intermediate_size
self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size
self.initializer_range = initializer_range
self.output_hidden_states = output_hidden_states
self.batch_size = batch_size


'''-----------------------------------------------------------------------------------------------'''
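For quick reference (not part of the deleted file), a minimal usage sketch of the configuration above; 30522 is the bert-base-uncased vocabulary size used by train_hetu_bert.py, and every other argument keeps its BERT-base default:

```python
from bert_config import BertConfig  # the (deleted) module shown above

config = BertConfig(vocab_size=30522)  # other arguments keep their BERT-base defaults
head_size = config.hidden_size // config.num_attention_heads
assert head_size == 64  # 768 / 12, the per-head size later used by BertSelfAttention
```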

+ 0  - 749  examples/nlp/bert/hetu_bert.py

@@ -1,749 +0,0 @@
import hetu as ht
import numpy as np

'''
Bert Module Architecture & Input/Output Tensor Size

BertModel Inputs:
input_ids: [batch_size, seq_len], word token indices in the vocabulary

BertModel Outputs:
sequence_output: [batch_size, seq_len, hidden_size] (from BertEncoder)
pooled_output: [batch_size, hidden_size] (from BertPooler)

BertModel:
--[batch_size, seq_len]--
BertEmbeddings:
Embedding(word/position/token_type)
LayerNorm
Dropout
--[batch_size, seq_len, hidden_size]--

--[batch_size, seq_len, hidden_size]--
BertEncoder:
BertLayer(num_hidden_layers):
BertAttention:
BertSelfAttention
--[batch_size, seq_len, hidden_size]--
BertSelfOutput:
Linear
Dropout
Add & LayerNorm

--[batch_size, seq_len, hidden_size]--
BertIntermediate:
Linear + Act(relu/gelu)
--[batch_size, seq_len, intermediate_size]--
BertOutput:
Linear
Dropout
Add & LayerNorm
--[batch_size, seq_len, hidden_size]--

--[batch_size, seq_len, hidden_size]--
BertPooler:
(Slice, select [cls])
--[batch_size, hidden_size]--
Linear + Act(Tanh)
--[batch_size, hidden_size]--

Bert
'''
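# Illustrative shapes (not in the original file), assuming the settings used in
# train_hetu_bert.py (batch_size=6, max_position_embeddings=512, hidden_size=768,
# num_attention_heads=12, intermediate_size=3072):
#   embeddings / encoder output: [6, 512, 768]
#   per-head Q/K/V:              [6, 12, 512, 64]    (head_size = 768 / 12 = 64)
#   attention scores / probs:    [6, 12, 512, 512]
#   intermediate (FFN) output:   [6, 512, 3072]
#   pooled output:               [6, 768]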


'''
BertEmbeddings:
--------------------------------------------------------------------------------------------------'''
class BertEmbeddings(object):
"""Construct the embeddings from word, position and token_type embeddings.
"""
def __init__(self, config):
self.seq_len = config.max_position_embeddings
self.batch_size = config.batch_size

self.word_embeddings = Embedding(config.vocab_size, config.hidden_size, "word_embeddings")
self.position_embeddings = Embedding(config.max_position_embeddings, config.hidden_size, 'position_embeddings')
self.token_type_embeddings = Embedding(config.type_vocab_size, config.hidden_size, 'token_type_embeddings')

self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
self.dropout = Dropout(config.hidden_dropout_prob)

def __call__(self, input_ids, token_type_ids):
'''
inputs:
input_ids: [batch_size, seq_len]
token_type_ids: [batch_size, seq_len]

outputs:
embeddings: [batch_size, seq_len, hidden_size]
'''
seq_length = self.seq_len
batch_size = self.batch_size
position_ids = ht.Variable('position_ids', value=np.arange(seq_length).reshape((1,-1)).repeat(batch_size,axis=0), dtype=np.int64, trainable=False, ctx=input_ids.ctx)


'''Embedding Size
inputs_id:[batch_size, seq_len], embedding_table:[vocab_size, hidden_size]
position_ids:[batch_size, seq_len], embedding_table:[seq_len, hidden_size]
token_type_ids:[batch_size, seq_len], embedding_table:[type_vocab_size, hidden_size]
--> embeddings: [batch_size, seq_len, hidden_size]
'''
words_embeddings = self.word_embeddings(input_ids)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)

embeddings = words_embeddings + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
'''-----------------------------------------------------------------------------------------------'''


'''
BertEncoder & BertLayer:
--------------------------------------------------------------------------------------------------'''
class BertEncoder(object):
def __init__(self, config):
self.output_hidden_states = config.output_hidden_states
self.layer = [BertLayer(config) for _ in range(config.num_hidden_layers)]

def __call__(self, hidden_states, attention_mask=None):
'''
inputs:
hidden_states: [batch_size, seq_len, hidden_size]
attention_mask: [batch_size, 1, 1, seq_len] (additive mask, broadcast over heads and query positions)
outputs:
hidden_states: [batch_size, seq_len, hidden_size]
all_hidden_states: optional, num_hidden_layers * [batch_size, seq_len, hidden_size]
'''

for i, layer_module in enumerate(self.layer):
hidden_states = layer_module(hidden_states, attention_mask)
return hidden_states # last-layer hidden state

class BertLayer(object):
def __init__(self, config):
self.attention = BertAttention(config)
self.intermediate = BertIntermediate(config)
self.output = BertOutput(config)

def __call__(self, hidden_states, attention_mask):
'''
inputs:
hidden_states: [batch_size, seq_len, hidden_size]
attention_mask: [batch_size, 1, 1, seq_len]
outputs:
layer_output: [batch_size, seq_len, hidden_size]
'''
attention_output = self.attention(hidden_states, attention_mask)
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
'''-----------------------------------------------------------------------------------------------'''


'''
BertAttention & BertSelfAttention & BertSelfOutput
--------------------------------------------------------------------------------------------------'''
class BertAttention(object):
def __init__(self, config):
self.self = BertSelfAttention(config)
self.output = BertSelfOutput(config)

def __call__(self, input_tensor, attention_mask):
'''
inputs:
input_tensor: [batch_size, seq_len, hidden_size]
attention_mask: [batch_size, 1, 1, seq_len]
outputs:
attention_output: [batch_size, seq_len, hidden_size]
'''
self_output = self.self(input_tensor, attention_mask)
attention_output = self.output(self_output, input_tensor)
return attention_output

class BertSelfAttention(object):
def __init__(self, config):
if config.hidden_size % config.num_attention_heads != 0:
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (config.hidden_size, config.num_attention_heads))
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size #all_head_size == hidden_size
self.hidden_size = config.hidden_size
self.seq_len = config.max_position_embeddings
self.batch_size = config.batch_size

linear_input_shape = [self.batch_size, self.seq_len, self.hidden_size]
self.query = Linear(config.hidden_size, self.all_head_size, input_shape=linear_input_shape)
self.key = Linear(config.hidden_size, self.all_head_size, input_shape=linear_input_shape)
self.value = Linear(config.hidden_size, self.all_head_size, input_shape=linear_input_shape)

self.dropout = Dropout(config.attention_probs_dropout_prob)

def transpose_for_scores(self, input_tensor):
output_tensor = ht.array_reshape_op(
input_tensor, [self.batch_size, self.seq_len, self.num_attention_heads, self.attention_head_size])
output_tensor = ht.transpose_op(output_tensor, [0, 2, 1, 3])
return output_tensor

def __call__(self, hidden_states, attention_mask):
'''
inputs:
hidden_states: [batch_size, seq_len, hidden_size]
attention_mask: [batch_size, 1, 1, seq_len]
outputs:
context_layer: [batch_size, seq_len, hidden_size]
'''

# linear transformation
mixed_query_layer = self.query(hidden_states) # [batch_size, seq_len, hidden_size]
mixed_key_layer = self.key(hidden_states) # [batch_size, seq_len, hidden_size]
mixed_value_layer = self.value(hidden_states) # [batch_size, seq_len, hidden_size]

# transpose
query_layer = self.transpose_for_scores(mixed_query_layer) # [batch_size, num_heads, seq_len, head_size]
key_layer = self.transpose_for_scores(mixed_key_layer) # [batch_size, num_heads, seq_len, head_size]
value_layer = self.transpose_for_scores(mixed_value_layer) # [batch_size, num_heads, seq_len, head_size]

# score
key_layer_scaled = key_layer * (1.0 / np.sqrt(float(self.attention_head_size)))
attention_scores = ht.batch_matmul_op(query_layer, key_layer_scaled, trans_B=True) # [batch_size, num_heads, seq_len, seq_len]

# Apply the attention mask (precomputed for all layers in BertModel.__call__)
attention_scores = attention_scores + ht.broadcastto_op(attention_mask, attention_scores) # [batch_size, num_heads, seq_len, seq_len]

# Normalize the attention scores to probabilities.
attention_probs = ht.softmax_op(attention_scores)

# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs)

context_layer = ht.batch_matmul_op(attention_probs, value_layer) # [batch_size, num_heads, seq_len, head_size]
context_layer = ht.transpose_op(context_layer, [0, 2, 1, 3]) # [batch_size, seq_len, num_heads, head_size]
context_layer = ht.array_reshape_op(context_layer, [-1, self.seq_len, self.all_head_size]) # [batch_size, seq_len, hidden_size]
return context_layer

class BertSelfOutput(object):
def __init__(self, config):
linear_input_shape = [config.batch_size, config.max_position_embeddings, config.hidden_size]
self.dense = Linear(config.hidden_size, config.hidden_size, input_shape=linear_input_shape)
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
self.dropout = Dropout(config.hidden_dropout_prob)

def __call__(self, hidden_states, input_tensor):
'''
inputs:
hidden_states: [batch_size, seq_len, hidden_size]
input_tensor: [batch_size, seq_len, hidden_size]
outputs:
hidden_states: [batch_size, seq_len, hidden_size]
'''
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
'''-----------------------------------------------------------------------------------------------'''
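# Per head, the attention above amounts to (illustrative summary, matching the ops in
# BertSelfAttention):
#   scores  = Q @ K^T / sqrt(head_size)         # [batch, heads, seq, seq]
#   probs   = dropout(softmax(scores + mask))   # mask: 0 for real tokens, -10000 for padding
#   context = probs @ V                         # [batch, heads, seq, head_size]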


'''
BertIntermediate & BertOutput (2-layer FeedForward)
--------------------------------------------------------------------------------------------------'''
class BertIntermediate(object):
def __init__(self, config):
if config.hidden_act == "relu":
self.intermediate_act_fn = ht.relu_op
elif config.hidden_act == "gelu":
self.intermediate_act_fn = ht.gelu_op
print("Gelu activation is not implemented yet.")
assert(False)
linear_input_shape = [config.batch_size, config.max_position_embeddings, config.hidden_size]
self.dense = Linear(config.hidden_size, config.intermediate_size, activation = self.intermediate_act_fn, input_shape=linear_input_shape)

def __call__(self, hidden_states):
'''
inputs:
hidden_states: [batch_size, seq_len, hidden_size]
outputs:
hidden_states: [batch_size, seq_len, intermediate_size]
'''
hidden_states = self.dense(hidden_states)
return hidden_states

class BertOutput(object):
def __init__(self, config):
linear_input_shape = [config.batch_size, config.max_position_embeddings, config.intermediate_size]
self.dense = Linear(config.intermediate_size, config.hidden_size, input_shape=linear_input_shape)
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
self.dropout = Dropout(config.hidden_dropout_prob)

def __call__(self, hidden_states, input_tensor):
'''
inputs:
hidden_states: [batch_size, seq_len, intermediate_size]
input_tensor: [batch_size, seq_len, hidden_size]
outputs:
hidden_states: [batch_size, seq_len, hidden_size]
'''
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states
'''-----------------------------------------------------------------------------------------------'''


'''
BertPooler
--------------------------------------------------------------------------------------------------'''
class BertPooler(object):
def __init__(self, config):
self.dense = Linear(config.hidden_size, config.hidden_size, activation = ht.tanh_op)
self.batch_size = config.batch_size
self.hidden_size = config.hidden_size
def __call__(self, hidden_states):
'''
inputs:
hidden_states: [batch_size, seq_len, hidden_size]
outputs:
pooled_output: [batch_size, hidden_size]
'''
first_token_tensor = ht.slice_op(hidden_states,(0,0,0),(self.batch_size,1,self.hidden_size))
first_token_tensor = ht.array_reshape_op(first_token_tensor, [self.batch_size, self.hidden_size])
pooled_output = self.dense(first_token_tensor)
return pooled_output
'''-----------------------------------------------------------------------------------------------'''

'''
Bert Downstream Heads
--------------------------------------------------------------------------------------------------'''
class BertPredictionHeadTransform(object):
def __init__(self, config):
if config.hidden_act == "relu":
self.hidden_act = ht.relu_op
elif config.hidden_act == "gelu":
self.hidden_act = ht.gelu_op
print("Gelu activation is not implemented yet.")
assert(False)
linear_input_shape = [config.batch_size, config.max_position_embeddings, config.hidden_size]
self.dense_act = Linear(config.hidden_size, config.hidden_size, activation=self.hidden_act, input_shape=linear_input_shape)
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)

def __call__(self, hidden_states):
'''
inputs:
hidden_states: [batch_size, seq_len, hidden_size]
outputs:
hidden_states: [batch_size, seq_len, hidden_size]
'''
hidden_states = self.dense_act(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
return hidden_states

class BertLMPredictionHead(object):
def __init__(self, config, bert_model_embedding_weights):
'''
bert_model_embedding_weights: [vocab_size, hidden_size]
'''
self.transform = BertPredictionHeadTransform(config)

linear_input_shape = [config.batch_size, config.max_position_embeddings, config.hidden_size]
self.decoder = Linear(config.hidden_size, config.vocab_size, bias_initializer=ht.init.zeros,input_shape=linear_input_shape)
self.decoder.weights = ht.transpose_op(bert_model_embedding_weights)

def __call__(self, hidden_states):
'''
inputs:
hidden_states: [batch_size, seq_len, hidden_size]
outputs:
hidden_states: [batch_size, seq_len, vocab_size]
'''
hidden_states = self.transform(hidden_states)
hidden_states = self.decoder(hidden_states)
return hidden_states


class BertOnlyMLMHead(object):
def __init__(self, config, bert_model_embedding_weights):
self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)

def __call__(self, sequence_output):
'''
inputs:
sequence_output: [batch_size, seq_len, hidden_size]
outputs:
prediction_scores: [batch_size, seq_len, vocab_size]
'''
prediction_scores = self.predictions(sequence_output)
return prediction_scores


class BertOnlyNSPHead(object):
def __init__(self, config):
self.seq_relationship = Linear(config.hidden_size, 2)

def __call__(self, pooled_output):
'''
inputs:
pooled_output: [batch_size, hidden_size]
outputs:
seq_relationship_score: [batch_size, 2]
'''
seq_relationship_score = self.seq_relationship(pooled_output)
return seq_relationship_score


class BertPreTrainingHeads(object):
def __init__(self, config, bert_model_embedding_weights):
self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
self.seq_relationship = Linear(config.hidden_size, 2)

def __call__(self, sequence_output, pooled_output):
'''
inputs:
sequence_output: [batch_size, seq_len, hidden_size]
pooled_output: [batch_size, hidden_size]
outputs:
prediction_scores: [batch_size, seq_len, vocab_size]
seq_relationship_score: [batch_size, 2]
'''
prediction_scores = self.predictions(sequence_output)
seq_relationship_score = self.seq_relationship(pooled_output)
return prediction_scores, seq_relationship_score

'''-----------------------------------------------------------------------------------------------'''


'''
BertModel:
--------------------------------------------------------------------------------------------------'''
class BertModel(object):
"""BERT model ("Bidirectional Embedding Representations from a Transformer").

Params:
config: a BertConfig class instance with the configuration to build a new model

Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.

Outputs: Tuple of (sequence_output, pooled_output)
`sequence_output`: the full sequence of hidden-states of the last encoder layer,
a FloatTensor of shape [batch_size, sequence_length, hidden_size].
`pooled_output`: a FloatTensor of shape [batch_size, hidden_size], obtained by passing the
hidden state of the first token (`[CLS]`) through a Linear + Tanh pooler; it is used for
the Next-Sentence Prediction task (see BERT's paper).

Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])

config = modeling.BertConfig(vocab_size=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

model = modeling.BertModel(config=config)
sequence_output, pooled_output = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config):
self.embeddings = BertEmbeddings(config)
self.encoder = BertEncoder(config)
self.pooler = BertPooler(config)
self.batch_size=config.batch_size
self.seq_len=config.max_position_embeddings

def __call__(self, input_ids, token_type_ids, attention_mask):
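# attention_mask is 1 for real tokens and 0 for padding. Reshaping it to
# [batch_size, 1, 1, seq_len] and mapping it through (mask - 1) * 10000 gives an additive
# bias of 0 for real tokens and -10000 for padding, which suppresses padded positions in
# every attention softmax.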
extended_attention_mask = ht.array_reshape_op(attention_mask, [self.batch_size, 1, 1, self.seq_len])
extended_attention_mask = (extended_attention_mask+(-1.0)) * 10000.0

embedding_output = self.embeddings(input_ids, token_type_ids)
sequence_output = self.encoder(embedding_output, extended_attention_mask)
pooled_output = self.pooler(sequence_output)

return sequence_output, pooled_output

'''-----------------------------------------------------------------------------------------------'''


'''
BertForPreTraining:
--------------------------------------------------------------------------------------------------'''
class BertForPreTraining(object):
"""BERT model with pre-training heads.
This module comprises the BERT model followed by the two pre-training heads:
- the masked language modeling head, and
- the next sentence classification head.

Params:
config: a BertConfig class instance with the configuration to build a new model.

Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`masked_lm_labels`: optional masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., vocab_size]
`next_sentence_label`: optional next sentence classification loss: torch.LongTensor of shape [batch_size]
with indices selected in [0, 1].
0 => next sentence is the continuation, 1 => next sentence is a random sentence.

Outputs:
if `masked_lm_labels` and `next_sentence_label` are not `None`:
Outputs the total_loss which is the sum of the masked language modeling loss and the next
sentence classification loss.
if `masked_lm_labels` or `next_sentence_label` is `None`:
Outputs a tuple comprising
- the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
- the next sentence classification logits of shape [batch_size, 2].

Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])

config = BertConfig(vocab_size=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

model = BertForPreTraining(config)
masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
```
"""

def __init__(self, config):
self.bert = BertModel(config)
self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight)

self.vocab_size=config.vocab_size

def __call__(self, input_ids, token_type_ids, attention_mask, masked_lm_labels=None, next_sentence_label=None):
sequence_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask)
prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)

return_op = [prediction_scores, seq_relationship_score]
if masked_lm_labels is not None and next_sentence_label is not None:
'''
masked_lm_labels: [batch_size, seq_len], token ids at the masked positions, -1 elsewhere (ignored)
prediction_scores: [batch_size, seq_len, vocab_size]
next_sentence_label: [batch_size], 0 = actual next sentence, 1 = random sentence
seq_relationship_score: [batch_size, 2]

masked_lm_loss: [batch_size, seq_len]
next_sentence_loss: [batch_size]
'''

masked_lm_loss = ht.softmaxcrossentropy_sparse_op(prediction_scores, masked_lm_labels, ignored_index=-1)
next_sentence_loss = ht.softmaxcrossentropy_sparse_op(seq_relationship_score, next_sentence_label, ignored_index=-1)

return_op += [masked_lm_loss, next_sentence_loss]
return return_op


class BertForMaskedLM(object):
"""BERT model with the masked language modeling head.
This module comprises the BERT model followed by the masked language modeling head.

Params:
config: a BertConfig class instance with the configuration to build a new model.

Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., vocab_size]

Outputs:
if `masked_lm_labels` is not `None`:
Outputs the masked language modeling loss.
if `masked_lm_labels` is `None`:
Outputs the masked language modeling logits of shape [batch_size, sequence_length, vocab_size].

Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])

config = BertConfig(vocab_size=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

model = BertForMaskedLM(config)
masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config):
self.bert = BertModel(config)
self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight)
self.vocab_size=config.vocab_size

def __call__(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None):
sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask)
prediction_scores = self.cls(sequence_output)

return_op = [prediction_scores]
if masked_lm_labels is not None:
'''
masked_lm_labels: [batch_size, seq_len], token ids at the masked positions, -1 elsewhere (ignored)
prediction_scores: [batch_size, seq_len, vocab_size]

masked_lm_loss: [batch_size, seq_len]
'''
masked_lm_loss = ht.softmaxcrossentropy_sparse_op(prediction_scores, masked_lm_labels, ignored_index=-1)
return_op += [masked_lm_loss]

return return_op


class BertForNextSentencePrediction(object):
"""BERT model with next sentence prediction head.
This module comprises the BERT model followed by the next sentence classification head.

Params:
config: a BertConfig class instance with the configuration to build a new model.

Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size]
with indices selected in [0, 1].
0 => next sentence is the continuation, 1 => next sentence is a random sentence.

Outputs:
if `next_sentence_label` is not `None`:
Outputs the total_loss which is the sum of the masked language modeling loss and the next
sentence classification loss.
if `next_sentence_label` is `None`:
Outputs the next sentence classification logits of shape [batch_size, 2].

Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])

config = BertConfig(vocab_size=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

model = BertForNextSentencePrediction(config)
seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config):
self.bert = BertModel(config)
self.cls = BertOnlyNSPHead(config)

def __call__(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None):
_, pooled_output = self.bert(input_ids, token_type_ids, attention_mask)
seq_relationship_score = self.cls(pooled_output)

return_op = [seq_relationship_score]
if next_sentence_label is not None:
'''
next_sentence_label: [batch_size], 0 = actual next sentence, 1 = random sentence
seq_relationship_score: [batch_size, 2]

next_sentence_loss: [batch_size]
'''
next_sentence_loss = ht.softmaxcrossentropy_sparse_op(seq_relationship_score, next_sentence_label, ignored_index=-1)
return_op += [next_sentence_loss]

return return_op

'''-----------------------------------------------------------------------------------------------'''



'''
Bert Layer utils (Embedding & BertLayerNorm & Dropout & Linear)
--------------------------------------------------------------------------------------------------'''
class Embedding(object):
def __init__(self, num_embeddings, embedding_dim, embedding_name=None, initializer=ht.init.xavier_normal):
self.weight = initializer(name=embedding_name, shape=(num_embeddings, embedding_dim))
def __call__(self, input_tensor):
return ht.embedding_lookup_op(self.weight, input_tensor)

class BertLayerNorm(object):
def __init__(self, hidden_size, eps=1e-12):
self.eps=eps
self.scale = ht.init.ones(name='layer_norm_scale', shape=(hidden_size, ))
self.bias = ht.init.zeros(name='layer_norm_bias', shape=(hidden_size, ))
def __call__(self, input_tensor):
return ht.layer_normalization_op(input_tensor, self.scale, self.bias, eps=self.eps)

class Dropout(object):
def __init__(self, dropout_prob=None):
self.dropout_prob = dropout_prob
def __call__(self, input_tensor):
if self.dropout_prob is None or self.dropout_prob == 0.0:
return input_tensor
output = ht.dropout_op(input_tensor, 1.0 - self.dropout_prob)
return output

class Linear(object):
def __init__(self, in_features, out_features, bias=True, activation=None, kernel_initializer=ht.init.xavier_normal, bias_initializer=ht.init.zeros, input_shape=None):
self.bias_flag = bias
self.activation = activation
self.weights = kernel_initializer(name='dense_weights', shape=(in_features, out_features))
if self.bias_flag:
self.bias = bias_initializer(name='dense_bias', shape=(out_features,))
self.input_shape=input_shape
self.in_features = in_features
self.out_features = out_features
if self.input_shape is not None and self.input_shape[-1]!=in_features:
print("Specified in_features is not equal to input_shape[-1].")
assert(False)
def __call__(self, input_tensor):
if self.input_shape is not None and len(self.input_shape)!=2:
input_tensor = ht.array_reshape_op(input_tensor, [-1, self.in_features])
outputs = ht.matmul_op(input_tensor, self.weights)
if self.bias_flag:
outputs = outputs + ht.broadcastto_op(self.bias, outputs)
if self.activation is not None:
outputs = self.activation(outputs)
if self.input_shape is not None and len(self.input_shape)!=2:
outputs = ht.array_reshape_op(outputs, self.input_shape[:-1]+[self.out_features])
return outputs
'''-----------------------------------------------------------------------------------------------'''
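As an aside (not from the original file), the reshape → matmul → reshape pattern that `Linear` applies to non-2D inputs corresponds to the following NumPy sketch:

```python
import numpy as np

def linear_3d(x, W, b):
    # Flatten the leading dims, apply x @ W + b, then restore the original leading shape.
    batch, seq, in_features = x.shape
    out = x.reshape(-1, in_features) @ W + b      # [batch * seq, out_features]
    return out.reshape(batch, seq, W.shape[1])    # [batch, seq, out_features]

x = np.random.randn(2, 4, 8)   # [batch=2, seq=4, hidden=8]
W = np.random.randn(8, 16)
b = np.zeros(16)
assert linear_3d(x, W, b).shape == (2, 4, 16)
```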

+ 0  - 76  examples/nlp/bert/load_data.py

@@ -1,76 +0,0 @@
import numpy as np

class DataLoader(object):
def __init__(self, dataset='bookcorpus', doc_num=16000, save_gap=200, batch_size = 1024):
self.data_names = ['input_ids','token_type_ids','attention_mask','masked_lm_labels','next_sentence_label']
self.data = {'input_ids':[],
'token_type_ids':[],
'attention_mask':[],
'masked_lm_labels':[],
'next_sentence_label':[]}
self.batch_size=batch_size
self.batch_data = {'input_ids':[],
'token_type_ids':[],
'attention_mask':[],
'masked_lm_labels':[],
'next_sentence_label':[]}
self.cur_batch_data = {'input_ids':[],
'token_type_ids':[],
'attention_mask':[],
'masked_lm_labels':[],
'next_sentence_label':[]}
self.load_data(dataset=dataset, doc_num=doc_num, save_gap=save_gap)


def load_data(self, dataset='bookcorpus', doc_num=16000, save_gap=200):
print('Loading preprocessed dataset %s...'%dataset)
data_dir = './preprocessed_data/%s/'%dataset
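# Expects the .npy files written by processBertData.py (save_data), e.g.
# input_ids_0_199.npy, token_type_ids_0_199.npy, attention_mask_0_199.npy, ...
# under ./preprocessed_data/<dataset>/.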

for i in range(0,doc_num,save_gap):
start, end = i, i+save_gap-1
if end > doc_num-1:
end = doc_num-1
range_name = '_%d_%d.npy'%(start,end)
print(start,end)
for data_name in self.data_names:
#print(data_dir+data_name+range_name)
self.data[data_name].append(np.load(data_dir+data_name+range_name))
for data_name in self.data_names:
self.data[data_name] = np.concatenate(self.data[data_name],axis=0)
self.data_len = self.data['input_ids'].shape[0]
print(self.data['input_ids'].shape)

print('Successfully loaded dataset %s!'%dataset)
def make_epoch_data(self):
batch_data = []

for i in range(0, self.data_len, self.batch_size):
start = i
end = start + self.batch_size
if end > self.data_len:
end = self.data_len
if end-start != self.batch_size:
break
for data_name in self.data_names:
self.batch_data[data_name].append(self.data[data_name][start:end])

self.batch_num = len(self.batch_data['input_ids'])
def get_batch(self, idx):
if idx >= self.batch_num:
assert False
for data_name in self.data_names:
self.cur_batch_data[data_name] = self.batch_data[data_name][idx]

return self.cur_batch_data.copy()
def align(self, arr, length):
ori_len = len(arr)
if length > ori_len:
return arr + [0] * (length - ori_len)
else:
return arr[:length]

+ 0  - 293  examples/nlp/bert/processBertData.py

@@ -1,293 +0,0 @@
from datasets import load_dataset
import random
import hetu
import os
import numpy as np

''' Usage example:
In dir Hetu/examples/nlp/bert/: python processBertData.py
'''

# https://the-eye.eu/public/AI/pile_preliminary_components/books1.tar.gz


class TrainingInstance(object):
"""A single training instance (sentence pair)."""

def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels,
is_random_next):
self.tokens = tokens
self.segment_ids = segment_ids
self.is_random_next = is_random_next
self.masked_lm_positions = masked_lm_positions
self.masked_lm_labels = masked_lm_labels

def __str__(self):
s = ""
s += "tokens: %s\n" % (" ".join(
[str(x) for x in self.tokens]))
s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids]))
s += "is_random_next: %s\n" % self.is_random_next
s += "masked_lm_positions: %s\n" % (" ".join(
[str(x) for x in self.masked_lm_positions]))
s += "masked_lm_labels: %s\n" % (" ".join(
[str(x) for x in self.masked_lm_labels]))
s += "\n"
return s

def __repr__(self):
return self.__str__()


def create_masked_lm_predictions(tokens, masked_lm_prob,
max_predictions_per_seq, vocab_words, rng):
"""Creates the predictions for the masked LM objective."""
cand_indexes = []
for (i, token) in enumerate(tokens):
if token == "[CLS]" or token == "[SEP]":
continue
cand_indexes.append(i)
rng.shuffle(cand_indexes)
output_tokens = list(tokens)
num_to_predict = min(max_predictions_per_seq,
max(1, int(round(len(tokens) * masked_lm_prob))))
masked_lms = []
for index in cand_indexes:
if len(masked_lms) >= num_to_predict:
break
masked_token = None
# 80% of the time, replace with [MASK].
if rng.random() < 0.8:
masked_token = "[MASK]"
else:
# 10% of the time, keep the original token.
if rng.random() < 0.5:
masked_token = tokens[index]
# 10% of the time, replace with a random word.
else:
masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]
output_tokens[index] = masked_token
masked_lms.append([index, tokens[index]])

masked_lms.sort(key = lambda x: x[0])
masked_lm_positions = []
masked_lm_labels = []

for p in masked_lms:
masked_lm_positions.append(p[0])
masked_lm_labels.append(p[1])

return (output_tokens, masked_lm_positions, masked_lm_labels)
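# Worked example (illustrative, not in the original file): for the 6-token sequence
# ["[CLS]", "the", "quick", "brown", "fox", "[SEP]"] with masked_lm_prob=0.15 and
# max_predictions_per_seq=20, num_to_predict = min(20, max(1, round(6 * 0.15))) = 1,
# so exactly one of the four non-special tokens is chosen (80% -> "[MASK]",
# 10% -> kept as-is, 10% -> replaced by a random vocabulary word).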


def create_data_from_document(all_document, doc_id, max_seq_length, short_seq_prob, masked_lm_prob, max_predictions_per_seq, vocab_words, rng):
""" Create Training example for input document """
document = all_document[doc_id]
max_num_tokens = max_seq_length - 3 # [CLS], [SEP], [SEP]
target_seq_length = max_num_tokens
# Generate a shorter sequence with probability short_seq_prob
# in order to minimize the mismatch between pre-training and fine-tuning.
if rng.random() < short_seq_prob:
target_seq_length = rng.randint(2, max_num_tokens)
instances = []
current_chunk = []
current_length = 0
i = 0
while i < len(document):
segment = document[i]
current_chunk.append(segment)
current_length += len(segment)
if i == len(document) - 1 or current_length >= target_seq_length:
if current_chunk:
# create sentence A
a_end = 1
if len(current_chunk) >= 2:
a_end = rng.randint(1, len(current_chunk) - 1)
tokens_a = []
for j in range(a_end):
tokens_a.extend([current_chunk[j]])
tokens_b = []
# Random next
is_random_next = False
if len(current_chunk) == 1 or rng.random() < 0.5:
is_random_next = True
target_b_length = target_seq_length - len(tokens_a)
for _ in range(10):
random_document_index = rng.randint(0, len(all_document) - 1)
if random_document_index != doc_id:
break
# If the randomly picked document happens to be the current document, fall back to
# treating the pair as an actual next sentence.
if random_document_index == doc_id:
is_random_next = False
random_document = all_document[random_document_index]
random_start = rng.randint(0, len(random_document) - 1)
for j in range(random_start, len(random_document)):
tokens_b.extend([random_document[j]])
if len(tokens_b) >= target_b_length:
break
# We didn't actually use these segments so we "put them back" so
# they don't go to waste.
num_unused_segments = len(current_chunk) - a_end
i -= num_unused_segments
# Actual next
else:
is_random_next = False
for j in range(a_end, len(current_chunk)):
tokens_b.extend([current_chunk[j]])
truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)
assert len(tokens_a) >= 1
assert len(tokens_b) >= 1

tokens = []
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in tokens_a:
tokens.append(token)
segment_ids.append(0)

tokens.append("[SEP]")
segment_ids.append(0)

for token in tokens_b:
tokens.append(token)
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)

(tokens, masked_lm_positions, masked_lm_labels) = create_masked_lm_predictions(
tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng)
instance = TrainingInstance(
tokens=tokens,
segment_ids=segment_ids,
is_random_next=is_random_next,
masked_lm_positions=masked_lm_positions,
masked_lm_labels=masked_lm_labels)
instances.append(instance)
current_chunk = []
current_length = 0
i += 1

return instances
def convert_instances_to_data(instances, tokenizer, max_seq_length):
num_instances = len(instances)
input_ids_list = np.zeros([num_instances, max_seq_length], dtype="int32")
input_mask_list = np.zeros([num_instances, max_seq_length], dtype="int32")
segment_ids_list = np.zeros([num_instances, max_seq_length], dtype="int32")
masked_lm_labels = np.full([num_instances, max_seq_length],-1, dtype="int32")
next_sentence_labels_list = np.zeros(num_instances, dtype="int32")

for (idx, instance) in enumerate(instances):
input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
input_mask = [1] * len(input_ids)
segment_ids = list(instance.segment_ids)
assert len(input_ids) <= max_seq_length

padding_zero_list = [0]*int(max_seq_length - len(input_ids))
input_ids += padding_zero_list
input_mask += padding_zero_list
segment_ids += padding_zero_list

assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length

masked_lm_positions = list(instance.masked_lm_positions)
masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)

input_ids_list[idx][:] = input_ids
input_mask_list[idx][:] = input_mask
segment_ids_list[idx][:] = segment_ids
masked_lm_labels[idx][masked_lm_positions] = masked_lm_ids
next_sentence_labels_list[idx] = 1 if instance.is_random_next else 0

return input_ids_list, input_mask_list, segment_ids_list, masked_lm_labels, next_sentence_labels_list

def create_pretrain_data(dataset, tokenizer, max_seq_length, short_seq_prob, masked_lm_prob, max_predictions_per_seq, rng):
documents, all_data = [], [[],[],[],[],[]]
vocab_words = list(tokenizer.vocab.keys())

save_path='./preprocessed_data/bookcorpus/'
if not os.path.exists(save_path):
os.makedirs(save_path)

for i in range(dataset['train'].shape[0]):
tokens = tokenizer.tokenize(dataset['train'][i]['text'])
documents.append(tokens)
instance = create_data_from_document(documents, i,\
max_seq_length, short_seq_prob, masked_lm_prob,
max_predictions_per_seq, vocab_words, rng)
data = convert_instances_to_data(instance, tokenizer, max_seq_length)
print(i, len(tokens), len(instance))
for j in range(5):
all_data[j].append(data[j])

save_gap=200
if (i+1)%save_gap==0 and i:
input_ids_list, input_mask_list, segment_ids_list, masked_lm_labels, next_sentence_labels_list = [np.concatenate(all_data[j],axis=0) for j in range(5)]
print('Saving data from %d to %d: doc_num = %d, input_ids_shape ='%(i+1-save_gap,i, i+1), input_ids_list.shape)
save_data(input_ids_list, input_mask_list, segment_ids_list, masked_lm_labels, next_sentence_labels_list, name='_%d_%d'%(i+1-save_gap,i))
all_data = [[],[],[],[],[]]
if i == dataset['train'].shape[0]-1:
input_ids_list, input_mask_list, segment_ids_list, masked_lm_labels, next_sentence_labels_list = [np.concatenate(all_data[j],axis=0) for j in range(5)]
print('Saving data from %d to %d: doc_num = %d, input_ids_shape ='%(save_gap*int(i/save_gap),i, i+1), input_ids_list.shape)
save_data(input_ids_list, input_mask_list, segment_ids_list, masked_lm_labels, next_sentence_labels_list, name='_%d_%d'%(save_gap*int(i/save_gap),i))

def save_data(input_ids_list, input_mask_list, segment_ids_list, masked_lm_labels, next_sentence_labels_list,name=''):
save_path='./preprocessed_data/bookcorpus/'
np.save(save_path+'input_ids'+name,np.array(input_ids_list))
np.save(save_path+'token_type_ids'+name,np.array(segment_ids_list))
np.save(save_path+'attention_mask'+name,np.array(input_mask_list))
np.save(save_path+'masked_lm_labels'+name,np.array(masked_lm_labels))
np.save(save_path+'next_sentence_label'+name,np.array(next_sentence_labels_list))

def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
"""Truncates a pair of sequences to a maximum sequence length."""
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_num_tokens:
break

trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
assert len(trunc_tokens) >= 1

# Truncate from the front or the back at random to avoid biases.
if rng.random() < 0.5:
del trunc_tokens[0]
else:
trunc_tokens.pop()

def show_dataset_detail(dataset):
print(dataset.shape)
print(dataset.column_names)
print(dataset['train'].features)
print(dataset['train'][0]['text'])

if __name__ == "__main__":
max_seq_length = 512
do_lower_case = True
short_seq_prob = 0.1
masked_lm_prob = 0.15
max_predictions_per_seq = 20
vocab_path = "./datasets/bert-base-uncased-vocab.txt"
dataset = load_dataset('../bookcorpus', cache_dir = "./cached_data")
print("total number of documents {} ".format(dataset['train'].shape[0]))
random_seed = 123
rng = random.Random(random_seed)
tokenizer = hetu.BertTokenizer(vocab_file=vocab_path, do_lower_case = do_lower_case)

print("vocab_size =",len(tokenizer.vocab))
print("max_seq_len =", max_seq_length)
create_pretrain_data(dataset, tokenizer, max_seq_length, short_seq_prob, masked_lm_prob, max_predictions_per_seq, rng)






+ 0  - 87  examples/nlp/bert/train_hetu_bert.py

@@ -1,87 +0,0 @@
from tqdm import tqdm
import os
import math
import logging
import hetu as ht
from hetu_bert import BertForPreTraining
from bert_config import BertConfig
from load_data import DataLoader
import numpy as np
import time

''' Usage example:
In dir Hetu/examples/nlp/bert/: python train_hetu_bert.py
'''

device_id=6
executor_ctx = ht.gpu(device_id)

num_epochs = 1
lr = 1e-4

config = BertConfig(vocab_size=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
max_position_embeddings=512,
#attention_probs_dropout_prob=0.0,
#hidden_dropout_prob=0.0,
batch_size=6)

model = BertForPreTraining(config=config)

batch_size = config.batch_size
seq_len = config.max_position_embeddings
vocab_size = config.vocab_size

dataloader = DataLoader(dataset='bookcorpus', doc_num=200, save_gap=200, batch_size = batch_size)
data_names = ['input_ids','token_type_ids','attention_mask','masked_lm_labels','next_sentence_label']

input_ids = ht.Variable(name='input_ids', trainable=False)
token_type_ids = ht.Variable(name='token_type_ids', trainable=False)
attention_mask = ht.Variable(name='attention_mask', trainable=False)

masked_lm_labels = ht.Variable(name='masked_lm_labels_one_hot', trainable=False)
next_sentence_label = ht.Variable(name='next_sentence_label_one_hot', trainable=False)

loss_position_sum = ht.Variable(name='loss_position_sum', trainable=False)

_,_, masked_lm_loss, next_sentence_loss = model(input_ids, token_type_ids, attention_mask, masked_lm_labels, next_sentence_label)

masked_lm_loss_mean = ht.div_op(ht.reduce_sum_op(masked_lm_loss, [0,1]), loss_position_sum)
next_sentence_loss_mean = ht.reduce_mean_op(next_sentence_loss, [0])

loss = masked_lm_loss_mean + next_sentence_loss_mean
#opt = optimizer.AdamOptimizer(learning_rate=lr, beta1=0.9, beta2=0.999, epsilon=1e-8)
opt = ht.optim.SGDOptimizer(learning_rate=lr)
train_op = opt.minimize(loss)

executor = ht.Executor([masked_lm_loss_mean, next_sentence_loss_mean, loss, train_op],ctx=executor_ctx,dynamic_memory=True)


dataloader.make_epoch_data()
for ep in range(num_epochs):
for i in range(dataloader.batch_num):
batch_data = dataloader.get_batch(i)

feed_dict = {
input_ids: batch_data['input_ids'],
token_type_ids: batch_data['token_type_ids'],
attention_mask: batch_data['attention_mask'],
masked_lm_labels: batch_data['masked_lm_labels'],
next_sentence_label: batch_data['next_sentence_label'],
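# loss_position_sum counts the labels that are not -1, i.e. the number of actually
# masked tokens, so masked_lm_loss_mean averages the MLM loss over masked positions
# only (positions labelled -1 are ignored by softmaxcrossentropy_sparse_op).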
loss_position_sum: np.array([np.where(batch_data['masked_lm_labels'].reshape(-1)!=-1)[0].shape[0]]),
}
start_time = time.time()
results = executor.run(feed_dict = feed_dict)
end_time = time.time()

masked_lm_loss_mean_out = results[0].asnumpy()
next_sentence_loss_mean_out = results[1].asnumpy()
loss_out = results[2].asnumpy()

print('[Epoch %d] (Iteration %d): Loss = %.3f, MLM_loss = %.3f, NSP_loss = %.6f, Time = %.3f'%(ep,i,loss_out, masked_lm_loss_mean_out, next_sentence_loss_mean_out, end_time-start_time))


