import numpy as np
import tensorflow as tf

from tqdm import tqdm
import logging

logging.basicConfig(level=logging.INFO)


def ln(inputs, epsilon=1e-8, scope="ln"):
    '''Applies layer normalization. See https://arxiv.org/abs/1607.06450.
    inputs: A tensor with 2 or more dimensions, where the first dimension is `batch_size`.
    epsilon: A small float added to the variance to avoid division by zero.
    scope: Optional scope for `variable_scope`.

    Returns:
      A tensor with the same shape and dtype as `inputs`.
    '''
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        inputs_shape = inputs.get_shape()
        params_shape = inputs_shape[-1:]

        # Normalize over the last (feature) dimension, then scale and shift.
        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
        beta = tf.get_variable("beta", params_shape,
                               initializer=tf.zeros_initializer())
        gamma = tf.get_variable("gamma", params_shape,
                                initializer=tf.ones_initializer())
        normalized = (inputs - mean) / ((variance + epsilon) ** 0.5)
        outputs = gamma * normalized + beta

    return outputs


def get_token_embeddings(vocab_size, num_units, initializer=tf.contrib.layers.xavier_initializer(), zero_pad=True):
    '''Constructs a token embedding matrix.
    Note that the row of index 0 is set to zeros.
    vocab_size: scalar. V.
    num_units: embedding dimensionality. E.
    zero_pad: Boolean. If True, all values of the first row (id = 0) are constant zeros,
      so that query/key masks can be applied easily.

    Returns
      weight variable: (V, E)
    '''
    with tf.variable_scope("shared_weight_matrix"):
        embeddings = tf.get_variable('weight_mat',
                                     dtype=tf.float32,
                                     shape=(vocab_size, num_units),
                                     initializer=initializer)
        if zero_pad:
            # Overwrite the row for id 0 (<pad>) with zeros.
            embeddings = tf.concat((tf.zeros(shape=[1, num_units]),
                                    embeddings[1:, :]), 0)
    return embeddings


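# Illustrative sketch (not part of the original module): it shows that with
# `zero_pad=True` the <pad> id (0) embeds to an all-zero vector, which is what
# makes the masking conventions used below work. The sizes are arbitrary
# example values.
def _demo_zero_padded_embeddings():
    tf.reset_default_graph()
    embeddings = get_token_embeddings(vocab_size=10, num_units=4, zero_pad=True)
    looked_up = tf.nn.embedding_lookup(embeddings, [0, 3])  # (2, 4)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        row_pad, row_tok = sess.run(looked_up)
        # row_pad is all zeros; row_tok is a randomly initialized embedding.
        print(row_pad, row_tok)

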
def multihead_attention(
        queries, keys, values,
        batch_size, hidden_size,
        num_attention_heads=8,
        query_act=None, key_act=None, value_act=None,
        attention_mask=None,
        attention_probs_dropout_prob=0.0,
        training=True, causality=False,
        scope="multihead_attention"):
    '''Applies multi-head scaled dot-product attention. See 3.2.2.
    queries: A 3d tensor with shape of (N, T_q, d_model).
    keys: A 3d tensor with shape of (N, T_k, d_model).
    values: A 3d tensor with shape of (N, T_k, d_model).
    batch_size: scalar. N.
    hidden_size: scalar. d_model. Must be divisible by `num_attention_heads`.
    num_attention_heads: scalar. h.
    query_act, key_act, value_act: optional activations for the linear projections.
    attention_mask: boolean or 0/1 tensor. (N, T_k). Nonzero marks padding keys.
    attention_probs_dropout_prob: dropout rate on the attention probabilities.
    training: Boolean. Controls dropout.
    causality: Boolean. If True, positions may not attend to future positions.
    scope: Optional scope for `variable_scope`.

    Returns
      A 3d tensor with shape of (N, T_q, d_model).
    '''

    def transpose_for_scores(input_tensor):
        # (N, T, d_model) -> (N, h, T, d_model/h)
        output_tensor = tf.reshape(
            input_tensor, [batch_size, -1, num_attention_heads, hidden_size // num_attention_heads])
        output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
        return output_tensor

    size_per_head = hidden_size // num_attention_heads
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        # linear projections
        query_layer = tf.layers.dense(
            queries, hidden_size, activation=query_act)  # (N, T_q, d_model)
        key_layer = tf.layers.dense(
            keys, hidden_size, activation=key_act)  # (N, T_k, d_model)
        value_layer = tf.layers.dense(
            values, hidden_size, activation=value_act)  # (N, T_k, d_model)

        # split heads
        query_layer = transpose_for_scores(query_layer)  # (N, h, T_q, d_model/h)
        key_layer = transpose_for_scores(key_layer)      # (N, h, T_k, d_model/h)
        value_layer = transpose_for_scores(value_layer)  # (N, h, T_k, d_model/h)

        # scaled dot-product scores
        attention_scores = tf.matmul(
            query_layer, key_layer, transpose_b=True)  # (N, h, T_q, T_k)
        attention_scores /= size_per_head ** 0.5

        # key (padding) mask
        if attention_mask is not None:
            attention_mask = tf.to_float(attention_mask)  # (N, T_k)
            attention_mask = tf.reshape(
                attention_mask, [batch_size, 1, 1, -1])  # (N, 1, 1, T_k)
            attention_scores += attention_mask * (-2 ** 32 + 1)  # (N, h, T_q, T_k)

        # future (causality) mask
        if causality:
            diag_vals = tf.ones_like(
                attention_scores[0, 0, :, :])  # (T_q, T_k)
            tril = tf.linalg.LinearOperatorLowerTriangular(
                diag_vals).to_dense()  # (T_q, T_k)
            future_masks = tf.broadcast_to(
                tril, [batch_size, num_attention_heads,
                       tf.shape(tril)[0], tf.shape(tril)[1]])  # (N, h, T_q, T_k)
            paddings = tf.ones_like(future_masks) * (-2 ** 32 + 1)
            attention_scores = tf.where(
                tf.equal(future_masks, 0), paddings, attention_scores)

        # attention probabilities
        attention_probs = tf.nn.softmax(attention_scores)  # (N, h, T_q, T_k)
        attention_probs = tf.layers.dropout(
            attention_probs, rate=attention_probs_dropout_prob, training=training)

        # weighted sum of values
        context_layer = tf.matmul(attention_probs, value_layer)  # (N, h, T_q, d_model/h)
        context_layer = tf.transpose(
            context_layer, [0, 2, 1, 3])  # (N, T_q, h, d_model/h)
        outputs = tf.reshape(
            context_layer,
            [batch_size, -1, num_attention_heads * size_per_head])  # (N, T_q, d_model)

        # Residual connection
        outputs += queries  # (N, T_q, d_model)

        # Normalize
        outputs = ln(outputs)  # (N, T_q, d_model)

    return outputs


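# Illustrative sketch (not part of the original module): a shape check for
# `multihead_attention`. The batch size, sequence length, and model size are
# arbitrary example values; with causality=True each query position can only
# attend to itself and earlier key positions.
def _demo_multihead_attention():
    tf.reset_default_graph()
    x = tf.random_normal([2, 5, 16])  # (N=2, T=5, d_model=16)
    out = multihead_attention(
        queries=x, keys=x, values=x,
        batch_size=2, hidden_size=16,
        num_attention_heads=4,
        causality=True, training=False)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print(sess.run(out).shape)  # expected: (2, 5, 16)

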
def ff(inputs, num_units, scope="positionwise_feedforward"):
    '''position-wise feed forward net. See 3.3.

    inputs: A 3d tensor with shape of [N, T, C].
    num_units: A list of two integers.
    scope: Optional scope for `variable_scope`.

    Returns:
      A 3d tensor with the same shape and dtype as inputs.
    '''
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        # Inner layer
        outputs = tf.layers.dense(inputs, num_units[0], activation=tf.nn.relu)
        # Outer layer
        outputs = tf.layers.dense(outputs, num_units[1])
        # Residual connection
        outputs += inputs
        # Normalize
        outputs = ln(outputs)
    return outputs


def label_smoothing(inputs, epsilon=0.1):
    '''Applies label smoothing. See 5.4 and https://arxiv.org/abs/1512.00567.
    inputs: 3d tensor. [N, T, V], where V is the vocabulary size.
    epsilon: Smoothing rate.

    For example,

    ```
    import tensorflow as tf
    inputs = tf.convert_to_tensor([[[0, 0, 1],
                                    [0, 1, 0],
                                    [1, 0, 0]],

                                   [[1, 0, 0],
                                    [1, 0, 0],
                                    [0, 1, 0]]], tf.float32)

    outputs = label_smoothing(inputs)

    with tf.Session() as sess:
        print(sess.run([outputs]))

    >>
    [array([[[ 0.03333334,  0.03333334,  0.93333334],
             [ 0.03333334,  0.93333334,  0.03333334],
             [ 0.93333334,  0.03333334,  0.03333334]],

            [[ 0.93333334,  0.03333334,  0.03333334],
             [ 0.93333334,  0.03333334,  0.03333334],
             [ 0.03333334,  0.93333334,  0.03333334]]], dtype=float32)]
    ```
    '''
    V = inputs.get_shape().as_list()[-1]  # number of channels
    return ((1 - epsilon) * inputs) + (epsilon / V)


def positional_encoding(inputs,
                        maxlen,
                        masking=True,
                        scope="positional_encoding"):
    '''Sinusoidal Positional Encoding. See 3.5.
    inputs: 3d tensor. (N, T, E)
    maxlen: scalar. Must be >= T.
    masking: Boolean. If True, padding positions are set to zeros.
    scope: Optional scope for `variable_scope`.

    returns
      3d tensor that has the same shape as inputs.
    '''

    E = inputs.get_shape().as_list()[-1]  # static
    N, T = tf.shape(inputs)[0], tf.shape(inputs)[1]  # dynamic
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        # position indices
        position_ind = tf.tile(tf.expand_dims(
            tf.range(T), 0), [N, 1])  # (N, T)

        # First part of the PE function: the sin/cos argument pos / 10000^(2i/E)
        position_enc = np.array([
            [pos / np.power(10000, (i - i % 2) / E) for i in range(E)]
            for pos in range(maxlen)])

        # Second part: apply sin to even columns and cos to odd columns.
        position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])  # dim 2i
        position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])  # dim 2i+1
        position_enc = tf.convert_to_tensor(
            position_enc, tf.float32)  # (maxlen, E)

        # lookup
        outputs = tf.nn.embedding_lookup(position_enc, position_ind)

        # masks
        if masking:
            outputs = tf.where(tf.equal(inputs, 0), inputs, outputs)

        return tf.to_float(outputs)


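# Illustrative sketch (not part of the original module): builds the sinusoidal
# table for a toy input and checks one entry against the formula
# PE(pos, 2i) = sin(pos / 10000^(2i/E)). The shapes are arbitrary.
def _demo_positional_encoding():
    tf.reset_default_graph()
    dummy = tf.ones([2, 6, 8])  # (N=2, T=6, E=8)
    pe = positional_encoding(dummy, maxlen=10, masking=False)
    with tf.Session() as sess:
        pe_val = sess.run(pe)
        print(pe_val.shape)  # (2, 6, 8)
        print(np.isclose(pe_val[0, 3, 0], np.sin(3 / np.power(10000, 0 / 8))))  # True

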
# def noam_scheme(init_lr, global_step, warmup_steps=4000.):
#     '''Noam scheme learning rate decay
#     init_lr: initial learning rate. scalar.
#     global_step: scalar.
#     warmup_steps: scalar. During warmup_steps, learning rate increases
#         until it reaches init_lr.
#     '''
#     step = tf.cast(global_step + 1, dtype=tf.float32)
#     return init_lr * warmup_steps ** 0.5 * tf.minimum(step * warmup_steps ** -1.5, step ** -0.5)


class Transformer(object):
    '''Builds the Transformer graph (encoder, decoder, training loss).
    xs: int32 tensor of source token ids. (N, T1)
    ys: tuple of
        decoder_input: int32 tensor. (N, T2)
        y: int32 tensor. (N, T2)
        y_seqlen: int32 tensor. (N,)
        sents2: str tensor. (N,)
    training: boolean.
    '''

    def __init__(self, hp):
        self.hp = hp
        # self.token2idx, self.idx2token = load_vocab(hp.vocab)
        self.embeddings = get_token_embeddings(
            self.hp.vocab_size, self.hp.d_model, zero_pad=True)

    def encode(self, xs, training=True):
        '''
        xs: int32 tensor of source token ids. (N, T1)

        Returns
          memory: encoder outputs. (N, T1, d_model)
          src_masks: (N, T1)
        '''
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            x = xs

            # src_masks: True at padding positions
            src_masks = tf.math.equal(x, 0)  # (N, T1)

            # embedding
            enc = tf.nn.embedding_lookup(
                self.embeddings, x)  # (N, T1, d_model)
            enc *= self.hp.d_model ** 0.5  # scale

            enc += positional_encoding(enc, self.hp.maxlen1)
            enc = tf.layers.dropout(
                enc, self.hp.dropout_rate, training=training)

            # Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                    # self-attention
                    enc = multihead_attention(
                        queries=enc, keys=enc, values=enc,
                        batch_size=self.hp.batch_size, hidden_size=self.hp.d_model,
                        num_attention_heads=self.hp.num_heads,
                        attention_mask=src_masks,
                        attention_probs_dropout_prob=self.hp.dropout_rate,
                        training=training,
                        causality=False
                    )
                    # feed forward
                    enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
        memory = enc
        return memory, src_masks

    def decode(self, ys, memory, src_masks, training=True):
        '''
        ys: int32 tensor of decoder input token ids. (N, T2)
        memory: encoder outputs. (N, T1, d_model)
        src_masks: (N, T1)

        Returns
          logits: (N, T2, V). float32.
        '''
        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
            decoder_inputs = ys

            # tgt_masks: True at padding positions
            tgt_masks = tf.math.equal(decoder_inputs, 0)  # (N, T2)

            # embedding
            dec = tf.nn.embedding_lookup(
                self.embeddings, decoder_inputs)  # (N, T2, d_model)
            dec *= self.hp.d_model ** 0.5  # scale

            dec += positional_encoding(dec, self.hp.maxlen2)
            dec = tf.layers.dropout(
                dec, self.hp.dropout_rate, training=training)

            # Blocks
            for i in range(self.hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                    # Masked self-attention (note that causality is True here)
                    dec = multihead_attention(
                        queries=dec, keys=dec, values=dec,
                        batch_size=self.hp.batch_size, hidden_size=self.hp.d_model,
                        num_attention_heads=self.hp.num_heads,
                        attention_mask=tgt_masks,
                        attention_probs_dropout_prob=self.hp.dropout_rate,
                        training=training,
                        causality=True,
                        scope="self_attention"
                    )
                    # Vanilla attention over the encoder memory
                    dec = multihead_attention(
                        queries=dec, keys=memory, values=memory,
                        batch_size=self.hp.batch_size, hidden_size=self.hp.d_model,
                        num_attention_heads=self.hp.num_heads,
                        attention_mask=src_masks,
                        attention_probs_dropout_prob=self.hp.dropout_rate,
                        training=training,
                        causality=False,
                        scope="vanilla_attention"
                    )
                    # Feed Forward
                    dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

        # Final linear projection (embedding weights are shared)
        weights = tf.transpose(self.embeddings)  # (d_model, vocab_size)
        logits = tf.einsum('ntd,dk->ntk', dec, weights)  # (N, T2, vocab_size)
        # y_hat = tf.to_int32(tf.argmax(logits, axis=-1))

        return logits

    def train(self, xs, ys):
        '''
        xs: int32 tensor of source token ids. (N, T1)
        ys: tuple whose first two elements are decoder_input (N, T2) and y (N, T2).

        Returns
          loss: per-token cross-entropy. (N, T2)
        '''
        # forward
        memory, src_masks = self.encode(xs)
        logits = self.decode(ys[0], memory, src_masks)

        # train scheme: smoothed one-hot targets, then per-token cross-entropy
        y = ys[1]
        y_ = label_smoothing(tf.one_hot(y, depth=self.hp.vocab_size))
        loss = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=logits, labels=y_)

        return loss

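    # Illustrative sketch (not part of the original module): one way to turn the
    # per-token loss from `train` into a scalar objective and a train op. The
    # learning rate `lr` is an assumed hyperparameter; padding targets (id 0)
    # are excluded from the average, mirroring the zero-pad convention above.
    def _train_op_sketch(self, loss, y, lr=1e-4):
        nonpadding = tf.to_float(tf.not_equal(y, 0))  # (N, T2)
        mean_loss = tf.reduce_sum(loss * nonpadding) / (tf.reduce_sum(nonpadding) + 1e-7)
        global_step = tf.train.get_or_create_global_step()
        optimizer = tf.train.AdamOptimizer(lr)
        train_op = optimizer.minimize(mean_loss, global_step=global_step)
        return mean_loss, train_op, global_step
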
    # def eval(self, xs, ys):
    #     '''Predicts autoregressively
    #     At inference, input ys is ignored.
    #     Returns
    #         y_hat: (N, T2)
    #     '''
    #     decoder_inputs, y, y_seqlen, sents2 = ys

    #     decoder_inputs = tf.ones((tf.shape(xs[0])[0], 1), tf.int32) * self.token2idx["<s>"]
    #     ys = (decoder_inputs, y, y_seqlen, sents2)

    #     memory, sents1, src_masks = self.encode(xs, False)

    #     logging.info("Inference graph is being built. Please be patient.")
    #     for _ in tqdm(range(self.hp.maxlen2)):
    #         logits, y_hat, y, sents2 = self.decode(ys, memory, src_masks, False)
    #         if tf.reduce_sum(y_hat, 1) == self.token2idx["<pad>"]: break

    #         _decoder_inputs = tf.concat((decoder_inputs, y_hat), 1)
    #         ys = (_decoder_inputs, y, y_seqlen, sents2)

    #     # monitor a random sample
    #     n = tf.random_uniform((), 0, tf.shape(y_hat)[0] - 1, tf.int32)
    #     sent1 = sents1[n]
    #     pred = convert_idx_to_token_tensor(y_hat[n], self.idx2token)
    #     sent2 = sents2[n]

    #     tf.summary.text("sent1", sent1)
    #     tf.summary.text("pred", pred)
    #     tf.summary.text("sent2", sent2)
    #     summaries = tf.summary.merge_all()

    #     return y_hat, summaries


# def convert_idx_to_token_tensor(inputs, idx2token):
#     '''Converts int32 tensor to string tensor.
#     inputs: 1d int32 tensor. indices.
#     idx2token: dictionary

#     Returns
#         1d string tensor.
#     '''
#     def my_func(inputs):
#         return " ".join(idx2token[elem] for elem in inputs)

#     return tf.py_func(my_func, [inputs], tf.string)


# def load_vocab(vocab_fpath):
#     '''Loads vocabulary file and returns idx<->token maps
#     vocab_fpath: string. vocabulary file path.
#     Note that these are reserved
#         0: <pad>, 1: <unk>, 2: <s>, 3: </s>

#     Returns
#         two dictionaries.
#     '''
#     vocab = [line.split()[0] for line in open(vocab_fpath, 'r', encoding='utf-8').read().splitlines()]
#     token2idx = {token: idx for idx, token in enumerate(vocab)}
#     idx2token = {idx: token for idx, token in enumerate(vocab)}
#     return token2idx, idx2token
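

# Illustrative sketch (not part of the original module): wiring the pieces
# together into a training graph. The hyperparameter values and the argparse
# namespace are arbitrary placeholders, not the project's real config.
if __name__ == "__main__":
    import argparse

    hp = argparse.Namespace(
        vocab_size=32000, d_model=512, d_ff=2048,
        num_blocks=6, num_heads=8, dropout_rate=0.3,
        maxlen1=100, maxlen2=100, batch_size=32)

    x = tf.placeholder(tf.int32, [hp.batch_size, hp.maxlen1])              # source ids
    decoder_input = tf.placeholder(tf.int32, [hp.batch_size, hp.maxlen2])  # shifted targets
    y = tf.placeholder(tf.int32, [hp.batch_size, hp.maxlen2])              # target ids

    model = Transformer(hp)
    loss = model.train(x, (decoder_input, y))  # per-token cross-entropy, (N, T2)
    logging.info("Training graph built; loss tensor: %s", loss)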