From efeac2c4276cca00e3c2b03fed574f5678efbdb9 Mon Sep 17 00:00:00 2001 From: chenkaiyu1997 Date: Fri, 15 Feb 2019 17:31:56 +0800 Subject: [PATCH] Add ENAS (Efficient Neural Architecture Search) --- fastNLP/models/enas_controller.py | 223 +++++++++++++++++ fastNLP/models/enas_model.py | 388 ++++++++++++++++++++++++++++++ fastNLP/models/enas_trainer.py | 385 +++++++++++++++++++++++++++++ fastNLP/models/enas_utils.py | 56 +++++ test/models/test_enas.py | 112 +++++++++ 5 files changed, 1164 insertions(+) create mode 100644 fastNLP/models/enas_controller.py create mode 100644 fastNLP/models/enas_model.py create mode 100644 fastNLP/models/enas_trainer.py create mode 100644 fastNLP/models/enas_utils.py create mode 100644 test/models/test_enas.py diff --git a/fastNLP/models/enas_controller.py b/fastNLP/models/enas_controller.py new file mode 100644 index 00000000..ae9bcfd2 --- /dev/null +++ b/fastNLP/models/enas_controller.py @@ -0,0 +1,223 @@ +# Code Modified from https://github.com/carpedm20/ENAS-pytorch +"""A module with NAS controller-related code.""" +import collections +import os + +import torch +import torch.nn.functional as F +import fastNLP +import fastNLP.models.enas_utils as utils +from fastNLP.models.enas_utils import Node + + +def _construct_dags(prev_nodes, activations, func_names, num_blocks): + """Constructs a set of DAGs based on the actions, i.e., previous nodes and + activation functions, sampled from the controller/policy pi. + + Args: + prev_nodes: Previous node actions from the policy. + activations: Activations sampled from the policy. + func_names: Mapping from activation function names to functions. + num_blocks: Number of blocks in the target RNN cell. + + Returns: + A list of DAGs defined by the inputs. + + RNN cell DAGs are represented in the following way: + + 1. Each element (node) in a DAG is a list of `Node`s. + + 2. The `Node`s in the list dag[i] correspond to the subsequent nodes + that take the output from node i as their own input. + + 3. dag[-1] is the node that takes input from x^{(t)} and h^{(t - 1)}. + dag[-1] always feeds dag[0]. + dag[-1] acts as if `w_xc`, `w_hc`, `w_xh` and `w_hh` are its + weights. + + 4. dag[N - 1] is the node that produces the hidden state passed to + the next timestep. dag[N - 1] is also always a leaf node, and therefore + is always averaged with the other leaf nodes and fed to the output + decoder. + """ + dags = [] + for nodes, func_ids in zip(prev_nodes, activations): + dag = collections.defaultdict(list) + + # add first node + dag[-1] = [Node(0, func_names[func_ids[0]])] + dag[-2] = [Node(0, func_names[func_ids[0]])] + + # add following nodes + for jdx, (idx, func_id) in enumerate(zip(nodes, func_ids[1:])): + dag[utils.to_item(idx)].append(Node(jdx + 1, func_names[func_id])) + + leaf_nodes = set(range(num_blocks)) - dag.keys() + + # merge with avg + for idx in leaf_nodes: + dag[idx] = [Node(num_blocks, 'avg')] + + # This is actually y^{(t)}. h^{(t)} is node N - 1 in + # the graph, where N Is the number of nodes. I.e., h^{(t)} takes + # only one other node as its input. + # last h[t] node + last_node = Node(num_blocks + 1, 'h[t]') + dag[num_blocks] = [last_node] + dags.append(dag) + + return dags + + +class Controller(torch.nn.Module): + """Based on + https://github.com/pytorch/examples/blob/master/word_language_model/model.py + + RL controllers do not necessarily have much to do with + language models. 
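# Illustrative walk-through of `_construct_dags` above (a sketch; these sampled
# actions are made up, not taken from the original repo): with num_blocks=4 and
# func_names=['tanh', 'ReLU', 'identity', 'sigmoid'], a single sample with
# activations=[0, 1, 1, 0] and prev_nodes=[0, 1, 2] yields
#     dag[-1] = dag[-2] = [Node(0, 'tanh')]   # fed by x^{(t)} and h^{(t-1)}
#     dag[0]  = [Node(1, 'ReLU')]
#     dag[1]  = [Node(2, 'ReLU')]
#     dag[2]  = [Node(3, 'tanh')]
#     dag[3]  = [Node(4, 'avg')]              # leaf, averaged into the output
#     dag[4]  = [Node(5, 'h[t]')]             # produces h^{(t)} for the next step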
+ + Base the controller RNN on the GRU from: + https://github.com/ikostrikov/pytorch-a2c-ppo-acktr/blob/master/model.py + """ + def __init__(self, num_blocks=4, controller_hid=100, cuda=False): + torch.nn.Module.__init__(self) + + # `num_tokens` here is just the activation function + # for every even step, + self.shared_rnn_activations = ['tanh', 'ReLU', 'identity', 'sigmoid'] + self.num_tokens = [len(self.shared_rnn_activations)] + self.controller_hid = controller_hid + self.use_cuda = cuda + self.num_blocks = num_blocks + for idx in range(num_blocks): + self.num_tokens += [idx + 1, len(self.shared_rnn_activations)] + self.func_names = self.shared_rnn_activations + + num_total_tokens = sum(self.num_tokens) + + self.encoder = torch.nn.Embedding(num_total_tokens, + controller_hid) + self.lstm = torch.nn.LSTMCell(controller_hid, controller_hid) + + # Perhaps these weights in the decoder should be + # shared? At least for the activation functions, which all have the + # same size. + self.decoders = [] + for idx, size in enumerate(self.num_tokens): + decoder = torch.nn.Linear(controller_hid, size) + self.decoders.append(decoder) + + self._decoders = torch.nn.ModuleList(self.decoders) + + self.reset_parameters() + self.static_init_hidden = utils.keydefaultdict(self.init_hidden) + + def _get_default_hidden(key): + return utils.get_variable( + torch.zeros(key, self.controller_hid), + self.use_cuda, + requires_grad=False) + + self.static_inputs = utils.keydefaultdict(_get_default_hidden) + + def reset_parameters(self): + init_range = 0.1 + for param in self.parameters(): + param.data.uniform_(-init_range, init_range) + for decoder in self.decoders: + decoder.bias.data.fill_(0) + + def forward(self, # pylint:disable=arguments-differ + inputs, + hidden, + block_idx, + is_embed): + if not is_embed: + embed = self.encoder(inputs) + else: + embed = inputs + + hx, cx = self.lstm(embed, hidden) + logits = self.decoders[block_idx](hx) + + logits /= 5.0 + + # # exploration + # if self.args.mode == 'train': + # logits = (2.5 * F.tanh(logits)) + + return logits, (hx, cx) + + def sample(self, batch_size=1, with_details=False, save_dir=None): + """Samples a set of `args.num_blocks` many computational nodes from the + controller, where each node is made up of an activation function, and + each node except the last also includes a previous node. + """ + if batch_size < 1: + raise Exception(f'Wrong batch_size: {batch_size} < 1') + + # [B, L, H] + inputs = self.static_inputs[batch_size] + hidden = self.static_init_hidden[batch_size] + + activations = [] + entropies = [] + log_probs = [] + prev_nodes = [] + # The RNN controller alternately outputs an activation, + # followed by a previous node, for each block except the last one, + # which only gets an activation function. The last node is the output + # node, and its previous node is the average of all leaf nodes. + for block_idx in range(2*(self.num_blocks - 1) + 1): + logits, hidden = self.forward(inputs, + hidden, + block_idx, + is_embed=(block_idx == 0)) + + probs = F.softmax(logits, dim=-1) + log_prob = F.log_softmax(logits, dim=-1) + # .mean() for entropy? + entropy = -(log_prob * probs).sum(1, keepdim=False) + + action = probs.multinomial(num_samples=1).data + selected_log_prob = log_prob.gather( + 1, utils.get_variable(action, requires_grad=False)) + + # why the [:, 0] here? Should it be .squeeze(), or + # .view()? Same below with `action`. 
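            # Shape note: `action` is [batch_size, 1] after multinomial sampling,
            # so `action[:, 0]` (and `selected_log_prob[:, 0]`) flattens to
            # [batch_size] before being stored; the `sum(self.num_tokens[:mode])`
            # offset below shifts the sampled id into `self.encoder`'s shared
            # embedding table so it can be fed back as the next LSTM input.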
+ entropies.append(entropy) + log_probs.append(selected_log_prob[:, 0]) + + # 0: function, 1: previous node + mode = block_idx % 2 + inputs = utils.get_variable( + action[:, 0] + sum(self.num_tokens[:mode]), + requires_grad=False) + + if mode == 0: + activations.append(action[:, 0]) + elif mode == 1: + prev_nodes.append(action[:, 0]) + + prev_nodes = torch.stack(prev_nodes).transpose(0, 1) + activations = torch.stack(activations).transpose(0, 1) + + dags = _construct_dags(prev_nodes, + activations, + self.func_names, + self.num_blocks) + + if save_dir is not None: + for idx, dag in enumerate(dags): + utils.draw_network(dag, + os.path.join(save_dir, f'graph{idx}.png')) + + if with_details: + return dags, torch.cat(log_probs), torch.cat(entropies) + + return dags + + def init_hidden(self, batch_size): + zeros = torch.zeros(batch_size, self.controller_hid) + return (utils.get_variable(zeros, self.use_cuda, requires_grad=False), + utils.get_variable(zeros.clone(), self.use_cuda, requires_grad=False)) diff --git a/fastNLP/models/enas_model.py b/fastNLP/models/enas_model.py new file mode 100644 index 00000000..cc91e675 --- /dev/null +++ b/fastNLP/models/enas_model.py @@ -0,0 +1,388 @@ +# Code Modified from https://github.com/carpedm20/ENAS-pytorch + +"""Module containing the shared RNN model.""" +import numpy as np +import collections + +import torch +from torch import nn +import torch.nn.functional as F +from torch.autograd import Variable + +import fastNLP.models.enas_utils as utils +from fastNLP.models.base_model import BaseModel +import fastNLP.modules.encoder as encoder + +def _get_dropped_weights(w_raw, dropout_p, is_training): + """Drops out weights to implement DropConnect. + + Args: + w_raw: Full, pre-dropout, weights to be dropped out. + dropout_p: Proportion of weights to drop out. + is_training: True iff _shared_ model is training. + + Returns: + The dropped weights. + + Why does torch.nn.functional.dropout() return: + 1. `torch.autograd.Variable()` on the training loop + 2. `torch.nn.Parameter()` on the controller or eval loop, when + training = False... + + Even though the call to `_setweights` in the Smerity repo's + `weight_drop.py` does not have this behaviour, and `F.dropout` always + returns `torch.autograd.Variable` there, even when `training=False`? + + The above TODO is the reason for the hacky check for `torch.nn.Parameter`. + """ + dropped_w = F.dropout(w_raw, p=dropout_p, training=is_training) + + if isinstance(dropped_w, torch.nn.Parameter): + dropped_w = dropped_w.clone() + + return dropped_w + +class EmbeddingDropout(torch.nn.Embedding): + """Class for dropping out embeddings by zero'ing out parameters in the + embedding matrix. + + This is equivalent to dropping out particular words, e.g., in the sentence + 'the quick brown fox jumps over the lazy dog', dropping out 'the' would + lead to the sentence '### quick brown fox jumps over ### lazy dog' (in the + embedding vector space). + + See 'A Theoretically Grounded Application of Dropout in Recurrent Neural + Networks', (Gal and Ghahramani, 2016). + """ + def __init__(self, + num_embeddings, + embedding_dim, + max_norm=None, + norm_type=2, + scale_grad_by_freq=False, + sparse=False, + dropout=0.1, + scale=None): + """Embedding constructor. + + Args: + dropout: Dropout probability. + scale: Used to scale parameters of embedding weight matrix that are + not dropped out. Note that this is _in addition_ to the + `1/(1 - dropout)` scaling. + + See `torch.nn.Embedding` for remaining arguments. 
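        A minimal usage sketch (illustrative only; the sizes below are made up):

            embed = EmbeddingDropout(num_embeddings=10, embedding_dim=4, dropout=0.5)
            tokens = torch.LongTensor([[1, 2, 3]])
            vectors = embed(tokens)  # dropped words map to all-zero rows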
+ """ + torch.nn.Embedding.__init__(self, + num_embeddings=num_embeddings, + embedding_dim=embedding_dim, + max_norm=max_norm, + norm_type=norm_type, + scale_grad_by_freq=scale_grad_by_freq, + sparse=sparse) + self.dropout = dropout + assert (dropout >= 0.0) and (dropout < 1.0), ('Dropout must be >= 0.0 ' + 'and < 1.0') + self.scale = scale + + def forward(self, inputs): # pylint:disable=arguments-differ + """Embeds `inputs` with the dropped out embedding weight matrix.""" + if self.training: + dropout = self.dropout + else: + dropout = 0 + + if dropout: + mask = self.weight.data.new(self.weight.size(0), 1) + mask.bernoulli_(1 - dropout) + mask = mask.expand_as(self.weight) + mask = mask / (1 - dropout) + masked_weight = self.weight * Variable(mask) + else: + masked_weight = self.weight + if self.scale and self.scale != 1: + masked_weight = masked_weight * self.scale + + return F.embedding(inputs, + masked_weight, + max_norm=self.max_norm, + norm_type=self.norm_type, + scale_grad_by_freq=self.scale_grad_by_freq, + sparse=self.sparse) + + +class LockedDropout(nn.Module): + # code from https://github.com/salesforce/awd-lstm-lm/blob/master/locked_dropout.py + def __init__(self): + super().__init__() + + def forward(self, x, dropout=0.5): + if not self.training or not dropout: + return x + m = x.data.new(1, x.size(1), x.size(2)).bernoulli_(1 - dropout) + mask = Variable(m, requires_grad=False) / (1 - dropout) + mask = mask.expand_as(x) + return mask * x + + +class ENASModel(BaseModel): + """Shared RNN model.""" + def __init__(self, embed_num, num_classes, num_blocks=4, cuda=False, shared_hid=1000, shared_embed=1000): + super(ENASModel, self).__init__() + + self.use_cuda = cuda + + self.shared_hid = shared_hid + self.num_blocks = num_blocks + self.decoder = nn.Linear(self.shared_hid, num_classes) + self.encoder = EmbeddingDropout(embed_num, + shared_embed, + dropout=0.1) + self.lockdrop = LockedDropout() + self.dag = None + + # Tie weights + # self.decoder.weight = self.encoder.weight + + # Since W^{x, c} and W^{h, c} are always summed, there + # is no point duplicating their bias offset parameter. Likewise for + # W^{x, h} and W^{h, h}. + self.w_xc = nn.Linear(shared_embed, self.shared_hid) + self.w_xh = nn.Linear(shared_embed, self.shared_hid) + + # The raw weights are stored here because the hidden-to-hidden weights + # are weight dropped on the forward pass. 
+ self.w_hc_raw = torch.nn.Parameter( + torch.Tensor(self.shared_hid, self.shared_hid)) + self.w_hh_raw = torch.nn.Parameter( + torch.Tensor(self.shared_hid, self.shared_hid)) + self.w_hc = None + self.w_hh = None + + self.w_h = collections.defaultdict(dict) + self.w_c = collections.defaultdict(dict) + + for idx in range(self.num_blocks): + for jdx in range(idx + 1, self.num_blocks): + self.w_h[idx][jdx] = nn.Linear(self.shared_hid, + self.shared_hid, + bias=False) + self.w_c[idx][jdx] = nn.Linear(self.shared_hid, + self.shared_hid, + bias=False) + + self._w_h = nn.ModuleList([self.w_h[idx][jdx] + for idx in self.w_h + for jdx in self.w_h[idx]]) + self._w_c = nn.ModuleList([self.w_c[idx][jdx] + for idx in self.w_c + for jdx in self.w_c[idx]]) + + self.batch_norm = None + # if args.mode == 'train': + # self.batch_norm = nn.BatchNorm1d(self.shared_hid) + # else: + # self.batch_norm = None + + self.reset_parameters() + self.static_init_hidden = utils.keydefaultdict(self.init_hidden) + + def setDAG(self, dag): + if self.dag is None: + self.dag = dag + + def forward(self, word_seq, hidden=None): + inputs = torch.transpose(word_seq, 0, 1) + + time_steps = inputs.size(0) + batch_size = inputs.size(1) + + + self.w_hh = _get_dropped_weights(self.w_hh_raw, + 0.5, + self.training) + self.w_hc = _get_dropped_weights(self.w_hc_raw, + 0.5, + self.training) + + # hidden = self.static_init_hidden[batch_size] if hidden is None else hidden + hidden = self.static_init_hidden[batch_size] + + embed = self.encoder(inputs) + + embed = self.lockdrop(embed, 0.65 if self.training else 0) + + # The norm of hidden states are clipped here because + # otherwise ENAS is especially prone to exploding activations on the + # forward pass. This could probably be fixed in a more elegant way, but + # it might be exposing a weakness in the ENAS algorithm as currently + # proposed. + # + # For more details, see + # https://github.com/carpedm20/ENAS-pytorch/issues/6 + clipped_num = 0 + max_clipped_norm = 0 + h1tohT = [] + logits = [] + for step in range(time_steps): + x_t = embed[step] + logit, hidden = self.cell(x_t, hidden, self.dag) + + hidden_norms = hidden.norm(dim=-1) + max_norm = 25.0 + if hidden_norms.data.max() > max_norm: + # Just directly use the torch slice operations + # in PyTorch v0.4. + # + # This workaround for PyTorch v0.3.1 does everything in numpy, + # because the PyTorch slicing and slice assignment is too + # flaky. 
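                # hidden_norms is [batch_size]; the mask built below starts as all
                # ones with hidden's shape, and rows whose norm exceeds max_norm get
                # max_norm / norm broadcast across them, so only those examples are
                # rescaled while the rest are multiplied by 1.0.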
+ hidden_norms = hidden_norms.data.cpu().numpy() + + clipped_num += 1 + if hidden_norms.max() > max_clipped_norm: + max_clipped_norm = hidden_norms.max() + + clip_select = hidden_norms > max_norm + clip_norms = hidden_norms[clip_select] + + mask = np.ones(hidden.size()) + normalizer = max_norm/clip_norms + normalizer = normalizer[:, np.newaxis] + + mask[clip_select] = normalizer + + if self.use_cuda: + hidden *= torch.autograd.Variable( + torch.FloatTensor(mask).cuda(), requires_grad=False) + else: + hidden *= torch.autograd.Variable( + torch.FloatTensor(mask), requires_grad=False) + logits.append(logit) + h1tohT.append(hidden) + + h1tohT = torch.stack(h1tohT) + output = torch.stack(logits) + raw_output = output + + output = self.lockdrop(output, 0.4 if self.training else 0) + + #Pooling + output = torch.mean(output, 0) + + decoded = self.decoder(output) + + extra_out = {'dropped': decoded, + 'hiddens': h1tohT, + 'raw': raw_output} + return {'pred': decoded, 'hidden': hidden, 'extra_out': extra_out} + + def cell(self, x, h_prev, dag): + """Computes a single pass through the discovered RNN cell.""" + c = {} + h = {} + f = {} + + f[0] = self.get_f(dag[-1][0].name) + c[0] = torch.sigmoid(self.w_xc(x) + F.linear(h_prev, self.w_hc, None)) + h[0] = (c[0]*f[0](self.w_xh(x) + F.linear(h_prev, self.w_hh, None)) + + (1 - c[0])*h_prev) + + leaf_node_ids = [] + q = collections.deque() + q.append(0) + + # Computes connections from the parent nodes `node_id` + # to their child nodes `next_id` recursively, skipping leaf nodes. A + # leaf node is a node whose id == `self.num_blocks`. + # + # Connections between parent i and child j should be computed as + # h_j = c_j*f_{ij}{(W^h_{ij}*h_i)} + (1 - c_j)*h_i, + # where c_j = \sigmoid{(W^c_{ij}*h_i)} + # + # See Training details from Section 3.1 of the paper. + # + # The following algorithm does a breadth-first (since `q.popleft()` is + # used) search over the nodes and computes all the hidden states. + while True: + if len(q) == 0: + break + + node_id = q.popleft() + nodes = dag[node_id] + + for next_node in nodes: + next_id = next_node.id + if next_id == self.num_blocks: + leaf_node_ids.append(node_id) + assert len(nodes) == 1, ('parent of leaf node should have ' + 'only one child') + continue + + w_h = self.w_h[node_id][next_id] + w_c = self.w_c[node_id][next_id] + + f[next_id] = self.get_f(next_node.name) + c[next_id] = torch.sigmoid(w_c(h[node_id])) + h[next_id] = (c[next_id]*f[next_id](w_h(h[node_id])) + + (1 - c[next_id])*h[node_id]) + + q.append(next_id) + + # Instead of averaging loose ends, perhaps there should + # be a set of separate unshared weights for each "loose" connection + # between each node in a cell and the output. + # + # As it stands, all weights W^h_{ij} are doing double duty by + # connecting both from i to j, as well as from i to the output. 
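        # Each h[node_id] below is [batch_size, shared_hid]; the parents of the
        # 'avg' node are stacked along a new trailing dimension and mean-pooled,
        # so the cell output keeps shape [batch_size, shared_hid] no matter how
        # many leaves the sampled DAG has.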
+ + # average all the loose ends + leaf_nodes = [h[node_id] for node_id in leaf_node_ids] + output = torch.mean(torch.stack(leaf_nodes, 2), -1) + + # stabilizing the Updates of omega + if self.batch_norm is not None: + output = self.batch_norm(output) + + return output, h[self.num_blocks - 1] + + def init_hidden(self, batch_size): + zeros = torch.zeros(batch_size, self.shared_hid) + return utils.get_variable(zeros, self.use_cuda, requires_grad=False) + + def get_f(self, name): + name = name.lower() + if name == 'relu': + f = torch.relu + elif name == 'tanh': + f = torch.tanh + elif name == 'identity': + f = lambda x: x + elif name == 'sigmoid': + f = torch.sigmoid + return f + + + @property + def num_parameters(self): + def size(p): + return np.prod(p.size()) + return sum([size(param) for param in self.parameters()]) + + + def reset_parameters(self): + init_range = 0.025 + # init_range = 0.025 if self.args.mode == 'train' else 0.04 + for param in self.parameters(): + param.data.uniform_(-init_range, init_range) + self.decoder.bias.data.fill_(0) + + def predict(self, word_seq): + """ + + :param word_seq: torch.LongTensor, [batch_size, seq_len] + :return predict: dict of torch.LongTensor, [batch_size, seq_len] + """ + output = self(word_seq) + _, predict = output['pred'].max(dim=1) + return {'pred': predict} diff --git a/fastNLP/models/enas_trainer.py b/fastNLP/models/enas_trainer.py new file mode 100644 index 00000000..22e323ce --- /dev/null +++ b/fastNLP/models/enas_trainer.py @@ -0,0 +1,385 @@ +# Code Modified from https://github.com/carpedm20/ENAS-pytorch + +import os +import time +from datetime import datetime +from datetime import timedelta + +import numpy as np +import torch +import math +from torch import nn + +try: + from tqdm.autonotebook import tqdm +except: + from fastNLP.core.utils import pseudo_tqdm as tqdm + +from fastNLP.core.batch import Batch +from fastNLP.core.callback import CallbackManager, CallbackException +from fastNLP.core.dataset import DataSet +from fastNLP.core.utils import CheckError +from fastNLP.core.utils import _move_dict_value_to_device +import fastNLP +import fastNLP.models.enas_utils as utils +from fastNLP.core.utils import _build_args + +from torch.optim import Adam + + +def _get_no_grad_ctx_mgr(): + """Returns a the `torch.no_grad` context manager for PyTorch version >= + 0.4, or a no-op context manager otherwise. + """ + return torch.no_grad() + + +class ENASTrainer(fastNLP.Trainer): + """A class to wrap training code.""" + def __init__(self, train_data, model, controller, **kwargs): + """Constructor for training algorithm. 
+ :param DataSet train_data: the training data + :param torch.nn.modules.module model: a PyTorch model + :param torch.nn.modules.module controller: a PyTorch model + """ + self.final_epochs = kwargs['final_epochs'] + kwargs.pop('final_epochs') + super(ENASTrainer, self).__init__(train_data, model, **kwargs) + self.controller_step = 0 + self.shared_step = 0 + self.max_length = 35 + + self.shared = model + self.controller = controller + + self.shared_optim = Adam( + self.shared.parameters(), + lr=20.0, + weight_decay=1e-7) + + self.controller_optim = Adam( + self.controller.parameters(), + lr=3.5e-4) + + def train(self, load_best_model=True): + """ + :param bool load_best_model: 该参数只有在初始化提供了dev_data的情况下有效,如果True, trainer将在返回之前重新加载dev表现 + 最好的模型参数。 + :return results: 返回一个字典类型的数据, 内含以下内容:: + + seconds: float, 表示训练时长 + 以下三个内容只有在提供了dev_data的情况下会有。 + best_eval: Dict of Dict, 表示evaluation的结果 + best_epoch: int,在第几个epoch取得的最佳值 + best_step: int, 在第几个step(batch)更新取得的最佳值 + + """ + results = {} + if self.n_epochs <= 0: + print(f"training epoch is {self.n_epochs}, nothing was done.") + results['seconds'] = 0. + return results + try: + if torch.cuda.is_available() and self.use_cuda: + self.model = self.model.cuda() + self._model_device = self.model.parameters().__next__().device + self._mode(self.model, is_test=False) + + self.start_time = str(datetime.now().strftime('%Y-%m-%d-%H-%M-%S')) + start_time = time.time() + print("training epochs started " + self.start_time, flush=True) + + try: + self.callback_manager.on_train_begin() + self._train() + self.callback_manager.on_train_end(self.model) + except (CallbackException, KeyboardInterrupt) as e: + self.callback_manager.on_exception(e, self.model) + + if self.dev_data is not None: + print("\nIn Epoch:{}/Step:{}, got best dev performance:".format(self.best_dev_epoch, self.best_dev_step) + + self.tester._format_eval_results(self.best_dev_perf),) + results['best_eval'] = self.best_dev_perf + results['best_epoch'] = self.best_dev_epoch + results['best_step'] = self.best_dev_step + if load_best_model: + model_name = "best_" + "_".join([self.model.__class__.__name__, self.metric_key, self.start_time]) + load_succeed = self._load_model(self.model, model_name) + if load_succeed: + print("Reloaded the best model.") + else: + print("Fail to reload best model.") + finally: + pass + results['seconds'] = round(time.time() - start_time, 2) + + return results + + def _train(self): + if not self.use_tqdm: + from fastNLP.core.utils import pseudo_tqdm as inner_tqdm + else: + inner_tqdm = tqdm + self.step = 0 + start = time.time() + total_steps = (len(self.train_data) // self.batch_size + int( + len(self.train_data) % self.batch_size != 0)) * self.n_epochs + with inner_tqdm(total=total_steps, postfix='loss:{0:<6.5f}', leave=False, dynamic_ncols=True) as pbar: + avg_loss = 0 + data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, + prefetch=self.prefetch) + for epoch in range(1, self.n_epochs+1): + pbar.set_description_str(desc="Epoch {}/{}".format(epoch, self.n_epochs)) + last_stage = (epoch > self.n_epochs + 1 - self.final_epochs) + if epoch == self.n_epochs + 1 - self.final_epochs: + print('Entering the final stage. (Only train the selected structure)') + # early stopping + self.callback_manager.on_epoch_begin(epoch, self.n_epochs) + + # 1. Training the shared parameters omega of the child models + self.train_shared(pbar) + + # 2. 
Training the controller parameters theta + if not last_stage: + self.train_controller() + + if ((self.validate_every > 0 and self.step % self.validate_every == 0) or + (self.validate_every < 0 and self.step % len(data_iterator) == 0)) \ + and self.dev_data is not None: + if not last_stage: + self.derive() + eval_res = self._do_validation(epoch=epoch, step=self.step) + eval_str = "Evaluation at Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step, + total_steps) + \ + self.tester._format_eval_results(eval_res) + pbar.write(eval_str) + + # lr decay; early stopping + self.callback_manager.on_epoch_end(epoch, self.n_epochs, self.optimizer) + # =============== epochs end =================== # + pbar.close() + # ============ tqdm end ============== # + + + def get_loss(self, inputs, targets, hidden, dags): + """Computes the loss for the same batch for M models. + + This amounts to an estimate of the loss, which is turned into an + estimate for the gradients of the shared model. + """ + if not isinstance(dags, list): + dags = [dags] + + loss = 0 + for dag in dags: + self.shared.setDAG(dag) + inputs = _build_args(self.shared.forward, **inputs) + inputs['hidden'] = hidden + result = self.shared(**inputs) + output, hidden, extra_out = result['pred'], result['hidden'], result['extra_out'] + + self.callback_manager.on_loss_begin(targets, result) + sample_loss = self._compute_loss(result, targets) + loss += sample_loss + + assert len(dags) == 1, 'there are multiple `hidden` for multple `dags`' + return loss, hidden, extra_out + + def train_shared(self, pbar=None, max_step=None, dag=None): + """Train the language model for 400 steps of minibatches of 64 + examples. + + Args: + max_step: Used to run extra training steps as a warm-up. + dag: If not None, is used instead of calling sample(). + + BPTT is truncated at 35 timesteps. + + For each weight update, gradients are estimated by sampling M models + from the fixed controller policy, and averaging their gradients + computed on a batch of training data. + """ + model = self.shared + model.train() + self.controller.eval() + + hidden = self.shared.init_hidden(self.batch_size) + + abs_max_grad = 0 + abs_max_hidden_norm = 0 + step = 0 + raw_total_loss = 0 + total_loss = 0 + train_idx = 0 + avg_loss = 0 + data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, + prefetch=self.prefetch) + + for batch_x, batch_y in data_iterator: + _move_dict_value_to_device(batch_x, batch_y, device=self._model_device) + indices = data_iterator.get_batch_indices() + # negative sampling; replace unknown; re-weight batch_y + self.callback_manager.on_batch_begin(batch_x, batch_y, indices) + # prediction = self._data_forward(self.model, batch_x) + + dags = self.controller.sample(1) + inputs, targets = batch_x, batch_y + # self.callback_manager.on_loss_begin(batch_y, prediction) + loss, hidden, extra_out = self.get_loss(inputs, + targets, + hidden, + dags) + hidden.detach_() + + avg_loss += loss.item() + + # Is loss NaN or inf? 
requires_grad = False + self.callback_manager.on_backward_begin(loss, self.model) + self._grad_backward(loss) + self.callback_manager.on_backward_end(self.model) + + self._update() + self.callback_manager.on_step_end(self.optimizer) + + if (self.step+1) % self.print_every == 0: + if self.use_tqdm: + print_output = "loss:{0:<6.5f}".format(avg_loss / self.print_every) + pbar.update(self.print_every) + else: + end = time.time() + diff = timedelta(seconds=round(end - start)) + print_output = "[epoch: {:>3} step: {:>4}] train loss: {:>4.6} time: {}".format( + epoch, self.step, avg_loss, diff) + pbar.set_postfix_str(print_output) + avg_loss = 0 + self.step += 1 + step += 1 + self.shared_step += 1 + self.callback_manager.on_batch_end() + # ================= mini-batch end ==================== # + + + def get_reward(self, dag, entropies, hidden, valid_idx=0): + """Computes the perplexity of a single sampled model on a minibatch of + validation data. + """ + if not isinstance(entropies, np.ndarray): + entropies = entropies.data.cpu().numpy() + + data_iterator = Batch(self.dev_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, + prefetch=self.prefetch) + + for inputs, targets in data_iterator: + valid_loss, hidden, _ = self.get_loss(inputs, targets, hidden, dag) + valid_loss = utils.to_item(valid_loss.data) + + valid_ppl = math.exp(valid_loss) + + R = 80 / valid_ppl + + rewards = R + 1e-4 * entropies + + return rewards, hidden + + def train_controller(self): + """Fixes the shared parameters and updates the controller parameters. + + The controller is updated with a score function gradient estimator + (i.e., REINFORCE), with the reward being c/valid_ppl, where valid_ppl + is computed on a minibatch of validation data. + + A moving average baseline is used. + + The controller is trained for 2000 steps per epoch (i.e., + first (Train Shared) phase -> second (Train Controller) phase). + """ + model = self.controller + model.train() + # Why can't we call shared.eval() here? Leads to loss + # being uniformly zero for the controller. + # self.shared.eval() + + avg_reward_base = None + baseline = None + adv_history = [] + entropy_history = [] + reward_history = [] + + hidden = self.shared.init_hidden(self.batch_size) + total_loss = 0 + valid_idx = 0 + for step in range(20): + # sample models + dags, log_probs, entropies = self.controller.sample( + with_details=True) + + # calculate reward + np_entropies = entropies.data.cpu().numpy() + # No gradients should be backpropagated to the + # shared model during controller training, obviously. 
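            # What follows is REINFORCE with a moving-average baseline:
            #     rewards  = 80 / valid_ppl + 1e-4 * entropies   (see get_reward)
            #     baseline = 0.95 * baseline + 0.05 * rewards
            #     adv      = rewards - baseline
            #     loss     = -(log_probs * adv).sum()
            # a score-function gradient estimate; the reward itself is computed
            # under no_grad so nothing backpropagates into the shared model.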
+ with _get_no_grad_ctx_mgr(): + rewards, hidden = self.get_reward(dags, + np_entropies, + hidden, + valid_idx) + + + reward_history.extend(rewards) + entropy_history.extend(np_entropies) + + # moving average baseline + if baseline is None: + baseline = rewards + else: + decay = 0.95 + baseline = decay * baseline + (1 - decay) * rewards + + adv = rewards - baseline + adv_history.extend(adv) + + # policy loss + loss = -log_probs*utils.get_variable(adv, + self.use_cuda, + requires_grad=False) + + loss = loss.sum() # or loss.mean() + + # update + self.controller_optim.zero_grad() + loss.backward() + + self.controller_optim.step() + + total_loss += utils.to_item(loss.data) + + if ((step % 50) == 0) and (step > 0): + reward_history, adv_history, entropy_history = [], [], [] + total_loss = 0 + + self.controller_step += 1 + # prev_valid_idx = valid_idx + # valid_idx = ((valid_idx + self.max_length) % + # (self.valid_data.size(0) - 1)) + # # Whenever we wrap around to the beginning of the + # # validation data, we reset the hidden states. + # if prev_valid_idx > valid_idx: + # hidden = self.shared.init_hidden(self.batch_size) + + def derive(self, sample_num=10, valid_idx=0): + """We are always deriving based on the very first batch + of validation data? This seems wrong... + """ + hidden = self.shared.init_hidden(self.batch_size) + + dags, _, entropies = self.controller.sample(sample_num, + with_details=True) + + max_R = 0 + best_dag = None + for dag in dags: + R, _ = self.get_reward(dag, entropies, hidden, valid_idx) + if R.max() > max_R: + max_R = R.max() + best_dag = dag + + self.model.setDAG(best_dag) diff --git a/fastNLP/models/enas_utils.py b/fastNLP/models/enas_utils.py new file mode 100644 index 00000000..e5027d81 --- /dev/null +++ b/fastNLP/models/enas_utils.py @@ -0,0 +1,56 @@ +# Code Modified from https://github.com/carpedm20/ENAS-pytorch + +from __future__ import print_function + +from collections import defaultdict +import collections +from datetime import datetime +import os +import json + +import numpy as np + +import torch +from torch.autograd import Variable + +def detach(h): + if type(h) == Variable: + return Variable(h.data) + else: + return tuple(detach(v) for v in h) + +def get_variable(inputs, cuda=False, **kwargs): + if type(inputs) in [list, np.ndarray]: + inputs = torch.Tensor(inputs) + if cuda: + out = Variable(inputs.cuda(), **kwargs) + else: + out = Variable(inputs, **kwargs) + return out + +def update_lr(optimizer, lr): + for param_group in optimizer.param_groups: + param_group['lr'] = lr + +Node = collections.namedtuple('Node', ['id', 'name']) + + +class keydefaultdict(defaultdict): + def __missing__(self, key): + if self.default_factory is None: + raise KeyError(key) + else: + ret = self[key] = self.default_factory(key) + return ret + + +def to_item(x): + """Converts x, possibly scalar and possibly tensor, to a Python scalar.""" + if isinstance(x, (float, int)): + return x + + if float(torch.__version__[0:3]) < 0.4: + assert (x.dim() == 1) and (len(x) == 1) + return x[0] + + return x.item() diff --git a/test/models/test_enas.py b/test/models/test_enas.py new file mode 100644 index 00000000..07a43205 --- /dev/null +++ b/test/models/test_enas.py @@ -0,0 +1,112 @@ +import unittest + +from fastNLP import DataSet +from fastNLP import Instance +from fastNLP import Vocabulary +from fastNLP.core.losses import CrossEntropyLoss +from fastNLP.core.metrics import AccuracyMetric + + +class TestENAS(unittest.TestCase): + def testENAS(self): + # 从csv读取数据到DataSet + sample_path = 
"tutorials/sample_data/tutorial_sample_dataset.csv" + dataset = DataSet.read_csv(sample_path, headers=('raw_sentence', 'label'), + sep='\t') + print(len(dataset)) + print(dataset[0]) + print(dataset[-3]) + + dataset.append(Instance(raw_sentence='fake data', label='0')) + # 将所有数字转为小写 + dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence') + # label转int + dataset.apply(lambda x: int(x['label']), new_field_name='label') + + # 使用空格分割句子 + def split_sent(ins): + return ins['raw_sentence'].split() + + dataset.apply(split_sent, new_field_name='words') + + # 增加长度信息 + dataset.apply(lambda x: len(x['words']), new_field_name='seq_len') + print(len(dataset)) + print(dataset[0]) + + # DataSet.drop(func)筛除数据 + dataset.drop(lambda x: x['seq_len'] <= 3) + print(len(dataset)) + + # 设置DataSet中,哪些field要转为tensor + # set target,loss或evaluate中的golden,计算loss,模型评估时使用 + dataset.set_target("label") + # set input,模型forward时使用 + dataset.set_input("words", "seq_len") + + # 分出测试集、训练集 + test_data, train_data = dataset.split(0.5) + print(len(test_data)) + print(len(train_data)) + + # 构建词表, Vocabulary.add(word) + vocab = Vocabulary(min_freq=2) + train_data.apply(lambda x: [vocab.add(word) for word in x['words']]) + vocab.build_vocab() + + # index句子, Vocabulary.to_index(word) + train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words') + test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words') + print(test_data[0]) + + # 如果你们需要做强化学习或者GAN之类的项目,你们也可以使用这些数据预处理的工具 + from fastNLP.core.batch import Batch + from fastNLP.core.sampler import RandomSampler + + batch_iterator = Batch(dataset=train_data, batch_size=2, sampler=RandomSampler()) + for batch_x, batch_y in batch_iterator: + print("batch_x has: ", batch_x) + print("batch_y has: ", batch_y) + break + + from fastNLP.models.enas_model import ENASModel + from fastNLP.models.enas_controller import Controller + model = ENASModel(embed_num=len(vocab), num_classes=5) + controller = Controller() + + from fastNLP.models.enas_trainer import ENASTrainer + from copy import deepcopy + + # 更改DataSet中对应field的名称,要以模型的forward等参数名一致 + train_data.rename_field('words', 'word_seq') # input field 与 forward 参数一致 + train_data.rename_field('label', 'label_seq') + test_data.rename_field('words', 'word_seq') + test_data.rename_field('label', 'label_seq') + + loss = CrossEntropyLoss(pred="output", target="label_seq") + metric = AccuracyMetric(pred="predict", target="label_seq") + + trainer = ENASTrainer(model=model, controller=controller, train_data=train_data, dev_data=test_data, + loss=CrossEntropyLoss(pred="output", target="label_seq"), + metrics=AccuracyMetric(pred="predict", target="label_seq"), + check_code_level=-1, + save_path=None, + batch_size=32, + print_every=1, + n_epochs=3, + final_epochs=1) + trainer.train() + print('Train finished!') + + # 调用Tester在test_data上评价效果 + from fastNLP import Tester + + tester = Tester(data=test_data, model=model, metrics=AccuracyMetric(pred="predict", target="label_seq"), + batch_size=4) + + acc = tester.test() + print(acc) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file